psx_gpu: consolidate C code, implement enhancement asm
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
CommitLineData
75e28f62
E
1/*
2 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
59d15d23 3 * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
75e28f62
E
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of
8 * the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */
15
16#define MAX_SPANS 512
17#define MAX_BLOCKS 64
18#define MAX_BLOCKS_PER_ROW 128
19
cb88320b 20#include "psx_gpu_offsets.h"
75e28f62 21
cb88320b 22#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
75e28f62 23
75e28f62
E
// Byte offsets of four consecutive 16-bit fields of an edge record.
// NOTE(review): these appear to describe the 8-byte records that the
// vst4.u16 stores in setup_spans_set_x4_alternate_* write to span_edge_data
// (left x, block count, right-edge mask, y) - confirm against the users.
24#define edge_data_left_x_offset 0
25#define edge_data_num_blocks_offset 2
26#define edge_data_right_mask_offset 4
27#define edge_data_y_offset 6
28
29
30#define psx_gpu r0
31#define v_a r1
32#define v_b r2
33#define v_c r3
34
35#define x0 r4
36#define x1 r5
37#define x2 r6
38#define x0_x1 r5
39#define x1_x2 r6
40#define y0 r7
41#define y1 r8
42#define y2 r9
43#define y0_y1 r7
44#define y1_y2 r8
45#define b0 r9
46#define b1 r10
47#define b2 r11
48#define b0_b1 r10
49#define b1_b2 r11
50
51
52#define area_r_s r5
53
54#define g_bx0 r2
55#define g_bx r3
56#define g_bx2 r4
57#define g_bx3 r5
58#define b_base r6
59#define g_by r8
60
61#define gs_bx r7
62#define gs_by r10
63
64#define ga_bx g_bx
65#define ga_by g_by
66
67#define gw_bx_h g_bx
68#define gw_by_h g_by
69
70#define gw_bx_l r11
71#define gw_by_l gw_bx_l
72
73#define store_a r0
74#define store_b r1
75#define store_inc r5
76
77
78#define v0 q0
79#define uvrgb0 d0
80#define x0_y0 d1
81
82#define v1 q1
83#define uvrgb1 d2
84#define x1_y1 d3
85
86#define v2 q2
87#define uvrgb2 d4
88#define x2_y2 d5
89
90#define x0_ab q3
91#define uvrg_xxxx0 q3
92#define uvrg0 d6
93#define xxxx0 d7
94
95#define x1_ab q4
96#define uvrg_xxxx1 q4
97#define uvrg1 d8
98#define xxxx1 d9
99
100#define x2_ab q5
101#define uvrg_xxxx2 q5
102#define uvrg2 d10
103#define xxxx2 d11
104
105#define y0_ab q6
106#define yyyy_uvrg0 q6
107#define yyyy0 d12
108#define uvrg0b d13
109
110#define y1_ab q7
111#define yyyy_uvrg1 q7
112#define yyyy1 d14
113#define uvrg1b d15
114
115#define y2_ab q8
116#define yyyy_uvrg2 q8
117#define yyyy2 d16
118#define uvrg2b d17
119
120#define d0_ab q9
121#define d0_a d18
122#define d0_b d19
123
124#define d1_ab q10
125#define d1_a d20
126#define d1_b d21
127
128#define d2_ab q11
129#define d2_a d22
130#define d2_b d23
131
132#define d3_ab q12
133#define d3_a d24
134#define d3_b d25
135
136#define ga_uvrg_x q1
137#define ga_uvrg_y q4
138
139#define dx x0_x1
140#define dy y0_y1
141#define db b0_b1
142
143#define uvrg_base q11
144
145#define gs_uvrg_x q5
146#define gs_uvrg_y q6
147
148#define g_uvrg_x q1
149#define ga_uv_x d2
150#define g_uv_x d2
151#define ga_rg_x d3
152#define g_rg_x d3
153
154#define g_uvrg_y q4
155#define ga_uv_y d8
156#define g_uv_y d8
157#define ga_rg_y d9
158#define g_rg_y d9
159
160#define gw_uv_x q1
161#define gw_rg_x q2
162#define gw_uv_y q4
163#define gw_rg_y q3
164
165#define w_mask q9
166#define w_mask_l d18
167
168#define r_shift q10
169
170#define uvrg_dx0 q0
171#define uvrg_dx0l d0
172#define uvrg_dx0h d1
173
174#define uvrg_dx1 q1
175#define uvrg_dx1l d2
176#define uvrg_dx1h d3
177
178#define uvrg_dx2 q2
179#define uvrg_dx2l d4
180#define uvrg_dx2h d5
181
182#define uvrg_dx3 q3
183#define uvrg_dx3l d6
184#define uvrg_dx3h d7
185
c6063f89 186#define uvrgb_phase q13
75e28f62
E
187
188.align 4
189
5d834c08 190/* FIXME: users of this should be in psx_gpu instead */
// load_pointer(register, pointer): place the address of `pointer` in
// `register`. Without __PIC__ this is a movw/movt immediate pair; with
// __PIC__ it is an `ldr =` literal load, so the assembler needs a literal
// pool (.pool) within range of every expansion.
191#ifndef __PIC__
192#define load_pointer(register, pointer) \
193 movw register, :lower16:pointer; \
194 movt register, :upper16:pointer; \
195
196#else
197#define load_pointer(register, pointer) \
198 ldr register, =pointer \
199
200#endif
201
75e28f62
E
// function(name): declare `name` as a global symbol and define its label at
// the current location.
202#define function(name) \
203 .global name; \
204 name: \
205
@ compute_all_gradients(psx_gpu, v_a, v_b, v_c)
@
@ Computes the x and y gradients of u, v, r, g and b across the triangle
@ (v_a, v_b, v_c) and stores them, together with the interpolant base
@ values, in psx_gpu starting at psx_gpu_uvrg_offset:
@   +0    uvrg base: (uvrg0 << 16) + uvrgb_phase - x0 * (uvrg x gradient)
@   +16   uvrg x gradient
@   +32   uvrg y gradient
@   +48   0x, 1x, 2x and 3x the uvrg x gradient, interleaved per component
@         (u and v first, then r and g)
@   +112  b: { 0, g_bx, 2 * g_bx, 3 * g_bx, b_base, g_by }
@
@ Reads psx_gpu->triangle_area, ->uvrgb_phase and ->triangle_winding. Each
@ vertex is read as 16 bytes: u, v, r, g in bytes 0-3, b in byte 4, 16-bit x
@ at byte 8 and 16-bit y at byte 10.
@
@ r4 - r11 and q4 - q7 are saved and restored. r12, r14, q0 - q3 and
@ q8 - q15 may be clobbered.
@
206@ r0: psx_gpu
207@ r1: v_a
208@ r2: v_b
209@ r3: v_c
210
211function(compute_all_gradients)
212 // First compute the triangle area reciprocal and shift. The division will
213 // happen concurrently with much of the work which follows.
214 @ r12 = psx_gpu->triangle_area
215 ldr r12, [ psx_gpu, #psx_gpu_triangle_area_offset ]
216 stmdb sp!, { r4 - r11, lr }
 @ q4 - q7 (d8 - d15) are callee-saved under AAPCS and are used as scratch
 @ below, so preserve them for the caller.
 vpush { q4 - q7 }
217
218 @ load exponent of 62 into upper half of double
219 movw r4, #0
220 clz r14, r12 @ r14 = shift
221
222 movt r4, #((62 + 1023) << 4)
223 mov r12, r12, lsl r14 @ r12 = triangle_area_normalized
224
225 @ load area normalized into lower half of double
226 mov r5, r12, lsr #10
227 vmov.f64 d30, r5, r4 @ d30 = (1 << 62) + ta_n
228
229 movt r4, #((1022 + 31) << 4)
230 mov r5, r12, lsl #20
231
232 add r4, r4, r12, lsr #11
233 vmov.f64 d31, r5, r4
234
235 vdiv.f64 d30, d30, d31 @ d30 = ((1 << 62) + ta_n) / ta_n
236
237 // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
238 // ( d0 * d1 ) - ( d2 * d3 ) =
239 // ( m0 ) - ( m1 ) = gradient
240
241 // This is split to do 12 elements at a time over three sets: a, b, and c.
242 // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
243 // two of the slots are unused.
244
245 // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
246 // is g.
247
248 // First type is: uvrg bxxx xxxx
249 // Second type is: yyyy ybyy uvrg
250 // Since x_a and y_c are the same the same variable is used for both.
251
252 vld1.u32 { v0 }, [ v_a, : 128 ] @ v0 = { uvrg0, b0, x0, y0 }
253 ldrsh x0, [ v_a, #8 ] @ load x0
254
255 vld1.u32 { v1 }, [ v_b, : 128 ] @ v1 = { uvrg1, b1, x1, y1}
256 ldrh x1, [ v_b, #8 ] @ load x1
257
258 vld1.u32 { v2 }, [ v_c, : 128 ] @ v2 = { uvrg2, b2, x2, y2 }
259 ldrh x2, [ v_c, #8 ] @ load x2
260
261 vmovl.u8 uvrg_xxxx0, uvrgb0 @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
262 ldrh y0, [ v_a, #10 ] @ load y0
263
264 vmovl.u8 uvrg_xxxx1, uvrgb1 @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
265 ldrh y1, [ v_b, #10 ] @ load y1
266
267 vmovl.u8 uvrg_xxxx2, uvrgb2 @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
268 ldrh y2, [ v_c, #10 ] @ load y2
269
270 vmov.u8 uvrg0b, uvrg0 @ uvrg0b = { uv0, rg0 }
271 vdup.u16 xxxx0, x0_y0[0] @ xxxx0 = { xx0, xx0 }
272
273 orr x1_x2, x1, x2, lsl #16 @ x1_x2 = { x1, x2 }
274 pkhbt x0_x1, x0, x1, lsl #16 @ x0_x1 = { x0, x1 }
275
276 vmov.u8 uvrg1b, uvrg1 @ uvrg1b = { uv1, rg1 }
277 vdup.u16 xxxx1, x1_y1[0] @ xxxx1 = { xx1, xx1 }
278
279 vmov.u8 uvrg2b, uvrg2 @ uvrg2b = { uv2, rg2 }
280 vdup.u16 xxxx2, x2_y2[0] @ xxxx2 = { xx2, xx2 }
281
282 ldrb b2, [ v_c, #4 ] @ load b2
283 orr y0_y1, y0, y1, lsl #16 @ y0_y1 = { y0, y1 }
284
285 ldrb b1, [ v_b, #4 ] @ load b1
286 orr y1_y2, y1, y2, lsl #16 @ y1_y2 = { y1, y2 }
287
288 vdup.u16 yyyy0, x0_y0[1] @ yyyy0 = { yy0, yy0 }
289 vsub.s16 d0_ab, x1_ab, x0_ab
290
291 ldrb b0, [ v_a, #4 ] @ load b0
292 orr b1_b2, b1, b2, lsl #16 @ b1_b2 = { b1, b2 }
293
294 vdup.u16 yyyy1, x1_y1[1] @ yyyy1 = { yy1, yy1 }
295 vsub.s16 d2_ab, x2_ab, x1_ab
296
297 vdup.u16 yyyy2, x2_y2[1] @ yyyy2 = { yy2, yy2 }
298 vsub.s16 d1_ab, y2_ab, y1_ab
299
300 orr b0_b1, b0, b1, lsl #16 @ b0_b1 = { b0, b1 }
301 ssub16 dx, x1_x2, x0_x1 @ dx = { x1 - x0, x2 - x1 }
302
303 ssub16 dy, y1_y2, y0_y1 @ dy = { y1 - y0, y2 - y1 }
304 ssub16 db, b1_b2, b0_b1 @ db = { b1 - b0, b2 - b1 }
305
306 vsub.s16 d3_ab, y1_ab, y0_ab
307 smusdx ga_by, dx, db @ ga_by = ((x1 - x0) * (b2 - b1)) -
308 @ ((x2 - x1) * (b1 - b0))
309 vmull.s16 ga_uvrg_x, d0_a, d1_a
310 smusdx ga_bx, db, dy @ ga_bx = ((b1 - b0) * (y2 - y1)) -
311 @ ((b2 - b1) * (y1 - y0))
312 vmlsl.s16 ga_uvrg_x, d2_a, d3_a
313 movs gs_bx, ga_bx, asr #31
314
315 vmull.s16 ga_uvrg_y, d0_b, d1_b
316 rsbmi ga_bx, ga_bx, #0
317
c6063f89 318 @ r12 = psx_gpu->uvrgb_phase
319 ldr r12, [ psx_gpu, #psx_gpu_uvrgb_phase_offset ]
320
75e28f62
E
321 vmlsl.s16 ga_uvrg_y, d2_b, d3_b
322 movs gs_by, ga_by, asr #31
323
324 vshr.u64 d0, d30, #22
c6063f89 325 add b_base, r12, b0, lsl #16
326
327 vdup.u32 uvrgb_phase, r12
75e28f62
E
328
329 rsbmi ga_by, ga_by, #0
330 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0
331
332 @ r12 = psx_gpu->triangle_winding_offset
333 ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ]
334 vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0
335
75e28f62
E
336 rsb r12, r12, #0 @ r12 = -(triangle->winding)
337
338 vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w }
339 sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS)
340
341 vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16
342 vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift }
343
c6063f89 344 vadd.u32 uvrg_base, uvrgb_phase
75e28f62
E
345 vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x)
346
347 vmov area_r_s, s0 @ area_r_s = triangle_reciprocal
348 vabs.s32 ga_uvrg_y, ga_uvrg_y @ ga_uvrg_y = abs(ga_uvrg_y)
349
350 vmull.u32 gw_rg_x, ga_rg_x, d0[0]
351 vmull.u32 gw_uv_x, ga_uv_x, d0[0]
352 vmull.u32 gw_rg_y, ga_rg_y, d0[0]
353 vmull.u32 gw_uv_y, ga_uv_y, d0[0]
354
355 vshl.u64 gw_rg_x, gw_rg_x, r_shift
356 vshl.u64 gw_uv_x, gw_uv_x, r_shift
357 vshl.u64 gw_rg_y, gw_rg_y, r_shift
358 vshl.u64 gw_uv_y, gw_uv_y, r_shift
359
360 veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
361 vmovn.u64 g_uv_x, gw_uv_x
362
363 veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
364 vmovn.u64 g_rg_x, gw_rg_x
365
366 veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
367 vmovn.u64 g_uv_y, gw_uv_y
368
369 vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
370 vmovn.u64 g_rg_y, gw_rg_y
371
372 veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
373 mov ga_bx, ga_bx, lsl #13
374
375 vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
376 mov ga_by, ga_by, lsl #13
377
378 vdup.u32 x0_y0, x0
379 umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
380
381 vshl.u32 g_uvrg_x, g_uvrg_x, #4
382 vshl.u32 g_uvrg_y, g_uvrg_y, #4
383
384 umull gw_by_l, gw_by_h, ga_by, area_r_s
385 vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
386
387 eor gs_bx, gs_bx, r12
388 vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
389
390 veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
391 eor gs_by, gs_by, r12
392
393 rsb r11, r14, #0 @ r11 = negative shift for scalar lsr
394 add store_a, psx_gpu, #psx_gpu_uvrg_offset
395
396 sub r11, r11, #(32 - 13)
397
398 add store_b, store_a, #16
399 mov store_inc, #32
400
401 vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
402 vst1.u32 { uvrg_base }, [ store_a, : 128 ], store_inc
403
404 vst1.u32 { uvrg_dx1 }, [ store_b, : 128 ], store_inc
405 mov g_bx, gw_bx_h, lsr r11
406
407 vst1.u32 { g_uvrg_y }, [ store_a, : 128 ], store_inc
408 mov g_by, gw_by_h, lsr r11
409
410 vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l }, \
411 [ store_b, : 128 ], store_inc
412 eor g_bx, g_bx, gs_bx
413
414 vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h }, \
415 [ store_b, : 128 ], store_inc
416 sub g_bx, g_bx, gs_bx
417
418 lsl g_bx, g_bx, #4
419 eor g_by, g_by, gs_by
420
421 mls b_base, g_bx, x0, b_base
422 sub g_by, g_by, gs_by
423
424 lsl g_by, g_by, #4
425 mov g_bx0, #0
426
427 add g_bx2, g_bx, g_bx
428 add g_bx3, g_bx, g_bx2
429
430 stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
431
 @ all results are in memory; restore the caller's q4 - q7 before returning
 vpop { q4 - q7 }
432 ldmia sp!, { r4 - r11, pc }
433
434
435#define psx_gpu r0
436#define v_a r1
437#define v_b r2
438#define v_c r3
439
440#define temp r14
441
442#define x_a r4
443#define x_b r5
444#define x_c r6
445#define y_a r1
446#define y_b r2
447#define y_c r3
448
449#define height_minor_a r7
450#define height_minor_b r8
451#define height_major r9
452#define height r9
453
454#define reciprocal_table_ptr r10
455
456#define edge_alt_low r4
457#define edge_alt_high r5
458#define edge_dx_dy_alt r6
459#define edge_shift_alt r10
460
461#define edge_dx_dy_alt_low r4
462#define edge_dx_dy_alt_high r5
463
464#define span_edge_data r4
465#define span_uvrg_offset r5
466#define span_b_offset r6
467
468#define clip r14
469
470#define b r11
471#define b_dy r12
472
473
474#define alternate_x q0
475#define alternate_dx_dy q1
476#define alternate_x_32 q2
477
478#define alternate_x_low d0
479#define alternate_x_high d1
480#define alternate_dx_dy_low d2
481#define alternate_dx_dy_high d3
482#define alternate_x_32_low d4
483#define alternate_x_32_high d5
484
485#define left_x q3
486#define right_x q4
487#define left_dx_dy q5
488#define right_dx_dy q6
489#define left_edge q7
490#define right_edge q8
491
492#define left_x_low d6
493#define left_x_high d7
494#define right_x_low d8
495#define right_x_high d9
496#define left_dx_dy_low d10
497#define left_dx_dy_high d11
498#define right_dx_dy_low d12
499#define right_dx_dy_high d13
500#define left_edge_low d14
501#define left_edge_high d15
502#define right_edge_low d16
503#define right_edge_high d17
504
505#define y_mid_point d18
506#define c_0x0004 d19
507
508#define left_right_x_16 q11
509#define span_shifts_y q12
510#define c_0x0001 q13
511
512#define span_shifts d24
513#define y_x4 d25
514#define c_0xFFFE d26
515#define c_0x0007 d27
516
517#define left_right_x_16_low d22
518#define left_right_x_16_high d23
519
520#define uvrg q14
521#define uvrg_dy q15
522
523#define alternate_x_16 d4
524
525#define v_clip q3
526#define v_clip_low d6
527
528#define right_x_32 q10
529#define left_x_32 q11
530#define alternate_select d24
531
532#define right_x_32_low d20
533#define right_x_32_high d21
534#define left_x_32_low d22
535#define left_x_32_high d23
536
537#define edges_xy q0
538#define edges_dx_dy d2
539#define edge_shifts d3
540#define edge_shifts_64 q2
541
542#define edges_xy_left d0
543#define edges_xy_right d1
544
545#define height_reciprocals d6
546#define heights d7
547
548#define widths d8
549#define c_0x01 d9
550#define x_starts d10
551#define x_ends d11
552
553#define heights_b d12
554#define edges_dx_dy_64 q10
555
556#define edges_dx_dy_64_left d20
557#define edges_dx_dy_64_right d21
558
559
// setup_spans_prologue(): common entry sequence of the setup_spans_*
// functions. Saves r4 - r11 and lr, loads the signed 16-bit x (offset 8)
// and y (offset 10) of each vertex into x_a..x_c / y_a..y_c, loads uvrg and
// uvrg_dy from psx_gpu, points reciprocal_table_ptr at reciprocal_table and
// sets c_0x01 to 1 in each 32-bit lane.
// y_a/y_b/y_c alias r1 - r3, so each y load overwrites the vertex pointer
// it was read through; the x loads therefore have to come first.
// NOTE(review): the setup_spans_* code writes q4 - q7 (d8 - d15), which
// AAPCS lists as callee-saved, and nothing here saves them - confirm that
// all callers tolerate this.
560#define setup_spans_prologue() \
561 stmdb sp!, { r4 - r11, lr }; \
562 \
563 ldrsh x_a, [ v_a, #8 ]; \
564 ldrsh x_b, [ v_b, #8 ]; \
565 ldrsh x_c, [ v_c, #8 ]; \
566 ldrsh y_a, [ v_a, #10 ]; \
567 ldrsh y_b, [ v_b, #10 ]; \
568 ldrsh y_c, [ v_c, #10 ]; \
569 \
570 add temp, psx_gpu, #psx_gpu_uvrg_offset; \
571 vld1.32 { uvrg }, [ temp ]; \
572 add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
573 vld1.32 { uvrg_dy }, [ temp ]; \
5d834c08 574 load_pointer(reciprocal_table_ptr, reciprocal_table); \
75e28f62
E
575 \
576 vmov.u32 c_0x01, #0x01 \
577
// setup_spans_load_b(): load the b interpolant and its step b_dy from
// psx_gpu. b and b_dy live in r11/r12, which other macros use as scratch
// (height_reciprocal_alt/height_b_alt, edge_adjust_low/high), so this runs
// after those are finished.
578#define setup_spans_load_b() \
579 ldr b, [ psx_gpu, #psx_gpu_b_offset ]; \
580 ldr b_dy, [ psx_gpu, #psx_gpu_b_dy_offset ] \
581
// setup_spans_prologue_b(): set up the output pointers and constants used by
// the span emission loop. Points span_uvrg_offset, span_edge_data and
// span_b_offset at their arrays inside psx_gpu, broadcasts viewport_start_x
// into every 16-bit lane of left_edge and viewport_end_x + 1 into every lane
// of right_edge, and loads c_0x0004, c_0x0007 and c_0xFFFE.
// The output pointers reuse r4 - r6 (x_a/x_b/x_c and the scalar edge
// registers), so those values must no longer be needed.
// c_0x0001 is q13, which overlaps c_0xFFFE (d26) and c_0x0007 (d27); it is
// consumed by the right_edge add before those two are written.
582#define setup_spans_prologue_b() \
583 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
584 add temp, psx_gpu, #psx_gpu_viewport_start_x_offset; \
585 \
586 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
587 vmov.u16 c_0x0004, #0x0004; \
588 \
589 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
590 vmov.u16 c_0x0001, #0x0001; \
591 \
592 vld1.u16 { left_edge_low[], left_edge_high[] }, [ temp ]; \
593 add temp, psx_gpu, #psx_gpu_viewport_end_x_offset; \
594 \
595 vld1.u16 { right_edge_low[], right_edge_high[] }, [ temp ]; \
596 vadd.u16 right_edge, right_edge, c_0x0001; \
597 \
598 vmov.u16 c_0x0007, #0x0007; \
599 vmvn.u16 c_0xFFFE, #0x0001 \
600
601
// compute_edge_delta_x2(): set up two edges that share one height.
// Expects x_starts / x_ends (one 32-bit lane per edge) and `height`.
// Loads reciprocal_table[height]; the entry shifted right by 10 is used as
// the height reciprocal, and the entry with bits 5 - 7 of each 16-bit lane
// cleared is kept in edge_shifts for the later per-edge shift.
// Results:
//   edges_dx_dy = (x_ends - x_starts) * reciprocal             (32-bit)
//   edges_xy    = ((height - 1) + x_starts * height) * reciprocal (64-bit)
602#define compute_edge_delta_x2() \
603 ldr temp, [ reciprocal_table_ptr, height, lsl #2 ]; \
604 \
605 vdup.u32 heights, height; \
606 vsub.u32 widths, x_ends, x_starts; \
607 \
608 vdup.u32 edge_shifts, temp; \
609 vsub.u32 heights_b, heights, c_0x01; \
7d5140f5 610 vshr.u32 height_reciprocals, edge_shifts, #10; \
75e28f62
E
611 \
612 vmla.s32 heights_b, x_starts, heights; \
613 vbic.u16 edge_shifts, #0xE0; \
614 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
615 vmull.s32 edges_xy, heights_b, height_reciprocals \
616
// Scratch registers of the scalar ("alternate") edge in
// compute_edge_delta_x3. width_alt shares r6 with x_c and edge_dx_dy_alt;
// r11/r12 are also b/b_dy, which are loaded only later.
617#define width_alt r6
618#define height_reciprocal_alt r11
619#define height_b_alt r12
620
// compute_edge_delta_x3(start_c, height_a, height_b): set up three edges.
// Two edges are computed in the NEON lanes exactly as in
// compute_edge_delta_x2, with heights height_a (lane 0) and height_b
// (lane 1). The third edge is computed in core registers from
// height_minor_b and the width x_c - start_c:
//   edge_dx_dy_alt            = width * reciprocal
//   edge_alt_high:edge_alt_low = ((height_minor_b - 1) +
//                                 height_minor_b * start_c) * reciprocal
//   edge_shift_alt            = table entry & 0x1F
// edge_shift_alt aliases reciprocal_table_ptr (r10), so the table pointer is
// gone after the third load; edge_alt_low/high alias x_a/x_b (r4/r5), which
// is why the smull comes last.
621#define compute_edge_delta_x3(start_c, height_a, height_b) \
622 vmov.u32 heights, height_a, height_b; \
623 ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \
624 vmov.u32 edge_shifts[0], temp; \
625 ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \
626 vmov.u32 edge_shifts[1], temp; \
627 ldr edge_shift_alt, [ reciprocal_table_ptr, height_minor_b, lsl #2 ]; \
628 \
629 vsub.u32 widths, x_ends, x_starts; \
630 sub width_alt, x_c, start_c; \
631 \
632 vsub.u32 heights_b, heights, c_0x01; \
633 sub height_b_alt, height_minor_b, #1; \
634 \
7d5140f5
E
635 vshr.u32 height_reciprocals, edge_shifts, #10; \
636 lsr height_reciprocal_alt, edge_shift_alt, #10; \
75e28f62
E
637 \
638 vmla.s32 heights_b, x_starts, heights; \
639 mla height_b_alt, height_minor_b, start_c, height_b_alt; \
640 \
641 vbic.u16 edge_shifts, #0xE0; \
642 and edge_shift_alt, edge_shift_alt, #0x1F; \
643 \
644 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
645 mul edge_dx_dy_alt, width_alt, height_reciprocal_alt; \
646 \
647 vmull.s32 edges_xy, heights_b, height_reciprocals; \
648 smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt \
649
650
// Per-iteration stepping helpers, selected by the `direction` token
// (up / down) of the span macros.
// setup_spans_adjust_y_*: move the four row numbers in y_x4 by 4.
// setup_spans_adjust_interpolants_*: move uvrg and b by one row step
// (uvrg_dy / b_dy).
651#define setup_spans_adjust_y_up() \
652 vsub.u32 y_x4, y_x4, c_0x0004 \
653
654#define setup_spans_adjust_y_down() \
655 vadd.u32 y_x4, y_x4, c_0x0004 \
656
657#define setup_spans_adjust_interpolants_up() \
658 vsub.u32 uvrg, uvrg, uvrg_dy; \
659 sub b, b, b_dy \
660
661#define setup_spans_adjust_interpolants_down() \
662 vadd.u32 uvrg, uvrg, uvrg_dy; \
663 add b, b, b_dy \
664
665
// Clipping helpers: skip `clip` rows at the start of an edge walk.
// setup_spans_clip_interpolants_increment/decrement: b and uvrg move by
//   clip * b_dy and clip * uvrg_dy (added or subtracted).
// setup_spans_clip_alternate_yes: the scalar edge advances by
//   edge_dx_dy_alt * clip (64-bit accumulate); the _no variant is empty.
// setup_spans_clip(direction, alternate_active): broadcasts clip into
//   v_clip, applies the two helpers above and advances both NEON edges by
//   edges_dx_dy * clip. v_clip (q3) overwrites height_reciprocals/heights,
//   which must already have been consumed.
666#define setup_spans_clip_interpolants_increment() \
667 mla b, b_dy, clip, b; \
668 vmla.s32 uvrg, uvrg_dy, v_clip \
669
670#define setup_spans_clip_interpolants_decrement() \
671 mls b, b_dy, clip, b; \
672 vmls.s32 uvrg, uvrg_dy, v_clip \
673
674#define setup_spans_clip_alternate_yes() \
675 smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip \
676
677#define setup_spans_clip_alternate_no() \
678
679#define setup_spans_clip(direction, alternate_active) \
680 vdup.u32 v_clip, clip; \
681 setup_spans_clip_alternate_##alternate_active(); \
682 setup_spans_clip_interpolants_##direction(); \
683 vmlal.s32 edges_xy, edges_dx_dy, v_clip_low \
684
685
// setup_spans_adjust_edges_alternate_no(left_index, right_index): turn the
// two edges in edges_xy / edges_dx_dy into the left/right walking state.
// left_index and right_index are the tokens `left` or `right` and pick which
// lane of edges_xy / edges_dx_dy_64 becomes the left and the right edge.
// Both x and dx_dy are widened to 64 bits and shifted left by their
// per-edge shift. On exit left_x = { x, x + dx_dy }, right_x likewise, and
// left_dx_dy / right_dx_dy hold 2 * dx_dy in both halves, so one add of
// dx_dy advances a q register by two rows.
686#define setup_spans_adjust_edges_alternate_no(left_index, right_index) \
687 vmovl.s32 edge_shifts_64, edge_shifts; \
688 vmovl.s32 edges_dx_dy_64, edges_dx_dy; \
689 \
690 vshl.s64 edges_xy, edges_xy, edge_shifts_64; \
691 vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64; \
692 \
693 vmov left_x_low, edges_xy_##left_index; \
694 vmov right_x_low, edges_xy_##right_index; \
695 \
696 vmov left_dx_dy_low, edges_dx_dy_64_##left_index; \
697 vmov left_dx_dy_high, edges_dx_dy_64_##left_index; \
698 vmov right_dx_dy_low, edges_dx_dy_64_##right_index; \
699 vmov right_dx_dy_high, edges_dx_dy_64_##right_index; \
700 \
701 vadd.u64 left_x_high, left_x_low, left_dx_dy_low; \
702 vadd.u64 right_x_high, right_x_low, right_dx_dy_low; \
703 \
704 vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy; \
705 vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy \
706
707
// setup_spans_adjust_edges_alternate_yes(left_index, right_index): same as
// the _no variant, plus the third (scalar) edge. Broadcasts y_b into
// y_mid_point, shifts the 64-bit edge_alt value and the sign-extended
// edge_dx_dy_alt left by edge_shift_alt using core registers, and moves
// them into alternate_x = { x, x + dx_dy } and alternate_dx_dy = 2 * dx_dy.
// edge_dx_dy_alt_low/high reuse r4/r5 (edge_alt_low/high), so alternate_x is
// transferred to NEON before they are written.
708#define setup_spans_adjust_edges_alternate_yes(left_index, right_index) \
709 setup_spans_adjust_edges_alternate_no(left_index, right_index); \
710 \
711 vdup.u16 y_mid_point, y_b; \
712 rsb temp, edge_shift_alt, #32; \
713 \
714 lsl edge_alt_high, edge_alt_high, edge_shift_alt; \
715 orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp; \
716 lsl edge_alt_low, edge_alt_low, edge_shift_alt; \
717 vmov alternate_x_low, edge_alt_low, edge_alt_high; \
718 \
719 asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp; \
720 lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt; \
721 vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high; \
722 vmov alternate_dx_dy_high, alternate_dx_dy_low; \
723 \
724 vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low; \
725 vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy \
726
727
// Selection of the alternate edge for rows beyond the middle vertex.
// setup_spans_y_select_up/down: alternate_select gets an all-ones 16-bit
//   lane for every row in y_x4 that is below (up) / above (down)
//   y_mid_point, compared as signed values.
// setup_spans_alternate_select_left/right: for the selected lanes, replace
//   the left (low half) or right (high half) x in left_right_x_16 with
//   alternate_x_16.
728#define setup_spans_y_select_up() \
729 vclt.s16 alternate_select, y_x4, y_mid_point \
730
731#define setup_spans_y_select_down() \
732 vcgt.s16 alternate_select, y_x4, y_mid_point \
733
734
735#define setup_spans_alternate_select_left() \
736 vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select \
737
738#define setup_spans_alternate_select_right() \
739 vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select \
740
741
// setup_spans_set_x4_alternate_yes(alternate, direction): emit four rows of
// span data for a triangle with three active edges.
//   alternate: `left` or `right` - the side the alternate edge replaces
//   direction: `up` or `down`    - stepping direction for y and interpolants
// Steps:
//  - take bits 32 and up of the 64-bit left/right/alternate x accumulators
//    for four consecutive rows (advancing each accumulator twice) and
//    narrow them to 16 bits;
//  - substitute the alternate x on rows selected by setup_spans_y_select_*;
//  - store uvrg (16 bytes) and b (4 bytes) for each of the four rows,
//    stepping the interpolants after every store;
//  - clamp the x values to [left_edge, right_edge];
//  - from width = right - left compute (width + 7) >> 3 and the mask
//    0xFFFE << ((width + 7) & 7);
//  - store four 8-byte records { left x, (width + 7) >> 3, mask, y } with
//    one vst4.u16, then step y_x4 by four rows.
742#define setup_spans_set_x4_alternate_yes(alternate, direction) \
743 vshrn.s64 alternate_x_32_low, alternate_x, #32; \
744 vshrn.s64 left_x_32_low, left_x, #32; \
745 vshrn.s64 right_x_32_low, right_x, #32; \
746 \
747 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
748 vadd.u64 left_x, left_x, left_dx_dy; \
749 vadd.u64 right_x, right_x, right_dx_dy; \
750 \
751 vshrn.s64 alternate_x_32_high, alternate_x, #32; \
752 vshrn.s64 left_x_32_high, left_x, #32; \
753 vshrn.s64 right_x_32_high, right_x, #32; \
754 \
755 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
756 vadd.u64 left_x, left_x, left_dx_dy; \
757 vadd.u64 right_x, right_x, right_dx_dy; \
758 \
759 vmovn.u32 alternate_x_16, alternate_x_32; \
760 setup_spans_y_select_##direction(); \
761 vmovn.u32 left_right_x_16_low, left_x_32; \
762 \
763 vmovn.u32 left_right_x_16_high, right_x_32; \
764 setup_spans_alternate_select_##alternate(); \
765 \
766 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
767 str b, [ span_b_offset ], #4; \
768 setup_spans_adjust_interpolants_##direction(); \
769 \
770 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
771 \
772 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
773 str b, [ span_b_offset ], #4; \
774 setup_spans_adjust_interpolants_##direction(); \
775 \
776 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
777 \
778 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
779 str b, [ span_b_offset ], #4; \
780 setup_spans_adjust_interpolants_##direction(); \
781 \
782 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
783 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
784 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
785 \
786 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
787 str b, [ span_b_offset ], #4; \
788 setup_spans_adjust_interpolants_##direction(); \
789 \
790 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
791 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
792 \
793 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
794 \
795 setup_spans_adjust_y_##direction() \
796
797
// setup_spans_set_x4_alternate_no(alternate, direction): emit four rows of
// span data for a triangle with only a left and a right edge. `alternate`
// is unused here; `direction` is `up` or `down`.
// Takes bits 32 and up of the left/right x accumulators for four rows,
// narrows them to 16 bits, stores uvrg and b for each row while stepping
// the interpolants, clamps x to [left_edge, right_edge], and stores four
// 8-byte records { left x, (width + 7) >> 3, 0xFFFE << ((width + 7) & 7),
// y } with one vst4.u16 before stepping y_x4 by four rows.
// This macro does not write q0 - q2.
798#define setup_spans_set_x4_alternate_no(alternate, direction) \
799 vshrn.s64 left_x_32_low, left_x, #32; \
800 vshrn.s64 right_x_32_low, right_x, #32; \
801 \
802 vadd.u64 left_x, left_x, left_dx_dy; \
803 vadd.u64 right_x, right_x, right_dx_dy; \
804 \
805 vshrn.s64 left_x_32_high, left_x, #32; \
806 vshrn.s64 right_x_32_high, right_x, #32; \
807 \
808 vadd.u64 left_x, left_x, left_dx_dy; \
809 vadd.u64 right_x, right_x, right_dx_dy; \
810 \
811 vmovn.u32 left_right_x_16_low, left_x_32; \
812 vmovn.u32 left_right_x_16_high, right_x_32; \
813 \
814 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
815 str b, [ span_b_offset ], #4; \
816 setup_spans_adjust_interpolants_##direction(); \
817 \
818 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
819 \
820 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
821 str b, [ span_b_offset ], #4; \
822 setup_spans_adjust_interpolants_##direction(); \
823 \
824 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
825 \
826 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
827 str b, [ span_b_offset ], #4; \
828 setup_spans_adjust_interpolants_##direction(); \
829 \
830 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
831 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
832 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
833 \
834 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
835 str b, [ span_b_offset ], #4; \
836 setup_spans_adjust_interpolants_##direction(); \
837 \
838 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
839 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
840 \
841 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
842 \
843 setup_spans_adjust_y_##direction() \
844
845
// setup_spans_alternate_adjust_yes(): subtract the 64-bit product
// edge_dx_dy_alt * height_minor_a from edge_alt_high:edge_alt_low.
// The _no variant expands to nothing.
// edge_adjust_low/high are r11/r12, the same registers as b/b_dy, so the
// span macros invoke this before setup_spans_load_b().
846#define edge_adjust_low r11
847#define edge_adjust_high r12
848
849#define setup_spans_alternate_adjust_yes() \
850 smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a; \
851 subs edge_alt_low, edge_alt_low, edge_adjust_low; \
852 sbc edge_alt_high, edge_alt_high, edge_adjust_high \
855
856
// setup_spans_down(left_index, right_index, alternate, alternate_active):
// clip and emit spans walking downwards from y_a.
//   left_index / right_index: `left` or `right`, lane selection for the edges
//   alternate:                side replaced by the third edge (or `none`)
//   alternate_active:         `yes` or `no`, whether a third edge exists
// Steps:
//  - if y_c is past viewport_end_y, height is reduced by the overshoot
//    and then incremented by one;
//  - if y_a is above viewport_start_y, the first `clip` rows are skipped:
//    height and y_a are adjusted and edges and interpolants are advanced;
//  - if height <= 0 nothing is emitted (label 1);
//  - y_x4 is built as the 16-bit lanes { y_a, y_a + 1, y_a + 2, y_a + 3 },
//    height is stored to psx_gpu->num_spans, and the loop at label 2 emits
//    four rows per iteration until height is used up.
// NOTE(review): the `orr temp, y_a, y_a, lsl #16` packing presumably relies
// on y_a being non-negative at that point - confirm.
// NOTE(review): the viewport fields are read with ldrsh here but with ldrh
// in setup_spans_up and setup_spans_up_down - confirm which signedness is
// intended.
857#define setup_spans_down(left_index, right_index, alternate, alternate_active) \
858 setup_spans_alternate_adjust_##alternate_active(); \
859 setup_spans_load_b(); \
860 \
861 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
862 subs y_c, y_c, temp; \
863 subgt height, height, y_c; \
864 addgt height, height, #1; \
865 \
866 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
867 subs clip, temp, y_a; \
868 ble 0f; \
869 \
870 sub height, height, clip; \
871 add y_a, y_a, clip; \
872 setup_spans_clip(increment, alternate_active); \
873 \
874 0: \
875 cmp height, #0; \
876 ble 1f; \
877 \
878 orr temp, y_a, y_a, lsl #16; \
879 add temp, temp, #(1 << 16); \
880 add y_a, temp, #2; \
881 add y_a, y_a, #(2 << 16); \
882 vmov.u32 y_x4, temp, y_a; \
883 \
884 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
885 right_index); \
886 setup_spans_prologue_b(); \
887 \
888 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
889 \
890 2: \
891 setup_spans_set_x4_alternate_##alternate_active(alternate, down); \
892 subs height, height, #4; \
893 bhi 2b; \
894 \
895 1: \
896
897
// Helpers used only by setup_spans_up; the _no variants expand to nothing.
// setup_spans_alternate_pre_increment_yes: add the sign-extended 32-bit
//   edge_dx_dy_alt to the 64-bit edge_alt value (one row step).
// setup_spans_up_decrement_yes: decrement height when the flags left by the
//   preceding `subs temp, temp, y_c` in setup_spans_up indicate "le".
898#define setup_spans_alternate_pre_increment_yes() \
899 adds edge_alt_low, edge_alt_low, edge_dx_dy_alt; \
900 adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31 \
901
902#define setup_spans_alternate_pre_increment_no() \
903
904
905#define setup_spans_up_decrement_yes() \
906 suble height, height, #1 \
907
908#define setup_spans_up_decrement_no() \
909
910
// setup_spans_up(left_index, right_index, alternate, alternate_active):
// clip and emit spans walking upwards; parameters as for setup_spans_down.
// Steps:
//  - y_a is decremented first, so the walk starts one row above the vertex;
//  - if y_c is above viewport_start_y, height is reduced by the overshoot
//    (otherwise, with a third edge, height is decremented by one);
//  - if y_a is past viewport_end_y, the first `clip` rows are skipped:
//    height and y_a are adjusted and edges and interpolants are moved back;
//  - if height <= 0 nothing is emitted (label 1);
//  - y_x4 is built as { y_a, y_a - 1, y_a - 2, y_a - 3 }, the edges and
//    interpolants are moved one row so they match the decremented y_a,
//    height is stored to psx_gpu->num_spans, and the loop at label 2 emits
//    four rows per iteration.
// NOTE(review): the viewport fields are read with ldrh here but with ldrsh
// in setup_spans_down - confirm which signedness is intended.
911#define setup_spans_up(left_index, right_index, alternate, alternate_active) \
912 setup_spans_alternate_adjust_##alternate_active(); \
913 setup_spans_load_b(); \
914 sub y_a, y_a, #1; \
915 \
916 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
917 subs temp, temp, y_c; \
918 subgt height, height, temp; \
919 setup_spans_up_decrement_##alternate_active(); \
920 \
921 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
922 subs clip, y_a, temp; \
923 ble 0f; \
924 \
925 sub height, height, clip; \
926 sub y_a, y_a, clip; \
927 setup_spans_clip(decrement, alternate_active); \
928 \
929 0: \
930 cmp height, #0; \
931 ble 1f; \
932 \
933 orr temp, y_a, y_a, lsl #16; \
934 sub temp, temp, #(1 << 16); \
935 sub y_a, temp, #2; \
936 sub y_a, y_a, #(2 << 16); \
937 vmov.u32 y_x4, temp, y_a; \
938 \
939 vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
940 \
941 setup_spans_alternate_pre_increment_##alternate_active(); \
942 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
943 right_index); \
944 setup_spans_adjust_interpolants_up(); \
945 setup_spans_prologue_b(); \
946 \
947 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
948 \
949 2: \
950 setup_spans_set_x4_alternate_##alternate_active(alternate, up); \
951 subs height, height, #4; \
952 bhi 2b; \
953 \
954 1: \
955
956
// setup_spans_epilogue(): restore r4 - r11 saved by setup_spans_prologue()
// and return by loading the saved lr into pc.
957#define setup_spans_epilogue() \
958 ldmia sp!, { r4 - r11, pc } \
959
960
// setup_spans_up_up(minor, major): body of the functions for a triangle
// whose spans all run upwards from v_a, with v_b as the middle vertex.
// `minor` and `major` are the tokens `left`/`right` in either order.
// Heights: height_minor_a = y_a - y_b, height_minor_b = y_b - y_c,
// height = y_a - y_c. The two NEON edges start at x_a and end at x_c
// (lane 0, full height) and x_b (lane 1, height_minor_a); the third edge
// runs from x_b to x_c and is enabled as the alternate edge on the `minor`
// side.
961#define setup_spans_up_up(minor, major) \
962 setup_spans_prologue(); \
963 sub height_minor_a, y_a, y_b; \
964 sub height_minor_b, y_b, y_c; \
965 sub height, y_a, y_c; \
966 \
967 vdup.u32 x_starts, x_a; \
968 vmov.u32 x_ends, x_c, x_b; \
969 \
970 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
971 setup_spans_up(major, minor, minor, yes); \
972 setup_spans_epilogue() \
973
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c
974function(setup_spans_up_left)
975 setup_spans_up_up(left, right)
976
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c
977function(setup_spans_up_right)
978 setup_spans_up_up(right, left)
979
5d834c08 980.pool
75e28f62
E
981
// setup_spans_down_down(minor, major): body of the functions for a triangle
// whose spans all run downwards from v_a, with v_b as the middle vertex.
// `minor` and `major` are the tokens `left`/`right` in either order.
// Heights: height_minor_a = y_b - y_a, height_minor_b = y_c - y_b,
// height = y_c - y_a. The two NEON edges start at x_a and end at x_c
// (lane 0, full height) and x_b (lane 1, height_minor_a); the third edge
// runs from x_b to x_c and is enabled as the alternate edge on the `minor`
// side.
982#define setup_spans_down_down(minor, major) \
983 setup_spans_prologue(); \
984 sub height_minor_a, y_b, y_a; \
985 sub height_minor_b, y_c, y_b; \
986 sub height, y_c, y_a; \
987 \
988 vdup.u32 x_starts, x_a; \
989 vmov.u32 x_ends, x_c, x_b; \
990 \
991 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
992 setup_spans_down(major, minor, minor, yes); \
993 setup_spans_epilogue() \
994
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c
995function(setup_spans_down_left)
996 setup_spans_down_down(left, right)
997
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c
998function(setup_spans_down_right)
999 setup_spans_down_down(right, left)
1000
1001
// setup_spans_up_flat(): shared tail of setup_spans_up_a / _up_b. With
// x_starts and x_ends already set by the caller, walks two edges upwards
// over height = y_a - y_c rows (no alternate edge) and returns.
1002#define setup_spans_up_flat() \
1003 sub height, y_a, y_c; \
1004 \
1005 compute_edge_delta_x2(); \
1006 setup_spans_up(left, right, none, no); \
1007 setup_spans_epilogue() \
1008
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c
// Edges start at x_a (left) and x_b (right) and both end at x_c.
1009function(setup_spans_up_a)
1010 setup_spans_prologue()
1011
1012 vmov.u32 x_starts, x_a, x_b
1013 vdup.u32 x_ends, x_c
1014
1015 setup_spans_up_flat()
1016
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c
// Both edges start at x_a and end at x_b (left) and x_c (right).
1017function(setup_spans_up_b)
1018 setup_spans_prologue()
1019
1020 vdup.u32 x_starts, x_a
1021 vmov.u32 x_ends, x_b, x_c
1022
1023 setup_spans_up_flat()
1024
// setup_spans_down_flat(): shared tail of setup_spans_down_a / _down_b. With
// x_starts and x_ends already set by the caller, walks two edges downwards
// over height = y_c - y_a rows (no alternate edge) and returns.
1025#define setup_spans_down_flat() \
1026 sub height, y_c, y_a; \
1027 \
1028 compute_edge_delta_x2(); \
1029 setup_spans_down(left, right, none, no); \
1030 setup_spans_epilogue() \
1031
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c
// Edges start at x_a (left) and x_b (right) and both end at x_c.
1032function(setup_spans_down_a)
1033 setup_spans_prologue()
1034
1035 vmov.u32 x_starts, x_a, x_b
1036 vdup.u32 x_ends, x_c
1037
1038 setup_spans_down_flat()
1039
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c
// Both edges start at x_a and end at x_b (left) and x_c (right).
1040function(setup_spans_down_b)
1041 setup_spans_prologue()
1042
1043 vdup.u32 x_starts, x_a
1044 vmov.u32 x_ends, x_b, x_c
1045
1046 setup_spans_down_flat()
1047
1048
1049#define middle_y r9
1050
1051#define edges_xy_b q11
1052#define edges_dx_dy_b d26
1053#define edge_shifts_b d27
1054#define edges_dx_dy_and_shifts_b q13
1055#define height_increment d20
1056
1057#define edges_dx_dy_and_shifts q1
1058
1059#define edges_xy_b_left d22
1060#define edges_xy_b_right d23
1061
// setup_spans_up_down_load_edge_set_b(): copy the parked second edge set
// (edges_xy_b and edges_dx_dy_b / edge_shifts_b) into the working registers
// edges_xy and edges_dx_dy / edge_shifts.
1062#define setup_spans_up_down_load_edge_set_b() \
1063 vmov edges_xy, edges_xy_b; \
1064 vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b \
1065
1066
@ setup_spans_up_down(psx_gpu, v_a, v_b, v_c)
@ r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c
@
@ Emits spans in two passes that both start from v_a's row: first upwards
@ towards y_b (height_minor_a = y_a - y_b rows), then downwards towards y_c
@ (height_minor_b = y_c - y_a rows). The edge set for the second pass is
@ parked in edges_xy_b / edges_dx_dy_and_shifts_b and copied into edges_xy /
@ edges_dx_dy_and_shifts before the first loop runs; that loop does not
@ write those registers.
@ psx_gpu->num_spans is set by the first pass and incremented by the second.
@ NOTE(review): when the first pass is skipped (label 3) num_spans is not
@ written before the second pass adds to it - presumably the caller zeroes
@ it beforehand; confirm.
1067function(setup_spans_up_down)
1068 setup_spans_prologue()
1069
1070 // s32 middle_y = y_a;
1071 sub height_minor_a, y_a, y_b
1072 sub height_minor_b, y_c, y_a
1073 sub height_major, y_c, y_b
1074
1075 vmov.u32 x_starts, x_a, x_c
1076 vdup.u32 x_ends, x_b
1077
1078 compute_edge_delta_x3(x_a, height_minor_a, height_major)
1079
 @ advance lane 1 of edges_xy by height_minor_b rows (lane 0 by 0 rows)
1080 mov temp, #0
1081 vmov.u32 height_increment, temp, height_minor_b
1082 vmlal.s32 edges_xy, edges_dx_dy, height_increment
1083
 @ build edge set b: scalar edge on the left, negated lane-1 edge on the right
1084 vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1085 vmov edges_xy_b_right, edges_xy_right
1086
1087 vmov edge_shifts_b, edge_shifts
1088 vmov.u32 edge_shifts_b[0], edge_shift_alt
1089
1090 vneg.s32 edges_dx_dy_b, edges_dx_dy
1091 vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1092
 @ middle_y is r9, the same register as height_major, which is dead by now
1093 mov middle_y, y_a
1094
1095 setup_spans_load_b()
1096 sub y_a, y_a, #1
1097
 @ first pass (upwards): clip against the viewport
1098 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1099 subs temp, temp, y_b
1100 subgt height_minor_a, height_minor_a, temp
1101
1102 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1103 subs clip, y_a, temp
1104 ble 0f
1105
1106 sub height_minor_a, height_minor_a, clip
1107 sub y_a, y_a, clip
1108 setup_spans_clip(decrement, no)
1109
1110 0:
1111 cmp height_minor_a, #0
1112 ble 3f
1113
 @ y_x4 = { y_a, y_a - 1, y_a - 2, y_a - 3 } as 16-bit lanes
1114 orr temp, y_a, y_a, lsl #16
1115 sub temp, temp, #(1 << 16)
1116 sub y_a, temp, #2
1117 sub y_a, y_a, #(2 << 16)
1118 vmov.u32 y_x4, temp, y_a
1119
1120 vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1121
1122 strh height_minor_a, [ psx_gpu, #psx_gpu_num_spans_offset ]
1123
1124 setup_spans_adjust_edges_alternate_no(left, right);
1125 setup_spans_adjust_interpolants_up()
1126 setup_spans_up_down_load_edge_set_b()
1127
1128 setup_spans_prologue_b()
1129
1130
1131 2:
1132 setup_spans_set_x4_alternate_no(none, up)
1133 subs height_minor_a, height_minor_a, #4
1134 bhi 2b
1135
 @ height_minor_a is now <= 0: step the output pointers back over the rows
 @ the last iteration wrote beyond the requested count
1136 add span_edge_data, span_edge_data, height_minor_a, lsl #3
1137 add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1138 add span_b_offset, span_b_offset, height_minor_a, lsl #2
1139
 @ second pass (downwards): restart the interpolants from psx_gpu at middle_y
1140 4:
1141 add temp, psx_gpu, #psx_gpu_uvrg_offset
1142 vld1.32 { uvrg }, [ temp ]
1143 mov y_a, middle_y
1144
1145 setup_spans_load_b()
1146
1147 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1148 subs y_c, y_c, temp
1149 subgt height_minor_b, height_minor_b, y_c
1150 addgt height_minor_b, height_minor_b, #1
1151
1152 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1153 subs clip, temp, y_a
1154 ble 0f
1155
1156 sub height_minor_b, height_minor_b, clip
1157 add y_a, y_a, clip
1158 setup_spans_clip(increment, no)
1159
1160 0:
1161 cmp height_minor_b, #0
1162 ble 1f
1163
 @ y_x4 = { y_a, y_a + 1, y_a + 2, y_a + 3 } as 16-bit lanes
1164 orr temp, y_a, y_a, lsl #16
1165 add temp, temp, #(1 << 16)
1166 add y_a, temp, #2
1167 add y_a, y_a, #(2 << 16)
1168 vmov.u32 y_x4, temp, y_a
1169
1170 setup_spans_adjust_edges_alternate_no(left, right)
1171
1172 ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1173 add temp, temp, height_minor_b
1174 strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1175
1176 2:
1177 setup_spans_set_x4_alternate_no(none, down)
1178 subs height_minor_b, height_minor_b, #4
1179 bhi 2b
1180
1181 1:
1182 setup_spans_epilogue()
1183
 @ first pass empty: still load edge set b and the output pointers and
 @ constants, then join the second pass
1184 3:
1185 setup_spans_up_down_load_edge_set_b()
1186 setup_spans_prologue_b()
1187 bal 4b
1188
5d834c08 1189.pool
75e28f62
E
1190
/*
 * Register aliases for the setup_blocks_* routines that follow. The names
 * carried over from the span setup code are dropped first and then rebound.
 * Many aliases deliberately share one register (for example uvrg_dx_ptr and
 * span_uvrg_offset are both r2, dx4 and dx8 are both q13, and u_whole_8b,
 * r_whole_8 and b_whole_low are all d24), so the instruction order inside
 * the routines matters.
 */
1191#undef span_uvrg_offset
1192#undef span_edge_data
1193#undef span_b_offset
1194#undef left_x
1195#undef b
1196
/* Core (integer) registers used across the whole routine. */
1197#define psx_gpu r0
1198#define num_spans r1
1199#define span_uvrg_offset r2
1200#define span_edge_data r3
1201#define span_b_offset r4
1202#define b_dx r5
1203#define span_num_blocks r6
1204#define y r7
1205#define left_x r8
1206#define b r9
1207#define dither_offset_ptr r10
1208#define block_ptr_a r11
1209#define fb_ptr r12
1210#define num_blocks r14
1211
/* Short-lived names that reuse registers from the list above. */
1212#define uvrg_dx_ptr r2
1213#define texture_mask_ptr r3
1214#define dither_shift r8
1215#define dither_row r10
1216
1217#define c_32 r7
1218#define b_dx4 r8
1219#define b_dx8 r9
1220#define block_ptr_b r10
1221
1222#define block_span_ptr r10
1223#define right_mask r8
1224
/* Names used by the flat-colour (untextured, unshaded) routines. */
1225#define color r2
1226#define color_r r3
1227#define color_g r4
1228#define color_b r5
1229
1230#undef uvrg
1231
/* Per-channel 32-bit accumulators, four lanes each. */
1232#define u_block q0
1233#define v_block q1
1234#define r_block q2
1235#define g_block q3
1236#define b_block q4
1237
/* Per-span deltas scaled by 4 and by 8, plus narrowed b and the mask/pointer pair. */
1238#define uv_dx4 d10
1239#define rg_dx4 d11
1240#define uv_dx8 d12
1241#define rg_dx8 d13
1242#define b_whole_8 d14
1243#define fb_mask_ptrs d15
1244
1245#define uvrg_dx4 q5
1246#define uvrg_dx8 q6
1247#define uv_dx8 d12
1248#define rg_dx8 d13
1249
/* 16-bit-per-lane results of narrowing the accumulators (8 lanes each). */
1250#define u_whole q8
1251#define v_whole q9
1252#define r_whole q10
1253#define g_whole q11
1254#define b_whole q12
1255
1256#define u_whole_low d16
1257#define u_whole_high d17
1258#define v_whole_low d18
1259#define v_whole_high d19
1260#define r_whole_low d20
1261#define r_whole_high d21
1262#define g_whole_low d22
1263#define g_whole_high d23
1264#define b_whole_low d24
1265#define b_whole_high d25
1266
/* Scratch for a broadcast delta; both names refer to the same register. */
1267#define dx4 q13
1268#define dx8 q13
1269
/* 8-bit-per-lane results; these overlap dx4/dx8 (q13) and b_whole (q12). */
1270#define u_whole_8 d26
1271#define v_whole_8 d27
1272#define u_whole_8b d24
1273#define r_whole_8 d24
1274#define g_whole_8 d25
1275
1276#define uv_whole_8 q13
1277#define uv_whole_8b q14
1278
1279#define dither_offsets q14
1280#define texture_mask q15
1281#define texture_mask_u d30
1282#define texture_mask_v d31
1283
1284#define dither_offsets_short d28
1285
/* Per-span temporaries; they overlap u_whole/v_whole/r_whole (q8-q10). */
1286#define v_left_x q8
1287#define uvrg q9
1288#define block_span q10
1289
1290#define uv d18
1291#define rg d19
1292
1293#define draw_mask q1
1294#define draw_mask_edge q13
1295#define test_mask q0
1296
/* uvrg_dx shares q3 with g_block. */
1297#define uvrg_dx q3
1298
1299#define colors q2
/*
 * Texture coordinate swizzle applied to the eight 8-bit u and v values.
 * A masked copy of u is kept in u_whole_8b, then
 *   u = (u & 0x0F) | (v << 4)
 *   v = (v & 0xF0) | (u_whole_8b >> 4)
 * so the high nibble of u and the low nibble of v trade places.
 * u_whole_8b is d24, which is also r_whole_8 / b_whole_low, so this must run
 * before r_whole_8 is produced and after b_whole has been consumed.
 */
1301#define setup_blocks_texture_swizzled() \
1302 vand.u8 u_whole_8b, u_whole_8, texture_mask_u; \
1303 vsli.u8 u_whole_8, v_whole_8, #4; \
1304 vsri.u8 v_whole_8, u_whole_8b, #4 \
1305
/* Unswizzled variant: coordinates are used as they are, so nothing is emitted. */
1306#define setup_blocks_texture_unswizzled() \
1307
1308
/*
 * setup_blocks_shaded_textured_builder(swizzling)
 *
 * Expands to setup_blocks_shaded_textured_dithered_<swizzling>_indirect.
 * In:  r0 = psx_gpu. Returns immediately (before touching the stack) when
 *      psx_gpu's num_spans is zero; otherwise r4-r11 and lr are saved and
 *      restored.
 * For every span with a non-zero block count the routine appends that many
 * 64-byte block records after the num_blocks records already queued:
 *   +0   interleaved u/v bytes   (block_ptr_a)
 *   +16  r bytes, g bytes        (block_ptr_b)
 *   +32  b bytes, fb_mask_ptrs   (block_ptr_a) - lane 1 holds fb_ptr
 *   +48  dither offsets          (block_ptr_b)
 * Each record covers 8 pixels: lanes 0-3 come from the accumulator's upper
 * 16 bits, lanes 4-7 from (accumulator + 4 * delta), and the accumulator then
 * advances by 8 * delta while fb_ptr advances by 16 bytes.
 * The last record of a span additionally has its u/v bits cleared where the
 * span's right_mask tests set against test_mask (loaded from offset 0 of
 * psx_gpu), and carries right_mask in lane 0 of fb_mask_ptrs.
 * When the queue would exceed MAX_BLOCKS, flush_render_block_buffer is called
 * and the queue restarts at the first record.
 * NOTE(review): q4-q7 (d8-d15) are written without being saved here - confirm
 * that callers do not rely on them being preserved.
 */
1309#define setup_blocks_shaded_textured_builder(swizzling) \
1310.align 3; \
1311 \
1312function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \
1313 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1314 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1315 \
1316 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1317 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1318 \
1319 cmp num_spans, #0; \
1320 bxeq lr; \
1321 \
1322 stmdb sp!, { r4 - r11, r14 }; \
1323 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1324 \
1325 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
1326 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1327 \
1328 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1329 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1330 \
1331 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1332 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1333 \
1334 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
1335 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1336 \
1337 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1338 \
 /* 0: top of the per-span loop */ \
1339 0: \
1340 vmov.u8 fb_mask_ptrs, #0; \
1341 \
1342 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1343 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1344 \
1345 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 1346 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
1347 \
1348 cmp span_num_blocks, #0; \
1349 beq 1f; \
1350 \
1351 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
1352 add num_blocks, span_num_blocks, num_blocks; \
1353 \
1354 cmp num_blocks, #MAX_BLOCKS; \
1355 bgt 2f; \
1356 \
 /* 3: span start values: fb_ptr = vram + y * 2048 + left_x * 2, */ \
 /* b and uvrg advanced by left_x * delta, dither row picked by y & 3 */ \
 /* and rotated by (left_x & 3) * 8 bits */ \
1357 3: \
1358 ldr b, [ span_b_offset ]; \
1359 add fb_ptr, fb_ptr, y, lsl #11; \
1360 \
1361 vdup.u32 v_left_x, left_x; \
1362 and y, y, #0x3; \
1363 \
1364 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1365 add fb_ptr, fb_ptr, left_x, lsl #1; \
1366 \
1367 mla b, b_dx, left_x, b; \
1368 and dither_shift, left_x, #0x03; \
1369 \
1370 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
 /* uvrg_dx shares q3 with g_block, so it is rebuilt from uvrg_dx4 each span */ \
1371 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1372 \
1373 mov dither_shift, dither_shift, lsl #3; \
1374 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1375 \
1376 mov c_32, #32; \
 /* flags set here are consumed by the "beq 5f" below */ \
1377 subs span_num_blocks, span_num_blocks, #1; \
1378 \
1379 mov dither_row, dither_row, ror dither_shift; \
1380 mov b_dx4, b_dx, lsl #2; \
1381 \
1382 vdup.u32 dither_offsets_short, dither_row; \
1383 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1384 \
1385 vdup.u32 b_block, b; \
1386 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1387 \
1388 vdup.u32 u_block, uv[0]; \
1389 mov b_dx8, b_dx, lsl #3; \
1390 \
1391 vdup.u32 v_block, uv[1]; \
1392 vdup.u32 r_block, rg[0]; \
1393 vdup.u32 g_block, rg[1]; \
1394 \
 /* add the five consecutive per-lane offset vectors (u, v, r, g, b) */ \
1395 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1396 \
1397 vadd.u32 u_block, u_block, block_span; \
1398 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1399 \
1400 vadd.u32 v_block, v_block, block_span; \
1401 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1402 \
1403 vadd.u32 r_block, r_block, block_span; \
1404 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1405 \
1406 vadd.u32 g_block, g_block, block_span; \
1407 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
1408 \
1409 vadd.u32 b_block, b_block, block_span; \
1410 add block_ptr_b, block_ptr_a, #16; \
1411 \
1412 vshrn.u32 u_whole_low, u_block, #16; \
1413 vshrn.u32 v_whole_low, v_block, #16; \
1414 vshrn.u32 r_whole_low, r_block, #16; \
1415 vshrn.u32 g_whole_low, g_block, #16; \
1416 \
1417 vdup.u32 dx4, uv_dx4[0]; \
1418 vshrn.u32 b_whole_low, b_block, #16; \
1419 \
1420 vaddhn.u32 u_whole_high, u_block, dx4; \
1421 vdup.u32 dx4, uv_dx4[1]; \
1422 \
1423 vaddhn.u32 v_whole_high, v_block, dx4; \
1424 vdup.u32 dx4, rg_dx4[0]; \
1425 \
1426 vaddhn.u32 r_whole_high, r_block, dx4; \
1427 vdup.u32 dx4, rg_dx4[1]; \
1428 \
1429 vaddhn.u32 g_whole_high, g_block, dx4; \
1430 vdup.u32 dx4, b_dx4; \
1431 \
1432 vaddhn.u32 b_whole_high, b_block, dx4; \
1433 vdup.u32 dx8, uv_dx8[0]; \
1434 \
1435 vadd.u32 u_block, u_block, dx8; \
1436 vdup.u32 dx8, uv_dx8[1]; \
1437 \
1438 vadd.u32 v_block, v_block, dx8; \
1439 vdup.u32 dx8, rg_dx8[0]; \
1440 \
1441 vadd.u32 r_block, r_block, dx8; \
1442 vdup.u32 dx8, rg_dx8[1]; \
1443 \
1444 vadd.u32 g_block, g_block, dx8; \
1445 vdup.u32 dx8, b_dx8; \
1446 \
1447 vadd.u32 b_block, b_block, dx8; \
1448 vmovn.u16 u_whole_8, u_whole; \
1449 \
1450 vmovn.u16 v_whole_8, v_whole; \
1451 \
 /* b_whole (q12) must be narrowed before d24 is reused below */ \
1452 vmovn.u16 b_whole_8, b_whole; \
1453 pld [ fb_ptr ]; \
1454 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1455 \
1456 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1457 setup_blocks_texture_##swizzling(); \
1458 \
1459 vmovn.u16 r_whole_8, r_whole; \
1460 beq 5f; \
1461 \
 /* 4: store the pending record and compute the next one */ \
1462 4: \
1463 vmovn.u16 g_whole_8, g_whole; \
1464 vshrn.u32 u_whole_low, u_block, #16; \
1465 \
1466 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1467 vshrn.u32 v_whole_low, v_block, #16; \
1468 \
1469 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1470 vshrn.u32 r_whole_low, r_block, #16; \
1471 \
1472 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1473 vshrn.u32 g_whole_low, g_block, #16; \
1474 \
1475 vdup.u32 dx4, uv_dx4[0]; \
1476 vshrn.u32 b_whole_low, b_block, #16; \
1477 \
1478 vaddhn.u32 u_whole_high, u_block, dx4; \
1479 vdup.u32 dx4, uv_dx4[1]; \
1480 \
1481 vaddhn.u32 v_whole_high, v_block, dx4; \
1482 vdup.u32 dx4, rg_dx4[0]; \
1483 \
1484 vaddhn.u32 r_whole_high, r_block, dx4; \
1485 vdup.u32 dx4, rg_dx4[1]; \
1486 \
1487 vaddhn.u32 g_whole_high, g_block, dx4; \
1488 vdup.u32 dx4, b_dx4; \
1489 \
1490 vaddhn.u32 b_whole_high, b_block, dx4; \
1491 vdup.u32 dx8, uv_dx8[0]; \
1492 \
1493 vadd.u32 u_block, u_block, dx8; \
1494 vdup.u32 dx8, uv_dx8[1]; \
1495 \
1496 vadd.u32 v_block, v_block, dx8; \
1497 vdup.u32 dx8, rg_dx8[0]; \
1498 \
1499 vadd.u32 r_block, r_block, dx8; \
1500 vdup.u32 dx8, rg_dx8[1]; \
1501 \
1502 vadd.u32 g_block, g_block, dx8; \
1503 vdup.u32 dx8, b_dx8; \
1504 \
1505 vadd.u32 b_block, b_block, dx8; \
1506 vmovn.u16 u_whole_8, u_whole; \
1507 \
1508 add fb_ptr, fb_ptr, #16; \
1509 vmovn.u16 v_whole_8, v_whole; \
1510 \
1511 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1512 vmovn.u16 b_whole_8, b_whole; \
1513 \
1514 pld [ fb_ptr ]; \
1515 \
1516 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1517 subs span_num_blocks, span_num_blocks, #1; \
1518 \
1519 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1520 setup_blocks_texture_##swizzling(); \
1521 \
1522 vmovn.u16 r_whole_8, r_whole; \
1523 bne 4b; \
1524 \
 /* 5: last record of the span - apply the right-edge mask before storing */ \
1525 5: \
1526 vmovn.u16 g_whole_8, g_whole; \
1527 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1528 \
1529 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1530 vdup.u8 draw_mask, right_mask; \
1531 \
1532 vmov.u32 fb_mask_ptrs[0], right_mask; \
1533 vtst.u16 draw_mask, draw_mask, test_mask; \
1534 vzip.u8 u_whole_8, v_whole_8; \
1535 \
1536 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1537 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1538 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1539 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1540 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1541 \
 /* 1: advance to the next span's inputs (16-, 4- and 8-byte entries) */ \
1542 1: \
1543 add span_uvrg_offset, span_uvrg_offset, #16; \
1544 add span_b_offset, span_b_offset, #4; \
1545 \
1546 add span_edge_data, span_edge_data, #8; \
1547 subs num_spans, num_spans, #1; \
1548 \
1549 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1550 bne 0b; \
1551 \
1552 ldmia sp!, { r4 - r11, pc }; \
1553 \
 /* 2: queue would overflow - flush it, keeping r0-r3, r12, lr and the two */ \
 /* NEON values that cannot be recomputed, then restart at the first record */ \
1554 2: \
1555 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1556 vpush { texture_mask }; \
1557 vpush { uvrg_dx4 }; \
1558 \
1559 stmdb sp!, { r0 - r3, r12, r14 }; \
1560 bl flush_render_block_buffer; \
1561 ldmia sp!, { r0 - r3, r12, r14 }; \
1562 \
1563 vpop { uvrg_dx4 }; \
1564 vpop { texture_mask }; \
1565 \
 /* uvrg_dx8 = 2 * uvrg_dx4 */ \
1566 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1567 vmov.u8 fb_mask_ptrs, #0; \
1568 \
1569 mov num_blocks, span_num_blocks; \
1570 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1571 bal 3b \
1572
1573
/* Instantiate the swizzled and unswizzled variants. */
1574setup_blocks_shaded_textured_builder(swizzled)
1575setup_blocks_shaded_textured_builder(unswizzled)
1576
1577
/*
 * setup_blocks_unshaded_textured_builder(swizzling)
 *
 * Expands to setup_blocks_unshaded_textured_dithered_<swizzling>_indirect.
 * In:  r0 = psx_gpu. Returns immediately when psx_gpu's num_spans is zero;
 *      otherwise r4-r11 and lr are saved and restored.
 * Same record layout and control flow as the shaded textured builder, but
 * only u and v are interpolated. The 16 bytes at record offset +16 are not
 * written (block_ptr_b is simply stepped past them), and the b_whole_8 half
 * of the +32 store is not computed by this routine; only the fb_mask_ptrs
 * half of that store carries data produced here.
 * When the queue would exceed MAX_BLOCKS, flush_render_block_buffer is called
 * and the queue restarts at the first record.
 */
1578#define setup_blocks_unshaded_textured_builder(swizzling) \
1579.align 3; \
1580 \
1581function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \
1582 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1583 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1584 \
1585 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1586 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1587 \
1588 cmp num_spans, #0; \
1589 bxeq lr; \
1590 \
1591 stmdb sp!, { r4 - r11, r14 }; \
1592 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1593 \
1594 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1595 \
1596 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1597 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1598 \
1599 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1600 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1601 \
1602 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1603 \
1604 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1605 \
 /* 0: top of the per-span loop */ \
1606 0: \
1607 vmov.u8 fb_mask_ptrs, #0; \
1608 \
1609 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1610 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1611 \
1612 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 1613 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
1614 \
1615 cmp span_num_blocks, #0; \
1616 beq 1f; \
1617 \
1618 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
1619 add num_blocks, span_num_blocks, num_blocks; \
1620 \
1621 cmp num_blocks, #MAX_BLOCKS; \
1622 bgt 2f; \
1623 \
 /* 3: span start values: fb_ptr = vram + y * 2048 + left_x * 2, */ \
 /* uvrg advanced by left_x * delta, dither row picked by y & 3 */ \
 /* and rotated by (left_x & 3) * 8 bits */ \
1624 3: \
1625 add fb_ptr, fb_ptr, y, lsl #11; \
1626 \
1627 vdup.u32 v_left_x, left_x; \
1628 and y, y, #0x3; \
1629 \
1630 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1631 add fb_ptr, fb_ptr, left_x, lsl #1; \
1632 \
1633 and dither_shift, left_x, #0x03; \
1634 \
1635 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
1636 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1637 \
1638 mov dither_shift, dither_shift, lsl #3; \
1639 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1640 \
1641 mov c_32, #32; \
 /* flags set here are consumed by the "beq 5f" below */ \
1642 subs span_num_blocks, span_num_blocks, #1; \
1643 \
1644 mov dither_row, dither_row, ror dither_shift; \
1645 \
1646 vdup.u32 dither_offsets_short, dither_row; \
1647 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1648 \
1649 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1650 \
1651 vdup.u32 u_block, uv[0]; \
1652 \
1653 vdup.u32 v_block, uv[1]; \
1654 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1655 \
1656 vadd.u32 u_block, u_block, block_span; \
1657 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1658 \
1659 vadd.u32 v_block, v_block, block_span; \
1660 add block_ptr_b, block_ptr_a, #16; \
1661 \
1662 vshrn.u32 u_whole_low, u_block, #16; \
1663 vshrn.u32 v_whole_low, v_block, #16; \
1664 \
1665 vdup.u32 dx4, uv_dx4[0]; \
1666 \
1667 vaddhn.u32 u_whole_high, u_block, dx4; \
1668 vdup.u32 dx4, uv_dx4[1]; \
1669 \
1670 vaddhn.u32 v_whole_high, v_block, dx4; \
1671 vdup.u32 dx8, uv_dx8[0]; \
1672 \
1673 vadd.u32 u_block, u_block, dx8; \
1674 vdup.u32 dx8, uv_dx8[1]; \
1675 \
1676 vadd.u32 v_block, v_block, dx8; \
1677 vmovn.u16 u_whole_8, u_whole; \
1678 \
1679 vmovn.u16 v_whole_8, v_whole; \
1680 \
1681 pld [ fb_ptr ]; \
1682 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1683 \
1684 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1685 setup_blocks_texture_##swizzling(); \
1686 \
1687 beq 5f; \
1688 \
 /* 4: store the pending record and compute the next one */ \
1689 4: \
1690 vshrn.u32 u_whole_low, u_block, #16; \
1691 \
1692 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1693 vshrn.u32 v_whole_low, v_block, #16; \
1694 \
 /* skip the +16 slot of the record; nothing is written there */ \
1695 add block_ptr_b, block_ptr_b, #32; \
1696 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1697 \
1698 vdup.u32 dx4, uv_dx4[0]; \
1699 vaddhn.u32 u_whole_high, u_block, dx4; \
1700 vdup.u32 dx4, uv_dx4[1]; \
1701 \
1702 vaddhn.u32 v_whole_high, v_block, dx4; \
1703 vdup.u32 dx8, uv_dx8[0]; \
1704 \
1705 vadd.u32 u_block, u_block, dx8; \
1706 vdup.u32 dx8, uv_dx8[1]; \
1707 \
1708 vadd.u32 v_block, v_block, dx8; \
1709 vmovn.u16 u_whole_8, u_whole; \
1710 \
1711 add fb_ptr, fb_ptr, #16; \
1712 vmovn.u16 v_whole_8, v_whole; \
1713 \
1714 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1715 pld [ fb_ptr ]; \
1716 \
1717 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1718 subs span_num_blocks, span_num_blocks, #1; \
1719 \
1720 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1721 setup_blocks_texture_##swizzling(); \
1722 \
1723 bne 4b; \
1724 \
 /* 5: last record of the span - apply the right-edge mask before storing */ \
1725 5: \
1726 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1727 \
1728 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1729 vdup.u8 draw_mask, right_mask; \
1730 \
1731 vmov.u32 fb_mask_ptrs[0], right_mask; \
1732 vtst.u16 draw_mask, draw_mask, test_mask; \
1733 vzip.u8 u_whole_8, v_whole_8; \
1734 \
1735 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1736 add block_ptr_b, block_ptr_b, #32; \
1737 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1738 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1739 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1740 \
 /* 1: advance to the next span's inputs */ \
1741 1: \
1742 add span_uvrg_offset, span_uvrg_offset, #16; \
1743 add span_edge_data, span_edge_data, #8; \
1744 subs num_spans, num_spans, #1; \
1745 \
1746 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1747 bne 0b; \
1748 \
1749 ldmia sp!, { r4 - r11, pc }; \
1750 \
 /* 2: queue would overflow - flush it, then restart at the first record */ \
1751 2: \
1752 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1753 vpush { texture_mask }; \
1754 vpush { uvrg_dx4 }; \
1755 \
1756 stmdb sp!, { r0 - r3, r12, r14 }; \
1757 bl flush_render_block_buffer; \
1758 ldmia sp!, { r0 - r3, r12, r14 }; \
1759 \
1760 vpop { uvrg_dx4 }; \
1761 vpop { texture_mask }; \
1762 \
 /* uvrg_dx8 = 2 * uvrg_dx4 */ \
1763 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1764 vmov.u8 fb_mask_ptrs, #0; \
1765 \
1766 mov num_blocks, span_num_blocks; \
1767 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1768 bal 3b \
1769
1770
/* Instantiate the swizzled and unswizzled variants. */
1771setup_blocks_unshaded_textured_builder(swizzled)
1772setup_blocks_unshaded_textured_builder(unswizzled)
1773
1774
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * In:  r0 = psx_gpu. Returns immediately when psx_gpu's num_spans is zero;
 *      otherwise r4-r11 and lr are saved and restored.
 * Converts the triangle colour to 15 bits (r | g << 5 | b << 10, taking
 * bits 3-7, 11-15 and 19-23 of the stored colour) and, for every span,
 * appends one 64-byte record per block:
 *   +0   draw mask (all zero, except the span's last record, which gets
 *        right_mask tested against test_mask)
 *   +16  eight copies of the colour
 *   +32  b_whole_8 (not computed here), fb_mask_ptrs with fb_ptr in lane 1
 * fb_ptr starts at vram + y * 2048 + left_x * 2 and advances 16 bytes per
 * record. When the queue would exceed MAX_BLOCKS, flush_render_block_buffer
 * is called and the queue restarts at the first record.
 */
1775.align 3
1776
1777function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1778 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1779 veor.u32 draw_mask, draw_mask, draw_mask
1780
1781 cmp num_spans, #0
1782 bxeq lr
1783
1784 stmdb sp!, { r4 - r11, r14 }
1785 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1786
1787 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1788
 // Keep the top five bits of each 8-bit channel.
1789 ubfx color_r, color, #3, #5
1790 ubfx color_g, color, #11, #5
1791 ubfx color_b, color, #19, #5
1792
1793 orr color, color_r, color_b, lsl #10
1794 orr color, color, color_g, lsl #5
1795
1796 vdup.u16 colors, color
1797
1798 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1799 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1800
 // Append after the records already queued (64 bytes each).
1801 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1802 add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1803
 // Per-span loop.
1804 0:
1805 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1806 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1807
c1817bd9 1808 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
1809
1810 cmp span_num_blocks, #0
1811 beq 1f
1812
1813 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1814 add num_blocks, span_num_blocks, num_blocks
1815
1816 cmp num_blocks, #MAX_BLOCKS
1817 bgt 2f
1818
1819 3:
1820 add fb_ptr, fb_ptr, y, lsl #11
 // NOTE(review): y is not read again in this routine after this mask.
1821 and y, y, #0x3
1822
1823 add fb_ptr, fb_ptr, left_x, lsl #1
1824 mov c_32, #32
1825
 // Flags from this subtraction decide the "beq 5f" below.
1826 subs span_num_blocks, span_num_blocks, #1
1827
1828 add block_ptr_b, block_ptr_a, #16
1829 pld [ fb_ptr ]
1830
1831 vmov.u32 fb_mask_ptrs[1], fb_ptr
1832 beq 5f
1833
 // All records but the last one: zero draw mask.
1834 4:
1835 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_32
1836 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1837 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1838
1839 add fb_ptr, fb_ptr, #16
1840 add block_ptr_b, block_ptr_b, #32
1841
1842 pld [ fb_ptr ]
1843
1844 vmov.u32 fb_mask_ptrs[1], fb_ptr
1845 subs span_num_blocks, span_num_blocks, #1
1846
1847 bne 4b
1848
 // Last record of the span: draw mask derived from right_mask.
1849 5:
1850 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1851
1852 vdup.u8 draw_mask_edge, right_mask
1853 vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1854
1855 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1856 vst1.u32 { draw_mask_edge }, [ block_ptr_a, :128 ], c_32
1857 add block_ptr_b, block_ptr_b, #32
1858 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1859
1860 1:
1861 add span_edge_data, span_edge_data, #8
1862 subs num_spans, num_spans, #1
1863
1864 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1865 bne 0b
1866
1867 ldmia sp!, { r4 - r11, pc }
1868
 // Queue would overflow: flush it. colors is saved across the call;
 // test_mask and the zero draw_mask are rebuilt afterwards.
1869 2:
1870 vpush { colors }
1871
1872 stmdb sp!, { r0 - r3, r12, r14 }
1873 bl flush_render_block_buffer
1874 ldmia sp!, { r0 - r3, r12, r14 }
1875
1876 vpop { colors }
1877
1878 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1879 veor.u32 draw_mask, draw_mask, draw_mask
1880
1881 mov num_blocks, span_num_blocks
1882 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1883 bal 3b
1884
1885
/*
 * Extra aliases for the direct-to-framebuffer routines. mask_msb_scalar
 * shares r14 with num_blocks, and msb_mask shares q15 with texture_mask.
 */
1886#define mask_msb_scalar r14
1887
1888#define msb_mask q15
1889
1890#define pixels_low d16
1891
1892#define msb_mask_low d30
1893#define msb_mask_high d31
1894
1895
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * In:  r0 = psx_gpu. Returns immediately when psx_gpu's num_spans is zero;
 *      otherwise r4-r11 and lr are saved and restored.
 * Fills each span with a single colour by writing straight to the
 * framebuffer instead of queueing block records. The colour is the 15-bit
 * triangle colour (r | g << 5 | b << 10) ORed with the halfword read from
 * psx_gpu's mask_msb field. Every block but the last is written as one
 * 16-byte store; the last block writes only the leading pixels selected by
 * right_mask (see label 3).
 */
1896.align 3
1897
1898function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1899 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1900
1901 cmp num_spans, #0
1902 bxeq lr
1903
1904 stmdb sp!, { r4 - r11, r14 }
1905
1906 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1907
1908 ubfx color_r, color, #3, #5
1909 ubfx color_g, color, #11, #5
1910
1911 ldrh mask_msb_scalar, [ psx_gpu, #psx_gpu_mask_msb_offset ]
1912 ubfx color_b, color, #19, #5
1913
1914 orr color, color_r, color_b, lsl #10
1915 orr color, color, color_g, lsl #5
1916 orr color, color, mask_msb_scalar
1917
 // colors = eight copies of the pixel for full-block stores.
1918 vdup.u16 colors, color
1919
1920 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
3867c6ef
E
 // color = two copies of the pixel, for word stores on the partial block.
1921 orr color, color, lsl #16
1922
75e28f62
E
1923
 // Per-span loop.
1924 0:
1925 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1926 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1927
c1817bd9 1928 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
1929
1930 cmp span_num_blocks, #0
1931 beq 1f
1932
1933 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1934
 // fb_ptr = vram + y * 2048 + left_x * 2
1935 add fb_ptr, fb_ptr, y, lsl #11
1936 subs span_num_blocks, span_num_blocks, #1
1937
1938 add fb_ptr, fb_ptr, left_x, lsl #1
1939 beq 3f
1940
 // Full blocks: 8 pixels (16 bytes) each.
1941 2:
1942 vst1.u32 { colors }, [ fb_ptr ]!
1943 subs span_num_blocks, span_num_blocks, #1
1944
1945 bne 2b
1946
 // Last block. A zero right_mask means the whole block is written (label 5).
 // Otherwise: low 4 bits clear -> write 4 pixels and shift the mask by 4;
 // low 2 bits clear -> write 2 pixels and shift by 2; low bit clear -> write
 // 1 pixel.
1947 3:
1948 ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
75e28f62 1949
3867c6ef
E
1950 cmp right_mask, #0x0
1951 beq 5f
1952
1953 tst right_mask, #0xF
1954 streq color, [ fb_ptr ], #4
1955 moveq right_mask, right_mask, lsr #4
1956 streq color, [ fb_ptr ], #4
1957
1958 tst right_mask, #0x3
1959 streq color, [ fb_ptr ], #4
1960 moveq right_mask, right_mask, lsr #2
1961
1962 tst right_mask, #0x1
1963 streqh color, [ fb_ptr ]
75e28f62
E
1964
1965 1:
1966 add span_edge_data, span_edge_data, #8
1967 subs num_spans, num_spans, #1
75e28f62
E
1968 bne 0b
1969
1970 ldmia sp!, { r4 - r11, pc }
1971
3867c6ef
E
 // Unmasked last block: write all 8 pixels.
1972 5:
1973 vst1.u32 { colors }, [ fb_ptr ]
1974 bal 1b
75e28f62
E
1975
1976
/*
 * Alias remapping for the shaded, untextured routines below. There is no
 * u/v here, so the r/g/b accumulators move down to q0-q2 and the other
 * NEON names are rebound around them. As before, several names share a
 * register (v_left_x and r_whole_low are both d6; uvrg and g_whole are both
 * q4; block_span and b_whole are both q5; dx4, dx8 and test_mask are all
 * q10; rg_dx is d0, the low half of r_block).
 */
1977#undef c_64
1978
/* c_64 is the 64-byte record stride used for post-indexed stores. */
1979#define c_64 r7
1980#define rg_dx_ptr r2
1981
1982
1983#undef r_block
1984#undef g_block
1985#undef b_block
1986#undef r_whole
1987#undef g_whole
1988#undef b_whole
1989#undef r_whole_low
1990#undef r_whole_high
1991#undef g_whole_low
1992#undef g_whole_high
1993#undef b_whole_low
1994#undef b_whole_high
1995#undef r_whole_8
1996#undef g_whole_8
1997#undef b_whole_8
1998#undef dither_offsets
1999#undef rg_dx4
2000#undef rg_dx8
2001#undef dx4
2002#undef dx8
2003#undef v_left_x
2004#undef uvrg
2005#undef block_span
2006#undef rg
2007#undef draw_mask
2008#undef test_mask
2009
/* 32-bit per-lane colour accumulators. */
2010#define r_block q0
2011#define g_block q1
2012#define b_block q2
2013
/* 16-bit per-lane narrowed values (low = lanes 0-3, high = lanes 4-7). */
2014#define r_whole q3
2015#define g_whole q4
2016#define b_whole q5
2017
2018#define r_whole_low d6
2019#define r_whole_high d7
2020#define g_whole_low d8
2021#define g_whole_high d9
2022#define b_whole_low d10
2023#define b_whole_high d11
2024
/* g and b bytes sit in one q register so they can be processed together. */
2025#define gb_whole_8 q6
2026
2027#define g_whole_8 d12
2028#define b_whole_8 d13
2029
2030#define r_whole_8 d14
2031
/* Eight packed 16-bit output pixels. */
2032#define pixels q8
2033
2034#define rg_dx4 d18
2035#define rg_dx8 d19
2036
2037#define dx4 q10
2038#define dx8 q10
2039
2040#define v_left_x d6
2041#define uvrg q4
2042#define block_span q5
2043
2044#define rg d9
2045
/* Byte constants; the names give the lane width and the value they hold. */
2046#define d64_1 d22
2047#define d64_128 d23
2048
2049#define d128_4 q12
2050#define d128_0x7 q13
2051
/* d64_4 is the low half of d128_4. */
2052#define d64_4 d24
2053
2054#define dither_offsets q14
2055#define draw_mask q15
2056
2057#define dither_offsets_low d28
2058
2059#define rg_dx d0
2060#define test_mask q10
2062
/*
 * Dither step "a": saturating unsigned add of the dither offsets to the
 * r bytes and to the combined g/b bytes.
 */
2063#define setup_blocks_shaded_untextured_dither_a_dithered() \
2064 vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low; \
2065 vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets; \
2066
/*
 * Dither step "b": saturating unsigned subtract of 4 from every byte. The
 * builder adds 4 to the dither offsets before step "a", so the two steps
 * together apply the original offset with clamping at both ends.
 * NOTE(review): this presumes the table offsets lie in -4..3 - confirm
 * against the dither table contents.
 */
2067#define setup_blocks_shaded_untextured_dither_b_dithered() \
2068 vqsub.u8 r_whole_8, r_whole_8, d64_4; \
2069 vqsub.u8 gb_whole_8, gb_whole_8, d128_4 \
2070
/* Undithered variants: both steps expand to nothing. */
2071#define setup_blocks_shaded_untextured_dither_a_undithered() \
2072
2073#define setup_blocks_shaded_untextured_dither_b_undithered() \
2074
2075
/*
 * setup_blocks_shaded_untextured_indirect_builder(dithering)
 *
 * Expands to setup_blocks_shaded_untextured_<dithering>_unswizzled_indirect.
 * In:  r0 = psx_gpu. Returns immediately when psx_gpu's num_spans is zero;
 *      otherwise r4-r11 and lr are saved and restored.
 * For every span with a non-zero block count, interpolates r, g and b across
 * the span and appends one 64-byte record per 8-pixel block:
 *   +0   draw mask (zero, except the span's last record, which gets
 *        right_mask tested against test_mask from offset 0 of psx_gpu)
 *   +16  eight packed 16-bit pixels
 *   +44  fb_ptr for the block
 * Pixels are packed as (r >> 3) + (g & ~7) * 4 + (b & ~7) * 128, i.e.
 * r | (g >> 3) << 5 | (b >> 3) << 10, after the optional dither steps.
 * When the queue would exceed MAX_BLOCKS, flush_render_block_buffer is called
 * and the queue restarts at the first record.
 * NOTE(review): q4-q7 (d8-d15) are written without being saved here - confirm
 * that callers do not rely on them being preserved.
 */
2076#define setup_blocks_shaded_untextured_indirect_builder(dithering) \
2077.align 3; \
2078 \
2079function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \
2080 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2081 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2082 \
2083 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2084 \
2085 cmp num_spans, #0; \
2086 bxeq lr; \
2087 \
2088 stmdb sp!, { r4 - r11, r14 }; \
2089 vshl.u32 rg_dx4, rg_dx, #2; \
2090 \
2091 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2092 vshl.u32 rg_dx8, rg_dx, #3; \
2093 \
2094 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2095 \
2096 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2097 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2098 \
2099 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2100 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2101 \
2102 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
2103 vmov.u8 d64_1, #1; \
2104 \
2105 vmov.u8 d128_4, #4; \
2106 vmov.u8 d64_128, #128; \
2107 \
2108 vmov.u8 d128_0x7, #0x7; \
2109 \
 /* 0: top of the per-span loop */ \
2110 0: \
2111 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2112 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2113 \
2114 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 2115 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
2116 \
2117 cmp span_num_blocks, #0; \
2118 beq 1f; \
2119 \
2120 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
2121 add num_blocks, span_num_blocks, num_blocks; \
2122 \
2123 cmp num_blocks, #MAX_BLOCKS; \
2124 bgt 2f; \
2125 \
 /* 3: span start values: fb_ptr = vram + y * 2048 + left_x * 2, */ \
 /* b and rg advanced by left_x * delta, dither row picked by y & 3 */ \
 /* and rotated by (left_x & 3) * 8 bits */ \
2126 3: \
2127 ldr b, [ span_b_offset ]; \
2128 add fb_ptr, fb_ptr, y, lsl #11; \
2129 \
2130 vdup.u32 v_left_x, left_x; \
2131 and y, y, #0x3; \
2132 \
2133 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2134 add fb_ptr, fb_ptr, left_x, lsl #1; \
2135 \
2136 mla b, b_dx, left_x, b; \
2137 and dither_shift, left_x, #0x03; \
2138 \
2139 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
 /* rg_dx is the low half of r_block, so it is rebuilt from rg_dx4 each span */ \
2140 vshr.u32 rg_dx, rg_dx4, #2; \
2141 \
2142 mov dither_shift, dither_shift, lsl #3; \
2143 vmla.u32 rg, rg_dx, v_left_x; \
2144 \
2145 mov c_64, #64; \
 /* flags set here are consumed by the "beq 5f" below */ \
2146 subs span_num_blocks, span_num_blocks, #1; \
2147 \
2148 mov dither_row, dither_row, ror dither_shift; \
2149 mov b_dx4, b_dx, lsl #2; \
2150 \
2151 vdup.u32 dither_offsets, dither_row; \
2152 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2153 \
2154 vdup.u32 b_block, b; \
 /* bias the offsets by +4; dither step "b" subtracts the 4 again */ \
2155 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2156 \
2157 mov b_dx8, b_dx, lsl #3; \
2158 vdup.u32 r_block, rg[0]; \
2159 vdup.u32 g_block, rg[1]; \
2160 \
2161 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2162 \
2163 vadd.u32 r_block, r_block, block_span; \
2164 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2165 \
2166 vadd.u32 g_block, g_block, block_span; \
2167 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2168 \
2169 vadd.u32 b_block, b_block, block_span; \
2170 add block_ptr_b, block_ptr_a, #16; \
2171 \
2172 vshrn.u32 r_whole_low, r_block, #16; \
2173 vshrn.u32 g_whole_low, g_block, #16; \
2174 vshrn.u32 b_whole_low, b_block, #16; \
2175 vdup.u32 dx4, rg_dx4[0]; \
2176 \
2177 vaddhn.u32 r_whole_high, r_block, dx4; \
2178 vdup.u32 dx4, rg_dx4[1]; \
2179 \
2180 vaddhn.u32 g_whole_high, g_block, dx4; \
2181 vdup.u32 dx4, b_dx4; \
2182 \
2183 vaddhn.u32 b_whole_high, b_block, dx4; \
2184 vdup.u32 dx8, rg_dx8[0]; \
2185 \
2186 vadd.u32 r_block, r_block, dx8; \
2187 vdup.u32 dx8, rg_dx8[1]; \
2188 \
2189 vadd.u32 g_block, g_block, dx8; \
2190 vdup.u32 dx8, b_dx8; \
2191 \
2192 vadd.u32 b_block, b_block, dx8; \
2193 \
2194 vmovn.u16 r_whole_8, r_whole; \
2195 vmovn.u16 g_whole_8, g_whole; \
2196 vmovn.u16 b_whole_8, b_whole; \
2197 \
2198 beq 5f; \
 /* zero draw mask for every record except the span's last */ \
2199 veor.u32 draw_mask, draw_mask, draw_mask; \
2200 \
 /* 4: pack and store the pending block while computing the next one */ \
2201 4: \
2202 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2203 vshrn.u32 r_whole_low, r_block, #16; \
2204 \
2205 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2206 vshrn.u32 g_whole_low, g_block, #16; \
2207 \
2208 vshrn.u32 b_whole_low, b_block, #16; \
2209 str fb_ptr, [ block_ptr_a, #44 ]; \
2210 \
2211 vdup.u32 dx4, rg_dx4[0]; \
2212 vshr.u8 r_whole_8, r_whole_8, #3; \
2213 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2214 \
2215 vaddhn.u32 r_whole_high, r_block, dx4; \
2216 vdup.u32 dx4, rg_dx4[1]; \
2217 \
2218 vaddhn.u32 g_whole_high, g_block, dx4; \
2219 vdup.u32 dx4, b_dx4; \
2220 \
2221 vaddhn.u32 b_whole_high, b_block, dx4; \
2222 vdup.u32 dx8, rg_dx8[0]; \
2223 \
 /* pixels = r + g * 4 + b * 128 (see header for the resulting layout) */ \
2224 vmull.u8 pixels, r_whole_8, d64_1; \
2225 vmlal.u8 pixels, g_whole_8, d64_4; \
2226 vmlal.u8 pixels, b_whole_8, d64_128; \
2227 \
2228 vadd.u32 r_block, r_block, dx8; \
2229 vdup.u32 dx8, rg_dx8[1]; \
2230 \
2231 vadd.u32 g_block, g_block, dx8; \
2232 vdup.u32 dx8, b_dx8; \
2233 \
2234 vadd.u32 b_block, b_block, dx8; \
2235 add fb_ptr, fb_ptr, #16; \
2236 \
2237 vmovn.u16 r_whole_8, r_whole; \
2238 vmovn.u16 g_whole_8, g_whole; \
2239 vmovn.u16 b_whole_8, b_whole; \
2240 \
2241 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2242 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2243 \
2244 pld [ fb_ptr ]; \
2245 \
2246 subs span_num_blocks, span_num_blocks, #1; \
2247 bne 4b; \
2248 \
 /* 5: last record of the span - draw mask derived from right_mask */ \
2249 5: \
2250 str fb_ptr, [ block_ptr_a, #44 ]; \
2251 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2252 \
2253 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
2254 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2255 \
2256 vshr.u8 r_whole_8, r_whole_8, #3; \
2257 vdup.u8 draw_mask, right_mask; \
2258 \
2259 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2260 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
2261 \
2262 vtst.u16 draw_mask, draw_mask, test_mask; \
2263 \
2264 vmull.u8 pixels, r_whole_8, d64_1; \
2265 vmlal.u8 pixels, g_whole_8, d64_4; \
2266 vmlal.u8 pixels, b_whole_8, d64_128; \
2267 \
2268 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2269 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2270 \
 /* 1: advance to the next span's inputs (16-, 4- and 8-byte entries) */ \
2271 1: \
2272 add span_uvrg_offset, span_uvrg_offset, #16; \
2273 add span_b_offset, span_b_offset, #4; \
2274 \
2275 add span_edge_data, span_edge_data, #8; \
2276 subs num_spans, num_spans, #1; \
2277 \
2278 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2279 bne 0b; \
2280 \
2281 ldmia sp!, { r4 - r11, pc }; \
2282 \
 /* 2: queue would overflow - flush it, rebuild the byte constants and */ \
 /* rg_dx8 afterwards, then restart at the first record */ \
2283 2: \
2284 /* TODO: Load from psx_gpu instead of saving/restoring these */\
2285 vpush { rg_dx4 }; \
2286 \
2287 stmdb sp!, { r0 - r3, r12, r14 }; \
2288 bl flush_render_block_buffer; \
2289 ldmia sp!, { r0 - r3, r12, r14 }; \
2290 \
2291 vpop { rg_dx4 }; \
2292 \
2293 vmov.u8 d64_1, #1; \
2294 vmov.u8 d128_4, #4; \
2295 vmov.u8 d64_128, #128; \
2296 vmov.u8 d128_0x7, #0x7; \
2297 \
 /* rg_dx8 = 2 * rg_dx4 */ \
2298 vadd.u32 rg_dx8, rg_dx4, rg_dx4; \
2299 \
2300 mov num_blocks, span_num_blocks; \
2301 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2302 bal 3b \
2303
2304
/* Instantiate the undithered and dithered variants. */
2305setup_blocks_shaded_untextured_indirect_builder(undithered)
2306setup_blocks_shaded_untextured_indirect_builder(dithered)
2307
2308
2309#undef draw_mask
2310
2311#define mask_msb_ptr r14
2312
2313#define draw_mask q0
2314#define pixels_low d16
3867c6ef 2315#define pixels_high d17
75e28f62
E
2316
2317
2318
/*
 * Generates setup_blocks_shaded_untextured_<dithering>_unswizzled_direct.
 *
 * In:  r0 = psx_gpu.
 * The generated function walks every span recorded in psx_gpu, interpolates
 * r/g/b across it eight pixels at a time and writes the packed 16-bit pixels
 * straight to the framebuffer pointer derived from vram_out_ptr (no block
 * buffer is filled). The dithering-specific steps come from the
 * setup_blocks_shaded_untextured_dither_{a,b}_<dithering> macros, which are
 * defined outside this builder.
 * r4 - r11 and r14 are saved and restored; the NEON registers it uses are
 * not saved here.
 */
2319#define setup_blocks_shaded_untextured_direct_builder(dithering) \
2320.align 3; \
2321 \
2322function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \
2323 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2324 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2325 \
2326 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2327 /* no spans: return before anything has been pushed */ \
2328 cmp num_spans, #0; \
2329 bxeq lr; \
2330 \
2331 stmdb sp!, { r4 - r11, r14 }; \
2332 vshl.u32 rg_dx4, rg_dx, #2; \
2333 \
2334 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2335 vshl.u32 rg_dx8, rg_dx, #3; \
2336 \
2337 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2338 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2339 \
2340 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2341 vmov.u8 d64_1, #1; \
2342 \
2343 vmov.u8 d128_4, #4; \
2344 vmov.u8 d64_128, #128; \
2345 \
2346 vmov.u8 d128_0x7, #0x7; \
2347 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
2348 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
2349 /* 0: per-span loop; one 8-byte edge record, 16 bytes of uvrg and 4 bytes of b per span */ \
2350 0: \
2351 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2352 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2353 \
2354 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 2355 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
2356 /* a span with zero blocks is skipped entirely */ \
2357 cmp span_num_blocks, #0; \
2358 beq 1f; \
2359 /* fb_ptr = vram_out_ptr + (y << 11) + (left_x << 1) */ \
2360 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
2361 add fb_ptr, fb_ptr, y, lsl #11; \
2362 \
2363 ldr b, [ span_b_offset ]; \
2364 vdup.u32 v_left_x, left_x; \
2365 and y, y, #0x3; \
2366 /* dither row is picked by y & 3; b is advanced to left_x (b += b_dx * left_x) */ \
2367 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2368 add fb_ptr, fb_ptr, left_x, lsl #1; \
2369 \
2370 mla b, b_dx, left_x, b; \
2371 and dither_shift, left_x, #0x03; \
2372 /* rg_dx is rebuilt from rg_dx4 on every span; NOTE(review): presumably its register is reused inside the loop - confirm */ \
2373 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
2374 vshr.u32 rg_dx, rg_dx4, #2; \
2375 \
2376 mov dither_shift, dither_shift, lsl #3; \
2377 vmla.u32 rg, rg_dx, v_left_x; \
2378 /* this subs sets the flags consumed by the "beq 3f" further down */ \
2379 subs span_num_blocks, span_num_blocks, #1; \
2380 /* rotate the dither row right by 8 bits per (left_x & 3) */ \
2381 mov dither_row, dither_row, ror dither_shift; \
2382 mov b_dx4, b_dx, lsl #2; \
2383 \
2384 vdup.u32 dither_offsets, dither_row; \
2385 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2386 \
2387 vdup.u32 b_block, b; \
2388 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2389 \
2390 mov b_dx8, b_dx, lsl #3; \
2391 vdup.u32 r_block, rg[0]; \
2392 vdup.u32 g_block, rg[1]; \
2393 /* add the three consecutive 128-bit vectors stored at r_block_span to r, g and b in turn */ \
2394 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2395 \
2396 vadd.u32 r_block, r_block, block_span; \
2397 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2398 \
2399 vadd.u32 g_block, g_block, block_span; \
2400 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2401 \
2402 vadd.u32 b_block, b_block, block_span; \
2403 add block_ptr_b, block_ptr_a, #16; \
2404 /* *_whole_low = block >> 16, *_whole_high = (block + dx4) >> 16; then block += dx8 for the next group */ \
2405 vshrn.u32 r_whole_low, r_block, #16; \
2406 vshrn.u32 g_whole_low, g_block, #16; \
2407 vshrn.u32 b_whole_low, b_block, #16; \
2408 vdup.u32 dx4, rg_dx4[0]; \
2409 \
2410 vaddhn.u32 r_whole_high, r_block, dx4; \
2411 vdup.u32 dx4, rg_dx4[1]; \
2412 \
2413 vaddhn.u32 g_whole_high, g_block, dx4; \
2414 vdup.u32 dx4, b_dx4; \
2415 \
2416 vaddhn.u32 b_whole_high, b_block, dx4; \
2417 vdup.u32 dx8, rg_dx8[0]; \
2418 \
2419 vadd.u32 r_block, r_block, dx8; \
2420 vdup.u32 dx8, rg_dx8[1]; \
2421 \
2422 vadd.u32 g_block, g_block, dx8; \
2423 vdup.u32 dx8, b_dx8; \
2424 \
2425 vadd.u32 b_block, b_block, dx8; \
2426 \
2427 vmovn.u16 r_whole_8, r_whole; \
2428 vmovn.u16 g_whole_8, g_whole; \
2429 vmovn.u16 b_whole_8, b_whole; \
2430 /* flags are still those of the subs above: a single-block span goes straight to the tail */ \
2431 beq 3f; \
2432 /* 2: full blocks - finish the current 8 pixels while computing the next 8 */ \
2433 2: \
2434 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2435 vshrn.u32 r_whole_low, r_block, #16; \
2436 \
2437 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2438 vshrn.u32 g_whole_low, g_block, #16; \
2439 \
2440 vshrn.u32 b_whole_low, b_block, #16; \
2441 /* r is reduced to 5 bits by the shift, g and b by clearing their low 3 bits */ \
2442 vdup.u32 dx4, rg_dx4[0]; \
2443 vshr.u8 r_whole_8, r_whole_8, #3; \
2444 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2445 \
2446 vaddhn.u32 r_whole_high, r_block, dx4; \
2447 vdup.u32 dx4, rg_dx4[1]; \
2448 /* pixels = msb_mask + r * 1 + g * 4 + b * 128 (assuming d64_4 holds 4, as its name says) */ \
2449 vmov pixels, msb_mask; \
2450 vaddhn.u32 g_whole_high, g_block, dx4; \
2451 vdup.u32 dx4, b_dx4; \
2452 \
2453 vaddhn.u32 b_whole_high, b_block, dx4; \
2454 vdup.u32 dx8, rg_dx8[0]; \
2455 \
2456 vmlal.u8 pixels, r_whole_8, d64_1; \
2457 vmlal.u8 pixels, g_whole_8, d64_4; \
2458 vmlal.u8 pixels, b_whole_8, d64_128; \
2459 \
2460 vadd.u32 r_block, r_block, dx8; \
2461 vdup.u32 dx8, rg_dx8[1]; \
2462 \
2463 vadd.u32 g_block, g_block, dx8; \
2464 vdup.u32 dx8, b_dx8; \
2465 \
2466 vadd.u32 b_block, b_block, dx8; \
2467 \
2468 vmovn.u16 r_whole_8, r_whole; \
2469 vmovn.u16 g_whole_8, g_whole; \
2470 vmovn.u16 b_whole_8, b_whole; \
2471 /* write 8 pixels (16 bytes) and post-increment fb_ptr */ \
2472 vst1.u32 { pixels }, [ fb_ptr ]!; \
2473 subs span_num_blocks, span_num_blocks, #1; \
2474 bne 2b; \
2475 /* 3: last block of the span - only part of it may be written */ \
2476 3: \
2477 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2478 \
3867c6ef 2479 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
75e28f62
E
2480 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2481 /* rbit followed by clz yields the index of the lowest set bit of right_mask */ \
2482 vshr.u8 r_whole_8, r_whole_8, #3; \
3867c6ef 2483 rbit right_mask, right_mask; \
75e28f62
E
2484 vmov pixels, msb_mask; \
2485 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
3867c6ef 2486 clz right_mask, right_mask; \
75e28f62
E
2487 /* table dispatch: in ARM state pc reads as the ldr address + 8 (the second nop), so index 1 selects the first .word. NOTE(review): presumably the lowest set bit of right_mask is always in 1..8 - index 0 or > 8 would load a non-table word; confirm against the edge setup code */ \
2488 vmlal.u8 pixels, r_whole_8, d64_1; \
2489 vmlal.u8 pixels, g_whole_8, d64_4; \
2490 vmlal.u8 pixels, b_whole_8, d64_128; \
2491 \
3867c6ef
E
2492 ldr pc, [ pc, right_mask, lsl #2 ]; \
2493 nop; \
2494 nop; \
2495 .word 4f; \
2496 .word 5f; \
2497 .word 6f; \
2498 .word 7f; \
2499 .word 8f; \
2500 .word 9f; \
2501 .word 10f; \
2502 .word 11f; \
2503 /* 4: .. 11: store the first 1 .. 8 pixels of the final block respectively */ \
75e28f62 2504 4: \
3867c6ef
E
2505 vst1.u16 { pixels_low[0] }, [ fb_ptr ]; \
2506 bal 1f; \
2507 \
2508 5: \
2509 vst1.u32 { pixels_low[0] }, [ fb_ptr ]; \
2510 bal 1f; \
2511 \
2512 6: \
2513 vst1.u32 { pixels_low[0] }, [ fb_ptr ]!; \
2514 vst1.u16 { pixels_low[2] }, [ fb_ptr ]; \
2515 bal 1f; \
2516 \
2517 7: \
2518 vst1.u32 { pixels_low }, [ fb_ptr ]; \
2519 bal 1f; \
2520 \
2521 8: \
2522 vst1.u32 { pixels_low }, [ fb_ptr ]!; \
2523 vst1.u16 { pixels_high[0] }, [ fb_ptr ]; \
2524 bal 1f; \
2525 \
2526 9: \
2527 vst1.u32 { pixels_low }, [ fb_ptr ]!; \
2528 vst1.u32 { pixels_high[0] }, [ fb_ptr ]!; \
2529 bal 1f; \
2530 \
2531 10: \
2532 vst1.u32 { pixels_low }, [ fb_ptr ]!; \
2533 vst1.u32 { pixels_high[0] }, [ fb_ptr ]!; \
2534 vst1.u16 { pixels_high[2] }, [ fb_ptr ]; \
2535 bal 1f; \
2536 \
2537 11: \
2538 vst1.u32 { pixels }, [ fb_ptr ]; \
2539 bal 1f; \
75e28f62
E
2540 /* 1: step the three per-span cursors and loop until num_spans reaches zero */ \
2541 1: \
2542 add span_uvrg_offset, span_uvrg_offset, #16; \
2543 add span_b_offset, span_b_offset, #4; \
2544 \
2545 add span_edge_data, span_edge_data, #8; \
2546 subs num_spans, num_spans, #1; \
2547 \
2548 bne 0b; \
2549 \
2550 ldmia sp!, { r4 - r11, pc } \
2551
2552setup_blocks_shaded_untextured_direct_builder(undithered)
2553setup_blocks_shaded_untextured_direct_builder(dithered)
2554
2555
2556#undef psx_gpu
2557#undef num_blocks
2558#undef triangle
2559#undef c_64
2560
2561#define psx_gpu r0
2562#define block_ptr r1
2563#define num_blocks r2
2564#define uv_01 r3
2565#define uv_23 r4
2566#define uv_45 r5
2567#define uv_67 r6
2568#define uv_0 r7
2569#define uv_1 r3
2570#define uv_2 r8
2571#define uv_3 r4
2572#define uv_4 r9
2573#define uv_5 r5
2574#define uv_6 r10
2575#define uv_7 r6
2576#define texture_ptr r11
2577
2578#define pixel_0 r7
2579#define pixel_1 r3
2580#define pixel_2 r8
2581#define pixel_3 r4
2582#define pixel_4 r9
2583#define pixel_5 r5
2584#define pixel_6 r10
2585#define pixel_7 r6
2586
2587#define pixels_a r7
2588#define pixels_b r9
2589#define pixels_c r8
2590#define pixels_d r10
2591
2592#define c_64 r0
2593
2594#define clut_ptr r12
2595#define current_texture_mask r5
2596#define dirty_textures_mask r6
2597
2598#define texels d0
2599
2600#define clut_low_a d2
2601#define clut_low_b d3
2602#define clut_high_a d4
2603#define clut_high_b d5
2604
2605#define clut_a q1
2606#define clut_b q2
2607
2608#define texels_low d6
2609#define texels_high d7
2610
2611.align 3
2612
/* Texturing stage for untextured primitives: does nothing and returns. */
2613function(texture_blocks_untextured)
2614 bx lr
2615
2616
2617.align 3
2618
/*
 * texture_blocks_4bpp
 * In: r0 = psx_gpu.
 * For each of the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) the first 16 bytes hold eight 16-bit
 * texture offsets. Each offset selects one byte relative to
 * texture_page_ptr; that byte is used as an index into the 32-byte CLUT
 * loaded from clut_ptr, and the eight resulting 16-bit texels overwrite the
 * eight offsets in place.
 * If dirty_textures_4bpp_mask & current_texture_mask is non-zero,
 * update_texture_4bpp_cache is called first.
 * The loop subtracts before testing, so num_blocks == 0 is not handled here.
 */
2619function(texture_blocks_4bpp)
2620 stmdb sp!, { r3 - r11, r14 }
2621 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2622
2623 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2624 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2625
 /* Load 16 CLUT entries and de-interleave them: clut_a gets the low bytes,
    clut_b the high bytes, so each half can be used as a vtbl byte table. */
2626 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
2627 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]
2628
2629 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
2630 vuzp.u8 clut_a, clut_b
2631
2632 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
2633 tst dirty_textures_mask, current_texture_mask
2634
 /* c_64 is r0, i.e. it overwrites psx_gpu; psx_gpu is not read again in the
    loop. On the dirty path c_64 is set after the call instead. */
2635 bne 1f
2636 mov c_64, #64
2637
26380:
 /* Four words = eight packed 16-bit offsets; uxtah adds the low (or, with
    ror #16, the high) halfword of each word to texture_ptr. */
2639 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2640
2641 uxtah uv_0, texture_ptr, uv_01
2642 uxtah uv_1, texture_ptr, uv_01, ror #16
2643
2644 uxtah uv_2, texture_ptr, uv_23
2645 uxtah uv_3, texture_ptr, uv_23, ror #16
2646
2647 uxtah uv_4, texture_ptr, uv_45
2648 ldrb pixel_0, [ uv_0 ]
2649
2650 uxtah uv_5, texture_ptr, uv_45, ror #16
2651 ldrb pixel_1, [ uv_1 ]
2652
2653 uxtah uv_6, texture_ptr, uv_67
2654 ldrb pixel_2, [ uv_2 ]
2655
2656 uxtah uv_7, texture_ptr, uv_67, ror #16
2657 ldrb pixel_3, [ uv_3 ]
2658
 /* The flags set by this subs are consumed by the bne at the end of the
    loop; nothing in between modifies them. */
2659 ldrb pixel_4, [ uv_4 ]
2660 subs num_blocks, num_blocks, #1
2661
 /* Pack the eight index bytes into two words (pixels 0-3 and 4-7). */
2662 ldrb pixel_5, [ uv_5 ]
2663 orr pixels_a, pixel_0, pixel_1, lsl #8
2664
2665 ldrb pixel_6, [ uv_6 ]
2666 orr pixels_b, pixel_4, pixel_5, lsl #8
2667
2668 ldrb pixel_7, [ uv_7 ]
2669 orr pixels_a, pixels_a, pixel_2, lsl #16
2670
2671 orr pixels_b, pixels_b, pixel_6, lsl #16
2672 orr pixels_a, pixels_a, pixel_3, lsl #24
2673
2674 orr pixels_b, pixels_b, pixel_7, lsl #24
2675 vmov.u32 texels, pixels_a, pixels_b
2676
 /* Look up the low and high byte of every texel separately; vst2.u8 then
    re-interleaves them into eight 16-bit values and advances block_ptr
    by c_64. */
2677 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
2678 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
2679
2680 vst2.u8 { texels_low, texels_high }, [ block_ptr, :128 ], c_64
2681 bne 0b
2682
2683 ldmia sp!, { r3 - r11, pc }
2684
26851:
 /* Dirty texture cache: only block_ptr (r1) and num_blocks (r2) are saved
    around the call.
    NOTE(review): the CLUT tables in q1/q2 and texture_ptr (r11) are live
    across this call - presumably update_texture_4bpp_cache preserves them;
    confirm, since q0 - q3 are caller-saved under AAPCS. */
2686 stmdb sp!, { r1 - r2 }
2687 bl update_texture_4bpp_cache
2688
2689 mov c_64, #64
2690 ldmia sp!, { r1 - r2 }
2691 bal 0b
2692
2693
2694.align 3
2695
/*
 * texture_blocks_8bpp
 * In: r0 = psx_gpu.
 * For each of the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) the first 16 bytes hold eight 16-bit
 * texture offsets. Each offset selects one byte relative to
 * texture_page_ptr; the byte is doubled and used as a byte offset into the
 * table of 16-bit entries at clut_ptr. The eight texels replace the offsets.
 * If dirty_textures_8bpp_mask & current_texture_mask is non-zero,
 * update_texture_8bpp_cache is called first.
 * The loop subtracts before testing, so num_blocks == 0 is not handled here.
 */
2696function(texture_blocks_8bpp)
2697 stmdb sp!, { r3 - r11, r14 }
2698 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2699
2700 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2701 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2702
2703 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
2704 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
2705
2706 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]
2707 tst dirty_textures_mask, current_texture_mask
2708
2709 bne 1f
2710 nop
2711
27120:
 /* Four words = eight packed 16-bit offsets; uxtah adds the low (or, with
    ror #16, the high) halfword of each word to texture_ptr. */
2713 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2714
2715 uxtah uv_0, texture_ptr, uv_01
2716 uxtah uv_1, texture_ptr, uv_01, ror #16
2717
2718 uxtah uv_2, texture_ptr, uv_23
2719 uxtah uv_3, texture_ptr, uv_23, ror #16
2720
2721 uxtah uv_4, texture_ptr, uv_45
2722 ldrb pixel_0, [ uv_0 ]
2723
2724 uxtah uv_5, texture_ptr, uv_45, ror #16
2725 ldrb pixel_1, [ uv_1 ]
2726
2727 uxtah uv_6, texture_ptr, uv_67
2728 ldrb pixel_2, [ uv_2 ]
2729
2730 uxtah uv_7, texture_ptr, uv_67, ror #16
2731 ldrb pixel_3, [ uv_3 ]
2732
 /* index * 2 = byte offset of the 16-bit CLUT entry */
2733 ldrb pixel_4, [ uv_4 ]
2734 add pixel_0, pixel_0, pixel_0
2735
2736 ldrb pixel_5, [ uv_5 ]
2737 add pixel_1, pixel_1, pixel_1
2738
2739 ldrb pixel_6, [ uv_6 ]
2740 add pixel_2, pixel_2, pixel_2
2741
2742 ldrb pixel_7, [ uv_7 ]
2743 add pixel_3, pixel_3, pixel_3
2744
2745 ldrh pixel_0, [ clut_ptr, pixel_0 ]
2746 add pixel_4, pixel_4, pixel_4
2747
2748 ldrh pixel_1, [ clut_ptr, pixel_1 ]
2749 add pixel_5, pixel_5, pixel_5
2750
2751 ldrh pixel_2, [ clut_ptr, pixel_2 ]
2752 add pixel_6, pixel_6, pixel_6
2753
2754 ldrh pixel_3, [ clut_ptr, pixel_3 ]
2755 add pixel_7, pixel_7, pixel_7
2756
2757 ldrh pixel_4, [ clut_ptr, pixel_4 ]
2758 orr pixels_a, pixel_0, pixel_1, lsl #16
2759
2760 ldrh pixel_5, [ clut_ptr, pixel_5 ]
2761 orr pixels_c, pixel_2, pixel_3, lsl #16
2762
 /* The flags set by this subs are consumed by the bne at the end of the
    loop; nothing in between modifies them. */
2763 ldrh pixel_6, [ clut_ptr, pixel_6 ]
2764 subs num_blocks, num_blocks, #1
2765
2766 ldrh pixel_7, [ clut_ptr, pixel_7 ]
2767 orr pixels_b, pixel_4, pixel_5, lsl #16
2768
 /* stm stores in ascending register number: pixels_a (r7) = texels 0-1,
    pixels_c (r8) = 2-3, pixels_b (r9) = 4-5, pixels_d (r10) = 6-7, which is
    why the list is written a, c, b, d. */
2769 orr pixels_d, pixel_6, pixel_7, lsl #16
2770 stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d }
2771
2772 add block_ptr, block_ptr, #64
2773 bne 0b
2774
2775 ldmia sp!, { r3 - r11, pc }
2776
27771:
 /* Dirty texture cache: block_ptr (r1), num_blocks (r2) and clut_ptr (r12)
    are saved around the call. texture_ptr (r11) is also live across it.
    NOTE(review): presumably the callee preserves r11 as AAPCS requires -
    confirm. */
2778 stmdb sp!, { r1 - r2, r12 }
2779
2780 bl update_texture_8bpp_cache
2781
2782 ldmia sp!, { r1 - r2, r12 }
2783 bal 0b
2784
2785
2786#undef uv_0
2787#undef uv_1
2788#undef uv_2
2789#undef uv_3
2790#undef uv_4
2791#undef uv_5
2792#undef uv_6
2793#undef uv_7
2794
2795#undef pixel_0
2796#undef pixel_1
2797#undef pixel_2
2798#undef pixel_3
2799#undef pixel_4
2800#undef pixel_5
2801#undef pixel_6
2802#undef pixel_7
2803
2804#undef texture_ptr
2805
2806#undef pixels_a
2807#undef pixels_b
2808#undef pixels_c
2809#undef pixels_d
2810
2811#define psx_gpu r0
2812#define block_ptr r1
2813#define num_blocks r2
2814
2815#define uv_0 r3
2816#define uv_1 r4
2817#define u_0 r3
2818#define u_1 r4
2819#define v_0 r5
2820#define v_1 r6
2821
2822#define uv_2 r5
2823#define uv_3 r6
2824#define u_2 r5
2825#define u_3 r6
2826#define v_2 r7
2827#define v_3 r8
2828
2829#define uv_4 r7
2830#define uv_5 r8
2831#define u_4 r7
2832#define u_5 r8
2833#define v_4 r9
2834#define v_5 r10
2835
2836#define uv_6 r9
2837#define uv_7 r10
2838#define u_6 r9
2839#define u_7 r10
2840#define v_6 r11
2841#define v_7 r0
2842
2843#define pixel_0 r3
2844#define pixel_1 r4
2845#define pixel_2 r5
2846#define pixel_3 r6
2847#define pixel_4 r7
2848#define pixel_5 r8
2849#define pixel_6 r9
2850#define pixel_7 r10
2851
2852#define pixels_a r3
2853#define pixels_b r5
2854#define pixels_c r7
2855#define pixels_d r9
2856
2857#define texture_ptr r12
2858
2859
2860.align 3
2861
/*
 * texture_blocks_16bpp
 * In: r0 = psx_gpu.
 * For each of the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) the first 16 bytes hold eight 16-bit
 * values with u in bits 0-7 and v in bits 8-15. Every value is turned into
 * the byte offset ((uv & 0xFF) + ((uv & 0xFF00) << 2)) * 2, i.e.
 * u * 2 + v * 2048, and the halfword at texture_page_ptr + offset replaces
 * it in the block.
 * The loop subtracts before testing, so num_blocks == 0 is not handled here.
 */
2862function(texture_blocks_16bpp)
2863 stmdb sp!, { r3 - r11, r14 }
2864 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2865
2866 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2867 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2868
28690:
 /* The flags set by this subs are consumed by the bne at the very end of
    the loop body; no instruction in between sets flags. */
2870 ldrh uv_0, [ block_ptr ]
2871 subs num_blocks, num_blocks, #1
2872
2873 ldrh uv_1, [ block_ptr, #2 ]
2874
 /* texels 0 and 1: split into v (still shifted left by 8) and u, then
    recombine as u + (v << 2) and double to get a byte offset */
2875 and v_0, uv_0, #0xFF00
2876 and v_1, uv_1, #0xFF00
2877
2878 and u_0, uv_0, #0xFF
2879 and u_1, uv_1, #0xFF
2880
2881 add uv_0, u_0, v_0, lsl #2
2882 ldrh uv_2, [ block_ptr, #4 ]
2883
2884 add uv_1, u_1, v_1, lsl #2
2885 ldrh uv_3, [ block_ptr, #6 ]
2886
2887 add uv_0, uv_0, uv_0
2888 add uv_1, uv_1, uv_1
2889
 /* texels 2 and 3 */
2890 and v_2, uv_2, #0xFF00
2891 and v_3, uv_3, #0xFF00
2892
2893 and u_2, uv_2, #0xFF
2894 and u_3, uv_3, #0xFF
2895
2896 add uv_2, u_2, v_2, lsl #2
2897 ldrh uv_4, [ block_ptr, #8 ]
2898
2899 add uv_3, u_3, v_3, lsl #2
2900 ldrh uv_5, [ block_ptr, #10 ]
2901
2902 add uv_2, uv_2, uv_2
2903 add uv_3, uv_3, uv_3
2904
 /* texels 4 and 5 */
2905 and v_4, uv_4, #0xFF00
2906 and v_5, uv_5, #0xFF00
2907
2908 and u_4, uv_4, #0xFF
2909 and u_5, uv_5, #0xFF
2910
2911 add uv_4, u_4, v_4, lsl #2
2912 ldrh uv_6, [ block_ptr, #12 ]
2913
2914 add uv_5, u_5, v_5, lsl #2
2915 ldrh uv_7, [ block_ptr, #14 ]
2916
 /* texels 6 and 7, interleaved with the fetches of texels 0-5.
    v_7 is r0: psx_gpu is no longer needed once the loop has started. */
2917 add uv_4, uv_4, uv_4
2918 ldrh pixel_0, [ texture_ptr, uv_0 ]
2919
2920 add uv_5, uv_5, uv_5
2921 ldrh pixel_1, [ texture_ptr, uv_1 ]
2922
2923 and v_6, uv_6, #0xFF00
2924 ldrh pixel_2, [ texture_ptr, uv_2 ]
2925
2926 and v_7, uv_7, #0xFF00
2927 ldrh pixel_3, [ texture_ptr, uv_3 ]
2928
2929 and u_6, uv_6, #0xFF
2930 ldrh pixel_4, [ texture_ptr, uv_4 ]
2931
2932 and u_7, uv_7, #0xFF
2933 ldrh pixel_5, [ texture_ptr, uv_5 ]
2934
2935 add uv_6, u_6, v_6, lsl #2
2936 add uv_7, u_7, v_7, lsl #2
2937
2938 add uv_6, uv_6, uv_6
2939 add uv_7, uv_7, uv_7
2940
 /* pack two 16-bit texels per word */
2941 orr pixels_a, pixel_0, pixel_1, lsl #16
2942 orr pixels_b, pixel_2, pixel_3, lsl #16
2943
2944 ldrh pixel_6, [ texture_ptr, uv_6 ]
2945 orr pixels_c, pixel_4, pixel_5, lsl #16
2946
2947 ldrh pixel_7, [ texture_ptr, uv_7 ]
2948 orr pixels_d, pixel_6, pixel_7, lsl #16
2949
 /* pixels_a..d are r3, r5, r7, r9, so the ascending-register stm order
    matches texel order 0-7 */
2950 stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
2951 add block_ptr, block_ptr, #64
2952
2953 bne 0b
2954
2955 ldmia sp!, { r3 - r11, pc }
2956
2957
2958#undef num_blocks
2959
2960#undef test_mask
2961#undef texels
2962#undef pixels_b
2963#undef pixels
2964#undef d64_1
2965#undef d64_4
2966#undef d64_128
2967#undef draw_mask
2968#undef msb_mask
2969#undef msb_mask_low
2970#undef msb_mask_high
2971#undef fb_pixels
2972
2973#undef c_32
2974#undef fb_ptr
2975#undef mask_msb_ptr
2976
2977#define psx_gpu r0
2978#define num_blocks r1
2979#define color_ptr r2
3867c6ef
E
2980#define colors_scalar r2
2981#define colors_scalar_compare r3
75e28f62
E
2982#define mask_msb_ptr r2
2983
2984#define block_ptr_load_a r0
2985#define block_ptr_store r3
2986#define block_ptr_load_b r12
2987#define c_32 r2
2988
2989#define c_48 r4
2990#define fb_ptr r14
2991#define draw_mask_bits_scalar r5
2992
2993#define d128_0x07 q0
2994#define d128_0x1F q1
2995#define d128_0x8000 q2
2996#define test_mask q3
2997#define texels q4
2998#define colors_rg q5
2999#define colors_b_dm_bits q6
3000#define texels_rg q7
3001#define pixels_r q8
3002#define pixels_g q9
3003#define pixels_b q10
3004#define pixels q11
3005#define zero_mask q4
3006#define draw_mask q12
3007#define msb_mask q13
3008
3009#define fb_pixels q8
3010
3011#define pixels_gb_low q9
3012
3013#define colors_r d10
3014#define colors_g d11
3015#define colors_b d12
3016#define draw_mask_bits d13
3017#define texels_r d14
3018#define texels_g d15
3019#define pixels_r_low d16
3020#define pixels_g_low d18
3021#define pixels_b_low d19
3022#define msb_mask_low d26
3023#define msb_mask_high d27
3024
3025#define d64_1 d28
3026#define d64_4 d29
3027#define d64_128 d30
3028#define texels_b d31
3029
/*
 * Building blocks for shade_blocks_textured_modulated_builder. Each family
 * has one variant per value of a builder argument (shading = shaded /
 * unshaded, dithering = dithered / undithered, target = direct / indirect)
 * and is selected by token pasting.
 */

/* indirect target: block_ptr_store walks the block buffer; c_48 is the
   post-increment applied after each pixel store */
3030#define shade_blocks_textured_modulated_prologue_indirect() \
3031 mov c_48, #48; \
3032 add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset \
3033
/* direct target: replicate the 16-bit value at mask_msb into every lane of
   msb_mask */
3034#define shade_blocks_textured_modulated_prologue_direct() \
3035 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3036 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ] \
3037
75e28f62 3038
3867c6ef
E
/* shaded: colors come from the blocks themselves, nothing to set up */
3039#define shade_blocks_textured_modulated_prologue_shaded(dithering, target) \
3040
/* undithered + unshaded: if the triangle color word equals 0x00808080,
   branch to the unmodulated shader for the same target instead */
3041#define shade_blocks_textured_false_modulation_check_undithered(target) \
3042 ldr colors_scalar, [ psx_gpu, #psx_gpu_triangle_color_offset ]; \
3043 movw colors_scalar_compare, #0x8080; \
3044 \
3045 movt colors_scalar_compare, #0x80; \
3046 cmp colors_scalar, colors_scalar_compare; \
3047 beq shade_blocks_textured_unmodulated_##target \
3048
/* dithered: no shortcut is taken */
3049#define shade_blocks_textured_false_modulation_check_dithered(target) \
3050
/* unshaded: broadcast bytes 0, 1 and 2 of the triangle color word into
   colors_r, colors_g and colors_b (colors_r is overwritten last because it
   is also the source register) */
3051#define shade_blocks_textured_modulated_prologue_unshaded(dithering, target) \
3052 shade_blocks_textured_false_modulation_check_##dithering(target); \
75e28f62
E
3053 add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset; \
3054 vld1.u32 { colors_r[] }, [ color_ptr, :32 ]; \
3055 vdup.u8 colors_g, colors_r[1]; \
3056 vdup.u8 colors_b, colors_r[2]; \
3057 vdup.u8 colors_r, colors_r[0] \
3058
3059
/* dithered: preload an accumulator from block_ptr_load_b (the modulate step
   then multiply-accumulates onto it); the "last" form also advances the
   pointer by c_32 */
3060#define shade_blocks_textured_modulated_load_dithered(target) \
3061 vld1.u32 { target }, [ block_ptr_load_b, :128 ] \
3062
3063#define shade_blocks_textured_modulated_load_last_dithered(target) \
3064 vld1.u32 { target }, [ block_ptr_load_b, :128 ], c_32 \
3065
/* undithered: nothing is loaded, only the pointer advance is kept */
3066#define shade_blocks_textured_modulated_load_undithered(target) \
3067
3068#define shade_blocks_textured_modulated_load_last_undithered(target) \
3069 add block_ptr_load_b, block_ptr_load_b, #32 \
3070
/* texel * color: accumulate onto the preloaded value (dithered) or start
   from zero (undithered) */
3071#define shade_blocks_textured_modulate_dithered(channel) \
3072 vmlal.u8 pixels_##channel, texels_##channel, colors_##channel \
3073
3074#define shade_blocks_textured_modulate_undithered(channel) \
3075 vmull.u8 pixels_##channel, texels_##channel, colors_##channel \
3076
3077
/* indirect: write draw_mask to the block buffer (offset argument unused) */
3078#define shade_blocks_textured_modulated_store_draw_mask_indirect(offset) \
3079 vst1.u32 { draw_mask }, [ block_ptr_store, :128 ]! \
3080
/* direct: fetch a framebuffer pointer relative to block_ptr_load_b, read
   the pixels there and copy them into "pixels" wherever draw_mask bits are
   set; offset compensates for how far block_ptr_load_b has advanced at the
   call site */
3081#define shade_blocks_textured_modulated_store_draw_mask_direct(offset) \
3082 ldr fb_ptr, [ block_ptr_load_b, #(offset - 64) ]; \
3083 vld1.u32 { fb_pixels }, [ fb_ptr ]; \
3084 vbit.u16 pixels, fb_pixels, draw_mask \
3085
/* indirect: pixels go to the block buffer, then skip c_48 bytes */
3086#define shade_blocks_textured_modulated_store_pixels_indirect() \
3087 vst1.u32 { pixels }, [ block_ptr_store, :128 ], c_48 \
3088
/* direct: pixels go to the framebuffer address fetched above */
3089#define shade_blocks_textured_modulated_store_pixels_direct() \
3090 vst1.u32 { pixels }, [ fb_ptr ] \
3091
3092
/* shaded: per-block r and g colors are loaded from block_ptr_load_b;
   unshaded only advances the pointer by the same 32 bytes */
3093#define shade_blocks_textured_modulated_load_rg_shaded() \
3094 vld1.u32 { colors_r, colors_g }, [ block_ptr_load_b, :128 ], c_32 \
3095
3096#define shade_blocks_textured_modulated_load_rg_unshaded() \
3097 add block_ptr_load_b, block_ptr_load_b, #32 \
3098
/* shaded: b color and draw-mask bits share one 128-bit load; unshaded reads
   only the draw-mask word at offset 8 and advances by the same 32 bytes */
3099#define shade_blocks_textured_modulated_load_bdm_shaded() \
3100 vld1.u32 { colors_b, draw_mask_bits }, [ block_ptr_load_a, :128 ], c_32 \
3101
3102#define shade_blocks_textured_modulated_load_bdm_unshaded() \
3103 ldr draw_mask_bits_scalar, [ block_ptr_load_a, #8 ]; \
3104 add block_ptr_load_a, block_ptr_load_a, #32 \
3105
/* replicate the low 16 draw-mask bits into all eight lanes of draw_mask */
3106#define shade_blocks_textured_modulated_expand_draw_mask_shaded() \
3107 vdup.u16 draw_mask, draw_mask_bits[0] \
3108
3109#define shade_blocks_textured_modulated_expand_draw_mask_unshaded() \
3110 vdup.u16 draw_mask, draw_mask_bits_scalar \
3111
3112
/* msb_mask is OR-ed into the pixels only when writing to the framebuffer */
3113#define shade_blocks_textured_modulated_apply_msb_mask_indirect() \
3114
3115#define shade_blocks_textured_modulated_apply_msb_mask_direct() \
3116 vorr.u16 pixels, pixels, msb_mask \
3117
3118
/*
 * Generates shade_blocks_<shading>_textured_modulated_<dithering>_<target>.
 *
 * In: r0 = psx_gpu.
 * Every queued block holds eight 16-bit texels. Each texel is split into
 * three 5-bit channels, every channel is multiplied by the matching 8-bit
 * color (per block when shaded, the triangle color when unshaded; the
 * dithered variants accumulate onto values preloaded from the block), and
 * the products are reduced back to 5 bits and re-packed together with the
 * texel's bit 15. Lanes whose texel is zero, or whose draw-mask bit is set,
 * are masked out.
 * The loop is software pipelined: the code before label 0 starts the first
 * block, each iteration packs and stores the previous block while starting
 * the next, and label 1 finishes the last one. num_blocks == 0 is not
 * handled (it is decremented before being tested).
 * The shading prologue runs before anything is pushed, so the unshaded /
 * undithered shortcut can branch to the unmodulated shader with the
 * caller's lr and stack intact.
 */
3119#define shade_blocks_textured_modulated_builder(shading, dithering, target) \
3120.align 3; \
3121 \
3122function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
3867c6ef 3123 shade_blocks_textured_modulated_prologue_##shading(dithering, target); \
75e28f62
E
3124 stmdb sp!, { r4 - r5, lr }; \
3125 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3126 \
3127 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
3128 \
3129 shade_blocks_textured_modulated_prologue_##target(); \
75e28f62
E
3130 /* load_a aliases r0 (psx_gpu); load_b runs 16 bytes ahead; both advance 64 bytes per block in two steps of 32 */ \
3131 add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset; \
3132 mov c_32, #32; \
3133 \
3134 add block_ptr_load_b, block_ptr_load_a, #16; \
3135 vmov.u8 d64_1, #1; \
3136 vmov.u8 d64_4, #4; \
3137 vmov.u8 d64_128, #128; \
3138 /* pipeline fill: begin work on the first block */ \
3139 vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \
3140 vmov.u8 d128_0x07, #0x07; \
3141 \
3142 shade_blocks_textured_modulated_load_rg_##shading(); \
3143 vmov.u8 d128_0x1F, #0x1F; \
3144 \
3145 shade_blocks_textured_modulated_load_bdm_##shading(); \
3146 vmov.u16 d128_0x8000, #0x8000; \
3147 /* narrow texels: r = bits 0-4, g = bits 5-9 (after >> 5), b = bits 10-14 (>> 7, then >> 3 below) */ \
3148 vmovn.u16 texels_r, texels; \
3149 vshrn.u16 texels_g, texels, #5; \
3150 \
3151 vshrn.u16 texels_b, texels, #7; \
3152 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3153 \
3154 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3155 vtst.u16 draw_mask, draw_mask, test_mask; \
3156 \
3157 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3158 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3159 \
3160 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3161 vshr.u8 texels_b, texels_b, #3; \
3162 \
3163 shade_blocks_textured_modulate_##dithering(r); \
3164 shade_blocks_textured_modulate_##dithering(g); \
3165 shade_blocks_textured_modulate_##dithering(b); \
3166 /* keep texel bit 15 in pixels; zero_mask shares q4 with texels, so texels is dead after the vceq */ \
3167 vand.u16 pixels, texels, d128_0x8000; \
3168 vceq.u16 zero_mask, texels, #0; \
3169 /* saturating narrow of product >> 4 to 8 bits; r is then cut to 5 bits by >> 3, g/b by clearing the low 3 bits */ \
3170 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3171 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3172 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3173 \
3174 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3175 vorr.u16 draw_mask, draw_mask, zero_mask; \
3176 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3177 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3178 /* only one block queued: go straight to the final pack and store */ \
3179 subs num_blocks, num_blocks, #1; \
3180 beq 1f; \
3181 \
3182 .align 3; \
3183 /* 0: steady state - pack and store block n while loading and modulating block n + 1 */ \
3184 0: \
3185 vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \
3186 shade_blocks_textured_modulated_load_rg_##shading(); \
3187 vshrn.u16 texels_g, texels, #5; \
3188 \
3189 shade_blocks_textured_modulated_load_bdm_##shading(); \
3190 vshrn.u16 texels_b, texels, #7; \
3191 \
59d15d23 3192 pld [ block_ptr_load_a ]; \
75e28f62
E
3193 vmovn.u16 texels_r, texels; \
3194 vmlal.u8 pixels, pixels_r_low, d64_1; \
3195 /* pixels += r * 1 + g * 4 + b * 128, i.e. r in bits 0-4, g in bits 5-9, b in bits 10-14 */ \
3196 vmlal.u8 pixels, pixels_g_low, d64_4; \
3197 vmlal.u8 pixels, pixels_b_low, d64_128; \
3198 shade_blocks_textured_modulated_store_draw_mask_##target(-4); \
3199 /* offset -4: load_b is 32 bytes further along here than at label 1, where 28 is used */ \
3200 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3201 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3202 \
3203 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3204 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3205 \
3206 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3207 vtst.u16 draw_mask, draw_mask, test_mask; \
3208 \
3209 shade_blocks_textured_modulated_store_pixels_##target(); \
3210 vshr.u8 texels_b, texels_b, #3; \
3211 \
3212 shade_blocks_textured_modulate_##dithering(r); \
3213 shade_blocks_textured_modulate_##dithering(g); \
3214 shade_blocks_textured_modulate_##dithering(b); \
3215 \
3216 vand.u16 pixels, texels, d128_0x8000; \
3217 vceq.u16 zero_mask, texels, #0; \
3218 /* flags from this subs are consumed by the bne at the bottom of the loop */ \
3219 subs num_blocks, num_blocks, #1; \
3220 \
3221 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3222 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3223 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3224 \
3225 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3226 vorr.u16 draw_mask, draw_mask, zero_mask; \
3227 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3228 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3229 \
3230 bne 0b; \
3231 /* 1: pipeline drain - pack and store the last block */ \
3232 1: \
3233 vmlal.u8 pixels, pixels_r_low, d64_1; \
3234 vmlal.u8 pixels, pixels_g_low, d64_4; \
3235 vmlal.u8 pixels, pixels_b_low, d64_128; \
3236 \
3237 shade_blocks_textured_modulated_store_draw_mask_##target(28); \
3238 shade_blocks_textured_modulated_store_pixels_##target(); \
3239 \
3240 ldmia sp!, { r4 - r5, pc } \
3241
3242
3243shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3244shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3245shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3246shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3247
3248shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3249shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3250shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3251shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3252
3253
3254#undef c_64
3255#undef fb_ptr
3256#undef color_ptr
3257
3258#undef color_r
3259#undef color_g
3260#undef color_b
3261
3262#undef test_mask
3263#undef pixels
3264#undef draw_mask
3265#undef zero_mask
3266#undef fb_pixels
3267#undef msb_mask
3268#undef msb_mask_low
3269#undef msb_mask_high
3270
3271#define psx_gpu r0
3272#define num_blocks r1
3273#define mask_msb_ptr r2
3274#define color_ptr r3
3275
3276#define block_ptr_load r0
3277#define draw_mask_store_ptr r3
3278#define draw_mask_bits_ptr r12
3279#define draw_mask_ptr r12
3280#define pixel_store_ptr r14
3281
3282#define fb_ptr_cmp r4
3283
3284#define fb_ptr r3
3285#define fb_ptr_next r14
3286
3287#define c_64 r2
3288
3289#define test_mask q0
3290#define pixels q1
3291#define draw_mask q2
3292#define zero_mask q3
3293#define draw_mask_combined q4
3294#define fb_pixels q5
3295#define fb_pixels_next q6
3296#define msb_mask q7
3297
3298#define draw_mask_low d4
3299#define draw_mask_high d5
3300#define msb_mask_low d14
3301#define msb_mask_high d15
3302
3303.align 3
/*
 * shade_blocks_textured_unmodulated_indirect
 * In: r0 = psx_gpu.
 *
 * For each of the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - the eight 16-bit texels at block offset 0 are copied to offset 16;
 *   - the 16-bit draw-mask bits at offset 40 are replicated to all lanes
 *     and tested against test_mask (first 16 bytes of psx_gpu), then OR-ed
 *     with a mask of the lanes whose texel is zero; the result is written
 *     to block offset 0.
 * The mask store runs one block behind the texel load, because it
 * overwrites the texels it was computed from.
 *
 * r14 doubles as pixel_store_ptr, so the return address has to be saved.
 * It is pushed properly (sp is moved) instead of being parked at
 * [sp, #-4]: AAPCS has no red zone, so anything below sp may be overwritten
 * asynchronously, e.g. by a signal frame. r4 is pushed alongside purely to
 * keep sp 8-byte aligned, as the sibling _direct function does.
 *
 * num_blocks == 0 is not handled (it is decremented before being tested).
 * NOTE(review): draw_mask_combined is q4 (d8/d9), which AAPCS treats as
 * callee-saved; it is not preserved here, like elsewhere in this file -
 * confirm that callers do not rely on it.
 */
function(shade_blocks_textured_unmodulated_indirect)
  stmdb sp!, { r4, r14 }
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  vld1.u32 { test_mask }, [ psx_gpu, :128 ]
  add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset

  /* c_64 is the per-block stride used by every post-indexed access below;
     block_ptr_load aliases r0, so psx_gpu is gone after the next add */
  mov c_64, #64
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  /* pipeline fill: load and test the first block */
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [ draw_mask_bits_ptr, :16 ], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64

  subs num_blocks, num_blocks, #1
  beq 1f

 0:
  /* combine the previous block's masks before they are overwritten */
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vorr.u16 draw_mask_combined, draw_mask, zero_mask

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [ draw_mask_bits_ptr, :16 ], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64

  vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
  subs num_blocks, num_blocks, #1

  bne 0b

 1:
  /* pipeline drain: store the mask of the last block */
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64

  ldmia sp!, { r4, pc }
3349
3350
3351.align 3
3352
/*
 * shade_blocks_textured_unmodulated_direct
 * In: r0 = psx_gpu.
 * For each of the num_blocks queued blocks (64 bytes apart): the eight
 * texels at block offset 0 are OR-ed with msb_mask and written to the
 * framebuffer address stored at block offset 44, except in lanes where the
 * texel is zero or where the draw-mask bits at offset 40 test non-zero
 * against test_mask; those lanes keep the existing framebuffer contents.
 * Framebuffer reads for the next block are issued before the store of the
 * current one unless the two 16-byte spans can overlap.
 * num_blocks == 0 is not handled (it is decremented before being tested).
 */
3353function(shade_blocks_textured_unmodulated_direct)
3354 stmdb sp!, { r4, r14 }
3355 add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3356
3357 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3358 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3359
3360 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
3361 mov c_64, #64
3362
3363 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
3364 add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3365
 /* pipeline fill: mask bits, framebuffer pointer, texels and current
    framebuffer contents of the first block */
3366 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3367 [ draw_mask_bits_ptr, :16 ], c_64
3368 ldr fb_ptr_next, [ block_ptr_load, #44 ]
3369
3370 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3371 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3372 vceq.u16 zero_mask, pixels, #0
3373 vtst.u16 draw_mask, draw_mask, test_mask
3374
3375 subs num_blocks, num_blocks, #1
3376 beq 1f
3377
3378 0:
 /* the block loaded last time becomes the current one */
3379 mov fb_ptr, fb_ptr_next
3380 ldr fb_ptr_next, [ block_ptr_load, #44 ]
3381
3382 vorr.u16 pixels, pixels, msb_mask
3383
3384 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3385 vmov fb_pixels, fb_pixels_next
3386
 /* vbif: take the new pixel wherever the combined mask bit is clear */
3387 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
3388 [ draw_mask_bits_ptr, :16 ], c_64
3389 vbif.u16 fb_pixels, pixels, draw_mask_combined
3390
3391 vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3392
 /* (next - current + 14) <= 28 unsigned, i.e. |next - current| <= 14: the
    two 16-byte spans overlap, so the store must happen before the load */
3393 sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3394 add fb_ptr_cmp, fb_ptr_cmp, #14
3395 cmp fb_ptr_cmp, #28
3396 bls 4f
3397
 /* no overlap: issue the next load ahead of the current store */
3398 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3399 vceq.u16 zero_mask, pixels, #0
3400
3401 vst1.u16 { fb_pixels }, [ fb_ptr ]
3402 vtst.u16 draw_mask, draw_mask, test_mask
3403
3404 3:
3405 subs num_blocks, num_blocks, #1
3406 bne 0b
3407
3408 1:
 /* pipeline drain: merge and store the last block.
    NOTE(review): unlike the loop body, msb_mask is not OR-ed into pixels
    on this path - confirm whether that is intended. */
3409 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3410 vbif.u16 fb_pixels_next, pixels, draw_mask_combined
3411
3412 vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3413
3414 ldmia sp!, { r4, pc }
3415
3416 4:
 /* overlapping spans: store first, then read the updated framebuffer */
3417 vst1.u16 { fb_pixels }, [ fb_ptr ]
3418 vceq.u16 zero_mask, pixels, #0
3419
3420 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3421 vtst.u16 draw_mask, draw_mask, test_mask
3422
3423 bal 3b
3424
3425
/*
 * shade_blocks_unshaded_untextured_indirect
 *
 * Intentionally empty: returns to the caller without touching any state.
 */
3426function(shade_blocks_unshaded_untextured_indirect)
3427 bx lr
3428
3429.align 3
3430
/*
 * shade_blocks_unshaded_untextured_direct( psx_gpu )
 *
 * Fills every queued block with one flat colour vector.  Blocks are 64
 * bytes apart, starting at psx_gpu + psx_gpu_blocks_offset; this routine
 * reads:
 *   +0   the block's 128-bit draw mask (a set bit keeps the framebuffer)
 *   +16  128 bits of colour, loaded ONCE from the first block and reused
 *        for every block, with mask_msb ORed in
 *   +44  the framebuffer pointer for the block
 *
 * The loop is software pipelined in the same way as the textured direct
 * shader: the next framebuffer span is fetched before the current one is
 * stored unless the two spans overlap.  A block count of zero is not
 * handled.  r4 and lr are pushed on entry and popped (lr into pc) on
 * return.
 */
3431function(shade_blocks_unshaded_untextured_direct)
3432 stmdb sp!, { r4, r14 }
3433 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
3434
3435 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3436 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3437
3438 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
3439 add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3440
3441 add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
3442 vld1.u16 { pixels }, [ color_ptr, :128 ]
3443
3444 mov c_64, #64
3445 vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
3446
3447 vorr.u16 pixels, pixels, msb_mask
/* The flags set here are consumed by the beq below; the loads in
 * between do not modify them. */
3448 subs num_blocks, num_blocks, #1
3449
3450 ldr fb_ptr_next, [ block_ptr_load ], #64
3451
3452 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3453 beq 1f
3454
/* Loop: complete the current block while prefetching the next one. */
3455 0:
3456 vmov fb_pixels, fb_pixels_next
3457 mov fb_ptr, fb_ptr_next
3458 ldr fb_ptr_next, [ block_ptr_load ], #64
3459
/* vbif inserts the colour wherever the draw mask bit is clear. */
3460 vbif.u16 fb_pixels, pixels, draw_mask
3461 vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
3462
/*
 * Unsigned (next - current + 14) <= 28: the two 16-byte framebuffer
 * spans overlap, so the store must precede the next load (path 4).
 */
3463 sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3464 add fb_ptr_cmp, fb_ptr_cmp, #14
3465 cmp fb_ptr_cmp, #28
3466 bls 4f
3467
3468 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3469 vst1.u16 { fb_pixels }, [ fb_ptr ]
3470
3471 3:
3472 subs num_blocks, num_blocks, #1
3473 bne 0b
3474
/* Epilogue: commit the last (or only) block. */
3475 1:
3476 vbif.u16 fb_pixels_next, pixels, draw_mask
3477 vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3478
3479 ldmia sp!, { r4, pc }
3480
/* Overlapping spans: store first, then load the next span. */
3481 4:
3482 vst1.u16 { fb_pixels }, [ fb_ptr ]
3483 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3484 bal 3b
3485
3486
/*
 * Register aliases for the blend_blocks_* routines that follow.
 *
 * Core registers: draw_mask_ptr shares r0 with psx_gpu and c_64 shares
 * r2 with msb_mask_ptr, so each routine must finish with the first use
 * of a register before it loads the second.
 */
3487#undef draw_mask_ptr
3488#undef c_64
3489#undef fb_ptr
3490#undef fb_ptr_next
3491#undef fb_ptr_cmp
3492
3493#define psx_gpu r0
3494#define num_blocks r1
3495#define msb_mask_ptr r2
3496#define pixel_ptr r3
3497#define draw_mask_ptr r0
3498#define c_64 r2
3499#define fb_ptr r12
3500#define fb_ptr_next r14
3501#define fb_ptr_cmp r4
3502
3503#undef msb_mask
3504#undef draw_mask
3505#undef pixels
3506#undef fb_pixels
3507#undef d128_0x8000
3508#undef msb_mask_low
3509#undef msb_mask_high
3510#undef draw_mask_next
3511#undef pixels_g
3512#undef blend_pixels
3513#undef fb_pixels_next
3514
/*
 * NEON registers, first group: names used by the average blend and shared
 * by the other blend modes.
 */
3515#define msb_mask q0
3516#define draw_mask q1
3517#define pixels q2
3518#define fb_pixels q3
3519#define blend_pixels q4
3520#define pixels_no_msb q5
3521#define blend_mask q6
3522#define fb_pixels_no_msb q7
3523#define d128_0x8000 q8
3524#define d128_0x0421 q9
3525#define fb_pixels_next q10
3526#define blend_pixels_next q11
3527#define pixels_next q12
3528#define draw_mask_next q13
3529#define write_mask q14
3530
/*
 * Second group: names used by the add / subtract / add-fourth modes.
 * These reuse q5, q7 - q13 from the first group (for example pixels_rb
 * and pixels_no_msb are both q5, and d128_0x1C07 and pixels_next are
 * both q12), so two aliases of the same register cannot hold live values
 * at the same time.
 */
3531#define pixels_rb q5
3532#define pixels_mg q7
3533#define pixels_g q7
3534#define d128_0x7C1F q8
3535#define d128_0x03E0 q9
3536#define fb_pixels_rb q10
3537#define fb_pixels_g q11
3538#define fb_pixels_masked q11
3539#define d128_0x83E0 q15
3540#define pixels_fourth q7
3541#define d128_0x1C07 q12
3542#define d128_0x00E0 q13
3543#define d128_0x80E0 q13
3544
/* The two halves of msb_mask (q0), used as targets of the replicating load. */
3545#define msb_mask_low d0
3546#define msb_mask_high d1
3547
/*
 * Per-variant fragments pasted into blend_blocks_average_builder.  The
 * builder picks one of each pair by token pasting; an empty body means
 * the step does nothing for that variant.
 *
 * Texturing fragments (the untextured bodies are empty):
 *   set_blend_mask  blend_mask lane = all ones when the source texel has
 *                   bit 15 set (signed compare against zero)
 *   set_stp_bit     force bit 15 on in the blended result
 *   combine         lanes whose blend_mask is clear take the source
 *                   texel unblended (vbif inserts where the mask is 0)
 */
3548#define blend_blocks_average_set_blend_mask_textured(source) \
3549 vclt.s16 blend_mask, source, #0 \
3550
3551#define blend_blocks_average_set_stp_bit_textured() \
3552 vorr.u16 blend_pixels, #0x8000 \
3553
3554#define blend_blocks_average_combine_textured(source) \
3555 vbif.u16 blend_pixels, source, blend_mask \
3556
3557#define blend_blocks_average_set_blend_mask_untextured(source) \
3558
3559#define blend_blocks_average_set_stp_bit_untextured() \
3560
3561#define blend_blocks_average_combine_untextured(source) \
3562
/*
 * Mask-evaluate fragments:
 *   mask_set     on:  write_mask lane = all ones when the framebuffer
 *                     pixel in fb_pixels_next has bit 15 set
 *                off: nothing
 *   mask_copy    draw_mask = draw_mask_next, ORed with write_mask when on
 *   mask_copy_b  on:  draw_mask_next |= write_mask, in place
 *                off: nothing
 */
3563#define blend_blocks_average_mask_set_on() \
3564 vclt.s16 write_mask, fb_pixels_next, #0 \
3565
3566#define blend_blocks_average_mask_copy_on() \
3567 vorr.u16 draw_mask, draw_mask_next, write_mask \
3568
3569#define blend_blocks_average_mask_copy_b_on() \
3570 vorr.u16 draw_mask_next, draw_mask_next, write_mask \
3571
3572#define blend_blocks_average_mask_set_off() \
3573
3574#define blend_blocks_average_mask_copy_off() \
3575 vmov draw_mask, draw_mask_next \
3576
3577#define blend_blocks_average_mask_copy_b_off() \
3578
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>( psx_gpu ), which
 * blends every queued block into the framebuffer as (fb + pixel) / 2 per
 * colour channel.
 *
 * Blocks are 64 bytes apart, starting at psx_gpu + psx_gpu_blocks_offset;
 * per block the routine reads the 128-bit draw mask at +0, 128 bits of
 * pixels at +16 and the framebuffer pointer at +44 (pixel_ptr + 28).
 *
 * Arithmetic: with bit 15 cleared from both operands the routine computes
 *     ( fb + pixel - ((fb ^ pixel) & 0x0421) ) >> 1
 * Subtracting the differing low bit of each channel (mask 0x0421) before
 * the halving add keeps one channel from leaking into its neighbour.
 * mask_msb is ORed into the result and lanes whose draw mask is set keep
 * the framebuffer value.  The textured / mask_evaluate specific steps
 * come from the blend_blocks_average_* fragments.
 *
 * Labels: 0 = pipelined loop body, 3 = loop tail, 1 = epilogue for the
 * last block, 2 = path taken when the next 16-byte framebuffer span
 * overlaps the current one (unsigned next - current + 14 <= 28), where
 * the store has to happen before the next span is loaded.
 *
 * A block count of zero is not handled.  r4 and lr are pushed on entry
 * and popped (lr into pc) on return.
 */
3579#define blend_blocks_average_builder(texturing, mask_evaluate) \
3580.align 3; \
3581 \
3582function(blend_blocks_##texturing##_average_##mask_evaluate) \
3583 stmdb sp!, { r4, r14 }; \
3584 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3585 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3586 \
3587 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3588 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3589 \
3590 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3591 mov c_64, #64; \
3592 \
3593 vmov.u16 d128_0x8000, #0x8000; \
3594 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3595 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3596 \
3597 vmov.u16 d128_0x0421, #0x0400; \
3598 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3599 \
3600 vorr.u16 d128_0x0421, #0x0021; \
3601 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
3602 \
3603 veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
3604 vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
3605 vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
3606 vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
3607 blend_blocks_average_mask_set_##mask_evaluate(); \
3608 vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
3609 \
3610 subs num_blocks, num_blocks, #1; \
3611 beq 1f; \
3612 \
3613 0: \
3614 mov fb_ptr, fb_ptr_next; \
3615 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3616 \
3617 vmov pixels, pixels_next; \
3618 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3619 \
3620 vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
3621 \
3622 blend_blocks_average_mask_copy_##mask_evaluate(); \
3623 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3624 \
3625 blend_blocks_average_set_blend_mask_##texturing(pixels); \
3626 blend_blocks_average_set_stp_bit_##texturing(); \
3627 vmov fb_pixels, fb_pixels_next; \
3628 blend_blocks_average_combine_##texturing(pixels); \
3629 \
3630 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3631 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3632 cmp fb_ptr_cmp, #28; \
3633 bls 2f; \
3634 \
3635 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
3636 veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
3637 \
3638 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3639 vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
3640 \
3641 vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
3642 vbif.u16 fb_pixels, blend_pixels, draw_mask; \
3643 \
3644 vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
3645 vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
3646 blend_blocks_average_mask_set_##mask_evaluate(); \
3647 vst1.u16 { fb_pixels }, [ fb_ptr ]; \
3648 \
3649 3: \
3650 subs num_blocks, num_blocks, #1; \
3651 bne 0b; \
3652 \
3653 1: \
3654 blend_blocks_average_mask_copy_b_##mask_evaluate(); \
3655 vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
3656 \
3657 blend_blocks_average_set_blend_mask_##texturing(pixels_next); \
3658 blend_blocks_average_set_stp_bit_##texturing(); \
3659 blend_blocks_average_combine_##texturing(pixels_next); \
3660 \
3661 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3662 vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next; \
3663 vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
3664 \
3665 ldmia sp!, { r4, pc }; \
3666 \
3667 2: \
3668 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3669 vbif.u16 fb_pixels, blend_pixels, draw_mask; \
3670 vst1.u16 { fb_pixels }, [ fb_ptr ]; \
3671 \
3672 vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
3673 veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
3674 vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
3675 vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
3676 vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
3677 vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 /* fb_pixels_next was just reloaded, so write_mask must be rebuilt */ \
 /* from it exactly as the non-overlapping path does; otherwise the */ \
 /* next block would be masked with the previous block's write_mask. */ \
 blend_blocks_average_mask_set_##mask_evaluate(); \
3678 \
3679 bal 3b \
3680
/* Instantiate blend_blocks_{textured,untextured}_average_{off,on}. */
3681blend_blocks_average_builder(textured, off)
3682blend_blocks_average_builder(untextured, off)
3683blend_blocks_average_builder(textured, on)
3684blend_blocks_average_builder(untextured, on)
3685
3686
/*
 * Mask-evaluate fragments used by the add and add-fourth builders:
 *   mask_set   on:  write_mask lane = all ones when the framebuffer pixel
 *                   in fb_pixels has bit 15 set (signed compare < 0)
 *   mask_copy  on:  draw_mask |= write_mask
 * The "off" bodies are empty, so draw_mask is used as loaded.
 */
3687#define blend_blocks_add_mask_set_on() \
3688 vclt.s16 write_mask, fb_pixels, #0 \
3689
3690#define blend_blocks_add_mask_copy_on() \
3691 vorr.u16 draw_mask, draw_mask, write_mask \
3692
3693#define blend_blocks_add_mask_set_off() \
3694
3695#define blend_blocks_add_mask_copy_off() \
3696
3697
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>( psx_gpu ): additive
 * blend (fb + texel, saturating per channel) of every queued block.
 *
 * Blocks are 64 bytes apart, starting at psx_gpu + psx_gpu_blocks_offset;
 * per block the routine reads the 128-bit draw mask at +0, 128 bits of
 * texels at +16 and the framebuffer pointer at +44 (pixel_ptr + 28).
 *
 * Per lane:
 *   - blend_mask is all ones when the texel has bit 15 set.  The
 *     framebuffer pixel is ANDed with blend_mask, so a texel with bit 15
 *     clear is added to zero, i.e. written unblended.
 *   - mask_msb is ORed into the texel before its green field is taken
 *     with 0x83E0, so bit 15 travels with the green sum.
 *   - red/blue (0x7C1F) are summed as 16-bit values and clamped with a
 *     per-byte vmin, green (+ bit 15) is clamped against 0x83E0.
 *   - vbit then re-inserts the framebuffer pixel wherever draw_mask is
 *     set.
 *
 * Labels: 0 = pipelined loop body, 3 = shared tail that finishes the sums
 * for the block just fetched, 1 = epilogue for the last block, 2 = path
 * taken when the next 16-byte framebuffer span overlaps the current one
 * (unsigned next - current + 14 <= 28), where the store must precede the
 * next load.
 *
 * A block count of zero is not handled.  r4 and lr are pushed on entry
 * and popped (lr into pc) on return.
 */
3698#define blend_blocks_add_textured_builder(mask_evaluate) \
3699.align 3; \
3700 \
3701function(blend_blocks_textured_add_##mask_evaluate) \
3702 stmdb sp!, { r4, r14 }; \
3703 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3704 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3705 \
3706 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3707 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3708 \
3709 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3710 mov c_64, #64; \
3711 \
3712 vmov.u16 d128_0x7C1F, #0x7C00; \
3713 vmov.u16 d128_0x03E0, #0x0300; \
3714 vmov.u16 d128_0x83E0, #0x8000; \
3715 vorr.u16 d128_0x03E0, #0x00E0; \
3716 vorr.u16 d128_0x7C1F, #0x001F; \
3717 vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0; \
3718 \
3719 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3720 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3721 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3722 vclt.s16 blend_mask, pixels, #0; \
3723 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3724 blend_blocks_add_mask_set_##mask_evaluate(); \
3725 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3726 \
3727 blend_blocks_add_mask_copy_##mask_evaluate(); \
3728 vorr.u16 pixels, pixels, msb_mask; \
3729 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
3730 vand.u16 pixels_mg, pixels, d128_0x83E0; \
3731 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
3732 vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
3733 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3734 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
3735 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3736 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
3737 \
3738 subs num_blocks, num_blocks, #1; \
3739 beq 1f; \
3740 \
3741 0: \
3742 mov fb_ptr, fb_ptr_next; \
3743 \
3744 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3745 \
3746 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3747 vclt.s16 blend_mask, pixels, #0; \
3748 \
3749 vorr.u16 pixels, pixels, msb_mask; \
3750 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3751 vand.u16 pixels_mg, pixels, d128_0x83E0; \
3752 \
3753 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3754 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3755 \
3756 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3757 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3758 cmp fb_ptr_cmp, #28; \
3759 bls 2f; \
3760 \
3761 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3762 blend_blocks_add_mask_set_##mask_evaluate(); \
3763 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
3764 blend_blocks_add_mask_copy_##mask_evaluate(); \
3765 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3766 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
3767 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3768 \
3769 3: \
3770 vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
3771 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3772 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
3773 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3774 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
3775 \
3776 subs num_blocks, num_blocks, #1; \
3777 bne 0b; \
3778 \
3779 1: \
3780 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3781 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3782 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
3783 \
3784 ldmia sp!, { r4, pc }; \
3785 \
3786 2: \
3787 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3788 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3789 \
3790 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3791 blend_blocks_add_mask_set_##mask_evaluate(); \
3792 vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
3793 blend_blocks_add_mask_copy_##mask_evaluate(); \
3794 vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
3795 bal 3b \
3796
3797
/*
 * blend_blocks_add_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_<mask_evaluate>( psx_gpu ): additive
 * blend (fb + pixel, saturating per channel) of every queued block.
 * Unlike the textured variant there is no per-lane blend mask, so every
 * lane is blended.
 *
 * Blocks are 64 bytes apart, starting at psx_gpu + psx_gpu_blocks_offset;
 * per block the routine reads the 128-bit draw mask at +0, 128 bits of
 * pixels at +16 and the framebuffer pointer at +44 (pixel_ptr + 28).
 *
 * Per lane: red/blue (0x7C1F) and green (0x03E0) are summed separately,
 * red/blue clamped with a per-byte vmin and green with a 16-bit vmin,
 * the halves are ORed together with mask_msb, and vbit re-inserts the
 * framebuffer pixel wherever draw_mask is set.
 *
 * Labels: 0 = pipelined loop body, 3 = shared tail that finishes the sums
 * for the block just fetched, 1 = epilogue for the last block, 2 = path
 * taken when the next 16-byte framebuffer span overlaps the current one
 * (unsigned next - current + 14 <= 28), where the store must precede the
 * next load.
 *
 * A block count of zero is not handled.  r4 and lr are pushed on entry
 * and popped (lr into pc) on return.
 */
3798#define blend_blocks_add_untextured_builder(mask_evaluate) \
3799.align 3; \
3800 \
3801function(blend_blocks_untextured_add_##mask_evaluate) \
3802 stmdb sp!, { r4, r14 }; \
3803 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3804 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3805 \
3806 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3807 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3808 \
3809 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3810 mov c_64, #64; \
3811 \
3812 vmov.u16 d128_0x7C1F, #0x7C00; \
3813 vmov.u16 d128_0x03E0, #0x0300; \
3814 vorr.u16 d128_0x7C1F, #0x001F; \
3815 vorr.u16 d128_0x03E0, #0x00E0; \
3816 \
3817 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3818 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3819 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3820 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3821 blend_blocks_add_mask_set_##mask_evaluate(); \
3822 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3823 \
3824 blend_blocks_add_mask_copy_##mask_evaluate(); \
3825 vand.u16 pixels_g, pixels, d128_0x03E0; \
3826 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3827 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3828 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3829 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3830 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3831 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
3832 \
3833 subs num_blocks, num_blocks, #1; \
3834 beq 1f; \
3835 \
3836 0: \
3837 mov fb_ptr, fb_ptr_next; \
3838 \
3839 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3840 \
3841 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
3842 \
3843 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3844 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3845 vand.u16 pixels_g, pixels, d128_0x03E0; \
3846 \
3847 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3848 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
3849 \
3850 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3851 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3852 cmp fb_ptr_cmp, #28; \
3853 bls 2f; \
3854 \
3855 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3856 blend_blocks_add_mask_set_##mask_evaluate(); \
3857 blend_blocks_add_mask_copy_##mask_evaluate(); \
3858 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3859 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3860 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3861 \
3862 3: \
3863 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3864 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3865 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3866 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3867 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
3868 \
3869 subs num_blocks, num_blocks, #1; \
3870 bne 0b; \
3871 \
3872 1: \
3873 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3874 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3875 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3876 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
3877 \
3878 ldmia sp!, { r4, pc }; \
3879 \
3880 2: \
3881 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3882 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3883 \
3884 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3885 blend_blocks_add_mask_set_##mask_evaluate(); \
3886 blend_blocks_add_mask_copy_##mask_evaluate(); \
3887 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3888 bal 3b \
3889
3890
/* Instantiate blend_blocks_{textured,untextured}_add_{off,on}. */
3891blend_blocks_add_textured_builder(off)
3892blend_blocks_add_textured_builder(on)
3893blend_blocks_add_untextured_builder(off)
3894blend_blocks_add_untextured_builder(on)
3895
/*
 * Per-variant fragments pasted into blend_blocks_subtract_builder.
 *
 * Texturing fragments:
 *   set_blend_mask  textured: blend_mask lane = all ones when the texel
 *                   in pixels_next has bit 15 set; untextured: nothing
 *   combine         textured: lanes whose blend_mask is clear take the
 *                   texel in pixels unblended; untextured: nothing
 *   set_stb         textured: force bit 15 on in the blended result;
 *                   untextured: OR mask_msb into the blended result
 *   msb_mask        textured: pixels = pixels_next | mask_msb (the copy
 *                   later used by combine); untextured: nothing
 */
3896#define blend_blocks_subtract_set_blend_mask_textured() \
3897 vclt.s16 blend_mask, pixels_next, #0 \
3898
3899#define blend_blocks_subtract_combine_textured() \
3900 vbif.u16 blend_pixels, pixels, blend_mask \
3901
3902#define blend_blocks_subtract_set_stb_textured() \
3903 vorr.u16 blend_pixels, #0x8000 \
3904
3905#define blend_blocks_subtract_msb_mask_textured() \
3906 vorr.u16 pixels, pixels_next, msb_mask \
3907
3908#define blend_blocks_subtract_set_blend_mask_untextured() \
3909
3910#define blend_blocks_subtract_combine_untextured() \
3911
3912#define blend_blocks_subtract_set_stb_untextured() \
3913 vorr.u16 blend_pixels, blend_pixels, msb_mask \
3914
3915#define blend_blocks_subtract_msb_mask_untextured() \
3916
3917
/*
 * Mask-evaluate fragments:
 *   mask_set   on:  write_mask lane = all ones when the framebuffer pixel
 *                   in fb_pixels has bit 15 set; off: nothing
 *   mask_copy  draw_mask = draw_mask_next, ORed with write_mask when on
 */
3918#define blend_blocks_subtract_mask_set_on() \
3919 vclt.s16 write_mask, fb_pixels, #0 \
3920
3921#define blend_blocks_subtract_mask_copy_on() \
3922 vorr.u16 draw_mask, draw_mask_next, write_mask \
3923
3924#define blend_blocks_subtract_mask_set_off() \
3925
3926#define blend_blocks_subtract_mask_copy_off() \
3927 vmov draw_mask, draw_mask_next \
3928
3929
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_subtract_<mask_evaluate>( psx_gpu ):
 * subtractive blend (fb - pixel, saturating at zero per channel) of every
 * queued block.
 *
 * Blocks are 64 bytes apart, starting at psx_gpu + psx_gpu_blocks_offset;
 * per block the routine reads the 128-bit draw mask at +0, 128 bits of
 * pixels at +16 and the framebuffer pointer at +44 (pixel_ptr + 28).
 *
 * Per lane: red/blue (0x7C1F) are subtracted with a per-byte saturating
 * vqsub and green (0x03E0) with a 16-bit one, then the halves are ORed
 * together.  The texturing fragments decide how bit 15 is set and, for
 * the textured variant, substitute the unblended texel (with mask_msb)
 * in lanes whose texel has bit 15 clear.  vbit finally re-inserts the
 * framebuffer pixel wherever draw_mask is set.
 *
 * Labels: 0 = pipelined loop body, 3 = loop tail, 1 = epilogue for the
 * last block, 2 = path taken when the next 16-byte framebuffer span
 * overlaps the current one (unsigned next - current + 14 <= 28), where
 * the store must precede the next load.
 *
 * A block count of zero is not handled.  r4 and lr are pushed on entry
 * and popped (lr into pc) on return.
 */
3930#define blend_blocks_subtract_builder(texturing, mask_evaluate) \
3931.align 3; \
3932 \
3933function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
3934 stmdb sp!, { r4, r14 }; \
3935 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3936 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3937 \
3938 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
3939 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
3940 \
3941 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3942 mov c_64, #64; \
3943 \
3944 vmov.u16 d128_0x7C1F, #0x7C00; \
3945 vmov.u16 d128_0x03E0, #0x0300; \
3946 vorr.u16 d128_0x7C1F, #0x001F; \
3947 vorr.u16 d128_0x03E0, #0x00E0; \
3948 \
3949 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3950 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3951 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3952 blend_blocks_subtract_set_blend_mask_##texturing(); \
3953 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3954 blend_blocks_subtract_mask_set_##mask_evaluate(); \
3955 vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
3956 \
3957 vand.u16 pixels_g, pixels_next, d128_0x03E0; \
3958 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3959 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3960 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3961 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3962 \
3963 subs num_blocks, num_blocks, #1; \
3964 beq 1f; \
3965 \
3966 0: \
3967 blend_blocks_subtract_mask_copy_##mask_evaluate(); \
3968 mov fb_ptr, fb_ptr_next; \
3969 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
3970 \
3971 vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
3972 blend_blocks_subtract_msb_mask_##texturing(); \
3973 \
3974 vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
3975 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3976 vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
3977 blend_blocks_subtract_set_stb_##texturing(); \
3978 vand.u16 pixels_g, pixels_next, d128_0x03E0; \
3979 blend_blocks_subtract_combine_##texturing(); \
3980 blend_blocks_subtract_set_blend_mask_##texturing(); \
3981 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
3982 \
3983 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3984 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3985 cmp fb_ptr_cmp, #28; \
3986 bls 2f; \
3987 \
3988 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
3989 blend_blocks_subtract_mask_set_##mask_evaluate(); \
3990 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3991 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3992 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3993 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
3994 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3995 \
3996 3: \
3997 subs num_blocks, num_blocks, #1; \
3998 bne 0b; \
3999 \
4000 1: \
4001 blend_blocks_subtract_mask_copy_##mask_evaluate(); \
4002 \
4003 blend_blocks_subtract_msb_mask_##texturing(); \
4004 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4005 blend_blocks_subtract_set_stb_##texturing(); \
4006 blend_blocks_subtract_combine_##texturing(); \
4007 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4008 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
4009 \
4010 ldmia sp!, { r4, pc }; \
4011 \
4012 2: \
4013 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4014 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4015 blend_blocks_subtract_mask_set_##mask_evaluate(); \
4016 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4017 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4018 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4019 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4020 bal 3b \
4021
4022
/* Instantiate blend_blocks_{textured,untextured}_subtract_{off,on}. */
4023blend_blocks_subtract_builder(textured, off)
4024blend_blocks_subtract_builder(textured, on)
4025blend_blocks_subtract_builder(untextured, off)
4026blend_blocks_subtract_builder(untextured, on)
4027
4028
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_fourth_<mask_evaluate>( psx_gpu ):
 * blends every queued block as fb + texel / 4, saturating per channel.
 *
 * Blocks are 64 bytes apart, starting at psx_gpu + psx_gpu_blocks_offset;
 * per block the routine reads the 128-bit draw mask at +0, 128 bits of
 * texels at +16 and the framebuffer pointer at +44 (pixel_ptr + 28).
 *
 * Per lane:
 *   - the texel is shifted right by two and masked with 0x1C07 / 0x00E0,
 *     leaving each channel divided by four in its own field;
 *   - the quarter texel is added to the framebuffer red/blue (0x7C1F) and
 *     green (0x03E0) fields, clamped with vmin;
 *   - lanes whose texel has bit 15 clear (blend_mask == 0) take the
 *     original texel unblended instead;
 *   - mask_msb is then ORed into every lane, blended or not;
 *   - vbit re-inserts the framebuffer pixel wherever draw_mask is set.
 *
 * NOTE(review): unlike the average and subtract textured paths, blended
 * lanes here receive bit 15 only through mask_msb -- confirm that this is
 * intended.
 *
 * Labels: 0 = pipelined loop body, 3 = shared tail that finishes the sums
 * for the block just fetched, 1 = epilogue for the last block, 2 = path
 * taken when the next 16-byte framebuffer span overlaps the current one
 * (unsigned next - current + 14 <= 28), where the store must precede the
 * next load.
 *
 * A block count of zero is not handled.  r4 and lr are pushed on entry
 * and popped (lr into pc) on return.
 */
4029#define blend_blocks_add_fourth_textured_builder(mask_evaluate) \
4030.align 3; \
4031 \
4032function(blend_blocks_textured_add_fourth_##mask_evaluate) \
4033 stmdb sp!, { r4, r14 }; \
4034 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
4035 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
4036 \
4037 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
4038 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
4039 \
4040 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
4041 mov c_64, #64; \
4042 \
4043 vmov.u16 d128_0x7C1F, #0x7C00; \
4044 vmov.u16 d128_0x03E0, #0x0300; \
75e28f62 4045 vmov.u16 d128_0x1C07, #0x1C00; \
d1c75d1e 4046 vmov.u16 d128_0x00E0, #0x00E0; \
75e28f62
E
4047 vorr.u16 d128_0x7C1F, #0x001F; \
4048 vorr.u16 d128_0x03E0, #0x00E0; \
75e28f62 4049 vorr.u16 d128_0x1C07, #0x0007; \
75e28f62
E
4050 \
4051 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4052 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4053 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4054 vclt.s16 blend_mask, pixels, #0; \
4055 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4056 blend_blocks_add_mask_set_##mask_evaluate(); \
4057 vshr.s16 pixels_fourth, pixels, #2; \
d1c75d1e 4058 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
75e28f62
E
4059 \
4060 blend_blocks_add_mask_copy_##mask_evaluate(); \
d1c75d1e
E
4061 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4062 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4063 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
75e28f62 4064 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
d1c75d1e 4065 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
75e28f62 4066 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
d1c75d1e 4067 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
75e28f62
E
4068 \
4069 subs num_blocks, num_blocks, #1; \
4070 beq 1f; \
4071 \
4072 0: \
4073 mov fb_ptr, fb_ptr_next; \
75e28f62
E
4074 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4075 \
d1c75d1e
E
4076 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4077 vbif.u16 blend_pixels, pixels, blend_mask; \
4078 \
75e28f62
E
4079 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4080 vclt.s16 blend_mask, pixels, #0; \
75e28f62 4081 vshr.s16 pixels_fourth, pixels, #2; \
d1c75d1e 4082 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
75e28f62
E
4083 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4084 \
4085 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4086 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4087 \
4088 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
4089 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4090 cmp fb_ptr_cmp, #28; \
4091 bls 2f; \
4092 \
4093 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4094 blend_blocks_add_mask_set_##mask_evaluate(); \
75e28f62 4095 blend_blocks_add_mask_copy_##mask_evaluate(); \
d1c75d1e
E
4096 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4097 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
75e28f62
E
4098 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4099 \
4100 3: \
d1c75d1e 4101 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
75e28f62 4102 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
d1c75d1e 4103 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
75e28f62 4104 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
d1c75d1e 4105 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
75e28f62
E
4106 \
4107 subs num_blocks, num_blocks, #1; \
4108 bne 0b; \
4109 \
4110 1: \
4111 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
d1c75d1e
E
 /* Substitute the unblended texels first and OR mask_msb afterwards, */ \
 /* the same order the loop body uses, so that unblended lanes of the */ \
 /* last block also receive mask_msb. */ \
4112 vbif.u16 blend_pixels, pixels, blend_mask; \
4113 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
75e28f62
E
4114 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4115 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
4116 \
4117 ldmia sp!, { r4, pc }; \
4118 \
4119 2: \
4120 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
d1c75d1e 4121 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
75e28f62
E
4122 \
4123 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4124 blend_blocks_add_mask_set_##mask_evaluate(); \
75e28f62 4125 blend_blocks_add_mask_copy_##mask_evaluate(); \
d1c75d1e 4126 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
75e28f62
E
4127 bal 3b \
4128
4129
d1c75d1e 4130
75e28f62
E
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_fourth_<mask_evaluate>( psx_gpu ):
 * blends every queued block as fb + pixel / 4, saturating per channel.
 * There is no per-lane blend mask, so every lane is blended.
 *
 * Blocks are 64 bytes apart, starting at psx_gpu + psx_gpu_blocks_offset;
 * per block the routine reads the 128-bit draw mask at +0, 128 bits of
 * pixels at +16 and the framebuffer pointer at +44 (pixel_ptr + 28).
 *
 * Per lane: the pixel is shifted right by two and masked with 0x1C07 /
 * 0x00E0 (each channel divided by four in its own field), added to the
 * framebuffer red/blue (0x7C1F) and green (0x03E0) fields, clamped with
 * vmin, ORed with mask_msb, and finally vbit re-inserts the framebuffer
 * pixel wherever draw_mask is set.
 *
 * Labels: 0 = pipelined loop body, 3 = shared tail that finishes the sums
 * for the block just fetched, 1 = epilogue for the last block, 2 = path
 * taken when the next 16-byte framebuffer span overlaps the current one
 * (unsigned next - current + 14 <= 28), where the store must precede the
 * next load.
 *
 * A block count of zero is not handled.  r4 and lr are pushed on entry
 * and popped (lr into pc) on return.
 */
4131#define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \
4132.align 3; \
4133 \
4134function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
4135 stmdb sp!, { r4, r14 }; \
4136 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
4137 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
4138 \
4139 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
4140 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
4141 \
4142 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
4143 mov c_64, #64; \
4144 \
4145 vmov.u16 d128_0x7C1F, #0x7C00; \
4146 vmov.u16 d128_0x03E0, #0x0300; \
75e28f62
E
4147 vmov.u16 d128_0x1C07, #0x1C00; \
4148 vmov.u16 d128_0x00E0, #0x00E0; \
4149 vorr.u16 d128_0x7C1F, #0x001F; \
4150 vorr.u16 d128_0x03E0, #0x00E0; \
75e28f62
E
4151 vorr.u16 d128_0x1C07, #0x0007; \
4152 \
4153 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4154 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4155 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4156 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4157 blend_blocks_add_mask_set_##mask_evaluate(); \
4158 vshr.s16 pixels_fourth, pixels, #2; \
4159 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4160 \
4161 blend_blocks_add_mask_copy_##mask_evaluate(); \
4162 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4163 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4164 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4165 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4166 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4167 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4168 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4169 \
4170 subs num_blocks, num_blocks, #1; \
4171 beq 1f; \
4172 \
4173 0: \
4174 mov fb_ptr, fb_ptr_next; \
75e28f62
E
4175 ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
4176 \
4177 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
4178 \
4179 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4180 vshr.s16 pixels_fourth, pixels, #2; \
4181 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4182 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4183 \
4184 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4185 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
4186 \
4187 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
4188 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4189 cmp fb_ptr_cmp, #28; \
4190 bls 2f; \
4191 \
4192 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4193 blend_blocks_add_mask_set_##mask_evaluate(); \
4194 blend_blocks_add_mask_copy_##mask_evaluate(); \
4195 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4196 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4197 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4198 \
4199 3: \
4200 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4201 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4202 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4203 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4204 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4205 \
4206 subs num_blocks, num_blocks, #1; \
4207 bne 0b; \
4208 \
4209 1: \
4210 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4211 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4212 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4213 vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
4214 \
4215 ldmia sp!, { r4, pc }; \
4216 \
4217 2: \
4218 vst1.u16 { blend_pixels }, [ fb_ptr ]; \
4219 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4220 \
4221 vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
4222 blend_blocks_add_mask_set_##mask_evaluate(); \
4223 blend_blocks_add_mask_copy_##mask_evaluate(); \
4224 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4225 bal 3b \
4226
4227
/* Instantiate blend_blocks_{textured,untextured}_add_fourth_{off,on}. */
4228blend_blocks_add_fourth_textured_builder(off)
4229blend_blocks_add_fourth_textured_builder(on)
4230blend_blocks_add_fourth_untextured_builder(off)
4231blend_blocks_add_fourth_untextured_builder(on)
4232
4233// TODO: Optimize this more. Need a scene that actually uses it for
4234// confirmation..
4235
4236.align 3
4237
/*
 * blend_blocks_textured_unblended_on( psx_gpu )
 *
 * Writes every queued block to the framebuffer without blending, while
 * protecting framebuffer pixels that already have bit 15 set.
 *
 * Blocks are 64 bytes apart, starting at psx_gpu + psx_gpu_blocks_offset;
 * per block the routine reads the 128-bit draw mask at +0, 128 bits of
 * pixels at +16 and the framebuffer pointer at +44 (pixel_ptr + 28).
 * A lane keeps its framebuffer value when its draw mask is set or the
 * framebuffer pixel has bit 15 set; otherwise it takes the block pixel.
 *
 * Unlike the blending routines, each block is stored before the next
 * framebuffer span is loaded, so no overlap check is needed.
 *
 * NOTE(review): msb_mask is loaded below but never used in this routine;
 * confirm whether the pixels are expected to carry mask_msb already.
 *
 * A block count of zero is not handled.  r4 and lr are pushed on entry
 * and popped (lr into pc) on return.
 */
4238function(blend_blocks_textured_unblended_on)
4239 stmdb sp!, { r4, r14 }
4240 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
4241 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
4242
4243 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
4244 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
4245
4246 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
4247 mov c_64, #64
4248
/* Fetch the first block: framebuffer span, draw mask, pixels. */
4249 ldr fb_ptr, [ pixel_ptr, #28 ]
4250 vld1.u16 { fb_pixels }, [ fb_ptr ]
4251 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
/* write_mask lane = all ones where the framebuffer pixel has bit 15 set. */
4252 vclt.s16 write_mask, fb_pixels, #0
4253 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64
4254
4255 subs num_blocks, num_blocks, #1
4256 beq 1f
4257
/* Loop: commit the current block, then fetch the next one. */
4258 0:
4259 vorr.u16 draw_mask, draw_mask, write_mask
4260 vbif.u16 fb_pixels, pixels, draw_mask
4261 vst1.u16 { fb_pixels }, [ fb_ptr ]
4262
4263 ldr fb_ptr, [ pixel_ptr, #28 ]
4264 vld1.u16 { fb_pixels }, [ fb_ptr ]
4265 vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
4266 vclt.s16 write_mask, fb_pixels, #0
4267 vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64
4268
4269 subs num_blocks, num_blocks, #1
4270 bne 0b
4271
/* Epilogue: commit the last (or only) block. */
4272 1:
4273 vorr.u16 draw_mask, draw_mask, write_mask
4274 vbif.u16 fb_pixels, pixels, draw_mask
4275 vst1.u16 { fb_pixels }, [ fb_ptr ]
4276
4277 ldmia sp!, { r4, pc }
4278
4279
/*
 * blend_blocks_textured_unblended_off
 *
 * Intentionally empty: returns to the caller without touching any state.
 */
4280function(blend_blocks_textured_unblended_off)
4281 bx lr
4282
4283
/*
 * warmup
 *
 * In:  r0 = number of iterations, r1 = start address
 * Performs r0 loads of two NEON registers from r1, advancing r1 by 64
 * bytes after each one; the loaded data is discarded.  Returns at once
 * when r0 is zero.  Clobbers r0, r1, r3, flags and the two destination
 * registers.
 * NOTE(review): presumably used to pull a memory range into the cache --
 * confirm against the callers.
 */
4284function(warmup)
4285 mov r3, #64
4286 cmp r0, #0
4287 bxeq lr
4288
4289 0:
4290 vld1.u32 { u_whole_8, v_whole_8 }, [ r1, :128 ], r3
4291
4292 subs r0, r0, #1
4293 bne 0b
4294
4295 bx lr
4296
/*
 * Register aliases for render_block_fill_body.  pitch shares r1 with
 * color: the colour is consumed by the vdup at the top of the routine
 * before r1 is reused as the row pitch.
 */
6c4a10c4 4297#undef vram_ptr
75e28f62 4298#undef color
6c4a10c4 4299#undef width
75e28f62 4300#undef height
6c4a10c4 4301#undef pitch
75e28f62
E
4302
4303#define vram_ptr r0
6c4a10c4
E
4304#define color r1
4305#define width r2
4306#define height r3
75e28f62 4307
6c4a10c4 4308#define pitch r1
75e28f62 4309
6c4a10c4 4310#define num_width r12
75e28f62 4311
87c45ad1
E
4312#undef colors_a
4313#undef colors_b
75e28f62 4314
87c45ad1
E
4315#define colors_a q0
4316#define colors_b q1
75e28f62
E
4317
4318.align 3
4319
/*
 * render_block_fill_body
 *
 * In:  r0 = vram_ptr (destination), r1 = color, r2 = width in pixels,
 *      r3 = height in rows
 * Fills a width x height rectangle of 16-bit pixels with color.  Each
 * inner iteration stores 32 bytes (16 pixels) with a 256-bit alignment
 * hint and counts width down by 16 until it is exactly zero, so width
 * has to be a non-zero multiple of 16 and height at least 1.
 * After each row the pointer is advanced by pitch = 2048 - width * 2
 * bytes, i.e. consecutive rows start 2048 bytes apart.
 * Both the inner and the outer loop branch back to the same label 0.
 * Clobbers r0, r1, r3, r12, q0, q1 and flags.
 */
4320function(render_block_fill_body)
87c45ad1 4321 vdup.u16 colors_a, color
6c4a10c4 4322 mov pitch, #2048
75e28f62 4323
87c45ad1 4324 vmov colors_b, colors_a
75e28f62 4325 sub pitch, pitch, width, lsl #1
75e28f62 4326
6c4a10c4 4327 mov num_width, width
75e28f62 4328
6c4a10c4
E
4329 0:
4330 vst1.u32 { colors_a, colors_b }, [ vram_ptr, :256 ]!
75e28f62 4331
d1c75d1e 4332 subs num_width, num_width, #16
6c4a10c4 4333 bne 0b
75e28f62 4334
75e28f62 4335 add vram_ptr, vram_ptr, pitch
6c4a10c4
E
4336 mov num_width, width
4337
75e28f62
E
4338 subs height, height, #1
4339 bne 0b
75e28f62 4340
6c4a10c4
E
4341 bx lr
4342
75e28f62
E
4343
4344#undef x
4345#undef y
4346#undef width
4347#undef height
4348#undef fb_ptr
4349#undef texture_mask
4350#undef num_blocks
4351#undef temp
4352#undef dirty_textures_mask
4353#undef clut_ptr
4354#undef current_texture_mask
4355
4356#define psx_gpu r0
4357#define x r1
4358#define y r2
4359#define u r3
4360#define v r4
4361#define width r5
4362#define height r6
4363#define offset_u r8
4364#define offset_v r9
4365#define offset_u_right r10
4366#define width_rounded r11
4367#define height_rounded r12
4368
4369#define texture_offset_base r1
4370#define tile_width r2
4371#define tile_height r3
4372#define num_blocks r4
4373#define block r5
4374#define sub_tile_height r6
4375#define fb_ptr r7
4376#define texture_mask r8
4377#define column_data r9
4378#define texture_offset r10
4379#define tiles_remaining r11
4380#define fb_ptr_advance_column r12
4381#define texture_block_ptr r14
4382
4383#define texture_page_ptr r3
4384#define left_block_mask r4
4385#define right_block_mask r5
4386#define texture_mask_rev r10
4387#define control_mask r11
4388
4389#define dirty_textures_mask r4
4390#define clut_ptr r5
4391#define current_texture_mask r6
4392
4393
4394#undef texels
4395#undef clut_low_a
4396#undef clut_low_b
4397#undef clut_high_a
4398#undef clut_high_b
4399#undef clut_a
4400#undef clut_b
4401#undef texels_low
4402#undef texels_high
4403
4404#define texels d0
4405#define draw_masks_fb_ptrs q1
4406
4407#define draw_mask_fb_ptr_left d2
4408#define draw_mask_fb_ptr_right d3
4409
59d15d23 4410#define draw_mask_fb_ptr_left_a d2
4411#define draw_mask_fb_ptr_left_b d3
4412#define draw_mask_fb_ptr_right_a d10
4413#define draw_mask_fb_ptr_right_b d11
4414#define draw_masks_fb_ptrs2 q5
4415
75e28f62
E
4416#define clut_low_a d4
4417#define clut_low_b d5
4418#define clut_high_a d6
4419#define clut_high_b d7
4420
4421#define block_masks d8
4422#define block_masks_shifted d9
4423
4424#define clut_a q2
4425#define clut_b q3
4426
59d15d23 4427#define texels_low d12
4428#define texels_high d13
75e28f62 4429
59d15d23 4430#define texels_wide_low d14
4431#define texels_wide_high d15
4432#define texels_wide q7
75e28f62
E
4433
4434
/*
 * Helper reached from setup_sprite_tile_add_blocks via blgt.
 * Calls flush_render_block_buffer with q1-q5 and r0-r3, r12, r14 saved and
 * restored around the call, then resets `block` to the start of the block
 * buffer (psx_gpu + psx_gpu_blocks_offset) before returning.
 */
59d15d23 4435setup_sprite_flush_blocks:
4436 vpush { q1 - q5 }
75e28f62
E
4437
4438 stmdb sp!, { r0 - r3, r12, r14 }
4439 bl flush_render_block_buffer
4440 ldmia sp!, { r0 - r3, r12, r14 }
4441
59d15d23 4442 vpop { q1 - q5 }
75e28f62
E
4443
4444 add block, psx_gpu, #psx_gpu_blocks_offset
75e28f62
E
4445 bx lr
4446
4447
/*
 * Calls update_texture_4bpp_cache with r0-r3 preserved across the call;
 * returns by popping the saved lr straight into pc.
 */
4448setup_sprite_update_texture_4bpp_cache:
4449 stmdb sp!, { r0 - r3, r14 }
4450 bl update_texture_4bpp_cache
4451 ldmia sp!, { r0 - r3, pc }
4452
4453
/*
 * Calls update_texture_8bpp_cache with r0-r3 preserved across the call;
 * returns by popping the saved lr straight into pc.
 */
4454setup_sprite_update_texture_8bpp_cache:
4455 stmdb sp!, { r0 - r3, r14 }
4456 bl update_texture_8bpp_cache
4457 ldmia sp!, { r0 - r3, pc }
4458
4459
/*
 * 4bpp sprite prologue: loads 32 bytes from clut_ptr into clut_a/clut_b and
 * de-interleaves them bytewise (vuzp.u8), leaving the even-indexed bytes in
 * clut_a and the odd-indexed bytes in clut_b for the later vtbl lookups.
 * If (current_texture_mask & dirty 4bpp mask) is non-zero the texture cache
 * update helper is called; the flags set by tst are still valid at the blne
 * because the intervening NEON instruction does not modify them.
 */
4460#define setup_sprite_tiled_initialize_4bpp() \
4461 ldr dirty_textures_mask, \
4462 [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]; \
4463 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \
4464 \
4465 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4466 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \
4467 \
4468 tst current_texture_mask, dirty_textures_mask; \
4469 vuzp.u8 clut_a, clut_b; \
4470 \
4471 blne setup_sprite_update_texture_4bpp_cache \
4472
/*
 * 8bpp sprite prologue: calls the 8bpp texture cache update helper when
 * (current_texture_mask & dirty 8bpp mask) is non-zero. No CLUT is loaded here.
 */
4473#define setup_sprite_tiled_initialize_8bpp() \
4474 ldr dirty_textures_mask, \
4475 [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]; \
4476 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4477 \
4478 tst current_texture_mask, dirty_textures_mask; \
4479 blne setup_sprite_update_texture_8bpp_cache \
4480
4481
75e28f62
E
/*
 * Operand fragments giving the number of blocks a tile emits: one block per
 * row (single) or two blocks per row (double). They expand to the flexible
 * second operand of the add/movgt in setup_sprite_tile_add_blocks.
 */
4482#define setup_sprite_block_count_single() \
4483 sub_tile_height \
4484
4485#define setup_sprite_block_count_double() \
4486 sub_tile_height, lsl #1 \
4487
/*
 * Reserves room for the tile about to be emitted: adds its block count to
 * num_blocks and, if the total is greater than MAX_BLOCKS (signed compare),
 * resets num_blocks to just this tile's count and calls
 * setup_sprite_flush_blocks. movgt does not alter the flags, so the blgt
 * still sees the result of the cmp.
 */
4488#define setup_sprite_tile_add_blocks(type) \
4489 add num_blocks, num_blocks, setup_sprite_block_count_##type(); \
4490 cmp num_blocks, #MAX_BLOCKS; \
4491 \
59d15d23 4492 movgt num_blocks, setup_sprite_block_count_##type(); \
4493 blgt setup_sprite_flush_blocks \
75e28f62
E
4494
4495
/*
 * Emits a full-width 4bpp tile: for each of sub_tile_height rows two 64-byte
 * blocks are written, one per 8-pixel half.
 *   - 8 index bytes are loaded from texture_page_ptr + (offset & texture_mask)
 *     (second half uses offset + 8), presumably one CLUT index per byte as
 *     prepared by the texture cache - confirm.
 *   - vtbl looks the indices up in the low-byte and high-byte CLUT tables and
 *     vst2.u8 interleaves the results into 16 bytes at block + 0.
 *   - the draw mask / fb_ptr d-register (fb_ptr inserted into lane 1) is
 *     stored at block + 40; block then advances to the next block (+24).
 * fb_ptr moves +16 bytes for the right half and +(2048 - 16) to the next row.
 * texture_offset advances 0x10 per row and a further 0xF00 after the loop.
 * num_blocks is written back to psx_gpu at the end. `edge` is unused here.
 * Uses local label 4.
 */
4496#define setup_sprite_tile_full_4bpp(edge) \
4497 setup_sprite_tile_add_blocks(double); \
4498 \
4499 4: \
4500 and texture_block_ptr, texture_offset, texture_mask; \
4501 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4502 \
4503 pld [ fb_ptr ]; \
4504 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4505 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4506 \
4507 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4508 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4509 \
4510 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4511 add texture_block_ptr, texture_offset, #8; \
4512 \
4513 and texture_block_ptr, texture_block_ptr, texture_mask; \
4514 add block, block, #40; \
4515 \
4516 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4517 add fb_ptr, fb_ptr, #16; \
4518 \
4519 vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
4520 add block, block, #24; \
4521 \
4522 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4523 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4524 \
4525 pld [ fb_ptr ]; \
4526 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4527 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4528 \
4529 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4530 add block, block, #40; \
4531 \
4532 add texture_offset, texture_offset, #0x10; \
4533 add fb_ptr, fb_ptr, #(2048 - 16); \
4534 \
4535 vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
4536 add block, block, #24; \
4537 \
4538 subs sub_tile_height, sub_tile_height, #1; \
4539 bne 4b; \
4540 \
4541 add texture_offset, texture_offset, #0xF00; \
4542 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4543
4544
/*
 * Emits a half-width (8 pixel) 4bpp tile: one 64-byte block per row, using
 * the draw mask register selected by `edge` (left or right).
 * Per row: load 8 index bytes, look them up in the low/high CLUT tables,
 * interleave to 16 bytes at block + 0, store mask / fb_ptr at block + 40.
 * fb_ptr advances 2048 bytes per row; texture_offset advances 0x10 per row
 * and a further 0xF00 after the loop. num_blocks is written back at the end.
 * NOTE(review): the second "add texture_block_ptr, texture_page_ptr, ..."
 * produces a value that is not read before texture_block_ptr is recomputed
 * at label 4 - it looks like a leftover from the full-width variant.
 */
4545#define setup_sprite_tile_half_4bpp(edge) \
4546 setup_sprite_tile_add_blocks(single); \
4547 \
4548 4: \
4549 and texture_block_ptr, texture_offset, texture_mask; \
4550 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
4551 \
4552 pld [ fb_ptr ]; \
4553 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4554 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4555 \
4556 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4557 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4558 \
4559 vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
4560 add block, block, #40; \
4561 \
4562 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4563 vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
4564 \
4565 add block, block, #24; \
4566 add texture_offset, texture_offset, #0x10; \
4567 \
4568 add fb_ptr, fb_ptr, #2048; \
4569 subs sub_tile_height, sub_tile_height, #1; \
4570 \
4571 bne 4b; \
4572 \
4573 add texture_offset, texture_offset, #0xF00; \
4574 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4575
4576
/*
 * Emits a full-width 8bpp tile: two 64-byte blocks per row. No CLUT lookup is
 * done here; the 8 raw texel bytes are stored at block + 16 and the draw
 * mask / fb_ptr d-register at block + 40.
 * `block` is biased by +16 before the loop so the texel store lands at
 * offset 16, and the bias is removed again after the loop; inside the loop
 * the +24 / +40 steps walk from offset 16 to 40 and on to the next block's
 * offset 16.
 * fb_ptr moves +16 bytes for the right half and +(2048 - 16) to the next row.
 * texture_offset advances 0x10 per row and a further 0xF00 after the loop.
 * `edge` is unused here. Uses local label 4.
 */
4577#define setup_sprite_tile_full_8bpp(edge) \
4578 setup_sprite_tile_add_blocks(double); \
4579 add block, block, #16; \
4580 \
4581 4: \
4582 and texture_block_ptr, texture_offset, texture_mask; \
4583 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4584 \
4585 pld [ fb_ptr ]; \
4586 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4587 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4588 \
4589 add texture_block_ptr, texture_offset, #8; \
4590 vst1.u32 { texels }, [ block, :64 ]; \
4591 \
4592 and texture_block_ptr, texture_block_ptr, texture_mask; \
4593 add block, block, #24; \
4594 \
4595 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4596 \
4597 add fb_ptr, fb_ptr, #16; \
4598 vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
4599 \
4600 add block, block, #40; \
4601 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4602 pld [ fb_ptr ]; \
4603 \
4604 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4605 vst1.u32 { texels }, [ block, :64 ]; \
4606 add block, block, #24; \
4607 \
4608 add texture_offset, texture_offset, #0x10; \
4609 add fb_ptr, fb_ptr, #(2048 - 16); \
4610 \
4611 vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
4612 add block, block, #40; \
4613 \
4614 subs sub_tile_height, sub_tile_height, #1; \
4615 bne 4b; \
4616 \
4617 sub block, block, #16; \
4618 add texture_offset, texture_offset, #0xF00; \
4619 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4620
4621
/*
 * Emits a half-width (8 pixel) 8bpp tile: one 64-byte block per row with the
 * raw texel bytes at block + 16 and the `edge` draw mask / fb_ptr register
 * at block + 40. `block` is biased by +16 for the duration of the loop.
 * fb_ptr advances 2048 bytes per row; texture_offset advances 0x10 per row
 * and a further 0xF00 after the loop. num_blocks is written back at the end.
 */
4622#define setup_sprite_tile_half_8bpp(edge) \
4623 setup_sprite_tile_add_blocks(single); \
4624 add block, block, #16; \
4625 \
4626 4: \
4627 and texture_block_ptr, texture_offset, texture_mask; \
4628 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
4629 pld [ fb_ptr ]; \
4630 \
4631 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4632 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4633 \
4634 vst1.u32 { texels }, [ block, :64 ]; \
4635 add block, block, #24; \
4636 \
4637 vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
4638 add block, block, #40; \
4639 \
4640 add texture_offset, texture_offset, #0x10; \
4641 add fb_ptr, fb_ptr, #2048; \
4642 \
4643 subs sub_tile_height, sub_tile_height, #1; \
4644 bne 4b; \
4645 \
4646 sub block, block, #16; \
4647 add texture_offset, texture_offset, #0xF00; \
4648 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4649
4650
/*
 * Per-column adjustments applied before and after a column of tiles.
 *   pre,  half right: texture_offset = texture_offset_base + 8, fb_ptr += 16
 *   pre,  half left / full: texture_offset = texture_offset_base
 *   post, half right: fb_ptr -= 16 (undoes the pre adjustment)
 *   post, half left / full: nothing
 * The _half(edge) forms dispatch on `edge` by token pasting.
 */
4651#define setup_sprite_tile_column_edge_pre_adjust_half_right() \
4652 add texture_offset, texture_offset_base, #8; \
4653 add fb_ptr, fb_ptr, #16 \
4654
4655#define setup_sprite_tile_column_edge_pre_adjust_half_left() \
4656 mov texture_offset, texture_offset_base \
4657
4658#define setup_sprite_tile_column_edge_pre_adjust_half(edge) \
4659 setup_sprite_tile_column_edge_pre_adjust_half_##edge() \
4660
4661#define setup_sprite_tile_column_edge_pre_adjust_full(edge) \
4662 mov texture_offset, texture_offset_base \
4663
4664#define setup_sprite_tile_column_edge_post_adjust_half_right() \
4665 sub fb_ptr, fb_ptr, #16 \
4666
4667#define setup_sprite_tile_column_edge_post_adjust_half_left() \
4668
4669#define setup_sprite_tile_column_edge_post_adjust_half(edge) \
4670 setup_sprite_tile_column_edge_post_adjust_half_##edge() \
4671
4672#define setup_sprite_tile_column_edge_post_adjust_full(edge) \
4673
4674
/*
 * Emits one column that is a single tile high: sub_tile_height is taken
 * directly from column_data, then the pre-adjust, tile and post-adjust
 * macros selected by edge_mode / texture_mode / x4mode are expanded.
 */
59d15d23 4675#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode, \
4676 x4mode) \
75e28f62 4677 mov sub_tile_height, column_data; \
59d15d23 4678 setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
4679 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
4680 setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge) \
75e28f62 4681
/*
 * Emits one column spanning several tiles vertically. column_data is unpacked
 * as: bits 0-7 = rows in the first tile, bits 8-15 = rows in the last tile,
 * bits 16 and up = number of tiles following the first.
 * Sequence: first tile, then (tiles_remaining - 1) tiles of 16 rows in the
 * loop at label 3, then the last tile at label 2. The tile macro is expanded
 * three times, so this macro is large.
 */
59d15d23 4682#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode, \
4683 x4mode) \
75e28f62
E
4684 and sub_tile_height, column_data, #0xFF; \
4685 mov tiles_remaining, column_data, lsr #16; \
59d15d23 4686 setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
4687 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
75e28f62
E
4688 \
4689 subs tiles_remaining, tiles_remaining, #1; \
4690 beq 2f; \
4691 \
4692 3: \
4693 mov sub_tile_height, #16; \
59d15d23 4694 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
75e28f62
E
4695 subs tiles_remaining, tiles_remaining, #1; \
4696 bne 3b; \
4697 \
4698 2: \
4699 uxtb sub_tile_height, column_data, ror #8; \
59d15d23 4700 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
4701 setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge) \
75e28f62
E
4702
4703
/*
 * Single-tile-high case: column_data is simply the sprite height.
 * Also loads texture_page_ptr, which shares r3 with u / tile_height.
 */
4704#define setup_sprite_column_data_single() \
4705 mov column_data, height; \
4706 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ] \
4707
/*
 * Multi-tile-high case: packs the per-column row counts into column_data as
 *   bits 0-7  : 16 - offset_v                  (rows in the first tile)
 *   bits 8-15 : (height_rounded & 0xF) + 1     (rows in the last tile)
 *   bits 16+  : tile_height - 1                (tiles after the first)
 * tile_height is folded in before texture_page_ptr (same register, r3) is
 * loaded over it.
 */
4708#define setup_sprite_column_data_multi() \
4709 and height_rounded, height_rounded, #0xF; \
4710 rsb column_data, offset_v, #16; \
4711 \
4712 add height_rounded, height_rounded, #1; \
4713 sub tile_height, tile_height, #1; \
4714 \
4715 orr column_data, column_data, tile_height, lsl #16; \
4716 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]; \
4717 \
4718 orr column_data, column_data, height_rounded, lsl #8 \
4719
/*
 * Draw mask setup (non-4x). block_masks holds the left mask in its low 32
 * bits and the right mask in its high 32 bits (see the vmov in the builder).
 * Each macro broadcasts one mask byte across a whole d-register:
 *   left column : bytes 0 / 1 -> draw_mask_fb_ptr_left / _right
 *   right column: bytes 4 / 5 -> draw_mask_fb_ptr_left / _right
 * The _advance_column form additionally computes
 *   fb_ptr_advance_column = 32 - (height << 11),
 * i.e. back up `height` rows of 2048 bytes and step 32 bytes to the right.
 */
59d15d23 4720#define setup_sprite_setup_left_draw_mask_fb_ptr() \
4721 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4722 vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
4723
4724#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column() \
4725 mov fb_ptr_advance_column, #32; \
4726 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4727 \
4728 sub fb_ptr_advance_column, height, lsl #11; \
4729 vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
4730
4731#define setup_sprite_setup_right_draw_mask_fb_ptr() \
4732 vdup.u8 draw_mask_fb_ptr_left, block_masks[4]; \
4733 vdup.u8 draw_mask_fb_ptr_right, block_masks[5] \
4734
/*
 * Defines the jump-table target for sprites one tile wide
 * (label setup_sprite_<tm>_single_<height>_<edge_mode>_<edge><x4mode>).
 * Since the single column is both the left and the right edge, the right
 * mask (high 32 bits of block_masks) is ORed into the left mask before the
 * draw mask registers are set up. Emits the column, then restores
 * r4-r11 and returns through the saved lr.
 */
4735#define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, \
4736 edge, x4mode) \
4737 setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode: \
75e28f62
E
4738 setup_sprite_column_data_##multi_height(); \
4739 vext.32 block_masks_shifted, block_masks, block_masks, #1; \
4740 vorr.u32 block_masks, block_masks, block_masks_shifted; \
59d15d23 4741 setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \
75e28f62 4742 \
59d15d23 4743 setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
75e28f62
E
4744 ldmia sp!, { r4 - r11, pc } \
4745
/*
 * Steps texture_offset_base to the next tile column: adds 0x100 and, when
 * bits 8-11 have wrapped to zero, subtracts 0x1000 (0x100 + 0xF00).
 */
4746#define setup_sprite_tiled_advance_column() \
4747 add texture_offset_base, texture_offset_base, #0x100; \
4748 tst texture_offset_base, #0xF00; \
4749 subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00) \
4750
/*
 * Defines the jump-table target for sprites two or more tiles wide
 * (label setup_sprite_<tm>_multi_<height>_<left_mode>_<right_mode><x4mode>).
 *   1. left column, using the left masks; when left_mode is half it is the
 *      `right` half of that tile that is drawn
 *   2. tile_width - 2 interior full columns with all draw masks cleared
 *      (skipped when there are exactly two columns)
 *   3. right column, using the right masks; when right_mode is half it is
 *      the `left` half of that tile that is drawn
 * After every column except the last, fb_ptr is moved by
 * fb_ptr_advance_column; before every column except the first,
 * texture_offset_base is stepped to the next tile column. Ends by restoring
 * r4-r11 and returning.
 * The flags from "subs tile_width, #2" survive the following add (no S
 * suffix) and are consumed by the beq. Uses local labels 0 and 1.
 */
4751#define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, \
59d15d23 4752 right_mode, x4mode) \
4753 setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
75e28f62 4754 setup_sprite_column_data_##multi_height(); \
75e28f62 4755 \
59d15d23 4756 setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode(); \
75e28f62 4757 \
59d15d23 4758 setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
75e28f62
E
4759 \
4760 subs tile_width, tile_width, #2; \
4761 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4762 \
75e28f62
E
4763 beq 1f; \
4764 \
59d15d23 4765 vmov.u8 draw_masks_fb_ptrs, #0; \
4766 vmov.u8 draw_masks_fb_ptrs2, #0; \
4767 \
75e28f62
E
4768 0: \
4769 setup_sprite_tiled_advance_column(); \
59d15d23 4770 setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode); \
75e28f62
E
4771 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4772 subs tile_width, tile_width, #1; \
4773 bne 0b; \
4774 \
4775 1: \
59d15d23 4776 setup_sprite_setup_right_draw_mask_fb_ptr##x4mode(); \
75e28f62
E
4777 \
4778 setup_sprite_tiled_advance_column(); \
59d15d23 4779 setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
75e28f62
E
4780 ldmia sp!, { r4 - r11, pc } \
4781
4782
/*
 * Non-4x edge mask helpers used by the builder.
 *   offset_u_adjust : nothing to do at native resolution
 *   get_left        : keep bits 0-7 of left_block_mask
 *   get_right       : extract bits 8-15 of right_block_mask
 *   compare_*       : sets Z when the extracted byte is 0xFF, i.e. all eight
 *                     mask bits of that 8-pixel half are set
 */
59d15d23 4783#define setup_sprite_offset_u_adjust() \
4784
4785#define setup_sprite_get_left_block_mask() \
4786 and left_block_mask, left_block_mask, #0xFF \
4787
4788#define setup_sprite_compare_left_block_mask() \
4789 cmp left_block_mask, #0xFF \
4790
4791#define setup_sprite_get_right_block_mask() \
4792 uxtb right_block_mask, right_block_mask, ror #8 \
4793
4794#define setup_sprite_compare_right_block_mask() \
4795 cmp right_block_mask, #0xFF \
4796
4797
4798
4799/* 4x stuff */
/*
 * fb_ptr2 is a scratch framebuffer pointer for the 4x tile macros. It shares
 * a register with column_data, which is why those macros save and restore
 * column_data on the stack around their loops.
 */
4800#define fb_ptr2 column_data
4801
/*
 * 4x edge helpers. Horizontal quantities are doubled:
 *   offset_u_adjust : fb_ptr -= offset_u * 2 (in addition to the subtraction
 *                     already done by the builder), offset_u *= 2,
 *                     offset_u_right = offset_u_right * 2 + 1
 *   get_left/right  : sign-extend the low / high 16 bits of the mask
 *   compare_*       : sets Z when all 16 extracted mask bits are set
 *                     (the sign-extended value is 0xFFFFFFFF)
 */
4802#define setup_sprite_offset_u_adjust_4x() \
4803 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
4804 lsl offset_u_right, #1; \
4805 lsl offset_u, #1; \
4806 add offset_u_right, #1 \
4807
4808#define setup_sprite_get_left_block_mask_4x() \
4809 sxth left_block_mask, left_block_mask \
4810
4811#define setup_sprite_compare_left_block_mask_4x() \
4812 cmp left_block_mask, #0xFFFFFFFF \
4813
4814#define setup_sprite_get_right_block_mask_4x() \
4815 sxth right_block_mask, right_block_mask, ror #16 \
4816
4817#define setup_sprite_compare_right_block_mask_4x() \
4818 cmp right_block_mask, #0xFFFFFFFF \
4819
4820
/*
 * Horizontal 2x expansion. texels_ is copied into both halves of texels_wide
 * and the halves are zipped, so every source element ends up twice in a row:
 *   16bpp: 4 halfwords in  -> 8 halfwords out (a a b b c c d d)
 *   8bpp : 8 bytes in      -> 16 bytes out
 */
4821#define widen_texels_16bpp(texels_) \
4822 vmov texels_wide_low, texels_; \
4823 vmov texels_wide_high, texels_; \
4824 vzip.16 texels_wide_low, texels_wide_high \
4825
4826#define widen_texels_8bpp(texels_) \
4827 vmov texels_wide_low, texels_; \
4828 vmov texels_wide_high, texels_; \
4829 vzip.8 texels_wide_low, texels_wide_high \
4830
/*
 * Writes one 64-byte block: 16 bytes of pixels at block_ + 0, then the draw
 * mask register with fb_ptr_ inserted into its 32-bit lane 1 at block_ + 40.
 * Leaves block_ pointing at the next block.
 */
4831#define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
4832 vst1.u32 { texels_ }, [ block_, :128 ]; \
4833 add block_, block_, #40; \
4834 \
4835 vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
4836 vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ]; \
4837 add block_, block_, #24 \
4838
4839/* assumes 16-byte offset already added to block_ */
/*
 * 8bpp counterpart: 8 texel bytes at (biased) block_ + 0, i.e. offset 16 of
 * the block, then the mask / fb_ptr_ register 24 bytes later (offset 40).
 * Leaves block_ at the same biased position within the next block.
 */
4840#define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
4841 vst1.u32 { texels_ }, [ block_, :64 ]; \
4842 add block_, block_, #24; \
4843 \
4844 vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
4845 vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ]; \
4846 add block_, block_, #40 \
4847
/*
 * Expands one 8-texel source row into four blocks:
 *   texels 0-3 widened -> fb_ptr          and fb_ptr + 2048      (mask a)
 *   texels 4-7 widened -> fb_ptr + 16     and fb_ptr + 16 + 2048 (mask b)
 * so each source texel is written twice horizontally and to two destination
 * addresses 2048 bytes apart. fb_ptr_tmp is clobbered; fb_ptr is unchanged.
 * 16bpp form: expects pixels 0-3 in texels_low and 4-7 in texels_high.
 */
4848#define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
4849 draw_mask_fb_ptr_b_) \
4850 widen_texels_16bpp(texels_low); \
4851 add fb_ptr_tmp, fb_ptr, #1024*2; \
4852 \
4853 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr); \
4854 \
4855 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
4856 widen_texels_16bpp(texels_high); \
4857 \
4858 add fb_ptr_tmp, fb_ptr, #8*2; \
4859 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
4860 \
4861 add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
4862 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp) \
4863
/*
 * 8bpp form: widens the 8 raw texel bytes in `texels` once; the low half of
 * the widened data feeds the first two blocks and the high half the last
 * two. Destination addresses are the same as in the 16bpp form.
 */
4864#define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
4865 draw_mask_fb_ptr_b_) \
4866 widen_texels_8bpp(texels); \
4867 add fb_ptr_tmp, fb_ptr, #1024*2; \
4868 \
4869 write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr); \
4870 write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
4871 \
4872 add fb_ptr_tmp, fb_ptr, #8*2; \
4873 write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
4874 \
4875 add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
4876 write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp) \
4877
4878
/*
 * 4x prologues. The 4bpp form loads and byte-de-interleaves the CLUT exactly
 * like the non-4x one but performs no dirty-texture check or cache update;
 * the 8bpp form is empty.
 * NOTE(review): presumably the caller refreshes the texture cache before
 * calling the 4x entry points - confirm.
 */
4879#define setup_sprite_tiled_initialize_4bpp_4x() \
4880 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \
4881 vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \
4882 \
4883 vuzp.u8 clut_a, clut_b \
4884
4885#define setup_sprite_tiled_initialize_8bpp_4x() \
4886
4887
/*
 * Block counts for 4x tiles: four blocks per source row for a half tile
 * (lsl #2) and eight for a full tile (lsl #3).
 */
4888#define setup_sprite_block_count_single_4x() \
4889 sub_tile_height, lsl #2 \
4890
4891#define setup_sprite_block_count_double_4x() \
4892 sub_tile_height, lsl #(1+2) \
4893
/*
 * 4x version of the full-width 4bpp tile. Per source row: CLUT lookup for
 * the left 8 texels, vzip.8 to form 16-bit pixels (0-3 in texels_low, 4-7 in
 * texels_high), four blocks via do_texture_block_16bpp_4x; then the same for
 * the right 8 texels at fb_ptr + 32. Eight blocks per row in total.
 * fb_ptr then advances (2048 - 16) * 2 bytes to the next source row.
 * column_data is pushed before the loop and popped after it because fb_ptr2
 * (the scratch pointer) lives in the same register; the slot is 8 bytes,
 * which keeps sp 8-byte aligned. `edge` is unused here.
 */
4894#define setup_sprite_tile_full_4bpp_4x(edge) \
4895 setup_sprite_tile_add_blocks(double_4x); \
4896 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
4897 \
4898 4: \
4899 and texture_block_ptr, texture_offset, texture_mask; \
4900 pld [ fb_ptr ]; \
4901 \
4902 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4903 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4904 \
4905 add texture_block_ptr, texture_offset, #8; \
4906 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4907 \
4908 and texture_block_ptr, texture_block_ptr, texture_mask; \
4909 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4910 \
4911 vzip.8 texels_low, texels_high; \
4912 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
4913 draw_mask_fb_ptr_left_b); \
4914 \
4915 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4916 add fb_ptr, fb_ptr, #16*2; \
4917 \
4918 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4919 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4920 \
4921 pld [ fb_ptr ]; \
4922 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4923 \
4924 vzip.8 texels_low, texels_high; \
4925 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
4926 draw_mask_fb_ptr_right_b); \
4927 \
4928 add texture_offset, texture_offset, #0x10; \
4929 add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
4930 \
4931 subs sub_tile_height, sub_tile_height, #1; \
4932 bne 4b; \
4933 \
4934 ldr column_data, [sp], #8; /* fb_ptr2 */ \
4935 add texture_offset, texture_offset, #0xF00; \
4936 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4937
4938
/*
 * 4x version of the half-width 4bpp tile: four blocks per source row, using
 * the `edge`_a / `edge`_b draw mask registers. fb_ptr advances 2048 * 2
 * bytes per source row. column_data is saved around the loop because
 * fb_ptr2 shares its register.
 * NOTE(review): the second "add texture_block_ptr, texture_page_ptr, ..."
 * produces a value that is not read before texture_block_ptr is recomputed
 * at label 4.
 */
4939#define setup_sprite_tile_half_4bpp_4x(edge) \
4940 setup_sprite_tile_add_blocks(single_4x); \
4941 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
4942 \
4943 4: \
4944 and texture_block_ptr, texture_offset, texture_mask; \
4945 pld [ fb_ptr ]; \
4946 \
4947 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4948 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4949 \
4950 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4951 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4952 \
4953 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4954 add texture_offset, texture_offset, #0x10; \
4955 \
4956 vzip.8 texels_low, texels_high; \
4957 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
4958 draw_mask_fb_ptr_##edge##_b); \
4959 \
4960 add fb_ptr, fb_ptr, #2048 * 2; \
4961 subs sub_tile_height, sub_tile_height, #1; \
4962 \
4963 bne 4b; \
4964 \
4965 ldr column_data, [sp], #8; /* fb_ptr2 */ \
4966 add texture_offset, texture_offset, #0xF00; \
4967 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
4968
4969
/*
 * 4x version of the full-width 8bpp tile: eight blocks per source row (four
 * for each 8-texel half), storing widened raw texel bytes at offset 16 of
 * each block. `block` is biased by +16 for the duration of the loop and
 * column_data is saved around it because fb_ptr2 shares its register.
 * fb_ptr moves +32 bytes for the right half and (2048 - 16) * 2 to the next
 * source row. `edge` is unused here.
 */
4970#define setup_sprite_tile_full_8bpp_4x(edge) \
4971 setup_sprite_tile_add_blocks(double_4x); \
4972 add block, block, #16; \
4973 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
4974 \
4975 4: \
4976 and texture_block_ptr, texture_offset, texture_mask; \
4977 pld [ fb_ptr ]; \
4978 \
4979 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4980 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4981 \
4982 add texture_block_ptr, texture_offset, #8; \
4983 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
4984 draw_mask_fb_ptr_left_b); \
4985 \
4986 and texture_block_ptr, texture_block_ptr, texture_mask; \
4987 \
4988 add fb_ptr, fb_ptr, #16*2; \
4989 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4990 \
4991 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
4992 pld [ fb_ptr ]; \
4993 \
4994 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
4995 draw_mask_fb_ptr_right_b); \
4996 \
4997 add texture_offset, texture_offset, #0x10; \
4998 add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
4999 \
5000 subs sub_tile_height, sub_tile_height, #1; \
5001 bne 4b; \
5002 \
5003 sub block, block, #16; \
5004 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5005 add texture_offset, texture_offset, #0xF00; \
5006 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
5007
5008
/*
 * 4x version of the half-width 8bpp tile: four blocks per source row using
 * the `edge`_a / `edge`_b draw mask registers. `block` is biased by +16
 * during the loop; column_data is saved around it because fb_ptr2 shares its
 * register. fb_ptr advances 2048 * 2 bytes per source row.
 */
5009#define setup_sprite_tile_half_8bpp_4x(edge) \
5010 setup_sprite_tile_add_blocks(single_4x); \
5011 add block, block, #16; \
5012 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
5013 \
5014 4: \
5015 and texture_block_ptr, texture_offset, texture_mask; \
5016 pld [ fb_ptr ]; \
5017 \
5018 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
5019 vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
5020 \
5021 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
5022 draw_mask_fb_ptr_##edge##_b); \
5023 \
5024 add texture_offset, texture_offset, #0x10; \
5025 add fb_ptr, fb_ptr, #2048 * 2; \
5026 \
5027 subs sub_tile_height, sub_tile_height, #1; \
5028 bne 4b; \
5029 \
5030 sub block, block, #16; \
5031 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5032 add texture_offset, texture_offset, #0xF00; \
5033 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ] \
5034
5035
/*
 * 4x counterparts of the per-column edge adjustments. The texture offsets
 * are the same as at native resolution; the framebuffer adjustment for a
 * right-half column is doubled to 32 bytes (16 * 2).
 */
5036#define setup_sprite_tile_column_edge_pre_adjust_half_right_4x() \
5037 add texture_offset, texture_offset_base, #8; \
5038 add fb_ptr, fb_ptr, #16 * 2 \
5039
5040#define setup_sprite_tile_column_edge_pre_adjust_half_left_4x() \
5041 mov texture_offset, texture_offset_base \
5042
5043#define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge) \
5044 setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x() \
5045
5046#define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge) \
5047 mov texture_offset, texture_offset_base \
5048
5049#define setup_sprite_tile_column_edge_post_adjust_half_right_4x() \
5050 sub fb_ptr, fb_ptr, #16 * 2 \
5051
5052#define setup_sprite_tile_column_edge_post_adjust_half_left_4x() \
5053
5054#define setup_sprite_tile_column_edge_post_adjust_half_4x(edge) \
5055 setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x() \
5056
5057#define setup_sprite_tile_column_edge_post_adjust_full_4x(edge) \
5058
5059
/*
 * 4x draw mask setup. Each edge uses 32 mask bits (four bytes), one byte per
 * 8-pixel block: bytes 0-3 of block_masks for the left column and bytes 4-7
 * for the right column, broadcast into the left_a / left_b / right_a /
 * right_b registers. The _advance_column form also computes
 *   fb_ptr_advance_column = 64 - (height << 12)
 * ("lsl #11 + 1" is evaluated by the assembler as a shift by 12).
 */
5060#define setup_sprite_setup_left_draw_mask_fb_ptr_4x() \
5061 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
5062 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
5063 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
5064 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
5065
5066#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x() \
5067 mov fb_ptr_advance_column, #32 * 2; \
5068 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
5069 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
5070 sub fb_ptr_advance_column, height, lsl #11 + 1; \
5071 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
5072 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
5073
5074#define setup_sprite_setup_right_draw_mask_fb_ptr_4x() \
5075 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4]; \
5076 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5]; \
5077 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6]; \
5078 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7] \
5079
5080
75e28f62
E
5081// r0: psx_gpu
5082// r1: x
5083// r2: y
5084// r3: u
5085// [ sp ]: v
5086// [ sp + 4 ]: width
5087// [ sp + 8 ]: height
5088// [ sp + 12 ]: color (unused)
5089
/*
 * setup_sprite_tiled_builder(texture_mode, x4mode)
 *
 * Instantiates the 14 specialised column routines for one texture mode and
 * then the public entry point setup_sprite_<texture_mode><x4mode>.
 *
 * Entry point arguments: r0 = psx_gpu, r1 = x, r2 = y, r3 = u, and on the
 * stack v, width, height. Nine registers (36 bytes) are pushed first, so the
 * stack arguments are read from sp + 36 / 40 / 44.
 *
 * The entry point:
 *   - computes fb_ptr = vram_out_ptr + y * 2048 + (x - offset_u) * 2, where
 *     offset_u / offset_v are the low 4 bits of u / v
 *   - derives tile_width / tile_height from (offset + size + 15) >> 4
 *   - builds texture_offset_base and texture_mask with the bit-field inserts
 *     described by the inline comments
 *   - builds the left / right edge masks from offset_u and offset_u_right and
 *     packs them into block_masks (left in the low word, right in the high)
 *   - assembles control_mask and jumps through the word table that follows:
 *       0x1  tile_width  == 1
 *       0x2  tile_height == 1
 *       0x4  left compare reported "all mask bits set"  (left tile is half)
 *       0x8  right compare reported "all mask bits set" (right tile is half)
 *     "ldr pc, [pc, control_mask, lsl #2]" reads pc as the ldr's own address
 *     + 8 in ARM state, so the nop pads the table to start at index 0.
 *
 * Each table target ends with "ldmia sp!, { r4 - r11, pc }", which matches
 * the push done here.
 *
 * NOTE(review): table index 13 is a zero word and there is no word for
 * index 15 (both are single-width with both edge bits set). Presumably these
 * cannot occur for width >= 1 - confirm that callers never pass width 0.
 * NOTE(review): the generated code writes d8-d15 (block_masks, texels_low,
 * texels_wide, ...) but only core registers are saved here; the ARM
 * procedure call standard treats d8-d15 as callee-saved - confirm callers
 * tolerate this.
 */
59d15d23 5090#define setup_sprite_tiled_builder(texture_mode, x4mode) \
5091 \
5092setup_sprite_tile_column_width_multi(texture_mode, multi, full, full, \
5093 x4mode); \
5094setup_sprite_tile_column_width_single(texture_mode, multi, full, none, \
5095 x4mode); \
5096setup_sprite_tile_column_width_multi(texture_mode, single, full, full, \
5097 x4mode); \
5098setup_sprite_tile_column_width_single(texture_mode, single, full, none, \
5099 x4mode); \
5100setup_sprite_tile_column_width_multi(texture_mode, multi, half, full, \
5101 x4mode); \
5102setup_sprite_tile_column_width_single(texture_mode, multi, half, right, \
5103 x4mode); \
5104setup_sprite_tile_column_width_multi(texture_mode, single, half, full, \
5105 x4mode); \
5106setup_sprite_tile_column_width_single(texture_mode, single, half, right, \
5107 x4mode); \
5108setup_sprite_tile_column_width_multi(texture_mode, multi, full, half, \
5109 x4mode); \
5110setup_sprite_tile_column_width_single(texture_mode, multi, half, left, \
5111 x4mode); \
5112setup_sprite_tile_column_width_multi(texture_mode, single, full, half, \
5113 x4mode); \
5114setup_sprite_tile_column_width_single(texture_mode, single, half, left, \
5115 x4mode); \
5116setup_sprite_tile_column_width_multi(texture_mode, multi, half, half, \
5117 x4mode); \
5118setup_sprite_tile_column_width_multi(texture_mode, single, half, half, \
5119 x4mode); \
75e28f62
E
5120 \
5121.align 4; \
5122 \
59d15d23 5123function(setup_sprite_##texture_mode##x4mode) \
75e28f62 5124 stmdb sp!, { r4 - r11, r14 }; \
59d15d23 5125 setup_sprite_tiled_initialize_##texture_mode##x4mode(); \
75e28f62
E
5126 \
5127 ldr v, [ sp, #36 ]; \
5128 and offset_u, u, #0xF; \
5129 \
5130 ldr width, [ sp, #40 ]; \
c1817bd9 5131 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
5132 \
5133 ldr height, [ sp, #44 ]; \
5134 add fb_ptr, fb_ptr, y, lsl #11; \
5135 \
5136 add fb_ptr, fb_ptr, x, lsl #1; \
5137 and offset_v, v, #0xF; \
5138 \
5139 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
5140 add width_rounded, offset_u, width; \
5141 \
5142 add height_rounded, offset_v, height; \
5143 add width_rounded, width_rounded, #15; \
5144 \
5145 add height_rounded, height_rounded, #15; \
5146 mov tile_width, width_rounded, lsr #4; \
5147 \
5148 /* texture_offset_base = VH-VL-00-00 */\
5149 mov texture_offset_base, v, lsl #8; \
5150 and offset_u_right, width_rounded, #0xF; \
5151 \
5152 /* texture_offset_base = VH-UH-UL-00 */\
5153 bfi texture_offset_base, u, #4, #8; \
59d15d23 5154 mov right_block_mask, #0xFFFFFFFE; \
5155 \
5156 setup_sprite_offset_u_adjust##x4mode(); \
75e28f62
E
5157 \
5158 /* texture_offset_base = VH-UH-VL-00 */\
5159 bfi texture_offset_base, v, #4, #4; \
59d15d23 5160 mov left_block_mask, #0xFFFFFFFF; \
75e28f62
E
5161 \
5162 mov tile_height, height_rounded, lsr #4; \
5163 mvn left_block_mask, left_block_mask, lsl offset_u; \
5164 \
5165 /* texture_mask = HH-HL-WH-WL */\
5166 ldrh texture_mask, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]; \
5167 mov right_block_mask, right_block_mask, lsl offset_u_right; \
5168 \
5169 /* texture_mask_rev = WH-WL-HH-HL */\
5170 rev16 texture_mask_rev, texture_mask; \
5171 vmov block_masks, left_block_mask, right_block_mask; \
5172 \
5173 /* texture_mask = HH-HL-HL-WL */\
5174 bfi texture_mask, texture_mask_rev, #4, #4; \
5175 /* texture_mask_rev = 00-00-00-WH */\
5176 mov texture_mask_rev, texture_mask_rev, lsr #12; \
5177 \
5178 /* texture_mask = HH-WH-HL-WL */\
5179 bfi texture_mask, texture_mask_rev, #8, #4; \
59d15d23 5180 setup_sprite_get_left_block_mask##x4mode(); \
75e28f62
E
5181 \
5182 mov control_mask, #0; \
59d15d23 5183 setup_sprite_compare_left_block_mask##x4mode(); \
75e28f62 5184 \
59d15d23 5185 setup_sprite_get_right_block_mask##x4mode(); \
75e28f62
E
5186 orreq control_mask, control_mask, #0x4; \
5187 \
5188 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
59d15d23 5189 setup_sprite_compare_right_block_mask##x4mode(); \
75e28f62
E
5190 \
5191 orreq control_mask, control_mask, #0x8; \
5192 cmp tile_width, #1; \
5193 \
5194 add block, psx_gpu, #psx_gpu_blocks_offset; \
5195 orreq control_mask, control_mask, #0x1; \
5196 \
5197 cmp tile_height, #1; \
5198 add block, block, num_blocks, lsl #6; \
5199 \
5200 orreq control_mask, control_mask, #0x2; \
5201 ldr pc, [ pc, control_mask, lsl #2 ]; \
5202 nop; \
5203 \
59d15d23 5204 .word setup_sprite_##texture_mode##_multi_multi_full_full##x4mode; \
5205 .word setup_sprite_##texture_mode##_single_multi_full_none##x4mode; \
5206 .word setup_sprite_##texture_mode##_multi_single_full_full##x4mode; \
5207 .word setup_sprite_##texture_mode##_single_single_full_none##x4mode; \
5208 .word setup_sprite_##texture_mode##_multi_multi_half_full##x4mode; \
5209 .word setup_sprite_##texture_mode##_single_multi_half_right##x4mode; \
5210 .word setup_sprite_##texture_mode##_multi_single_half_full##x4mode; \
5211 .word setup_sprite_##texture_mode##_single_single_half_right##x4mode; \
5212 .word setup_sprite_##texture_mode##_multi_multi_full_half##x4mode; \
5213 .word setup_sprite_##texture_mode##_single_multi_half_left##x4mode; \
5214 .word setup_sprite_##texture_mode##_multi_single_full_half##x4mode; \
5215 .word setup_sprite_##texture_mode##_single_single_half_left##x4mode; \
5216 .word setup_sprite_##texture_mode##_multi_multi_half_half##x4mode; \
75e28f62 5217 .word 0x00000000; \
59d15d23 5218 .word setup_sprite_##texture_mode##_multi_single_half_half##x4mode; \
5219
5220
5221setup_sprite_tiled_builder(4bpp,);
5222setup_sprite_tiled_builder(8bpp,);
75e28f62 5223
59d15d23 5224#undef draw_mask_fb_ptr_left
5225#undef draw_mask_fb_ptr_right
75e28f62 5226
59d15d23 5227setup_sprite_tiled_builder(4bpp, _4x);
5228setup_sprite_tiled_builder(8bpp, _4x);
75e28f62
E
5229
5230
5231#undef block_ptr
5232#undef num_blocks
5233#undef clut_ptr
5234
5235#define psx_gpu r0
5236#define block_ptr r0
5237#define num_blocks r1
5238#define clut_ptr r2
5239#define texel_shift_mask r3
5240#define block_pixels_a r4
5241#define block_pixels_b r5
5242#define texel_0 r6
5243#define texel_2 r7
5244#define texel_4 r8
5245#define texel_6 r9
5246#define texel_1 r10
5247#define texel_3 r11
5248#define texel_5 r12
5249#define texel_7 r14
5250#define texels_01 r6
5251#define texels_23 r7
5252#define texels_45 r8
5253#define texels_67 r9
5254
/*
 * texture_sprite_blocks_8bpp
 *   r0: psx_gpu
 * Converts every queued 64-byte block from 8-bit texel indices to 16-bit
 * pixels: the 8 index bytes at block + 16..23 are each looked up in the CLUT
 * (halfword table at clut_ptr, byte offset = index * 2, obtained with the
 * 0x1FE mask on a shifted copy) and the 8 resulting halfwords are written to
 * block + 0..15.
 *
 * block_ptr shares r0 with psx_gpu; num_blocks and clut_ptr are read from
 * psx_gpu before r0 is repointed at the block array.
 * The first index word of the following block is loaded early
 * (block_ptr + 64 + 16), so one word past the last processed block is read
 * on the final iteration.
 * The loop tests num_blocks only after the body has run, so it expects
 * num_blocks >= 1 on entry.
 * Saves and restores r4-r11; r12 and r14 are used as scratch (lr is restored
 * into pc on return).
 */
5255function(texture_sprite_blocks_8bpp)
5256 stmdb sp!, { r4 - r11, r14 }
5257 movw texel_shift_mask, #(0xFF << 1)
5258
5259 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5260 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
5261
5262 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
5263 ldr block_pixels_a, [ block_ptr, #16 ]
5264
5265 0:
5266 and texel_0, texel_shift_mask, block_pixels_a, lsl #1
5267 ldr block_pixels_b, [ block_ptr, #20 ]
5268
5269 and texel_1, texel_shift_mask, block_pixels_a, lsr #7
5270 ldrh texel_0, [ clut_ptr, texel_0 ]
5271
5272 and texel_2, texel_shift_mask, block_pixels_a, lsr #15
5273 ldrh texel_1, [ clut_ptr, texel_1 ]
5274
5275 and texel_3, texel_shift_mask, block_pixels_a, lsr #23
5276 ldr block_pixels_a, [ block_ptr, #(64 + 16) ]
5277
5278 ldrh texel_2, [ clut_ptr, texel_2 ]
5279 and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5280
5281 ldrh texel_3, [ clut_ptr, texel_3 ]
5282 and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5283
5284 ldrh texel_4, [ clut_ptr, texel_4 ]
5285 and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5286
5287 ldrh texel_5, [ clut_ptr, texel_5 ]
5288 and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5289
5290 ldrh texel_6, [ clut_ptr, texel_6 ]
5291 orr texels_01, texel_0, texel_1, lsl #16
5292
5293 ldrh texel_7, [ clut_ptr, texel_7 ]
5294 orr texels_23, texel_2, texel_3, lsl #16
5295
5296 orr texels_45, texel_4, texel_5, lsl #16
5297 str texels_01, [ block_ptr, #0 ]
5298
5299 orr texels_67, texel_6, texel_7, lsl #16
5300 str texels_23, [ block_ptr, #4 ]
5301
5302 subs num_blocks, num_blocks, #1
5303 str texels_45, [ block_ptr, #8 ]
5304
5305 str texels_67, [ block_ptr, #12 ]
5306 add block_ptr, block_ptr, #64
5307
5308 bne 0b
5309
5310 ldmia sp!, { r4 - r11, pc }
5311
5312
// Register aliases for setup_sprite_16bpp and setup_sprite_16bpp_4x.
// Several names map to the same register; they are used in different
// phases of the functions (arguments first, loop state later).
#undef width_rounded
#undef texture_mask
#undef num_blocks
#undef texture_offset
#undef texels_low
#undef texels_high
#undef texels_wide_low
#undef texels_wide_high
#undef texels_wide
#undef fb_ptr2

// Incoming arguments and values derived from them during setup.
#define psx_gpu r0
#define x r1
#define y r2
#define u r3
#define v r4
#define width r5
#define height r6
#define left_offset r8
#define width_rounded r9
#define right_width r10

#define block_width r11

// Loop state.
#define texture_offset_base r1
#define texture_mask r2
#define texture_page_ptr r3
#define num_blocks r4
#define block r5
#define fb_ptr r7
#define texture_offset r8
#define blocks_remaining r9
#define fb_ptr2 r10
#define fb_ptr_pitch r12
#define texture_block_ptr r14

// Setup temporaries.
#define texture_mask_width r2
#define texture_mask_height r3
#define left_mask_bits r4
#define right_mask_bits r5


#undef block_masks
#undef block_masks_shifted
#undef texels

// NEON aliases.  texels (q2) overlaps texels_low/texels_high (d4/d5), and
// draw_mask_fb_ptr overlaps draw_mask_fb_ptr_a (both d2).
#define block_masks d0
#define block_masks_shifted d1
#define draw_mask_fb_ptr d2
#define texels q2

#define draw_mask_fb_ptr_a d2
#define draw_mask_fb_ptr_b d3
#define texels_low d4
#define texels_high d5
#define texels_wide_low d6
#define texels_wide_high d7
#define texels_wide q3
75e28f62 5371
75e28f62 5372
// setup_sprites_16bpp_flush
//
// File-local helper reached with blgt from setup_sprite_16bpp and
// setup_sprite_16bpp_4x when the pending block count would exceed
// MAX_BLOCKS.  It calls flush_render_block_buffer with r0 (psx_gpu)
// unchanged, then restarts the block list:
//   block      = first block in psx_gpu
//   num_blocks = block_width
//
// Preserved across the call: r0-r3, r12, lr, d0-d3.  Flags are not
// preserved; both visible callers set them again before branching.
//
// Stack alignment: both visible callers enter here after pushing nine
// words, so sp is 4 mod 8 on entry (assuming they were themselves entered
// with an 8-byte aligned sp).  The 32-byte vpush keeps that residue, so
// the core register list must have an odd number of words for sp to be
// 8-byte aligned at the external call, as AAPCS requires.  r4 is included
// purely as padding: it is num_blocks and is overwritten below.
setup_sprites_16bpp_flush:
  vpush { d0 - d3 }

  stmdb sp!, { r0 - r4, r12, r14 }
  bl flush_render_block_buffer
  ldmia sp!, { r0 - r4, r12, r14 }

  vpop { d0 - d3 }

  add block, psx_gpu, #psx_gpu_blocks_offset
  mov num_blocks, block_width

  bx lr
5386
// setup_sprite_16bpp
//
// Queues render blocks for a 16bpp textured sprite, one block per 8-pixel
// column segment per scanline.
//
// In:    r0 = psx_gpu, r1 = x, r2 = y, r3 = u
//        first three stack arguments = v, width, height
// Saves: r4-r11, lr.  Clobbers r0-r3, r12, d0-d5, flags.
//
// Derived values:
//   left_offset         = u & 7
//   fb_ptr              = vram_out_ptr + y * 2048 + x * 2 - left_offset * 2
//   texture_offset_base = (u * 2 + v * 2048) & ~0xF
//   width_rounded       = width + 7 + left_offset
//   block_width         = width_rounded >> 3    (blocks per scanline)
//   left_mask_bits      = ~(0xFF << left_offset)          (bits below the
//                         sprite start are set)
//   right_mask_bits     = 0xFE << (width_rounded & 7)     (bits above the
//                         sprite end are set in the low byte)
//   texture_mask        = mask_width * 2 + mask_height * 2048
//
// Each 64-byte block receives 16 bytes of texels at offset 0 and, at
// offset 40, a doubleword whose low word holds the replicated mask byte
// and whose high word holds the framebuffer pointer.
//
// height and width are not validated; each loop body runs before its
// counter is tested.
function(setup_sprite_16bpp)
  stmdb sp!, { r4 - r11, r14 }
  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]

  // Nine words were pushed, so the stack arguments start at sp + 36.
  ldr v, [ sp, #36 ]
  add fb_ptr, fb_ptr, y, lsl #11

  ldr width, [ sp, #40 ]
  add fb_ptr, fb_ptr, x, lsl #1

  ldr height, [ sp, #44 ]
  and left_offset, u, #0x7

  add texture_offset_base, u, u
  add width_rounded, width, #7

  add texture_offset_base, v, lsl #11
  // left_mask_bits shares r4 with v, which has just been consumed.
  mov left_mask_bits, #0xFF

  ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
  add width_rounded, width_rounded, left_offset

  ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
  sub fb_ptr, fb_ptr, left_offset, lsl #1

  add texture_mask, texture_mask_width, texture_mask_width
  mov right_mask_bits, #0xFE

  and right_width, width_rounded, #0x7
  mvn left_mask_bits, left_mask_bits, lsl left_offset

  add texture_mask, texture_mask_height, lsl #11
  mov block_width, width_rounded, lsr #3

  mov right_mask_bits, right_mask_bits, lsl right_width
  // Pitch from the last block of one scanline to the first of the next:
  // one row (2048 bytes) minus the (block_width - 1) * 16 bytes advanced.
  movw fb_ptr_pitch, #(2048 + 16)

  sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
  // block_masks: low word = left mask, high word = right mask.
  vmov block_masks, left_mask_bits, right_mask_bits

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add block, psx_gpu, #psx_gpu_blocks_offset

  bic texture_offset_base, texture_offset_base, #0xF
  cmp block_width, #1

  ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
  // Append after the blocks already queued (64 bytes each).
  add block, block, num_blocks, lsl #6

  // The flags from the cmp above are still intact.
  bne 0f

  // ---- Single block per scanline: left and right masks are combined ----
  vext.32 block_masks_shifted, block_masks, block_masks, #1
  vorr.u32 block_masks, block_masks, block_masks_shifted
  vdup.u8 draw_mask_fb_ptr, block_masks[0]

1:
  add num_blocks, num_blocks, #1
  cmp num_blocks, #MAX_BLOCKS
  blgt setup_sprites_16bpp_flush

  and texture_block_ptr, texture_offset_base, texture_mask
  subs height, height, #1

  add texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32 { texels }, [ texture_block_ptr, :128 ]

  vst1.u32 { texels }, [ block, :128 ]
  add block, block, #40

  vmov.u32 draw_mask_fb_ptr[1], fb_ptr
  pld [ fb_ptr ]

  vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]

  // 40 + 24 = 64: step to the next block.
  add block, block, #24
  add texture_offset_base, texture_offset_base, #2048
  add fb_ptr, fb_ptr, #2048
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  // Tests the flags from subs height above.
  bne 1b

  ldmia sp!, { r4 - r11, pc }

  // ---- Two or more blocks per scanline ----
0:
  add num_blocks, num_blocks, block_width
  mov texture_offset, texture_offset_base

  cmp num_blocks, #MAX_BLOCKS
  blgt setup_sprites_16bpp_flush

  add texture_offset_base, texture_offset_base, #2048
  and texture_block_ptr, texture_offset, texture_mask

  add texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32 { texels }, [ texture_block_ptr, :128 ]

  vst1.u32 { texels }, [ block, :128 ]
  add block, block, #40

  // Leftmost block uses the left mask.
  vdup.u8 draw_mask_fb_ptr, block_masks[0]
  vmov.u32 draw_mask_fb_ptr[1], fb_ptr
  pld [ fb_ptr ]

  vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
  // Interior block count = block_width - 2 (first and last are separate).
  subs blocks_remaining, block_width, #2

  add texture_offset, texture_offset, #16
  add fb_ptr, fb_ptr, #16

  // Interior blocks carry an all-zero mask.
  vmov.u8 draw_mask_fb_ptr, #0

  add block, block, #24
  beq 2f

1:
  and texture_block_ptr, texture_offset, texture_mask
  subs blocks_remaining, blocks_remaining, #1

  add texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32 { texels }, [ texture_block_ptr, :128 ]

  vst1.u32 { texels }, [ block, :128 ]
  add block, block, #40

  vmov.u32 draw_mask_fb_ptr[1], fb_ptr
  pld [ fb_ptr ]

  vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]

  add texture_offset, texture_offset, #16
  add fb_ptr, fb_ptr, #16

  add block, block, #24
  bne 1b

2:
  // Rightmost block uses the right mask (byte 4 of block_masks).
  and texture_block_ptr, texture_offset, texture_mask
  add texture_block_ptr, texture_page_ptr, texture_block_ptr

  vld1.u32 { texels }, [ texture_block_ptr, :128 ]
  vdup.u8 draw_mask_fb_ptr, block_masks[4]

  vst1.u32 { texels }, [ block, :128 ]
  add block, block, #40

  vmov.u32 draw_mask_fb_ptr[1], fb_ptr
  vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]

  add block, block, #24
  subs height, height, #1

  add fb_ptr, fb_ptr, fb_ptr_pitch
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  bne 0b

  ldmia sp!, { r4 - r11, pc }
5543
5544
// 4x version
// FIXME: duplicate code with normal version :(
//
// setup_sprite_16bpp_4x
//
// Same arguments and overall structure as setup_sprite_16bpp, adapted to
// a framebuffer with doubled width and height:
//   - masks are 16 bits wide, with shift counts doubled
//   - horizontal framebuffer steps are 32 bytes instead of 16
//   - vertical framebuffer steps are 2 * 2048 bytes instead of 2048
//   - every source block accounts for 4 entries in num_blocks
//
// In:    r0 = psx_gpu, r1 = x, r2 = y, r3 = u
//        first three stack arguments = v, width, height
// Saves: r4-r11, lr.
//
// Block storage is delegated to the do_texture_block_16bpp_4x macro,
// defined elsewhere in this file.  This function never advances block
// itself, so the macro presumably stores the blocks and advances the
// pointer -- confirm against the macro.  Registers clobbered by the
// macro are likewise not visible here.
#undef draw_mask_fb_ptr

function(setup_sprite_16bpp_4x)
  stmdb sp!, { r4 - r11, r14 }
  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]

  // Nine words were pushed, so the stack arguments start at sp + 36.
  ldr v, [ sp, #36 ]
  add fb_ptr, fb_ptr, y, lsl #11

  ldr width, [ sp, #40 ]
  add fb_ptr, fb_ptr, x, lsl #1

  ldr height, [ sp, #44 ]
  and left_offset, u, #0x7

  add texture_offset_base, u, u
  add width_rounded, width, #7

  add texture_offset_base, v, lsl #11
  movw left_mask_bits, #0xFFFF

  ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
  add width_rounded, width_rounded, left_offset

  // From here on left_offset is twice the source-pixel offset.
  lsl left_offset, #1

  ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
  sub fb_ptr, fb_ptr, left_offset, lsl #1

  add texture_mask, texture_mask_width, texture_mask_width
  movw right_mask_bits, #0xFFFC

  and right_width, width_rounded, #0x7
  mvn left_mask_bits, left_mask_bits, lsl left_offset

  lsl right_width, #1

  add texture_mask, texture_mask_height, lsl #11
  mov block_width, width_rounded, lsr #3

  mov right_mask_bits, right_mask_bits, lsl right_width
  movw fb_ptr_pitch, #(2048 + 16) * 2

  sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
  // block_masks: low word = left mask, high word = right mask.
  vmov block_masks, left_mask_bits, right_mask_bits

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add block, psx_gpu, #psx_gpu_blocks_offset

  bic texture_offset_base, texture_offset_base, #0xF
  cmp block_width, #1

  ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
  add block, block, num_blocks, lsl #6

  // block_width now counts output blocks (4 per source block).  This lsl
  // does not set flags, so the bne still tests the cmp above.
  lsl block_width, #2
  bne 0f

  // ---- Single source block per scanline: combine left and right masks ----
  vext.32 block_masks_shifted, block_masks, block_masks, #1
  vorr.u32 block_masks, block_masks, block_masks_shifted
  vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
  vdup.u8 draw_mask_fb_ptr_b, block_masks[1]

1:
  add num_blocks, num_blocks, block_width
  cmp num_blocks, #MAX_BLOCKS
  blgt setup_sprites_16bpp_flush

  and texture_block_ptr, texture_offset_base, texture_mask
  subs height, height, #1

  add texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32 { texels }, [ texture_block_ptr, :128 ]

  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)

  add texture_offset_base, texture_offset_base, #2048
  add fb_ptr, fb_ptr, #2048*2
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  // NOTE(review): this relies on the macro leaving the flags from
  // subs height intact -- confirm.
  bne 1b

  ldmia sp!, { r4 - r11, pc }

  // ---- Two or more source blocks per scanline ----
0:
  add num_blocks, num_blocks, block_width
  mov texture_offset, texture_offset_base

  vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
  vdup.u8 draw_mask_fb_ptr_b, block_masks[1]

  cmp num_blocks, #MAX_BLOCKS
  blgt setup_sprites_16bpp_flush

  add texture_offset_base, texture_offset_base, #2048
  and texture_block_ptr, texture_offset, texture_mask

  add texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32 { texels }, [ texture_block_ptr, :128 ]

  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)

  // Interior count in output-block units: block_width - 2 * 4.
  subs blocks_remaining, block_width, #2*4
  add texture_offset, texture_offset, #16

  // Interior blocks carry all-zero masks.
  vmov.u8 draw_mask_fb_ptr_a, #0
  vmov.u8 draw_mask_fb_ptr_b, #0

  add fb_ptr, fb_ptr, #16*2
  beq 2f

1:
  and texture_block_ptr, texture_offset, texture_mask
  subs blocks_remaining, blocks_remaining, #4

  add texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32 { texels }, [ texture_block_ptr, :128 ]

  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
  add texture_offset, texture_offset, #16

  add fb_ptr, fb_ptr, #16*2
  bgt 1b

2:
  vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
  vdup.u8 draw_mask_fb_ptr_b, block_masks[5]

  and texture_block_ptr, texture_offset, texture_mask
  add texture_block_ptr, texture_page_ptr, texture_block_ptr

  vld1.u32 { texels }, [ texture_block_ptr, :128 ]

  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
  subs height, height, #1

  add fb_ptr, fb_ptr, fb_ptr_pitch
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  bne 0b

  ldmia sp!, { r4 - r11, pc }
5688
5689
// Register aliases for update_texture_4bpp_cache.
#undef texture_page_ptr
#undef vram_ptr
#undef dirty_textures_mask
#undef current_texture_mask

#define psx_gpu r0
#define current_texture_page r1
#define texture_page_ptr r2
#define vram_ptr_a r3
#define current_texture_page_x r12
#define current_texture_page_y r4
#define dirty_textures_mask r5
#define tile_y r6
#define tile_x r7
#define sub_y r8
#define current_texture_mask r9
#define c_4096 r10
#define vram_ptr_b r11

// texel_block_a/b are the two halves of q0.  The combined results reuse
// the register of one of their inputs (ab = b = q2, cd = c = q3).
// Note that q4 is d8-d9, which AAPCS treats as callee-saved.
#define texel_block_a d0
#define texel_block_b d1
#define texel_block_expanded_a q1
#define texel_block_expanded_b q2
#define texel_block_expanded_ab q2
#define texel_block_expanded_c q3
#define texel_block_expanded_d q4
#define texel_block_expanded_cd q3
5717
// update_texture_4bpp_cache
//
// Expands the current 4bpp texture page from VRAM into the texture cache
// at one byte per texel, stored tile by tile.
//
// In:    r0 = psx_gpu
// Saves: r4-r11, lr, q0-q4.  Clobbers r1-r3, r12, flags.
//
// The page is located from current_texture_page:
//   vram_ptr_a = vram_ptr + (page >> 4) * 2^19 + (page & 0xF) * 128
// The bits of current_texture_mask are cleared from
// dirty_textures_4bpp_mask before the copy.
//
// Loop structure (all three levels share label 0):
//   sub_y  (8):  two VRAM rows per pass, 8 bytes each -> 32 bytes out
//   tile_x (16): step 8 bytes right and back up 16 rows
//   tile_y (16): step 16 rows down and back left 128 bytes
//
// q4 is d8-d9, which AAPCS requires callees to preserve, so the save and
// restore cover q0-q4 rather than only q0-q3.
function(update_texture_4bpp_cache)
  stmdb sp!, { r4 - r11, r14 }
  vpush { q0 - q4 }

  ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]

  ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
  ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]

  and current_texture_page_x, current_texture_page, #0xF
  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]

  mov current_texture_page_y, current_texture_page, lsr #4
  ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]

  add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
  mov tile_y, #16

  add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7
  bic dirty_textures_mask, current_texture_mask

  mov tile_x, #16
  str dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]

  mov sub_y, #8
  // Post-increment of two VRAM rows (2 * 2048 bytes).
  movw c_4096, #4096

  // vram_ptr_b tracks the row directly below vram_ptr_a.
  add vram_ptr_b, vram_ptr_a, #2048

0:
  vld1.u32 { texel_block_a }, [ vram_ptr_a, :64 ], c_4096
  vld1.u32 { texel_block_b }, [ vram_ptr_b, :64 ], c_4096

  // Widen each source byte to 16 bits twice: once as-is and once shifted
  // left by 4, which places the high nibble in bits 8-11.
  vmovl.u8 texel_block_expanded_a, texel_block_a
  vshll.u8 texel_block_expanded_b, texel_block_a, #4
  vmovl.u8 texel_block_expanded_c, texel_block_b
  vshll.u8 texel_block_expanded_d, texel_block_b, #4

  // Clearing bits 4-7 leaves the low nibble in the low byte of a/c and
  // the high nibble in the high byte of b/d.
  vbic.u16 texel_block_expanded_a, #0x00F0
  vbic.u16 texel_block_expanded_b, #0x00F0
  vbic.u16 texel_block_expanded_c, #0x00F0
  vbic.u16 texel_block_expanded_d, #0x00F0

  // Merge: each halfword becomes (low nibble) | (high nibble << 8),
  // i.e. two one-byte texels in source order.
  vorr.u16 texel_block_expanded_ab, texel_block_expanded_a, \
   texel_block_expanded_b
  vorr.u16 texel_block_expanded_cd, texel_block_expanded_c, \
   texel_block_expanded_d

  vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd }, \
   [ texture_page_ptr, :256 ]!

  subs sub_y, sub_y, #1
  bne 0b

  // Next tile to the right: 8 source bytes over, 16 rows back up.
  mov sub_y, #8
  add vram_ptr_a, vram_ptr_a, #8
  add vram_ptr_b, vram_ptr_b, #8

  sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
  sub vram_ptr_b, vram_ptr_b, #(16 * 2048)

  subs tile_x, tile_x, #1
  bne 0b

  // Next row of tiles: 16 rows down, back to the left edge of the page.
  mov tile_x, #16
  add vram_ptr_a, vram_ptr_a, #(16 * 2048)
  add vram_ptr_b, vram_ptr_b, #(16 * 2048)

  sub vram_ptr_a, vram_ptr_a, #(8 * 16)
  sub vram_ptr_b, vram_ptr_b, #(8 * 16)

  subs tile_y, tile_y, #1
  bne 0b

  vpop { q0 - q4 }
  ldmia sp!, { r4 - r11, pc }
5794
5795
// Register aliases for update_texture_8bpp_cache_slice.
// current_texture_page moves from r1 to r5 here because r1 carries the
// texture_page argument.
#undef current_texture_page

#define psx_gpu r0
#define texture_page r1
#define texture_page_ptr r2
#define vram_ptr_a r3
#define texture_page_x r12
#define texture_page_y r4
#define current_texture_page r5
#define tile_y r6
#define tile_x r7
#define sub_y r8
#define c_4096 r10
#define vram_ptr_b r11


#undef texels_a
#undef texels_b

#define texels_a q0
#define texels_b q1
#define texels_c q2
#define texels_d q3
5819
5820
// update_texture_8bpp_cache_slice
//
// Copies one texture page worth of VRAM into the 8bpp texture cache,
// stored tile by tile.
//
// In:    r0 = psx_gpu, r1 = texture_page
// Saves: r4-r11, lr, q0-q3.  Clobbers r1-r3, r12, flags.
//
// Source: vram_ptr + (texture_page >> 4) * 2^19 + (texture_page & 0xF) * 128
// Destination: texture_page_base, advanced by 8 * 16 * 16 bytes first if
// bit 0 of (current_texture_page ^ texture_page) is set.
//
// Loop structure (all three levels share label 0):
//   sub_y  (4):  four VRAM rows per pass, 16 bytes each -> 64 bytes out
//   tile_x (8):  step 16 bytes right and back up 16 rows
//   tile_y (16): step 16 rows down and back left 128 bytes; the
//                destination also skips a further 8 * 16 * 16 bytes,
//                leaving a gap after each row of tiles
function(update_texture_8bpp_cache_slice)
  stmdb sp!, { r4 - r11, r14 }
  vpush { q0 - q3 }

  ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
  ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]

  ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
  mov tile_y, #16

  and texture_page_x, texture_page, #0xF
  mov texture_page_y, texture_page, lsr #4

  add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7
  mov tile_x, #8

  add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
  eor current_texture_page, current_texture_page, texture_page

  // The flags set here are consumed by the addne two instructions later;
  // the mov in between leaves them untouched.
  ands current_texture_page, current_texture_page, #0x1
  mov sub_y, #4

  addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
  // Post-increment of two VRAM rows (2 * 2048 bytes).
  movw c_4096, #4096

  // vram_ptr_b tracks the row directly below vram_ptr_a.
  add vram_ptr_b, vram_ptr_a, #2048

0:
  // Rows n, n+1, n+2, n+3 land in texels_a, _b, _c, _d respectively.
  vld1.u32 { texels_a }, [ vram_ptr_a, :128 ], c_4096
  vld1.u32 { texels_b }, [ vram_ptr_b, :128 ], c_4096
  vld1.u32 { texels_c }, [ vram_ptr_a, :128 ], c_4096
  vld1.u32 { texels_d }, [ vram_ptr_b, :128 ], c_4096

  vst1.u32 { texels_a, texels_b }, [ texture_page_ptr, :256 ]!
  vst1.u32 { texels_c, texels_d }, [ texture_page_ptr, :256 ]!

  subs sub_y, sub_y, #1
  bne 0b

  // Next tile to the right: 16 bytes over, 16 rows back up.
  mov sub_y, #4

  add vram_ptr_a, vram_ptr_a, #16
  add vram_ptr_b, vram_ptr_b, #16

  sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
  sub vram_ptr_b, vram_ptr_b, #(16 * 2048)

  subs tile_x, tile_x, #1
  bne 0b

  // Next row of tiles: 16 rows down, back to the left edge of the page.
  mov tile_x, #8

  add vram_ptr_a, vram_ptr_a, #(16 * 2048)
  add vram_ptr_b, vram_ptr_b, #(16 * 2048)

  sub vram_ptr_a, vram_ptr_a, #(8 * 16)
  sub vram_ptr_b, vram_ptr_b, #(8 * 16)

  subs tile_y, tile_y, #1
  // Skip the gap after this row of tiles (add does not alter the flags).
  add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)

  bne 0b

  vpop { q0 - q3 }
  ldmia sp!, { r4 - r11, pc }
5886
50f9355a 5887
/* void scale2x_tiles8(void *dst, const void *src, int w8, int h) */
//
// Doubles an image of 16-bit elements in both directions.
//
// In:    r0 = dst, r1 = src, r2 = w8 (tiles of 8 elements per row),
//        r3 = h (source rows)
// Saves: r4, lr.  Clobbers r0, r1, r3, r12, q0-q3, flags.
//
// Each source element is duplicated horizontally (vzip of a register
// with its own copy), and the widened row is stored to two consecutive
// destination rows.  The source row pitch is 1024*2 bytes; each source
// row advances dst by 1024*2*2 bytes.  Both pointers must be 16-byte
// aligned (:128 hints).
//
// Tiles are handled in pairs.  The second tile of a pair is always
// loaded, so for odd w8 the last pass reads 16 bytes beyond the final
// tile of the row without storing them.  At least one tile and one row
// are always processed.
function(scale2x_tiles8)
  push { r4, r14 }

  mov r4, r1                  // r4 = start of the current source row
  add r12, r0, #1024*2        // r12 = second destination row
  mov r14, r2                 // r14 = tiles left in this row

0:
  vld1.u16 { q0 }, [ r1, :128 ]!
  vld1.u16 { q2 }, [ r1, :128 ]!
  vmov q1, q0
  vmov q3, q2
  vzip.16 q0, q1
  vzip.16 q2, q3
  subs r14, #2
  vst1.u16 { q0, q1 }, [ r0, :128 ]!
  vst1.u16 { q0, q1 }, [ r12, :128 ]!
  // Negative count: w8 was odd and the second tile does not exist.
  blt 1f
  vst1.u16 { q2, q3 }, [ r0, :128 ]!
  vst1.u16 { q2, q3 }, [ r12, :128 ]!
  bgt 0b
1:
  subs r3, #1
  mov r14, r2
  // Move dst two rows down, then back by the w8 * 32 bytes just written.
  add r0, #1024*2*2
  add r4, #1024*2
  sub r0, r2, lsl #4+1
  mov r1, r4
  add r12, r0, #1024*2
  // Tests the flags from subs r3 above.
  bgt 0b
  nop

  pop { r4, pc }

// vim:filetype=armasm