cdrom: change pause timing again
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
CommitLineData
75e28f62
E
1/*
2 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
59d15d23 3 * Copyright (C) 2012 GraÅžvydas Ignotas "notaz" <notasas@gmail.com>
75e28f62
E
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of
8 * the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */
15
d5c08ed3 16#define RENDER_INTERLACE_ENABLED 0x1
f0931e56 17
2d658c89 18#include "psx_gpu.h"
cb88320b 19#include "psx_gpu_offsets.h"
75e28f62 20
cb88320b 21#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
75e28f62 22
75e28f62
E
/*
 * Byte offsets of the fields of one span edge record (2 bytes apart).
 * NOTE(review): this order matches the four 16-bit values interleaved by the
 * vst4.u16 store in setup_spans_set_x4_alternate_*; confirm against the
 * C-side structure declaration.
 */
23#define edge_data_left_x_offset 0
24#define edge_data_num_blocks_offset 2
25#define edge_data_right_mask_offset 4
26#define edge_data_y_offset 6
27
ed0fd81d 28.syntax unified
29.text
75e28f62 30
/*
 * save_abi_regs() / restore_abi_regs(): optional vpush/vpop of q4-q7 placed
 * at function entry/exit. The "#if 0" selects the empty variants, so the
 * functions in this file do not preserve q4-q7 themselves.
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS; presumably
 * the callers compensate for this - confirm before calling these routines
 * from code that keeps live values in d8-d15.
 */
3f0189c6 31#if 0
32#define save_abi_regs() \
33 vpush {q4-q7}
34#define restore_abi_regs() \
35 vpop {q4-q7}
36#else
37#define save_abi_regs()
38#define restore_abi_regs()
39#endif
40
75e28f62
E
/*
 * Register aliases used by compute_all_gradients.
 *
 * Many names intentionally share one physical register (for example x1 and
 * x0_x1 are both r5, y0 and y0_y1 are both r7, b0 and y2 are both r9, and
 * ga_uvrg_x / g_uvrg_x / gw_uv_x / uvrg_dx1 are all q1). A name is therefore
 * only meaningful until another alias of the same register is written.
 *
 * The dN names listed under a qN name are the low/high halves of that
 * quadword register (e.g. uvrg0 = d6 and xxxx0 = d7 are the halves of q3).
 */
41#define psx_gpu r0
42#define v_a r1
43#define v_b r2
44#define v_c r3
45
46#define x0 r4
47#define x1 r5
48#define x2 r6
49#define x0_x1 r5
50#define x1_x2 r6
51#define y0 r7
52#define y1 r8
53#define y2 r9
54#define y0_y1 r7
55#define y1_y2 r8
56#define b0 r9
57#define b1 r10
58#define b2 r11
59#define b0_b1 r10
60#define b1_b2 r11
61
62
63#define area_r_s r5
64
65#define g_bx0 r2
66#define g_bx r3
67#define g_bx2 r4
68#define g_bx3 r5
69#define b_base r6
70#define g_by r8
71
72#define gs_bx r7
73#define gs_by r10
74
75#define ga_bx g_bx
76#define ga_by g_by
77
78#define gw_bx_h g_bx
79#define gw_by_h g_by
80
81#define gw_bx_l r11
82#define gw_by_l gw_bx_l
83
84#define store_a r0
85#define store_b r1
86#define store_inc r5
87
88
89#define v0 q0
90#define uvrgb0 d0
91#define x0_y0 d1
92
93#define v1 q1
94#define uvrgb1 d2
95#define x1_y1 d3
96
97#define v2 q2
98#define uvrgb2 d4
99#define x2_y2 d5
100
101#define x0_ab q3
102#define uvrg_xxxx0 q3
103#define uvrg0 d6
104#define xxxx0 d7
105
106#define x1_ab q4
107#define uvrg_xxxx1 q4
108#define uvrg1 d8
109#define xxxx1 d9
110
111#define x2_ab q5
112#define uvrg_xxxx2 q5
113#define uvrg2 d10
114#define xxxx2 d11
115
116#define y0_ab q6
117#define yyyy_uvrg0 q6
118#define yyyy0 d12
119#define uvrg0b d13
120
121#define y1_ab q7
122#define yyyy_uvrg1 q7
123#define yyyy1 d14
124#define uvrg1b d15
125
126#define y2_ab q8
127#define yyyy_uvrg2 q8
128#define yyyy2 d16
129#define uvrg2b d17
130
131#define d0_ab q9
132#define d0_a d18
133#define d0_b d19
134
135#define d1_ab q10
136#define d1_a d20
137#define d1_b d21
138
139#define d2_ab q11
140#define d2_a d22
141#define d2_b d23
142
143#define d3_ab q12
144#define d3_a d24
145#define d3_b d25
146
147#define ga_uvrg_x q1
148#define ga_uvrg_y q4
149
150#define dx x0_x1
151#define dy y0_y1
152#define db b0_b1
153
154#define uvrg_base q11
155
156#define gs_uvrg_x q5
157#define gs_uvrg_y q6
158
159#define g_uvrg_x q1
160#define ga_uv_x d2
161#define g_uv_x d2
162#define ga_rg_x d3
163#define g_rg_x d3
164
165#define g_uvrg_y q4
166#define ga_uv_y d8
167#define g_uv_y d8
168#define ga_rg_y d9
169#define g_rg_y d9
170
171#define gw_uv_x q1
172#define gw_rg_x q2
173#define gw_uv_y q4
174#define gw_rg_y q3
175
176#define w_mask q9
177#define w_mask_l d18
178
179#define r_shift q10
180
181#define uvrg_dx0 q0
182#define uvrg_dx0l d0
183#define uvrg_dx0h d1
184
185#define uvrg_dx1 q1
186#define uvrg_dx1l d2
187#define uvrg_dx1h d3
188
189#define uvrg_dx2 q2
190#define uvrg_dx2l d4
191#define uvrg_dx2h d5
192
193#define uvrg_dx3 q3
194#define uvrg_dx3l d6
195#define uvrg_dx3h d7
196
c6063f89 197#define uvrgb_phase q13
75e28f62
E
198
199.align 4
200
0e4ad319 201#include "arm_features.h"
8184d7c5 202
/*
 * function(name): opens a function by expanding FUNCTION(name) followed by
 * a colon (FUNCTION is not defined in this file; presumably it comes from
 * arm_features.h - confirm).
 *
 * Jump-table helpers, selected by TEXRELS_FORBIDDEN:
 *  - not defined: JT_OP_REL expands to nothing, JT_OP(x) emits x unchanged
 *    and JTE(start, target) is the target itself (absolute table entries).
 *  - defined: JT_OP_REL takes the address of table_label, loads the word at
 *    index index_reg (scaled by 4) and adds it to pc; JT_OP(x) expands to
 *    nothing and JTE(start, target) is the difference (target - start), so
 *    the table holds relative entries instead of absolute addresses.
 *
 * On __MACH__ targets the two external symbols referenced by this file are
 * renamed with a leading underscore.
 */
0e4ad319 203#define function(name) FUNCTION(name):
204
205#ifndef TEXRELS_FORBIDDEN
75e28f62 206
8184d7c5 207#define JT_OP_REL(table_label, index_reg, temp)
208#define JT_OP(x...) x
209#define JTE(start, target) target
210
211#else
212
8184d7c5 213#define JT_OP_REL(table_label, index_reg, temp) \
214 adr temp, table_label; \
e1f6de8f 215 ldr temp, [temp, index_reg, lsl #2]; \
8184d7c5 216 add pc, pc, temp \
217
218#define JT_OP(x...)
219#define JTE(start, target) (target - start)
220
0e4ad319 221#endif
4d646738 222
0e4ad319 223#ifdef __MACH__
8184d7c5 224#define flush_render_block_buffer _flush_render_block_buffer
8184d7c5 225#define update_texture_8bpp_cache _update_texture_8bpp_cache
8184d7c5 226#endif
227
75e28f62
E
@ compute_all_gradients(psx_gpu, v_a, v_b, v_c)
@
@ Computes the x and y gradients of the u, v, r, g interpolants (on NEON) and
@ of b (on the ARM core) for the triangle v_a / v_b / v_c. Each gradient is
@ the 2x2 determinant described below, scaled by a reciprocal derived from
@ psx_gpu->triangle_area with a double-precision divide, with the sign
@ applied from the determinant sign and psx_gpu->triangle_winding.
@
@ Results are written into psx_gpu starting at psx_gpu_uvrg_offset: the NEON
@ stores cover the first 112 bytes and the final stmia writes six words of
@ b data directly after them.
@
@ Saves/restores r4-r11. Overwrites q4-q7 unless save_abi_regs() is enabled.
228@ r0: psx_gpu
229@ r1: v_a
230@ r2: v_b
231@ r3: v_c
232
233function(compute_all_gradients)
234 // First compute the triangle area reciprocal and shift. The division will
235 // happen concurrently with much of the work which follows.
236 @ r12 = psx_gpu->triangle_area
e1f6de8f 237 ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
75e28f62 238 stmdb sp!, { r4 - r11, lr }
3f0189c6 239 save_abi_regs()
75e28f62
E
240
241 @ load exponent of 62 into upper half of double
242 movw r4, #0
243 clz r14, r12 @ r14 = shift
244
245 movt r4, #((62 + 1023) << 4)
246 mov r12, r12, lsl r14 @ r12 = triangle_area_normalized
247
248 @ load area normalized into lower half of double
249 mov r5, r12, lsr #10
250 vmov.f64 d30, r5, r4 @ d30 = (1 << 62) + ta_n
251
252 movt r4, #((1022 + 31) << 4)
253 mov r5, r12, lsl #20
254
255 add r4, r4, r12, lsr #11
256 vmov.f64 d31, r5, r4
257
258 vdiv.f64 d30, d30, d31 @ d30 = ((1 << 62) + ta_n) / ta_n
259
260 // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
261 // ( d0 * d1 ) - ( d2 * d3 ) =
262 // ( m0 ) - ( m1 ) = gradient
263
264 // This is split to do 12 elements at a time over three sets: a, b, and c.
265 // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
266 // two of the slots are unused.
267
268 // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
269 // is g.
270
271 // First type is: uvrg bxxx xxxx
272 // Second type is: yyyy ybyy uvrg
273 // Since x_a and y_c are the same the same variable is used for both.
274
e1f6de8f 275 vld1.u32 { v0 }, [v_a, :128] @ v0 = { uvrg0, b0, x0, y0 }
276 ldrsh x0, [v_a, #8] @ load x0
75e28f62 277
e1f6de8f 278 vld1.u32 { v1 }, [v_b, :128] @ v1 = { uvrg1, b1, x1, y1}
279 ldrh x1, [v_b, #8] @ load x1
75e28f62 280
e1f6de8f 281 vld1.u32 { v2 }, [v_c, :128] @ v2 = { uvrg2, b2, x2, y2 }
282 ldrh x2, [v_c, #8] @ load x2
75e28f62
E
283
284 vmovl.u8 uvrg_xxxx0, uvrgb0 @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
e1f6de8f 285 ldrh y0, [v_a, #10] @ load y0
75e28f62
E
286
287 vmovl.u8 uvrg_xxxx1, uvrgb1 @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
e1f6de8f 288 ldrh y1, [v_b, #10] @ load y1
75e28f62
E
289
290 vmovl.u8 uvrg_xxxx2, uvrgb2 @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
e1f6de8f 291 ldrh y2, [v_c, #10] @ load y2
75e28f62
E
292
293 vmov.u8 uvrg0b, uvrg0 @ uvrg0b = { uv0, rg0 }
294 vdup.u16 xxxx0, x0_y0[0] @ xxxx0 = { xx0, xx0 }
295
296 orr x1_x2, x1, x2, lsl #16 @ x1_x2 = { x1, x2 }
297 pkhbt x0_x1, x0, x1, lsl #16 @ x0_x1 = { x0, x1 }
298
299 vmov.u8 uvrg1b, uvrg1 @ uvrg1b = { uv1, rg1 }
300 vdup.u16 xxxx1, x1_y1[0] @ xxxx1 = { xx1, xx1 }
301
302 vmov.u8 uvrg2b, uvrg2 @ uvrg2b = { uv2, rg2 }
303 vdup.u16 xxxx2, x2_y2[0] @ xxxx2 = { xx2, xx2 }
304
e1f6de8f 305 ldrb b2, [v_c, #4] @ load b2
75e28f62
E
306 orr y0_y1, y0, y1, lsl #16 @ y0_y1 = { y0, y1 }
307
e1f6de8f 308 ldrb b1, [v_b, #4] @ load b1
75e28f62
E
309 orr y1_y2, y1, y2, lsl #16 @ y1_y2 = { y1, y2 }
310
311 vdup.u16 yyyy0, x0_y0[1] @ yyyy0 = { yy0, yy0 }
312 vsub.s16 d0_ab, x1_ab, x0_ab
313
e1f6de8f 314 ldrb b0, [v_a, #4] @ load b0
75e28f62
E
315 orr b1_b2, b1, b2, lsl #16 @ b1_b2 = { b1, b2 }
316
317 vdup.u16 yyyy1, x1_y1[1] @ yyyy1 = { yy1, yy1 }
318 vsub.s16 d2_ab, x2_ab, x1_ab
319
320 vdup.u16 yyyy2, x2_y2[1] @ yyyy2 = { yy2, yy2 }
321 vsub.s16 d1_ab, y2_ab, y1_ab
322
323 orr b0_b1, b0, b1, lsl #16 @ b0_b1 = { b0, b1 }
324 ssub16 dx, x1_x2, x0_x1 @ dx = { x1 - x0, x2 - x1 }
325
326 ssub16 dy, y1_y2, y0_y1 @ dy = { y1 - y0, y2 - y1 }
327 ssub16 db, b1_b2, b0_b1 @ db = { b1 - b0, b2 - b1 }
328
329 vsub.s16 d3_ab, y1_ab, y0_ab
330 smusdx ga_by, dx, db @ ga_by = ((x1 - x0) * (b2 - b1)) -
331 @ ((x2 - x1) * (b1 - b0))
332 vmull.s16 ga_uvrg_x, d0_a, d1_a
333 smusdx ga_bx, db, dy @ ga_bx = ((b1 - b0) * (y2 - y1)) -
334 @ ((b2 - b1) * (y1 - y0))
335 vmlsl.s16 ga_uvrg_x, d2_a, d3_a
336 movs gs_bx, ga_bx, asr #31
337
338 vmull.s16 ga_uvrg_y, d0_b, d1_b
339 rsbmi ga_bx, ga_bx, #0
340
c6063f89 341 @ r12 = psx_gpu->uvrgb_phase
e1f6de8f 342 ldr r12, [psx_gpu, #psx_gpu_uvrgb_phase_offset]
c6063f89 343
75e28f62
E
344 vmlsl.s16 ga_uvrg_y, d2_b, d3_b
345 movs gs_by, ga_by, asr #31
346
347 vshr.u64 d0, d30, #22
c6063f89 348 add b_base, r12, b0, lsl #16
349
350 vdup.u32 uvrgb_phase, r12
75e28f62
E
351
352 rsbmi ga_by, ga_by, #0
353 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0
354
355 @ r12 = psx_gpu->triangle_winding_offset
e1f6de8f 356 ldrb r12, [psx_gpu, #psx_gpu_triangle_winding_offset]
75e28f62
E
357 vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0
358
75e28f62
E
359 rsb r12, r12, #0 @ r12 = -(triangle->winding)
360
361 vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w }
362 sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS)
363
364 vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16
aafce833 365 vdup.u32 r_shift, r14 @ r_shift = { shift, shift*, shift, shift* }
366 @ * - vshl.u64: ignored by hw
c6063f89 367 vadd.u32 uvrg_base, uvrgb_phase
75e28f62
E
368 vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x)
369
370 vmov area_r_s, s0 @ area_r_s = triangle_reciprocal
371 vabs.s32 ga_uvrg_y, ga_uvrg_y @ ga_uvrg_y = abs(ga_uvrg_y)
372
373 vmull.u32 gw_rg_x, ga_rg_x, d0[0]
374 vmull.u32 gw_uv_x, ga_uv_x, d0[0]
375 vmull.u32 gw_rg_y, ga_rg_y, d0[0]
376 vmull.u32 gw_uv_y, ga_uv_y, d0[0]
377
378 vshl.u64 gw_rg_x, gw_rg_x, r_shift
379 vshl.u64 gw_uv_x, gw_uv_x, r_shift
380 vshl.u64 gw_rg_y, gw_rg_y, r_shift
381 vshl.u64 gw_uv_y, gw_uv_y, r_shift
382
383 veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
384 vmovn.u64 g_uv_x, gw_uv_x
385
386 veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
387 vmovn.u64 g_rg_x, gw_rg_x
388
389 veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
390 vmovn.u64 g_uv_y, gw_uv_y
391
392 vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
393 vmovn.u64 g_rg_y, gw_rg_y
394
395 veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
396 mov ga_bx, ga_bx, lsl #13
397
398 vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
399 mov ga_by, ga_by, lsl #13
400
401 vdup.u32 x0_y0, x0
402 umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
403
404 vshl.u32 g_uvrg_x, g_uvrg_x, #4
405 vshl.u32 g_uvrg_y, g_uvrg_y, #4
406
407 umull gw_by_l, gw_by_h, ga_by, area_r_s
408 vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
409
410 eor gs_bx, gs_bx, r12
411 vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
412
413 veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
414 eor gs_by, gs_by, r12
415
416 rsb r11, r14, #0 @ r11 = negative shift for scalar lsr
417 add store_a, psx_gpu, #psx_gpu_uvrg_offset
418
419 sub r11, r11, #(32 - 13)
420
421 add store_b, store_a, #16
422 mov store_inc, #32
423
424 vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
e1f6de8f 425 vst1.u32 { uvrg_base }, [store_a, :128], store_inc
75e28f62 426
e1f6de8f 427 vst1.u32 { uvrg_dx1 }, [store_b, :128], store_inc
75e28f62
E
428 mov g_bx, gw_bx_h, lsr r11
429
e1f6de8f 430 vst1.u32 { g_uvrg_y }, [store_a, :128], store_inc
75e28f62
E
431 mov g_by, gw_by_h, lsr r11
432
433 vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l }, \
e1f6de8f 434 [store_b, :128], store_inc
75e28f62
E
435 eor g_bx, g_bx, gs_bx
436
437 vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h }, \
e1f6de8f 438 [store_b, :128], store_inc
75e28f62
E
439 sub g_bx, g_bx, gs_bx
440
441 lsl g_bx, g_bx, #4
442 eor g_by, g_by, gs_by
443
444 mls b_base, g_bx, x0, b_base
445 sub g_by, g_by, gs_by
446
447 lsl g_by, g_by, #4
448 mov g_bx0, #0
449
450 add g_bx2, g_bx, g_bx
451 add g_bx3, g_bx, g_bx2
452
 @ stores { 0, g_bx, 2 * g_bx, 3 * g_bx, b_base, g_by }
453 stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
454
3f0189c6 455 restore_abi_regs()
75e28f62
E
456 ldmia sp!, { r4 - r11, pc }
457
458
/*
 * Register aliases used by the setup_spans_* macros and functions.
 *
 * As with the gradient code, names overlap on purpose: y_a / y_b / y_c reuse
 * r1-r3 (the vertex pointers v_a / v_b / v_c), temp and clip are both r14,
 * height and height_major are both r9, left_x and v_clip are both q3, and
 * right_x_32_low and tmp_max_blocks are both d20. Check the live range of a
 * name before introducing a new use of its register.
 */
459#define psx_gpu r0
460#define v_a r1
461#define v_b r2
462#define v_c r3
463
464#define temp r14
465
466#define x_a r4
467#define x_b r5
468#define x_c r6
469#define y_a r1
470#define y_b r2
471#define y_c r3
472
473#define height_minor_a r7
474#define height_minor_b r8
475#define height_major r9
476#define height r9
477
478#define reciprocal_table_ptr r10
479
480#define edge_alt_low r4
481#define edge_alt_high r5
482#define edge_dx_dy_alt r6
483#define edge_shift_alt r10
484
485#define edge_dx_dy_alt_low r4
486#define edge_dx_dy_alt_high r5
487
488#define span_edge_data r4
489#define span_uvrg_offset r5
490#define span_b_offset r6
491
492#define clip r14
493
494#define b r11
495#define b_dy r12
496
497
498#define alternate_x q0
499#define alternate_dx_dy q1
500#define alternate_x_32 q2
501
502#define alternate_x_low d0
503#define alternate_x_high d1
504#define alternate_dx_dy_low d2
505#define alternate_dx_dy_high d3
506#define alternate_x_32_low d4
507#define alternate_x_32_high d5
508
509#define left_x q3
510#define right_x q4
511#define left_dx_dy q5
512#define right_dx_dy q6
513#define left_edge q7
514#define right_edge q8
515
516#define left_x_low d6
517#define left_x_high d7
518#define right_x_low d8
519#define right_x_high d9
520#define left_dx_dy_low d10
521#define left_dx_dy_high d11
522#define right_dx_dy_low d12
523#define right_dx_dy_high d13
524#define left_edge_low d14
525#define left_edge_high d15
526#define right_edge_low d16
527#define right_edge_high d17
528
529#define y_mid_point d18
530#define c_0x0004 d19
531
532#define left_right_x_16 q11
533#define span_shifts_y q12
534#define c_0x0001 q13
535
536#define span_shifts d24
537#define y_x4 d25
538#define c_0xFFFE d26
539#define c_0x0007 d27
540
541#define left_right_x_16_low d22
542#define left_right_x_16_high d23
543
544#define uvrg q14
545#define uvrg_dy q15
546
547#define alternate_x_16 d4
548
549#define v_clip q3
550#define v_clip_low d6
551
552#define right_x_32 q10
553#define left_x_32 q11
554#define alternate_select d24
555
556#define right_x_32_low d20
557#define right_x_32_high d21
558#define left_x_32_low d22
559#define left_x_32_high d23
560
2d658c89 561#define tmp_max_blocks d20
562
75e28f62
E
563#define edges_xy q0
564#define edges_dx_dy d2
565#define edge_shifts d3
566#define edge_shifts_64 q2
567
568#define edges_xy_left d0
569#define edges_xy_right d1
570
571#define height_reciprocals d6
572#define heights d7
573
574#define widths d8
575#define c_0x01 d9
576#define x_starts d10
577#define x_ends d11
578
579#define heights_b d12
580#define edges_dx_dy_64 q10
581
582#define edges_dx_dy_64_left d20
583#define edges_dx_dy_64_right d21
584
585
/*
 * setup_spans_prologue(): common entry sequence of every setup_spans_*
 * function.
 *  - pushes r4-r11 and lr (plus q4-q7 if save_abi_regs() is enabled);
 *  - loads the signed 16-bit x (offset 8) and y (offset 10) of each vertex.
 *    y_a / y_b / y_c alias r1-r3, so the vertex pointers are overwritten by
 *    the y loads and must not be used afterwards;
 *  - loads uvrg and uvrg_dy from psx_gpu and the reciprocal table pointer;
 *  - sets every 32-bit lane of c_0x01 to 1.
 */
586#define setup_spans_prologue() \
587 stmdb sp!, { r4 - r11, lr }; \
3f0189c6 588 save_abi_regs(); \
75e28f62 589 \
e1f6de8f 590 ldrsh x_a, [v_a, #8]; \
591 ldrsh x_b, [v_b, #8]; \
592 ldrsh x_c, [v_c, #8]; \
593 ldrsh y_a, [v_a, #10]; \
594 ldrsh y_b, [v_b, #10]; \
595 ldrsh y_c, [v_c, #10]; \
75e28f62
E
596 \
597 add temp, psx_gpu, #psx_gpu_uvrg_offset; \
e1f6de8f 598 vld1.32 { uvrg }, [temp]; \
75e28f62 599 add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
e1f6de8f 600 vld1.32 { uvrg_dy }, [temp]; \
601 ldr reciprocal_table_ptr, [psx_gpu, #psx_gpu_reciprocal_table_ptr_offset]; \
75e28f62
E
602 \
603 vmov.u32 c_0x01, #0x01 \
604
/*
 * setup_spans_load_b(): loads the b interpolant and its per-row step from
 * psx_gpu into b (r11) and b_dy (r12). Those registers double as
 * edge_adjust_low / edge_adjust_high and height_reciprocal_alt /
 * height_b_alt, so this must run after those temporaries are dead.
 */
605#define setup_spans_load_b() \
e1f6de8f 606 ldr b, [psx_gpu, #psx_gpu_b_offset]; \
607 ldr b_dy, [psx_gpu, #psx_gpu_b_dy_offset] \
75e28f62
E
608
/*
 * setup_spans_prologue_b(): second setup stage, run just before the span
 * emission loop.
 *  - points span_uvrg_offset, span_edge_data and span_b_offset at their
 *    output arrays inside psx_gpu (these reuse r4-r6, so the scalar edge
 *    values held there are gone afterwards);
 *  - broadcasts viewport_start_x into left_edge and viewport_end_x + 1 into
 *    right_edge (all 16-bit lanes);
 *  - materialises the 16-bit constants 4, 1, 7 and 0xFFFE (vmvn of 1).
 */
609#define setup_spans_prologue_b() \
610 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
611 add temp, psx_gpu, #psx_gpu_viewport_start_x_offset; \
612 \
613 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
614 vmov.u16 c_0x0004, #0x0004; \
615 \
616 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
617 vmov.u16 c_0x0001, #0x0001; \
618 \
e1f6de8f 619 vld1.u16 { left_edge_low[], left_edge_high[] }, [temp]; \
75e28f62
E
620 add temp, psx_gpu, #psx_gpu_viewport_end_x_offset; \
621 \
e1f6de8f 622 vld1.u16 { right_edge_low[], right_edge_high[] }, [temp]; \
75e28f62
E
623 vadd.u16 right_edge, right_edge, c_0x0001; \
624 \
625 vmov.u16 c_0x0007, #0x0007; \
626 vmvn.u16 c_0xFFFE, #0x0001 \
627
628
/*
 * compute_edge_delta_x2(): edge setup for two edges sharing one height.
 * Expects x_starts / x_ends (two 32-bit lanes each) and height to be set.
 *
 * The reciprocal table word for `height` is split into a reciprocal
 * (word >> 10) and a shift amount kept in the low bits (bits 5-7 of each
 * halfword are cleared).
 *   edges_dx_dy = (x_ends - x_starts) * reciprocal             (32-bit)
 *   edges_xy    = (x_starts * height + height - 1) * reciprocal (64-bit)
 * The shift is applied later by setup_spans_adjust_edges_alternate_*.
 */
629#define compute_edge_delta_x2() \
e1f6de8f 630 ldr temp, [reciprocal_table_ptr, height, lsl #2]; \
75e28f62
E
631 \
632 vdup.u32 heights, height; \
633 vsub.u32 widths, x_ends, x_starts; \
634 \
635 vdup.u32 edge_shifts, temp; \
636 vsub.u32 heights_b, heights, c_0x01; \
7d5140f5 637 vshr.u32 height_reciprocals, edge_shifts, #10; \
75e28f62
E
638 \
639 vmla.s32 heights_b, x_starts, heights; \
640 vbic.u16 edge_shifts, #0xE0; \
641 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
642 vmull.s32 edges_xy, heights_b, height_reciprocals \
643
/*
 * Scalar temporaries for the third ("alternate") edge in
 * compute_edge_delta_x3. width_alt shares r6 with x_c / edge_dx_dy_alt, and
 * r11 / r12 are later reused as b / b_dy.
 */
644#define width_alt r6
645#define height_reciprocal_alt r11
646#define height_b_alt r12
647
/*
 * compute_edge_delta_x3(start_c, height_a, height_b): edge setup for three
 * edges. Two are handled on NEON exactly as in compute_edge_delta_x2, but
 * with per-lane heights { height_a, height_b } and per-lane table lookups.
 * The third is computed in ARM registers from height_minor_b, start_c and
 * x_c:
 *   edge_dx_dy_alt = (x_c - start_c) * reciprocal
 *   edge_alt_high:edge_alt_low =
 *       (start_c * height_minor_b + height_minor_b - 1) * reciprocal
 *   edge_shift_alt = table word & 0x1F
 * The ARM and NEON instructions are interleaved; keep the ordering.
 */
648#define compute_edge_delta_x3(start_c, height_a, height_b) \
ed0fd81d 649 vmov heights, height_a, height_b; \
e1f6de8f 650 ldr temp, [reciprocal_table_ptr, height_a, lsl #2]; \
75e28f62 651 vmov.u32 edge_shifts[0], temp; \
e1f6de8f 652 ldr temp, [reciprocal_table_ptr, height_b, lsl #2]; \
75e28f62 653 vmov.u32 edge_shifts[1], temp; \
e1f6de8f 654 ldr edge_shift_alt, [reciprocal_table_ptr, height_minor_b, lsl #2]; \
75e28f62
E
655 \
656 vsub.u32 widths, x_ends, x_starts; \
657 sub width_alt, x_c, start_c; \
658 \
659 vsub.u32 heights_b, heights, c_0x01; \
660 sub height_b_alt, height_minor_b, #1; \
661 \
7d5140f5
E
662 vshr.u32 height_reciprocals, edge_shifts, #10; \
663 lsr height_reciprocal_alt, edge_shift_alt, #10; \
75e28f62
E
664 \
665 vmla.s32 heights_b, x_starts, heights; \
666 mla height_b_alt, height_minor_b, start_c, height_b_alt; \
667 \
668 vbic.u16 edge_shifts, #0xE0; \
669 and edge_shift_alt, edge_shift_alt, #0x1F; \
670 \
671 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
672 mul edge_dx_dy_alt, width_alt, height_reciprocal_alt; \
673 \
674 vmull.s32 edges_xy, heights_b, height_reciprocals; \
675 smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt \
676
677
/*
 * Direction-specific single-step helpers, selected by token pasting
 * (##direction) in the span loops:
 *  - setup_spans_adjust_y_up/down: move the four packed y values in y_x4 by
 *    4 rows (subtract for up, add for down);
 *  - setup_spans_adjust_interpolants_up/down: step uvrg and b by one row
 *    (uvrg_dy / b_dy);
 *  - setup_spans_clip_interpolants_increment/decrement: step uvrg and b by
 *    `clip` rows at once (multiply-accumulate / multiply-subtract).
 */
678#define setup_spans_adjust_y_up() \
679 vsub.u32 y_x4, y_x4, c_0x0004 \
680
681#define setup_spans_adjust_y_down() \
682 vadd.u32 y_x4, y_x4, c_0x0004 \
683
684#define setup_spans_adjust_interpolants_up() \
685 vsub.u32 uvrg, uvrg, uvrg_dy; \
686 sub b, b, b_dy \
687
688#define setup_spans_adjust_interpolants_down() \
689 vadd.u32 uvrg, uvrg, uvrg_dy; \
690 add b, b, b_dy \
691
692
693#define setup_spans_clip_interpolants_increment() \
694 mla b, b_dy, clip, b; \
695 vmla.s32 uvrg, uvrg_dy, v_clip \
696
697#define setup_spans_clip_interpolants_decrement() \
698 mls b, b_dy, clip, b; \
699 vmls.s32 uvrg, uvrg_dy, v_clip \
700
/*
 * setup_spans_clip(direction, alternate_active): skips `clip` rows that lie
 * outside the viewport. Broadcasts clip into v_clip, advances the alternate
 * edge by clip * edge_dx_dy_alt when alternate_active is `yes` (nothing for
 * `no`), steps the interpolants by clip rows in the given direction
 * (increment / decrement), and advances both NEON edges with
 * edges_xy += edges_dx_dy * clip.
 * v_clip aliases q3, so left_x must not be live yet when this runs.
 */
701#define setup_spans_clip_alternate_yes() \
702 smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip \
703
704#define setup_spans_clip_alternate_no() \
705
706#define setup_spans_clip(direction, alternate_active) \
707 vdup.u32 v_clip, clip; \
708 setup_spans_clip_alternate_##alternate_active(); \
709 setup_spans_clip_interpolants_##direction(); \
710 vmlal.s32 edges_xy, edges_dx_dy, v_clip_low \
711
712
/*
 * setup_spans_adjust_edges_alternate_no(left_index, right_index): converts
 * the two NEON edges into the left/right stepping state used by the span
 * loop.
 *  - widens edge_shifts and edges_dx_dy to 64 bits and applies the per-edge
 *    shift to both edges_xy and the deltas;
 *  - left_index / right_index (left or right) are pasted into
 *    edges_xy_<index> / edges_dx_dy_64_<index> to choose which lane feeds
 *    the left and which the right side;
 *  - each of left_x / right_x ends up holding { x, x + dx_dy } and each
 *    *_dx_dy holds { 2 * dx_dy, 2 * dx_dy }, so one 128-bit add advances
 *    two rows.
 */
713#define setup_spans_adjust_edges_alternate_no(left_index, right_index) \
714 vmovl.s32 edge_shifts_64, edge_shifts; \
715 vmovl.s32 edges_dx_dy_64, edges_dx_dy; \
716 \
717 vshl.s64 edges_xy, edges_xy, edge_shifts_64; \
718 vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64; \
719 \
720 vmov left_x_low, edges_xy_##left_index; \
721 vmov right_x_low, edges_xy_##right_index; \
722 \
723 vmov left_dx_dy_low, edges_dx_dy_64_##left_index; \
724 vmov left_dx_dy_high, edges_dx_dy_64_##left_index; \
725 vmov right_dx_dy_low, edges_dx_dy_64_##right_index; \
726 vmov right_dx_dy_high, edges_dx_dy_64_##right_index; \
727 \
728 vadd.u64 left_x_high, left_x_low, left_dx_dy_low; \
729 vadd.u64 right_x_high, right_x_low, right_dx_dy_low; \
730 \
731 vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy; \
732 vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy \
733
734
/*
 * setup_spans_adjust_edges_alternate_yes(left_index, right_index): same as
 * the `no` variant, then additionally prepares the third (alternate) edge.
 *  - y_mid_point = y_b broadcast to 16-bit lanes (row where the edge swaps);
 *  - the 64-bit edge_alt_high:edge_alt_low value is shifted left by
 *    edge_shift_alt using a manual two-register shift (temp = 32 - shift);
 *  - edge_dx_dy_alt is sign-extended and shifted into a 64-bit delta;
 *  - alternate_x = { x, x + dx_dy } and alternate_dx_dy = { 2*dx_dy,
 *    2*dx_dy }, mirroring the left/right layout.
 * alternate_x / alternate_dx_dy alias q0 / q1 (edges_xy, edges_dx_dy), so
 * the `no` part must have consumed those first - keep the ordering.
 */
735#define setup_spans_adjust_edges_alternate_yes(left_index, right_index) \
736 setup_spans_adjust_edges_alternate_no(left_index, right_index); \
737 \
738 vdup.u16 y_mid_point, y_b; \
739 rsb temp, edge_shift_alt, #32; \
740 \
741 lsl edge_alt_high, edge_alt_high, edge_shift_alt; \
742 orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp; \
743 lsl edge_alt_low, edge_alt_low, edge_shift_alt; \
744 vmov alternate_x_low, edge_alt_low, edge_alt_high; \
745 \
746 asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp; \
747 lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt; \
748 vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high; \
749 vmov alternate_dx_dy_high, alternate_dx_dy_low; \
750 \
751 vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low; \
752 vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy \
753
754
/*
 * Alternate-edge selection helpers for the three-edge span loop.
 *  - setup_spans_y_select_up/down: build a per-row mask in alternate_select
 *    that is set where the row's y is below (up) / above (down)
 *    y_mid_point, using signed 16-bit compares;
 *  - setup_spans_alternate_select_left/right: where the mask is set,
 *    replace the left (low half) or right (high half) x of left_right_x_16
 *    with the alternate edge's x.
 */
755#define setup_spans_y_select_up() \
756 vclt.s16 alternate_select, y_x4, y_mid_point \
757
758#define setup_spans_y_select_down() \
759 vcgt.s16 alternate_select, y_x4, y_mid_point \
760
761
762#define setup_spans_alternate_select_left() \
763 vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select \
764
765#define setup_spans_alternate_select_right() \
766 vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select \
767
768
/*
 * setup_spans_set_x4_alternate_yes(alternate, direction): emits four spans
 * (rows) for a triangle with an active alternate edge.
 *
 *  1. Takes the upper 32 bits of the 64-bit x accumulators of the
 *     alternate, left and right edges for rows 0-1, advances all three,
 *     repeats for rows 2-3 and advances again.
 *  2. Narrows to 16 bits and, on rows past y_mid_point (see
 *     setup_spans_y_select_<direction>), substitutes the alternate x into
 *     the side named by `alternate` (left / right).
 *  3. Clamps both x values to [left_edge, right_edge] (signed 16-bit).
 *  4. width = right - left; span_shifts = 0xFFFE << ((width + 7) & 7);
 *     block count = min((width + 7) >> 3, MAX_BLOCKS_PER_ROW).
 *  5. Stores uvrg and b once per row (stepping them between stores) and
 *     writes four interleaved { left x, block count, span_shifts, y }
 *     records with vst4.u16, then moves y_x4 on by four rows.
 *
 * The NEON work is interleaved with the four uvrg/b stores; the statement
 * order is part of the scheduling and of the aliasing (tmp_max_blocks is
 * d20 = right_x_32_low), so do not reorder.
 */
769#define setup_spans_set_x4_alternate_yes(alternate, direction) \
770 vshrn.s64 alternate_x_32_low, alternate_x, #32; \
771 vshrn.s64 left_x_32_low, left_x, #32; \
772 vshrn.s64 right_x_32_low, right_x, #32; \
773 \
774 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
775 vadd.u64 left_x, left_x, left_dx_dy; \
776 vadd.u64 right_x, right_x, right_dx_dy; \
777 \
778 vshrn.s64 alternate_x_32_high, alternate_x, #32; \
779 vshrn.s64 left_x_32_high, left_x, #32; \
780 vshrn.s64 right_x_32_high, right_x, #32; \
781 \
782 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
783 vadd.u64 left_x, left_x, left_dx_dy; \
784 vadd.u64 right_x, right_x, right_dx_dy; \
785 \
786 vmovn.u32 alternate_x_16, alternate_x_32; \
787 setup_spans_y_select_##direction(); \
788 vmovn.u32 left_right_x_16_low, left_x_32; \
789 \
790 vmovn.u32 left_right_x_16_high, right_x_32; \
791 setup_spans_alternate_select_##alternate(); \
792 \
e1f6de8f 793 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
794 str b, [span_b_offset], #4; \
75e28f62
E
795 setup_spans_adjust_interpolants_##direction(); \
796 \
797 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
798 \
e1f6de8f 799 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
800 str b, [span_b_offset], #4; \
75e28f62
E
801 setup_spans_adjust_interpolants_##direction(); \
802 \
803 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
804 \
e1f6de8f 805 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
806 str b, [span_b_offset], #4; \
75e28f62
E
807 setup_spans_adjust_interpolants_##direction(); \
808 \
809 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
810 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
811 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
812 \
e1f6de8f 813 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
814 str b, [span_b_offset], #4; \
75e28f62
E
815 setup_spans_adjust_interpolants_##direction(); \
816 \
2d658c89 817 vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW; \
75e28f62
E
818 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
819 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
2d658c89 820 vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks; \
75e28f62 821 \
e1f6de8f 822 vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!; \
75e28f62
E
823 \
824 setup_spans_adjust_y_##direction() \
825
826
/*
 * setup_spans_set_x4_alternate_no(alternate, direction): emits four spans
 * (rows) for a triangle with only a left and a right edge. Identical to the
 * `yes` variant minus the alternate-edge stepping and selection; the
 * `alternate` argument is accepted for signature parity and is unused.
 *
 * Per group of four rows: extract the integer x of both edges (upper 32
 * bits of the 64-bit accumulators), clamp to [left_edge, right_edge],
 * derive width, span_shifts = 0xFFFE << ((width + 7) & 7) and
 * block count = min((width + 7) >> 3, MAX_BLOCKS_PER_ROW), store uvrg and b
 * for each row, write four interleaved edge records and advance y_x4.
 * Statement order is significant (scheduling and register aliasing).
 */
827#define setup_spans_set_x4_alternate_no(alternate, direction) \
828 vshrn.s64 left_x_32_low, left_x, #32; \
829 vshrn.s64 right_x_32_low, right_x, #32; \
830 \
831 vadd.u64 left_x, left_x, left_dx_dy; \
832 vadd.u64 right_x, right_x, right_dx_dy; \
833 \
834 vshrn.s64 left_x_32_high, left_x, #32; \
835 vshrn.s64 right_x_32_high, right_x, #32; \
836 \
837 vadd.u64 left_x, left_x, left_dx_dy; \
838 vadd.u64 right_x, right_x, right_dx_dy; \
839 \
840 vmovn.u32 left_right_x_16_low, left_x_32; \
841 vmovn.u32 left_right_x_16_high, right_x_32; \
842 \
e1f6de8f 843 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
844 str b, [span_b_offset], #4; \
75e28f62
E
845 setup_spans_adjust_interpolants_##direction(); \
846 \
847 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
848 \
e1f6de8f 849 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
850 str b, [span_b_offset], #4; \
75e28f62
E
851 setup_spans_adjust_interpolants_##direction(); \
852 \
853 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
854 \
e1f6de8f 855 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
856 str b, [span_b_offset], #4; \
75e28f62
E
857 setup_spans_adjust_interpolants_##direction(); \
858 \
859 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
860 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
861 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
862 \
e1f6de8f 863 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
864 str b, [span_b_offset], #4; \
75e28f62
E
865 setup_spans_adjust_interpolants_##direction(); \
866 \
2d658c89 867 vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW; \
75e28f62 868 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
2d658c89 869 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
870 vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks; \
75e28f62 871 \
e1f6de8f 872 vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!; \
75e28f62
E
873 \
874 setup_spans_adjust_y_##direction() \
875
876
/*
 * setup_spans_alternate_adjust_yes(): rebases the alternate edge by
 * subtracting the signed 64-bit product edge_dx_dy_alt * height_minor_a
 * from edge_alt_high:edge_alt_low (subs/sbc pair; clobbers the flags).
 * edge_adjust_low / edge_adjust_high use r11 / r12, which is why callers
 * invoke this before setup_spans_load_b() loads b / b_dy into the same
 * registers.
 * setup_spans_alternate_adjust_no(): expands to nothing.
 */
877#define edge_adjust_low r11
878#define edge_adjust_high r12
879
880#define setup_spans_alternate_adjust_yes() \
881 smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a; \
882 subs edge_alt_low, edge_alt_low, edge_adjust_low; \
883 sbc edge_alt_high, edge_alt_high, edge_adjust_high \
884
885#define setup_spans_alternate_adjust_no() \
886
887
/*
 * setup_spans_down(left_index, right_index, alternate, alternate_active):
 * generates spans for increasing y, starting at y_a.
 *
 *  - Bottom clip: if y_c > viewport_end_y, height is reduced by
 *    (y_c - viewport_end_y) and then incremented by 1.
 *  - Top clip: clip = viewport_start_y - y_a; when positive, height and y_a
 *    are adjusted and edges/interpolants skip `clip` rows.
 *  - Nothing is emitted when height <= 0 (branch to local label 1); height
 *    is capped at 512.
 *  - y_x4 is built as the four packed 16-bit values
 *    { y_a, y_a + 1, y_a + 2, y_a + 3 }.
 *  - height is stored (halfword) as psx_gpu->num_spans, then the loop at
 *    local label 2 emits four rows per iteration until height is used up,
 *    so up to three rows beyond height may be written.
 *
 * Uses numeric local labels 0, 1 and 2; each expansion must be followed by
 * code that does not rely on them.
 */
888#define setup_spans_down(left_index, right_index, alternate, alternate_active) \
889 setup_spans_alternate_adjust_##alternate_active(); \
890 setup_spans_load_b(); \
891 \
e1f6de8f 892 ldrsh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]; \
75e28f62
E
893 subs y_c, y_c, temp; \
894 subgt height, height, y_c; \
895 addgt height, height, #1; \
896 \
e1f6de8f 897 ldrsh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]; \
75e28f62
E
898 subs clip, temp, y_a; \
899 ble 0f; \
900 \
901 sub height, height, clip; \
902 add y_a, y_a, clip; \
903 setup_spans_clip(increment, alternate_active); \
904 \
905 0: \
906 cmp height, #0; \
907 ble 1f; \
908 \
909 orr temp, y_a, y_a, lsl #16; \
2d658c89 910 cmp height, #512; \
75e28f62 911 add temp, temp, #(1 << 16); \
2d658c89 912 movgt height, #512; \
75e28f62
E
913 add y_a, temp, #2; \
914 add y_a, y_a, #(2 << 16); \
ed0fd81d 915 vmov y_x4, temp, y_a; \
75e28f62
E
916 \
917 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
918 right_index); \
919 setup_spans_prologue_b(); \
920 \
e1f6de8f 921 strh height, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
922 \
923 2: \
924 setup_spans_set_x4_alternate_##alternate_active(alternate, down); \
925 subs height, height, #4; \
926 bhi 2b; \
927 \
928 1: \
929
930
/*
 * Helpers used only by setup_spans_up:
 *  - setup_spans_alternate_pre_increment_yes(): adds the sign-extended
 *    32-bit edge_dx_dy_alt to the 64-bit edge_alt_high:edge_alt_low
 *    (adds/adc pair; clobbers the flags). The `no` variant is empty.
 *  - setup_spans_up_decrement_yes(): conditionally (LE) subtracts 1 from
 *    height. It depends on the flags left by the instruction expanded just
 *    before it - in setup_spans_up that is the subs comparing
 *    viewport_start_y with y_c. The `no` variant is empty.
 */
931#define setup_spans_alternate_pre_increment_yes() \
932 adds edge_alt_low, edge_alt_low, edge_dx_dy_alt; \
933 adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31 \
934
935#define setup_spans_alternate_pre_increment_no() \
936
937
938#define setup_spans_up_decrement_yes() \
939 suble height, height, #1 \
940
941#define setup_spans_up_decrement_no() \
942
943
/*
 * setup_spans_up(left_index, right_index, alternate, alternate_active):
 * generates spans for decreasing y, starting one row above y_a
 * (y_a is decremented first).
 *
 *  - Top clip: if viewport_start_y > y_c, height is reduced by the
 *    difference; otherwise, with an active alternate edge, height is
 *    reduced by 1 (setup_spans_up_decrement_yes uses the same flags).
 *  - Bottom clip: clip = y_a - viewport_end_y; when positive, height and
 *    y_a are adjusted and edges/interpolants skip `clip` rows backwards.
 *  - Nothing is emitted when height <= 0; height is capped at 512.
 *  - y_x4 is built as { y_a, y_a - 1, y_a - 2, y_a - 3 }.
 *  - Both NEON edges and (if active) the alternate edge are advanced by one
 *    delta and the interpolants stepped back one row before the loop.
 *  - height is stored as psx_gpu->num_spans; the loop emits four rows per
 *    iteration.
 *
 * NOTE(review): the viewport bounds are loaded with ldrh here but with
 * ldrsh in setup_spans_down; this only matters if the bounds can be
 * negative - confirm against the field type.
 * Uses numeric local labels 0, 1 and 2.
 */
944#define setup_spans_up(left_index, right_index, alternate, alternate_active) \
945 setup_spans_alternate_adjust_##alternate_active(); \
946 setup_spans_load_b(); \
947 sub y_a, y_a, #1; \
948 \
e1f6de8f 949 ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]; \
75e28f62
E
950 subs temp, temp, y_c; \
951 subgt height, height, temp; \
952 setup_spans_up_decrement_##alternate_active(); \
953 \
e1f6de8f 954 ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]; \
75e28f62
E
955 subs clip, y_a, temp; \
956 ble 0f; \
957 \
958 sub height, height, clip; \
959 sub y_a, y_a, clip; \
960 setup_spans_clip(decrement, alternate_active); \
961 \
962 0: \
963 cmp height, #0; \
964 ble 1f; \
965 \
966 orr temp, y_a, y_a, lsl #16; \
2d658c89 967 cmp height, #512; \
75e28f62 968 sub temp, temp, #(1 << 16); \
2d658c89 969 movgt height, #512; \
75e28f62
E
970 sub y_a, temp, #2; \
971 sub y_a, y_a, #(2 << 16); \
ed0fd81d 972 vmov y_x4, temp, y_a; \
75e28f62
E
973 \
974 vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
975 \
976 setup_spans_alternate_pre_increment_##alternate_active(); \
977 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
978 right_index); \
979 setup_spans_adjust_interpolants_up(); \
980 setup_spans_prologue_b(); \
981 \
e1f6de8f 982 strh height, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
983 \
984 2: \
985 setup_spans_set_x4_alternate_##alternate_active(alternate, up); \
986 subs height, height, #4; \
987 bhi 2b; \
988 \
989 1: \
990
991
/*
 * setup_spans_epilogue(): counterpart of setup_spans_prologue(). Restores
 * q4-q7 if save_abi_regs() is enabled, then pops r4-r11 and returns by
 * loading the saved lr into pc.
 */
992#define setup_spans_epilogue() \
3f0189c6 993 restore_abi_regs(); \
75e28f62
E
994 ldmia sp!, { r4 - r11, pc } \
995
996
/*
 * setup_spans_up_up(minor, major): body of the two "upward, three-edge"
 * entry points. Heights are y_a - y_b (minor a), y_b - y_c (minor b) and
 * y_a - y_c (major). Both NEON edges start at x_a and end at x_c (lane 0,
 * full height) and x_b (lane 1, height_minor_a); the alternate edge runs
 * from x_b to x_c. `minor` / `major` are left or right and are forwarded to
 * setup_spans_up as (left_index = major, right_index = minor,
 * alternate = minor).
 *
 * setup_spans_up_left(psx_gpu, v_a, v_b, v_c):  minor = left,  major = right
 * setup_spans_up_right(psx_gpu, v_a, v_b, v_c): minor = right, major = left
 */
997#define setup_spans_up_up(minor, major) \
998 setup_spans_prologue(); \
999 sub height_minor_a, y_a, y_b; \
1000 sub height_minor_b, y_b, y_c; \
1001 sub height, y_a, y_c; \
1002 \
1003 vdup.u32 x_starts, x_a; \
ed0fd81d 1004 vmov x_ends, x_c, x_b; \
75e28f62
E
1005 \
1006 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
1007 setup_spans_up(major, minor, minor, yes); \
1008 setup_spans_epilogue() \
1009
1010function(setup_spans_up_left)
1011 setup_spans_up_up(left, right)
1012
1013function(setup_spans_up_right)
1014 setup_spans_up_up(right, left)
1015
75e28f62
E
/*
 * setup_spans_down_down(minor, major): body of the two "downward,
 * three-edge" entry points. Mirrors setup_spans_up_up with the height
 * differences reversed: y_b - y_a (minor a), y_c - y_b (minor b) and
 * y_c - y_a (major). Both NEON edges start at x_a and end at x_c (lane 0)
 * and x_b (lane 1); the alternate edge runs from x_b to x_c. Forwards to
 * setup_spans_down as (left_index = major, right_index = minor,
 * alternate = minor).
 *
 * setup_spans_down_left(psx_gpu, v_a, v_b, v_c):  minor = left,  major = right
 * setup_spans_down_right(psx_gpu, v_a, v_b, v_c): minor = right, major = left
 */
1016#define setup_spans_down_down(minor, major) \
1017 setup_spans_prologue(); \
1018 sub height_minor_a, y_b, y_a; \
1019 sub height_minor_b, y_c, y_b; \
1020 sub height, y_c, y_a; \
1021 \
1022 vdup.u32 x_starts, x_a; \
ed0fd81d 1023 vmov x_ends, x_c, x_b; \
75e28f62
E
1024 \
1025 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
1026 setup_spans_down(major, minor, minor, yes); \
1027 setup_spans_epilogue() \
1028
1029function(setup_spans_down_left)
1030 setup_spans_down_down(left, right)
1031
1032function(setup_spans_down_right)
1033 setup_spans_down_down(right, left)
1034
1035
/*
 * setup_spans_up_flat(): shared tail of the two-edge upward entry points.
 * height = y_a - y_c; both edges share that height
 * (compute_edge_delta_x2), there is no alternate edge, and lane 0 / lane 1
 * are used as the left / right side. The caller must have run
 * setup_spans_prologue() and filled x_starts / x_ends.
 *
 * setup_spans_up_a(psx_gpu, v_a, v_b, v_c): edges start at x_a (lane 0) and
 *   x_b (lane 1) and both end at x_c.
 * setup_spans_up_b(psx_gpu, v_a, v_b, v_c): both edges start at x_a and end
 *   at x_b (lane 0) and x_c (lane 1).
 */
1036#define setup_spans_up_flat() \
1037 sub height, y_a, y_c; \
1038 \
1039 compute_edge_delta_x2(); \
1040 setup_spans_up(left, right, none, no); \
1041 setup_spans_epilogue() \
1042
1043function(setup_spans_up_a)
1044 setup_spans_prologue()
1045
ed0fd81d 1046 vmov x_starts, x_a, x_b
75e28f62
E
1047 vdup.u32 x_ends, x_c
1048
1049 setup_spans_up_flat()
1050
1051function(setup_spans_up_b)
1052 setup_spans_prologue()
1053
1054 vdup.u32 x_starts, x_a
ed0fd81d 1055 vmov x_ends, x_b, x_c
75e28f62
E
1056
1057 setup_spans_up_flat()
1058
/*
 * setup_spans_down_flat(): shared tail of the two-edge downward entry
 * points. height = y_c - y_a; both edges share that height
 * (compute_edge_delta_x2), there is no alternate edge, and lane 0 / lane 1
 * are used as the left / right side. The caller must have run
 * setup_spans_prologue() and filled x_starts / x_ends.
 *
 * setup_spans_down_a(psx_gpu, v_a, v_b, v_c): edges start at x_a (lane 0)
 *   and x_b (lane 1) and both end at x_c.
 * setup_spans_down_b(psx_gpu, v_a, v_b, v_c): both edges start at x_a and
 *   end at x_b (lane 0) and x_c (lane 1).
 */
1059#define setup_spans_down_flat() \
1060 sub height, y_c, y_a; \
1061 \
1062 compute_edge_delta_x2(); \
1063 setup_spans_down(left, right, none, no); \
1064 setup_spans_epilogue() \
1065
1066function(setup_spans_down_a)
1067 setup_spans_prologue()
1068
ed0fd81d 1069 vmov x_starts, x_a, x_b
75e28f62
E
1070 vdup.u32 x_ends, x_c
1071
1072 setup_spans_down_flat()
1073
1074function(setup_spans_down_b)
1075 setup_spans_prologue()
1076
1077 vdup.u32 x_starts, x_a
ed0fd81d 1078 vmov x_ends, x_b, x_c
75e28f62
E
1079
1080 setup_spans_down_flat()
1081
1082
/*
 * Extra aliases for setup_spans_up_down, which keeps a second edge set
 * ("b") parked in q11 / q13 while the first set is being consumed.
 * middle_y shares r9 with height / height_major, and height_increment
 * shares d20 with right_x_32_low / tmp_max_blocks.
 *
 * setup_spans_up_down_load_edge_set_b(): makes edge set b current by
 * copying edges_xy_b (q11) into edges_xy (q0) and
 * { edges_dx_dy_b, edge_shifts_b } (q13) into { edges_dx_dy, edge_shifts }
 * (q1).
 */
1083#define middle_y r9
1084
1085#define edges_xy_b q11
1086#define edges_dx_dy_b d26
1087#define edge_shifts_b d27
1088#define edges_dx_dy_and_shifts_b q13
1089#define height_increment d20
1090
1091#define edges_dx_dy_and_shifts q1
1092
1093#define edges_xy_b_left d22
1094#define edges_xy_b_right d23
1095
1096#define setup_spans_up_down_load_edge_set_b() \
1097 vmov edges_xy, edges_xy_b; \
1098 vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b \
1099
1100
/*
 * setup_spans_up_down
 *
 * Builds spans in two passes that share vertex a as the middle row:
 *   pass 1 walks upward   (height_minor_a = y_a - y_b rows, y decreasing),
 *   pass 2 walks downward (height_minor_b = y_c - y_a rows, y increasing).
 * A second edge set ("b") is prepared up front and loaded between passes.
 * Both passes clip against psx_gpu viewport_start_y / viewport_end_y and
 * emit spans four at a time; num_spans in psx_gpu is updated per pass.
 *
 * Local labels: 0 = after clipping, 2 = 4-span emit loop, 4 = start of the
 * downward pass, 3 = upward pass empty, 5 = num_spans hit MAX_SPANS,
 * 1 = epilogue.
 */
function(setup_spans_up_down)
  setup_spans_prologue()

  sub height_minor_a, y_a, y_b        // rows above the middle vertex
  sub height_minor_b, y_c, y_a        // rows below the middle vertex
  sub height_major, y_c, y_b

  vmov x_starts, x_a, x_c             // x_starts = { x_a, x_c }
  vdup.u32 x_ends, x_b                // x_ends   = x_b in every lane

  compute_edge_delta_x3(x_a, height_minor_a, height_major)

  // edges_xy += edges_dx_dy * { 0, height_minor_b } (widening s32 multiply)
  mov temp, #0
  vmov height_increment, temp, height_minor_b
  vmlal.s32 edges_xy, edges_dx_dy, height_increment

  // Assemble edge set b: alternate edge on the left, current right edge.
  vmov edges_xy_b_left, edge_alt_low, edge_alt_high
  vmov edges_xy_b_right, edges_xy_right

  vmov edge_shifts_b, edge_shifts
  vmov.u32 edge_shifts_b[0], edge_shift_alt

  // Negated slopes for set b, with lane 0 replaced by the alternate slope.
  vneg.s32 edges_dx_dy_b, edges_dx_dy
  vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt

  // s32 middle_y = y_a;
  mov middle_y, y_a

  setup_spans_load_b()
  sub y_a, y_a, #1                    // upward pass starts one row above y_a

  // Drop rows that lie above the viewport: if start_y > y_b, shorten.
  ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
  subs temp, temp, y_b
  subgt height_minor_a, height_minor_a, temp

  // Rows below the viewport: clip = y_a - end_y, skipped when <= 0.
  ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
  subs clip, y_a, temp
  ble 0f

  sub height_minor_a, height_minor_a, clip
  sub y_a, y_a, clip
  setup_spans_clip(decrement, no)

0:
  cmp height_minor_a, #0
  ble 3f                              // nothing to draw in the upward pass

  // y_x4 = four 16-bit row numbers y_a, y_a-1, y_a-2, y_a-3
  // (assuming no borrow between the packed halfwords).
  orr temp, y_a, y_a, lsl #16
  sub temp, temp, #(1 << 16)
  sub y_a, temp, #2
  sub y_a, y_a, #(2 << 16)
  vmov y_x4, temp, y_a

  vaddw.s32 edges_xy, edges_xy, edges_dx_dy

  strh height_minor_a, [psx_gpu, #psx_gpu_num_spans_offset]

  setup_spans_adjust_edges_alternate_no(left, right)
  setup_spans_adjust_interpolants_up()
  setup_spans_up_down_load_edge_set_b()

  setup_spans_prologue_b()


2:
  setup_spans_set_x4_alternate_no(none, up)
  subs height_minor_a, height_minor_a, #4
  bhi 2b

  // height_minor_a is now in [-3, 0].  Step the three output pointers back
  // by that many entries (8, 16 and 4 bytes each) - presumably undoing the
  // overshoot of the last 4-wide iteration; confirm against
  // setup_spans_set_x4_alternate_no.
  add span_edge_data, span_edge_data, height_minor_a, lsl #3
  add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
  add span_b_offset, span_b_offset, height_minor_a, lsl #2

4:
  // Downward pass: reload uvrg from psx_gpu and restart from the middle row.
  add temp, psx_gpu, #psx_gpu_uvrg_offset
  vld1.32 { uvrg }, [temp]
  mov y_a, middle_y

  setup_spans_load_b()

  // If y_c > end_y: height_minor_b -= (y_c - end_y) - 1.
  ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
  subs y_c, y_c, temp
  subgt height_minor_b, height_minor_b, y_c
  addgt height_minor_b, height_minor_b, #1

  // Rows above the viewport: clip = start_y - y_a, skipped when <= 0.
  ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
  subs clip, temp, y_a
  ble 0f

  sub height_minor_b, height_minor_b, clip
  add y_a, y_a, clip
  setup_spans_clip(increment, no)

0:
  cmp height_minor_b, #0
  ble 1f                              // nothing to draw in the downward pass

  // y_x4 = four 16-bit row numbers y_a, y_a+1, y_a+2, y_a+3
  // (assuming no carry between the packed halfwords).
  orr temp, y_a, y_a, lsl #16
  add temp, temp, #(1 << 16)
  add y_a, temp, #2
  add y_a, y_a, #(2 << 16)
  vmov y_x4, temp, y_a

  setup_spans_adjust_edges_alternate_no(left, right)

  // num_spans += height_minor_b, with a special case at exactly MAX_SPANS.
  ldrh temp, [psx_gpu, #psx_gpu_num_spans_offset]
  add temp, temp, height_minor_b

  cmp temp, #MAX_SPANS
  beq 5f

  strh temp, [psx_gpu, #psx_gpu_num_spans_offset]

2:
  setup_spans_set_x4_alternate_no(none, down)
  subs height_minor_b, height_minor_b, #4
  bhi 2b

1:
  setup_spans_epilogue()

3:
  // Upward pass was empty: still switch to edge set b, then do pass 2.
  setup_spans_up_down_load_edge_set_b()
  setup_spans_prologue_b()
  bal 4b

5:
  // FIXME: overflow corner case
  // Total would be exactly MAX_SPANS: round height_minor_b down to a
  // multiple of 4 and store the reduced total.  The flags set by bics
  // survive the add/strh, so bne re-enters the emit loop only if any
  // rows remain.  Totals above MAX_SPANS are not handled here.
  sub temp, temp, height_minor_b
  bics height_minor_b, #3
  add temp, temp, height_minor_b
  strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
  bne 2b
  bal 1b
1235
/*
 * Register aliases for the setup_blocks_* family.
 *
 * Many names deliberately share a physical register; each function relies
 * on the earlier value being dead before the later name is written.
 * Notable overlaps (all visible in the defines below):
 *   r2  uvrg_dx_ptr / span_uvrg_offset / color
 *   r3  texture_mask_ptr / span_edge_data / color_r
 *   r7  y / c_32
 *   r8  left_x / dither_shift / b_dx4 / right_mask
 *   r9  b / b_dx8
 *   r10 dither_offset_ptr / dither_row / block_ptr_b / block_span_ptr
 *   q0  u_block / test_mask        q1  v_block / draw_mask
 *   q2  r_block / colors           q3  g_block / uvrg_dx
 *   q8  u_whole / v_left_x         q9  v_whole / uvrg (d18 uv, d19 rg)
 *   q10 r_whole / block_span       q12 b_whole / d24 r_whole_8, u_whole_8b
 *                                                d25 g_whole_8
 *   q13 dx4 / dx8 / uv_whole_8 / draw_mask_edge (d26 u_whole_8,
 *                                                d27 v_whole_8)
 *   q14 dither_offsets / uv_whole_8b (d28 dither_offsets_short)
 */
#undef span_uvrg_offset
#undef span_edge_data
#undef span_b_offset
#undef left_x
#undef b

/* Core registers. */
#define psx_gpu r0
#define num_spans r1
#define span_uvrg_offset r2
#define span_edge_data r3
#define span_b_offset r4
#define b_dx r5
#define span_num_blocks r6
#define y r7
#define left_x r8
#define b r9
#define dither_offset_ptr r10
#define block_ptr_a r11
#define fb_ptr r12
#define num_blocks r14

#define uvrg_dx_ptr r2
#define texture_mask_ptr r3
#define dither_shift r8
#define dither_row r10

#define c_32 r7
#define b_dx4 r8
#define b_dx8 r9
#define block_ptr_b r10

#define block_span_ptr r10
#define right_mask r8

#define color r2
#define color_r r3
#define color_g r4
#define color_b r5

#undef uvrg

/* Per-pixel 32-bit accumulators, four lanes each. */
#define u_block q0
#define v_block q1
#define r_block q2
#define g_block q3
#define b_block q4

/* Per-span gradients scaled by 4 and by 8, plus block-record scratch. */
#define uv_dx4 d10
#define rg_dx4 d11
#define uv_dx8 d12
#define rg_dx8 d13
#define b_whole_8 d14
#define fb_mask_ptrs d15

#define uvrg_dx4 q5
#define uvrg_dx8 q6

/* 16-bit narrowed values, eight lanes each. */
#define u_whole q8
#define v_whole q9
#define r_whole q10
#define g_whole q11
#define b_whole q12

#define u_whole_low d16
#define u_whole_high d17
#define v_whole_low d18
#define v_whole_high d19
#define r_whole_low d20
#define r_whole_high d21
#define g_whole_low d22
#define g_whole_high d23
#define b_whole_low d24
#define b_whole_high d25

#define dx4 q13
#define dx8 q13

/* 8-bit narrowed values, eight lanes each. */
#define u_whole_8 d26
#define v_whole_8 d27
#define u_whole_8b d24
#define r_whole_8 d24
#define g_whole_8 d25

#define uv_whole_8 q13
#define uv_whole_8b q14

#define dither_offsets q14
#define texture_mask q15
#define texture_mask_u d30
#define texture_mask_v d31

#define dither_offsets_short d28

#define v_left_x q8
#define uvrg q9
#define block_span q10

#define uv d18
#define rg d19

#define draw_mask q1
#define draw_mask_edge q13
#define test_mask q0

#define uvrg_dx q3

#define colors q2
1345
/*
 * Texture-coordinate swizzle hooks, selected by token pasting as
 * setup_blocks_texture_##swizzling() inside the textured builders.
 *
 * swizzled: exchanges the high nibble of every u byte with the low nibble
 * of the matching v byte:
 *   u_whole_8b = u & texture_mask_u          (copy of the original u)
 *   u = (u & 0x0f) | (v << 4)                (vsli keeps the low 4 bits)
 *   v = (v & 0xf0) | (u_whole_8b >> 4)       (vsri keeps the high 4 bits)
 * u_whole_8b aliases d24, so the caller must not have a live value there.
 */
#define setup_blocks_texture_swizzled() \
  vand.u8 u_whole_8b, u_whole_8, texture_mask_u; \
  vsli.u8 u_whole_8, v_whole_8, #4; \
  vsri.u8 v_whole_8, u_whole_8b, #4

/* unswizzled: coordinates are used as-is, so the hook expands to nothing. */
#define setup_blocks_texture_unswizzled()
1352
1353
/*
 * setup_blocks_shaded_textured_builder(swizzling)
 *
 * Emits setup_blocks_shaded_textured_dithered_<swizzling>_indirect.
 *
 * In:  r0 = psx_gpu.  Returns immediately when num_spans is 0.
 * Saves/restores r4-r11 and returns through the stacked lr.
 *
 * For each of the num_spans spans, span_num_blocks 64-byte block records
 * are appended to the psx_gpu blocks buffer.  Each record covers 16 bytes
 * of framebuffer (eight 16-bit pixels) and is laid out as:
 *   +0   u/v bytes, interleaved per pixel
 *   +16  r bytes, g bytes
 *   +32  b bytes, { right mask word (0 except last block), fb pointer }
 *   +48  dither offsets (eight s16)
 * block_ptr_a writes +0/+32, block_ptr_b writes +16/+48, both stepping by
 * c_32.  If the running block count would exceed MAX_BLOCKS the buffer is
 * flushed through flush_render_block_buffer (label 2) and refilled from
 * its start.
 *
 * Local labels: 0 = per-span loop, 3 = span setup, 4 = per-block loop,
 * 5 = last block of the span, 1 = advance to next span, 2 = flush path.
 *
 * The instruction order interleaves core and NEON work and depends on the
 * register aliasing of the setup_blocks_* defines; do not reorder casually.
 */
#define setup_blocks_shaded_textured_builder(swizzling) \
.align 3; \
  \
function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \
  ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
  add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
  \
  vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128]; \
  add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
  \
  cmp num_spans, #0; \
  bxeq lr; \
  \
  stmdb sp!, { r4 - r11, r14 }; \
  save_abi_regs(); \
  vshl.u32 uvrg_dx4, uvrg_dx, #2; \
  \
  ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
  vshl.u32 uvrg_dx8, uvrg_dx, #3; \
  \
  /* Broadcast the two consecutive mask bytes into texture_mask_u / _v. */ \
  vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16]; \
  add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
  \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
  add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
  \
  add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
  add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
  \
  /* Continue after the num_blocks records already queued (64 bytes each). */ \
  add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
  \
0: \
  vmov.u8 fb_mask_ptrs, #0; \
  \
  ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
  add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
  \
  ldrh y, [span_edge_data, #edge_data_y_offset]; \
  ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
  \
  cmp span_num_blocks, #0; \
  beq 1f; \
  \
  ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
  add num_blocks, span_num_blocks, num_blocks; \
  \
  cmp num_blocks, #MAX_BLOCKS; \
  bgt 2f; \
  \
3: \
  ldr b, [span_b_offset]; \
  add fb_ptr, fb_ptr, y, lsl #11;           /* 2048 bytes per scanline */ \
  \
  vdup.u32 v_left_x, left_x; \
  and y, y, #0x3; \
  \
  ldr dither_row, [dither_offset_ptr, y, lsl #2]; /* table[y & 3] */ \
  add fb_ptr, fb_ptr, left_x, lsl #1;       /* 2 bytes per pixel */ \
  \
  mla b, b_dx, left_x, b;                   /* b at the span's first pixel */ \
  and dither_shift, left_x, #0x03; \
  \
  vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
  /* uvrg_dx shares q3 with g_block, so rebuild it from uvrg_dx4. */ \
  vshr.u32 uvrg_dx, uvrg_dx4, #2; \
  \
  mov dither_shift, dither_shift, lsl #3; \
  vmla.u32 uvrg, uvrg_dx, v_left_x;         /* u,v,r,g at first pixel */ \
  \
  mov c_32, #32; \
  subs span_num_blocks, span_num_blocks, #1; /* flags consumed by beq 5f */ \
  \
  /* Rotate the 4-byte dither row by (left_x & 3) bytes. */ \
  mov dither_row, dither_row, ror dither_shift; \
  mov b_dx4, b_dx, lsl #2; \
  \
  vdup.u32 dither_offsets_short, dither_row; \
  add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
  \
  vdup.u32 b_block, b; \
  vshll.s8 dither_offsets, dither_offsets_short, #4; \
  \
  vdup.u32 u_block, uv[0]; \
  mov b_dx8, b_dx, lsl #3; \
  \
  vdup.u32 v_block, uv[1]; \
  vdup.u32 r_block, rg[0]; \
  vdup.u32 g_block, rg[1]; \
  \
  /* Add the five consecutive 16-byte block_span vectors (u, v, r, g, b). */ \
  vld1.u32 { block_span }, [block_span_ptr, :128]!; \
  \
  vadd.u32 u_block, u_block, block_span; \
  vld1.u32 { block_span }, [block_span_ptr, :128]!; \
  \
  vadd.u32 v_block, v_block, block_span; \
  vld1.u32 { block_span }, [block_span_ptr, :128]!; \
  \
  vadd.u32 r_block, r_block, block_span; \
  vld1.u32 { block_span }, [block_span_ptr, :128]!; \
  \
  vadd.u32 g_block, g_block, block_span; \
  vld1.u32 { block_span }, [block_span_ptr, :128]; \
  \
  vadd.u32 b_block, b_block, block_span; \
  add block_ptr_b, block_ptr_a, #16; \
  \
  /* Pixels 0-3: bits 16-31 of each accumulator. */ \
  vshrn.u32 u_whole_low, u_block, #16; \
  vshrn.u32 v_whole_low, v_block, #16; \
  vshrn.u32 r_whole_low, r_block, #16; \
  vshrn.u32 g_whole_low, g_block, #16; \
  \
  vdup.u32 dx4, uv_dx4[0]; \
  vshrn.u32 b_whole_low, b_block, #16; \
  \
  /* Pixels 4-7: high half of (accumulator + 4 * dx). */ \
  vaddhn.u32 u_whole_high, u_block, dx4; \
  vdup.u32 dx4, uv_dx4[1]; \
  \
  vaddhn.u32 v_whole_high, v_block, dx4; \
  vdup.u32 dx4, rg_dx4[0]; \
  \
  vaddhn.u32 r_whole_high, r_block, dx4; \
  vdup.u32 dx4, rg_dx4[1]; \
  \
  vaddhn.u32 g_whole_high, g_block, dx4; \
  vdup.u32 dx4, b_dx4; \
  \
  vaddhn.u32 b_whole_high, b_block, dx4; \
  vdup.u32 dx8, uv_dx8[0]; \
  \
  /* Advance every accumulator by 8 * dx for the next block. */ \
  vadd.u32 u_block, u_block, dx8; \
  vdup.u32 dx8, uv_dx8[1]; \
  \
  vadd.u32 v_block, v_block, dx8; \
  vdup.u32 dx8, rg_dx8[0]; \
  \
  vadd.u32 r_block, r_block, dx8; \
  vdup.u32 dx8, rg_dx8[1]; \
  \
  vadd.u32 g_block, g_block, dx8; \
  vdup.u32 dx8, b_dx8; \
  \
  vadd.u32 b_block, b_block, dx8; \
  vmovn.u16 u_whole_8, u_whole; \
  \
  vmovn.u16 v_whole_8, v_whole; \
  \
  vmovn.u16 b_whole_8, b_whole; \
  pld [fb_ptr]; \
  vmov.u32 fb_mask_ptrs[1], fb_ptr; \
  \
  vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
  setup_blocks_texture_##swizzling(); \
  \
  vmovn.u16 r_whole_8, r_whole; \
  beq 5f;                                   /* span has a single block */ \
  \
4: \
  /* Store the finished block while computing the next one. */ \
  vmovn.u16 g_whole_8, g_whole; \
  vshrn.u32 u_whole_low, u_block, #16; \
  \
  vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32; \
  vshrn.u32 v_whole_low, v_block, #16; \
  \
  vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32; \
  vshrn.u32 r_whole_low, r_block, #16; \
  \
  vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
  vshrn.u32 g_whole_low, g_block, #16; \
  \
  vdup.u32 dx4, uv_dx4[0]; \
  vshrn.u32 b_whole_low, b_block, #16; \
  \
  vaddhn.u32 u_whole_high, u_block, dx4; \
  vdup.u32 dx4, uv_dx4[1]; \
  \
  vaddhn.u32 v_whole_high, v_block, dx4; \
  vdup.u32 dx4, rg_dx4[0]; \
  \
  vaddhn.u32 r_whole_high, r_block, dx4; \
  vdup.u32 dx4, rg_dx4[1]; \
  \
  vaddhn.u32 g_whole_high, g_block, dx4; \
  vdup.u32 dx4, b_dx4; \
  \
  vaddhn.u32 b_whole_high, b_block, dx4; \
  vdup.u32 dx8, uv_dx8[0]; \
  \
  vadd.u32 u_block, u_block, dx8; \
  vdup.u32 dx8, uv_dx8[1]; \
  \
  vadd.u32 v_block, v_block, dx8; \
  vdup.u32 dx8, rg_dx8[0]; \
  \
  vadd.u32 r_block, r_block, dx8; \
  vdup.u32 dx8, rg_dx8[1]; \
  \
  vadd.u32 g_block, g_block, dx8; \
  vdup.u32 dx8, b_dx8; \
  \
  vadd.u32 b_block, b_block, dx8; \
  vmovn.u16 u_whole_8, u_whole; \
  \
  add fb_ptr, fb_ptr, #16;                  /* next 8 pixels */ \
  vmovn.u16 v_whole_8, v_whole; \
  \
  vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
  vmovn.u16 b_whole_8, b_whole; \
  \
  pld [fb_ptr]; \
  \
  vmov.u32 fb_mask_ptrs[1], fb_ptr; \
  subs span_num_blocks, span_num_blocks, #1; \
  \
  vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
  setup_blocks_texture_##swizzling(); \
  \
  vmovn.u16 r_whole_8, r_whole; \
  bne 4b; \
  \
5: \
  /* Last block of the span: apply the right-edge mask. */ \
  vmovn.u16 g_whole_8, g_whole; \
  ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
  \
  vld1.u32 { test_mask }, [psx_gpu, :128];  /* first 16 bytes of psx_gpu */ \
  vdup.u8 draw_mask, right_mask; \
  \
  vmov.u32 fb_mask_ptrs[0], right_mask; \
  vtst.u16 draw_mask, draw_mask, test_mask; \
  vzip.u8 u_whole_8, v_whole_8;             /* manual interleave, as vst2 */ \
  \
  vbic.u16 uv_whole_8, uv_whole_8, draw_mask; /* zero uv of masked pixels */ \
  vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32; \
  vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32; \
  vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
  vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
  \
1: \
  add span_uvrg_offset, span_uvrg_offset, #16; \
  add span_b_offset, span_b_offset, #4; \
  \
  add span_edge_data, span_edge_data, #8; \
  subs num_spans, num_spans, #1; \
  \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
  bne 0b; \
  \
  restore_abi_regs(); \
  ldmia sp!, { r4 - r11, pc }; \
  \
2: \
  /* Block buffer full: flush it, then restart this span at the base. */ \
  /* TODO: Load from psx_gpu instead of saving/restoring these */ \
  vpush { texture_mask }; \
  vpush { uvrg_dx4 }; \
  \
  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */ \
  bl flush_render_block_buffer; \
  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \
  \
  vpop { uvrg_dx4 }; \
  vpop { texture_mask }; \
  \
  /* Rebuild the state that was not saved across the call. */ \
  vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
  vmov.u8 fb_mask_ptrs, #0; \
  \
  mov num_blocks, span_num_blocks; \
  add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
  bal 3b
1619
1620
/* Instantiate the swizzled and unswizzled shaded+textured block builders. */
setup_blocks_shaded_textured_builder(swizzled)
setup_blocks_shaded_textured_builder(unswizzled)
1623
1624
/*
 * setup_blocks_unshaded_textured_builder(swizzling)
 *
 * Emits setup_blocks_unshaded_textured_dithered_<swizzling>_indirect.
 *
 * Same span/block walk as the shaded textured builder, but only u and v
 * are interpolated.  In each 64-byte block record:
 *   +0   u/v bytes, interleaved per pixel
 *   +16  not written (block_ptr_b is stepped past it)
 *   +32  b_whole_8 as currently held in d14 (not computed here),
 *        { right mask word (0 except last block), fb pointer }
 *   +48  dither offsets (eight s16)
 *
 * In:  r0 = psx_gpu.  Returns immediately when num_spans is 0.
 * Saves/restores r4-r11 and returns through the stacked lr.
 *
 * Local labels: 0 = per-span loop, 3 = span setup, 4 = per-block loop,
 * 5 = last block of the span, 1 = advance to next span, 2 = flush path.
 */
#define setup_blocks_unshaded_textured_builder(swizzling) \
.align 3; \
  \
function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \
  ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
  add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
  \
  vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128]; \
  add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
  \
  cmp num_spans, #0; \
  bxeq lr; \
  \
  stmdb sp!, { r4 - r11, r14 }; \
  save_abi_regs(); \
  vshl.u32 uvrg_dx4, uvrg_dx, #2; \
  \
  vshl.u32 uvrg_dx8, uvrg_dx, #3; \
  \
  /* Broadcast the two consecutive mask bytes into texture_mask_u / _v. */ \
  vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16]; \
  add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
  \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
  add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
  \
  add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
  \
  /* Continue after the num_blocks records already queued (64 bytes each). */ \
  add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
  \
0: \
  vmov.u8 fb_mask_ptrs, #0; \
  \
  ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
  add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
  \
  ldrh y, [span_edge_data, #edge_data_y_offset]; \
  ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
  \
  cmp span_num_blocks, #0; \
  beq 1f; \
  \
  ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
  add num_blocks, span_num_blocks, num_blocks; \
  \
  cmp num_blocks, #MAX_BLOCKS; \
  bgt 2f; \
  \
3: \
  add fb_ptr, fb_ptr, y, lsl #11;           /* 2048 bytes per scanline */ \
  \
  vdup.u32 v_left_x, left_x; \
  and y, y, #0x3; \
  \
  ldr dither_row, [dither_offset_ptr, y, lsl #2]; /* table[y & 3] */ \
  add fb_ptr, fb_ptr, left_x, lsl #1;       /* 2 bytes per pixel */ \
  \
  and dither_shift, left_x, #0x03; \
  \
  vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
  /* uvrg_dx (q3) is rebuilt from uvrg_dx4 for every span. */ \
  vshr.u32 uvrg_dx, uvrg_dx4, #2; \
  \
  mov dither_shift, dither_shift, lsl #3; \
  vmla.u32 uvrg, uvrg_dx, v_left_x;         /* values at first pixel */ \
  \
  mov c_32, #32; \
  subs span_num_blocks, span_num_blocks, #1; /* flags consumed by beq 5f */ \
  \
  /* Rotate the 4-byte dither row by (left_x & 3) bytes. */ \
  mov dither_row, dither_row, ror dither_shift; \
  \
  vdup.u32 dither_offsets_short, dither_row; \
  add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
  \
  vshll.s8 dither_offsets, dither_offsets_short, #4; \
  \
  vdup.u32 u_block, uv[0]; \
  \
  vdup.u32 v_block, uv[1]; \
  /* Add the two consecutive 16-byte block_span vectors (u, then v). */ \
  vld1.u32 { block_span }, [block_span_ptr, :128]!; \
  \
  vadd.u32 u_block, u_block, block_span; \
  vld1.u32 { block_span }, [block_span_ptr, :128]!; \
  \
  vadd.u32 v_block, v_block, block_span; \
  add block_ptr_b, block_ptr_a, #16; \
  \
  /* Pixels 0-3: bits 16-31 of each accumulator. */ \
  vshrn.u32 u_whole_low, u_block, #16; \
  vshrn.u32 v_whole_low, v_block, #16; \
  \
  vdup.u32 dx4, uv_dx4[0]; \
  \
  /* Pixels 4-7: high half of (accumulator + 4 * dx). */ \
  vaddhn.u32 u_whole_high, u_block, dx4; \
  vdup.u32 dx4, uv_dx4[1]; \
  \
  vaddhn.u32 v_whole_high, v_block, dx4; \
  vdup.u32 dx8, uv_dx8[0]; \
  \
  /* Advance both accumulators by 8 * dx for the next block. */ \
  vadd.u32 u_block, u_block, dx8; \
  vdup.u32 dx8, uv_dx8[1]; \
  \
  vadd.u32 v_block, v_block, dx8; \
  vmovn.u16 u_whole_8, u_whole; \
  \
  vmovn.u16 v_whole_8, v_whole; \
  \
  pld [fb_ptr]; \
  vmov.u32 fb_mask_ptrs[1], fb_ptr; \
  \
  vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
  setup_blocks_texture_##swizzling(); \
  \
  beq 5f;                                   /* span has a single block */ \
  \
4: \
  /* Store the finished block while computing the next one. */ \
  vshrn.u32 u_whole_low, u_block, #16; \
  \
  vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32; \
  vshrn.u32 v_whole_low, v_block, #16; \
  \
  add block_ptr_b, block_ptr_b, #32;        /* skip the unused +16 slot */ \
  vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
  \
  vdup.u32 dx4, uv_dx4[0]; \
  vaddhn.u32 u_whole_high, u_block, dx4; \
  vdup.u32 dx4, uv_dx4[1]; \
  \
  vaddhn.u32 v_whole_high, v_block, dx4; \
  vdup.u32 dx8, uv_dx8[0]; \
  \
  vadd.u32 u_block, u_block, dx8; \
  vdup.u32 dx8, uv_dx8[1]; \
  \
  vadd.u32 v_block, v_block, dx8; \
  vmovn.u16 u_whole_8, u_whole; \
  \
  add fb_ptr, fb_ptr, #16;                  /* next 8 pixels */ \
  vmovn.u16 v_whole_8, v_whole; \
  \
  vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
  pld [fb_ptr]; \
  \
  vmov.u32 fb_mask_ptrs[1], fb_ptr; \
  subs span_num_blocks, span_num_blocks, #1; \
  \
  vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
  setup_blocks_texture_##swizzling(); \
  \
  bne 4b; \
  \
5: \
  /* Last block of the span: apply the right-edge mask. */ \
  ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
  \
  vld1.u32 { test_mask }, [psx_gpu, :128];  /* first 16 bytes of psx_gpu */ \
  vdup.u8 draw_mask, right_mask; \
  \
  vmov.u32 fb_mask_ptrs[0], right_mask; \
  vtst.u16 draw_mask, draw_mask, test_mask; \
  vzip.u8 u_whole_8, v_whole_8;             /* manual interleave, as vst2 */ \
  \
  vbic.u16 uv_whole_8, uv_whole_8, draw_mask; /* zero uv of masked pixels */ \
  add block_ptr_b, block_ptr_b, #32;        /* skip the unused +16 slot */ \
  vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32; \
  vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
  vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
  \
1: \
  add span_uvrg_offset, span_uvrg_offset, #16; \
  add span_edge_data, span_edge_data, #8; \
  subs num_spans, num_spans, #1; \
  \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
  bne 0b; \
  \
  restore_abi_regs(); \
  ldmia sp!, { r4 - r11, pc }; \
  \
2: \
  /* Block buffer full: flush it, then restart this span at the base. */ \
  /* TODO: Load from psx_gpu instead of saving/restoring these */ \
  vpush { texture_mask }; \
  vpush { uvrg_dx4 }; \
  \
  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */ \
  bl flush_render_block_buffer; \
  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; \
  \
  vpop { uvrg_dx4 }; \
  vpop { texture_mask }; \
  \
  /* Rebuild the state that was not saved across the call. */ \
  vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
  vmov.u8 fb_mask_ptrs, #0; \
  \
  mov num_blocks, span_num_blocks; \
  add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
  bal 3b
1818
1819
/* Instantiate the swizzled and unswizzled unshaded+textured block builders. */
setup_blocks_unshaded_textured_builder(swizzled)
setup_blocks_unshaded_textured_builder(unswizzled)
1822
1823
.align 3

/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * Flat-colour triangle, queued through the block buffer.
 *
 * In:  r0 = psx_gpu.  Returns immediately when num_spans is 0.
 * Saves/restores r4-r11 and returns through the stacked lr.
 *
 * The 24-bit triangle colour is reduced to 15 bits (top five bits of each
 * 8-bit channel, red in bits 0-4, green 5-9, blue 10-14) and broadcast to
 * eight 16-bit lanes.  Each 64-byte block record is written as:
 *   +0   draw mask (all zero except on the last block of a span)
 *   +16  colours
 *   +32  d14 as-is (never written in this function),
 *        { fb_mask_ptrs[0], fb pointer }
 *   +48  not written
 *
 * Local labels: 0 = per-span loop, 3 = span setup, 4 = per-block loop,
 * 5 = last block of the span, 1 = advance to next span, 2 = flush path.
 */
function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
  ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
  veor.u32 draw_mask, draw_mask, draw_mask

  cmp num_spans, #0
  bxeq lr

  stmdb sp!, { r4 - r11, r14 }
  save_abi_regs()
  vld1.u32 { test_mask }, [psx_gpu, :128]   // first 16 bytes of psx_gpu

  ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]

  // Keep the top 5 bits of each 8-bit channel.
  ubfx color_r, color, #3, #5
  ubfx color_g, color, #11, #5
  ubfx color_b, color, #19, #5

  orr color, color_r, color_b, lsl #10
  orr color, color, color_g, lsl #5

  vdup.u16 colors, color

  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset

  // Continue after the num_blocks records already queued (64 bytes each).
  add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
  add block_ptr_a, block_ptr_a, num_blocks, lsl #6

0:
  ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
  ldrh y, [span_edge_data, #edge_data_y_offset]

  ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]

  cmp span_num_blocks, #0
  beq 1f

  ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
  add num_blocks, span_num_blocks, num_blocks

  cmp num_blocks, #MAX_BLOCKS
  bgt 2f

3:
  add fb_ptr, fb_ptr, y, lsl #11            // 2048 bytes per scanline

  add fb_ptr, fb_ptr, left_x, lsl #1        // 2 bytes per pixel
  mov c_32, #32                             // reuses r7; y is dead from here

  subs span_num_blocks, span_num_blocks, #1 // flags consumed by beq 5f

  add block_ptr_b, block_ptr_a, #16
  pld [fb_ptr]

  vmov.u32 fb_mask_ptrs[1], fb_ptr
  beq 5f                                    // span has a single block

4:
  vst1.u32 { draw_mask }, [block_ptr_a, :128], c_32
  vst1.u32 { colors }, [block_ptr_b, :128], c_32
  vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32

  add fb_ptr, fb_ptr, #16                   // next 8 pixels
  add block_ptr_b, block_ptr_b, #32         // skip the unused +48 slot

  pld [fb_ptr]

  vmov.u32 fb_mask_ptrs[1], fb_ptr
  subs span_num_blocks, span_num_blocks, #1

  bne 4b

5:
  // Last block of the span: expand the right-edge mask per pixel.
  ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]

  vdup.u8 draw_mask_edge, right_mask
  vtst.u16 draw_mask_edge, draw_mask_edge, test_mask

  vst1.u32 { colors }, [block_ptr_b, :128], c_32
  vst1.u32 { draw_mask_edge }, [block_ptr_a, :128], c_32
  add block_ptr_b, block_ptr_b, #32
  vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32

1:
  add span_edge_data, span_edge_data, #8
  subs num_spans, num_spans, #1

  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  bne 0b

  restore_abi_regs()
  ldmia sp!, { r4 - r11, pc }

2:
  // Block buffer full: flush it, then restart this span at the base.
  vpush { colors }

  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
  bl flush_render_block_buffer
  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }

  vpop { colors }

  // Rebuild the NEON state that was not saved across the call.
  vld1.u32 { test_mask }, [psx_gpu, :128]
  veor.u32 draw_mask, draw_mask, draw_mask

  mov num_blocks, span_num_blocks
  add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
  bal 3b
1935
1936
/* Extra aliases for the direct (no block buffer) fill path.
 * mask_msb_scalar shares r14 with num_blocks; msb_mask shares q15
 * (d30/d31) with texture_mask. */
#define mask_msb_scalar r14

#define msb_mask q15

#define pixels_low d16

#define msb_mask_low d30
#define msb_mask_high d31
1945
1946
.align 3

/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * Flat-colour triangle written straight to the framebuffer, bypassing the
 * block buffer.
 *
 * In:  r0 = psx_gpu.  Returns immediately when num_spans is 0.
 * Saves/restores r4-r11 and returns through the stacked lr.
 *
 * The 15-bit colour (see the ubfx/orr sequence) is OR-ed with the halfword
 * at psx_gpu_mask_msb_offset.  `colors` holds it in eight 16-bit lanes for
 * whole 8-pixel stores; `color` holds it twice in one word for the 2-pixel
 * and 1-pixel tail stores.
 *
 * Local labels: 0 = per-span loop, 2 = whole-block loop, 3 = last block,
 * 5 = last block fully drawn, 1 = advance to next span.
 */
function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
  ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]

  cmp num_spans, #0
  bxeq lr

  stmdb sp!, { r4 - r11, r14 }

  ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]

  // Keep the top 5 bits of each 8-bit channel.
  ubfx color_r, color, #3, #5
  ubfx color_g, color, #11, #5

  ldrh mask_msb_scalar, [psx_gpu, #psx_gpu_mask_msb_offset]
  ubfx color_b, color, #19, #5

  orr color, color_r, color_b, lsl #10
  orr color, color, color_g, lsl #5
  orr color, color, mask_msb_scalar

  vdup.u16 colors, color

  add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
  orr color, color, color, lsl #16          // same pixel in both halfwords


0:
  ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
  ldrh y, [span_edge_data, #edge_data_y_offset]

  ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]

  cmp span_num_blocks, #0
  beq 1f

  ldrh left_x, [span_edge_data, #edge_data_left_x_offset]

  add fb_ptr, fb_ptr, y, lsl #11            // 2048 bytes per scanline
  subs span_num_blocks, span_num_blocks, #1 // flags consumed by beq 3f

  add fb_ptr, fb_ptr, left_x, lsl #1        // 2 bytes per pixel
  beq 3f                                    // span has a single block

2:
  // Whole blocks: 8 pixels per store, pointer post-incremented.
  vst1.u32 { colors }, [fb_ptr]!
  subs span_num_blocks, span_num_blocks, #1

  bne 2b

3:
  // Last block: only the low byte of the right mask is examined.
  ldrb right_mask, [span_edge_data, #edge_data_right_mask_offset]

  cmp right_mask, #0x0
  beq 5f                                    // nothing masked: full store

  // Low 4 mask bits clear -> write 4 pixels and consume those bits.
  // The eq condition from tst covers all three following instructions.
  tst right_mask, #0xF
  streq color, [fb_ptr], #4
  moveq right_mask, right_mask, lsr #4
  streq color, [fb_ptr], #4

  // Next 2 mask bits clear -> write 2 pixels and consume those bits.
  tst right_mask, #0x3
  streq color, [fb_ptr], #4
  moveq right_mask, right_mask, lsr #2

  // Next mask bit clear -> write 1 pixel.
  tst right_mask, #0x1
  strheq color, [fb_ptr]

1:
  add span_edge_data, span_edge_data, #8
  subs num_spans, num_spans, #1
  bne 0b

  ldmia sp!, { r4 - r11, pc }

5:
  vst1.u32 { colors }, [fb_ptr]
  bal 1b
2026
2027
/*
 * Register alias remap for the shaded, untextured block builders.
 *
 * The textured aliases are dropped and re-declared on a different register
 * layout.  Overlaps visible in the defines below:
 *   r7  c_64 (also y / c_32)        r2  rg_dx_ptr
 *   d0  rg_dx      (low half of r_block q0)
 *   d6  v_left_x   (= r_whole_low)
 *   q4  uvrg / g_whole (d9 = rg = g_whole_high)
 *   q5  block_span / b_whole
 *   q6  gb_whole_8 (d12 g_whole_8, d13 b_whole_8)
 *   q9  d18 rg_dx4, d19 rg_dx8
 *   q10 dx4 / dx8 / test_mask
 *   q11 d22 d64_1, d23 d64_128
 *   q12 d128_4 (d24 = d64_4)
 */
#undef c_64

#define c_64 r7
#define rg_dx_ptr r2


#undef r_block
#undef g_block
#undef b_block
#undef r_whole
#undef g_whole
#undef b_whole
#undef r_whole_low
#undef r_whole_high
#undef g_whole_low
#undef g_whole_high
#undef b_whole_low
#undef b_whole_high
#undef r_whole_8
#undef g_whole_8
#undef b_whole_8
#undef dither_offsets
#undef rg_dx4
#undef rg_dx8
#undef dx4
#undef dx8
#undef v_left_x
#undef uvrg
#undef block_span
#undef rg
#undef draw_mask
#undef test_mask

/* Per-pixel 32-bit accumulators, four lanes each. */
#define r_block q0
#define g_block q1
#define b_block q2

/* 16-bit narrowed values, eight lanes each. */
#define r_whole q3
#define g_whole q4
#define b_whole q5

#define r_whole_low d6
#define r_whole_high d7
#define g_whole_low d8
#define g_whole_high d9
#define b_whole_low d10
#define b_whole_high d11

/* 8-bit narrowed values; g and b share one q register. */
#define gb_whole_8 q6

#define g_whole_8 d12
#define b_whole_8 d13

#define r_whole_8 d14

#define pixels q8

#define rg_dx4 d18
#define rg_dx8 d19

#define dx4 q10
#define dx8 q10

#define v_left_x d6
#define uvrg q4
#define block_span q5

#define rg d9

/* Byte-constant vectors (set with vmov.u8 in the builder). */
#define d64_1 d22
#define d64_128 d23

#define d128_4 q12
#define d128_0x7 q13

#define d64_4 d24

#define dither_offsets q14
#define draw_mask q15

#define dither_offsets_low d28

#define rg_dx d0
#define test_mask q10
2112
2113
/*
 * Dither hooks for the shaded, untextured builders, selected by token
 * pasting on the `dithering` argument.
 *
 * dithered, step a: saturating unsigned add of the dither offsets to the
 *                   r bytes (d-register) and to the g/b bytes (q-register).
 * dithered, step b: saturating unsigned subtract of the byte vectors
 *                   d64_4 / d128_4 from the same values.
 * undithered:       both steps expand to nothing.
 */
#define setup_blocks_shaded_untextured_dither_a_dithered() \
  vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low; \
  vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets;

#define setup_blocks_shaded_untextured_dither_b_dithered() \
  vqsub.u8 r_whole_8, r_whole_8, d64_4; \
  vqsub.u8 gb_whole_8, gb_whole_8, d128_4

#define setup_blocks_shaded_untextured_dither_a_undithered()

#define setup_blocks_shaded_untextured_dither_b_undithered()
2125
2126
/*
 * Emits setup_blocks_shaded_untextured_<dithering>_unswizzled_indirect.
 *
 * In:  psx_gpu = GPU state pointer.
 * Out: returns immediately (bx lr) when num_spans is zero; otherwise saves
 *      r4-r11/lr, walks every span and appends one 64-byte block per loop
 *      iteration to the block array at psx_gpu_blocks_offset:
 *        +0   draw mask
 *        +16  packed pixels
 *        +44  framebuffer destination pointer
 *      The running block count is written back to psx_gpu_num_blocks_offset
 *      after every span. When the count would exceed MAX_BLOCKS the buffer
 *      is flushed through flush_render_block_buffer first.
 *
 * Pixel packing: (r >> 3) * 1 + (g & ~7) * 4 + (b & ~7) * 128.
 *
 * The dithering argument selects the
 * setup_blocks_shaded_untextured_dither_{a,b}_<dithering> hooks.
 * NOTE(review): d128_4 is loaded with 4 while d64_4 is used as multiplier;
 * presumably d64_4 aliases half of d128_4 (aliases are defined outside this
 * macro) - confirm.
 */
#define setup_blocks_shaded_untextured_indirect_builder(dithering) \
.align 3; \
  \
function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \
  ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
  add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
  \
  vld1.u32 { rg_dx }, [rg_dx_ptr, :64]; \
  \
  /* Nothing to do without spans; no registers have been pushed yet. */ \
  cmp num_spans, #0; \
  bxeq lr; \
  \
  stmdb sp!, { r4 - r11, r14 }; \
  save_abi_regs(); \
  vshl.u32 rg_dx4, rg_dx, #2; \
  \
  ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
  vshl.u32 rg_dx8, rg_dx, #3; \
  \
  add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
  \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
  add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
  \
  add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
  add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
  \
  /* block_ptr_a = first free block (64 bytes per block). */ \
  add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
  vmov.u8 d64_1, #1; \
  \
  vmov.u8 d128_4, #4; \
  vmov.u8 d64_128, #128; \
  \
  vmov.u8 d128_0x7, #0x7; \
  \
  /* 0: per-span loop head. */ \
 0: \
  ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
  add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
  \
  ldrh y, [span_edge_data, #edge_data_y_offset]; \
  ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
  \
  cmp span_num_blocks, #0; \
  beq 1f; \
  \
  ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
  add num_blocks, span_num_blocks, num_blocks; \
  \
  /* Flush first if this span would overflow the block buffer. */ \
  cmp num_blocks, #MAX_BLOCKS; \
  bgt 2f; \
  \
  /* 3: span setup (also the re-entry point after a flush). */ \
 3: \
  ldr b, [span_b_offset]; \
  /* fb_ptr += y * 2048 bytes, then += left_x * 2 bytes below. */ \
  add fb_ptr, fb_ptr, y, lsl #11; \
  \
  vdup.u32 v_left_x, left_x; \
  and y, y, #0x3; \
  \
  /* Dither row is chosen by y & 3. */ \
  ldr dither_row, [dither_offset_ptr, y, lsl #2]; \
  add fb_ptr, fb_ptr, left_x, lsl #1; \
  \
  /* b += b_dx * left_x */ \
  mla b, b_dx, left_x, b; \
  and dither_shift, left_x, #0x03; \
  \
  vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
  vshr.u32 rg_dx, rg_dx4, #2; \
  \
  mov dither_shift, dither_shift, lsl #3; \
  vmla.u32 rg, rg_dx, v_left_x; \
  \
  mov c_64, #64; \
  /* Flags from this subs are consumed by "beq 5f" further down. */ \
  subs span_num_blocks, span_num_blocks, #1; \
  \
  /* Rotate the dither row by (left_x & 3) * 8 bits. */ \
  mov dither_row, dither_row, ror dither_shift; \
  mov b_dx4, b_dx, lsl #2; \
  \
  vdup.u32 dither_offsets, dither_row; \
  add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
  \
  vdup.u32 b_block, b; \
  vadd.u8 dither_offsets, dither_offsets, d128_4; \
  \
  mov b_dx8, b_dx, lsl #3; \
  vdup.u32 r_block, rg[0]; \
  vdup.u32 g_block, rg[1]; \
  \
  /* Add the three consecutive per-channel block_span vectors. */ \
  vld1.u32 { block_span }, [block_span_ptr, :128]!; \
  \
  vadd.u32 r_block, r_block, block_span; \
  vld1.u32 { block_span }, [block_span_ptr, :128]!; \
  \
  vadd.u32 g_block, g_block, block_span; \
  vld1.u32 { block_span }, [block_span_ptr, :128]; \
  \
  vadd.u32 b_block, b_block, block_span; \
  add block_ptr_b, block_ptr_a, #16; \
  \
  /* Low halves: bits 16..31 of the lanes; high halves: lanes + dx4. */ \
  vshrn.u32 r_whole_low, r_block, #16; \
  vshrn.u32 g_whole_low, g_block, #16; \
  vshrn.u32 b_whole_low, b_block, #16; \
  vdup.u32 dx4, rg_dx4[0]; \
  \
  vaddhn.u32 r_whole_high, r_block, dx4; \
  vdup.u32 dx4, rg_dx4[1]; \
  \
  vaddhn.u32 g_whole_high, g_block, dx4; \
  vdup.u32 dx4, b_dx4; \
  \
  vaddhn.u32 b_whole_high, b_block, dx4; \
  vdup.u32 dx8, rg_dx8[0]; \
  \
  /* Step the accumulators by dx8 for the next block. */ \
  vadd.u32 r_block, r_block, dx8; \
  vdup.u32 dx8, rg_dx8[1]; \
  \
  vadd.u32 g_block, g_block, dx8; \
  vdup.u32 dx8, b_dx8; \
  \
  vadd.u32 b_block, b_block, dx8; \
  \
  vmovn.u16 r_whole_8, r_whole; \
  vmovn.u16 g_whole_8, g_whole; \
  vmovn.u16 b_whole_8, b_whole; \
  \
  /* Single-block span: go straight to the right-edge block. */ \
  beq 5f; \
  /* Interior blocks are stored with an all-zero draw mask. */ \
  veor.u32 draw_mask, draw_mask, draw_mask; \
  \
  /* 4: interior blocks; the next block is computed while this one packs. */ \
 4: \
  setup_blocks_shaded_untextured_dither_a_##dithering(); \
  vshrn.u32 r_whole_low, r_block, #16; \
  \
  setup_blocks_shaded_untextured_dither_b_##dithering(); \
  vshrn.u32 g_whole_low, g_block, #16; \
  \
  vshrn.u32 b_whole_low, b_block, #16; \
  str fb_ptr, [block_ptr_a, #44]; \
  \
  vdup.u32 dx4, rg_dx4[0]; \
  vshr.u8 r_whole_8, r_whole_8, #3; \
  vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
  \
  vaddhn.u32 r_whole_high, r_block, dx4; \
  vdup.u32 dx4, rg_dx4[1]; \
  \
  vaddhn.u32 g_whole_high, g_block, dx4; \
  vdup.u32 dx4, b_dx4; \
  \
  vaddhn.u32 b_whole_high, b_block, dx4; \
  vdup.u32 dx8, rg_dx8[0]; \
  \
  /* pixels = r * 1 + g * 4 + b * 128 */ \
  vmull.u8 pixels, r_whole_8, d64_1; \
  vmlal.u8 pixels, g_whole_8, d64_4; \
  vmlal.u8 pixels, b_whole_8, d64_128; \
  \
  vadd.u32 r_block, r_block, dx8; \
  vdup.u32 dx8, rg_dx8[1]; \
  \
  vadd.u32 g_block, g_block, dx8; \
  vdup.u32 dx8, b_dx8; \
  \
  vadd.u32 b_block, b_block, dx8; \
  /* Advance the framebuffer pointer by 16 bytes per block. */ \
  add fb_ptr, fb_ptr, #16; \
  \
  vmovn.u16 r_whole_8, r_whole; \
  vmovn.u16 g_whole_8, g_whole; \
  vmovn.u16 b_whole_8, b_whole; \
  \
  vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64; \
  vst1.u32 { pixels }, [block_ptr_b, :128], c_64; \
  \
  pld [fb_ptr]; \
  \
  subs span_num_blocks, span_num_blocks, #1; \
  bne 4b; \
  \
  /* 5: last block of the span; draw mask comes from right_mask. */ \
 5: \
  str fb_ptr, [block_ptr_a, #44]; \
  setup_blocks_shaded_untextured_dither_a_##dithering(); \
  \
  ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
  setup_blocks_shaded_untextured_dither_b_##dithering(); \
  \
  vshr.u8 r_whole_8, r_whole_8, #3; \
  vdup.u8 draw_mask, right_mask; \
  \
  vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
  /* test_mask is the 16-byte vector at the start of psx_gpu. */ \
  vld1.u32 { test_mask }, [psx_gpu, :128]; \
  \
  vtst.u16 draw_mask, draw_mask, test_mask; \
  \
  vmull.u8 pixels, r_whole_8, d64_1; \
  vmlal.u8 pixels, g_whole_8, d64_4; \
  vmlal.u8 pixels, b_whole_8, d64_128; \
  \
  vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64; \
  vst1.u32 { pixels }, [block_ptr_b, :128], c_64; \
  \
  /* 1: advance to the next span and publish the block count. */ \
 1: \
  add span_uvrg_offset, span_uvrg_offset, #16; \
  add span_b_offset, span_b_offset, #4; \
  \
  add span_edge_data, span_edge_data, #8; \
  subs num_spans, num_spans, #1; \
  \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
  bne 0b; \
  \
  restore_abi_regs(); \
  ldmia sp!, { r4 - r11, pc }; \
  \
  /* 2: block buffer full - flush it, rebuild constants, restart at 3. */ \
 2: \
  /* TODO: Load from psx_gpu instead of saving/restoring these */ \
  vpush { rg_dx4 }; \
  \
  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
  bl flush_render_block_buffer; \
  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
  \
  vpop { rg_dx4 }; \
  \
  vmov.u8 d64_1, #1; \
  vmov.u8 d128_4, #4; \
  vmov.u8 d64_128, #128; \
  vmov.u8 d128_0x7, #0x7; \
  \
  /* rg_dx8 = 2 * rg_dx4 */ \
  vadd.u32 rg_dx8, rg_dx4, rg_dx4; \
  \
  /* The buffer is empty again: this span's blocks are the only ones. */ \
  mov num_blocks, span_num_blocks; \
  add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
  bal 3b

2357
/* Instantiate the indirect (block buffer) variants for both dither modes. */
setup_blocks_shaded_untextured_indirect_builder(undithered)
setup_blocks_shaded_untextured_indirect_builder(dithered)
2360
2361
/* Register re-aliasing for the direct (straight to framebuffer) builder. */
#undef draw_mask

#define mask_msb_ptr                             r14

#define draw_mask                                q0
/* d16 and d17 are the two halves of q8. */
#define pixels_low                               d16
#define pixels_high                              d17
2369
2370
2371
/*
 * Emits setup_blocks_shaded_untextured_<dithering>_unswizzled_direct.
 *
 * In:  psx_gpu = GPU state pointer.
 * Out: returns immediately (bx lr) when num_spans is zero; otherwise saves
 *      r4-r11/lr, walks every span and stores the packed pixels directly
 *      through fb_ptr instead of queueing blocks.
 *
 * Every pixel vector is seeded with msb_mask (the halfword at
 * psx_gpu_mask_msb_offset replicated into all lanes) and then accumulates
 * (r >> 3) * 1 + (g & ~7) * 4 + (b & ~7) * 128.
 *
 * The last block of a span is stored partially: the number of trailing zero
 * bits of right_mask (rbit + clz) indexes a jump table whose entries store
 * 1 to 8 halfwords (labels 4 to 11).
 *
 * NOTE(review): d128_4 is loaded with 4 while d64_4 is used as multiplier;
 * presumably d64_4 aliases half of d128_4 (aliases are defined outside this
 * macro) - confirm.
 */
#define setup_blocks_shaded_untextured_direct_builder(dithering) \
.align 3; \
  \
function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \
  ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
  add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
  \
  vld1.u32 { rg_dx }, [rg_dx_ptr, :64]; \
  \
  /* Nothing to do without spans; no registers have been pushed yet. */ \
  cmp num_spans, #0; \
  bxeq lr; \
  \
  stmdb sp!, { r4 - r11, r14 }; \
  save_abi_regs(); \
  vshl.u32 rg_dx4, rg_dx, #2; \
  \
  ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
  vshl.u32 rg_dx8, rg_dx, #3; \
  \
  add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
  add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
  \
  add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
  vmov.u8 d64_1, #1; \
  \
  vmov.u8 d128_4, #4; \
  vmov.u8 d64_128, #128; \
  \
  vmov.u8 d128_0x7, #0x7; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  /* Replicate the mask halfword into every lane of msb_mask. */ \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
  \
  /* 0: per-span loop head. */ \
 0: \
  ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
  add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
  \
  ldrh y, [span_edge_data, #edge_data_y_offset]; \
  ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
  \
  cmp span_num_blocks, #0; \
  beq 1f; \
  \
  ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
  /* fb_ptr += y * 2048 bytes, then += left_x * 2 bytes below. */ \
  add fb_ptr, fb_ptr, y, lsl #11; \
  \
  ldr b, [span_b_offset]; \
  vdup.u32 v_left_x, left_x; \
  and y, y, #0x3; \
  \
  /* Dither row is chosen by y & 3. */ \
  ldr dither_row, [dither_offset_ptr, y, lsl #2]; \
  add fb_ptr, fb_ptr, left_x, lsl #1; \
  \
  /* b += b_dx * left_x */ \
  mla b, b_dx, left_x, b; \
  and dither_shift, left_x, #0x03; \
  \
  vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
  vshr.u32 rg_dx, rg_dx4, #2; \
  \
  mov dither_shift, dither_shift, lsl #3; \
  vmla.u32 rg, rg_dx, v_left_x; \
  \
  /* Flags from this subs are consumed by "beq 3f" further down. */ \
  subs span_num_blocks, span_num_blocks, #1; \
  \
  /* Rotate the dither row by (left_x & 3) * 8 bits. */ \
  mov dither_row, dither_row, ror dither_shift; \
  mov b_dx4, b_dx, lsl #2; \
  \
  vdup.u32 dither_offsets, dither_row; \
  add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
  \
  vdup.u32 b_block, b; \
  vadd.u8 dither_offsets, dither_offsets, d128_4; \
  \
  mov b_dx8, b_dx, lsl #3; \
  vdup.u32 r_block, rg[0]; \
  vdup.u32 g_block, rg[1]; \
  \
  /* Add the three consecutive per-channel block_span vectors. */ \
  vld1.u32 { block_span }, [block_span_ptr, :128]!; \
  \
  vadd.u32 r_block, r_block, block_span; \
  vld1.u32 { block_span }, [block_span_ptr, :128]!; \
  \
  vadd.u32 g_block, g_block, block_span; \
  vld1.u32 { block_span }, [block_span_ptr, :128]; \
  \
  vadd.u32 b_block, b_block, block_span; \
  /* NOTE(review): block_ptr_a is never set up in this routine and */ \
  /* block_ptr_b is not read by the code visible here; this looks like a */ \
  /* leftover from the indirect variant - confirm before removing. */ \
  add block_ptr_b, block_ptr_a, #16; \
  \
  vshrn.u32 r_whole_low, r_block, #16; \
  vshrn.u32 g_whole_low, g_block, #16; \
  vshrn.u32 b_whole_low, b_block, #16; \
  vdup.u32 dx4, rg_dx4[0]; \
  \
  vaddhn.u32 r_whole_high, r_block, dx4; \
  vdup.u32 dx4, rg_dx4[1]; \
  \
  vaddhn.u32 g_whole_high, g_block, dx4; \
  vdup.u32 dx4, b_dx4; \
  \
  vaddhn.u32 b_whole_high, b_block, dx4; \
  vdup.u32 dx8, rg_dx8[0]; \
  \
  vadd.u32 r_block, r_block, dx8; \
  vdup.u32 dx8, rg_dx8[1]; \
  \
  vadd.u32 g_block, g_block, dx8; \
  vdup.u32 dx8, b_dx8; \
  \
  vadd.u32 b_block, b_block, dx8; \
  \
  vmovn.u16 r_whole_8, r_whole; \
  vmovn.u16 g_whole_8, g_whole; \
  vmovn.u16 b_whole_8, b_whole; \
  \
  /* Single-block span: go straight to the partial store. */ \
  beq 3f; \
  \
  /* 2: full blocks; the next block is computed while this one packs. */ \
 2: \
  setup_blocks_shaded_untextured_dither_a_##dithering(); \
  vshrn.u32 r_whole_low, r_block, #16; \
  \
  setup_blocks_shaded_untextured_dither_b_##dithering(); \
  vshrn.u32 g_whole_low, g_block, #16; \
  \
  vshrn.u32 b_whole_low, b_block, #16; \
  \
  vdup.u32 dx4, rg_dx4[0]; \
  vshr.u8 r_whole_8, r_whole_8, #3; \
  vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
  \
  vaddhn.u32 r_whole_high, r_block, dx4; \
  vdup.u32 dx4, rg_dx4[1]; \
  \
  /* Seed with the mask bit, then accumulate the channels. */ \
  vmov pixels, msb_mask; \
  vaddhn.u32 g_whole_high, g_block, dx4; \
  vdup.u32 dx4, b_dx4; \
  \
  vaddhn.u32 b_whole_high, b_block, dx4; \
  vdup.u32 dx8, rg_dx8[0]; \
  \
  vmlal.u8 pixels, r_whole_8, d64_1; \
  vmlal.u8 pixels, g_whole_8, d64_4; \
  vmlal.u8 pixels, b_whole_8, d64_128; \
  \
  vadd.u32 r_block, r_block, dx8; \
  vdup.u32 dx8, rg_dx8[1]; \
  \
  vadd.u32 g_block, g_block, dx8; \
  vdup.u32 dx8, b_dx8; \
  \
  vadd.u32 b_block, b_block, dx8; \
  \
  vmovn.u16 r_whole_8, r_whole; \
  vmovn.u16 g_whole_8, g_whole; \
  vmovn.u16 b_whole_8, b_whole; \
  \
  /* Store the whole vector and advance fb_ptr past it. */ \
  vst1.u32 { pixels }, [fb_ptr]!; \
  subs span_num_blocks, span_num_blocks, #1; \
  bne 2b; \
  \
  /* 3: last block of the span, stored partially. */ \
 3: \
  setup_blocks_shaded_untextured_dither_a_##dithering(); \
  \
  ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
  setup_blocks_shaded_untextured_dither_b_##dithering(); \
  \
  vshr.u8 r_whole_8, r_whole_8, #3; \
  /* rbit + clz = number of trailing zero bits in right_mask. */ \
  rbit right_mask, right_mask; \
  vmov pixels, msb_mask; \
  vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
  clz right_mask, right_mask; \
  \
  vmlal.u8 pixels, r_whole_8, d64_1; \
  vmlal.u8 pixels, g_whole_8, d64_4; \
  vmlal.u8 pixels, b_whole_8, d64_128; \
  \
  /* Table dispatch on right_mask; JT_OP_REL / JT_OP / JTE are defined */ \
  /* outside this macro. */ \
  JT_OP_REL(100f, right_mask, temp); \
  JT_OP(ldr pc, [pc, right_mask, lsl #2]); \
  nop; \
 100: \
  nop; \
  .word JTE(100b, 4f); \
  .word JTE(100b, 5f); \
  .word JTE(100b, 6f); \
  .word JTE(100b, 7f); \
  .word JTE(100b, 8f); \
  .word JTE(100b, 9f); \
  .word JTE(100b, 10f); \
  .word JTE(100b, 11f); \
  \
  /* 4: one halfword. */ \
 4: \
  vst1.u16 { pixels_low[0] }, [fb_ptr]; \
  bal 1f; \
  \
  /* 5: two halfwords. */ \
 5: \
  vst1.u32 { pixels_low[0] }, [fb_ptr]; \
  bal 1f; \
  \
  /* 6: three halfwords. */ \
 6: \
  vst1.u32 { pixels_low[0] }, [fb_ptr]!; \
  vst1.u16 { pixels_low[2] }, [fb_ptr]; \
  bal 1f; \
  \
  /* 7: four halfwords. */ \
 7: \
  vst1.u32 { pixels_low }, [fb_ptr]; \
  bal 1f; \
  \
  /* 8: five halfwords. */ \
 8: \
  vst1.u32 { pixels_low }, [fb_ptr]!; \
  vst1.u16 { pixels_high[0] }, [fb_ptr]; \
  bal 1f; \
  \
  /* 9: six halfwords. */ \
 9: \
  vst1.u32 { pixels_low }, [fb_ptr]!; \
  vst1.u32 { pixels_high[0] }, [fb_ptr]!; \
  bal 1f; \
  \
  /* 10: seven halfwords. */ \
 10: \
  vst1.u32 { pixels_low }, [fb_ptr]!; \
  vst1.u32 { pixels_high[0] }, [fb_ptr]!; \
  vst1.u16 { pixels_high[2] }, [fb_ptr]; \
  bal 1f; \
  \
  /* 11: all eight halfwords. */ \
 11: \
  vst1.u32 { pixels }, [fb_ptr]; \
  /* Branch to the very next label; kept so the code stays unchanged. */ \
  bal 1f; \
  \
  /* 1: advance to the next span. */ \
 1: \
  add span_uvrg_offset, span_uvrg_offset, #16; \
  add span_b_offset, span_b_offset, #4; \
  \
  add span_edge_data, span_edge_data, #8; \
  subs num_spans, num_spans, #1; \
  \
  bne 0b; \
  \
  restore_abi_regs(); \
  ldmia sp!, { r4 - r11, pc }

/* Instantiate the direct (framebuffer) variants for both dither modes. */
setup_blocks_shaded_untextured_direct_builder(undithered)
setup_blocks_shaded_untextured_direct_builder(dithered)
2611
2612
/*
 * Register aliases for texture_blocks_4bpp / texture_blocks_8bpp.
 * Several names deliberately share a register: each uv_N is overwritten by
 * the matching pixel_N, the pixels_a..d words reuse pixel registers, and
 * c_64 reuses r0 (psx_gpu).
 */
#undef psx_gpu
#undef num_blocks
#undef triangle
#undef c_64

#define psx_gpu                                  r0
#define block_ptr                                r1
#define num_blocks                               r2
/* Packed pairs of 16-bit uv values as loaded from the block. */
#define uv_01                                    r3
#define uv_23                                    r4
#define uv_45                                    r5
#define uv_67                                    r6
#define uv_0                                     r7
#define uv_1                                     r3
#define uv_2                                     r8
#define uv_3                                     r4
#define uv_4                                     r9
#define uv_5                                     r5
#define uv_6                                     r10
#define uv_7                                     r6
#define texture_ptr                              r11

#define pixel_0                                  r7
#define pixel_1                                  r3
#define pixel_2                                  r8
#define pixel_3                                  r4
#define pixel_4                                  r9
#define pixel_5                                  r5
#define pixel_6                                  r10
#define pixel_7                                  r6

#define pixels_a                                 r7
#define pixels_b                                 r9
#define pixels_c                                 r8
#define pixels_d                                 r10

#define c_64                                     r0

#define clut_ptr                                 r12
#define current_texture_mask                     r5
#define dirty_textures_mask                      r6

#define texels                                   d0

/* d2/d3 form q1 (clut_a), d4/d5 form q2 (clut_b). */
#define clut_low_a                               d2
#define clut_low_b                               d3
#define clut_high_a                              d4
#define clut_high_b                              d5

#define clut_a                                   q1
#define clut_b                                   q2

#define texels_low                               d6
#define texels_high                              d7
2667
.align 3

/* Texturing stage for untextured primitives: does nothing and returns. */
function(texture_blocks_untextured)
  bx lr
2672
2673
.align 3

/*
 * texture_blocks_4bpp
 *
 * In:  r0 = psx_gpu.
 * For each of the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu_blocks_offset) reads eight 16-bit uv offsets from the start of the
 * block, fetches one byte per offset relative to texture_ptr, translates the
 * eight bytes through the 32-byte table loaded from clut_ptr and writes the
 * eight 16-bit results back over the first 16 bytes of the block.
 *
 * If dirty_textures_4bpp_mask & current_texture_mask is non-zero,
 * update_texture_4bpp_cache is called first.
 *
 * NOTE(review): the loop tests its counter after the first pass, so it
 * assumes num_blocks >= 1 - confirm against the callers.
 */
function(texture_blocks_4bpp)
  stmdb sp!, { r3 - r11, r14 }
  add block_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]

  ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
  vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]

  ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
  /* De-interleave: clut_a keeps the even bytes, clut_b the odd bytes. */
  vuzp.u8 clut_a, clut_b

  ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
  tst dirty_textures_mask, current_texture_mask

  bne 1f
  /* c_64 lives in r0, so psx_gpu is gone from here on. */
  mov c_64, #64

0:
  ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }

  /* uv_N = texture_ptr + zero-extended 16-bit offset (low / high half). */
  uxtah uv_0, texture_ptr, uv_01
  uxtah uv_1, texture_ptr, uv_01, ror #16

  uxtah uv_2, texture_ptr, uv_23
  uxtah uv_3, texture_ptr, uv_23, ror #16

  uxtah uv_4, texture_ptr, uv_45
  ldrb pixel_0, [uv_0]

  uxtah uv_5, texture_ptr, uv_45, ror #16
  ldrb pixel_1, [uv_1]

  uxtah uv_6, texture_ptr, uv_67
  ldrb pixel_2, [uv_2]

  uxtah uv_7, texture_ptr, uv_67, ror #16
  ldrb pixel_3, [uv_3]

  ldrb pixel_4, [uv_4]
  /* Flags from this subs are consumed by the "bne 0b" at the loop end. */
  subs num_blocks, num_blocks, #1

  /* Pack bytes 0..3 into pixels_a and bytes 4..7 into pixels_b. */
  ldrb pixel_5, [uv_5]
  orr pixels_a, pixel_0, pixel_1, lsl #8

  ldrb pixel_6, [uv_6]
  orr pixels_b, pixel_4, pixel_5, lsl #8

  ldrb pixel_7, [uv_7]
  orr pixels_a, pixels_a, pixel_2, lsl #16

  orr pixels_b, pixels_b, pixel_6, lsl #16
  orr pixels_a, pixels_a, pixel_3, lsl #24

  orr pixels_b, pixels_b, pixel_7, lsl #24
  vmov texels, pixels_a, pixels_b

  /* Each fetched byte indexes the even-byte and odd-byte tables. */
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels

  /* Re-interleave the two byte planes and step to the next block. */
  vst2.u8 { texels_low, texels_high }, [block_ptr, :128], c_64
  bne 0b

  ldmia sp!, { r3 - r11, pc }

  /* Slow path: refresh the 4bpp texture cache, then enter the loop. */
1:
  stmdb sp!, { r1 - r2 }
  bl update_texture_4bpp_cache

  mov c_64, #64
  ldmia sp!, { r1 - r2 }
  bal 0b
2749
2750
.align 3

/*
 * texture_blocks_8bpp
 *
 * In:  r0 = psx_gpu.
 * For each of the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu_blocks_offset) reads eight 16-bit uv offsets from the start of the
 * block, fetches one byte per offset relative to texture_ptr, doubles it and
 * uses it as a byte offset into the halfword table at clut_ptr, then writes
 * the eight 16-bit results back over the first 16 bytes of the block.
 *
 * If dirty_textures_8bpp_mask & current_texture_mask is non-zero,
 * update_texture_8bpp_cache is called first.
 *
 * NOTE(review): the loop tests its counter after the first pass, so it
 * assumes num_blocks >= 1 - confirm against the callers.
 */
function(texture_blocks_8bpp)
  stmdb sp!, { r3 - r11, r14 }
  add block_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]

  ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
  ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]

  ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset]
  tst dirty_textures_mask, current_texture_mask

  bne 1f
  nop

0:
  ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }

  /* uv_N = texture_ptr + zero-extended 16-bit offset (low / high half). */
  uxtah uv_0, texture_ptr, uv_01
  uxtah uv_1, texture_ptr, uv_01, ror #16

  uxtah uv_2, texture_ptr, uv_23
  uxtah uv_3, texture_ptr, uv_23, ror #16

  uxtah uv_4, texture_ptr, uv_45
  ldrb pixel_0, [uv_0]

  uxtah uv_5, texture_ptr, uv_45, ror #16
  ldrb pixel_1, [uv_1]

  uxtah uv_6, texture_ptr, uv_67
  ldrb pixel_2, [uv_2]

  uxtah uv_7, texture_ptr, uv_67, ror #16
  ldrb pixel_3, [uv_3]

  /* Double every index: the table entries are halfwords. */
  ldrb pixel_4, [uv_4]
  add pixel_0, pixel_0, pixel_0

  ldrb pixel_5, [uv_5]
  add pixel_1, pixel_1, pixel_1

  ldrb pixel_6, [uv_6]
  add pixel_2, pixel_2, pixel_2

  ldrb pixel_7, [uv_7]
  add pixel_3, pixel_3, pixel_3

  ldrh pixel_0, [clut_ptr, pixel_0]
  add pixel_4, pixel_4, pixel_4

  ldrh pixel_1, [clut_ptr, pixel_1]
  add pixel_5, pixel_5, pixel_5

  ldrh pixel_2, [clut_ptr, pixel_2]
  add pixel_6, pixel_6, pixel_6

  ldrh pixel_3, [clut_ptr, pixel_3]
  add pixel_7, pixel_7, pixel_7

  /* Pack two 16-bit results per word. */
  ldrh pixel_4, [clut_ptr, pixel_4]
  orr pixels_a, pixel_0, pixel_1, lsl #16

  ldrh pixel_5, [clut_ptr, pixel_5]
  orr pixels_c, pixel_2, pixel_3, lsl #16

  ldrh pixel_6, [clut_ptr, pixel_6]
  /* Flags from this subs are consumed by the "bne 0b" at the loop end. */
  subs num_blocks, num_blocks, #1

  ldrh pixel_7, [clut_ptr, pixel_7]
  orr pixels_b, pixel_4, pixel_5, lsl #16

  orr pixels_d, pixel_6, pixel_7, lsl #16
  /* The list is r7, r8, r9, r10, i.e. pixels 0-1, 2-3, 4-5, 6-7. */
  stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d }

  add block_ptr, block_ptr, #64
  bne 0b

  ldmia sp!, { r3 - r11, pc }

  /* Slow path: refresh the 8bpp texture cache, then enter the loop. */
  /* r12 holds clut_ptr, which the loop still needs after the call. */
1:
  stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }

  bl update_texture_8bpp_cache

  ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
  bal 0b
2841
2842
/*
 * Register re-aliasing for texture_blocks_16bpp.
 * Registers are reused in a rolling fashion: v_N shares a register with
 * uv_(N+2), pixel_N shares one with uv_N, and v_7 reuses r0 (psx_gpu).
 */
#undef uv_0
#undef uv_1
#undef uv_2
#undef uv_3
#undef uv_4
#undef uv_5
#undef uv_6
#undef uv_7

#undef pixel_0
#undef pixel_1
#undef pixel_2
#undef pixel_3
#undef pixel_4
#undef pixel_5
#undef pixel_6
#undef pixel_7

#undef texture_ptr

#undef pixels_a
#undef pixels_b
#undef pixels_c
#undef pixels_d

#define psx_gpu                                  r0
#define block_ptr                                r1
#define num_blocks                               r2

#define uv_0                                     r3
#define uv_1                                     r4
#define u_0                                      r3
#define u_1                                      r4
#define v_0                                      r5
#define v_1                                      r6

#define uv_2                                     r5
#define uv_3                                     r6
#define u_2                                      r5
#define u_3                                      r6
#define v_2                                      r7
#define v_3                                      r8

#define uv_4                                     r7
#define uv_5                                     r8
#define u_4                                      r7
#define u_5                                      r8
#define v_4                                      r9
#define v_5                                      r10

#define uv_6                                     r9
#define uv_7                                     r10
#define u_6                                      r9
#define u_7                                      r10
#define v_6                                      r11
#define v_7                                      r0

#define pixel_0                                  r3
#define pixel_1                                  r4
#define pixel_2                                  r5
#define pixel_3                                  r6
#define pixel_4                                  r7
#define pixel_5                                  r8
#define pixel_6                                  r9
#define pixel_7                                  r10

#define pixels_a                                 r3
#define pixels_b                                 r5
#define pixels_c                                 r7
#define pixels_d                                 r9

#define texture_ptr                              r12
2915
2916
.align 3

/*
 * texture_blocks_16bpp
 *
 * In:  r0 = psx_gpu.
 * For each of the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu_blocks_offset) reads eight 16-bit uv values from the start of the
 * block. With u = uv & 0xFF and v = uv >> 8, each value becomes the byte
 * offset 2 * u + 2048 * v into texture_ptr; the halfword found there
 * replaces the uv value in the block.
 *
 * Register reuse is tight (see the aliases above): every ldrh of a new uv
 * pair overwrites the v_N registers of the previous pair, so the order of
 * the instructions below matters.
 *
 * NOTE(review): the loop tests its counter after the first pass, so it
 * assumes num_blocks >= 1 - confirm against the callers.
 */
function(texture_blocks_16bpp)
  stmdb sp!, { r3 - r11, r14 }
  add block_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]

0:
  ldrh uv_0, [block_ptr]
  /* Flags from this subs are consumed by the "bne 0b" at the loop end. */
  subs num_blocks, num_blocks, #1

  ldrh uv_1, [block_ptr, #2]

  and v_0, uv_0, #0xFF00
  and v_1, uv_1, #0xFF00

  and u_0, uv_0, #0xFF
  and u_1, uv_1, #0xFF

  /* uv = u + ((uv & 0xFF00) << 2); doubled below to a byte offset. */
  add uv_0, u_0, v_0, lsl #2
  ldrh uv_2, [block_ptr, #4]

  add uv_1, u_1, v_1, lsl #2
  ldrh uv_3, [block_ptr, #6]

  add uv_0, uv_0, uv_0
  add uv_1, uv_1, uv_1

  and v_2, uv_2, #0xFF00
  and v_3, uv_3, #0xFF00

  and u_2, uv_2, #0xFF
  and u_3, uv_3, #0xFF

  add uv_2, u_2, v_2, lsl #2
  ldrh uv_4, [block_ptr, #8]

  add uv_3, u_3, v_3, lsl #2
  ldrh uv_5, [block_ptr, #10]

  add uv_2, uv_2, uv_2
  add uv_3, uv_3, uv_3

  and v_4, uv_4, #0xFF00
  and v_5, uv_5, #0xFF00

  and u_4, uv_4, #0xFF
  and u_5, uv_5, #0xFF

  add uv_4, u_4, v_4, lsl #2
  ldrh uv_6, [block_ptr, #12]

  add uv_5, u_5, v_5, lsl #2
  ldrh uv_7, [block_ptr, #14]

  add uv_4, uv_4, uv_4
  ldrh pixel_0, [texture_ptr, uv_0]

  add uv_5, uv_5, uv_5
  ldrh pixel_1, [texture_ptr, uv_1]

  and v_6, uv_6, #0xFF00
  ldrh pixel_2, [texture_ptr, uv_2]

  /* v_7 is r0: psx_gpu is no longer needed at this point. */
  and v_7, uv_7, #0xFF00
  ldrh pixel_3, [texture_ptr, uv_3]

  and u_6, uv_6, #0xFF
  ldrh pixel_4, [texture_ptr, uv_4]

  and u_7, uv_7, #0xFF
  ldrh pixel_5, [texture_ptr, uv_5]

  add uv_6, u_6, v_6, lsl #2
  add uv_7, u_7, v_7, lsl #2

  add uv_6, uv_6, uv_6
  add uv_7, uv_7, uv_7

  /* Pack two 16-bit texels per word. */
  orr pixels_a, pixel_0, pixel_1, lsl #16
  orr pixels_b, pixel_2, pixel_3, lsl #16

  ldrh pixel_6, [texture_ptr, uv_6]
  orr pixels_c, pixel_4, pixel_5, lsl #16

  ldrh pixel_7, [texture_ptr, uv_7]
  orr pixels_d, pixel_6, pixel_7, lsl #16

  stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
  add block_ptr, block_ptr, #64

  bne 0b

  ldmia sp!, { r3 - r11, pc }
3013
3014
/*
 * Register aliases for the shade_blocks_* routines.
 * Shared registers: color_ptr / colors_scalar / mask_msb_ptr / c_32 are all
 * r2, colors_scalar_compare / block_ptr_store are r3, block_ptr_load_a is r0
 * (psx_gpu), texels / zero_mask are q4, pixels_r / fb_pixels are q8 and
 * pixels_g / pixels_gb_low are q9.
 *
 * NOTE(review): q4-q7 are callee-saved under the AAPCS; they are only
 * preserved if save_abi_regs() / restore_abi_regs() are enabled at the top
 * of the file - confirm this is intended for every caller.
 */
#undef num_blocks

#undef test_mask
#undef texels
#undef pixels_b
#undef pixels
#undef d64_1
#undef d64_4
#undef d64_128
#undef draw_mask
#undef msb_mask
#undef msb_mask_low
#undef msb_mask_high
#undef fb_pixels

#undef c_32
#undef fb_ptr
#undef mask_msb_ptr

#define psx_gpu                                  r0
#define num_blocks                               r1
#define color_ptr                                r2
#define colors_scalar                            r2
#define colors_scalar_compare                    r3
#define mask_msb_ptr                             r2

#define block_ptr_load_a                         r0
#define block_ptr_store                          r3
#define block_ptr_load_b                         r12
#define c_32                                     r2

#define c_48                                     r4
#define fb_ptr                                   r14
#define draw_mask_bits_scalar                    r5

#define d128_0x07                                q0
#define d128_0x1F                                q1
#define d128_0x8000                              q2
#define test_mask                                q3
#define texels                                   q4
#define colors_rg                                q5
#define colors_b_dm_bits                         q6
#define texels_rg                                q7
#define pixels_r                                 q8
#define pixels_g                                 q9
#define pixels_b                                 q10
#define pixels                                   q11
#define zero_mask                                q4
#define draw_mask                                q12
#define msb_mask                                 q13

#define fb_pixels                                q8

#define pixels_gb_low                            q9

/* D-register views of the Q registers above. */
#define colors_r                                 d10
#define colors_g                                 d11
#define colors_b                                 d12
#define draw_mask_bits                           d13
#define texels_r                                 d14
#define texels_g                                 d15
#define pixels_r_low                             d16
#define pixels_g_low                             d18
#define pixels_b_low                             d19
#define msb_mask_low                             d26
#define msb_mask_high                            d27

#define d64_1                                    d28
#define d64_4                                    d29
#define d64_128                                  d30
#define texels_b                                 d31
3086
/* Indirect target: set the 48-byte post-store stride and the store pointer. */
#define shade_blocks_textured_modulated_prologue_indirect() \
  mov c_48, #48; \
  add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset

/* Direct target: replicate the mask halfword into every lane of msb_mask. */
#define shade_blocks_textured_modulated_prologue_direct() \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]


/* Shaded primitives need no colour prologue. */
#define shade_blocks_textured_modulated_prologue_shaded(dithering, target)

/*
 * Undithered only: when the triangle colour word equals 0x00808080, branch
 * to the unmodulated routine for the same target instead.
 */
#define shade_blocks_textured_false_modulation_check_undithered(target) \
  ldr colors_scalar, [psx_gpu, #psx_gpu_triangle_color_offset]; \
  movw colors_scalar_compare, #0x8080; \
  \
  movt colors_scalar_compare, #0x80; \
  cmp colors_scalar, colors_scalar_compare; \
  beq shade_blocks_textured_unmodulated_##target

/* Dithered builds perform no such check. */
#define shade_blocks_textured_false_modulation_check_dithered(target)

/*
 * Unshaded primitives: run the check above, then broadcast bytes 0, 1 and 2
 * of the triangle colour word into colors_r, colors_g and colors_b.
 * colors_r is written last because the other two read their byte from it.
 */
#define shade_blocks_textured_modulated_prologue_unshaded(dithering, target) \
  shade_blocks_textured_false_modulation_check_##dithering(target); \
  add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset; \
  vld1.u32 { colors_r[] }, [color_ptr, :32]; \
  vdup.u8 colors_g, colors_r[1]; \
  vdup.u8 colors_b, colors_r[2]; \
  vdup.u8 colors_r, colors_r[0]
3115
3116
/* Dithered: preload an accumulator from block_ptr_load_b (no advance). */
#define shade_blocks_textured_modulated_load_dithered(target) \
  vld1.u32 { target }, [block_ptr_load_b, :128]

/* Dithered, last accumulator: same load, then advance by c_32. */
#define shade_blocks_textured_modulated_load_last_dithered(target) \
  vld1.u32 { target }, [block_ptr_load_b, :128], c_32

/* Undithered: nothing to preload. */
#define shade_blocks_textured_modulated_load_undithered(target)

/* Undithered, last accumulator: only advance the pointer by 32 bytes. */
#define shade_blocks_textured_modulated_load_last_undithered(target) \
  add block_ptr_load_b, block_ptr_load_b, #32

/* Dithered: accumulate texel * colour onto the preloaded value. */
#define shade_blocks_textured_modulate_dithered(channel) \
  vmlal.u8 pixels_##channel, texels_##channel, colors_##channel

/* Undithered: plain widening multiply texel * colour. */
#define shade_blocks_textured_modulate_undithered(channel) \
  vmull.u8 pixels_##channel, texels_##channel, colors_##channel

3134
/* Indirect: write the draw mask into the block; offset is unused. */
#define shade_blocks_textured_modulated_store_draw_mask_indirect(offset) \
  vst1.u32 { draw_mask }, [block_ptr_store, :128]!

/*
 * Direct: fetch the framebuffer pointer stored at (offset - 64) from
 * block_ptr_load_b, load the current framebuffer pixels and keep them in
 * every bit position where draw_mask is set.
 */
#define shade_blocks_textured_modulated_store_draw_mask_direct(offset) \
  ldr fb_ptr, [block_ptr_load_b, #(offset - 64)]; \
  vld1.u32 { fb_pixels }, [fb_ptr]; \
  vbit.u16 pixels, fb_pixels, draw_mask

/* Indirect: write the pixels into the block, then advance by c_48. */
#define shade_blocks_textured_modulated_store_pixels_indirect() \
  vst1.u32 { pixels }, [block_ptr_store, :128], c_48

/* Direct: write the merged pixels straight to the framebuffer. */
#define shade_blocks_textured_modulated_store_pixels_direct() \
  vst1.u32 { pixels }, [fb_ptr]

3149
/* Shaded: load the per-pixel r and g colours, advance by c_32. */
#define shade_blocks_textured_modulated_load_rg_shaded() \
  vld1.u32 { colors_r, colors_g }, [block_ptr_load_b, :128], c_32

/* Unshaded: colours are constant, only advance the pointer by 32 bytes. */
#define shade_blocks_textured_modulated_load_rg_unshaded() \
  add block_ptr_load_b, block_ptr_load_b, #32

/* Shaded: load the per-pixel b colours plus draw mask bits, advance by c_32. */
#define shade_blocks_textured_modulated_load_bdm_shaded() \
  vld1.u32 { colors_b, draw_mask_bits }, [block_ptr_load_a, :128], c_32

/* Unshaded: load only the draw mask bits word at +8, advance by 32 bytes. */
#define shade_blocks_textured_modulated_load_bdm_unshaded() \
  ldr draw_mask_bits_scalar, [block_ptr_load_a, #8]; \
  add block_ptr_load_a, block_ptr_load_a, #32

/* Broadcast the low 16 draw mask bits to every lane (NEON source). */
#define shade_blocks_textured_modulated_expand_draw_mask_shaded() \
  vdup.u16 draw_mask, draw_mask_bits[0]

/* Broadcast the low 16 draw mask bits to every lane (core register source). */
#define shade_blocks_textured_modulated_expand_draw_mask_unshaded() \
  vdup.u16 draw_mask, draw_mask_bits_scalar

3169
/* Indirect target: the mask bit is not applied at this stage. */
#define shade_blocks_textured_modulated_apply_msb_mask_indirect()

/* Direct target: OR msb_mask into every pixel. */
#define shade_blocks_textured_modulated_apply_msb_mask_direct() \
  vorr.u16 pixels, pixels, msb_mask

3175
/*
 * Emits shade_blocks_<shading>_textured_modulated_<dithering>_<target>.
 *
 * In:  psx_gpu = GPU state pointer.
 * For each of the num_blocks queued blocks: splits the 16-bit texels into
 * 5-bit r/g/b channels, multiplies each channel by its colour (per pixel for
 * shaded, constant for unshaded; added onto preloaded values for dithered),
 * saturates after a shift right by 4 and repacks as
 *   (texel & 0x8000) + (r >> 3) * 1 + (g & ~7) * 4 + (b & ~7) * 128.
 * The draw mask is the block's expanded mask bits tested against test_mask,
 * OR-ed with a mask of texels equal to zero.
 *
 * The loop is software pipelined: one block is prepared before label 0, each
 * pass stores the previous block while preparing the next, and label 1
 * stores the final block.
 *
 * NOTE(review): the loop tests its counter after the first block has been
 * prepared, so it assumes num_blocks >= 1 - confirm against the callers.
 * NOTE(review): for unshaded/undithered the prologue may tail-branch to
 * shade_blocks_textured_unmodulated_<target> after save_abi_regs(); that is
 * harmless only while save_abi_regs() expands to nothing - confirm before
 * enabling it.
 */
#define shade_blocks_textured_modulated_builder(shading, dithering, target) \
.align 3; \
  \
function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
  save_abi_regs(); \
  shade_blocks_textured_modulated_prologue_##shading(dithering, target); \
  stmdb sp!, { r4 - r5, lr }; \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
  \
  /* test_mask is the 16-byte vector at the start of psx_gpu. */ \
  vld1.u32 { test_mask }, [psx_gpu, :128]; \
  \
  shade_blocks_textured_modulated_prologue_##target(); \
  \
  /* block_ptr_load_a is r0: psx_gpu is gone from here on. */ \
  add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_32, #32; \
  \
  add block_ptr_load_b, block_ptr_load_a, #16; \
  vmov.u8 d64_1, #1; \
  vmov.u8 d64_4, #4; \
  vmov.u8 d64_128, #128; \
  \
  /* Prepare the first block. */ \
  vld1.u32 { texels }, [block_ptr_load_a, :128], c_32; \
  vmov.u8 d128_0x07, #0x07; \
  \
  shade_blocks_textured_modulated_load_rg_##shading(); \
  vmov.u8 d128_0x1F, #0x1F; \
  \
  shade_blocks_textured_modulated_load_bdm_##shading(); \
  vmov.u16 d128_0x8000, #0x8000; \
  \
  /* Channel split: r = bits 0..4, g = bits 5..9, b = bits 10..14. */ \
  vmovn.u16 texels_r, texels; \
  vshrn.u16 texels_g, texels, #5; \
  \
  vshrn.u16 texels_b, texels, #7; \
  shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
  \
  shade_blocks_textured_modulated_load_##dithering(pixels_r); \
  vtst.u16 draw_mask, draw_mask, test_mask; \
  \
  shade_blocks_textured_modulated_load_##dithering(pixels_g); \
  vand.u8 texels_rg, texels_rg, d128_0x1F; \
  \
  shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
  vshr.u8 texels_b, texels_b, #3; \
  \
  shade_blocks_textured_modulate_##dithering(r); \
  shade_blocks_textured_modulate_##dithering(g); \
  shade_blocks_textured_modulate_##dithering(b); \
  \
  /* Keep texel bit 15; zero_mask shares q4 with texels and replaces it. */ \
  vand.u16 pixels, texels, d128_0x8000; \
  vceq.u16 zero_mask, texels, #0; \
  \
  vqshrun.s16 pixels_r_low, pixels_r, #4; \
  vqshrun.s16 pixels_g_low, pixels_g, #4; \
  vqshrun.s16 pixels_b_low, pixels_b, #4; \
  \
  shade_blocks_textured_modulated_apply_msb_mask_##target(); \
  vorr.u16 draw_mask, draw_mask, zero_mask; \
  vshr.u8 pixels_r_low, pixels_r_low, #3; \
  vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
  \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
  \
  .align 3; \
  \
  /* 0: store the previous block while preparing the next one. */ \
 0: \
  vld1.u32 { texels }, [block_ptr_load_a, :128], c_32; \
  shade_blocks_textured_modulated_load_rg_##shading(); \
  vshrn.u16 texels_g, texels, #5; \
  \
  shade_blocks_textured_modulated_load_bdm_##shading(); \
  vshrn.u16 texels_b, texels, #7; \
  \
  pld [block_ptr_load_a]; \
  vmovn.u16 texels_r, texels; \
  /* Finish packing the previous block. */ \
  vmlal.u8 pixels, pixels_r_low, d64_1; \
  \
  vmlal.u8 pixels, pixels_g_low, d64_4; \
  vmlal.u8 pixels, pixels_b_low, d64_128; \
  shade_blocks_textured_modulated_store_draw_mask_##target(-4); \
  \
  shade_blocks_textured_modulated_load_##dithering(pixels_r); \
  shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
  \
  shade_blocks_textured_modulated_load_##dithering(pixels_g); \
  vand.u8 texels_rg, texels_rg, d128_0x1F; \
  \
  shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
  vtst.u16 draw_mask, draw_mask, test_mask; \
  \
  shade_blocks_textured_modulated_store_pixels_##target(); \
  vshr.u8 texels_b, texels_b, #3; \
  \
  shade_blocks_textured_modulate_##dithering(r); \
  shade_blocks_textured_modulate_##dithering(g); \
  shade_blocks_textured_modulate_##dithering(b); \
  \
  vand.u16 pixels, texels, d128_0x8000; \
  vceq.u16 zero_mask, texels, #0; \
  \
  /* Flags from this subs are consumed by "bne 0b" below. */ \
  subs num_blocks, num_blocks, #1; \
  \
  vqshrun.s16 pixels_r_low, pixels_r, #4; \
  vqshrun.s16 pixels_g_low, pixels_g, #4; \
  vqshrun.s16 pixels_b_low, pixels_b, #4; \
  \
  shade_blocks_textured_modulated_apply_msb_mask_##target(); \
  vorr.u16 draw_mask, draw_mask, zero_mask; \
  vshr.u8 pixels_r_low, pixels_r_low, #3; \
  vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
  \
  bne 0b; \
  \
  /* 1: pack and store the final block. */ \
 1: \
  vmlal.u8 pixels, pixels_r_low, d64_1; \
  vmlal.u8 pixels, pixels_g_low, d64_4; \
  vmlal.u8 pixels, pixels_b_low, d64_128; \
  \
  shade_blocks_textured_modulated_store_draw_mask_##target(28); \
  shade_blocks_textured_modulated_store_pixels_##target(); \
  \
  ldmia sp!, { r4 - r5, lr }; \
  restore_abi_regs(); \
  bx lr

3302
/*
 * Expand shade_blocks_textured_modulated_builder once for every combination
 * of its three arguments: {shaded, unshaded} x {dithered, undithered} x
 * {direct, indirect}, eight expansions in total.
 */
3303shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3304shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3305shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3306shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3307
3308shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3309shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3310shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3311shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3312
3313
/*
 * Register alias table for the unmodulated / untextured shade_blocks
 * functions that follow.  Aliases left over from the preceding section are
 * dropped first so they can be bound to different registers below.
 */
3314#undef c_64
3315#undef fb_ptr
3316#undef color_ptr
3317
3318#undef color_r
3319#undef color_g
3320#undef color_b
3321
3322#undef test_mask
3323#undef pixels
3324#undef draw_mask
3325#undef zero_mask
3326#undef fb_pixels
3327#undef msb_mask
3328#undef msb_mask_low
3329#undef msb_mask_high
3330
/*
 * Core registers.  Several aliases name the same register (r0: psx_gpu and
 * block_ptr_load; r2: mask_msb_ptr and c_64; r3: color_ptr,
 * draw_mask_store_ptr and fb_ptr; r12: draw_mask_bits_ptr and draw_mask_ptr;
 * r14: pixel_store_ptr and fb_ptr_next), so writing one alias destroys the
 * value held under the others.
 */
3331#define psx_gpu r0
3332#define num_blocks r1
3333#define mask_msb_ptr r2
3334#define color_ptr r3
3335
3336#define block_ptr_load r0
3337#define draw_mask_store_ptr r3
3338#define draw_mask_bits_ptr r12
3339#define draw_mask_ptr r12
3340#define pixel_store_ptr r14
3341
3342#define fb_ptr_cmp r4
3343
3344#define fb_ptr r3
3345#define fb_ptr_next r14
3346
3347#define c_64 r2
3348
/* NEON quad registers, each holding eight 16-bit lanes in the code below. */
3349#define test_mask q0
3350#define pixels q1
3351#define draw_mask q2
3352#define zero_mask q3
3353#define draw_mask_combined q4
3354#define fb_pixels q5
3355#define fb_pixels_next q6
3356#define msb_mask q7
3357
/* Double-register halves: d4/d5 make up q2 (draw_mask), d14/d15 make up q7 (msb_mask). */
3358#define draw_mask_low d4
3359#define draw_mask_high d5
3360#define msb_mask_low d14
3361#define msb_mask_high d15
3362
/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * In:  r0 = psx_gpu.
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset.  For every record it:
 *   - loads eight 16-bit pixels from record offset 0,
 *   - broadcasts the 16-bit value at record offset 40 to all lanes and tests
 *     it against test_mask (the first 16 bytes of psx_gpu), giving an
 *     all-ones lane wherever the two share a set bit,
 *   - builds zero_mask, an all-ones lane wherever the pixel equals 0,
 *   - stores the unchanged pixels to record offset 16,
 *   - stores (draw_mask | zero_mask) to record offset 0.
 * Nothing is written outside the block records.
 *
 * The loop is software pipelined: the combined mask of one record is stored
 * while the next record is being loaded, and the last one is stored at 1:.
 * num_blocks is decremented before it is first tested, so a count of zero is
 * not handled here.
 *
 * NOTE(review): q4 (draw_mask_combined) is written while save_abi_regs()
 * expands to nothing in this build (see the #if 0 near the top of the file).
 */
3363.align 3
3364function(shade_blocks_textured_unmodulated_indirect)
3f0189c6 3365 stmdb sp!, { r4, r14 }
3366 save_abi_regs()
75e28f62
E
  /* Set up the four per-record pointers; all psx_gpu-relative reads happen */
  /* before block_ptr_load (same register as psx_gpu) is written. */
3367 add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3368
e1f6de8f 3369 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
3370 add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3371
e1f6de8f 3372 vld1.u32 { test_mask }, [psx_gpu, :128]
75e28f62
E
3373 add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset
3374
3375 mov c_64, #64
3376 add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3377
  /* Prologue: fetch and classify the first record. */
e1f6de8f 3378 vld1.u32 { pixels }, [block_ptr_load, :128], c_64
75e28f62 3379 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
e1f6de8f 3380 [draw_mask_bits_ptr, :16], c_64
75e28f62
E
3381 vceq.u16 zero_mask, pixels, #0
3382
3383 vtst.u16 draw_mask, draw_mask, test_mask
e1f6de8f 3384 vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64
75e28f62
E
3385
3386 subs num_blocks, num_blocks, #1
3387 beq 1f
3388
  /* Steady state: combine the previous record's masks, then load, classify */
  /* and copy the next record before storing the combined mask. */
3389 0:
e1f6de8f 3390 vld1.u32 { pixels }, [block_ptr_load, :128], c_64
75e28f62
E
3391 vorr.u16 draw_mask_combined, draw_mask, zero_mask
3392
3393 vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
e1f6de8f 3394 [draw_mask_bits_ptr, :16], c_64
75e28f62
E
3395 vceq.u16 zero_mask, pixels, #0
3396
3397 vtst.u16 draw_mask, draw_mask, test_mask
e1f6de8f 3398 vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64
75e28f62 3399
e1f6de8f 3400 vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
75e28f62
E
3401 subs num_blocks, num_blocks, #1
3402
3403 bne 0b
3404
  /* Epilogue: store the combined mask of the final record. */
3405 1:
3406 vorr.u16 draw_mask_combined, draw_mask, zero_mask
e1f6de8f 3407 vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
75e28f62 3408
3f0189c6 3409 restore_abi_regs()
3410 ldmia sp!, { r4, pc }
75e28f62
E
3411
3412
/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In:  r0 = psx_gpu.
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset.  Per record:
 *   offset  0: eight 16-bit pixels,
 *   offset 40: 16-bit draw mask bits (broadcast, then tested against
 *              test_mask, the first 16 bytes of psx_gpu),
 *   offset 44: pointer to the eight destination pixels.
 * Every pixel is ORed with the broadcast 16-bit value at
 * psx_gpu + psx_gpu_mask_msb_offset and written to the destination, except
 * in lanes where the draw mask test hit or the source pixel is 0; those
 * lanes keep the destination's previous content (VBIF inserts only where
 * the combined mask is clear).
 *
 * The loop is software pipelined: the destination of record i+1 is loaded
 * before the merged result of record i is stored.  When the two
 * destinations lie within 14 bytes of each other the 16-byte accesses could
 * overlap, so the store is performed before the load (label 4).
 *
 * num_blocks is decremented before it is first tested, so a count of zero
 * is not handled here.
 *
 * NOTE(review): q4-q7 are written while save_abi_regs() expands to nothing
 * in this build (see the #if 0 near the top of the file).
 */
.align 3

function(shade_blocks_textured_unmodulated_direct)
  stmdb sp!, { r4, r14 }
  save_abi_regs()
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  /* mask_msb_ptr and c_64 share r2: load the mask before setting c_64. */
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
  mov c_64, #64

  /* Last read through psx_gpu; block_ptr_load reuses r0 from here on. */
  vld1.u32 { test_mask }, [psx_gpu, :128]
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  /* Prologue: fetch and classify the first record and its destination. */
  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [draw_mask_bits_ptr, :16], c_64
  ldr fb_ptr_next, [block_ptr_load, #44]

  vld1.u32 { pixels }, [block_ptr_load, :128], c_64
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  vceq.u16 zero_mask, pixels, #0
  vtst.u16 draw_mask, draw_mask, test_mask

  subs num_blocks, num_blocks, #1
  beq 1f

0:
  /* Record i becomes current; start fetching record i+1. */
  mov fb_ptr, fb_ptr_next
  ldr fb_ptr_next, [block_ptr_load, #44]

  /* zero_mask was taken from the raw pixels, so the OR cannot hide zeros. */
  vorr.u16 pixels, pixels, msb_mask

  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vmov fb_pixels, fb_pixels_next

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [draw_mask_bits_ptr, :16], c_64
  vbif.u16 fb_pixels, pixels, draw_mask_combined

  /* fb_ptr_cmp = (next - current) + 14; unsigned <= 28 means the two */
  /* destinations are at most 14 bytes apart in either direction. */
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr
  pld [fb_ptr_next, #64]

  add fb_ptr_cmp, fb_ptr_cmp, #14
  vld1.u32 { pixels }, [block_ptr_load, :128], c_64

  cmp fb_ptr_cmp, #28
  bls 4f

  /* Destinations are far apart: load the next one before storing this one. */
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  vceq.u16 zero_mask, pixels, #0

  vst1.u16 { fb_pixels }, [fb_ptr]
  vtst.u16 draw_mask, draw_mask, test_mask

3:
  subs num_blocks, num_blocks, #1
  bne 0b

1:
  /* Final record.  It needs the same msb_mask OR the loop applies to every */
  /* earlier record; without it the last (or only) block of a batch would */
  /* be written without the mask bit. */
  vorr.u16 pixels, pixels, msb_mask
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vbif.u16 fb_pixels_next, pixels, draw_mask_combined

  vst1.u16 { fb_pixels_next }, [fb_ptr_next]

  restore_abi_regs()
  ldmia sp!, { r4, pc }

4:
  /* Close destinations: store first so the following load sees the data */
  /* that was just written. */
  vst1.u16 { fb_pixels }, [fb_ptr]
  vceq.u16 zero_mask, pixels, #0

  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  vtst.u16 draw_mask, draw_mask, test_mask

  bal 3b
3490
3491
/*
 * shade_blocks_unshaded_untextured_indirect
 *
 * No-op: returns to the caller immediately without touching any register
 * or memory.
 */
3492function(shade_blocks_unshaded_untextured_indirect)
3493 bx lr
3494
/*
 * shade_blocks_unshaded_untextured_direct
 *
 * In:  r0 = psx_gpu.
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset.  The eight 16-bit pixels are loaded once,
 * from offset 16 of the first record, ORed with the broadcast 16-bit value
 * at psx_gpu + psx_gpu_mask_msb_offset, and reused for every record.
 * Per record, the 16-byte draw mask at offset 0 selects which lanes keep
 * the destination's previous content (mask bits set) and which receive the
 * pixels (mask bits clear); the destination pointer is read from offset 44.
 *
 * The loop is software pipelined; when consecutive destinations are at most
 * 14 bytes apart the store is done before the next load (label 4).
 * num_blocks is decremented before it is first tested, so a count of zero is
 * not handled here.
 *
 * NOTE(review): q5-q7 are written while save_abi_regs() expands to nothing
 * in this build (see the #if 0 near the top of the file).
 */
3495.align 3
3496
3497function(shade_blocks_unshaded_untextured_direct)
3498 stmdb sp!, { r4, r14 }
3f0189c6 3499 save_abi_regs()
75e28f62
E
3500 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
3501
e1f6de8f 3502 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
3503 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3504
e1f6de8f 3505 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
75e28f62
E
3506 add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3507
  /* block_ptr_load shares r0 with psx_gpu: this is the last psx_gpu use. */
3508 add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
e1f6de8f 3509 vld1.u16 { pixels }, [color_ptr, :128]
75e28f62
E
3510
3511 mov c_64, #64
e1f6de8f 3512 vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64
75e28f62
E
3513
  /* The flags set by subs are consumed by the beq three instructions later; */
  /* the ldr and vld1 in between do not modify them. */
3514 vorr.u16 pixels, pixels, msb_mask
3515 subs num_blocks, num_blocks, #1
3516
e1f6de8f 3517 ldr fb_ptr_next, [block_ptr_load], #64
75e28f62 3518
e1f6de8f 3519 vld1.u16 { fb_pixels_next }, [fb_ptr_next]
75e28f62
E
3520 beq 1f
3521
3522 0:
  /* Record i becomes current; fetch the destination pointer of record i+1. */
3523 vmov fb_pixels, fb_pixels_next
3524 mov fb_ptr, fb_ptr_next
e1f6de8f 3525 ldr fb_ptr_next, [block_ptr_load], #64
75e28f62
E
3526
  /* Merge with record i's mask, then replace it with record i+1's mask. */
3527 vbif.u16 fb_pixels, pixels, draw_mask
e1f6de8f 3528 vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64
75e28f62
E
3529
  /* (next - current) + 14, unsigned <= 28: destinations at most 14 bytes apart. */
3530 sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3531 add fb_ptr_cmp, fb_ptr_cmp, #14
3532 cmp fb_ptr_cmp, #28
3533 bls 4f
3534
e1f6de8f 3535 vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3536 vst1.u16 { fb_pixels }, [fb_ptr]
75e28f62
E
3537
3538 3:
3539 subs num_blocks, num_blocks, #1
3540 bne 0b
3541
  /* Final record: merge and store. */
3542 1:
3543 vbif.u16 fb_pixels_next, pixels, draw_mask
e1f6de8f 3544 vst1.u16 { fb_pixels_next }, [fb_ptr_next]
75e28f62 3545
3f0189c6 3546 restore_abi_regs()
75e28f62
E
3547 ldmia sp!, { r4, pc }
3548
  /* Close destinations: store first, then load, then rejoin the loop. */
3549 4:
e1f6de8f 3550 vst1.u16 { fb_pixels }, [fb_ptr]
3551 vld1.u16 { fb_pixels_next }, [fb_ptr_next]
75e28f62
E
3552 bal 3b
3553
3554
/*
 * Register alias table for the blend_blocks_* builders that follow.
 * The core-register aliases of the previous section are rebound first.
 */
3555#undef draw_mask_ptr
3556#undef c_64
3557#undef fb_ptr
3558#undef fb_ptr_next
3559#undef fb_ptr_cmp
3560
/*
 * Core registers.  draw_mask_ptr shares r0 with psx_gpu, and c_64 shares r2
 * with msb_mask_ptr.  The builders visible below address the mask through
 * mask_msb_ptr (defined earlier, also r2); msb_mask_ptr itself is not
 * referenced in the code visible in this section.
 */
3561#define psx_gpu r0
3562#define num_blocks r1
3563#define msb_mask_ptr r2
3564#define pixel_ptr r3
3565#define draw_mask_ptr r0
3566#define c_64 r2
3567#define fb_ptr r12
3568#define fb_ptr_next r14
3569#define fb_ptr_cmp r4
3570
/* Drop the NEON aliases that are rebound below. */
3571#undef msb_mask
3572#undef draw_mask
3573#undef pixels
3574#undef fb_pixels
3575#undef d128_0x8000
3576#undef msb_mask_low
3577#undef msb_mask_high
3578#undef draw_mask_next
3579#undef pixels_g
3580#undef blend_pixels
3581#undef fb_pixels_next
3582
/* Quad registers, first group (q0-q14, one alias per register). */
3583#define msb_mask q0
3584#define draw_mask q1
3585#define pixels q2
3586#define fb_pixels q3
3587#define blend_pixels q4
3588#define pixels_no_msb q5
3589#define blend_mask q6
3590#define fb_pixels_no_msb q7
3591#define d128_0x8000 q8
3592#define d128_0x0421 q9
3593#define fb_pixels_next q10
3594#define blend_pixels_next q11
3595#define pixels_next q12
3596#define draw_mask_next q13
3597#define write_mask q14
3598
/*
 * Second group.  These names reuse registers of the first group (q5, q7-q13)
 * and of each other (q7, q11, q13), so only a non-conflicting subset can be
 * live inside any one routine.
 */
3599#define pixels_rb q5
3600#define pixels_mg q7
3601#define pixels_g q7
3602#define d128_0x7C1F q8
3603#define d128_0x03E0 q9
3604#define fb_pixels_rb q10
3605#define fb_pixels_g q11
3606#define fb_pixels_masked q11
3607#define d128_0x83E0 q15
3608#define pixels_fourth q7
3609#define d128_0x1C07 q12
3610#define d128_0x00E0 q13
3611#define d128_0x80E0 q13
3612
/* d0/d1 are the two halves of q0 (msb_mask). */
3613#define msb_mask_low d0
3614#define msb_mask_high d1
3615
/*
 * Building blocks selected by token pasting inside
 * blend_blocks_average_builder(texturing, mask_evaluate).
 */

/* textured: blend_mask lane = all ones where bit 15 of source is set. */
3616#define blend_blocks_average_set_blend_mask_textured(source) \
3617 vclt.s16 blend_mask, source, #0 \
3618
/* textured: set bit 15 in every lane of blend_pixels. */
3619#define blend_blocks_average_set_stp_bit_textured() \
3620 vorr.u16 blend_pixels, #0x8000 \
3621
/* textured: where blend_mask is clear, replace blend_pixels with source. */
3622#define blend_blocks_average_combine_textured(source) \
3623 vbif.u16 blend_pixels, source, blend_mask \
3624
/* untextured: the three steps above expand to nothing. */
3625#define blend_blocks_average_set_blend_mask_untextured(source) \
3626
3627#define blend_blocks_average_set_stp_bit_untextured() \
3628
3629#define blend_blocks_average_combine_untextured(source) \
3630
/* on: write_mask lane = all ones where bit 15 of fb_pixels_next is set. */
3631#define blend_blocks_average_mask_set_on() \
3632 vclt.s16 write_mask, fb_pixels_next, #0 \
3633
/* on: current draw mask = next draw mask OR write_mask. */
3634#define blend_blocks_average_mask_copy_on() \
3635 vorr.u16 draw_mask, draw_mask_next, write_mask \
3636
/* on: fold write_mask into draw_mask_next in place (used at the tail). */
3637#define blend_blocks_average_mask_copy_b_on() \
3638 vorr.u16 draw_mask_next, draw_mask_next, write_mask \
3639
/* off: no write mask is computed; the draw mask is only copied. */
3640#define blend_blocks_average_mask_set_off() \
3641
3642#define blend_blocks_average_mask_copy_off() \
3643 vmov draw_mask, draw_mask_next \
3644
3645#define blend_blocks_average_mask_copy_b_off() \
3646
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>.
 *   texturing     - textured | untextured
 *   mask_evaluate - on | off
 *
 * In:  r0 = psx_gpu.
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset.  Per record:
 *   offset  0: 16-byte draw mask (set bits keep the destination),
 *   offset 16: eight 16-bit source pixels,
 *   offset 44: pointer to the eight destination pixels.
 *
 * Blend: with bit 15 cleared on both sides,
 *   blend = (fb + (src - ((src ^ fb) & 0x0421))) >> 1   (VHADD)
 * 0x0421 selects bit 0 of each 5-bit field of the 16-bit pixel; removing
 * the fields' low-bit difference before the halving add keeps a carry from
 * one field out of the field below it.
 * textured:  source pixels with bit 15 clear are copied unblended; blended
 *            lanes get bit 15 set.
 * Finally msb_mask (the broadcast 16-bit value at
 * psx_gpu + psx_gpu_mask_msb_offset) is ORed in and the result is merged
 * into the destination under the draw mask.
 * mask_evaluate == on: destination lanes whose bit 15 is already set are
 * added to the draw mask and therefore left untouched.
 *
 * The loop is software pipelined; when consecutive destinations are at most
 * 14 bytes apart the store is done before the next destination is loaded
 * (label 2).  num_blocks is decremented before it is first tested, so a
 * count of zero is not handled.
 *
 * NOTE(review): q4-q7 are written while save_abi_regs() expands to nothing
 * in this build (see the #if 0 near the top of the file).
 */
#define blend_blocks_average_builder(texturing, mask_evaluate) \
.align 3; \
 \
function(blend_blocks_##texturing##_average_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  save_abi_regs(); \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  /* draw_mask_ptr reuses r0 and c_64 reuses r2 from here on. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x8000, #0x8000; \
  vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
 \
  /* 0x0421 is assembled in two steps: 0x0400 | 0x0021. */ \
  vmov.u16 d128_0x0421, #0x0400; \
  vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64; \
 \
  vorr.u16 d128_0x0421, #0x0021; \
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]; \
 \
  /* Prologue: pre-compute the blend operands of the first record. */ \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
0: \
  /* Record i becomes current; start fetching record i+1. */ \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
 \
  vmov pixels, pixels_next; \
  vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64; \
 \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_mask_copy_##mask_evaluate(); \
  vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  vmov fb_pixels, fb_pixels_next; \
  blend_blocks_average_combine_##texturing(pixels); \
 \
  /* (next - current) + 14, unsigned <= 28: at most 14 bytes apart. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  /* Far apart: prepare record i+1 while finishing and storing record i. */ \
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]; \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
 \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
 \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vbif.u16 fb_pixels, blend_pixels, draw_mask; \
 \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vst1.u16 { fb_pixels }, [fb_ptr]; \
 \
3: \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
1: \
  /* Final record. */ \
  blend_blocks_average_mask_copy_b_##mask_evaluate(); \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels_next); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  blend_blocks_average_combine_##texturing(pixels_next); \
 \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next; \
  vst1.u16 { fb_pixels_next }, [fb_ptr_next]; \
 \
  restore_abi_regs(); \
  ldmia sp!, { r4, pc }; \
 \
2: \
  /* Close destinations: store record i first, then load record i+1. */ \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbif.u16 fb_pixels, blend_pixels, draw_mask; \
  vst1.u16 { fb_pixels }, [fb_ptr]; \
 \
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]; \
  /* write_mask must follow the freshly loaded destination, exactly as on */ \
  /* the far-apart path; otherwise the next record would be masked with */ \
  /* the bit-15 state of the previous destination address. */ \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  bal 3b

/* One routine per (texturing, mask_evaluate) combination. */
blend_blocks_average_builder(textured, off)
blend_blocks_average_builder(untextured, off)
blend_blocks_average_builder(textured, on)
blend_blocks_average_builder(untextured, on)
3755
3756
/*
 * Mask steps selected by token pasting inside the blend_blocks_add_*
 * builders through their mask_evaluate argument.
 */

/* on: write_mask lane = all ones where bit 15 of fb_pixels is set. */
3757#define blend_blocks_add_mask_set_on() \
3758 vclt.s16 write_mask, fb_pixels, #0 \
3759
/* on: fold write_mask into draw_mask in place. */
3760#define blend_blocks_add_mask_copy_on() \
3761 vorr.u16 draw_mask, draw_mask, write_mask \
3762
/* off: both steps expand to nothing. */
3763#define blend_blocks_add_mask_set_off() \
3764
3765#define blend_blocks_add_mask_copy_off() \
3766
3767
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate> (mask_evaluate: on | off).
 *
 * In:  r0 = psx_gpu.
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset.  Per record:
 *   offset  0: 16-byte draw mask (set bits keep the destination),
 *   offset 16: eight 16-bit source pixels,
 *   offset 44: pointer to the eight destination pixels.
 *
 * The pixel is split into the 0x7C1F fields and the 0x03E0 field, which are
 * added separately and clamped with VMIN.  Destination lanes whose source
 * pixel has bit 15 clear are zeroed before the add (blend_mask), so those
 * lanes end up holding the source pixel unchanged.  Bit 15 of the source,
 * ORed with msb_mask, travels through the 0x83E0 half of the sum.
 * mask_evaluate == on: destination lanes whose bit 15 is already set are
 * added to the draw mask and therefore left untouched.
 *
 * The loop is software pipelined; when consecutive destinations are at most
 * 14 bytes apart the store is done before the next destination is loaded
 * (label 2).  num_blocks is decremented before it is first tested, so a
 * count of zero is not handled.
 *
 * NOTE(review): q4-q7 are written while save_abi_regs() expands to nothing
 * in this build (see the #if 0 near the top of the file).
 */
#define blend_blocks_add_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  save_abi_regs(); \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  /* draw_mask_ptr reuses r0 and c_64 reuses r2 from here on. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  /* Each 16-bit constant is assembled from two immediates. */ \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vmov.u16 d128_0x83E0, #0x8000; \
  vorr.u16 d128_0x03E0, #0x00E0; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0; \
 \
  /* Prologue: compute the clamped sums of the first record. */ \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vorr.u16 pixels, pixels, msb_mask; \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  vand.u16 pixels_mg, pixels, d128_0x83E0; \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
  /* Byte-wise clamp: the 0x7C00 and 0x001F fields sit in separate bytes. */ \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
0: \
  /* Record i becomes current; start fetching record i+1. */ \
  mov fb_ptr, fb_ptr_next; \
 \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
 \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
 \
  vorr.u16 pixels, pixels, msb_mask; \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vand.u16 pixels_mg, pixels, d128_0x83E0; \
 \
  /* (next - current) + 14, unsigned <= 28: at most 14 bytes apart. */ \
  /* The difference is computed once; the add and cmp follow below. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  pld [fb_ptr_next, #64]; \
 \
  /* Keep the old destination where record i's draw mask is set. */ \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
 \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
 \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  /* Far apart: load the next destination before storing record i. */ \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
 \
3: \
  vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
1: \
  /* Final record. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [fb_ptr_next]; \
 \
  restore_abi_regs(); \
  ldmia sp!, { r4, pc }; \
 \
2: \
  /* Close destinations: store record i first, then load record i+1. */ \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  bal 3b
3873
/*
 * blend_blocks_add_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_<mask_evaluate> (mask_evaluate: on | off).
 *
 * In:  r0 = psx_gpu.
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset.  Per record: 16-byte draw mask at
 * offset 0, eight 16-bit source pixels at offset 16, destination pointer at
 * offset 44.  The 0x7C1F fields and the 0x03E0 field of source and
 * destination are added separately and clamped with VMIN; the recombined
 * result is ORed with msb_mask and merged into the destination, keeping the
 * old destination wherever the draw mask is set.
 * mask_evaluate == on: destination lanes whose bit 15 is already set are
 * added to the draw mask.
 *
 * Software pipelined; label 2 is the store-before-load path taken when
 * consecutive destinations are at most 14 bytes apart.  num_blocks is
 * decremented before it is first tested, so a count of zero is not handled.
 *
 * NOTE(review): q4, q5 and q7 are written while save_abi_regs() expands to
 * nothing in this build (see the #if 0 near the top of the file).
 */
3874#define blend_blocks_add_untextured_builder(mask_evaluate) \
3875.align 3; \
3876 \
3877function(blend_blocks_untextured_add_##mask_evaluate) \
3878 stmdb sp!, { r4, r14 }; \
3f0189c6 3879 save_abi_regs(); \
75e28f62 3880 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
e1f6de8f 3881 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
3882 \
3883 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
e1f6de8f 3884 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
75e28f62
E
3885 \
 /* draw_mask_ptr reuses r0 and c_64 reuses r2 from here on. */ \
3886 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
3887 mov c_64, #64; \
3888 \
 /* Each 16-bit constant is assembled from two immediates. */ \
3889 vmov.u16 d128_0x7C1F, #0x7C00; \
3890 vmov.u16 d128_0x03E0, #0x0300; \
3891 vorr.u16 d128_0x7C1F, #0x001F; \
3892 vorr.u16 d128_0x03E0, #0x00E0; \
3893 \
 /* Prologue: compute the clamped sums of the first record. */ \
e1f6de8f 3894 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
3895 ldr fb_ptr_next, [pixel_ptr, #28]; \
3896 vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
3897 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
3898 blend_blocks_add_mask_set_##mask_evaluate(); \
3899 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3900 \
3901 blend_blocks_add_mask_copy_##mask_evaluate(); \
3902 vand.u16 pixels_g, pixels, d128_0x03E0; \
3903 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3904 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3905 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3906 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
 /* Byte-wise clamp: the 0x7C00 and 0x001F fields sit in separate bytes. */ \
3907 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3908 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
3909 \
3910 subs num_blocks, num_blocks, #1; \
3911 beq 1f; \
3912 \
 /* Record i becomes current; start fetching record i+1. */ \
3913 0: \
3914 mov fb_ptr, fb_ptr_next; \
3915 \
e1f6de8f 3916 ldr fb_ptr_next, [pixel_ptr, #28]; \
75e28f62 3917 \
e1f6de8f 3918 vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
75e28f62
E
3919 \
3920 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3921 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3922 vand.u16 pixels_g, pixels, d128_0x03E0; \
3923 \
 /* Merge with record i's mask, then replace it with record i+1's mask. */ \
3924 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
e1f6de8f 3925 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
75e28f62
E
3926 \
 /* (next - current) + 14, unsigned <= 28: at most 14 bytes apart. */ \
3927 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
3928 add fb_ptr_cmp, fb_ptr_cmp, #14; \
3929 cmp fb_ptr_cmp, #28; \
3930 bls 2f; \
3931 \
 /* Far apart: load the next destination before storing record i. */ \
e1f6de8f 3932 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
3933 blend_blocks_add_mask_set_##mask_evaluate(); \
3934 blend_blocks_add_mask_copy_##mask_evaluate(); \
3935 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3936 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
e1f6de8f 3937 vst1.u16 { blend_pixels }, [fb_ptr]; \
75e28f62
E
3938 \
3939 3: \
3940 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
3941 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
3942 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
3943 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
3944 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
3945 \
3946 subs num_blocks, num_blocks, #1; \
3947 bne 0b; \
3948 \
 /* Final record. */ \
3949 1: \
3950 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
3951 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
3952 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
e1f6de8f 3953 vst1.u16 { blend_pixels }, [fb_ptr_next]; \
75e28f62 3954 \
3f0189c6 3955 restore_abi_regs(); \
75e28f62
E
3956 ldmia sp!, { r4, pc }; \
3957 \
 /* Close destinations: store record i first, then load record i+1. */ \
3958 2: \
e1f6de8f 3959 vst1.u16 { blend_pixels }, [fb_ptr]; \
75e28f62
E
3960 vand.u16 pixels_rb, pixels, d128_0x7C1F; \
3961 \
e1f6de8f 3962 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
3963 blend_blocks_add_mask_set_##mask_evaluate(); \
3964 blend_blocks_add_mask_copy_##mask_evaluate(); \
3965 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
3966 bal 3b \
3967
3968
/*
 * Emit the four additive blend routines: textured and untextured, each with
 * mask evaluation off and on.
 */
3969blend_blocks_add_textured_builder(off)
3970blend_blocks_add_textured_builder(on)
3971blend_blocks_add_untextured_builder(off)
3972blend_blocks_add_untextured_builder(on)
3973
/*
 * Building blocks selected by token pasting inside
 * blend_blocks_subtract_builder(texturing, mask_evaluate).
 */

/* textured: blend_mask lane = all ones where bit 15 of pixels_next is set. */
3974#define blend_blocks_subtract_set_blend_mask_textured() \
3975 vclt.s16 blend_mask, pixels_next, #0 \
3976
/* textured: where blend_mask is clear, replace blend_pixels with pixels. */
3977#define blend_blocks_subtract_combine_textured() \
3978 vbif.u16 blend_pixels, pixels, blend_mask \
3979
/* textured: set bit 15 in every lane of blend_pixels. */
718a9e58 3980#define blend_blocks_subtract_set_stp_textured() \
75e28f62
E
3981 vorr.u16 blend_pixels, #0x8000 \
3982
/* textured: pixels = pixels_next with msb_mask ORed in. */
3983#define blend_blocks_subtract_msb_mask_textured() \
3984 vorr.u16 pixels, pixels_next, msb_mask \
3985
/* untextured: no blend mask and no combine step. */
3986#define blend_blocks_subtract_set_blend_mask_untextured() \
3987
3988#define blend_blocks_subtract_combine_untextured() \
3989
/* untextured: OR msb_mask into blend_pixels instead of a fixed 0x8000. */
718a9e58 3990#define blend_blocks_subtract_set_stp_untextured() \
75e28f62
E
3991 vorr.u16 blend_pixels, blend_pixels, msb_mask \
3992
3993#define blend_blocks_subtract_msb_mask_untextured() \
3994
3995
/* on: write_mask lane = all ones where bit 15 of fb_pixels is set. */
3996#define blend_blocks_subtract_mask_set_on() \
3997 vclt.s16 write_mask, fb_pixels, #0 \
3998
/* on: current draw mask = next draw mask OR write_mask. */
3999#define blend_blocks_subtract_mask_copy_on() \
4000 vorr.u16 draw_mask, draw_mask_next, write_mask \
4001
/* off: no write mask is computed; the draw mask is only copied. */
4002#define blend_blocks_subtract_mask_set_off() \
4003
4004#define blend_blocks_subtract_mask_copy_off() \
4005 vmov draw_mask, draw_mask_next \
4006
4007
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_subtract_<mask_evaluate>.
 *   texturing     - textured | untextured
 *   mask_evaluate - on | off
 *
 * In:  r0 = psx_gpu.
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset.  Per record: 16-byte draw mask at
 * offset 0, eight 16-bit source pixels at offset 16, destination pointer at
 * offset 44.  The 0x7C1F fields and the 0x03E0 field of the source are
 * subtracted from the destination's with unsigned saturation (VQSUB), the
 * halves are recombined, the texturing-specific bit-15 / combine steps are
 * applied, and the result is merged into the destination, keeping the old
 * destination wherever the draw mask is set.
 * mask_evaluate == on: destination lanes whose bit 15 is already set are
 * added to the draw mask.
 *
 * Software pipelined; label 2 is the store-before-load path taken when
 * consecutive destinations are at most 14 bytes apart.  num_blocks is
 * decremented before it is first tested, so a count of zero is not handled.
 *
 * NOTE(review): q4-q7 are written while save_abi_regs() expands to nothing
 * in this build (see the #if 0 near the top of the file).
 */
4008#define blend_blocks_subtract_builder(texturing, mask_evaluate) \
4009.align 3; \
4010 \
4011function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
4012 stmdb sp!, { r4, r14 }; \
3f0189c6 4013 save_abi_regs(); \
75e28f62 4014 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
e1f6de8f 4015 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
4016 \
4017 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
e1f6de8f 4018 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
75e28f62
E
4019 \
 /* draw_mask_ptr reuses r0 and c_64 reuses r2 from here on. */ \
4020 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
4021 mov c_64, #64; \
4022 \
 /* Each 16-bit constant is assembled from two immediates. */ \
4023 vmov.u16 d128_0x7C1F, #0x7C00; \
4024 vmov.u16 d128_0x03E0, #0x0300; \
4025 vorr.u16 d128_0x7C1F, #0x001F; \
4026 vorr.u16 d128_0x03E0, #0x00E0; \
4027 \
 /* Prologue: compute the saturated differences of the first record. */ \
e1f6de8f 4028 vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
4029 ldr fb_ptr_next, [pixel_ptr, #28]; \
4030 vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64; \
75e28f62 4031 blend_blocks_subtract_set_blend_mask_##texturing(); \
e1f6de8f 4032 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
4033 blend_blocks_subtract_mask_set_##mask_evaluate(); \
4034 vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
4035 \
4036 vand.u16 pixels_g, pixels_next, d128_0x03E0; \
4037 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4038 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
 /* Byte-wise saturation: the 0x7C00 and 0x001F fields sit in separate bytes. */ \
4039 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4040 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4041 \
4042 subs num_blocks, num_blocks, #1; \
4043 beq 1f; \
4044 \
 /* Record i becomes current; start fetching record i+1. */ \
4045 0: \
4046 blend_blocks_subtract_mask_copy_##mask_evaluate(); \
4047 mov fb_ptr, fb_ptr_next; \
e1f6de8f 4048 ldr fb_ptr_next, [pixel_ptr, #28]; \
75e28f62 4049 \
e1f6de8f 4050 vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
75e28f62
E
 /* Must run before pixels_next is reloaded: it still holds record i. */ \
4051 blend_blocks_subtract_msb_mask_##texturing(); \
4052 \
e1f6de8f 4053 vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64; \
75e28f62
E
4054 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4055 vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
718a9e58 4056 blend_blocks_subtract_set_stp_##texturing(); \
75e28f62
E
4057 vand.u16 pixels_g, pixels_next, d128_0x03E0; \
 /* combine still uses record i's blend_mask; it is refreshed right after. */ \
4058 blend_blocks_subtract_combine_##texturing(); \
4059 blend_blocks_subtract_set_blend_mask_##texturing(); \
4060 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
4061 \
 /* (next - current) + 14, unsigned <= 28: at most 14 bytes apart. */ \
4062 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
4063 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4064 cmp fb_ptr_cmp, #28; \
4065 bls 2f; \
4066 \
 /* Far apart: load the next destination before storing record i. */ \
e1f6de8f 4067 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
4068 blend_blocks_subtract_mask_set_##mask_evaluate(); \
4069 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4070 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4071 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
e1f6de8f 4072 vst1.u16 { blend_pixels }, [fb_ptr]; \
75e28f62
E
4073 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4074 \
4075 3: \
4076 subs num_blocks, num_blocks, #1; \
4077 bne 0b; \
4078 \
 /* Final record. */ \
4079 1: \
4080 blend_blocks_subtract_mask_copy_##mask_evaluate(); \
4081 \
4082 blend_blocks_subtract_msb_mask_##texturing(); \
4083 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
718a9e58 4084 blend_blocks_subtract_set_stp_##texturing(); \
75e28f62
E
4085 blend_blocks_subtract_combine_##texturing(); \
4086 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
e1f6de8f 4087 vst1.u16 { blend_pixels }, [fb_ptr_next]; \
75e28f62 4088 \
3f0189c6 4089 restore_abi_regs(); \
75e28f62
E
4090 ldmia sp!, { r4, pc }; \
4091 \
 /* Close destinations: store record i first, then load record i+1. */ \
4092 2: \
e1f6de8f 4093 vst1.u16 { blend_pixels }, [fb_ptr]; \
4094 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
4095 blend_blocks_subtract_mask_set_##mask_evaluate(); \
4096 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4097 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4098 vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4099 vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4100 bal 3b \
4101
4102
/*
 * Emit the four subtractive blend routines: textured and untextured, each
 * with mask evaluation off and on.
 */
4103blend_blocks_subtract_builder(textured, off)
4104blend_blocks_subtract_builder(textured, on)
4105blend_blocks_subtract_builder(untextured, off)
4106blend_blocks_subtract_builder(untextured, on)
4107
4108
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits the function blend_blocks_textured_add_fourth_<mask_evaluate>.
 *
 * The generated routine processes num_blocks entries (a halfword read from
 * psx_gpu) of the block array at psx_gpu + psx_gpu_blocks_offset. Entries are
 * 64 bytes apart (c_64); per entry it reads draw_mask at +0, pixels at +16
 * and the destination pointer at +44 (pixel_ptr + 28).
 *
 * For every block of eight 16-bit pixels:
 *  - the source pixels are shifted right by 2 and masked with 0x1C07 (rb) and
 *    0x00E0 (g), i.e. each 5-bit field is reduced to a quarter;
 *  - the quarters are added to the destination's 0x7C1F / 0x03E0 fields and
 *    clamped to those maxima with vmin;
 *  - bit 15 is set on the blended value, then lanes whose source pixel has
 *    bit 15 clear (blend_mask is false) take the unblended source pixel;
 *  - msb_mask is ORed in, and lanes selected by draw_mask keep the original
 *    destination pixel.
 *
 * The loop is software pipelined: the previous block's result is stored while
 * the next block is being prepared. Local labels: 0 = loop head, 1 = final
 * store and return, 2 = path taken when consecutive destinations are within
 * 14 bytes of each other, 3 = point where both paths rejoin.
 *
 * blend_blocks_add_mask_set_<mask_evaluate> and
 * blend_blocks_add_mask_copy_<mask_evaluate> are defined outside this
 * section. r4 and r14 are saved on entry; the return pops r14 into pc.
 */
4109#define blend_blocks_add_fourth_textured_builder(mask_evaluate) \
4110.align 3; \
4111 \
4112function(blend_blocks_textured_add_fourth_##mask_evaluate) \
4113 stmdb sp!, { r4, r14 }; \
3f0189c6 4114 save_abi_regs(); \
75e28f62 4115 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
e1f6de8f 4116 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
4117 \
4118 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
e1f6de8f 4119 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; /* broadcast the 16-bit value to all lanes */ \
75e28f62
E
4120 \
4121 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
4122 mov c_64, #64; /* stride between block entries */ \
4123 \
4124 vmov.u16 d128_0x7C1F, #0x7C00; /* each constant is built as vmov (one byte) + vorr (other byte) */ \
4125 vmov.u16 d128_0x03E0, #0x0300; \
75e28f62 4126 vmov.u16 d128_0x1C07, #0x1C00; \
d1c75d1e 4127 vmov.u16 d128_0x00E0, #0x00E0; \
75e28f62
E
4128 vorr.u16 d128_0x7C1F, #0x001F; \
4129 vorr.u16 d128_0x03E0, #0x00E0; \
75e28f62 4130 vorr.u16 d128_0x1C07, #0x0007; \
75e28f62 4131 \
e1f6de8f 4132 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
4133 ldr fb_ptr_next, [pixel_ptr, #28]; /* destination pointer of the first block */ \
4134 vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
75e28f62 4135 vclt.s16 blend_mask, pixels, #0; /* all-ones in lanes whose pixel has bit 15 set */ \
e1f6de8f 4136 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
4137 blend_blocks_add_mask_set_##mask_evaluate(); \
4138 vshr.s16 pixels_fourth, pixels, #2; \
d1c75d1e 4139 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
75e28f62
E
4140 \
4141 blend_blocks_add_mask_copy_##mask_evaluate(); \
d1c75d1e
E
4142 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4143 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4144 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
75e28f62 4145 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
d1c75d1e 4146 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
75e28f62 4147 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; /* per-byte clamp: 0x7C in the high byte, 0x1F in the low byte */ \
d1c75d1e 4148 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
75e28f62
E
4149 \
4150 subs num_blocks, num_blocks, #1; \
4151 beq 1f; /* only one block: go straight to the final store */ \
4152 \
4153 0: \
4154 mov fb_ptr, fb_ptr_next; \
e1f6de8f 4155 ldr fb_ptr_next, [pixel_ptr, #28]; \
75e28f62 4156 \
d1c75d1e 4157 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
718a9e58 4158 vorr.u16 blend_pixels, #0x8000; /* stp */ \
d1c75d1e
E
4159 vbif.u16 blend_pixels, pixels, blend_mask; /* blend_mask false: use the source pixel as is */ \
4160 \
e1f6de8f 4161 vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
75e28f62 4162 vclt.s16 blend_mask, pixels, #0; \
75e28f62 4163 vshr.s16 pixels_fourth, pixels, #2; \
d1c75d1e 4164 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
75e28f62
E
4165 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4166 \
4167 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* draw_mask set: keep the destination pixel */ \
e1f6de8f 4168 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
75e28f62
E
4169 \
4170 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
4171 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4172 cmp fb_ptr_cmp, #28; /* unsigned test for -14 <= fb_ptr_next - fb_ptr <= 14 */ \
4173 bls 2f; \
4174 \
e1f6de8f 4175 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62 4176 blend_blocks_add_mask_set_##mask_evaluate(); \
75e28f62 4177 blend_blocks_add_mask_copy_##mask_evaluate(); \
d1c75d1e
E
4178 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4179 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
e1f6de8f 4180 vst1.u16 { blend_pixels }, [fb_ptr]; /* store of the previous block, issued after the next load */ \
75e28f62
E
4181 \
4182 3: \
d1c75d1e 4183 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
75e28f62 4184 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
d1c75d1e 4185 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
75e28f62 4186 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
d1c75d1e 4187 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
75e28f62
E
4188 \
4189 subs num_blocks, num_blocks, #1; \
4190 bne 0b; \
4191 \
4192 1: \
4193 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
718a9e58 4194 vorr.u16 blend_pixels, #0x8000; /* stp */ \
d1c75d1e 4195 vbif.u16 blend_pixels, pixels, blend_mask; \
718a9e58 4196 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
75e28f62 4197 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
e1f6de8f 4198 vst1.u16 { blend_pixels }, [fb_ptr_next]; /* result of the last block */ \
75e28f62 4199 \
3f0189c6 4200 restore_abi_regs(); \
75e28f62
E
4201 ldmia sp!, { r4, pc }; \
4202 \
4203 2: \
e1f6de8f 4204 vst1.u16 { blend_pixels }, [fb_ptr]; /* nearby destinations: store before the next destination is loaded */ \
d1c75d1e 4205 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
75e28f62 4206 \
e1f6de8f 4207 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62 4208 blend_blocks_add_mask_set_##mask_evaluate(); \
75e28f62 4209 blend_blocks_add_mask_copy_##mask_evaluate(); \
d1c75d1e 4210 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
75e28f62
E
4211 bal 3b \
4212
4213
d1c75d1e 4214
75e28f62
E
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits the function blend_blocks_untextured_add_fourth_<mask_evaluate>.
 *
 * Same block walk as the textured variant: num_blocks entries, 64 bytes
 * apart, with draw_mask at +0, pixels at +16 and the destination pointer at
 * +44 of each entry. Every lane is blended here: there is no per-pixel
 * blend_mask test and bit 15 is not forced on.
 *
 * Per block: (pixels >> 2) masked with 0x1C07 / 0x00E0 is added to the
 * destination's 0x7C1F / 0x03E0 fields, clamped with vmin, ORed with
 * msb_mask, and lanes selected by draw_mask keep the destination pixel.
 *
 * Local labels: 0 = loop head, 1 = final store and return, 2 = path taken
 * when consecutive destinations are within 14 bytes of each other,
 * 3 = rejoin point. The blend_blocks_add_mask_* helpers are defined outside
 * this section. r4 and r14 are saved on entry.
 */
4215#define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \
4216.align 3; \
4217 \
4218function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
4219 stmdb sp!, { r4, r14 }; \
3f0189c6 4220 save_abi_regs(); \
75e28f62 4221 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
e1f6de8f 4222 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
4223 \
4224 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
e1f6de8f 4225 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; /* broadcast the 16-bit value to all lanes */ \
75e28f62
E
4226 \
4227 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
4228 mov c_64, #64; /* stride between block entries */ \
4229 \
4230 vmov.u16 d128_0x7C1F, #0x7C00; /* each constant is built as vmov (one byte) + vorr (other byte) */ \
4231 vmov.u16 d128_0x03E0, #0x0300; \
75e28f62
E
4232 vmov.u16 d128_0x1C07, #0x1C00; \
4233 vmov.u16 d128_0x00E0, #0x00E0; \
4234 vorr.u16 d128_0x7C1F, #0x001F; \
4235 vorr.u16 d128_0x03E0, #0x00E0; \
75e28f62
E
4236 vorr.u16 d128_0x1C07, #0x0007; \
4237 \
e1f6de8f 4238 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
4239 ldr fb_ptr_next, [pixel_ptr, #28]; /* destination pointer of the first block */ \
4240 vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
4241 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
4242 blend_blocks_add_mask_set_##mask_evaluate(); \
4243 vshr.s16 pixels_fourth, pixels, #2; \
4244 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4245 \
4246 blend_blocks_add_mask_copy_##mask_evaluate(); \
4247 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4248 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4249 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4250 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4251 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4252 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; /* per-byte clamp: 0x7C in the high byte, 0x1F in the low byte */ \
4253 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4254 \
4255 subs num_blocks, num_blocks, #1; \
4256 beq 1f; /* only one block: go straight to the final store */ \
4257 \
4258 0: \
4259 mov fb_ptr, fb_ptr_next; \
e1f6de8f 4260 ldr fb_ptr_next, [pixel_ptr, #28]; \
75e28f62 4261 \
e1f6de8f 4262 vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
75e28f62
E
4263 \
4264 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4265 vshr.s16 pixels_fourth, pixels, #2; \
4266 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4267 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4268 \
4269 vbit.u16 blend_pixels, fb_pixels, draw_mask; /* draw_mask set: keep the destination pixel */ \
e1f6de8f 4270 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
75e28f62
E
4271 \
4272 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
4273 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4274 cmp fb_ptr_cmp, #28; /* unsigned test for -14 <= fb_ptr_next - fb_ptr <= 14 */ \
4275 bls 2f; \
4276 \
e1f6de8f 4277 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
4278 blend_blocks_add_mask_set_##mask_evaluate(); \
4279 blend_blocks_add_mask_copy_##mask_evaluate(); \
4280 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4281 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
e1f6de8f 4282 vst1.u16 { blend_pixels }, [fb_ptr]; /* store of the previous block, issued after the next load */ \
75e28f62
E
4283 \
4284 3: \
4285 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4286 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4287 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4288 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4289 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4290 \
4291 subs num_blocks, num_blocks, #1; \
4292 bne 0b; \
4293 \
4294 1: \
4295 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4296 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4297 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
e1f6de8f 4298 vst1.u16 { blend_pixels }, [fb_ptr_next]; /* result of the last block */ \
75e28f62 4299 \
3f0189c6 4300 restore_abi_regs(); \
75e28f62
E
4301 ldmia sp!, { r4, pc }; \
4302 \
4303 2: \
e1f6de8f 4304 vst1.u16 { blend_pixels }, [fb_ptr]; /* nearby destinations: store before the next destination is loaded */ \
75e28f62
E
4305 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4306 \
e1f6de8f 4307 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
4308 blend_blocks_add_mask_set_##mask_evaluate(); \
4309 blend_blocks_add_mask_copy_##mask_evaluate(); \
4310 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4311 bal 3b \
4312
4313
/* Emit the four add-fourth blend routines: textured and untextured, each
 * with mask evaluation off and on. */
4314blend_blocks_add_fourth_textured_builder(off)
4315blend_blocks_add_fourth_textured_builder(on)
4316blend_blocks_add_fourth_untextured_builder(off)
4317blend_blocks_add_fourth_untextured_builder(on)
4318
4319// TODO: Optimize this more. Need a scene that actually uses it for
4320// confirmation.
4321
4322.align 3
4323
/*
 * blend_blocks_textured_unblended_on
 *
 * In:  psx_gpu = pointer to the GPU state.
 *
 * Writes num_blocks blocks of eight 16-bit pixels without blending. Block
 * entries are 64 bytes apart; draw_mask is read at +0, pixels at +16 and the
 * destination pointer at +44 (pixel_ptr + 28).
 *
 * For each block the destination is loaded first; lanes whose destination
 * pixel already has bit 15 set (write_mask) are added to draw_mask and left
 * unchanged. All other lanes receive the source pixel ORed with msb_mask.
 *
 * r4 and r14 are saved on entry; the return pops r14 into pc.
 */
4324function(blend_blocks_textured_unblended_on)
4325 stmdb sp!, { r4, r14 }
3f0189c6 4326 save_abi_regs()
75e28f62 4327 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
e1f6de8f 4328 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
4329
4330 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
e1f6de8f 4331 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16] /* broadcast to all lanes */
75e28f62
E
4332
4333 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
4334 mov c_64, #64 /* stride between block entries */
4335
e1f6de8f 4336 ldr fb_ptr, [pixel_ptr, #28]
4337 vld1.u16 { fb_pixels }, [fb_ptr]
4338 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
75e28f62 4339 vclt.s16 write_mask, fb_pixels, #0 /* lanes whose destination has bit 15 set */
e1f6de8f 4340 vld1.u32 { pixels }, [pixel_ptr, :128], c_64
75e28f62
E
4341
4342 subs num_blocks, num_blocks, #1
4343 beq 1f /* single block: skip the loop */
4344
4345 0:
134f81ec 4346 vorr.u16 pixels, pixels, msb_mask
75e28f62
E
4347 vorr.u16 draw_mask, draw_mask, write_mask
4348 vbif.u16 fb_pixels, pixels, draw_mask /* take the source where draw_mask is clear */
e1f6de8f 4349 vst1.u16 { fb_pixels }, [fb_ptr]
75e28f62 4350
e1f6de8f 4351 ldr fb_ptr, [pixel_ptr, #28] /* fetch the next block */
4352 vld1.u16 { fb_pixels }, [fb_ptr]
4353 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
75e28f62 4354 vclt.s16 write_mask, fb_pixels, #0
e1f6de8f 4355 vld1.u32 { pixels }, [pixel_ptr, :128], c_64
75e28f62
E
4356
4357 subs num_blocks, num_blocks, #1
4358 bne 0b
4359
4360 1:
134f81ec 4361 vorr.u16 pixels, pixels, msb_mask /* last (or only) block */
75e28f62
E
4362 vorr.u16 draw_mask, draw_mask, write_mask
4363 vbif.u16 fb_pixels, pixels, draw_mask
e1f6de8f 4364 vst1.u16 { fb_pixels }, [fb_ptr]
75e28f62 4365
3f0189c6 4366 restore_abi_regs()
75e28f62
E
4367 ldmia sp!, { r4, pc }
4368
4369
/* blend_blocks_textured_unblended_off: no work is done here; the routine
 * returns to the caller immediately. */
4370function(blend_blocks_textured_unblended_off)
4371 bx lr
4372
4373
/*
 * warmup
 *
 * In:  r0 = iteration count, r1 = start address (used with a :128
 *           alignment hint).
 *
 * Returns at once when r0 is zero. Otherwise performs r0 loads of 16 bytes,
 * advancing r1 by 64 bytes after each one. The loaded data is discarded, so
 * the only effect is the memory traffic itself (presumably to pull the
 * region into the cache - confirm with the caller).
 * Clobbers r0, r1, r3, flags and the u_whole_8 / v_whole_8 registers.
 */
4374function(warmup)
4375 mov r3, #64 /* address step per iteration */
4376 cmp r0, #0
4377 bxeq lr
4378
4379 0:
e1f6de8f 4380 vld1.u32 { u_whole_8, v_whole_8 }, [r1, :128], r3
75e28f62
E
4381
4382 subs r0, r0, #1
4383 bne 0b
4384
4385 bx lr
4386
/* Register aliases for render_block_fill_body. Earlier meanings of these
 * names are dropped first. Note that color and pitch share r1. */
6c4a10c4 4387#undef vram_ptr
75e28f62 4388#undef color
6c4a10c4 4389#undef width
75e28f62 4390#undef height
6c4a10c4 4391#undef pitch
75e28f62
E
4392
4393#define vram_ptr r0
6c4a10c4
E
4394#define color r1
4395#define width r2
4396#define height r3
75e28f62 4397
6c4a10c4 4398#define pitch r1
75e28f62 4399
6c4a10c4 4400#define num_width r12
75e28f62 4401
87c45ad1
E
4402#undef colors_a
4403#undef colors_b
75e28f62 4404
/* colors_a and colors_b together form the 32-byte store unit. */
87c45ad1
E
4405#define colors_a q0
4406#define colors_b q1
75e28f62
E
4407
4408.align 3
4409
/*
 * render_block_fill_body
 *
 * In:  vram_ptr (r0) = destination, stored to with a :256 alignment hint
 *      color    (r1) = 16-bit fill value (replicated to every lane)
 *      width    (r2) = pixels per row
 *      height   (r3) = number of rows
 *
 * Fills a width x height rectangle of 16-bit pixels. Rows are 2048 bytes
 * apart. Each store writes 16 pixels (32 bytes), and the inner loop counts
 * width down by 16 until it reaches exactly zero, so the caller must pass a
 * non-zero multiple of 16 for width and a non-zero height.
 *
 * The inner (per 16 pixels) and outer (per row) loops both branch back to
 * the same local label 0.
 * Clobbers r0, r1, r3, r12, q0, q1 and the flags.
 */
4410function(render_block_fill_body)
87c45ad1 4411 vdup.u16 colors_a, color
6c4a10c4 4412 mov pitch, #2048 /* color (same register) has been consumed above */
75e28f62 4413
87c45ad1 4414 vmov colors_b, colors_a
75e28f62 4415 sub pitch, pitch, width, lsl #1 /* bytes from the end of a row to the start of the next */
75e28f62 4416
6c4a10c4 4417 mov num_width, width
75e28f62 4418
6c4a10c4 4419 0:
e1f6de8f 4420 vst1.u32 { colors_a, colors_b }, [vram_ptr, :256]!
75e28f62 4421
d1c75d1e 4422 subs num_width, num_width, #16
6c4a10c4 4423 bne 0b
75e28f62 4424
75e28f62 4425 add vram_ptr, vram_ptr, pitch
6c4a10c4
E
4426 mov num_width, width /* reload the per-row counter */
4427
75e28f62
E
4428 subs height, height, #1
4429 bne 0b
75e28f62 4430
6c4a10c4
E
4431 bx lr
4432
75e28f62
E
4433
/* General-purpose register aliases for the sprite setup code that follows.
 * Names that had an earlier meaning are undefined first. */
4434#undef x
4435#undef y
4436#undef width
4437#undef height
4438#undef fb_ptr
4439#undef texture_mask
4440#undef num_blocks
4441#undef temp
4442#undef dirty_textures_mask
4443#undef clut_ptr
4444#undef current_texture_mask

/* First group of names. */
4446#define psx_gpu r0
4447#define x r1
4448#define y r2
4449#define u r3
4450#define v r4
4451#define width r5
4452#define height r6
4453#define offset_u r8
4454#define offset_v r9
4455#define offset_u_right r10
4456#define width_rounded r11
4457#define height_rounded r12

/* Second group: these reuse r1-r12 and r14 under different names, so a
 * register's meaning depends on which part of the setup code is running. */
4459#define texture_offset_base r1
4460#define tile_width r2
4461#define tile_height r3
4462#define num_blocks r4
4463#define block r5
4464#define sub_tile_height r6
4465#define fb_ptr r7
4466#define texture_mask r8
4467#define column_data r9
4468#define texture_offset r10
4469#define tiles_remaining r11
4470#define fb_ptr_advance_column r12
4471#define texture_block_ptr r14

/* temp shares r14 with texture_block_ptr. */
8184d7c5 4473#define temp r14
4474
75e28f62
E
4475#define texture_page_ptr r3
4476#define left_block_mask r4
4477#define right_block_mask r5
4478#define texture_mask_rev r10
4479#define control_mask r11
4480
/* Names used by the setup_sprite_tiled_initialize_* macros. */
4481#define dirty_textures_mask r4
4482#define clut_ptr r5
4483#define current_texture_mask r6
4484
4485
/* NEON register aliases for the sprite setup code. */
4486#undef texels
4487#undef clut_low_a
4488#undef clut_low_b
4489#undef clut_high_a
4490#undef clut_high_b
4491#undef clut_a
4492#undef clut_b
4493#undef texels_low
4494#undef texels_high
4495
4496#define texels d0
4497#define draw_masks_fb_ptrs q1
4498
/* d2 / d3 are the two halves of q1 (draw_masks_fb_ptrs). */
4499#define draw_mask_fb_ptr_left d2
4500#define draw_mask_fb_ptr_right d3
4501
/* Names used by the 4x macros: the left pair is d2 / d3 again, the right
 * pair is d10 / d11, i.e. the two halves of q5 (draw_masks_fb_ptrs2). */
59d15d23 4502#define draw_mask_fb_ptr_left_a d2
4503#define draw_mask_fb_ptr_left_b d3
4504#define draw_mask_fb_ptr_right_a d10
4505#define draw_mask_fb_ptr_right_b d11
4506#define draw_masks_fb_ptrs2 q5
4507
/* clut_low_* / clut_high_* are the halves of clut_a (q2) / clut_b (q3). */
75e28f62
E
4508#define clut_low_a d4
4509#define clut_low_b d5
4510#define clut_high_a d6
4511#define clut_high_b d7
4512
4513#define block_masks d8
4514#define block_masks_shifted d9
4515
4516#define clut_a q2
4517#define clut_b q3
4518
59d15d23 4519#define texels_low d12
4520#define texels_high d13
75e28f62 4521
/* texels_wide_low / texels_wide_high are the halves of texels_wide (q7). */
59d15d23 4522#define texels_wide_low d14
4523#define texels_wide_high d15
4524#define texels_wide q7
75e28f62
E
4525
4526
/*
 * setup_sprite_flush_blocks
 *
 * Local helper, reached with bl from setup_sprite_tile_add_blocks.
 * Calls flush_render_block_buffer with q1-q5 and r0-r3,
 * EXTRA_UNSAVED_REGS, r12 and r14 preserved around the call, then resets
 * block to the start of the block array (psx_gpu + psx_gpu_blocks_offset).
 * EXTRA_UNSAVED_REGS is defined outside this section.
 */
59d15d23 4527setup_sprite_flush_blocks:
4528 vpush { q1 - q5 }
75e28f62 4529
4d646738 4530 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 4531 bl flush_render_block_buffer
4d646738 4532 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 4533
59d15d23 4534 vpop { q1 - q5 }
75e28f62
E
4535
4536 add block, psx_gpu, #psx_gpu_blocks_offset /* start writing at the first block again */
75e28f62
E
4537 bx lr
4538
4539
/* Local helper: calls update_texture_4bpp_cache with r0-r3 preserved and
 * returns to the caller by popping the saved r14 into pc. */
4540setup_sprite_update_texture_4bpp_cache:
4541 stmdb sp!, { r0 - r3, r14 }
4542 bl update_texture_4bpp_cache
4543 ldmia sp!, { r0 - r3, pc }
4544
4545
/* Local helper: calls update_texture_8bpp_cache with r0-r3 and
 * EXTRA_UNSAVED_REGS (defined outside this section) preserved, and returns
 * to the caller by popping the saved r14 into pc. */
4546setup_sprite_update_texture_8bpp_cache:
4d646738 4547 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 }
75e28f62 4548 bl update_texture_8bpp_cache
4d646738 4549 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc }
75e28f62
E
4550
4551
/*
 * setup_sprite_tiled_initialize_4bpp()
 *
 * Loads the 32-byte colour table at clut_ptr into clut_a / clut_b and
 * de-interleaves it with vuzp.u8, so clut_a holds the even-numbered bytes
 * and clut_b the odd-numbered bytes. If current_texture_mask and the 4bpp
 * dirty mask have a bit in common, setup_sprite_update_texture_4bpp_cache
 * is called.
 * Writes r4-r6 (dirty_textures_mask, clut_ptr, current_texture_mask) and
 * the flags; the conditional call also overwrites r14.
 */
4552#define setup_sprite_tiled_initialize_4bpp() \
4553 ldr dirty_textures_mask, \
e1f6de8f 4554 [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]; \
4555 ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]; \
75e28f62 4556 \
e1f6de8f 4557 ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]; \
4558 vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]; \
75e28f62
E
4559 \
4560 tst current_texture_mask, dirty_textures_mask; \
4561 vuzp.u8 clut_a, clut_b; /* split each 16-bit entry into low and high byte tables */ \
4562 \
4563 blne setup_sprite_update_texture_4bpp_cache \
4564
/*
 * setup_sprite_tiled_initialize_8bpp()
 *
 * Calls setup_sprite_update_texture_8bpp_cache when current_texture_mask
 * and the 8bpp dirty mask have a bit in common. No colour table is loaded
 * here. Writes r4, r6 and the flags; the conditional call also overwrites
 * r14.
 */
4565#define setup_sprite_tiled_initialize_8bpp() \
4566 ldr dirty_textures_mask, \
e1f6de8f 4567 [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset]; \
4568 ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]; \
75e28f62
E
4569 \
4570 tst current_texture_mask, dirty_textures_mask; \
4571 blne setup_sprite_update_texture_8bpp_cache \
4572
4573
75e28f62
E
/* Operand fragments for setup_sprite_tile_add_blocks: the number of blocks
 * a tile adds. "single" is one block per row (sub_tile_height), "double" is
 * two blocks per row (sub_tile_height * 2). */
4574#define setup_sprite_block_count_single() \
4575 sub_tile_height \
4576
4577#define setup_sprite_block_count_double() \
4578 sub_tile_height, lsl #1 \
4579
/*
 * setup_sprite_tile_add_blocks(type)
 *
 * Adds the block count of the upcoming tile (setup_sprite_block_count_<type>)
 * to num_blocks. If the total exceeds MAX_BLOCKS, num_blocks is restarted at
 * just this tile's count and setup_sprite_flush_blocks is called, which also
 * resets block. Sets the flags; the conditional call overwrites r14.
 */
4580#define setup_sprite_tile_add_blocks(type) \
4581 add num_blocks, num_blocks, setup_sprite_block_count_##type(); \
4582 cmp num_blocks, #MAX_BLOCKS; \
4583 \
59d15d23 4584 movgt num_blocks, setup_sprite_block_count_##type(); \
4585 blgt setup_sprite_flush_blocks \
75e28f62
E
4586
4587
/*
 * setup_sprite_tile_full_4bpp(edge)
 *
 * Emits two 64-byte block entries per row for sub_tile_height rows; the edge
 * argument is not used.
 *
 * Per row:
 *  - 8 texel bytes are read from texture_page_ptr + (texture_offset &
 *    texture_mask) and used as vtbl indices into the colour tables; the low
 *    and high result bytes are interleaved by vst2.u8 into 16 bytes at
 *    block + 0;
 *  - draw_mask_fb_ptr_left, with fb_ptr in lane 1, is stored at block + 40;
 *  - the same is repeated for texture_offset + 8 and fb_ptr + 16 with
 *    draw_mask_fb_ptr_right;
 *  - fb_ptr ends up 2048 bytes further and texture_offset 0x10 further.
 *
 * After the loop texture_offset is advanced by another 0xF00 and num_blocks
 * is written back to psx_gpu. Uses local label 4; sub_tile_height is zero on
 * exit and texture_block_ptr (r14) is overwritten.
 */
4588#define setup_sprite_tile_full_4bpp(edge) \
4589 setup_sprite_tile_add_blocks(double); \
4590 \
4591 4: \
4592 and texture_block_ptr, texture_offset, texture_mask; \
4593 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4594 \
e1f6de8f 4595 pld [fb_ptr]; \
75e28f62 4596 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 4597 vld1.u32 { texels }, [texture_block_ptr, :64]; \
75e28f62
E
4598 \
4599 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4600 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4601 \
e1f6de8f 4602 vst2.u8 { texels_low, texels_high }, [block, :128]; /* interleave low/high bytes into 16-bit values */ \
75e28f62
E
4603 add texture_block_ptr, texture_offset, #8; /* second half of the row */ \
4604 \
4605 and texture_block_ptr, texture_block_ptr, texture_mask; \
4606 add block, block, #40; \
4607 \
4608 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4609 add fb_ptr, fb_ptr, #16; \
4610 \
e1f6de8f 4611 vst1.u32 { draw_mask_fb_ptr_left }, [block, :64]; \
75e28f62
E
4612 add block, block, #24; /* 40 + 24: start of the next 64-byte entry */ \
4613 \
e1f6de8f 4614 vld1.u32 { texels }, [texture_block_ptr, :64]; \
75e28f62
E
4615 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4616 \
e1f6de8f 4617 pld [fb_ptr]; \
75e28f62
E
4618 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4619 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4620 \
e1f6de8f 4621 vst2.u8 { texels_low, texels_high }, [block, :128]; \
75e28f62
E
4622 add block, block, #40; \
4623 \
4624 add texture_offset, texture_offset, #0x10; \
4625 add fb_ptr, fb_ptr, #(2048 - 16); /* net +2048 per row */ \
4626 \
e1f6de8f 4627 vst1.u32 { draw_mask_fb_ptr_right }, [block, :64]; \
75e28f62
E
4628 add block, block, #24; \
4629 \
4630 subs sub_tile_height, sub_tile_height, #1; \
4631 bne 4b; \
4632 \
4633 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 4634 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
75e28f62
E
4635
4636
/*
 * setup_sprite_tile_half_4bpp(edge)
 *
 * Emits one 64-byte block entry per row for sub_tile_height rows, using
 * draw_mask_fb_ptr_<edge> (edge is left or right).
 *
 * Per row:
 *  - 8 texel bytes are read from texture_page_ptr + (texture_offset &
 *    texture_mask) and used as vtbl indices into the colour tables; the low
 *    and high result bytes are interleaved by vst2.u8 into 16 bytes at
 *    block + 0;
 *  - draw_mask_fb_ptr_<edge>, with fb_ptr in lane 1, is stored at block + 40;
 *  - block advances by 64, texture_offset by 0x10 and fb_ptr by 2048.
 *
 * After the loop texture_offset is advanced by another 0xF00 and num_blocks
 * is written back to psx_gpu. Uses local label 4; sub_tile_height is zero on
 * exit and texture_block_ptr (r14) is overwritten.
 *
 * The loop used to contain a second
 * "add texture_block_ptr, texture_page_ptr, texture_block_ptr" after the
 * texel store. Its result was never read (texture_block_ptr is recomputed at
 * label 4 and is not used after the loop), so it has been dropped.
 */
4637#define setup_sprite_tile_half_4bpp(edge) \
4638 setup_sprite_tile_add_blocks(single); \
4639 \
4640 4: \
4641 and texture_block_ptr, texture_offset, texture_mask; \
4642 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
4643 \
e1f6de8f 4644 pld [fb_ptr]; \
75e28f62 4645 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 4646 vld1.u32 { texels }, [texture_block_ptr, :64]; \
75e28f62
E
4647 \
4648 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4649 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4650 \
e1f6de8f 4651 vst2.u8 { texels_low, texels_high }, [block, :128]; /* interleave low/high bytes into 16-bit values */ \
75e28f62
E
4652 add block, block, #40; \
4653 \
e1f6de8f 4655 vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64]; \
75e28f62
E
4656 \
4657 add block, block, #24; /* 40 + 24: start of the next 64-byte entry */ \
4658 add texture_offset, texture_offset, #0x10; \
4659 \
4660 add fb_ptr, fb_ptr, #2048; \
4661 subs sub_tile_height, sub_tile_height, #1; \
4662 \
4663 bne 4b; \
4664 \
4665 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 4666 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
75e28f62
E
4667
4668
/*
 * setup_sprite_tile_full_8bpp(edge)
 *
 * Emits two 64-byte block entries per row for sub_tile_height rows; the edge
 * argument is not used.
 *
 * block is biased by +16 for the duration of the loop, so the 8 raw texel
 * bytes land at entry + 16 and draw_mask_fb_ptr_* (fb_ptr in lane 1) at
 * entry + 40. The bias is removed again after the loop.
 *
 * Per row the left entry uses texture_offset and fb_ptr, the right entry
 * texture_offset + 8 and fb_ptr + 16; afterwards fb_ptr is 2048 bytes further
 * and texture_offset 0x10 further. After the loop texture_offset is advanced
 * by another 0xF00 and num_blocks is written back to psx_gpu.
 * Uses local label 4; texture_block_ptr (r14) is overwritten.
 */
4669#define setup_sprite_tile_full_8bpp(edge) \
4670 setup_sprite_tile_add_blocks(double); \
4671 add block, block, #16; /* bias: texels go to entry + 16 */ \
4672 \
4673 4: \
4674 and texture_block_ptr, texture_offset, texture_mask; \
4675 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4676 \
e1f6de8f 4677 pld [fb_ptr]; \
75e28f62 4678 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 4679 vld1.u32 { texels }, [texture_block_ptr, :64]; \
75e28f62
E
4680 \
4681 add texture_block_ptr, texture_offset, #8; /* second half of the row */ \
e1f6de8f 4682 vst1.u32 { texels }, [block, :64]; \
75e28f62
E
4683 \
4684 and texture_block_ptr, texture_block_ptr, texture_mask; \
4685 add block, block, #24; \
4686 \
4687 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4688 \
4689 add fb_ptr, fb_ptr, #16; \
e1f6de8f 4690 vst1.u32 { draw_mask_fb_ptr_left }, [block, :64]; \
75e28f62
E
4691 \
4692 add block, block, #40; /* 24 + 40: same position in the next entry */ \
e1f6de8f 4693 vld1.u32 { texels }, [texture_block_ptr, :64]; \
4694 pld [fb_ptr]; \
75e28f62
E
4695 \
4696 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
e1f6de8f 4697 vst1.u32 { texels }, [block, :64]; \
75e28f62
E
4698 add block, block, #24; \
4699 \
4700 add texture_offset, texture_offset, #0x10; \
4701 add fb_ptr, fb_ptr, #(2048 - 16); /* net +2048 per row */ \
4702 \
e1f6de8f 4703 vst1.u32 { draw_mask_fb_ptr_right }, [block, :64]; \
75e28f62
E
4704 add block, block, #40; \
4705 \
4706 subs sub_tile_height, sub_tile_height, #1; \
4707 bne 4b; \
4708 \
4709 sub block, block, #16; /* remove the bias */ \
4710 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 4711 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
75e28f62
E
4712
4713
/*
 * setup_sprite_tile_half_8bpp(edge)
 *
 * Emits one 64-byte block entry per row for sub_tile_height rows, using
 * draw_mask_fb_ptr_<edge> (edge is left or right).
 *
 * block is biased by +16 for the duration of the loop, so the 8 raw texel
 * bytes land at entry + 16 and draw_mask_fb_ptr_<edge> (fb_ptr in lane 1) at
 * entry + 40. Per row texture_offset advances by 0x10 and fb_ptr by 2048.
 * After the loop the bias is removed, texture_offset is advanced by another
 * 0xF00 and num_blocks is written back to psx_gpu.
 * Uses local label 4; texture_block_ptr (r14) is overwritten.
 */
4714#define setup_sprite_tile_half_8bpp(edge) \
4715 setup_sprite_tile_add_blocks(single); \
4716 add block, block, #16; /* bias: texels go to entry + 16 */ \
4717 \
4718 4: \
4719 and texture_block_ptr, texture_offset, texture_mask; \
4720 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
e1f6de8f 4721 pld [fb_ptr]; \
75e28f62
E
4722 \
4723 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 4724 vld1.u32 { texels }, [texture_block_ptr, :64]; \
75e28f62 4725 \
e1f6de8f 4726 vst1.u32 { texels }, [block, :64]; \
75e28f62
E
4727 add block, block, #24; \
4728 \
e1f6de8f 4729 vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64]; \
75e28f62
E
4730 add block, block, #40; /* 24 + 40: same position in the next entry */ \
4731 \
4732 add texture_offset, texture_offset, #0x10; \
4733 add fb_ptr, fb_ptr, #2048; \
4734 \
4735 subs sub_tile_height, sub_tile_height, #1; \
4736 bne 4b; \
4737 \
4738 sub block, block, #16; /* remove the bias */ \
4739 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 4740 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
75e28f62
E
4741
4742
/* Column pre/post adjustment macros. The pre_adjust macros set
 * texture_offset from texture_offset_base before a column is emitted; the
 * post_adjust macros undo any fb_ptr change made by the matching pre_adjust.
 */

/* Right half only: start 8 texels into the tile and 16 bytes further into
 * the framebuffer row. */
4743#define setup_sprite_tile_column_edge_pre_adjust_half_right() \
4744 add texture_offset, texture_offset_base, #8; \
4745 add fb_ptr, fb_ptr, #16 \
4746
/* Left half only: start at the tile's first texel. */
4747#define setup_sprite_tile_column_edge_pre_adjust_half_left() \
4748 mov texture_offset, texture_offset_base \
4749
/* Dispatch on edge (left or right). */
4750#define setup_sprite_tile_column_edge_pre_adjust_half(edge) \
4751 setup_sprite_tile_column_edge_pre_adjust_half_##edge() \
4752
/* Full-width column: edge is ignored. */
4753#define setup_sprite_tile_column_edge_pre_adjust_full(edge) \
4754 mov texture_offset, texture_offset_base \
4755
/* Undo the +16 applied by the right-half pre_adjust. */
4756#define setup_sprite_tile_column_edge_post_adjust_half_right() \
4757 sub fb_ptr, fb_ptr, #16 \
4758
/* Nothing to undo for the left half. */
4759#define setup_sprite_tile_column_edge_post_adjust_half_left() \
4760
/* Dispatch on edge (left or right). */
4761#define setup_sprite_tile_column_edge_post_adjust_half(edge) \
4762 setup_sprite_tile_column_edge_post_adjust_half_##edge() \
4763
/* Nothing to undo for a full-width column. */
4764#define setup_sprite_tile_column_edge_post_adjust_full(edge) \
4765
4766
/*
 * setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,
 *                                        x4mode)
 *
 * Emits one column that is a single tile tall. column_data holds the row
 * count directly. edge_mode (half or full), texture_mode and x4mode are
 * pasted together to select the tile and adjust macros to expand.
 */
59d15d23 4767#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode, \
4768 x4mode) \
75e28f62 4769 mov sub_tile_height, column_data; \
59d15d23 4770 setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
4771 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
4772 setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge) \
75e28f62 4773
/*
 * setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,
 *                                       x4mode)
 *
 * Emits one column that spans several tiles vertically. column_data is
 * read as three fields:
 *   bits  0-7  : row count of the first tile
 *   bits  8-15 : row count of the last tile
 *   bits 16+   : number of tiles still to emit after the first one
 * Tiles between the first and the last are always 16 rows tall.
 * Local labels: 3 = loop over the 16-row tiles, 2 = last tile.
 */
59d15d23 4774#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode, \
4775 x4mode) \
75e28f62
E
4776 and sub_tile_height, column_data, #0xFF; \
4777 mov tiles_remaining, column_data, lsr #16; \
59d15d23 4778 setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
4779 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
75e28f62
E
4780 \
4781 subs tiles_remaining, tiles_remaining, #1; \
4782 beq 2f; /* only the last tile is left */ \
4783 \
4784 3: \
4785 mov sub_tile_height, #16; \
59d15d23 4786 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
75e28f62
E
4787 subs tiles_remaining, tiles_remaining, #1; \
4788 bne 3b; \
4789 \
4790 2: \
4791 uxtb sub_tile_height, column_data, ror #8; /* bits 8-15 of column_data */ \
59d15d23 4792 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
4793 setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge) \
75e28f62
E
4794
4795
/* Single-tile-tall case: column_data is simply the sprite height. Also
 * loads texture_page_ptr from psx_gpu. */
4796#define setup_sprite_column_data_single() \
4797 mov column_data, height; \
e1f6de8f 4798 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset] \
75e28f62
E
4799
/*
 * setup_sprite_column_data_multi()
 *
 * Packs the vertical layout of a multi-tile column into column_data:
 *   bits  0-7  = 16 - offset_v                  (rows in the first tile)
 *   bits  8-15 = (height_rounded & 0xF) + 1     (rows in the last tile)
 *   bits 16+   = tile_height - 1                (tiles after the first)
 * Also loads texture_page_ptr from psx_gpu. texture_page_ptr shares r3 with
 * tile_height, which is why the load comes after tile_height's last use.
 * height_rounded and tile_height are modified.
 */
4800#define setup_sprite_column_data_multi() \
4801 and height_rounded, height_rounded, #0xF; \
4802 rsb column_data, offset_v, #16; \
4803 \
4804 add height_rounded, height_rounded, #1; \
4805 sub tile_height, tile_height, #1; \
4806 \
4807 orr column_data, column_data, tile_height, lsl #16; \
e1f6de8f 4808 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]; \
75e28f62
E
4809 \
4810 orr column_data, column_data, height_rounded, lsl #8 \
4811
/* Draw-mask setup macros. Each one replicates a single byte of block_masks
 * into every byte of draw_mask_fb_ptr_left / _right. The tile macros later
 * overwrite lane 1 of those registers with fb_ptr. */

/* Left column: bytes 0 and 1 of block_masks. */
59d15d23 4812#define setup_sprite_setup_left_draw_mask_fb_ptr() \
4813 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4814 vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
4815
/* Left column of a multi-column sprite: additionally sets
 * fb_ptr_advance_column to 32 - height * 2048. */
4816#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column() \
4817 mov fb_ptr_advance_column, #32; \
4818 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4819 \
ed0fd81d 4820 sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11; \
59d15d23 4821 vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
4822
/* Right column: bytes 4 and 5 of block_masks. */
4823#define setup_sprite_setup_right_draw_mask_fb_ptr() \
4824 vdup.u8 draw_mask_fb_ptr_left, block_masks[4]; \
4825 vdup.u8 draw_mask_fb_ptr_right, block_masks[5] \
4826
/*
 * setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, edge,
 *                                       x4mode)
 *
 * Defines the label
 *   setup_sprite_<tm>_single_<multi_height>_<edge_mode>_<edge><x4mode>
 * and the code for a sprite that is one tile column wide.
 *
 * The two 32-bit lanes of block_masks are ORed together (via the vext copy
 * in block_masks_shifted) before the draw masks are set up, so one column
 * carries both masks. The code ends by popping r4-r11 and pc, i.e. it
 * returns on behalf of the routine that pushed them.
 */
4827#define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, \
4828 edge, x4mode) \
4829 setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode: \
75e28f62
E
4830 setup_sprite_column_data_##multi_height(); \
4831 vext.32 block_masks_shifted, block_masks, block_masks, #1; /* swap the two 32-bit lanes */ \
4832 vorr.u32 block_masks, block_masks, block_masks_shifted; \
59d15d23 4833 setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \
75e28f62 4834 \
59d15d23 4835 setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
3f0189c6 4836 restore_abi_regs(); \
75e28f62
E
4837 ldmia sp!, { r4 - r11, pc } \
4838
/* Steps texture_offset_base to the next tile column: add 0x100, and if
 * bits 8-11 became zero (they overflowed), subtract 0x1000 again so the
 * higher bits are unchanged. */
4839#define setup_sprite_tiled_advance_column() \
4840 add texture_offset_base, texture_offset_base, #0x100; \
4841 tst texture_offset_base, #0xF00; \
4842 subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00) \
4843
/*
 * setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,
 *                                      right_mode, x4mode)
 *
 * Defines the label
 *   setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode><x4mode>
 * and the code for a sprite that is two or more tile columns wide.
 *
 * Order of work:
 *   1. first column, emitted with left_mode and edge "right";
 *   2. tile_width - 2 interior columns, always "full", with the draw masks
 *      cleared to zero (label 0);
 *   3. last column, emitted with right_mode and edge "left", using the right
 *      draw masks (label 1).
 * Between columns fb_ptr is moved by fb_ptr_advance_column and
 * texture_offset_base by setup_sprite_tiled_advance_column.
 * The code ends by popping r4-r11 and pc, i.e. it returns on behalf of the
 * routine that pushed them.
 */
4844#define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, \
59d15d23 4845 right_mode, x4mode) \
4846 setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
75e28f62 4847 setup_sprite_column_data_##multi_height(); \
75e28f62 4848 \
59d15d23 4849 setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode(); \
75e28f62 4850 \
59d15d23 4851 setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
75e28f62
E
4852 \
4853 subs tile_width, tile_width, #2; /* interior columns left to emit */ \
4854 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4855 \
75e28f62
E
4856 beq 1f; /* no interior columns */ \
4857 \
59d15d23 4858 vmov.u8 draw_masks_fb_ptrs, #0; \
4859 vmov.u8 draw_masks_fb_ptrs2, #0; \
4860 \
75e28f62
E
4861 0: \
4862 setup_sprite_tiled_advance_column(); \
59d15d23 4863 setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode); \
75e28f62
E
4864 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4865 subs tile_width, tile_width, #1; \
4866 bne 0b; \
4867 \
4868 1: \
59d15d23 4869 setup_sprite_setup_right_draw_mask_fb_ptr##x4mode(); \
75e28f62
E
4870 \
4871 setup_sprite_tiled_advance_column(); \
59d15d23 4872 setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
3f0189c6 4873 restore_abi_regs(); \
75e28f62
E
4874 ldmia sp!, { r4 - r11, pc } \
4875
4876
/* Helpers for the non-4x sprite path. */

/* No offset adjustment is needed here; expands to nothing. */
59d15d23 4877#define setup_sprite_offset_u_adjust() \
4878
/* Keep the low 8 bits of left_block_mask. */
4879#define setup_sprite_get_left_block_mask() \
4880 and left_block_mask, left_block_mask, #0xFF \
4881
/* Sets Z when all 8 bits of the left mask are set. */
4882#define setup_sprite_compare_left_block_mask() \
4883 cmp left_block_mask, #0xFF \
4884
/* Extract bits 8-15 of right_block_mask, zero-extended. */
4885#define setup_sprite_get_right_block_mask() \
4886 uxtb right_block_mask, right_block_mask, ror #8 \
4887
/* Sets Z when all 8 bits of the right mask are set. */
4888#define setup_sprite_compare_right_block_mask() \
4889 cmp right_block_mask, #0xFF \
4890
4891
4892
4893/* 4x stuff */
/* fb_ptr2 is a scratch framebuffer pointer that shares a register with
 * column_data. */
4894#define fb_ptr2 column_data
4895
/* Moves fb_ptr back by offset_u * 2 bytes, doubles offset_u, and sets
 * offset_u_right to offset_u_right * 2 + 1. */
4896#define setup_sprite_offset_u_adjust_4x() \
4897 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
4898 lsl offset_u_right, #1; \
4899 lsl offset_u, #1; \
4900 add offset_u_right, #1 \
4901
/* Sign-extend the low 16 bits of left_block_mask. */
4902#define setup_sprite_get_left_block_mask_4x() \
4903 sxth left_block_mask, left_block_mask \
4904
/* Sets Z when the sign-extended left mask is all ones. */
4905#define setup_sprite_compare_left_block_mask_4x() \
4906 cmp left_block_mask, #0xFFFFFFFF \
4907
/* Sign-extend bits 16-31 of right_block_mask. */
4908#define setup_sprite_get_right_block_mask_4x() \
4909 sxth right_block_mask, right_block_mask, ror #16 \
4910
/* Sets Z when the sign-extended right mask is all ones. */
4911#define setup_sprite_compare_right_block_mask_4x() \
4912 cmp right_block_mask, #0xFFFFFFFF \
4913
4914
/* widen_texels_16bpp(texels_): copies the 64-bit register texels_ into both
 * halves of texels_wide and zips them at 16-bit granularity, so every
 * 16-bit element of texels_ appears twice in a row in texels_wide. */
4915#define widen_texels_16bpp(texels_) \
4916 vmov texels_wide_low, texels_; \
4917 vmov texels_wide_high, texels_; \
4918 vzip.16 texels_wide_low, texels_wide_high \
4919
/* widen_texels_8bpp(texels_): same as above at byte granularity, so every
 * byte of texels_ appears twice in a row in texels_wide. */
4920#define widen_texels_8bpp(texels_) \
4921 vmov texels_wide_low, texels_; \
4922 vmov texels_wide_high, texels_; \
4923 vzip.8 texels_wide_low, texels_wide_high \
4924
/* write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_):
 * stores texels_ (128-bit alignment hint) at block_ + 0, puts fb_ptr_ into
 * lane 1 of draw_mask_fb_ptr_ and stores that register at block_ + 40.
 * block_ is advanced by 64 in total. */
4925#define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
e1f6de8f 4926 vst1.u32 { texels_ }, [block_, :128]; \
59d15d23 4927 add block_, block_, #40; \
4928 \
4929 vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
e1f6de8f 4930 vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64]; \
59d15d23 4931 add block_, block_, #24 \
4932
/* write_block_8bpp(...): like write_block_16bpp, but stores a 64-bit texels_
 * register and steps block_ by 24 and then 40 (still 64 in total). */
4933/* assumes 16-byte offset already added to block_ */
4934#define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
e1f6de8f 4935 vst1.u32 { texels_ }, [block_, :64]; \
59d15d23 4936 add block_, block_, #24; \
4937 \
4938 vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
e1f6de8f 4939 vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64]; \
59d15d23 4940 add block_, block_, #40 \
4941
/*
 * do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,
 *                           draw_mask_fb_ptr_b_)
 *
 * Turns the 16-bit texels in texels_low / texels_high into four block
 * entries. Each source texel is duplicated horizontally (widen_texels_16bpp)
 * and each widened group is written twice, for fb_ptr and for fb_ptr + 2048:
 *   texels_low  -> fb_ptr,      fb_ptr + 2048        (mask a)
 *   texels_high -> fb_ptr + 16, fb_ptr + 16 + 2048   (mask b)
 * fb_ptr_tmp is a scratch register; block advances by 4 * 64 bytes.
 */
4942#define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
4943 draw_mask_fb_ptr_b_) \
4944 widen_texels_16bpp(texels_low); \
4945 add fb_ptr_tmp, fb_ptr, #1024*2; \
4946 \
4947 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr); \
4948 \
4949 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
4950 widen_texels_16bpp(texels_high); \
4951 \
4952 add fb_ptr_tmp, fb_ptr, #8*2; \
4953 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
4954 \
4955 add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
4956 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp) \
4957
/*
 * do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,
 *                          draw_mask_fb_ptr_b_)
 *
 * 8-bit counterpart: the 8 bytes in texels are widened once into
 * texels_wide_low / texels_wide_high, and each half is written twice
 * (fb_ptr and fb_ptr + 2048 for the low half, fb_ptr + 16 and
 * fb_ptr + 16 + 2048 for the high half). Expects block to carry the +16
 * bias required by write_block_8bpp.
 */
4958#define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
4959 draw_mask_fb_ptr_b_) \
4960 widen_texels_8bpp(texels); \
4961 add fb_ptr_tmp, fb_ptr, #1024*2; \
4962 \
4963 write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr); \
4964 write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
4965 \
4966 add fb_ptr_tmp, fb_ptr, #8*2; \
4967 write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
4968 \
4969 add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
4970 write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp) \
4971
4972
/* 4x initialisation for 4bpp: loads the 32-byte colour table at clut_ptr
 * into clut_a / clut_b and de-interleaves it (even bytes in clut_a, odd
 * bytes in clut_b). Unlike the non-4x macro, no dirty-texture check is made
 * here. */
4973#define setup_sprite_tiled_initialize_4bpp_4x() \
e1f6de8f 4974 ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]; \
4975 vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]; \
59d15d23 4976 \
4977 vuzp.u8 clut_a, clut_b \
4978
/* 4x initialisation for 8bpp: expands to nothing. */
4979#define setup_sprite_tiled_initialize_8bpp_4x() \
4981
/* Block counts for the 4x path, as operand fragments for
 * setup_sprite_tile_add_blocks: four entries per row for a half tile,
 * eight per row for a full tile. */
4982#define setup_sprite_block_count_single_4x() \
4983 sub_tile_height, lsl #2 \
4984
4985#define setup_sprite_block_count_double_4x() \
4986 sub_tile_height, lsl #(1+2) \
4987
/*
 * setup_sprite_tile_full_4bpp_4x(edge)
 *
 * 4x version of the full-width 4bpp tile: eight block entries per source
 * row for sub_tile_height rows; the edge argument is not used.
 *
 * column_data is pushed on the stack for the duration of the loop because
 * its register doubles as the scratch pointer fb_ptr2; it is restored
 * afterwards. The stack pointer moves by 8 bytes for this one word.
 *
 * Per source row, both 8-texel halves are looked up in the colour tables,
 * zipped into 16-bit texels and handed to do_texture_block_16bpp_4x (left
 * masks for the first half, right masks for the second). fb_ptr ends up
 * 4096 bytes further (32 + (2048 - 16) * 2) and texture_offset 0x10
 * further. After the loop texture_offset is advanced by another 0xF00 and
 * num_blocks is written back to psx_gpu. Uses local label 4.
 */
4988#define setup_sprite_tile_full_4bpp_4x(edge) \
4989 setup_sprite_tile_add_blocks(double_4x); \
4990 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
4991 \
4992 4: \
4993 and texture_block_ptr, texture_offset, texture_mask; \
e1f6de8f 4994 pld [fb_ptr]; \
59d15d23 4995 \
4996 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 4997 vld1.u32 { texels }, [texture_block_ptr, :64]; \
59d15d23 4998 \
4999 add texture_block_ptr, texture_offset, #8; /* second half of the row */ \
5000 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
5001 \
5002 and texture_block_ptr, texture_block_ptr, texture_mask; \
5003 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
5004 \
5005 vzip.8 texels_low, texels_high; /* combine low/high bytes into 16-bit texels */ \
5006 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
5007 draw_mask_fb_ptr_left_b); \
5008 \
5009 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 5010 pld [fb_ptr, #2048]; \
59d15d23 5011 \
e1f6de8f 5012 vld1.u32 { texels }, [texture_block_ptr, :64]; \
8438c3c7 5013 add fb_ptr, fb_ptr, #16*2; \
59d15d23 5014 \
8438c3c7 5015 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
59d15d23 5016 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
5017 \
5018 vzip.8 texels_low, texels_high; \
5019 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
5020 draw_mask_fb_ptr_right_b); \
5021 \
5022 add texture_offset, texture_offset, #0x10; \
5023 add fb_ptr, fb_ptr, #(2048 - 16) * 2; /* net +4096 per source row */ \
5024 \
5025 subs sub_tile_height, sub_tile_height, #1; \
5026 bne 4b; \
5027 \
5028 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5029 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 5030 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
59d15d23 5031
5032
/*
 * setup_sprite_tile_half_4bpp_4x(edge)
 *
 * 4x version of the half-width 4bpp tile: four block entries per source row
 * for sub_tile_height rows, using draw_mask_fb_ptr_<edge>_a / _b.
 *
 * column_data is pushed on the stack for the duration of the loop because
 * its register doubles as the scratch pointer fb_ptr2; it is restored
 * afterwards. The stack pointer moves by 8 bytes for this one word.
 *
 * Per source row, 8 texels are looked up in the colour tables, zipped into
 * 16-bit texels and handed to do_texture_block_16bpp_4x. fb_ptr advances by
 * 4096 bytes and texture_offset by 0x10. After the loop texture_offset is
 * advanced by another 0xF00 and num_blocks is written back to psx_gpu.
 * Uses local label 4; texture_block_ptr (r14) is overwritten.
 *
 * The loop used to contain a second
 * "add texture_block_ptr, texture_page_ptr, texture_block_ptr" after the
 * texel load. Its result was never read (texture_block_ptr is recomputed at
 * label 4 and is not used after the loop), so it has been dropped.
 */
5033#define setup_sprite_tile_half_4bpp_4x(edge) \
5034 setup_sprite_tile_add_blocks(single_4x); \
5035 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
5036 \
5037 4: \
5038 and texture_block_ptr, texture_offset, texture_mask; \
e1f6de8f 5039 pld [fb_ptr]; \
59d15d23 5040 \
5041 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 5042 vld1.u32 { texels }, [texture_block_ptr, :64]; \
59d15d23 5043 \
5045 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
5046 \
5047 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
5048 add texture_offset, texture_offset, #0x10; \
5049 \
5050 vzip.8 texels_low, texels_high; /* combine low/high bytes into 16-bit texels */ \
5051 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
5052 draw_mask_fb_ptr_##edge##_b); \
5053 \
e1f6de8f 5054 pld [fb_ptr, #2048]; \
59d15d23 5055 add fb_ptr, fb_ptr, #2048 * 2; \
59d15d23 5056 \
8438c3c7 5057 subs sub_tile_height, sub_tile_height, #1; \
59d15d23 5058 bne 4b; \
5059 \
5060 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5061 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 5062 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
59d15d23 5063
5064
/*
 * setup_sprite_tile_full_8bpp_4x(edge)
 *
 * Emits one full-width tile column for the 8bpp / 4x sprite path.
 * The "edge" argument is not referenced in the body: the left and right
 * halves are always drawn with the draw_mask_fb_ptr_left_* and
 * draw_mask_fb_ptr_right_* pairs respectively.
 *
 * block is biased by +16 before the loop and restored after it, and
 * column_data is spilled to the stack around the loop (the inline comments
 * tie that slot to fb_ptr2).
 *
 * Each of the sub_tile_height passes at label 4 handles two loads:
 *  - texels at (texture_offset & texture_mask), drawn at fb_ptr;
 *  - texels at ((texture_offset + 8) & texture_mask), drawn at
 *    fb_ptr + 16 * 2.
 * texture_block_ptr carries the "+ 8" offset across the first
 * do_texture_block_8bpp_4x expansion, so that macro must leave it intact.
 * After both halves texture_offset advances by 0x10 and fb_ptr by
 * (2048 - 16) * 2, i.e. 2048 * 2 in total per pass.
 *
 * On exit texture_offset is advanced by 0xF00 and num_blocks is written
 * back to psx_gpu.
 */
5065#define setup_sprite_tile_full_8bpp_4x(edge) \
5066 setup_sprite_tile_add_blocks(double_4x); \
5067 add block, block, #16; \
5068 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
5069 \
5070 4: \
5071 and texture_block_ptr, texture_offset, texture_mask; \
e1f6de8f 5072 pld [fb_ptr]; \
59d15d23 5073 \
5074 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 5075 vld1.u32 { texels }, [texture_block_ptr, :64]; \
59d15d23 5076 \
5077 add texture_block_ptr, texture_offset, #8; \
5078 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
5079 draw_mask_fb_ptr_left_b); \
5080 \
e1f6de8f 5081 pld [fb_ptr, #2048]; \
59d15d23 5082 and texture_block_ptr, texture_block_ptr, texture_mask; \
5083 \
5084 add fb_ptr, fb_ptr, #16*2; \
5085 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
5086 \
e1f6de8f 5087 vld1.u32 { texels }, [texture_block_ptr, :64]; \
59d15d23 5088 \
5089 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
5090 draw_mask_fb_ptr_right_b); \
5091 \
5092 add texture_offset, texture_offset, #0x10; \
5093 add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
5094 \
5095 subs sub_tile_height, sub_tile_height, #1; \
5096 bne 4b; \
5097 \
5098 sub block, block, #16; \
5099 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5100 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 5101 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
59d15d23 5102
5103
/*
 * setup_sprite_tile_half_8bpp_4x(edge)
 *
 * Emits one half-width tile column for the 8bpp / 4x sprite path.
 * "edge" is pasted into the draw mask names and so selects the
 * draw_mask_fb_ptr_<edge>_a / _b pair passed to do_texture_block_8bpp_4x.
 *
 * block is biased by +16 before the loop and restored after it, and
 * column_data is spilled to the stack around the loop (the inline comments
 * tie that slot to fb_ptr2).
 *
 * Each of the sub_tile_height passes at label 4 loads texels from
 * texture_page_ptr + (texture_offset & texture_mask), draws them, then
 * advances texture_offset by 0x10 and fb_ptr by 2048 * 2.
 *
 * On exit texture_offset is advanced by 0xF00 and num_blocks is written
 * back to psx_gpu.
 */
5104#define setup_sprite_tile_half_8bpp_4x(edge) \
5105 setup_sprite_tile_add_blocks(single_4x); \
5106 add block, block, #16; \
5107 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
5108 \
5109 4: \
5110 and texture_block_ptr, texture_offset, texture_mask; \
e1f6de8f 5111 pld [fb_ptr]; \
59d15d23 5112 \
5113 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 5114 vld1.u32 { texels }, [texture_block_ptr, :64]; \
59d15d23 5115 \
e1f6de8f 5116 pld [fb_ptr, #2048]; \
59d15d23 5117 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
5118 draw_mask_fb_ptr_##edge##_b); \
5119 \
5120 add texture_offset, texture_offset, #0x10; \
5121 add fb_ptr, fb_ptr, #2048 * 2; \
5122 \
5123 subs sub_tile_height, sub_tile_height, #1; \
5124 bne 4b; \
5125 \
5126 sub block, block, #16; \
5127 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5128 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 5129 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
59d15d23 5130
5131
/*
 * Column edge adjustment helpers for the 4x sprite path.
 *
 * The pre_adjust macros set texture_offset from texture_offset_base before
 * a column is emitted; the post_adjust macros undo any fb_ptr shift that
 * the matching pre_adjust applied.  The generic *_half_4x(edge) forms just
 * paste "edge" into the name of the left / right specific macro.
 */
/* Right half only: start 8 bytes into the tile row and 16 * 2 bytes into
   the framebuffer row. */
5132#define setup_sprite_tile_column_edge_pre_adjust_half_right_4x() \
5133 add texture_offset, texture_offset_base, #8; \
5134 add fb_ptr, fb_ptr, #16 * 2 \
5135
/* Left half only: start at the tile origin, fb_ptr untouched. */
5136#define setup_sprite_tile_column_edge_pre_adjust_half_left_4x() \
5137 mov texture_offset, texture_offset_base \
5138
5139#define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge) \
5140 setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x() \
5141
/* Full tile: "edge" is ignored, start at the tile origin. */
5142#define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge) \
5143 mov texture_offset, texture_offset_base \
5144
/* Right half only: take back the 16 * 2 byte fb_ptr shift from pre_adjust. */
5145#define setup_sprite_tile_column_edge_post_adjust_half_right_4x() \
5146 sub fb_ptr, fb_ptr, #16 * 2 \
5147
/* Left half only: nothing to undo, expands to nothing. */
5148#define setup_sprite_tile_column_edge_post_adjust_half_left_4x() \
5149
5150#define setup_sprite_tile_column_edge_post_adjust_half_4x(edge) \
5151 setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x() \
5152
/* Full tile: nothing to undo, expands to nothing. */
5153#define setup_sprite_tile_column_edge_post_adjust_full_4x(edge) \
5154
5155
/*
 * Draw mask setup for the 4x sprite path.
 *
 * Each macro broadcasts one byte lane of block_masks into each of the four
 * draw mask registers (left_a, left_b, right_a, right_b).  The "left"
 * macros use lanes 0-3 and the "right" macro uses lanes 4-7.
 */
5156#define setup_sprite_setup_left_draw_mask_fb_ptr_4x() \
5157 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
5158 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
5159 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
5160 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
5161
/* Same as the plain "left" form, and additionally computes
   fb_ptr_advance_column = 32 * 2 - (height << 12); the shift amount is
   written as 11 + 1. */
5162#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x() \
5163 mov fb_ptr_advance_column, #32 * 2; \
5164 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
5165 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
ed0fd81d 5166 sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1; \
59d15d23 5167 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
5168 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
5169
5170#define setup_sprite_setup_right_draw_mask_fb_ptr_4x() \
5171 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4]; \
5172 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5]; \
5173 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6]; \
5174 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7] \
5175
5176
75e28f62
E
5177// r0: psx_gpu
5178// r1: x
5179// r2: y
5180// r3: u
e1f6de8f 5181// [sp]: v
5182// [sp + 4]: width
5183// [sp + 8]: height
5184// [sp + 12]: color (unused)
75e28f62 5185
/*
 * setup_sprite_tiled_builder(texture_mode, x4mode)
 *
 * Expands to the fourteen setup_sprite_tile_column_width_multi / _single
 * variants listed first, followed by the entry point
 * setup_sprite_<texture_mode><x4mode> which takes the arguments above.
 *
 * The entry point pushes r4-r11 and r14 (36 bytes), which is why v, width
 * and height are read from [sp, #36], [sp, #40] and [sp, #44].  It then:
 *  - points fb_ptr at vram_out_ptr + (y << 11) + (x << 1) - (offset_u << 1)
 *    with offset_u = u & 0xF, offset_v = v & 0xF;
 *  - computes tile_width = (offset_u + width + 15) >> 4 and
 *    tile_height = (offset_v + height + 15) >> 4;
 *  - packs v and u into texture_offset_base and rearranges texture_mask to
 *    the same layout (see the nibble-layout comments in the body);
 *  - builds left_block_mask = ~(0xFFFFFFFF << offset_u) and
 *    right_block_mask = 0xFFFFFFFE << offset_u_right, with
 *    offset_u_right = (offset_u + width + 15) & 0xF taken before
 *    setup_sprite_offset_u_adjust<x4mode> runs, and moves both masks into
 *    block_masks;
 *  - builds control_mask: 0x4 / 0x8 are set on the "eq" outcome of the left
 *    / right block mask compare macros, 0x1 when tile_width == 1 and 0x2
 *    when tile_height == 1;
 *  - dispatches through the table at label 9, indexed by control_mask.
 *
 * Table slot 13 is a zero word rather than a handler address, and the
 * table has no slot 15.
 * NOTE(review): presumably those two control_mask values cannot occur
 * (single tile width with both edge bits set) -- confirm.
 */
59d15d23 5186#define setup_sprite_tiled_builder(texture_mode, x4mode) \
5187 \
5188setup_sprite_tile_column_width_multi(texture_mode, multi, full, full, \
5189 x4mode); \
5190setup_sprite_tile_column_width_single(texture_mode, multi, full, none, \
5191 x4mode); \
5192setup_sprite_tile_column_width_multi(texture_mode, single, full, full, \
5193 x4mode); \
5194setup_sprite_tile_column_width_single(texture_mode, single, full, none, \
5195 x4mode); \
5196setup_sprite_tile_column_width_multi(texture_mode, multi, half, full, \
5197 x4mode); \
5198setup_sprite_tile_column_width_single(texture_mode, multi, half, right, \
5199 x4mode); \
5200setup_sprite_tile_column_width_multi(texture_mode, single, half, full, \
5201 x4mode); \
5202setup_sprite_tile_column_width_single(texture_mode, single, half, right, \
5203 x4mode); \
5204setup_sprite_tile_column_width_multi(texture_mode, multi, full, half, \
5205 x4mode); \
5206setup_sprite_tile_column_width_single(texture_mode, multi, half, left, \
5207 x4mode); \
5208setup_sprite_tile_column_width_multi(texture_mode, single, full, half, \
5209 x4mode); \
5210setup_sprite_tile_column_width_single(texture_mode, single, half, left, \
5211 x4mode); \
5212setup_sprite_tile_column_width_multi(texture_mode, multi, half, half, \
5213 x4mode); \
5214setup_sprite_tile_column_width_multi(texture_mode, single, half, half, \
5215 x4mode); \
75e28f62
E
5216 \
5217.align 4; \
5218 \
59d15d23 5219function(setup_sprite_##texture_mode##x4mode) \
75e28f62 5220 stmdb sp!, { r4 - r11, r14 }; \
59d15d23 5221 setup_sprite_tiled_initialize_##texture_mode##x4mode(); \
75e28f62 5222 \
e1f6de8f 5223 ldr v, [sp, #36]; \
75e28f62
E
5224 and offset_u, u, #0xF; \
5225 \
e1f6de8f 5226 ldr width, [sp, #40]; \
5227 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
75e28f62 5228 \
e1f6de8f 5229 ldr height, [sp, #44]; \
75e28f62
E
5230 add fb_ptr, fb_ptr, y, lsl #11; \
5231 \
3f0189c6 5232 save_abi_regs(); \
5233 \
75e28f62
E
5234 add fb_ptr, fb_ptr, x, lsl #1; \
5235 and offset_v, v, #0xF; \
5236 \
5237 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
5238 add width_rounded, offset_u, width; \
5239 \
5240 add height_rounded, offset_v, height; \
5241 add width_rounded, width_rounded, #15; \
5242 \
5243 add height_rounded, height_rounded, #15; \
5244 mov tile_width, width_rounded, lsr #4; \
5245 \
5246 /* texture_offset_base = VH-VL-00-00 */\
5247 mov texture_offset_base, v, lsl #8; \
5248 and offset_u_right, width_rounded, #0xF; \
5249 \
5250 /* texture_offset_base = VH-UH-UL-00 */\
5251 bfi texture_offset_base, u, #4, #8; \
59d15d23 5252 mov right_block_mask, #0xFFFFFFFE; \
5253 \
5254 setup_sprite_offset_u_adjust##x4mode(); \
75e28f62
E
5255 \
5256 /* texture_offset_base = VH-UH-VL-00 */\
5257 bfi texture_offset_base, v, #4, #4; \
59d15d23 5258 mov left_block_mask, #0xFFFFFFFF; \
75e28f62
E
5259 \
5260 mov tile_height, height_rounded, lsr #4; \
5261 mvn left_block_mask, left_block_mask, lsl offset_u; \
5262 \
5263 /* texture_mask = HH-HL-WH-WL */\
e1f6de8f 5264 ldrh texture_mask, [psx_gpu, #psx_gpu_texture_mask_width_offset]; \
75e28f62
E
5265 mov right_block_mask, right_block_mask, lsl offset_u_right; \
5266 \
5267 /* texture_mask_rev = WH-WL-HH-HL */\
5268 rev16 texture_mask_rev, texture_mask; \
5269 vmov block_masks, left_block_mask, right_block_mask; \
5270 \
5271 /* texture_mask = HH-HL-HL-WL */\
5272 bfi texture_mask, texture_mask_rev, #4, #4; \
5273 /* texture_mask_rev = 00-00-00-WH */\
5274 mov texture_mask_rev, texture_mask_rev, lsr #12; \
5275 \
5276 /* texture_mask = HH-WH-HL-WL */\
5277 bfi texture_mask, texture_mask_rev, #8, #4; \
59d15d23 5278 setup_sprite_get_left_block_mask##x4mode(); \
75e28f62
E
5279 \
5280 mov control_mask, #0; \
59d15d23 5281 setup_sprite_compare_left_block_mask##x4mode(); \
75e28f62 5282 \
59d15d23 5283 setup_sprite_get_right_block_mask##x4mode(); \
75e28f62
E
5284 orreq control_mask, control_mask, #0x4; \
5285 \
e1f6de8f 5286 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
59d15d23 5287 setup_sprite_compare_right_block_mask##x4mode(); \
75e28f62
E
5288 \
5289 orreq control_mask, control_mask, #0x8; \
5290 cmp tile_width, #1; \
5291 \
5292 add block, psx_gpu, #psx_gpu_blocks_offset; \
5293 orreq control_mask, control_mask, #0x1; \
5294 \
5295 cmp tile_height, #1; \
5296 add block, block, num_blocks, lsl #6; \
5297 \
5298 orreq control_mask, control_mask, #0x2; \
8184d7c5 5299 JT_OP_REL(9f, control_mask, temp); \
e1f6de8f 5300 JT_OP(ldr pc, [pc, control_mask, lsl #2]); \
75e28f62
E
5301 nop; \
5302 \
8184d7c5 5303 9: \
5304 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode); \
5305 .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode); \
5306 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode); \
5307 .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
5308 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode); \
5309 .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
5310 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode); \
5311 .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
5312 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode); \
5313 .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode); \
5314 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode); \
5315 .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
5316 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode); \
75e28f62 5317 .word 0x00000000; \
8184d7c5 5318 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode); \
59d15d23 5319
5320
/*
 * Instantiate the tiled sprite setup routines.  The first pair passes an
 * empty x4mode suffix (setup_sprite_4bpp / setup_sprite_8bpp); the second
 * pair passes _4x (setup_sprite_4bpp_4x / setup_sprite_8bpp_4x).
 * The draw_mask_fb_ptr_left / _right aliases are dropped in between; the
 * _4x macros use the _a / _b suffixed names instead.
 */
5321setup_sprite_tiled_builder(4bpp,);
5322setup_sprite_tiled_builder(8bpp,);
75e28f62 5323
59d15d23 5324#undef draw_mask_fb_ptr_left
5325#undef draw_mask_fb_ptr_right
75e28f62 5326
59d15d23 5327setup_sprite_tiled_builder(4bpp, _4x);
5328setup_sprite_tiled_builder(8bpp, _4x);
75e28f62
E
5329
5330
/*
 * Register aliases for texture_sprite_blocks_8bpp.  psx_gpu and block_ptr
 * share r0, and each texels_NN pair reuses the register of its even texel
 * (texels_01 = texel_0 = r6, and so on).
 */
5331#undef block_ptr
5332#undef num_blocks
5333#undef clut_ptr
5334
5335#define psx_gpu r0
5336#define block_ptr r0
5337#define num_blocks r1
5338#define clut_ptr r2
5339#define texel_shift_mask r3
5340#define block_pixels_a r4
5341#define block_pixels_b r5
5342#define texel_0 r6
5343#define texel_2 r7
5344#define texel_4 r8
5345#define texel_6 r9
5346#define texel_1 r10
5347#define texel_3 r11
5348#define texel_5 r12
5349#define texel_7 r14
5350#define texels_01 r6
5351#define texels_23 r7
5352#define texels_45 r8
5353#define texels_67 r9
5354
/*
 * texture_sprite_blocks_8bpp(r0: psx_gpu)
 *
 * Replaces the 8-bit texel indices queued in each block with 16-bit values
 * looked up in the table at psx_gpu's clut_ptr.
 *
 * Blocks start at psx_gpu + psx_gpu_blocks_offset and are 64 bytes apart;
 * num_blocks of them are processed.  For each block the words at offsets
 * 16 and 20 supply eight index bytes.  Every index is turned into a byte
 * offset (index * 2, using texel_shift_mask = 0xFF << 1 and a shifted
 * operand), used for a halfword load from clut_ptr, and the eight results
 * are packed in pairs and stored to block offsets 0, 4, 8 and 12.
 *
 * The word at offset 16 of the following block is fetched inside the loop
 * body, so the last pass also reads from one block slot past the final
 * block processed.
 * NOTE(review): the loop tests its counter only at the bottom (subs / bne),
 * so it relies on num_blocks being non-zero on entry -- confirm callers.
 */
5355function(texture_sprite_blocks_8bpp)
5356 stmdb sp!, { r4 - r11, r14 }
5357 movw texel_shift_mask, #(0xFF << 1)
5358
e1f6de8f 5359 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5360 ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
75e28f62
E
5361
5362 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
e1f6de8f 5363 ldr block_pixels_a, [block_ptr, #16]
75e28f62
E
5364
/* One pass per block; block_pixels_a was loaded ahead of time. */
5365 0:
5366 and texel_0, texel_shift_mask, block_pixels_a, lsl #1
e1f6de8f 5367 ldr block_pixels_b, [block_ptr, #20]
75e28f62
E
5368
5369 and texel_1, texel_shift_mask, block_pixels_a, lsr #7
e1f6de8f 5370 ldrh texel_0, [clut_ptr, texel_0]
75e28f62
E
5371
5372 and texel_2, texel_shift_mask, block_pixels_a, lsr #15
e1f6de8f 5373 ldrh texel_1, [clut_ptr, texel_1]
75e28f62
E
5374
5375 and texel_3, texel_shift_mask, block_pixels_a, lsr #23
e1f6de8f 5376 ldr block_pixels_a, [block_ptr, #(64 + 16)]
75e28f62 5377
e1f6de8f 5378 ldrh texel_2, [clut_ptr, texel_2]
75e28f62
E
5379 and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5380
e1f6de8f 5381 ldrh texel_3, [clut_ptr, texel_3]
75e28f62
E
5382 and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5383
e1f6de8f 5384 ldrh texel_4, [clut_ptr, texel_4]
75e28f62
E
5385 and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5386
e1f6de8f 5387 ldrh texel_5, [clut_ptr, texel_5]
75e28f62
E
5388 and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5389
e1f6de8f 5390 ldrh texel_6, [clut_ptr, texel_6]
75e28f62
E
5391 orr texels_01, texel_0, texel_1, lsl #16
5392
e1f6de8f 5393 ldrh texel_7, [clut_ptr, texel_7]
75e28f62
E
5394 orr texels_23, texel_2, texel_3, lsl #16
5395
5396 orr texels_45, texel_4, texel_5, lsl #16
e1f6de8f 5397 str texels_01, [block_ptr, #0]
75e28f62
E
5398
5399 orr texels_67, texel_6, texel_7, lsl #16
e1f6de8f 5400 str texels_23, [block_ptr, #4]
75e28f62
E
5401
5402 subs num_blocks, num_blocks, #1
e1f6de8f 5403 str texels_45, [block_ptr, #8]
75e28f62 5404
e1f6de8f 5405 str texels_67, [block_ptr, #12]
75e28f62
E
5406 add block_ptr, block_ptr, #64
5407
5408 bne 0b
5409
5410 ldmia sp!, { r4 - r11, pc }
5411
5412
/*
 * Register aliases for the 16bpp sprite setup routines that follow.
 *
 * Several names deliberately share a register, so the order of
 * instructions in those routines matters: x / texture_offset_base (r1),
 * y / texture_mask / texture_mask_width (r2),
 * u / texture_page_ptr / texture_mask_height (r3),
 * v / num_blocks / left_mask_bits (r4), width / block / right_mask_bits (r5),
 * left_offset / texture_offset (r8), width_rounded / blocks_remaining (r9),
 * right_width / fb_ptr2 (r10).
 * On the NEON side draw_mask_fb_ptr and draw_mask_fb_ptr_a are both d2, and
 * texels (q2) overlaps texels_low / texels_high (d4 / d5).
 */
5413#undef width_rounded
5414#undef texture_mask
5415#undef num_blocks
5416#undef texture_offset
59d15d23 5417#undef texels_low
5418#undef texels_high
5419#undef texels_wide_low
5420#undef texels_wide_high
5421#undef texels_wide
5422#undef fb_ptr2
8184d7c5 5423#undef temp
75e28f62
E
5424
5425#define psx_gpu r0
5426#define x r1
5427#define y r2
5428#define u r3
5429#define v r4
5430#define width r5
5431#define height r6
5432#define left_offset r8
5433#define width_rounded r9
5434#define right_width r10
59d15d23 5435
75e28f62
E
5436#define block_width r11
5437
5438#define texture_offset_base r1
5439#define texture_mask r2
5440#define texture_page_ptr r3
5441#define num_blocks r4
5442#define block r5
5443#define fb_ptr r7
5444#define texture_offset r8
5445#define blocks_remaining r9
59d15d23 5446#define fb_ptr2 r10
75e28f62
E
5447#define fb_ptr_pitch r12
5448#define texture_block_ptr r14
5449
5450#define texture_mask_width r2
5451#define texture_mask_height r3
5452#define left_mask_bits r4
5453#define right_mask_bits r5
5454
5455
5456#undef block_masks
5457#undef block_masks_shifted
5458#undef texels
5459
5460#define block_masks d0
5461#define block_masks_shifted d1
5462#define draw_mask_fb_ptr d2
5463#define texels q2
5464
59d15d23 5465#define draw_mask_fb_ptr_a d2
5466#define draw_mask_fb_ptr_b d3
5467#define texels_low d4
5468#define texels_high d5
5469#define texels_wide_low d6
5470#define texels_wide_high d7
5471#define texels_wide q3
75e28f62 5472
75e28f62 5473
/*
 * setup_sprites_16bpp_flush
 *
 * Local helper reached with "blgt" when a sprite row would take num_blocks
 * above MAX_BLOCKS.  It calls flush_render_block_buffer while preserving
 * d0-d3, r0-r3, r12, r14 and whatever EXTRA_UNSAVED_REGS expands to, then
 * restarts the block list: block points back at the first block slot and
 * num_blocks is set to block_width, i.e. the blocks of the row about to be
 * queued.  Returns with bx lr; the flags are not meaningful afterwards.
 */
59d15d23 5474setup_sprites_16bpp_flush:
5475 vpush { d0 - d3 }
75e28f62 5476
4d646738 5477 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 5478 bl flush_render_block_buffer
4d646738 5479 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 5480
59d15d23 5481 vpop { d0 - d3 }
75e28f62
E
5482
5483 add block, psx_gpu, #psx_gpu_blocks_offset
5484 mov num_blocks, block_width
5485
5486 bx lr
5487
/*
 * setup_sprite_16bpp(r0: psx_gpu, r1: x, r2: y, r3: u,
 *                    [sp]: v, [sp + 4]: width, [sp + 8]: height)
 *
 * Queues render blocks for a sprite whose texels are copied unchanged from
 * the texture page (no table lookup is done here).  r4-r11 and r14 are
 * pushed first (36 bytes), hence the [sp, #36 / #40 / #44] argument loads.
 *
 * Each 64-byte block receives 16 bytes of texels at offset 0 and, at
 * offset 40, one d-register whose low word is the draw mask byte
 * replicated and whose high word is the framebuffer address.
 *
 * Derived values:
 *   left_offset   = u & 7; fb_ptr is moved back by left_offset * 2 bytes
 *                   and the texture offset is aligned down to 16 bytes
 *   width_rounded = width + 7 + left_offset
 *   block_width   = width_rounded >> 3 (blocks per row)
 *   left mask     = ~(0xFF << left_offset), right mask =
 *                   0xFE << (width_rounded & 7); only the low byte of each
 *                   is broadcast (block_masks lanes 0 and 4)
 *   texture_mask  = texture_mask_width * 2 + (texture_mask_height << 11)
 *   fb_ptr_pitch  = 2048 + 16 - block_width * 16, the step from the last
 *                   block of one row to the first block of the next
 *
 * Before a row is queued, setup_sprites_16bpp_flush is called if adding
 * the row would take num_blocks above MAX_BLOCKS.
 */
5488function(setup_sprite_16bpp)
5489 stmdb sp!, { r4 - r11, r14 }
e1f6de8f 5490 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
75e28f62 5491
e1f6de8f 5492 ldr v, [sp, #36]
75e28f62
E
5493 add fb_ptr, fb_ptr, y, lsl #11
5494
e1f6de8f 5495 ldr width, [sp, #40]
75e28f62
E
5496 add fb_ptr, fb_ptr, x, lsl #1
5497
e1f6de8f 5498 ldr height, [sp, #44]
75e28f62
E
5499 and left_offset, u, #0x7
5500
5501 add texture_offset_base, u, u
5502 add width_rounded, width, #7
5503
ed0fd81d 5504 add texture_offset_base, texture_offset_base, v, lsl #11
75e28f62
E
5505 mov left_mask_bits, #0xFF
5506
e1f6de8f 5507 ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
75e28f62
E
5508 add width_rounded, width_rounded, left_offset
5509
e1f6de8f 5510 ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
75e28f62
E
5511 sub fb_ptr, fb_ptr, left_offset, lsl #1
5512
5513 add texture_mask, texture_mask_width, texture_mask_width
5514 mov right_mask_bits, #0xFE
5515
5516 and right_width, width_rounded, #0x7
5517 mvn left_mask_bits, left_mask_bits, lsl left_offset
5518
ed0fd81d 5519 add texture_mask, texture_mask, texture_mask_height, lsl #11
75e28f62
E
5520 mov block_width, width_rounded, lsr #3
5521
5522 mov right_mask_bits, right_mask_bits, lsl right_width
5523 movw fb_ptr_pitch, #(2048 + 16)
5524
5525 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
5526 vmov block_masks, left_mask_bits, right_mask_bits
5527
e1f6de8f 5528 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
5529 add block, psx_gpu, #psx_gpu_blocks_offset
5530
6ea0f7bf 5531 bic texture_offset_base, texture_offset_base, #0xF
75e28f62
E
5532 cmp block_width, #1
5533
e1f6de8f 5534 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
75e28f62
E
5535 add block, block, num_blocks, lsl #6
5536
5537 bne 0f
5538
5539 vext.32 block_masks_shifted, block_masks, block_masks, #1
5540 vorr.u32 block_masks, block_masks, block_masks_shifted
5541 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5542
/* block_width == 1: one block per row, drawn with the left and right masks
   OR-ed together.  Texture and framebuffer both step 2048 bytes per row. */
5543 1:
5544 add num_blocks, num_blocks, #1
5545 cmp num_blocks, #MAX_BLOCKS
59d15d23 5546 blgt setup_sprites_16bpp_flush
75e28f62
E
5547
5548 and texture_block_ptr, texture_offset_base, texture_mask
5549 subs height, height, #1
5550
5551 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5552 vld1.u32 { texels }, [texture_block_ptr, :128]
75e28f62 5553
e1f6de8f 5554 vst1.u32 { texels }, [block, :128]
75e28f62
E
5555 add block, block, #40
5556
5557 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
e1f6de8f 5558 pld [fb_ptr]
75e28f62 5559
e1f6de8f 5560 vst1.u32 { draw_mask_fb_ptr }, [block, :64]
75e28f62
E
5561
5562 add block, block, #24
5563 add texture_offset_base, texture_offset_base, #2048
5564 add fb_ptr, fb_ptr, #2048
e1f6de8f 5565 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
5566 bne 1b
5567
5568 ldmia sp!, { r4 - r11, pc }
5569
/* block_width > 1, once per row: the first block uses the left mask. */
5570 0:
5571 add num_blocks, num_blocks, block_width
5572 mov texture_offset, texture_offset_base
5573
5574 cmp num_blocks, #MAX_BLOCKS
59d15d23 5575 blgt setup_sprites_16bpp_flush
75e28f62
E
5576
5577 add texture_offset_base, texture_offset_base, #2048
5578 and texture_block_ptr, texture_offset, texture_mask
5579
5580 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5581 vld1.u32 { texels }, [texture_block_ptr, :128]
75e28f62 5582
e1f6de8f 5583 vst1.u32 { texels }, [block, :128]
75e28f62
E
5584 add block, block, #40
5585
5586 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5587 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
e1f6de8f 5588 pld [fb_ptr]
75e28f62 5589
e1f6de8f 5590 vst1.u32 { draw_mask_fb_ptr }, [block, :64]
75e28f62
E
5591 subs blocks_remaining, block_width, #2
5592
5593 add texture_offset, texture_offset, #16
5594 add fb_ptr, fb_ptr, #16
5595
5596 vmov.u8 draw_mask_fb_ptr, #0
5597
5598 add block, block, #24
5599 beq 2f
5600
/* Interior blocks of the row (block_width - 2 of them), all-zero mask. */
5601 1:
5602 and texture_block_ptr, texture_offset, texture_mask
5603 subs blocks_remaining, blocks_remaining, #1
5604
5605 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5606 vld1.u32 { texels }, [texture_block_ptr, :128]
75e28f62 5607
e1f6de8f 5608 vst1.u32 { texels }, [block, :128]
75e28f62
E
5609 add block, block, #40
5610
5611 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
e1f6de8f 5612 pld [fb_ptr]
75e28f62 5613
e1f6de8f 5614 vst1.u32 { draw_mask_fb_ptr }, [block, :64]
75e28f62
E
5615
5616 add texture_offset, texture_offset, #16
5617 add fb_ptr, fb_ptr, #16
5618
5619 add block, block, #24
5620 bne 1b
5621
/* Last block of the row uses the right mask; fb_ptr then moves to the
   start of the next row through fb_ptr_pitch. */
5622 2:
5623 and texture_block_ptr, texture_offset, texture_mask
5624 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5625
e1f6de8f 5626 vld1.u32 { texels }, [texture_block_ptr, :128]
75e28f62
E
5627 vdup.u8 draw_mask_fb_ptr, block_masks[4]
5628
e1f6de8f 5629 vst1.u32 { texels }, [block, :128]
75e28f62
E
5630 add block, block, #40
5631
5632 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
e1f6de8f 5633 vst1.u32 { draw_mask_fb_ptr }, [block, :64]
75e28f62
E
5634
5635 add block, block, #24
5636 subs height, height, #1
5637
5638 add fb_ptr, fb_ptr, fb_ptr_pitch
e1f6de8f 5639 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
5640
5641 bne 0b
5642
5643 ldmia sp!, { r4 - r11, pc }
5644
5645
59d15d23 5646// 4x version
5647// FIXME: duplicate code with normal version :(
5648#undef draw_mask_fb_ptr
5649
/*
 * setup_sprite_16bpp_4x(r0: psx_gpu, r1: x, r2: y, r3: u,
 *                       [sp]: v, [sp + 4]: width, [sp + 8]: height)
 *
 * 4x counterpart of setup_sprite_16bpp.  Texture addressing is unchanged
 * (16 bytes per group, 2048 bytes per texture row) while everything on the
 * framebuffer side is doubled:
 *   - left_offset and right_width are doubled before they are used as
 *     shift counts, and the mask seeds are 16 bits wide (0xFFFF / 0xFFFC);
 *     two byte lanes of each mask are broadcast into the _a / _b registers;
 *   - fb_ptr steps 16 * 2 bytes per group and 2048 * 2 bytes per row;
 *     fb_ptr_pitch = (2048 + 16) * 2 - block_width * 32;
 *   - block_width is multiplied by 4 after the "== 1" comparison, so each
 *     16-byte texel group accounts for four blocks in num_blocks.
 * The blocks themselves are written by do_texture_block_16bpp_4x, which is
 * given fb_ptr2 and the two draw mask registers.
 *
 * r4-r11 and r14 are pushed first (36 bytes), hence the
 * [sp, #36 / #40 / #44] argument loads.
 */
5650function(setup_sprite_16bpp_4x)
5651 stmdb sp!, { r4 - r11, r14 }
e1f6de8f 5652 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
59d15d23 5653
e1f6de8f 5654 ldr v, [sp, #36]
59d15d23 5655 add fb_ptr, fb_ptr, y, lsl #11
5656
e1f6de8f 5657 ldr width, [sp, #40]
59d15d23 5658 add fb_ptr, fb_ptr, x, lsl #1
5659
e1f6de8f 5660 ldr height, [sp, #44]
59d15d23 5661 and left_offset, u, #0x7
5662
5663 add texture_offset_base, u, u
5664 add width_rounded, width, #7
5665
ed0fd81d 5666 add texture_offset_base, texture_offset_base, v, lsl #11
59d15d23 5667 movw left_mask_bits, #0xFFFF
5668
e1f6de8f 5669 ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
59d15d23 5670 add width_rounded, width_rounded, left_offset
5671
5672 lsl left_offset, #1
5673
e1f6de8f 5674 ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
59d15d23 5675 sub fb_ptr, fb_ptr, left_offset, lsl #1
5676
5677 add texture_mask, texture_mask_width, texture_mask_width
5678 movw right_mask_bits, #0xFFFC
5679
5680 and right_width, width_rounded, #0x7
5681 mvn left_mask_bits, left_mask_bits, lsl left_offset
5682
5683 lsl right_width, #1
5684
ed0fd81d 5685 add texture_mask, texture_mask, texture_mask_height, lsl #11
59d15d23 5686 mov block_width, width_rounded, lsr #3
5687
5688 mov right_mask_bits, right_mask_bits, lsl right_width
5689 movw fb_ptr_pitch, #(2048 + 16) * 2
5690
5691 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
5692 vmov block_masks, left_mask_bits, right_mask_bits
5693
e1f6de8f 5694 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
59d15d23 5695 add block, psx_gpu, #psx_gpu_blocks_offset
5696
5697 bic texture_offset_base, texture_offset_base, #0xF
5698 cmp block_width, #1
5699
e1f6de8f 5700 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
59d15d23 5701 add block, block, num_blocks, lsl #6
5702
5703 lsl block_width, #2
5704 bne 0f
5705
5706 vext.32 block_masks_shifted, block_masks, block_masks, #1
5707 vorr.u32 block_masks, block_masks, block_masks_shifted
5708 vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
5709 vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5710
/* One texel group per row: left and right masks OR-ed together. */
5711 1:
5712 add num_blocks, num_blocks, block_width
5713 cmp num_blocks, #MAX_BLOCKS
5714 blgt setup_sprites_16bpp_flush
5715
5716 and texture_block_ptr, texture_offset_base, texture_mask
5717 subs height, height, #1
5718
5719 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5720 vld1.u32 { texels }, [texture_block_ptr, :128]
59d15d23 5721
5722 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5723
5724 add texture_offset_base, texture_offset_base, #2048
5725 add fb_ptr, fb_ptr, #2048*2
e1f6de8f 5726 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
59d15d23 5727 bne 1b
5728
5729 ldmia sp!, { r4 - r11, pc }
5730
/* Several texel groups per row: the first group uses the left mask. */
5731 0:
5732 add num_blocks, num_blocks, block_width
5733 mov texture_offset, texture_offset_base
5734
5735 vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
5736 vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5737
5738 cmp num_blocks, #MAX_BLOCKS
5739 blgt setup_sprites_16bpp_flush
5740
5741 add texture_offset_base, texture_offset_base, #2048
5742 and texture_block_ptr, texture_offset, texture_mask
5743
5744 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5745 vld1.u32 { texels }, [texture_block_ptr, :128]
59d15d23 5746
5747 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5748
5749 subs blocks_remaining, block_width, #2*4
5750 add texture_offset, texture_offset, #16
5751
5752 vmov.u8 draw_mask_fb_ptr_a, #0
5753 vmov.u8 draw_mask_fb_ptr_b, #0
5754
5755 add fb_ptr, fb_ptr, #16*2
5756 beq 2f
5757
/* Interior groups, all-zero masks; blocks_remaining counts down by 4. */
5758 1:
5759 and texture_block_ptr, texture_offset, texture_mask
5760 subs blocks_remaining, blocks_remaining, #4
5761
5762 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5763 vld1.u32 { texels }, [texture_block_ptr, :128]
59d15d23 5764
5765 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5766 add texture_offset, texture_offset, #16
5767
5768 add fb_ptr, fb_ptr, #16*2
5769 bgt 1b
5770
/* Last group of the row uses the right mask; fb_ptr then moves to the next
   output row pair through fb_ptr_pitch. */
5771 2:
5772 vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
5773 vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
5774
5775 and texture_block_ptr, texture_offset, texture_mask
5776 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5777
e1f6de8f 5778 vld1.u32 { texels }, [texture_block_ptr, :128]
59d15d23 5779
5780 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5781 subs height, height, #1
5782
5783 add fb_ptr, fb_ptr, fb_ptr_pitch
e1f6de8f 5784 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
59d15d23 5785
5786 bne 0b
5787
5788 ldmia sp!, { r4 - r11, pc }
5789
5790
/*
 * Register aliases for setup_sprite_untextured_512.
 *
 * num_blocks (r4), block (r5) and block_width (r11) keep the assignments
 * that setup_sprites_16bpp_flush was assembled with.  Shared registers:
 * x / color_r (r1), y / color_g (r2), right_width / block (r5),
 * right_mask_bits / blocks_remaining (r6), color / color_b (r8), and
 * test_mask / draw_mask (q2).
 */
f0931e56 5791#undef width
5792#undef right_width
5793#undef right_mask_bits
5794#undef color
5795#undef height
5796#undef blocks_remaining
5797#undef colors
5798#undef right_mask
5799#undef test_mask
5800#undef draw_mask
5801
5802#define psx_gpu r0
5803#define x r1
5804#define y r2
5805#define width r3
5806#define right_width r5
5807#define right_mask_bits r6
5808#define fb_ptr r7
5809#define color r8
5810#define height r9
5811#define fb_ptr_pitch r12
5812
5813// referenced by setup_sprites_16bpp_flush
5814#define num_blocks r4
5815#define block r5
5816#define block_width r11
5817
5818#define color_r r1
5819#define color_g r2
5820#define color_b r8
5821#define blocks_remaining r6
5822
5823#define colors q0
5824#define right_mask q1
5825#define test_mask q2
5826#define draw_mask q2
5827#define draw_mask_bits_fb_ptr d6
5829
/*
 * setup_sprite_untextured_512(r0: psx_gpu, r1: x, r2: y,
 *                             [sp + 4]: width, [sp + 8]: height,
 *                             [sp + 12]: color)
 *
 * Queues render blocks for a flat-colored rectangle.  r4-r11 and r14 are
 * pushed first (36 bytes), so the three stack arguments are read from
 * [sp, #40], [sp, #44] and [sp, #48].  The incoming r3 and [sp] are not
 * read: r3 is immediately reloaded with width.
 *
 * Derived values:
 *   fb_ptr       = vram_out_ptr + (y << 11) + (x << 1)
 *   block_width  = (width + 7) >> 3 blocks per row
 *   right_width  = ((width - 1) & 7) + 1
 *   right_mask   = per-lane test of (0xff << right_width) against the
 *                  128-bit vector loaded from the start of psx_gpu
 *   color        = bits 3-7, 11-15 and 19-23 of the argument repacked as
 *                  r | (g << 5) | (b << 10), broadcast to eight lanes
 *   fb_ptr_pitch = 1024 * 2 - (block_width - 1) * 16
 *
 * Each 64-byte block gets the draw mask at offset 0, the colors at offset
 * 16 and, at offset 40, a d-register whose high word is fb_ptr.
 *
 * NOTE(review): draw_mask_bits_fb_ptr (d6) is zeroed only on the path
 * taken when block_width > 1, so for a one-block-wide sprite its low word
 * is whatever d6 held before -- presumably that word is not consumed for
 * untextured blocks; confirm.
 */
5830.align 3
5831
2d658c89 5832function(setup_sprite_untextured_512)
f0931e56 5833 stmdb sp!, { r4 - r11, r14 }
5834
e1f6de8f 5835 ldr width, [sp, #40]
5836 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
f0931e56 5837
e1f6de8f 5838 ldr height, [sp, #44]
f0931e56 5839 add fb_ptr, fb_ptr, y, lsl #11
5840
5841 add fb_ptr, fb_ptr, x, lsl #1
5842 sub right_width, width, #1
5843
e1f6de8f 5844 ldr color, [sp, #48]
f0931e56 5845 and right_width, #7
5846
5847 add block_width, width, #7
5848 add right_width, #1
5849
5850 lsr block_width, #3
5851 mov right_mask_bits, #0xff
5852
5853 sub fb_ptr_pitch, block_width, #1
5854 lsl right_mask_bits, right_width
5855
5856 lsl fb_ptr_pitch, #3+1
5857 ubfx color_r, color, #3, #5
5858
5859 rsb fb_ptr_pitch, #1024*2
5860 ubfx color_g, color, #11, #5
5861
e1f6de8f 5862 vld1.u32 { test_mask }, [psx_gpu, :128]
f0931e56 5863 ubfx color_b, color, #19, #5
5864
5865 vdup.u16 right_mask, right_mask_bits
5866 orr color, color_r, color_b, lsl #10
5867
e1f6de8f 5868 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
f0931e56 5869 orr color, color, color_g, lsl #5
5870
5871 vtst.u16 right_mask, right_mask, test_mask
5872 add block, psx_gpu, #psx_gpu_blocks_offset
5873
5874 vdup.u16 colors, color
5875 add block, block, num_blocks, lsl #6
5876
5877
/* Once per row: reserve block_width blocks, flushing first if that would
   exceed MAX_BLOCKS. */
5878setup_sprite_untextured_height_loop:
5879 add num_blocks, block_width
5880 sub blocks_remaining, block_width, #1
5881
5882 cmp num_blocks, #MAX_BLOCKS
5883 blgt setup_sprites_16bpp_flush
5884
5885 cmp blocks_remaining, #0
5886 ble 1f
5887
5888 vmov.u8 draw_mask, #0 /* zero_mask */
5889 vmov.u8 draw_mask_bits_fb_ptr, #0
5890
/* All blocks of the row except the last: zero draw mask, 8 * 2 bytes of
   framebuffer each. */
5891 0:
e1f6de8f 5892 vst1.u32 { draw_mask }, [block, :128]!
f0931e56 5893 subs blocks_remaining, #1
5894
e1f6de8f 5895 vst1.u32 { colors }, [block, :128]
f0931e56 5896 add block, block, #24
5897
5898 vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
e1f6de8f 5899 vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
f0931e56 5900
5901 add block, block, #24
5902 add fb_ptr, #8*2
5903 bgt 0b
5904
/* Last block of the row: uses right_mask, then fb_ptr steps to the next
   row.  The row loop repeats while height is still positive. */
5905 1:
e1f6de8f 5906 vst1.u32 { right_mask }, [block, :128]!
f0931e56 5907 subs height, #1
5908
e1f6de8f 5909 vst1.u32 { colors }, [block, :128]
f0931e56 5910 add block, block, #24
5911
5912 vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
e1f6de8f 5913 vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
f0931e56 5914
5915 add block, block, #24
5916 add fb_ptr, fb_ptr_pitch
5917
e1f6de8f 5918 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
f0931e56 5919 bgt setup_sprite_untextured_height_loop
5920
5921 ldmia sp!, { r4 - r11, pc }
5922
5923
5924
75e28f62
E
/*
 * Register aliases for update_texture_4bpp_cache.
 *
 * NEON overlap to keep in mind: texel_block_a / texel_block_b (d0 / d1)
 * are the two halves of texel_block_expanded_d (q0); the _ab result shares
 * q2 with _b, and the _cd result shares q3 with _c.
 */
5925#undef texture_page_ptr
5926#undef vram_ptr
5927#undef dirty_textures_mask
5928#undef current_texture_mask
5929
5930#define psx_gpu r0
5931#define current_texture_page r1
5932#define texture_page_ptr r2
5933#define vram_ptr_a r3
5934#define current_texture_page_x r12
5935#define current_texture_page_y r4
5936#define dirty_textures_mask r5
5937#define tile_y r6
5938#define tile_x r7
5939#define sub_y r8
5940#define current_texture_mask r9
5941#define c_4096 r10
5942#define vram_ptr_b r11
5943
5944#define texel_block_a d0
5945#define texel_block_b d1
5946#define texel_block_expanded_a q1
5947#define texel_block_expanded_b q2
5948#define texel_block_expanded_ab q2
5949#define texel_block_expanded_c q3
fab27ba2 5950#define texel_block_expanded_d q0
75e28f62
E
5951#define texel_block_expanded_cd q3
5952
/*
 * update_texture_4bpp_cache(r0: psx_gpu)
 *
 * Rebuilds the expanded copy of the current texture page at
 * psx_gpu's texture_page_base from VRAM, and clears the bits of
 * current_texture_mask in dirty_textures_4bpp_mask.
 *
 * Source address: vram_ptr + (page_y << 19) + (page_x << 7) with
 * page_x = current_texture_page & 0xF and page_y = current_texture_page >> 4.
 *
 * Loop nest (innermost first): sub_y 8 x tile_x 16 x tile_y 16.
 *  - Each innermost pass reads 8 bytes from each of two consecutive VRAM
 *    rows (vram_ptr_b = vram_ptr_a + 2048, both stepping 4096 bytes),
 *    splits every source byte into two output bytes (low nibble first,
 *    then high nibble) and stores the resulting 32 bytes to
 *    texture_page_ptr with post-increment.
 *  - After 8 passes (16 rows) the pointers move 8 bytes right and
 *    16 * 2048 bytes back up; after 16 such tiles they move 16 rows down
 *    and 8 * 16 bytes back left.
 *
 * Saves and restores r4-r11 and q0-q3.
 */
5953function(update_texture_4bpp_cache)
5954 stmdb sp!, { r4 - r11, r14 }
5955 vpush { q0 - q3 }
5956
e1f6de8f 5957 ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
75e28f62 5958
e1f6de8f 5959 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
5960 ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
75e28f62
E
5961
5962 and current_texture_page_x, current_texture_page, #0xF
e1f6de8f 5963 ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
75e28f62
E
5964
5965 mov current_texture_page_y, current_texture_page, lsr #4
e1f6de8f 5966 ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
75e28f62
E
5967
5968 add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5969 mov tile_y, #16
5970
5971 add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7
5972 bic dirty_textures_mask, current_texture_mask
5973
5974 mov tile_x, #16
e1f6de8f 5975 str dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
75e28f62
E
5976
5977 mov sub_y, #8
5978 movw c_4096, #4096
5979
5980 add vram_ptr_b, vram_ptr_a, #2048
5981
/* Shared head of all three nested loops; each halfword produced below is
   (byte & 0x0F) | ((byte >> 4) << 8). */
5982 0:
e1f6de8f 5983 vld1.u32 { texel_block_a }, [vram_ptr_a, :64], c_4096
5984 vld1.u32 { texel_block_b }, [vram_ptr_b, :64], c_4096
75e28f62
E
5985
5986 vmovl.u8 texel_block_expanded_a, texel_block_a
5987 vshll.u8 texel_block_expanded_b, texel_block_a, #4
5988 vmovl.u8 texel_block_expanded_c, texel_block_b
5989 vshll.u8 texel_block_expanded_d, texel_block_b, #4
5990
5991 vbic.u16 texel_block_expanded_a, #0x00F0
5992 vbic.u16 texel_block_expanded_b, #0x00F0
5993 vbic.u16 texel_block_expanded_c, #0x00F0
5994 vbic.u16 texel_block_expanded_d, #0x00F0
5995
5996 vorr.u16 texel_block_expanded_ab, texel_block_expanded_a, \
5997 texel_block_expanded_b
5998 vorr.u16 texel_block_expanded_cd, texel_block_expanded_c, \
5999 texel_block_expanded_d
6000
6001 vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd }, \
e1f6de8f 6002 [texture_page_ptr, :256]!
75e28f62
E
6003
6004 subs sub_y, sub_y, #1
6005 bne 0b
6006
6007 mov sub_y, #8
6008 add vram_ptr_a, vram_ptr_a, #8
6009 add vram_ptr_b, vram_ptr_b, #8
6010
6011 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6012 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6013
6014 subs tile_x, tile_x, #1
6015 bne 0b
6016
6017 mov tile_x, #16
6018 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6019 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6020
6021 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6022 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6023
6024 subs tile_y, tile_y, #1
6025 bne 0b
6026
6027 vpop { q0 - q3 }
6028 ldmia sp!, { r4 - r11, pc }
6029
6030
// Register aliases for update_texture_8bpp_cache_slice.
// current_texture_page is dropped first so it can be rebound to r5 below.
6031#undef current_texture_page
6032
// r0: base register for every [psx_gpu, #psx_gpu_*_offset] load in the routine.
6033#define psx_gpu r0
// r1: read by the routine without being written first, i.e. supplied by the caller.
6034#define texture_page r1
// r2: destination pointer, loaded from psx_gpu_texture_page_base_offset.
6035#define texture_page_ptr r2
// r3: first source pointer, loaded from psx_gpu_vram_ptr_offset.
6036#define vram_ptr_a r3
// r12 / r4: low nibble and (texture_page >> 4) of texture_page.
6037#define texture_page_x r12
6038#define texture_page_y r4
// r5: byte loaded from psx_gpu_current_texture_page_offset, later reduced to
// bit 0 of (current_texture_page ^ texture_page).
6039#define current_texture_page r5
// r6 / r7 / r8: down-counters of the outer, middle and inner loops.
6040#define tile_y r6
6041#define tile_x r7
6042#define sub_y r8
// r10: holds the constant 4096, used as the post-increment of the loads.
6043#define c_4096 r10
// r11: second source pointer, kept at vram_ptr_a + 2048.
6044#define vram_ptr_b r11
6045
6046
// Drop any earlier bindings of texels_a / texels_b before redefining them.
6047#undef texels_a
6048#undef texels_b
6049
// q0-q3: the four 16-byte loads gathered per inner-loop iteration.
6050#define texels_a q0
6051#define texels_b q1
6052#define texels_c q2
6053#define texels_d q3
6054
6055
/*
 * update_texture_8bpp_cache_slice
 *
 * In:  r0 (psx_gpu)      - base for the psx_gpu_*_offset field loads
 *      r1 (texture_page) - page index; low nibble selects a 128-byte column
 *                          step, the remaining bits a (1 << 19)-byte row step
 *                          from the pointer stored at psx_gpu_vram_ptr_offset
 * Out: nothing is returned in a register; the routine only writes through
 *      texture_page_ptr (loaded from psx_gpu_texture_page_base_offset).
 *
 * Copies 16 x 8 tiles. Each tile is 16 source lines of 16 bytes, taken
 * 2048 bytes apart (presumably one VRAM line - confirm against psx_gpu.h),
 * and is written as 256 contiguous bytes. After every group of 8 tiles
 * (2048 bytes) the destination skips another 2048 bytes.
 *
 * Saves/restores r4-r11 and q0-q3; r12 is used as scratch and not restored.
 * All three loops share the local label 0 and are do-while style.
 */
6056function(update_texture_8bpp_cache_slice)
6057 stmdb sp!, { r4 - r11, r14 }
6058 vpush { q0 - q3 }
6059
e1f6de8f 6060 ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
6061 ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
75e28f62 6062
e1f6de8f 6063 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
75e28f62
E
// Outer loop counter: 16 tile rows.
6064 mov tile_y, #16
6065
// Split the page index into column (bits 0-3) and row (bits 4 and up).
6066 and texture_page_x, texture_page, #0xF
6067 mov texture_page_y, texture_page, lsr #4
6068
// Column step: 128 bytes per page column.
6069 add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7
// Middle loop counter: 8 tiles per tile row.
6070 mov tile_x, #8
6071
// Row step: 1 << 19 bytes per page row (256 lines if a line is 2048 bytes).
6072 add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
6073 eor current_texture_page, current_texture_page, texture_page
6074
// Z is clear when bit 0 of texture_page differs from bit 0 of the current page.
6075 ands current_texture_page, current_texture_page, #0x1
// mov does not touch the flags, so addne below still sees the ands result.
6076 mov sub_y, #4
6077
// Differing bit 0: start 2048 bytes into the destination, i.e. in the
// half that the per-row skip below would otherwise step over.
6078 addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6079 movw c_4096, #4096
6080
// vram_ptr_b always trails vram_ptr_a by one 2048-byte step.
6081 add vram_ptr_b, vram_ptr_a, #2048
6082
6083 0:
// Gather four consecutive 2048-byte-spaced lines: a, b (= a + 2048), then
// both pointers advance by 4096 for c and d. :128 requires 16-byte alignment.
e1f6de8f 6084 vld1.u32 { texels_a }, [vram_ptr_a, :128], c_4096
6085 vld1.u32 { texels_b }, [vram_ptr_b, :128], c_4096
6086 vld1.u32 { texels_c }, [vram_ptr_a, :128], c_4096
6087 vld1.u32 { texels_d }, [vram_ptr_b, :128], c_4096
75e28f62 6088
// Emit the four lines back to back (64 bytes); :256 requires a 32-byte
// aligned destination.
e1f6de8f 6089 vst1.u32 { texels_a, texels_b }, [texture_page_ptr, :256]!
6090 vst1.u32 { texels_c, texels_d }, [texture_page_ptr, :256]!
75e28f62
E
6091
// Four passes of four lines complete one 16-line tile.
6092 subs sub_y, sub_y, #1
6093 bne 0b
6094
6095 mov sub_y, #4
6096
// Next tile: 16 bytes to the right, back up the 16 lines just consumed.
6097 add vram_ptr_a, vram_ptr_a, #16
6098 add vram_ptr_b, vram_ptr_b, #16
6099
6100 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6101 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6102
6103 subs tile_x, tile_x, #1
6104 bne 0b
6105
6106 mov tile_x, #8
6107
// Next tile row: 16 lines down, back to the left edge (8 tiles * 16 bytes).
6108 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6109 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6110
6111 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6112 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6113
6114 subs tile_y, tile_y, #1
// Skip 2048 destination bytes after each 2048 written. add has no S suffix,
// so the flags from the subs above survive until the bne.
6115 add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6116
6117 bne 0b
6118
6119 vpop { q0 - q3 }
// Loading the saved r14 value straight into pc performs the return.
6120 ldmia sp!, { r4 - r11, pc }
6121
50f9355a 6122
6123/* void scale2x_tiles8(void *dst, const void *src, int w8, int h) */
/*
 * Doubles a block of 16-bit elements in both directions.
 *
 * In (per the prototype above, with arguments in r0-r3):
 *   r0 = dst, r1 = src, r2 = w8, r3 = h
 * One unit of w8 is one 16-byte load (8 halfwords); it expands to 32 bytes
 * on each of two destination lines. Source lines are 2048 bytes apart; each
 * produces two destination lines 2048 bytes apart, so dst advances 4096 per
 * source line.
 *
 * Register use inside the routine:
 *   r4  = start of the current source line
 *   r12 = second destination line (r0 + 2048)
 *   r14 = remaining w8 units on the current line
 * Clobbers r0-r3, r12, q0-q3 and the flags; r4 is saved and restored.
 * src and dst must be 16-byte aligned (:128 qualifiers).
 *
 * NOTE(review): both loops are do-while, so w8 = 0 or h = 0 still executes
 * one pass. For odd w8 the second 16-byte load still runs on the last pass,
 * reading 16 bytes past the last unit (the data is not stored).
 */
6124function(scale2x_tiles8)
6125 push { r4, r14 }
6126
6127 mov r4, r1
6128 add r12, r0, #1024*2
6129 mov r14, r2
6130
61310:
// Prefetch hint for the matching position one source line ahead.
c2a25f67 6132 pld [r1, #1024*2]
// Two units per pass.
e1f6de8f 6133 vld1.u16 { q0 }, [r1, :128]!
6134 vld1.u16 { q2 }, [r1, :128]!
// Zipping a register with its own copy repeats every halfword twice:
// q0:q1 = a0 a0 a1 a1 ... a7 a7 (likewise q2:q3).
50f9355a 6135 vmov q1, q0
6136 vmov q3, q2
6137 vzip.16 q0, q1
6138 vzip.16 q2, q3
// Sets the flags for blt/bgt below; the NEON stores in between leave them alone.
6139 subs r14, #2
// Same 32 bytes go to both destination lines (vertical doubling).
e1f6de8f 6140 vst1.u16 { q0, q1 }, [r0, :128]!
6141 vst1.u16 { q0, q1 }, [r12, :128]!
// Counter went negative: w8 was odd and only one unit was left.
50f9355a 6142 blt 1f
e1f6de8f 6143 vst1.u16 { q2, q3 }, [r0, :128]!
6144 vst1.u16 { q2, q3 }, [r12, :128]!
// More units remain on this line.
50f9355a 6145 bgt 0b
61461:
// Line counter; none of the instructions up to the bgt modify the flags.
6147 subs r3, #1
6148 mov r14, r2
// Step dst two lines down, src one line down.
6149 add r0, #1024*2*2
6150 add r4, #1024*2
// Rewind dst by the bytes written on this line: w8 * 32 (shift of 4+1 = 5).
ed0fd81d 6151 sub r0, r0, r2, lsl #4+1
50f9355a 6152 mov r1, r4
6153 add r12, r0, #1024*2
6154 bgt 0b
// NOTE(review): this nop has no visible effect on the result; its purpose
// (padding or alignment?) is not evident from this file.
6155 nop
6156
6157 pop { r4, pc }
59d15d23 6158
6159// vim:filetype=armasm