drc/psx_gpu: handle more calling conventions
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
CommitLineData
75e28f62
E
1/*
2 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
59d15d23 3 * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
75e28f62
E
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of
8 * the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */
15
16#define MAX_SPANS 512
17#define MAX_BLOCKS 64
18#define MAX_BLOCKS_PER_ROW 128
19
f0931e56 20#define RENDER_STATE_MASK_EVALUATE 0x20
21#define RENDER_FLAGS_MODULATE_TEXELS 0x1
22#define RENDER_FLAGS_BLEND 0x2
d5c08ed3 23#define RENDER_INTERLACE_ENABLED 0x1
f0931e56 24
cb88320b 25#include "psx_gpu_offsets.h"
75e28f62 26
cb88320b 27#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
75e28f62 28
75e28f62
E
29#define edge_data_left_x_offset 0
30#define edge_data_num_blocks_offset 2
31#define edge_data_right_mask_offset 4
32#define edge_data_y_offset 6
33
ed0fd81d 34.syntax unified
35.text
75e28f62
E
36
37#define psx_gpu r0
38#define v_a r1
39#define v_b r2
40#define v_c r3
41
42#define x0 r4
43#define x1 r5
44#define x2 r6
45#define x0_x1 r5
46#define x1_x2 r6
47#define y0 r7
48#define y1 r8
49#define y2 r9
50#define y0_y1 r7
51#define y1_y2 r8
52#define b0 r9
53#define b1 r10
54#define b2 r11
55#define b0_b1 r10
56#define b1_b2 r11
57
58
59#define area_r_s r5
60
61#define g_bx0 r2
62#define g_bx r3
63#define g_bx2 r4
64#define g_bx3 r5
65#define b_base r6
66#define g_by r8
67
68#define gs_bx r7
69#define gs_by r10
70
71#define ga_bx g_bx
72#define ga_by g_by
73
74#define gw_bx_h g_bx
75#define gw_by_h g_by
76
77#define gw_bx_l r11
78#define gw_by_l gw_bx_l
79
80#define store_a r0
81#define store_b r1
82#define store_inc r5
83
84
85#define v0 q0
86#define uvrgb0 d0
87#define x0_y0 d1
88
89#define v1 q1
90#define uvrgb1 d2
91#define x1_y1 d3
92
93#define v2 q2
94#define uvrgb2 d4
95#define x2_y2 d5
96
97#define x0_ab q3
98#define uvrg_xxxx0 q3
99#define uvrg0 d6
100#define xxxx0 d7
101
102#define x1_ab q4
103#define uvrg_xxxx1 q4
104#define uvrg1 d8
105#define xxxx1 d9
106
107#define x2_ab q5
108#define uvrg_xxxx2 q5
109#define uvrg2 d10
110#define xxxx2 d11
111
112#define y0_ab q6
113#define yyyy_uvrg0 q6
114#define yyyy0 d12
115#define uvrg0b d13
116
117#define y1_ab q7
118#define yyyy_uvrg1 q7
119#define yyyy1 d14
120#define uvrg1b d15
121
122#define y2_ab q8
123#define yyyy_uvrg2 q8
124#define yyyy2 d16
125#define uvrg2b d17
126
127#define d0_ab q9
128#define d0_a d18
129#define d0_b d19
130
131#define d1_ab q10
132#define d1_a d20
133#define d1_b d21
134
135#define d2_ab q11
136#define d2_a d22
137#define d2_b d23
138
139#define d3_ab q12
140#define d3_a d24
141#define d3_b d25
142
143#define ga_uvrg_x q1
144#define ga_uvrg_y q4
145
146#define dx x0_x1
147#define dy y0_y1
148#define db b0_b1
149
150#define uvrg_base q11
151
152#define gs_uvrg_x q5
153#define gs_uvrg_y q6
154
155#define g_uvrg_x q1
156#define ga_uv_x d2
157#define g_uv_x d2
158#define ga_rg_x d3
159#define g_rg_x d3
160
161#define g_uvrg_y q4
162#define ga_uv_y d8
163#define g_uv_y d8
164#define ga_rg_y d9
165#define g_rg_y d9
166
167#define gw_uv_x q1
168#define gw_rg_x q2
169#define gw_uv_y q4
170#define gw_rg_y q3
171
172#define w_mask q9
173#define w_mask_l d18
174
175#define r_shift q10
176
177#define uvrg_dx0 q0
178#define uvrg_dx0l d0
179#define uvrg_dx0h d1
180
181#define uvrg_dx1 q1
182#define uvrg_dx1l d2
183#define uvrg_dx1h d3
184
185#define uvrg_dx2 q2
186#define uvrg_dx2l d4
187#define uvrg_dx2h d5
188
189#define uvrg_dx3 q3
190#define uvrg_dx3l d6
191#define uvrg_dx3h d7
192
c6063f89 193#define uvrgb_phase q13
75e28f62
E
194
195.align 4
196
8184d7c5 197#ifndef __MACH__
198
75e28f62
E
199#define function(name) \
200 .global name; \
8184d7c5 201 .type name, %function; \
75e28f62
E
202 name: \
203
8184d7c5 204#define JT_OP_REL(table_label, index_reg, temp)
205#define JT_OP(x...) x
206#define JTE(start, target) target
207
4d646738 208#define EXTRA_UNSAVED_REGS
209
8184d7c5 210#else
211
212#define function(name) \
213 .globl _##name; \
214 name: \
215 _##name: \
216
217#define JT_OP_REL(table_label, index_reg, temp) \
218 adr temp, table_label; \
219 ldr temp, [ temp, index_reg, lsl #2 ]; \
220 add pc, pc, temp \
221
222#define JT_OP(x...)
223#define JTE(start, target) (target - start)
224
4d646738 225// r7 is preserved, but add it for EABI alignment..
226#define EXTRA_UNSAVED_REGS r7, r9,
227
8184d7c5 228#define flush_render_block_buffer _flush_render_block_buffer
229#define setup_sprite_untextured_simple _setup_sprite_untextured_simple
230#define update_texture_8bpp_cache _update_texture_8bpp_cache
231
232#endif
233
75e28f62
E
234@ r0: psx_gpu
235@ r1: v_a
236@ r2: v_b
237@ r3: v_c
238
// compute_all_gradients(psx_gpu, v_a, v_b, v_c)
//
// Computes the x and y gradients of the u, v, r, g and b interpolants for the
// triangle (v_a, v_b, v_c) and stores them, together with the interpolant
// base values, starting at psx_gpu + psx_gpu_uvrg_offset.
//
// In:  r0 = psx_gpu, r1 = v_a, r2 = v_b, r3 = v_c. Each vertex is read with
//      a 128-bit aligned vector load, so the vertex pointers must be 16-byte
//      aligned. Within a vertex, b is the byte at offset 4, x the halfword
//      at offset 8 and y the halfword at offset 10.
//      Reads psx_gpu->triangle_area, ->uvrgb_phase and ->triangle_winding.
// Out: no register result. r4-r11 are saved and restored on the stack;
//      r0-r3, r12 and the flags are clobbered.
// NOTE(review): q4-q7 (d8-d15) are written here without being saved, although
//      AAPCS treats d8-d15 as callee-saved. Presumably the callers account
//      for this - confirm.
//
// Data written, as byte offsets from psx_gpu + psx_gpu_uvrg_offset:
//   +0    uvrg_base = (uvrg0 << 16) + uvrgb_phase - x_gradient * x0
//   +16   x gradient of u, v, r, g
//   +32   y gradient of u, v, r, g
//   +48   u then v x gradient, each times { 0, 1, 2, 3 }
//   +80   r then g x gradient, each times { 0, 1, 2, 3 }
//   +112  { 0, g_bx, 2 * g_bx, 3 * g_bx, b_base, g_by }
239function(compute_all_gradients)
240 // First compute the triangle area reciprocal and shift. The division will
241 // happen concurrently with much of the work which follows.
242 @ r12 = psx_gpu->triangle_area
243 ldr r12, [ psx_gpu, #psx_gpu_triangle_area_offset ]
244 stmdb sp!, { r4 - r11, lr }
245
246 @ load exponent of 62 into upper half of double
247 movw r4, #0
248 clz r14, r12 @ r14 = shift
249
250 movt r4, #((62 + 1023) << 4)
251 mov r12, r12, lsl r14 @ r12 = triangle_area_normalized
252
253 @ load area normalized into lower half of double
254 mov r5, r12, lsr #10
255 vmov.f64 d30, r5, r4 @ d30 = (1 << 62) + ta_n
256
257 movt r4, #((1022 + 31) << 4)
258 mov r5, r12, lsl #20
259
260 add r4, r4, r12, lsr #11
261 vmov.f64 d31, r5, r4
262
263 vdiv.f64 d30, d30, d31 @ d30 = ((1 << 62) + ta_n) / ta_n
264
265 // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
266 // ( d0 * d1 ) - ( d2 * d3 ) =
267 // ( m0 ) - ( m1 ) = gradient
268
269 // This is split to do 12 elements at a time over three sets: a, b, and c.
270 // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
271 // two of the slots are unused.
272
273 // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
274 // is g.
275
276 // First type is: uvrg bxxx xxxx
277 // Second type is: yyyy ybyy uvrg
278 // Since x_a and y_c are the same the same variable is used for both.
279
// x0 is loaded sign-extended because it is also used below as a full 32-bit
// multiplier (vdup.u32 / mls). x1, x2 and the y values are loaded
// zero-extended so they can be packed into 16-bit halves with orr.
280 vld1.u32 { v0 }, [ v_a, : 128 ] @ v0 = { uvrg0, b0, x0, y0 }
281 ldrsh x0, [ v_a, #8 ] @ load x0
282
283 vld1.u32 { v1 }, [ v_b, : 128 ] @ v1 = { uvrg1, b1, x1, y1}
284 ldrh x1, [ v_b, #8 ] @ load x1
285
286 vld1.u32 { v2 }, [ v_c, : 128 ] @ v2 = { uvrg2, b2, x2, y2 }
287 ldrh x2, [ v_c, #8 ] @ load x2
288
289 vmovl.u8 uvrg_xxxx0, uvrgb0 @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
290 ldrh y0, [ v_a, #10 ] @ load y0
291
292 vmovl.u8 uvrg_xxxx1, uvrgb1 @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
293 ldrh y1, [ v_b, #10 ] @ load y1
294
295 vmovl.u8 uvrg_xxxx2, uvrgb2 @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
296 ldrh y2, [ v_c, #10 ] @ load y2
297
298 vmov.u8 uvrg0b, uvrg0 @ uvrg0b = { uv0, rg0 }
299 vdup.u16 xxxx0, x0_y0[0] @ xxxx0 = { xx0, xx0 }
300
301 orr x1_x2, x1, x2, lsl #16 @ x1_x2 = { x1, x2 }
302 pkhbt x0_x1, x0, x1, lsl #16 @ x0_x1 = { x0, x1 }
303
304 vmov.u8 uvrg1b, uvrg1 @ uvrg1b = { uv1, rg1 }
305 vdup.u16 xxxx1, x1_y1[0] @ xxxx1 = { xx1, xx1 }
306
307 vmov.u8 uvrg2b, uvrg2 @ uvrg2b = { uv2, rg2 }
308 vdup.u16 xxxx2, x2_y2[0] @ xxxx2 = { xx2, xx2 }
309
310 ldrb b2, [ v_c, #4 ] @ load b2
311 orr y0_y1, y0, y1, lsl #16 @ y0_y1 = { y0, y1 }
312
313 ldrb b1, [ v_b, #4 ] @ load b1
314 orr y1_y2, y1, y2, lsl #16 @ y1_y2 = { y1, y2 }
315
316 vdup.u16 yyyy0, x0_y0[1] @ yyyy0 = { yy0, yy0 }
317 vsub.s16 d0_ab, x1_ab, x0_ab
318
319 ldrb b0, [ v_a, #4 ] @ load b0
320 orr b1_b2, b1, b2, lsl #16 @ b1_b2 = { b1, b2 }
321
322 vdup.u16 yyyy1, x1_y1[1] @ yyyy1 = { yy1, yy1 }
323 vsub.s16 d2_ab, x2_ab, x1_ab
324
325 vdup.u16 yyyy2, x2_y2[1] @ yyyy2 = { yy2, yy2 }
326 vsub.s16 d1_ab, y2_ab, y1_ab
327
328 orr b0_b1, b0, b1, lsl #16 @ b0_b1 = { b0, b1 }
329 ssub16 dx, x1_x2, x0_x1 @ dx = { x1 - x0, x2 - x1 }
330
331 ssub16 dy, y1_y2, y0_y1 @ dy = { y1 - y0, y2 - y1 }
332 ssub16 db, b1_b2, b0_b1 @ db = { b1 - b0, b2 - b1 }
333
334 vsub.s16 d3_ab, y1_ab, y0_ab
335 smusdx ga_by, dx, db @ ga_by = ((x1 - x0) * (b2 - b1)) -
336 @ ((x2 - x1) * (b1 - b0))
337 vmull.s16 ga_uvrg_x, d0_a, d1_a
338 smusdx ga_bx, db, dy @ ga_bx = ((b1 - b0) * (y2 - y1)) -
339 @ ((b2 - b1) * (y1 - y0))
340 vmlsl.s16 ga_uvrg_x, d2_a, d3_a
341 movs gs_bx, ga_bx, asr #31
342
343 vmull.s16 ga_uvrg_y, d0_b, d1_b
344 rsbmi ga_bx, ga_bx, #0
345
c6063f89 346 @ r12 = psx_gpu->uvrgb_phase
347 ldr r12, [ psx_gpu, #psx_gpu_uvrgb_phase_offset ]
348
75e28f62
E
349 vmlsl.s16 ga_uvrg_y, d2_b, d3_b
350 movs gs_by, ga_by, asr #31
351
352 vshr.u64 d0, d30, #22
c6063f89 353 add b_base, r12, b0, lsl #16
354
355 vdup.u32 uvrgb_phase, r12
75e28f62
E
356
357 rsbmi ga_by, ga_by, #0
358 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0
359
360 @ r12 = psx_gpu->triangle_winding
361 ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ]
362 vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0
363
75e28f62
E
364 rsb r12, r12, #0 @ r12 = -(triangle->winding)
365
366 vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w }
367 sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS)
368
369 vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16
370 vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift }
371
c6063f89 372 vadd.u32 uvrg_base, uvrgb_phase
75e28f62
E
373 vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x)
374
375 vmov area_r_s, s0 @ area_r_s = triangle_reciprocal
376 vabs.s32 ga_uvrg_y, ga_uvrg_y @ ga_uvrg_y = abs(ga_uvrg_y)
377
// The gradient magnitudes are multiplied by the reciprocal as unsigned
// values; the sign (combined with the winding mask) is re-applied
// afterwards with the eor / sub pairs.
378 vmull.u32 gw_rg_x, ga_rg_x, d0[0]
379 vmull.u32 gw_uv_x, ga_uv_x, d0[0]
380 vmull.u32 gw_rg_y, ga_rg_y, d0[0]
381 vmull.u32 gw_uv_y, ga_uv_y, d0[0]
382
383 vshl.u64 gw_rg_x, gw_rg_x, r_shift
384 vshl.u64 gw_uv_x, gw_uv_x, r_shift
385 vshl.u64 gw_rg_y, gw_rg_y, r_shift
386 vshl.u64 gw_uv_y, gw_uv_y, r_shift
387
388 veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
389 vmovn.u64 g_uv_x, gw_uv_x
390
391 veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
392 vmovn.u64 g_rg_x, gw_rg_x
393
394 veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
395 vmovn.u64 g_uv_y, gw_uv_y
396
397 vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
398 vmovn.u64 g_rg_y, gw_rg_y
399
400 veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
401 mov ga_bx, ga_bx, lsl #13
402
403 vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
404 mov ga_by, ga_by, lsl #13
405
406 vdup.u32 x0_y0, x0
407 umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
408
409 vshl.u32 g_uvrg_x, g_uvrg_x, #4
410 vshl.u32 g_uvrg_y, g_uvrg_y, #4
411
412 umull gw_by_l, gw_by_h, ga_by, area_r_s
413 vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
414
415 eor gs_bx, gs_bx, r12
416 vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
417
418 veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
419 eor gs_by, gs_by, r12
420
// store_a aliases r0, so psx_gpu is no longer available after the add below.
421 rsb r11, r14, #0 @ r11 = negative shift for scalar lsr
422 add store_a, psx_gpu, #psx_gpu_uvrg_offset
423
424 sub r11, r11, #(32 - 13)
425
// store_a and store_b start 16 bytes apart and both advance by 32 bytes per
// store, so the two pointers interleave their 16-byte writes.
426 add store_b, store_a, #16
427 mov store_inc, #32
428
429 vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
430 vst1.u32 { uvrg_base }, [ store_a, : 128 ], store_inc
431
432 vst1.u32 { uvrg_dx1 }, [ store_b, : 128 ], store_inc
433 mov g_bx, gw_bx_h, lsr r11
434
435 vst1.u32 { g_uvrg_y }, [ store_a, : 128 ], store_inc
436 mov g_by, gw_by_h, lsr r11
437
438 vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l }, \
439 [ store_b, : 128 ], store_inc
440 eor g_bx, g_bx, gs_bx
441
442 vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h }, \
443 [ store_b, : 128 ], store_inc
444 sub g_bx, g_bx, gs_bx
445
446 lsl g_bx, g_bx, #4
447 eor g_by, g_by, gs_by
448
449 mls b_base, g_bx, x0, b_base
450 sub g_by, g_by, gs_by
451
452 lsl g_by, g_by, #4
453 mov g_bx0, #0
454
455 add g_bx2, g_bx, g_bx
456 add g_bx3, g_bx, g_bx2
457
458 stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
459
460 ldmia sp!, { r4 - r11, pc }
461
462
463#define psx_gpu r0
464#define v_a r1
465#define v_b r2
466#define v_c r3
467
468#define temp r14
469
470#define x_a r4
471#define x_b r5
472#define x_c r6
473#define y_a r1
474#define y_b r2
475#define y_c r3
476
477#define height_minor_a r7
478#define height_minor_b r8
479#define height_major r9
480#define height r9
481
482#define reciprocal_table_ptr r10
483
484#define edge_alt_low r4
485#define edge_alt_high r5
486#define edge_dx_dy_alt r6
487#define edge_shift_alt r10
488
489#define edge_dx_dy_alt_low r4
490#define edge_dx_dy_alt_high r5
491
492#define span_edge_data r4
493#define span_uvrg_offset r5
494#define span_b_offset r6
495
496#define clip r14
497
498#define b r11
499#define b_dy r12
500
501
502#define alternate_x q0
503#define alternate_dx_dy q1
504#define alternate_x_32 q2
505
506#define alternate_x_low d0
507#define alternate_x_high d1
508#define alternate_dx_dy_low d2
509#define alternate_dx_dy_high d3
510#define alternate_x_32_low d4
511#define alternate_x_32_high d5
512
513#define left_x q3
514#define right_x q4
515#define left_dx_dy q5
516#define right_dx_dy q6
517#define left_edge q7
518#define right_edge q8
519
520#define left_x_low d6
521#define left_x_high d7
522#define right_x_low d8
523#define right_x_high d9
524#define left_dx_dy_low d10
525#define left_dx_dy_high d11
526#define right_dx_dy_low d12
527#define right_dx_dy_high d13
528#define left_edge_low d14
529#define left_edge_high d15
530#define right_edge_low d16
531#define right_edge_high d17
532
533#define y_mid_point d18
534#define c_0x0004 d19
535
536#define left_right_x_16 q11
537#define span_shifts_y q12
538#define c_0x0001 q13
539
540#define span_shifts d24
541#define y_x4 d25
542#define c_0xFFFE d26
543#define c_0x0007 d27
544
545#define left_right_x_16_low d22
546#define left_right_x_16_high d23
547
548#define uvrg q14
549#define uvrg_dy q15
550
551#define alternate_x_16 d4
552
553#define v_clip q3
554#define v_clip_low d6
555
556#define right_x_32 q10
557#define left_x_32 q11
558#define alternate_select d24
559
560#define right_x_32_low d20
561#define right_x_32_high d21
562#define left_x_32_low d22
563#define left_x_32_high d23
564
565#define edges_xy q0
566#define edges_dx_dy d2
567#define edge_shifts d3
568#define edge_shifts_64 q2
569
570#define edges_xy_left d0
571#define edges_xy_right d1
572
573#define height_reciprocals d6
574#define heights d7
575
576#define widths d8
577#define c_0x01 d9
578#define x_starts d10
579#define x_ends d11
580
581#define heights_b d12
582#define edges_dx_dy_64 q10
583
584#define edges_dx_dy_64_left d20
585#define edges_dx_dy_64_right d21
586
587
// setup_spans_prologue(): common entry code of the setup_spans_* functions.
// Saves r4-r11 and lr, then loads the signed 16-bit x (vertex offset 8) and
// y (vertex offset 10) coordinates of the three vertices. y_a/y_b/y_c alias
// r1/r2/r3 (v_a/v_b/v_c), so each y load overwrites the vertex pointer it was
// read through; that is why all x values are fetched first.
// Also loads uvrg and uvrg_dy from psx_gpu, fetches reciprocal_table_ptr and
// sets both 32-bit lanes of c_0x01 to 1. Clobbers temp (r14).
588#define setup_spans_prologue() \
589 stmdb sp!, { r4 - r11, lr }; \
590 \
591 ldrsh x_a, [ v_a, #8 ]; \
592 ldrsh x_b, [ v_b, #8 ]; \
593 ldrsh x_c, [ v_c, #8 ]; \
594 ldrsh y_a, [ v_a, #10 ]; \
595 ldrsh y_b, [ v_b, #10 ]; \
596 ldrsh y_c, [ v_c, #10 ]; \
597 \
598 add temp, psx_gpu, #psx_gpu_uvrg_offset; \
599 vld1.32 { uvrg }, [ temp ]; \
600 add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
601 vld1.32 { uvrg_dy }, [ temp ]; \
ed0fd81d 602 ldr reciprocal_table_ptr, [ psx_gpu, #psx_gpu_reciprocal_table_ptr_offset ]; \
75e28f62
E
603 \
604 vmov.u32 c_0x01, #0x01 \
605
// setup_spans_load_b(): loads the b interpolant start value and its per-span
// step from psx_gpu into b (r11) and b_dy (r12).
606#define setup_spans_load_b() \
607 ldr b, [ psx_gpu, #psx_gpu_b_offset ]; \
608 ldr b_dy, [ psx_gpu, #psx_gpu_b_dy_offset ] \
609
// setup_spans_prologue_b(): prepares the output pointers (span_edge_data,
// span_uvrg_offset, span_b_offset) and the vector constants used by the
// setup_spans_set_x4_* macros. left_edge is viewport_start_x and right_edge
// is viewport_end_x + 1, each broadcast to eight 16-bit lanes.
// c_0x0001 (q13) shares its registers with c_0xFFFE (d26) and c_0x0007 (d27),
// so it is only valid until those two are written at the end of the macro.
// The three pointers live in r4-r6 and therefore overwrite x_a/x_b/x_c and
// the scalar edge_alt_low/edge_alt_high/edge_dx_dy_alt values.
// Clobbers temp (r14).
610#define setup_spans_prologue_b() \
611 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
612 add temp, psx_gpu, #psx_gpu_viewport_start_x_offset; \
613 \
614 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
615 vmov.u16 c_0x0004, #0x0004; \
616 \
617 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
618 vmov.u16 c_0x0001, #0x0001; \
619 \
620 vld1.u16 { left_edge_low[], left_edge_high[] }, [ temp ]; \
621 add temp, psx_gpu, #psx_gpu_viewport_end_x_offset; \
622 \
623 vld1.u16 { right_edge_low[], right_edge_high[] }, [ temp ]; \
624 vadd.u16 right_edge, right_edge, c_0x0001; \
625 \
626 vmov.u16 c_0x0007, #0x0007; \
627 vmvn.u16 c_0xFFFE, #0x0001 \
628
629
// compute_edge_delta_x2(): edge setup for two edges that share one height.
//
// In:  height (r9), reciprocal_table_ptr, c_0x01 = { 1, 1 }, and
//      x_starts / x_ends holding the start and end x of the two edges, one
//      edge per 32-bit lane.
// Out: edge_shifts    = reciprocal table entry for height with bits 5-7 of
//                       every halfword cleared
//      edges_dx_dy    = (x_ends - x_starts) * (entry >> 10)         (32-bit)
//      edges_xy       = (x_starts * height + height - 1) * (entry >> 10)
//                       as one 64-bit value per edge
//      The results are not yet shifted; the edge_shifts are applied later by
//      setup_spans_adjust_edges_alternate_no.
// Clobbers temp (r14), heights, widths, heights_b and height_reciprocals.
630#define compute_edge_delta_x2() \
631 ldr temp, [ reciprocal_table_ptr, height, lsl #2 ]; \
632 \
633 vdup.u32 heights, height; \
634 vsub.u32 widths, x_ends, x_starts; \
635 \
636 vdup.u32 edge_shifts, temp; \
637 vsub.u32 heights_b, heights, c_0x01; \
7d5140f5 638 vshr.u32 height_reciprocals, edge_shifts, #10; \
75e28f62
E
639 \
640 vmla.s32 heights_b, x_starts, heights; \
641 vbic.u16 edge_shifts, #0xE0; \
642 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
643 vmull.s32 edges_xy, heights_b, height_reciprocals \
644
645#define width_alt r6
646#define height_reciprocal_alt r11
647#define height_b_alt r12
648
// compute_edge_delta_x3(start_c, height_a, height_b): edge setup for three
// edges. Two of them are handled on NEON exactly as in
// compute_edge_delta_x2, except that lane 0 uses height_a and lane 1 uses
// height_b. The third ("alternate") edge is computed in parallel on the
// integer side; it runs from x = start_c to x = x_c over height_minor_b
// rows.
//
// In:  x_starts, x_ends, c_0x01, reciprocal_table_ptr, x_c, height_minor_b,
//      and the registers passed as start_c / height_a / height_b.
// Out: edges_xy, edges_dx_dy, edge_shifts (vector edges) and
//      edge_alt_low:edge_alt_high (64-bit x), edge_dx_dy_alt, edge_shift_alt
//      (low 5 bits of the table entry) for the alternate edge.
// The alternate results reuse r4/r5/r6/r10, so x_a, x_b, x_c and
// reciprocal_table_ptr are dead afterwards. Also clobbers temp (r14), r11
// (height_reciprocal_alt) and r12 (height_b_alt).
649#define compute_edge_delta_x3(start_c, height_a, height_b) \
ed0fd81d 650 vmov heights, height_a, height_b; \
75e28f62
E
651 ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \
652 vmov.u32 edge_shifts[0], temp; \
653 ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \
654 vmov.u32 edge_shifts[1], temp; \
655 ldr edge_shift_alt, [ reciprocal_table_ptr, height_minor_b, lsl #2 ]; \
656 \
657 vsub.u32 widths, x_ends, x_starts; \
658 sub width_alt, x_c, start_c; \
659 \
660 vsub.u32 heights_b, heights, c_0x01; \
661 sub height_b_alt, height_minor_b, #1; \
662 \
7d5140f5
E
663 vshr.u32 height_reciprocals, edge_shifts, #10; \
664 lsr height_reciprocal_alt, edge_shift_alt, #10; \
75e28f62
E
665 \
666 vmla.s32 heights_b, x_starts, heights; \
667 mla height_b_alt, height_minor_b, start_c, height_b_alt; \
668 \
669 vbic.u16 edge_shifts, #0xE0; \
670 and edge_shift_alt, edge_shift_alt, #0x1F; \
671 \
672 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
673 mul edge_dx_dy_alt, width_alt, height_reciprocal_alt; \
674 \
675 vmull.s32 edges_xy, heights_b, height_reciprocals; \
676 smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt \
677
678
// Small building blocks selected by token pasting (##direction,
// ##alternate_active) from the larger setup_spans_* macros.

// setup_spans_adjust_y_up/down(): move the four packed row numbers in y_x4
// back / forward by c_0x0004, i.e. to the next group of four rows.
679#define setup_spans_adjust_y_up() \
680 vsub.u32 y_x4, y_x4, c_0x0004 \
681
682#define setup_spans_adjust_y_down() \
683 vadd.u32 y_x4, y_x4, c_0x0004 \
684
// setup_spans_adjust_interpolants_up/down(): step the uvrg vector and the
// scalar b by one span (subtract uvrg_dy / b_dy for up, add for down).
685#define setup_spans_adjust_interpolants_up() \
686 vsub.u32 uvrg, uvrg, uvrg_dy; \
687 sub b, b, b_dy \
688
689#define setup_spans_adjust_interpolants_down() \
690 vadd.u32 uvrg, uvrg, uvrg_dy; \
691 add b, b, b_dy \
692
693
// setup_spans_clip_interpolants_increment/decrement(): step uvrg and b by
// clip spans at once (v_clip must already hold clip in every lane).
694#define setup_spans_clip_interpolants_increment() \
695 mla b, b_dy, clip, b; \
696 vmla.s32 uvrg, uvrg_dy, v_clip \
697
698#define setup_spans_clip_interpolants_decrement() \
699 mls b, b_dy, clip, b; \
700 vmls.s32 uvrg, uvrg_dy, v_clip \
701
// setup_spans_clip_alternate_yes(): advance the 64-bit alternate edge x
// (edge_alt_low:edge_alt_high) by edge_dx_dy_alt * clip. The _no variant
// expands to nothing.
702#define setup_spans_clip_alternate_yes() \
703 smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip \
704
705#define setup_spans_clip_alternate_no() \
706
// setup_spans_clip(direction, alternate_active): skip clip rows that were
// clipped away. direction is increment or decrement and selects how the
// interpolants are stepped; alternate_active is yes or no. The edge x values
// in edges_xy always advance by edges_dx_dy * clip.
// v_clip is q3, which overlaps height_reciprocals (d6) and heights (d7), so
// those are overwritten here.
707#define setup_spans_clip(direction, alternate_active) \
708 vdup.u32 v_clip, clip; \
709 setup_spans_clip_alternate_##alternate_active(); \
710 setup_spans_clip_interpolants_##direction(); \
711 vmlal.s32 edges_xy, edges_dx_dy, v_clip_low \
712
713
// setup_spans_adjust_edges_alternate_no(left_index, right_index): converts
// the two vector edges into the form consumed by the set_x4 loops.
// left_index / right_index are the tokens left or right and pick which lane
// of edges_xy / edges_dx_dy_64 becomes the left and the right span edge.
//
// Both edges_xy and the sign-extended edges_dx_dy are shifted left by the
// per-edge edge_shifts. Afterwards, for each side:
//   <side>_x     = { x, x + dx_dy }   (x for two consecutive rows)
//   <side>_dx_dy = { 2 * dx_dy, 2 * dx_dy }
// so one vector add advances both rows by two rows.
// Overwrites edge_shifts_64 (q2) and edges_dx_dy_64 (q10).
714#define setup_spans_adjust_edges_alternate_no(left_index, right_index) \
715 vmovl.s32 edge_shifts_64, edge_shifts; \
716 vmovl.s32 edges_dx_dy_64, edges_dx_dy; \
717 \
718 vshl.s64 edges_xy, edges_xy, edge_shifts_64; \
719 vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64; \
720 \
721 vmov left_x_low, edges_xy_##left_index; \
722 vmov right_x_low, edges_xy_##right_index; \
723 \
724 vmov left_dx_dy_low, edges_dx_dy_64_##left_index; \
725 vmov left_dx_dy_high, edges_dx_dy_64_##left_index; \
726 vmov right_dx_dy_low, edges_dx_dy_64_##right_index; \
727 vmov right_dx_dy_high, edges_dx_dy_64_##right_index; \
728 \
729 vadd.u64 left_x_high, left_x_low, left_dx_dy_low; \
730 vadd.u64 right_x_high, right_x_low, right_dx_dy_low; \
731 \
732 vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy; \
733 vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy \
734
735
// setup_spans_adjust_edges_alternate_yes(left_index, right_index): does
// everything setup_spans_adjust_edges_alternate_no does and additionally
// moves the scalar alternate edge into NEON registers in the same form:
//   alternate_x     = { x, x + dx_dy }
//   alternate_dx_dy = { 2 * dx_dy, 2 * dx_dy }
// The 64-bit x in edge_alt_low:edge_alt_high is shifted left by
// edge_shift_alt with a three-instruction scalar sequence; edge_dx_dy_alt is
// sign-extended to 64 bits and shifted the same way.
// y_mid_point is set to y_b in every 16-bit lane for the later row select.
// r4 and r5 are reused for the widened dx_dy, so edge_alt_low/high are
// destroyed once they have been copied to alternate_x_low. Clobbers temp.
736#define setup_spans_adjust_edges_alternate_yes(left_index, right_index) \
737 setup_spans_adjust_edges_alternate_no(left_index, right_index); \
738 \
739 vdup.u16 y_mid_point, y_b; \
740 rsb temp, edge_shift_alt, #32; \
741 \
742 lsl edge_alt_high, edge_alt_high, edge_shift_alt; \
743 orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp; \
744 lsl edge_alt_low, edge_alt_low, edge_shift_alt; \
745 vmov alternate_x_low, edge_alt_low, edge_alt_high; \
746 \
747 asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp; \
748 lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt; \
749 vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high; \
750 vmov alternate_dx_dy_high, alternate_dx_dy_low; \
751 \
752 vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low; \
753 vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy \
754
755
// setup_spans_y_select_up/down(): build a per-row mask in alternate_select.
// A lane is all ones when its row number in y_x4 is below (up) or above
// (down) y_mid_point, compared as signed 16-bit values.
756#define setup_spans_y_select_up() \
757 vclt.s16 alternate_select, y_x4, y_mid_point \
758
759#define setup_spans_y_select_down() \
760 vcgt.s16 alternate_select, y_x4, y_mid_point \
761
762
// setup_spans_alternate_select_left/right(): for the rows flagged in
// alternate_select, replace the left (low half of left_right_x_16) or the
// right (high half) x with the alternate edge x.
763#define setup_spans_alternate_select_left() \
764 vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select \
765
766#define setup_spans_alternate_select_right() \
767 vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select \
768
769
// setup_spans_set_x4_alternate_yes(alternate, direction): emits four spans
// (rows) for a triangle that has an alternate edge. alternate is left or
// right and names the side whose x is taken from the alternate edge on the
// rows selected by setup_spans_y_select_<direction>; direction is up or
// down.
//
// Per invocation:
//  - the upper 32 bits of the 64-bit left, right and alternate x values are
//    taken for rows 0-1 and, after one step, rows 2-3, then narrowed to
//    16 bits;
//  - uvrg (16 bytes) and b (4 bytes) are stored for each of the four rows,
//    stepping the interpolants after every store;
//  - x values are clamped to [left_edge, right_edge];
//  - width = right - left, num_blocks = (width + 7) >> 3 and
//    mask = 0xFFFE << ((width + 7) & 7) are computed per row;
//  - vst4.u16 writes four 8-byte records { left_x, num_blocks, mask, y },
//    matching edge_data_left_x/num_blocks/right_mask/y offsets 0/2/4/6;
//  - y_x4 moves on by four rows.
// All three output pointers are post-incremented.
770#define setup_spans_set_x4_alternate_yes(alternate, direction) \
771 vshrn.s64 alternate_x_32_low, alternate_x, #32; \
772 vshrn.s64 left_x_32_low, left_x, #32; \
773 vshrn.s64 right_x_32_low, right_x, #32; \
774 \
775 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
776 vadd.u64 left_x, left_x, left_dx_dy; \
777 vadd.u64 right_x, right_x, right_dx_dy; \
778 \
779 vshrn.s64 alternate_x_32_high, alternate_x, #32; \
780 vshrn.s64 left_x_32_high, left_x, #32; \
781 vshrn.s64 right_x_32_high, right_x, #32; \
782 \
783 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
784 vadd.u64 left_x, left_x, left_dx_dy; \
785 vadd.u64 right_x, right_x, right_dx_dy; \
786 \
787 vmovn.u32 alternate_x_16, alternate_x_32; \
788 setup_spans_y_select_##direction(); \
789 vmovn.u32 left_right_x_16_low, left_x_32; \
790 \
791 vmovn.u32 left_right_x_16_high, right_x_32; \
792 setup_spans_alternate_select_##alternate(); \
793 \
794 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
795 str b, [ span_b_offset ], #4; \
796 setup_spans_adjust_interpolants_##direction(); \
797 \
798 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
799 \
800 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
801 str b, [ span_b_offset ], #4; \
802 setup_spans_adjust_interpolants_##direction(); \
803 \
804 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
805 \
806 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
807 str b, [ span_b_offset ], #4; \
808 setup_spans_adjust_interpolants_##direction(); \
809 \
810 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
811 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
812 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
813 \
814 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
815 str b, [ span_b_offset ], #4; \
816 setup_spans_adjust_interpolants_##direction(); \
817 \
818 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
819 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
820 \
821 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
822 \
823 setup_spans_adjust_y_##direction() \
824
825
// setup_spans_set_x4_alternate_no(alternate, direction): emits four spans
// (rows) for a triangle without an alternate edge. direction is up or down;
// the alternate argument is accepted for symmetry with the _yes variant but
// is not used.
//
// Per invocation:
//  - the upper 32 bits of the 64-bit left and right x values are taken for
//    rows 0-1 and, after one step, rows 2-3, then narrowed to 16 bits;
//  - uvrg (16 bytes) and b (4 bytes) are stored for each of the four rows,
//    stepping the interpolants after every store;
//  - x values are clamped to [left_edge, right_edge];
//  - width = right - left, num_blocks = (width + 7) >> 3 and
//    mask = 0xFFFE << ((width + 7) & 7) are computed per row;
//  - vst4.u16 writes four 8-byte records { left_x, num_blocks, mask, y },
//    matching edge_data_left_x/num_blocks/right_mask/y offsets 0/2/4/6;
//  - y_x4 moves on by four rows.
// All three output pointers are post-incremented.
826#define setup_spans_set_x4_alternate_no(alternate, direction) \
827 vshrn.s64 left_x_32_low, left_x, #32; \
828 vshrn.s64 right_x_32_low, right_x, #32; \
829 \
830 vadd.u64 left_x, left_x, left_dx_dy; \
831 vadd.u64 right_x, right_x, right_dx_dy; \
832 \
833 vshrn.s64 left_x_32_high, left_x, #32; \
834 vshrn.s64 right_x_32_high, right_x, #32; \
835 \
836 vadd.u64 left_x, left_x, left_dx_dy; \
837 vadd.u64 right_x, right_x, right_dx_dy; \
838 \
839 vmovn.u32 left_right_x_16_low, left_x_32; \
840 vmovn.u32 left_right_x_16_high, right_x_32; \
841 \
842 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
843 str b, [ span_b_offset ], #4; \
844 setup_spans_adjust_interpolants_##direction(); \
845 \
846 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
847 \
848 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
849 str b, [ span_b_offset ], #4; \
850 setup_spans_adjust_interpolants_##direction(); \
851 \
852 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
853 \
854 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
855 str b, [ span_b_offset ], #4; \
856 setup_spans_adjust_interpolants_##direction(); \
857 \
858 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
859 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
860 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
861 \
862 vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!; \
863 str b, [ span_b_offset ], #4; \
864 setup_spans_adjust_interpolants_##direction(); \
865 \
866 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
867 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
868 \
869 vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!; \
870 \
871 setup_spans_adjust_y_##direction() \
872
873
// Scratch pair for the 64-bit product below. These are r11/r12, the same
// registers as b/b_dy, so the adjustment has to run before
// setup_spans_load_b.
874#define edge_adjust_low r11
875#define edge_adjust_high r12
876
// setup_spans_alternate_adjust_yes(): subtracts edge_dx_dy_alt *
// height_minor_a (signed 64-bit product) from the 64-bit alternate edge x in
// edge_alt_low:edge_alt_high. The _no variant expands to nothing.
877#define setup_spans_alternate_adjust_yes() \
878 smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a; \
879 subs edge_alt_low, edge_alt_low, edge_adjust_low; \
880 sbc edge_alt_high, edge_alt_high, edge_adjust_high \
881
882#define setup_spans_alternate_adjust_no() \
883
884
// setup_spans_down(left_index, right_index, alternate, alternate_active):
// generates the span list for a triangle part walked from y_a towards
// increasing y.
//
//   left_index / right_index  left or right: which lane of the edge vectors
//                             feeds the left and the right span edge
//   alternate                 left or right: side that switches to the
//                             alternate edge (unused when alternate_active
//                             is no)
//   alternate_active          yes or no
//
// Steps:
//  1. Optionally rebase the alternate edge, then load b / b_dy.
//  2. If y_c is past viewport_end_y, height is reduced by
//     (y_c - viewport_end_y) and then increased by 1.
//  3. If y_a is before viewport_start_y, the clipped rows are skipped with
//     setup_spans_clip(increment, ...).
//  4. If no rows remain, branch to label 1 (nothing is written).
//  5. y_x4 is built as { y, y + 1, y + 2, y + 3 } in 16-bit lanes.
//  6. height is stored to psx_gpu->num_spans and the set_x4 loop (label 2)
//     runs in groups of four rows, so when height is not a multiple of four
//     up to three extra span records are written.
// NOTE(review): the viewport y fields are loaded with ldrsh here while
// setup_spans_up uses ldrh for the same fields - confirm the intended
// signedness.
885#define setup_spans_down(left_index, right_index, alternate, alternate_active) \
886 setup_spans_alternate_adjust_##alternate_active(); \
887 setup_spans_load_b(); \
888 \
889 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
890 subs y_c, y_c, temp; \
891 subgt height, height, y_c; \
892 addgt height, height, #1; \
893 \
894 ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
895 subs clip, temp, y_a; \
896 ble 0f; \
897 \
898 sub height, height, clip; \
899 add y_a, y_a, clip; \
900 setup_spans_clip(increment, alternate_active); \
901 \
902 0: \
903 cmp height, #0; \
904 ble 1f; \
905 \
906 orr temp, y_a, y_a, lsl #16; \
907 add temp, temp, #(1 << 16); \
908 add y_a, temp, #2; \
909 add y_a, y_a, #(2 << 16); \
ed0fd81d 910 vmov y_x4, temp, y_a; \
75e28f62
E
911 \
912 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
913 right_index); \
914 setup_spans_prologue_b(); \
915 \
916 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
917 \
918 2: \
919 setup_spans_set_x4_alternate_##alternate_active(alternate, down); \
920 subs height, height, #4; \
921 bhi 2b; \
922 \
923 1: \
924
925
// setup_spans_alternate_pre_increment_yes(): adds the sign-extended 32-bit
// edge_dx_dy_alt to the 64-bit alternate edge x (edge_alt_low:edge_alt_high),
// i.e. advances the alternate edge by one step. The _no variant expands to
// nothing.
926#define setup_spans_alternate_pre_increment_yes() \
927 adds edge_alt_low, edge_alt_low, edge_dx_dy_alt; \
928 adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31 \
929
930#define setup_spans_alternate_pre_increment_no() \
931
932
// setup_spans_up_decrement_yes(): conditional on the flags left by the
// preceding subs in setup_spans_up; drops one row from height when that
// result was <= 0. The _no variant expands to nothing.
933#define setup_spans_up_decrement_yes() \
934 suble height, height, #1 \
935
936#define setup_spans_up_decrement_no() \
937
938
// setup_spans_up(left_index, right_index, alternate, alternate_active):
// generates the span list for a triangle part walked from y_a - 1 towards
// decreasing y.
//
//   left_index / right_index  left or right: which lane of the edge vectors
//                             feeds the left and the right span edge
//   alternate                 left or right: side that switches to the
//                             alternate edge (unused when alternate_active
//                             is no)
//   alternate_active          yes or no
//
// Steps:
//  1. Optionally rebase the alternate edge, load b / b_dy, y_a -= 1.
//  2. If viewport_start_y is past y_c, height is reduced by the difference;
//     otherwise, with an active alternate edge, height is reduced by one
//     (setup_spans_up_decrement_yes uses the flags of the same subs).
//  3. If y_a is past viewport_end_y, the clipped rows are skipped with
//     setup_spans_clip(decrement, ...).
//  4. If no rows remain, branch to label 1 (nothing is written).
//  5. y_x4 is built as { y, y - 1, y - 2, y - 3 } in 16-bit lanes.
//  6. The edges (and the alternate edge, if active) are advanced by one
//     dx_dy step and the interpolants stepped up once before the loop.
//  7. height is stored to psx_gpu->num_spans and the set_x4 loop (label 2)
//     runs in groups of four rows, so when height is not a multiple of four
//     up to three extra span records are written.
939#define setup_spans_up(left_index, right_index, alternate, alternate_active) \
940 setup_spans_alternate_adjust_##alternate_active(); \
941 setup_spans_load_b(); \
942 sub y_a, y_a, #1; \
943 \
944 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]; \
945 subs temp, temp, y_c; \
946 subgt height, height, temp; \
947 setup_spans_up_decrement_##alternate_active(); \
948 \
949 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]; \
950 subs clip, y_a, temp; \
951 ble 0f; \
952 \
953 sub height, height, clip; \
954 sub y_a, y_a, clip; \
955 setup_spans_clip(decrement, alternate_active); \
956 \
957 0: \
958 cmp height, #0; \
959 ble 1f; \
960 \
961 orr temp, y_a, y_a, lsl #16; \
962 sub temp, temp, #(1 << 16); \
963 sub y_a, temp, #2; \
964 sub y_a, y_a, #(2 << 16); \
ed0fd81d 965 vmov y_x4, temp, y_a; \
75e28f62
E
966 \
967 vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
968 \
969 setup_spans_alternate_pre_increment_##alternate_active(); \
970 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
971 right_index); \
972 setup_spans_adjust_interpolants_up(); \
973 setup_spans_prologue_b(); \
974 \
975 strh height, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
976 \
977 2: \
978 setup_spans_set_x4_alternate_##alternate_active(alternate, up); \
979 subs height, height, #4; \
980 bhi 2b; \
981 \
982 1: \
983
984
// setup_spans_epilogue(): restores r4-r11 and returns by popping the saved
// lr into pc; the counterpart of the stmdb in setup_spans_prologue.
985#define setup_spans_epilogue() \
986 ldmia sp!, { r4 - r11, pc } \
987
988
// setup_spans_up_up(minor, major): body of the functions that walk upwards
// from v_a through v_b to v_c. Lane 0 of the edge vectors is the a-c edge
// (height_major rows) and lane 1 the a-b edge (height_minor_a rows); the
// b-c edge is the alternate edge. minor / major are the tokens left and
// right: minor names the side on which the a-b / b-c edge pair lies.
989#define setup_spans_up_up(minor, major) \
990 setup_spans_prologue(); \
991 sub height_minor_a, y_a, y_b; \
992 sub height_minor_b, y_b, y_c; \
993 sub height, y_a, y_c; \
994 \
995 vdup.u32 x_starts, x_a; \
ed0fd81d 996 vmov x_ends, x_c, x_b; \
75e28f62
E
997 \
998 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
999 setup_spans_up(major, minor, minor, yes); \
1000 setup_spans_epilogue() \
1001
// In: r0 = psx_gpu, r1 = v_a, r2 = v_b, r3 = v_c (see setup_spans_prologue).
// The two-edge side is on the left.
1002function(setup_spans_up_left)
1003 setup_spans_up_up(left, right)
1004
// Same as setup_spans_up_left with the two-edge side on the right.
1005function(setup_spans_up_right)
1006 setup_spans_up_up(right, left)
1007
75e28f62
E
// setup_spans_down_down(minor, major): body of the functions that walk
// downwards from v_a through v_b to v_c. Lane 0 of the edge vectors is the
// a-c edge (height_major rows) and lane 1 the a-b edge (height_minor_a
// rows); the b-c edge is the alternate edge. minor / major are the tokens
// left and right: minor names the side on which the a-b / b-c edge pair
// lies.
1008#define setup_spans_down_down(minor, major) \
1009 setup_spans_prologue(); \
1010 sub height_minor_a, y_b, y_a; \
1011 sub height_minor_b, y_c, y_b; \
1012 sub height, y_c, y_a; \
1013 \
1014 vdup.u32 x_starts, x_a; \
ed0fd81d 1015 vmov x_ends, x_c, x_b; \
75e28f62
E
1016 \
1017 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
1018 setup_spans_down(major, minor, minor, yes); \
1019 setup_spans_epilogue() \
1020
// In: r0 = psx_gpu, r1 = v_a, r2 = v_b, r3 = v_c (see setup_spans_prologue).
// The two-edge side is on the left.
1021function(setup_spans_down_left)
1022 setup_spans_down_down(left, right)
1023
// Same as setup_spans_down_left with the two-edge side on the right.
1024function(setup_spans_down_right)
1025 setup_spans_down_down(right, left)
1026
1027
// setup_spans_up_flat(): shared tail of setup_spans_up_a / setup_spans_up_b.
// Both edges cover the same height = y_a - y_c rows, so the two-edge setup
// is used and no alternate edge is active. Expects x_starts / x_ends to be
// set by the caller; lane 0 becomes the left edge and lane 1 the right edge.
1028#define setup_spans_up_flat() \
1029 sub height, y_a, y_c; \
1030 \
1031 compute_edge_delta_x2(); \
1032 setup_spans_up(left, right, none, no); \
1033 setup_spans_epilogue() \
1034
// In: r0 = psx_gpu, r1 = v_a, r2 = v_b, r3 = v_c.
// The edges start at x_a (left) and x_b (right) and both end at x_c.
1035function(setup_spans_up_a)
1036 setup_spans_prologue()
1037
ed0fd81d 1038 vmov x_starts, x_a, x_b
75e28f62
E
1039 vdup.u32 x_ends, x_c
1040
1041 setup_spans_up_flat()
1042
// In: r0 = psx_gpu, r1 = v_a, r2 = v_b, r3 = v_c.
// Both edges start at x_a and end at x_b (left) and x_c (right).
1043function(setup_spans_up_b)
1044 setup_spans_prologue()
1045
1046 vdup.u32 x_starts, x_a
ed0fd81d 1047 vmov x_ends, x_b, x_c
75e28f62
E
1048
1049 setup_spans_up_flat()
1050
// setup_spans_down_flat(): shared tail of setup_spans_down_a /
// setup_spans_down_b. Both edges cover the same height = y_c - y_a rows, so
// the two-edge setup is used and no alternate edge is active. Expects
// x_starts / x_ends to be set by the caller; lane 0 becomes the left edge
// and lane 1 the right edge.
1051#define setup_spans_down_flat() \
1052 sub height, y_c, y_a; \
1053 \
1054 compute_edge_delta_x2(); \
1055 setup_spans_down(left, right, none, no); \
1056 setup_spans_epilogue() \
1057
// In: r0 = psx_gpu, r1 = v_a, r2 = v_b, r3 = v_c.
// The edges start at x_a (left) and x_b (right) and both end at x_c.
1058function(setup_spans_down_a)
1059 setup_spans_prologue()
1060
ed0fd81d 1061 vmov x_starts, x_a, x_b
75e28f62
E
1062 vdup.u32 x_ends, x_c
1063
1064 setup_spans_down_flat()
1065
// In: r0 = psx_gpu, r1 = v_a, r2 = v_b, r3 = v_c.
// Both edges start at x_a and end at x_b (left) and x_c (right).
1066function(setup_spans_down_b)
1067 setup_spans_prologue()
1068
1069 vdup.u32 x_starts, x_a
ed0fd81d 1070 vmov x_ends, x_b, x_c
75e28f62
E
1071
1072 setup_spans_down_flat()
1073
1074
1075#define middle_y r9
1076
1077#define edges_xy_b q11
1078#define edges_dx_dy_b d26
1079#define edge_shifts_b d27
1080#define edges_dx_dy_and_shifts_b q13
1081#define height_increment d20
1082
1083#define edges_dx_dy_and_shifts q1
1084
1085#define edges_xy_b_left d22
1086#define edges_xy_b_right d23
1087
1088#define setup_spans_up_down_load_edge_set_b() \
1089 vmov edges_xy, edges_xy_b; \
1090 vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b \
1091
1092
/*
 * setup_spans_up_down: span setup for a triangle that is walked in two
 * halves starting from the row of vertex a: first upwards towards y_b,
 * then downwards towards y_c.
 *
 *   height_minor_a = y_a - y_b   rows in the upward half
 *   height_minor_b = y_c - y_a   rows in the downward half
 *   height_major   = y_c - y_b   rows covered by the long edge
 *
 * Outline:
 *   - compute the edge deltas, and build a second edge set (the "_b"
 *     registers) for the downward half while the first set is used
 *   - clip the upward half against the viewport and emit its spans
 *   - label 4: reload state, clip the downward half and emit its spans
 *   - label 3: entry used when the upward half is empty after clipping
 *   - label 5: handling for a total span count of exactly MAX_SPANS
 *
 * The span count is accumulated in psx_gpu at psx_gpu_num_spans_offset.
 * Helper macros used here are defined outside this part of the file.
 */
1093function(setup_spans_up_down)
1094 setup_spans_prologue()
1095
1096 // s32 middle_y = y_a;
1097 sub height_minor_a, y_a, y_b
1098 sub height_minor_b, y_c, y_a
1099 sub height_major, y_c, y_b
1100
ed0fd81d 1101 vmov x_starts, x_a, x_c
75e28f62
E
1102 vdup.u32 x_ends, x_b
1103
1104 compute_edge_delta_x3(x_a, height_minor_a, height_major)
1105
 /* height_increment = (0, height_minor_b): only the second lane of */
 /* edges_xy is advanced, by height_minor_b steps of its dx_dy. */
1106 mov temp, #0
ed0fd81d 1107 vmov height_increment, temp, height_minor_b
75e28f62
E
1108 vmlal.s32 edges_xy, edges_dx_dy, height_increment
1109
 /* Build edge set b: left edge from the alternate edge values, right */
 /* edge copied from the current right edge. */
1110 vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1111 vmov edges_xy_b_right, edges_xy_right
1112
 /* Shifts for set b: same as set a except lane 0, which takes the */
 /* alternate edge shift. */
1113 vmov edge_shifts_b, edge_shifts
1114 vmov.u32 edge_shifts_b[0], edge_shift_alt
1115
 /* Slopes for set b: negated set a slopes, with lane 0 replaced by the */
 /* alternate edge slope. */
1116 vneg.s32 edges_dx_dy_b, edges_dx_dy
1117 vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1118
 /* Remember the starting row, y_a is modified by the upward half. */
1119 mov middle_y, y_a
1120
1121 setup_spans_load_b()
 /* The upward half starts one row above middle_y. */
1122 sub y_a, y_a, #1
1123
 /* Drop the rows between y_b and viewport_start_y, if any. */
1124 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1125 subs temp, temp, y_b
1126 subgt height_minor_a, height_minor_a, temp
1127
 /* If the first upward row is past viewport_end_y, skip clip rows: */
 /* shorten the half, move y_a up and let setup_spans_clip step the */
 /* edges accordingly. */
1128 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1129 subs clip, y_a, temp
1130 ble 0f
1131
1132 sub height_minor_a, height_minor_a, clip
1133 sub y_a, y_a, clip
1134 setup_spans_clip(decrement, no)
1135
1136 0:
 /* Nothing left of the upward half: go straight to the downward half. */
1137 cmp height_minor_a, #0
1138 ble 3f
1139
 /* Pack four descending row numbers as 16-bit lanes into y_x4: */
 /* temp = (y, y - 1) and y_a = (y - 2, y - 3). */
1140 orr temp, y_a, y_a, lsl #16
1141 sub temp, temp, #(1 << 16)
1142 sub y_a, temp, #2
1143 sub y_a, y_a, #(2 << 16)
ed0fd81d 1144 vmov y_x4, temp, y_a
75e28f62
E
1145
1146 vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1147
 /* The span count so far is just the upward half. */
1148 strh height_minor_a, [ psx_gpu, #psx_gpu_num_spans_offset ]
1149
1150 setup_spans_adjust_edges_alternate_no(left, right);
1151 setup_spans_adjust_interpolants_up()
1152 setup_spans_up_down_load_edge_set_b()
1153
1154 setup_spans_prologue_b()
1155
1156
 /* Upward loop: the counter drops by four per pass, so the set_x4 */
 /* macro presumably emits four rows each time (macro not visible here). */
1157 2:
1158 setup_spans_set_x4_alternate_no(none, up)
1159 subs height_minor_a, height_minor_a, #4
1160 bhi 2b
1161
 /* height_minor_a is now zero or negative. Adding it, scaled by the */
 /* per-row strides of 8, 16 and 4 bytes, moves the three span output */
 /* pointers back over the rows written beyond the requested count. */
1162 add span_edge_data, span_edge_data, height_minor_a, lsl #3
1163 add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1164 add span_b_offset, span_b_offset, height_minor_a, lsl #2
1165
 /* Downward half: reload uvrg from psx_gpu and restart at middle_y. */
1166 4:
1167 add temp, psx_gpu, #psx_gpu_uvrg_offset
1168 vld1.32 { uvrg }, [ temp ]
1169 mov y_a, middle_y
1170
1171 setup_spans_load_b()
1172
 /* If y_c is past viewport_end_y, shorten the half so that it ends on */
 /* viewport_end_y (hence the + 1). y_c is overwritten with the excess. */
1173 ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1174 subs y_c, y_c, temp
1175 subgt height_minor_b, height_minor_b, y_c
1176 addgt height_minor_b, height_minor_b, #1
1177
 /* If the first downward row is before viewport_start_y, skip clip */
 /* rows: shorten the half, move y_a down and step the edges. */
1178 ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1179 subs clip, temp, y_a
1180 ble 0f
1181
1182 sub height_minor_b, height_minor_b, clip
1183 add y_a, y_a, clip
1184 setup_spans_clip(increment, no)
1185
1186 0:
 /* Nothing left of the downward half: finish. */
1187 cmp height_minor_b, #0
1188 ble 1f
1189
 /* Pack four ascending row numbers as 16-bit lanes into y_x4: */
 /* temp = (y, y + 1) and y_a = (y + 2, y + 3). */
1190 orr temp, y_a, y_a, lsl #16
1191 add temp, temp, #(1 << 16)
1192 add y_a, temp, #2
1193 add y_a, y_a, #(2 << 16)
ed0fd81d 1194 vmov y_x4, temp, y_a
75e28f62
E
1195
1196 setup_spans_adjust_edges_alternate_no(left, right)
1197
 /* New total = spans already stored + rows of the downward half. */
1198 ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1199 add temp, temp, height_minor_b
b7569147 1200
 /* A total of exactly MAX_SPANS is handled separately at label 5. */
1201 cmp temp, #MAX_SPANS
1202 beq 5f
1203
75e28f62
E
1204 strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1205
 /* Downward loop, same structure as the upward loop above. */
1206 2:
1207 setup_spans_set_x4_alternate_no(none, down)
1208 subs height_minor_b, height_minor_b, #4
1209 bhi 2b
1210
1211 1:
1212 setup_spans_epilogue()
1213
 /* Entry for an empty upward half: the edge set b switch and the */
 /* second prologue still have to happen before the downward half. */
1214 3:
1215 setup_spans_up_down_load_edge_set_b()
1216 setup_spans_prologue_b()
1217 bal 4b
1218
 /* Total would be exactly MAX_SPANS. The downward row count is rounded */
 /* down to a multiple of four and the stored total reduced to match, */
 /* presumably so the four-rows-per-pass loop cannot write past the */
 /* span arrays (see the FIXME - rows are dropped in this case). */
 /* bics sets the flags tested by bne below, the add and strh in */
 /* between do not change them: loop only if any rows remain. */
b7569147 1219 5:
1220 // FIXME: overflow corner case
1221 sub temp, temp, height_minor_b
1222 bics height_minor_b, #3
1223 add temp, temp, height_minor_b
1224 strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1225 bne 2b
1226 bal 1b
1227
75e28f62
E
1228#undef span_uvrg_offset
1229#undef span_edge_data
1230#undef span_b_offset
1231#undef left_x
1232#undef b
1233
1234#define psx_gpu r0
1235#define num_spans r1
1236#define span_uvrg_offset r2
1237#define span_edge_data r3
1238#define span_b_offset r4
1239#define b_dx r5
1240#define span_num_blocks r6
1241#define y r7
1242#define left_x r8
1243#define b r9
1244#define dither_offset_ptr r10
1245#define block_ptr_a r11
1246#define fb_ptr r12
1247#define num_blocks r14
1248
1249#define uvrg_dx_ptr r2
1250#define texture_mask_ptr r3
1251#define dither_shift r8
1252#define dither_row r10
1253
1254#define c_32 r7
1255#define b_dx4 r8
1256#define b_dx8 r9
1257#define block_ptr_b r10
1258
1259#define block_span_ptr r10
1260#define right_mask r8
1261
1262#define color r2
1263#define color_r r3
1264#define color_g r4
1265#define color_b r5
1266
1267#undef uvrg
1268
1269#define u_block q0
1270#define v_block q1
1271#define r_block q2
1272#define g_block q3
1273#define b_block q4
1274
1275#define uv_dx4 d10
1276#define rg_dx4 d11
1277#define uv_dx8 d12
1278#define rg_dx8 d13
1279#define b_whole_8 d14
1280#define fb_mask_ptrs d15
1281
1282#define uvrg_dx4 q5
1283#define uvrg_dx8 q6
1284#define uv_dx8 d12
1285#define rg_dx8 d13
1286
1287#define u_whole q8
1288#define v_whole q9
1289#define r_whole q10
1290#define g_whole q11
1291#define b_whole q12
1292
1293#define u_whole_low d16
1294#define u_whole_high d17
1295#define v_whole_low d18
1296#define v_whole_high d19
1297#define r_whole_low d20
1298#define r_whole_high d21
1299#define g_whole_low d22
1300#define g_whole_high d23
1301#define b_whole_low d24
1302#define b_whole_high d25
1303
1304#define dx4 q13
1305#define dx8 q13
1306
1307#define u_whole_8 d26
1308#define v_whole_8 d27
1309#define u_whole_8b d24
1310#define r_whole_8 d24
1311#define g_whole_8 d25
1312
1313#define uv_whole_8 q13
1314#define uv_whole_8b q14
1315
1316#define dither_offsets q14
1317#define texture_mask q15
1318#define texture_mask_u d30
1319#define texture_mask_v d31
1320
1321#define dither_offsets_short d28
1322
1323#define v_left_x q8
1324#define uvrg q9
1325#define block_span q10
1326
1327#define uv d18
1328#define rg d19
1329
1330#define draw_mask q1
1331#define draw_mask_edge q13
1332#define test_mask q0
1333
1334#define uvrg_dx q3
1335
1336#define colors q2
1337
/*
 * Texture coordinate post-processing hooks for the textured block
 * builders, selected by token pasting on the swizzling argument.
 *
 * swizzled: exchanges nibbles between the 8-bit u and v coordinates.
 *   u_whole_8b keeps a copy of u (masked with texture_mask_u), then
 *     u = (v << 4) | (u & 0x0F)
 *     v = (old u >> 4) | (v & 0xF0)
 *   Note that u_whole_8b is d24, the same register as r_whole_8, so it
 *   must be expanded before r_whole_8 is produced.
 *
 * unswizzled: expands to nothing, coordinates are used as they are.
 */
1338#define setup_blocks_texture_swizzled() \
1339 vand.u8 u_whole_8b, u_whole_8, texture_mask_u; \
1340 vsli.u8 u_whole_8, v_whole_8, #4; \
1341 vsri.u8 v_whole_8, u_whole_8b, #4 \
1342
1343#define setup_blocks_texture_unswizzled() \
1344
1345
/*
 * setup_blocks_shaded_textured_builder(swizzling)
 *
 * Expands to the routine
 *   setup_blocks_shaded_textured_dithered_<swizzling>_indirect
 * which takes psx_gpu in r0. For every span recorded in psx_gpu it
 * appends one 64-byte entry per 8-pixel block to the psx_gpu block
 * buffer. Layout of an entry, as produced by the stores below:
 *   +0   u and v coordinates, 8 pixels, byte interleaved
 *   +16  r (8 bytes) followed by g (8 bytes)
 *   +32  b (8 bytes) followed by fb_mask_ptrs (right mask, fb_ptr)
 *   +48  dither offsets, 8 lanes of 16 bits
 * block_ptr_a walks the +0 and +32 parts, block_ptr_b the +16 and +48
 * parts, each advancing by c_32 (32 bytes) per store.
 *
 * When adding a span would push num_blocks above MAX_BLOCKS, the buffer
 * is flushed through flush_render_block_buffer first (label 2).
 *
 * The instruction order is deliberate and fragile: several names alias
 * the same registers (dx4 and dx8 are q13, which is also u_whole_8 and
 * v_whole_8. r_whole_8 and g_whole_8 are d24 and d25, the two halves of
 * b_whole. uvrg_dx is q3, which is also g_block), so values are stored
 * or consumed just before their register is reused.
 */
1346#define setup_blocks_shaded_textured_builder(swizzling) \
1347.align 3; \
1348 \
1349function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \
1350 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1351 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1352 \
1353 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1354 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1355 \
 /* No spans: return before anything is pushed on the stack. */ \
1356 cmp num_spans, #0; \
1357 bxeq lr; \
1358 \
1359 stmdb sp!, { r4 - r11, r14 }; \
 /* Per-pixel deltas scaled by 4 and by 8 (half block, full block). */ \
1360 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1361 \
1362 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
1363 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1364 \
 /* Two consecutive bytes at texture_mask_ptr, each replicated to all */ \
 /* lanes: first byte to the u mask, second byte to the v mask. */ \
1365 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1366 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1367 \
1368 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1369 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1370 \
1371 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
1372 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1373 \
 /* First free entry: blocks base + num_blocks * 64. */ \
1374 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1375 \
 /* Label 0: start of one span. */ \
1376 0: \
1377 vmov.u8 fb_mask_ptrs, #0; \
1378 \
1379 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1380 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1381 \
1382 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 1383 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
1384 \
 /* Empty span: only advance the span pointers. */ \
1385 cmp span_num_blocks, #0; \
1386 beq 1f; \
1387 \
1388 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
1389 add num_blocks, span_num_blocks, num_blocks; \
1390 \
 /* Flush first if this span does not fit in the block buffer. */ \
1391 cmp num_blocks, #MAX_BLOCKS; \
1392 bgt 2f; \
1393 \
 /* Label 3: set up the first block of the span (also the re-entry */ \
 /* point after a flush). fb_ptr = vram + y * 2048 + left_x * 2. */ \
1394 3: \
1395 ldr b, [ span_b_offset ]; \
1396 add fb_ptr, fb_ptr, y, lsl #11; \
1397 \
1398 vdup.u32 v_left_x, left_x; \
1399 and y, y, #0x3; \
1400 \
 /* Dither table row is selected by the low two bits of y. */ \
1401 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1402 add fb_ptr, fb_ptr, left_x, lsl #1; \
1403 \
 /* Advance b from the span start value to left_x. */ \
1404 mla b, b_dx, left_x, b; \
1405 and dither_shift, left_x, #0x03; \
1406 \
 /* uvrg_dx shares q3 with g_block, so it is rebuilt from uvrg_dx4 */ \
 /* for every span before being used to advance uvrg to left_x. */ \
1407 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
1408 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1409 \
1410 mov dither_shift, dither_shift, lsl #3; \
1411 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1412 \
 /* The flags set by this subs are consumed by the beq 5f far below. */ \
 /* Nothing in between may set flags. */ \
1413 mov c_32, #32; \
1414 subs span_num_blocks, span_num_blocks, #1; \
1415 \
 /* Rotate the dither row by (left_x & 3) bytes. */ \
1416 mov dither_row, dither_row, ror dither_shift; \
1417 mov b_dx4, b_dx, lsl #2; \
1418 \
1419 vdup.u32 dither_offsets_short, dither_row; \
1420 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1421 \
 /* Dither bytes are sign extended to 16 bits and scaled by 16. */ \
1422 vdup.u32 b_block, b; \
1423 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1424 \
1425 vdup.u32 u_block, uv[0]; \
1426 mov b_dx8, b_dx, lsl #3; \
1427 \
1428 vdup.u32 v_block, uv[1]; \
1429 vdup.u32 r_block, rg[0]; \
1430 vdup.u32 g_block, rg[1]; \
1431 \
 /* Five consecutive 16-byte vectors starting at the u block span are */ \
 /* added to the u, v, r, g and b start values in turn, giving the */ \
 /* values of the first four pixels of the block. */ \
1432 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1433 \
1434 vadd.u32 u_block, u_block, block_span; \
1435 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1436 \
1437 vadd.u32 v_block, v_block, block_span; \
1438 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1439 \
1440 vadd.u32 r_block, r_block, block_span; \
1441 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1442 \
1443 vadd.u32 g_block, g_block, block_span; \
1444 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
1445 \
1446 vadd.u32 b_block, b_block, block_span; \
1447 add block_ptr_b, block_ptr_a, #16; \
1448 \
 /* Low halves: upper 16 bits of pixels 0 to 3. */ \
1449 vshrn.u32 u_whole_low, u_block, #16; \
1450 vshrn.u32 v_whole_low, v_block, #16; \
1451 vshrn.u32 r_whole_low, r_block, #16; \
1452 vshrn.u32 g_whole_low, g_block, #16; \
1453 \
1454 vdup.u32 dx4, uv_dx4[0]; \
1455 vshrn.u32 b_whole_low, b_block, #16; \
1456 \
 /* High halves: upper 16 bits of (value + 4 * delta), pixels 4 to 7. */ \
1457 vaddhn.u32 u_whole_high, u_block, dx4; \
1458 vdup.u32 dx4, uv_dx4[1]; \
1459 \
1460 vaddhn.u32 v_whole_high, v_block, dx4; \
1461 vdup.u32 dx4, rg_dx4[0]; \
1462 \
1463 vaddhn.u32 r_whole_high, r_block, dx4; \
1464 vdup.u32 dx4, rg_dx4[1]; \
1465 \
1466 vaddhn.u32 g_whole_high, g_block, dx4; \
1467 vdup.u32 dx4, b_dx4; \
1468 \
1469 vaddhn.u32 b_whole_high, b_block, dx4; \
1470 vdup.u32 dx8, uv_dx8[0]; \
1471 \
 /* Step every component by 8 * delta, ready for the next block. */ \
1472 vadd.u32 u_block, u_block, dx8; \
1473 vdup.u32 dx8, uv_dx8[1]; \
1474 \
1475 vadd.u32 v_block, v_block, dx8; \
1476 vdup.u32 dx8, rg_dx8[0]; \
1477 \
1478 vadd.u32 r_block, r_block, dx8; \
1479 vdup.u32 dx8, rg_dx8[1]; \
1480 \
1481 vadd.u32 g_block, g_block, dx8; \
1482 vdup.u32 dx8, b_dx8; \
1483 \
1484 vadd.u32 b_block, b_block, dx8; \
 /* Narrow the 16-bit results to one byte per pixel. */ \
1485 vmovn.u16 u_whole_8, u_whole; \
1486 \
1487 vmovn.u16 v_whole_8, v_whole; \
1488 \
1489 vmovn.u16 b_whole_8, b_whole; \
1490 pld [ fb_ptr ]; \
1491 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1492 \
1493 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1494 setup_blocks_texture_##swizzling(); \
1495 \
 /* Single-block span: the only block is the masked last block. */ \
1496 vmovn.u16 r_whole_8, r_whole; \
1497 beq 5f; \
1498 \
 /* Label 4: store the current (unmasked) block while computing the */ \
 /* next one. fb_mask_ptrs lane 0 is still zero here. */ \
1499 4: \
1500 vmovn.u16 g_whole_8, g_whole; \
1501 vshrn.u32 u_whole_low, u_block, #16; \
1502 \
1503 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1504 vshrn.u32 v_whole_low, v_block, #16; \
1505 \
1506 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1507 vshrn.u32 r_whole_low, r_block, #16; \
1508 \
1509 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1510 vshrn.u32 g_whole_low, g_block, #16; \
1511 \
1512 vdup.u32 dx4, uv_dx4[0]; \
1513 vshrn.u32 b_whole_low, b_block, #16; \
1514 \
1515 vaddhn.u32 u_whole_high, u_block, dx4; \
1516 vdup.u32 dx4, uv_dx4[1]; \
1517 \
1518 vaddhn.u32 v_whole_high, v_block, dx4; \
1519 vdup.u32 dx4, rg_dx4[0]; \
1520 \
1521 vaddhn.u32 r_whole_high, r_block, dx4; \
1522 vdup.u32 dx4, rg_dx4[1]; \
1523 \
1524 vaddhn.u32 g_whole_high, g_block, dx4; \
1525 vdup.u32 dx4, b_dx4; \
1526 \
1527 vaddhn.u32 b_whole_high, b_block, dx4; \
1528 vdup.u32 dx8, uv_dx8[0]; \
1529 \
1530 vadd.u32 u_block, u_block, dx8; \
1531 vdup.u32 dx8, uv_dx8[1]; \
1532 \
1533 vadd.u32 v_block, v_block, dx8; \
1534 vdup.u32 dx8, rg_dx8[0]; \
1535 \
1536 vadd.u32 r_block, r_block, dx8; \
1537 vdup.u32 dx8, rg_dx8[1]; \
1538 \
1539 vadd.u32 g_block, g_block, dx8; \
1540 vdup.u32 dx8, b_dx8; \
1541 \
1542 vadd.u32 b_block, b_block, dx8; \
1543 vmovn.u16 u_whole_8, u_whole; \
1544 \
 /* Next block starts 8 pixels (16 bytes) further in the framebuffer. */ \
1545 add fb_ptr, fb_ptr, #16; \
1546 vmovn.u16 v_whole_8, v_whole; \
1547 \
1548 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1549 vmovn.u16 b_whole_8, b_whole; \
1550 \
1551 pld [ fb_ptr ]; \
1552 \
1553 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1554 subs span_num_blocks, span_num_blocks, #1; \
1555 \
1556 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1557 setup_blocks_texture_##swizzling(); \
1558 \
1559 vmovn.u16 r_whole_8, r_whole; \
1560 bne 4b; \
1561 \
 /* Label 5: last block of the span. right_mask goes into lane 0 of */ \
 /* fb_mask_ptrs, and is expanded into a per-pixel draw_mask by */ \
 /* testing it against the vector at the start of psx_gpu. The uv */ \
 /* pairs of pixels selected by draw_mask are cleared before storing. */ \
1562 5: \
1563 vmovn.u16 g_whole_8, g_whole; \
1564 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1565 \
1566 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1567 vdup.u8 draw_mask, right_mask; \
1568 \
1569 vmov.u32 fb_mask_ptrs[0], right_mask; \
1570 vtst.u16 draw_mask, draw_mask, test_mask; \
1571 vzip.u8 u_whole_8, v_whole_8; \
1572 \
1573 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1574 vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32; \
1575 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1576 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1577 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1578 \
 /* Label 1: advance to the next span and publish num_blocks. */ \
1579 1: \
1580 add span_uvrg_offset, span_uvrg_offset, #16; \
1581 add span_b_offset, span_b_offset, #4; \
1582 \
1583 add span_edge_data, span_edge_data, #8; \
1584 subs num_spans, num_spans, #1; \
1585 \
1586 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1587 bne 0b; \
1588 \
1589 ldmia sp!, { r4 - r11, pc }; \
1590 \
 /* Label 2: block buffer full. Preserve the NEON values and core */ \
 /* registers that are still needed, flush, then restart the buffer */ \
 /* with only the blocks of the current span. Values held in r4 to */ \
 /* r11 are assumed to survive the call - confirm against the callee. */ \
 /* NOTE(review): 9 + 8 + 6 words are on the stack at the bl unless */ \
 /* EXTRA_UNSAVED_REGS adds more, which is an odd count - confirm that */ \
 /* the callee tolerates a stack that is not 8-byte aligned. */ \
1591 2: \
1592 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1593 vpush { texture_mask }; \
1594 vpush { uvrg_dx4 }; \
1595 \
4d646738 1596 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62 1597 bl flush_render_block_buffer; \
4d646738 1598 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62
E
1599 \
1600 vpop { uvrg_dx4 }; \
1601 vpop { texture_mask }; \
1602 \
 /* uvrg_dx8 was not saved: rebuild it as 2 * uvrg_dx4. */ \
1603 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1604 vmov.u8 fb_mask_ptrs, #0; \
1605 \
1606 mov num_blocks, span_num_blocks; \
1607 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1608 bal 3b \
1609
1610
1611setup_blocks_shaded_textured_builder(swizzled)
1612setup_blocks_shaded_textured_builder(unswizzled)
1613
1614
/*
 * setup_blocks_unshaded_textured_builder(swizzling)
 *
 * Expands to the routine
 *   setup_blocks_unshaded_textured_dithered_<swizzling>_indirect
 * which takes psx_gpu in r0. It is the texture-only counterpart of the
 * shaded textured builder: only u and v are interpolated. For every
 * span it appends one 64-byte entry per 8-pixel block to the psx_gpu
 * block buffer. Parts written by this routine:
 *   +0   u and v coordinates, 8 pixels, byte interleaved
 *   +32  b_whole_8 (8 bytes) followed by fb_mask_ptrs (right mask, fb_ptr)
 *   +48  dither offsets, 8 lanes of 16 bits
 * The +16 part is skipped by stepping block_ptr_b by 32 without a store.
 * NOTE(review): b_whole_8 is stored but never written in this routine,
 * so those 8 bytes hold whatever d14 contained - presumably ignored by
 * the consumer, confirm.
 *
 * When adding a span would push num_blocks above MAX_BLOCKS, the buffer
 * is flushed through flush_render_block_buffer first (label 2).
 *
 * dx4 and dx8 are q13, which is also u_whole_8 and v_whole_8, so the
 * uv store in the loop must stay ahead of the next vdup into dx4.
 */
1615#define setup_blocks_unshaded_textured_builder(swizzling) \
1616.align 3; \
1617 \
1618function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \
1619 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
1620 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1621 \
1622 vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ]; \
1623 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1624 \
 /* No spans: return before anything is pushed on the stack. */ \
1625 cmp num_spans, #0; \
1626 bxeq lr; \
1627 \
1628 stmdb sp!, { r4 - r11, r14 }; \
 /* Per-pixel deltas scaled by 4 and by 8 (half block, full block). */ \
1629 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1630 \
1631 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1632 \
 /* Two consecutive bytes at texture_mask_ptr, each replicated to all */ \
 /* lanes: first byte to the u mask, second byte to the v mask. */ \
1633 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ]; \
1634 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1635 \
1636 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1637 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1638 \
1639 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1640 \
 /* First free entry: blocks base + num_blocks * 64. */ \
1641 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1642 \
 /* Label 0: start of one span. */ \
1643 0: \
1644 vmov.u8 fb_mask_ptrs, #0; \
1645 \
1646 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
1647 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1648 \
1649 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 1650 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
1651 \
 /* Empty span: only advance the span pointers. */ \
1652 cmp span_num_blocks, #0; \
1653 beq 1f; \
1654 \
1655 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
1656 add num_blocks, span_num_blocks, num_blocks; \
1657 \
 /* Flush first if this span does not fit in the block buffer. */ \
1658 cmp num_blocks, #MAX_BLOCKS; \
1659 bgt 2f; \
1660 \
 /* Label 3: set up the first block of the span (also the re-entry */ \
 /* point after a flush). fb_ptr = vram + y * 2048 + left_x * 2. */ \
1661 3: \
1662 add fb_ptr, fb_ptr, y, lsl #11; \
1663 \
1664 vdup.u32 v_left_x, left_x; \
1665 and y, y, #0x3; \
1666 \
 /* Dither table row is selected by the low two bits of y. */ \
1667 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
1668 add fb_ptr, fb_ptr, left_x, lsl #1; \
1669 \
1670 and dither_shift, left_x, #0x03; \
1671 \
 /* uvrg_dx is rebuilt from uvrg_dx4 for every span, then used to */ \
 /* advance uvrg from the span start value to left_x. */ \
1672 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
1673 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1674 \
1675 mov dither_shift, dither_shift, lsl #3; \
1676 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1677 \
 /* The flags set by this subs are consumed by the beq 5f far below. */ \
 /* Nothing in between may set flags. */ \
1678 mov c_32, #32; \
1679 subs span_num_blocks, span_num_blocks, #1; \
1680 \
 /* Rotate the dither row by (left_x & 3) bytes. */ \
1681 mov dither_row, dither_row, ror dither_shift; \
1682 \
1683 vdup.u32 dither_offsets_short, dither_row; \
1684 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1685 \
 /* Dither bytes are sign extended to 16 bits and scaled by 16. */ \
1686 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1687 \
1688 vdup.u32 u_block, uv[0]; \
1689 \
1690 vdup.u32 v_block, uv[1]; \
 /* Two consecutive 16-byte vectors starting at the u block span are */ \
 /* added to the u and v start values, giving pixels 0 to 3. */ \
1691 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1692 \
1693 vadd.u32 u_block, u_block, block_span; \
1694 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
1695 \
1696 vadd.u32 v_block, v_block, block_span; \
1697 add block_ptr_b, block_ptr_a, #16; \
1698 \
 /* Low halves: upper 16 bits of pixels 0 to 3. */ \
1699 vshrn.u32 u_whole_low, u_block, #16; \
1700 vshrn.u32 v_whole_low, v_block, #16; \
1701 \
1702 vdup.u32 dx4, uv_dx4[0]; \
1703 \
 /* High halves: upper 16 bits of (value + 4 * delta), pixels 4 to 7. */ \
1704 vaddhn.u32 u_whole_high, u_block, dx4; \
1705 vdup.u32 dx4, uv_dx4[1]; \
1706 \
1707 vaddhn.u32 v_whole_high, v_block, dx4; \
1708 vdup.u32 dx8, uv_dx8[0]; \
1709 \
 /* Step u and v by 8 * delta, ready for the next block. */ \
1710 vadd.u32 u_block, u_block, dx8; \
1711 vdup.u32 dx8, uv_dx8[1]; \
1712 \
1713 vadd.u32 v_block, v_block, dx8; \
1714 vmovn.u16 u_whole_8, u_whole; \
1715 \
1716 vmovn.u16 v_whole_8, v_whole; \
1717 \
1718 pld [ fb_ptr ]; \
1719 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1720 \
1721 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1722 setup_blocks_texture_##swizzling(); \
1723 \
 /* Single-block span: the only block is the masked last block. */ \
1724 beq 5f; \
1725 \
 /* Label 4: store the current (unmasked) block while computing the */ \
 /* next one. fb_mask_ptrs lane 0 is still zero here. */ \
1726 4: \
1727 vshrn.u32 u_whole_low, u_block, #16; \
1728 \
1729 vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1730 vshrn.u32 v_whole_low, v_block, #16; \
1731 \
 /* Skip the +16 part of the entry, nothing is stored there. */ \
1732 add block_ptr_b, block_ptr_b, #32; \
1733 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1734 \
1735 vdup.u32 dx4, uv_dx4[0]; \
1736 vaddhn.u32 u_whole_high, u_block, dx4; \
1737 vdup.u32 dx4, uv_dx4[1]; \
1738 \
1739 vaddhn.u32 v_whole_high, v_block, dx4; \
1740 vdup.u32 dx8, uv_dx8[0]; \
1741 \
1742 vadd.u32 u_block, u_block, dx8; \
1743 vdup.u32 dx8, uv_dx8[1]; \
1744 \
1745 vadd.u32 v_block, v_block, dx8; \
1746 vmovn.u16 u_whole_8, u_whole; \
1747 \
 /* Next block starts 8 pixels (16 bytes) further in the framebuffer. */ \
1748 add fb_ptr, fb_ptr, #16; \
1749 vmovn.u16 v_whole_8, v_whole; \
1750 \
1751 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1752 pld [ fb_ptr ]; \
1753 \
1754 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1755 subs span_num_blocks, span_num_blocks, #1; \
1756 \
1757 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1758 setup_blocks_texture_##swizzling(); \
1759 \
1760 bne 4b; \
1761 \
 /* Label 5: last block of the span. right_mask goes into lane 0 of */ \
 /* fb_mask_ptrs, and is expanded into a per-pixel draw_mask by */ \
 /* testing it against the vector at the start of psx_gpu. The uv */ \
 /* pairs of pixels selected by draw_mask are cleared before storing. */ \
1762 5: \
1763 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
1764 \
1765 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
1766 vdup.u8 draw_mask, right_mask; \
1767 \
1768 vmov.u32 fb_mask_ptrs[0], right_mask; \
1769 vtst.u16 draw_mask, draw_mask, test_mask; \
1770 vzip.u8 u_whole_8, v_whole_8; \
1771 \
1772 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1773 add block_ptr_b, block_ptr_b, #32; \
1774 vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32; \
1775 vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32; \
1776 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32; \
1777 \
 /* Label 1: advance to the next span and publish num_blocks. */ \
1778 1: \
1779 add span_uvrg_offset, span_uvrg_offset, #16; \
1780 add span_edge_data, span_edge_data, #8; \
1781 subs num_spans, num_spans, #1; \
1782 \
1783 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
1784 bne 0b; \
1785 \
1786 ldmia sp!, { r4 - r11, pc }; \
1787 \
 /* Label 2: block buffer full. Preserve the NEON values and core */ \
 /* registers that are still needed, flush, then restart the buffer */ \
 /* with only the blocks of the current span. Values held in r4 to */ \
 /* r11 are assumed to survive the call - confirm against the callee. */ \
 /* NOTE(review): 9 + 8 + 6 words are on the stack at the bl unless */ \
 /* EXTRA_UNSAVED_REGS adds more, which is an odd count - confirm that */ \
 /* the callee tolerates a stack that is not 8-byte aligned. */ \
1788 2: \
1789 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1790 vpush { texture_mask }; \
1791 vpush { uvrg_dx4 }; \
1792 \
4d646738 1793 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62 1794 bl flush_render_block_buffer; \
4d646738 1795 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62
E
1796 \
1797 vpop { uvrg_dx4 }; \
1798 vpop { texture_mask }; \
1799 \
 /* uvrg_dx8 was not saved: rebuild it as 2 * uvrg_dx4. */ \
1800 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1801 vmov.u8 fb_mask_ptrs, #0; \
1802 \
1803 mov num_blocks, span_num_blocks; \
1804 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1805 bal 3b \
1806
1807
1808setup_blocks_unshaded_textured_builder(swizzled)
1809setup_blocks_unshaded_textured_builder(unswizzled)
1810
1811
1812.align 3
1813
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * Takes psx_gpu in r0. Flat-colour block setup: for every span recorded
 * in psx_gpu it appends one 64-byte entry per 8-pixel block to the
 * psx_gpu block buffer. Parts written by this routine:
 *   +0   draw mask, 8 lanes of 16 bits (all zero except for the last
 *        block of a span, where it is derived from right_mask)
 *   +16  the colour, replicated into 8 lanes of 16 bits
 *   +32  b_whole_8 (8 bytes) followed by fb_mask_ptrs
 * The +48 part is not written.
 * NOTE(review): b_whole_8 and lane 0 of fb_mask_ptrs are never written
 * in this routine, only lane 1 (fb_ptr) is - presumably the consumer
 * ignores the rest, confirm.
 *
 * The colour is built from psx_gpu_triangle_color by taking the top five
 * bits of each of its three low bytes: byte 0 lands in bits 0 to 4,
 * byte 1 in bits 5 to 9 and byte 2 in bits 10 to 14.
 *
 * When adding a span would push num_blocks above MAX_BLOCKS, the buffer
 * is flushed through flush_render_block_buffer first (label 2).
 */
1814function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1815 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1816 veor.u32 draw_mask, draw_mask, draw_mask
1817
 /* No spans: return before anything is pushed on the stack. */
1818 cmp num_spans, #0
1819 bxeq lr
1820
1821 stmdb sp!, { r4 - r11, r14 }
1822 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1823
1824 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1825
 /* Reduce each 8-bit component to its top five bits. */
1826 ubfx color_r, color, #3, #5
1827 ubfx color_g, color, #11, #5
1828 ubfx color_b, color, #19, #5
1829
1830 orr color, color_r, color_b, lsl #10
1831 orr color, color, color_g, lsl #5
1832
1833 vdup.u16 colors, color
1834
1835 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1836 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1837
 /* First free entry: blocks base + num_blocks * 64. */
1838 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1839 add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1840
 /* Label 0: start of one span. */
1841 0:
1842 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1843 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1844
c1817bd9 1845 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
1846
 /* Empty span: only advance the span pointer. */
1847 cmp span_num_blocks, #0
1848 beq 1f
1849
1850 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1851 add num_blocks, span_num_blocks, num_blocks
1852
 /* Flush first if this span does not fit in the block buffer. */
1853 cmp num_blocks, #MAX_BLOCKS
1854 bgt 2f
1855
 /* Label 3: first block of the span (also the re-entry point after a */
 /* flush). fb_ptr = vram + y * 2048 + left_x * 2. */
 /* NOTE(review): the masked y below is never read, r7 is overwritten */
 /* by c_32 two instructions later. */
1856 3:
1857 add fb_ptr, fb_ptr, y, lsl #11
1858 and y, y, #0x3
1859
1860 add fb_ptr, fb_ptr, left_x, lsl #1
1861 mov c_32, #32
1862
 /* The flags set by this subs are consumed by the beq 5f below. */
1863 subs span_num_blocks, span_num_blocks, #1
1864
1865 add block_ptr_b, block_ptr_a, #16
1866 pld [ fb_ptr ]
1867
1868 vmov.u32 fb_mask_ptrs[1], fb_ptr
1869 beq 5f
1870
 /* Label 4: full blocks, stored with an all-zero draw mask. */
1871 4:
1872 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_32
1873 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1874 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1875
 /* Next block: 8 pixels (16 bytes) further in the framebuffer, and */
 /* block_ptr_b steps over the unused +48 part of the entry. */
1876 add fb_ptr, fb_ptr, #16
1877 add block_ptr_b, block_ptr_b, #32
1878
1879 pld [ fb_ptr ]
1880
1881 vmov.u32 fb_mask_ptrs[1], fb_ptr
1882 subs span_num_blocks, span_num_blocks, #1
1883
1884 bne 4b
1885
 /* Label 5: last block of the span. Its draw mask is right_mask */
 /* expanded per pixel by testing it against test_mask. */
1886 5:
1887 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1888
1889 vdup.u8 draw_mask_edge, right_mask
1890 vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1891
1892 vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1893 vst1.u32 { draw_mask_edge }, [ block_ptr_a, :128 ], c_32
1894 add block_ptr_b, block_ptr_b, #32
1895 vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1896
 /* Label 1: advance to the next span and publish num_blocks. */
1897 1:
1898 add span_edge_data, span_edge_data, #8
1899 subs num_spans, num_spans, #1
1900
1901 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1902 bne 0b
1903
1904 ldmia sp!, { r4 - r11, pc }
1905
 /* Label 2: block buffer full. Save colors and the core registers that */
 /* are still needed, flush, then restart the buffer with only the */
 /* blocks of the current span. Values held in r4 to r11 are assumed to */
 /* survive the call - confirm against the callee. */
 /* NOTE(review): 9 + 4 + 6 words are on the stack at the bl unless */
 /* EXTRA_UNSAVED_REGS adds more, which is an odd count - confirm that */
 /* the callee tolerates a stack that is not 8-byte aligned. */
1906 2:
1907 vpush { colors }
1908
4d646738 1909 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 1910 bl flush_render_block_buffer
4d646738 1911 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62
E
1912
1913 vpop { colors }
1914
 /* test_mask and draw_mask were not saved: reload and re-zero them. */
1915 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1916 veor.u32 draw_mask, draw_mask, draw_mask
1917
1918 mov num_blocks, span_num_blocks
1919 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1920 bal 3b
1921
1922
1923#define mask_msb_scalar r14
1924
1925#define msb_mask q15
1926
1927#define pixels_low d16
1928
1929#define msb_mask_low d30
1930#define msb_mask_high d31
1931
1932
1933.align 3
1934
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * Takes psx_gpu in r0. Flat-colour fill that bypasses the block buffer:
 * every span recorded in psx_gpu is written straight to the memory at
 * psx_gpu_vram_out_ptr, at byte offset y * 2048 + left_x * 2.
 *
 * The 16-bit pixel is the five-bit-per-component colour built from
 * psx_gpu_triangle_color, ORed with the halfword at psx_gpu_mask_msb.
 * colors holds it in 8 lanes for whole blocks. The core register color
 * holds it twice (low and high halfword) for the partial last block.
 *
 * All blocks but the last of a span are written as 8 pixels. The last
 * one is controlled by the low byte of right_mask (label 3).
 */
1935function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1936 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1937
 /* No spans: return before anything is pushed on the stack. */
1938 cmp num_spans, #0
1939 bxeq lr
1940
1941 stmdb sp!, { r4 - r11, r14 }
1942
1943 ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1944
 /* Reduce each 8-bit component to its top five bits. */
1945 ubfx color_r, color, #3, #5
1946 ubfx color_g, color, #11, #5
1947
1948 ldrh mask_msb_scalar, [ psx_gpu, #psx_gpu_mask_msb_offset ]
1949 ubfx color_b, color, #19, #5
1950
1951 orr color, color_r, color_b, lsl #10
1952 orr color, color, color_g, lsl #5
1953 orr color, color, mask_msb_scalar
1954
1955 vdup.u16 colors, color
1956
1957 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
 /* Two copies of the pixel in one word, for the 32-bit stores below. */
ed0fd81d 1958 orr color, color, color, lsl #16
3867c6ef 1959
75e28f62
E
1960
 /* Label 0: start of one span. */
1961 0:
1962 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1963 ldrh y, [ span_edge_data, #edge_data_y_offset ]
1964
c1817bd9 1965 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
1966
 /* Empty span: nothing to write. */
1967 cmp span_num_blocks, #0
1968 beq 1f
1969
1970 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1971
1972 add fb_ptr, fb_ptr, y, lsl #11
 /* Flags from this subs are consumed by the beq 3f below: a span with */
 /* a single block goes straight to the last-block handling. */
1973 subs span_num_blocks, span_num_blocks, #1
1974
1975 add fb_ptr, fb_ptr, left_x, lsl #1
1976 beq 3f
1977
 /* Label 2: whole blocks, 8 pixels (16 bytes) per store. */
1978 2:
1979 vst1.u32 { colors }, [ fb_ptr ]!
1980 subs span_num_blocks, span_num_blocks, #1
1981
1982 bne 2b
1983
 /* Label 3: last block. A zero mask byte means all 8 pixels are */
 /* written (label 5). Otherwise pixels are written in groups while the */
 /* corresponding low mask bits are clear: 4 pixels if the low nibble */
 /* is clear, then 2 if the next two bits are clear, then 1 if the next */
 /* bit is clear. The mask is shifted down after each group written. */
1984 3:
1985 ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
75e28f62 1986
3867c6ef
E
1987 cmp right_mask, #0x0
1988 beq 5f
1989
 /* The moveq does not change the flags, so both streq use this tst. */
1990 tst right_mask, #0xF
1991 streq color, [ fb_ptr ], #4
1992 moveq right_mask, right_mask, lsr #4
1993 streq color, [ fb_ptr ], #4
1994
1995 tst right_mask, #0x3
1996 streq color, [ fb_ptr ], #4
1997 moveq right_mask, right_mask, lsr #2
1998
1999 tst right_mask, #0x1
ed0fd81d 2000 strheq color, [ fb_ptr ]
75e28f62
E
2001
 /* Label 1: advance to the next span. */
2002 1:
2003 add span_edge_data, span_edge_data, #8
2004 subs num_spans, num_spans, #1
75e28f62
E
2005 bne 0b
2006
2007 ldmia sp!, { r4 - r11, pc }
2008
3867c6ef
E
 /* Label 5: unmasked last block, write all 8 pixels. */
2009 5:
2010 vst1.u32 { colors }, [ fb_ptr ]
2011 bal 1b
75e28f62
E
2012
2013
2014#undef c_64
2015
2016#define c_64 r7
2017#define rg_dx_ptr r2
2018
2019
2020#undef r_block
2021#undef g_block
2022#undef b_block
2023#undef r_whole
2024#undef g_whole
2025#undef b_whole
2026#undef r_whole_low
2027#undef r_whole_high
2028#undef g_whole_low
2029#undef g_whole_high
2030#undef b_whole_low
2031#undef b_whole_high
2032#undef r_whole_8
2033#undef g_whole_8
2034#undef b_whole_8
2035#undef dither_offsets
2036#undef rg_dx4
2037#undef rg_dx8
2038#undef dx4
2039#undef dx8
2040#undef v_left_x
2041#undef uvrg
2042#undef block_span
2043#undef rg
2044#undef draw_mask
2045#undef test_mask
2046
2047#define r_block q0
2048#define g_block q1
2049#define b_block q2
2050
2051#define r_whole q3
2052#define g_whole q4
2053#define b_whole q5
2054
2055#define r_whole_low d6
2056#define r_whole_high d7
2057#define g_whole_low d8
2058#define g_whole_high d9
2059#define b_whole_low d10
2060#define b_whole_high d11
2061
2062#define gb_whole_8 q6
2063
2064#define g_whole_8 d12
2065#define b_whole_8 d13
2066
2067#define r_whole_8 d14
2068
2069#define pixels q8
2070
2071#define rg_dx4 d18
2072#define rg_dx8 d19
2073
2074#define dx4 q10
2075#define dx8 q10
2076
2077#define v_left_x d6
2078#define uvrg q4
2079#define block_span q5
2080
2081#define rg d9
2082
2083#define d64_1 d22
2084#define d64_128 d23
2085
2086#define d128_4 q12
2087#define d128_0x7 q13
2088
2089#define d64_4 d24
2090
2091#define dither_offsets q14
2092#define draw_mask q15
2093
2094#define dither_offsets_low d28
2095
2096#define rg_dx d0
2097#define test_mask q10
2098
2099
/*
 * Dither hooks for the shaded untextured block builders, selected by
 * token pasting on the dithering argument.
 *
 * dithered, step a: saturating unsigned add of the dither offsets to
 *   the 8-bit r values and to the combined g and b values.
 * dithered, step b: saturating unsigned subtract of d64_4 / d128_4 from
 *   the same values. The builder that uses these macros loads d128_4
 *   with 4 in every byte and adds d128_4 to the dither offsets, so the
 *   two steps together apply the signed offset with clamping at both
 *   ends of the 0 to 255 range.
 * undithered, steps a and b: expand to nothing.
 */
2100#define setup_blocks_shaded_untextured_dither_a_dithered() \
2101 vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low; \
2102 vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets; \
2103
2104#define setup_blocks_shaded_untextured_dither_b_dithered() \
2105 vqsub.u8 r_whole_8, r_whole_8, d64_4; \
2106 vqsub.u8 gb_whole_8, gb_whole_8, d128_4 \
2107
2108#define setup_blocks_shaded_untextured_dither_a_undithered() \
2109
2110#define setup_blocks_shaded_untextured_dither_b_undithered() \
2111
2112
2113#define setup_blocks_shaded_untextured_indirect_builder(dithering) \
2114.align 3; \
2115 \
2116function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \
2117 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2118 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2119 \
2120 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2121 \
2122 cmp num_spans, #0; \
2123 bxeq lr; \
2124 \
2125 stmdb sp!, { r4 - r11, r14 }; \
2126 vshl.u32 rg_dx4, rg_dx, #2; \
2127 \
2128 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2129 vshl.u32 rg_dx8, rg_dx, #3; \
2130 \
2131 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2132 \
2133 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2134 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2135 \
2136 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2137 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2138 \
2139 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
2140 vmov.u8 d64_1, #1; \
2141 \
2142 vmov.u8 d128_4, #4; \
2143 vmov.u8 d64_128, #128; \
2144 \
2145 vmov.u8 d128_0x7, #0x7; \
2146 \
2147 0: \
2148 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2149 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2150 \
2151 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 2152 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
2153 \
2154 cmp span_num_blocks, #0; \
2155 beq 1f; \
2156 \
2157 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
2158 add num_blocks, span_num_blocks, num_blocks; \
2159 \
2160 cmp num_blocks, #MAX_BLOCKS; \
2161 bgt 2f; \
2162 \
2163 3: \
2164 ldr b, [ span_b_offset ]; \
2165 add fb_ptr, fb_ptr, y, lsl #11; \
2166 \
2167 vdup.u32 v_left_x, left_x; \
2168 and y, y, #0x3; \
2169 \
2170 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2171 add fb_ptr, fb_ptr, left_x, lsl #1; \
2172 \
2173 mla b, b_dx, left_x, b; \
2174 and dither_shift, left_x, #0x03; \
2175 \
2176 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
2177 vshr.u32 rg_dx, rg_dx4, #2; \
2178 \
2179 mov dither_shift, dither_shift, lsl #3; \
2180 vmla.u32 rg, rg_dx, v_left_x; \
2181 \
2182 mov c_64, #64; \
2183 subs span_num_blocks, span_num_blocks, #1; \
2184 \
2185 mov dither_row, dither_row, ror dither_shift; \
2186 mov b_dx4, b_dx, lsl #2; \
2187 \
2188 vdup.u32 dither_offsets, dither_row; \
2189 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2190 \
2191 vdup.u32 b_block, b; \
2192 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2193 \
2194 mov b_dx8, b_dx, lsl #3; \
2195 vdup.u32 r_block, rg[0]; \
2196 vdup.u32 g_block, rg[1]; \
2197 \
2198 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2199 \
2200 vadd.u32 r_block, r_block, block_span; \
2201 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2202 \
2203 vadd.u32 g_block, g_block, block_span; \
2204 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2205 \
2206 vadd.u32 b_block, b_block, block_span; \
2207 add block_ptr_b, block_ptr_a, #16; \
2208 \
2209 vshrn.u32 r_whole_low, r_block, #16; \
2210 vshrn.u32 g_whole_low, g_block, #16; \
2211 vshrn.u32 b_whole_low, b_block, #16; \
2212 vdup.u32 dx4, rg_dx4[0]; \
2213 \
2214 vaddhn.u32 r_whole_high, r_block, dx4; \
2215 vdup.u32 dx4, rg_dx4[1]; \
2216 \
2217 vaddhn.u32 g_whole_high, g_block, dx4; \
2218 vdup.u32 dx4, b_dx4; \
2219 \
2220 vaddhn.u32 b_whole_high, b_block, dx4; \
2221 vdup.u32 dx8, rg_dx8[0]; \
2222 \
2223 vadd.u32 r_block, r_block, dx8; \
2224 vdup.u32 dx8, rg_dx8[1]; \
2225 \
2226 vadd.u32 g_block, g_block, dx8; \
2227 vdup.u32 dx8, b_dx8; \
2228 \
2229 vadd.u32 b_block, b_block, dx8; \
2230 \
2231 vmovn.u16 r_whole_8, r_whole; \
2232 vmovn.u16 g_whole_8, g_whole; \
2233 vmovn.u16 b_whole_8, b_whole; \
2234 \
2235 beq 5f; \
2236 veor.u32 draw_mask, draw_mask, draw_mask; \
2237 \
2238 4: \
2239 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2240 vshrn.u32 r_whole_low, r_block, #16; \
2241 \
2242 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2243 vshrn.u32 g_whole_low, g_block, #16; \
2244 \
2245 vshrn.u32 b_whole_low, b_block, #16; \
2246 str fb_ptr, [ block_ptr_a, #44 ]; \
2247 \
2248 vdup.u32 dx4, rg_dx4[0]; \
2249 vshr.u8 r_whole_8, r_whole_8, #3; \
2250 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2251 \
2252 vaddhn.u32 r_whole_high, r_block, dx4; \
2253 vdup.u32 dx4, rg_dx4[1]; \
2254 \
2255 vaddhn.u32 g_whole_high, g_block, dx4; \
2256 vdup.u32 dx4, b_dx4; \
2257 \
2258 vaddhn.u32 b_whole_high, b_block, dx4; \
2259 vdup.u32 dx8, rg_dx8[0]; \
2260 \
2261 vmull.u8 pixels, r_whole_8, d64_1; \
2262 vmlal.u8 pixels, g_whole_8, d64_4; \
2263 vmlal.u8 pixels, b_whole_8, d64_128; \
2264 \
2265 vadd.u32 r_block, r_block, dx8; \
2266 vdup.u32 dx8, rg_dx8[1]; \
2267 \
2268 vadd.u32 g_block, g_block, dx8; \
2269 vdup.u32 dx8, b_dx8; \
2270 \
2271 vadd.u32 b_block, b_block, dx8; \
2272 add fb_ptr, fb_ptr, #16; \
2273 \
2274 vmovn.u16 r_whole_8, r_whole; \
2275 vmovn.u16 g_whole_8, g_whole; \
2276 vmovn.u16 b_whole_8, b_whole; \
2277 \
2278 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2279 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2280 \
2281 pld [ fb_ptr ]; \
2282 \
2283 subs span_num_blocks, span_num_blocks, #1; \
2284 bne 4b; \
2285 \
2286 5: \
2287 str fb_ptr, [ block_ptr_a, #44 ]; \
2288 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2289 \
2290 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
2291 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2292 \
2293 vshr.u8 r_whole_8, r_whole_8, #3; \
2294 vdup.u8 draw_mask, right_mask; \
2295 \
2296 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2297 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
2298 \
2299 vtst.u16 draw_mask, draw_mask, test_mask; \
2300 \
2301 vmull.u8 pixels, r_whole_8, d64_1; \
2302 vmlal.u8 pixels, g_whole_8, d64_4; \
2303 vmlal.u8 pixels, b_whole_8, d64_128; \
2304 \
2305 vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64; \
2306 vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64; \
2307 \
2308 1: \
2309 add span_uvrg_offset, span_uvrg_offset, #16; \
2310 add span_b_offset, span_b_offset, #4; \
2311 \
2312 add span_edge_data, span_edge_data, #8; \
2313 subs num_spans, num_spans, #1; \
2314 \
2315 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
2316 bne 0b; \
2317 \
2318 ldmia sp!, { r4 - r11, pc }; \
2319 \
2320 2: \
2321 /* TODO: Load from psx_gpu instead of saving/restoring these */\
2322 vpush { rg_dx4 }; \
2323 \
4d646738 2324 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62 2325 bl flush_render_block_buffer; \
4d646738 2326 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62
E
2327 \
2328 vpop { rg_dx4 }; \
2329 \
2330 vmov.u8 d64_1, #1; \
2331 vmov.u8 d128_4, #4; \
2332 vmov.u8 d64_128, #128; \
2333 vmov.u8 d128_0x7, #0x7; \
2334 \
2335 vadd.u32 rg_dx8, rg_dx4, rg_dx4; \
2336 \
2337 mov num_blocks, span_num_blocks; \
2338 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2339 bal 3b \
2340
2341
/*
 * Instantiate the indirect shaded/untextured span setup once per dithering
 * mode.  The argument is token-pasted into the names of the
 * setup_blocks_shaded_untextured_dither_a_ / _b_ helper macros.
 */
2342setup_blocks_shaded_untextured_indirect_builder(undithered)
2343setup_blocks_shaded_untextured_indirect_builder(dithered)
2344
2345
/*
 * Register aliases re-bound for the "direct" span setup that follows.
 */
2346#undef draw_mask
2347
/* r14 is lr; the direct builder only uses this alias after lr was pushed. */
2348#define mask_msb_ptr r14
2349
2350#define draw_mask q0
/* d16 / d17 are the low / high 64-bit halves of q8; they are used for the
 * partial stores of the last block of a span (presumably `pixels` is q8,
 * its definition is outside this part of the file - TODO confirm). */
2351#define pixels_low d16
3867c6ef 2352#define pixels_high d17
75e28f62
E
2353
2354
2355
/*
 * setup_blocks_shaded_untextured_direct_builder(dithering)
 *
 * Expands to setup_blocks_shaded_untextured_<dithering>_unswizzled_direct.
 * In: r0 = psx_gpu.
 *
 * For every span (num_spans of them) the routine interpolates r, g and b
 * across the span, eight pixels per block, packs them to 16 bits with the
 * mask MSB OR-ed in, and stores them straight to
 *   vram_out_ptr + y * 2048 + left_x * 2
 * instead of filling the block buffer.
 *
 * Local labels:
 *   0:      per-span loop head
 *   1:      advance to the next span / function exit
 *   2:      loop over the full 8-pixel blocks of a span
 *   3:      last block of a span (partial store selected by right_mask)
 *   4:-11:  store 1..8 pixels of the last block
 *   100:    anchor for the jump table entries
 *
 * Returns immediately (without touching the stack) when num_spans is 0;
 * otherwise r4-r11 and lr are saved and restored.
 */
2356#define setup_blocks_shaded_untextured_direct_builder(dithering) \
2357.align 3; \
2358 \
2359function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \
2360 ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]; \
2361 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2362 \
2363 vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ]; \
2364 \
2365 cmp num_spans, #0; \
2366 bxeq lr; /* no spans: leave before anything is pushed */ \
2367 \
2368 stmdb sp!, { r4 - r11, r14 }; \
2369 vshl.u32 rg_dx4, rg_dx, #2; /* r/g step for 4 pixels */ \
2370 \
2371 ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ]; \
2372 vshl.u32 rg_dx8, rg_dx, #3; /* r/g step for 8 pixels */ \
2373 \
2374 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2375 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2376 \
2377 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2378 vmov.u8 d64_1, #1; \
2379 \
2380 vmov.u8 d128_4, #4; \
2381 vmov.u8 d64_128, #128; \
2382 \
2383 vmov.u8 d128_0x7, #0x7; \
2384 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; /* r14: lr is already saved */ \
2385 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; /* replicate the 16-bit mask_msb into all lanes */ \
2386 \
2387 0: \
2388 ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]; \
2389 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2390 \
2391 ldrh y, [ span_edge_data, #edge_data_y_offset ]; \
c1817bd9 2392 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
75e28f62
E
2393 \
2394 cmp span_num_blocks, #0; \
2395 beq 1f; /* empty span: skip to the next one */ \
2396 \
2397 ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]; \
2398 add fb_ptr, fb_ptr, y, lsl #11; /* + y * 2048 bytes */ \
2399 \
2400 ldr b, [ span_b_offset ]; \
2401 vdup.u32 v_left_x, left_x; \
2402 and y, y, #0x3; /* dither row index = y & 3 */ \
2403 \
2404 ldr dither_row, [ dither_offset_ptr, y, lsl #2 ]; \
2405 add fb_ptr, fb_ptr, left_x, lsl #1; /* + left_x * 2 bytes */ \
2406 \
2407 mla b, b_dx, left_x, b; /* b += b_dx * left_x */ \
2408 and dither_shift, left_x, #0x03; \
2409 \
2410 vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ]; \
2411 vshr.u32 rg_dx, rg_dx4, #2; /* rebuild rg_dx from rg_dx4 (presumably its register gets reused below - TODO confirm) */ \
2412 \
2413 mov dither_shift, dither_shift, lsl #3; /* (left_x & 3) * 8 bits */ \
2414 vmla.u32 rg, rg_dx, v_left_x; /* rg += rg_dx * left_x */ \
2415 \
2416 subs span_num_blocks, span_num_blocks, #1; /* Z is consumed by "beq 3f" below; nothing in between sets flags */ \
2417 \
2418 mov dither_row, dither_row, ror dither_shift; \
2419 mov b_dx4, b_dx, lsl #2; \
2420 \
2421 vdup.u32 dither_offsets, dither_row; \
2422 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2423 \
2424 vdup.u32 b_block, b; \
2425 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2426 \
2427 mov b_dx8, b_dx, lsl #3; \
2428 vdup.u32 r_block, rg[0]; \
2429 vdup.u32 g_block, rg[1]; \
2430 \
2431 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2432 \
2433 vadd.u32 r_block, r_block, block_span; \
2434 vld1.u32 { block_span }, [ block_span_ptr, :128 ]!; \
2435 \
2436 vadd.u32 g_block, g_block, block_span; \
2437 vld1.u32 { block_span }, [ block_span_ptr, :128 ]; \
2438 \
2439 vadd.u32 b_block, b_block, block_span; \
2440 add block_ptr_b, block_ptr_a, #16; /* NOTE(review): neither block_ptr_a nor block_ptr_b is used anywhere else in this direct variant */ \
2441 \
2442 vshrn.u32 r_whole_low, r_block, #16; /* integer part of pixels 0-3 */ \
2443 vshrn.u32 g_whole_low, g_block, #16; \
2444 vshrn.u32 b_whole_low, b_block, #16; \
2445 vdup.u32 dx4, rg_dx4[0]; \
2446 \
2447 vaddhn.u32 r_whole_high, r_block, dx4; /* integer part of pixels 4-7: high half of (block + dx4) */ \
2448 vdup.u32 dx4, rg_dx4[1]; \
2449 \
2450 vaddhn.u32 g_whole_high, g_block, dx4; \
2451 vdup.u32 dx4, b_dx4; \
2452 \
2453 vaddhn.u32 b_whole_high, b_block, dx4; \
2454 vdup.u32 dx8, rg_dx8[0]; \
2455 \
2456 vadd.u32 r_block, r_block, dx8; /* advance to the next block */ \
2457 vdup.u32 dx8, rg_dx8[1]; \
2458 \
2459 vadd.u32 g_block, g_block, dx8; \
2460 vdup.u32 dx8, b_dx8; \
2461 \
2462 vadd.u32 b_block, b_block, dx8; \
2463 \
2464 vmovn.u16 r_whole_8, r_whole; \
2465 vmovn.u16 g_whole_8, g_whole; \
2466 vmovn.u16 b_whole_8, b_whole; \
2467 \
2468 beq 3f; /* span has exactly one block */ \
2469 \
2470 2: \
2471 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2472 vshrn.u32 r_whole_low, r_block, #16; \
2473 \
2474 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2475 vshrn.u32 g_whole_low, g_block, #16; \
2476 \
2477 vshrn.u32 b_whole_low, b_block, #16; \
2478 \
2479 vdup.u32 dx4, rg_dx4[0]; \
2480 vshr.u8 r_whole_8, r_whole_8, #3; /* r: 8 -> 5 bits */ \
2481 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; /* g, b: clear the low 3 bits */ \
2482 \
2483 vaddhn.u32 r_whole_high, r_block, dx4; \
2484 vdup.u32 dx4, rg_dx4[1]; \
2485 \
2486 vmov pixels, msb_mask; /* start from the mask MSB, colours are accumulated on top */ \
2487 vaddhn.u32 g_whole_high, g_block, dx4; \
2488 vdup.u32 dx4, b_dx4; \
2489 \
2490 vaddhn.u32 b_whole_high, b_block, dx4; \
2491 vdup.u32 dx8, rg_dx8[0]; \
2492 \
2493 vmlal.u8 pixels, r_whole_8, d64_1; /* r      -> bits 0-4   */ \
2494 vmlal.u8 pixels, g_whole_8, d64_4; /* g * 4  -> bits 5-9   */ \
2495 vmlal.u8 pixels, b_whole_8, d64_128; /* b * 128 -> bits 10-14 */ \
2496 \
2497 vadd.u32 r_block, r_block, dx8; \
2498 vdup.u32 dx8, rg_dx8[1]; \
2499 \
2500 vadd.u32 g_block, g_block, dx8; \
2501 vdup.u32 dx8, b_dx8; \
2502 \
2503 vadd.u32 b_block, b_block, dx8; \
2504 \
2505 vmovn.u16 r_whole_8, r_whole; \
2506 vmovn.u16 g_whole_8, g_whole; \
2507 vmovn.u16 b_whole_8, b_whole; \
2508 \
2509 vst1.u32 { pixels }, [ fb_ptr ]!; /* 8 pixels, fb_ptr += 16 */ \
2510 subs span_num_blocks, span_num_blocks, #1; \
2511 bne 2b; \
2512 \
2513 3: \
2514 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2515 \
3867c6ef 2516 ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]; \
75e28f62
E
2517 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2518 \
2519 vshr.u8 r_whole_8, r_whole_8, #3; \
3867c6ef 2520 rbit right_mask, right_mask; /* rbit + clz below = count of trailing zero bits */ \
75e28f62
E
2521 vmov pixels, msb_mask; \
2522 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
3867c6ef 2523 clz right_mask, right_mask; \
75e28f62
E
2524 \
2525 vmlal.u8 pixels, r_whole_8, d64_1; \
2526 vmlal.u8 pixels, g_whole_8, d64_4; \
2527 vmlal.u8 pixels, b_whole_8, d64_128; \
2528 \
8184d7c5 2529 JT_OP_REL(100f, right_mask, temp); /* JT_OP_REL / JT_OP / JTE are defined outside this part of the file */ \
2530 JT_OP(ldr pc, [ pc, right_mask, lsl #2 ]); /* dispatch on the trailing-zero count; presumably 1..8 selects 4f..11f with the nops as padding - TODO confirm */ \
3867c6ef 2531 nop; \
8184d7c5 2532 100: \
3867c6ef 2533 nop; \
8184d7c5 2534 .word JTE(100b, 4f); \
2535 .word JTE(100b, 5f); \
2536 .word JTE(100b, 6f); \
2537 .word JTE(100b, 7f); \
2538 .word JTE(100b, 8f); \
2539 .word JTE(100b, 9f); \
2540 .word JTE(100b, 10f); \
2541 .word JTE(100b, 11f); \
3867c6ef 2542 \
75e28f62 2543 4: \
3867c6ef
E
2544 vst1.u16 { pixels_low[0] }, [ fb_ptr ]; /* 1 pixel */ \
2545 bal 1f; \
2546 \
2547 5: \
2548 vst1.u32 { pixels_low[0] }, [ fb_ptr ]; /* 2 pixels */ \
2549 bal 1f; \
2550 \
2551 6: \
2552 vst1.u32 { pixels_low[0] }, [ fb_ptr ]!; /* 3 pixels: 2 + 1 */ \
2553 vst1.u16 { pixels_low[2] }, [ fb_ptr ]; \
2554 bal 1f; \
2555 \
2556 7: \
2557 vst1.u32 { pixels_low }, [ fb_ptr ]; /* 4 pixels */ \
2558 bal 1f; \
2559 \
2560 8: \
2561 vst1.u32 { pixels_low }, [ fb_ptr ]!; /* 5 pixels: 4 + 1 */ \
2562 vst1.u16 { pixels_high[0] }, [ fb_ptr ]; \
2563 bal 1f; \
2564 \
2565 9: \
2566 vst1.u32 { pixels_low }, [ fb_ptr ]!; /* 6 pixels: 4 + 2 */ \
2567 vst1.u32 { pixels_high[0] }, [ fb_ptr ]!; \
2568 bal 1f; \
2569 \
2570 10: \
2571 vst1.u32 { pixels_low }, [ fb_ptr ]!; /* 7 pixels: 4 + 2 + 1 */ \
2572 vst1.u32 { pixels_high[0] }, [ fb_ptr ]!; \
2573 vst1.u16 { pixels_high[2] }, [ fb_ptr ]; \
2574 bal 1f; \
2575 \
2576 11: \
2577 vst1.u32 { pixels }, [ fb_ptr ]; /* all 8 pixels */ \
2578 bal 1f; \
75e28f62
E
2579 \
2580 1: \
2581 add span_uvrg_offset, span_uvrg_offset, #16; /* next span: 16 bytes of uvrg, 4 of b, 8 of edge data */ \
2582 add span_b_offset, span_b_offset, #4; \
2583 \
2584 add span_edge_data, span_edge_data, #8; \
2585 subs num_spans, num_spans, #1; \
2586 \
2587 bne 0b; \
2588 \
2589 ldmia sp!, { r4 - r11, pc } \
2590
2591setup_blocks_shaded_untextured_direct_builder(undithered)
2592setup_blocks_shaded_untextured_direct_builder(dithered)
2593
2594
/*
 * Register aliases for texture_blocks_untextured / _4bpp / _8bpp.
 * Several names deliberately share one register; the notes below say
 * which ones, so that instruction reordering does not break a live value.
 */
2595#undef psx_gpu
2596#undef num_blocks
2597#undef triangle
2598#undef c_64
2599
2600#define psx_gpu r0
2601#define block_ptr r1
2602#define num_blocks r2
/* uv_NM holds two packed 16-bit uv values as loaded by ldm. */
2603#define uv_01 r3
2604#define uv_23 r4
2605#define uv_45 r5
2606#define uv_67 r6
/* Per-texel addresses.  The odd ones reuse the register of their packed
 * pair (uv_1 = uv_01 = r3, ...), so the even address has to be formed
 * before the odd one overwrites the pair. */
2607#define uv_0 r7
2608#define uv_1 r3
2609#define uv_2 r8
2610#define uv_3 r4
2611#define uv_4 r9
2612#define uv_5 r5
2613#define uv_6 r10
2614#define uv_7 r6
2615#define texture_ptr r11
2616
/* Each fetched texel lands in the register that held its address. */
2617#define pixel_0 r7
2618#define pixel_1 r3
2619#define pixel_2 r8
2620#define pixel_3 r4
2621#define pixel_4 r9
2622#define pixel_5 r5
2623#define pixel_6 r10
2624#define pixel_7 r6
2625
/* Packed results; same registers as pixel_0 / pixel_4 / pixel_2 / pixel_6. */
2626#define pixels_a r7
2627#define pixels_b r9
2628#define pixels_c r8
2629#define pixels_d r10
2630
/* c_64 shares r0 with psx_gpu: once it is set, psx_gpu is gone. */
2631#define c_64 r0
2632
2633#define clut_ptr r12
/* Share r5 / r6 with uv_45 / uv_67; only used before the block loop. */
2634#define current_texture_mask r5
2635#define dirty_textures_mask r6
2636
2637#define texels d0
2638
/* d2,d3 are the halves of q1 (clut_a); d4,d5 the halves of q2 (clut_b). */
2639#define clut_low_a d2
2640#define clut_low_b d3
2641#define clut_high_a d4
2642#define clut_high_b d5
2643
2644#define clut_a q1
2645#define clut_b q2
2646
2647#define texels_low d6
2648#define texels_high d7
2649
/*
 * texture_blocks_untextured: texture stage for untextured primitives.
 * Does nothing and returns to the caller; no register is modified.
 */
2650.align 3
2651
2652function(texture_blocks_untextured)
2653 bx lr
2654
2655
/*
 * texture_blocks_4bpp(psx_gpu)
 *
 * In:  r0 = psx_gpu
 * Out: nothing; r0-r2, r12 and d0-d7 are clobbered, r3-r11 are preserved.
 *
 * For each of num_blocks 64-byte blocks starting at
 * psx_gpu + psx_gpu_blocks_offset, the eight packed 16-bit uv values at the
 * start of the block are replaced by eight 16-bit colours:
 *   - each uv is added (zero-extended) to texture_page_ptr and one byte is
 *     fetched from that address (the CLUT index of the texel),
 *   - the index is translated through the 16-entry CLUT with two vtbl
 *     lookups (low bytes / high bytes), and the halves are re-interleaved
 *     by vst2.u8.
 *
 * If dirty_textures_4bpp_mask & current_texture_mask is non-zero,
 * update_texture_4bpp_cache(psx_gpu) is called first.
 *
 * The loop decrements num_blocks before testing it, so it runs at least
 * once; callers are expected to pass num_blocks >= 1.
 */
.align 3

function(texture_blocks_4bpp)
  stmdb sp!, { r3 - r11, r14 }
  add block_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
  /* 16 CLUT entries x 16 bits = 32 bytes into q1:q2. */
  vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]

  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
  /* De-interleave: clut_a = low byte of every entry, clut_b = high byte. */
  vuzp.u8 clut_a, clut_b

  ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
  tst dirty_textures_mask, current_texture_mask

  /* r0 must still be psx_gpu when the slow path is taken, so c_64 (also r0)
   * is only set on the fall-through path. */
  bne 1f
  mov c_64, #64

0:
  ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }

  /* Even address first: the odd one overwrites the packed pair register. */
  uxtah uv_0, texture_ptr, uv_01
  uxtah uv_1, texture_ptr, uv_01, ror #16

  uxtah uv_2, texture_ptr, uv_23
  uxtah uv_3, texture_ptr, uv_23, ror #16

  uxtah uv_4, texture_ptr, uv_45
  ldrb pixel_0, [ uv_0 ]

  uxtah uv_5, texture_ptr, uv_45, ror #16
  ldrb pixel_1, [ uv_1 ]

  uxtah uv_6, texture_ptr, uv_67
  ldrb pixel_2, [ uv_2 ]

  uxtah uv_7, texture_ptr, uv_67, ror #16
  ldrb pixel_3, [ uv_3 ]

  ldrb pixel_4, [ uv_4 ]
  /* Flags from this subs feed the bne at the bottom of the loop; nothing
   * in between may set flags. */
  subs num_blocks, num_blocks, #1

  ldrb pixel_5, [ uv_5 ]
  orr pixels_a, pixel_0, pixel_1, lsl #8

  ldrb pixel_6, [ uv_6 ]
  orr pixels_b, pixel_4, pixel_5, lsl #8

  ldrb pixel_7, [ uv_7 ]
  orr pixels_a, pixels_a, pixel_2, lsl #16

  orr pixels_b, pixels_b, pixel_6, lsl #16
  orr pixels_a, pixels_a, pixel_3, lsl #24

  orr pixels_b, pixels_b, pixel_7, lsl #24
  /* texels = eight CLUT indices, one per byte. */
  vmov texels, pixels_a, pixels_b

  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels

  /* Interleave low/high bytes back into eight 16-bit pixels; next block. */
  vst2.u8 { texels_low, texels_high }, [ block_ptr, :128 ], c_64
  bne 0b

  ldmia sp!, { r3 - r11, pc }

1:
  /* Slow path: the texture cache is stale.  r1/r2 are live across the call,
   * and so is the CLUT in q1/q2.  d0-d7 are caller-saved under the AAPCS,
   * so the callee is free to overwrite them; spill the CLUT around the
   * call.  The pushes (8 + 32 bytes on top of the 40-byte prologue push)
   * keep sp 8-byte aligned at the call. */
  stmdb sp!, { r1 - r2 }
  vpush { clut_a, clut_b }

  bl update_texture_4bpp_cache

  vpop { clut_a, clut_b }
  mov c_64, #64
  ldmia sp!, { r1 - r2 }
  bal 0b
2731
2732
/*
 * texture_blocks_8bpp(psx_gpu)
 *
 * In: r0 = psx_gpu.  r3-r11 are saved and restored.
 *
 * For each of num_blocks 64-byte blocks starting at
 * psx_gpu + psx_gpu_blocks_offset, the eight packed 16-bit uv values at the
 * start of the block are replaced by eight 16-bit colours: one byte is
 * fetched at texture_page_ptr + uv, doubled, and used as a byte offset into
 * the CLUT at clut_ptr.
 *
 * If dirty_textures_8bpp_mask & current_texture_mask is non-zero,
 * update_texture_8bpp_cache is called first (r0 still holds psx_gpu).
 *
 * The loop decrements num_blocks before testing it, so it runs at least
 * once; presumably callers guarantee num_blocks >= 1.
 */
2733.align 3
2734
2735function(texture_blocks_8bpp)
2736 stmdb sp!, { r3 - r11, r14 }
2737 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2738
2739 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2740 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2741
2742 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
2743 ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
2744
2745 ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]
2746 tst dirty_textures_mask, current_texture_mask
2747
2748 bne 1f
/* Filler; the 4bpp variant has "mov c_64, #64" in this slot. */
2749 nop
2750
27510:
2752 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2753
/* Even address first: the odd one overwrites the packed pair register. */
2754 uxtah uv_0, texture_ptr, uv_01
2755 uxtah uv_1, texture_ptr, uv_01, ror #16
2756
2757 uxtah uv_2, texture_ptr, uv_23
2758 uxtah uv_3, texture_ptr, uv_23, ror #16
2759
2760 uxtah uv_4, texture_ptr, uv_45
2761 ldrb pixel_0, [ uv_0 ]
2762
2763 uxtah uv_5, texture_ptr, uv_45, ror #16
2764 ldrb pixel_1, [ uv_1 ]
2765
2766 uxtah uv_6, texture_ptr, uv_67
2767 ldrb pixel_2, [ uv_2 ]
2768
2769 uxtah uv_7, texture_ptr, uv_67, ror #16
2770 ldrb pixel_3, [ uv_3 ]
2771
/* index * 2 = byte offset of the 16-bit CLUT entry. */
2772 ldrb pixel_4, [ uv_4 ]
2773 add pixel_0, pixel_0, pixel_0
2774
2775 ldrb pixel_5, [ uv_5 ]
2776 add pixel_1, pixel_1, pixel_1
2777
2778 ldrb pixel_6, [ uv_6 ]
2779 add pixel_2, pixel_2, pixel_2
2780
2781 ldrb pixel_7, [ uv_7 ]
2782 add pixel_3, pixel_3, pixel_3
2783
2784 ldrh pixel_0, [ clut_ptr, pixel_0 ]
2785 add pixel_4, pixel_4, pixel_4
2786
2787 ldrh pixel_1, [ clut_ptr, pixel_1 ]
2788 add pixel_5, pixel_5, pixel_5
2789
2790 ldrh pixel_2, [ clut_ptr, pixel_2 ]
2791 add pixel_6, pixel_6, pixel_6
2792
2793 ldrh pixel_3, [ clut_ptr, pixel_3 ]
2794 add pixel_7, pixel_7, pixel_7
2795
2796 ldrh pixel_4, [ clut_ptr, pixel_4 ]
2797 orr pixels_a, pixel_0, pixel_1, lsl #16
2798
2799 ldrh pixel_5, [ clut_ptr, pixel_5 ]
2800 orr pixels_c, pixel_2, pixel_3, lsl #16
2801
/* Flags from this subs feed the bne at the bottom of the loop. */
2802 ldrh pixel_6, [ clut_ptr, pixel_6 ]
2803 subs num_blocks, num_blocks, #1
2804
2805 ldrh pixel_7, [ clut_ptr, pixel_7 ]
2806 orr pixels_b, pixel_4, pixel_5, lsl #16
2807
/* stm always stores in ascending register order: r7 (pixels 0-1),
 * r8 (2-3), r9 (4-5), r10 (6-7), i.e. pixels 0..7 in memory order. */
2808 orr pixels_d, pixel_6, pixel_7, lsl #16
2809 stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d }
2810
2811 add block_ptr, block_ptr, #64
2812 bne 0b
2813
2814 ldmia sp!, { r3 - r11, pc }
2815
/* Slow path: the texture cache is stale.  block_ptr, num_blocks and
 * clut_ptr are live across the call and are saved around it.
 * NOTE(review): EXTRA_UNSAVED_REGS is defined outside this part of the
 * file; if it expands to nothing, three words are pushed on top of the
 * 40-byte prologue push and sp is not 8-byte aligned at the bl - confirm. */
28161:
4d646738 2817 stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
75e28f62
E
2818
2819 bl update_texture_8bpp_cache
2820
4d646738 2821 ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
75e28f62
E
2822 bal 0b
2823
2824
/*
 * Drop the 4bpp/8bpp aliases and re-bind the names for
 * texture_blocks_16bpp.  Registers are reused in a rolling fashion: the
 * v_N registers of one texel pair are the uv_N registers of the next pair.
 */
2825#undef uv_0
2826#undef uv_1
2827#undef uv_2
2828#undef uv_3
2829#undef uv_4
2830#undef uv_5
2831#undef uv_6
2832#undef uv_7
2833
2834#undef pixel_0
2835#undef pixel_1
2836#undef pixel_2
2837#undef pixel_3
2838#undef pixel_4
2839#undef pixel_5
2840#undef pixel_6
2841#undef pixel_7
2842
2843#undef texture_ptr
2844
2845#undef pixels_a
2846#undef pixels_b
2847#undef pixels_c
2848#undef pixels_d
2849
2850#define psx_gpu r0
2851#define block_ptr r1
2852#define num_blocks r2
2853
/* Texels 0/1: uv and u share a register, v is a separate temporary. */
2854#define uv_0 r3
2855#define uv_1 r4
2856#define u_0 r3
2857#define u_1 r4
2858#define v_0 r5
2859#define v_1 r6
2860
/* Texels 2/3: reuse r5/r6 (v_0/v_1 must be consumed first). */
2861#define uv_2 r5
2862#define uv_3 r6
2863#define u_2 r5
2864#define u_3 r6
2865#define v_2 r7
2866#define v_3 r8
2867
/* Texels 4/5: reuse r7/r8 (v_2/v_3 must be consumed first). */
2868#define uv_4 r7
2869#define uv_5 r8
2870#define u_4 r7
2871#define u_5 r8
2872#define v_4 r9
2873#define v_5 r10
2874
/* Texels 6/7: reuse r9/r10.  v_7 is r0, i.e. it overwrites psx_gpu. */
2875#define uv_6 r9
2876#define uv_7 r10
2877#define u_6 r9
2878#define u_7 r10
2879#define v_6 r11
2880#define v_7 r0
2881
/* Each fetched texel lands in the register that held its offset. */
2882#define pixel_0 r3
2883#define pixel_1 r4
2884#define pixel_2 r5
2885#define pixel_3 r6
2886#define pixel_4 r7
2887#define pixel_5 r8
2888#define pixel_6 r9
2889#define pixel_7 r10
2890
/* Packed pairs, in ascending register order so a single stm writes them
 * in pixel order. */
2891#define pixels_a r3
2892#define pixels_b r5
2893#define pixels_c r7
2894#define pixels_d r9
2895
2896#define texture_ptr r12
2897
2898
/*
 * texture_blocks_16bpp(psx_gpu)
 *
 * In: r0 = psx_gpu (overwritten inside the loop, v_7 is r0).
 * r3-r11 are saved and restored.
 *
 * For each of num_blocks 64-byte blocks starting at
 * psx_gpu + psx_gpu_blocks_offset, the eight 16-bit uv values at the start
 * of the block are replaced by eight 16-bit texels read from the texture
 * page.  A uv value carries u in bits 0-7 and v in bits 8-15; the byte
 * offset of its texel is
 *   ((uv & 0xFF) + ((uv & 0xFF00) << 2)) * 2  =  u * 2 + v * 2048.
 *
 * No dirty-texture check is performed in this routine.  The loop
 * decrements num_blocks before testing it, so it runs at least once;
 * presumably callers guarantee num_blocks >= 1.
 */
2899.align 3
2900
2901function(texture_blocks_16bpp)
2902 stmdb sp!, { r3 - r11, r14 }
2903 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2904
2905 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2906 ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2907
29080:
/* Flags from this subs feed the bne at the bottom of the loop; nothing in
 * between sets flags. */
2909 ldrh uv_0, [ block_ptr ]
2910 subs num_blocks, num_blocks, #1
2911
2912 ldrh uv_1, [ block_ptr, #2 ]
2913
/* Texels 0/1: split v (bits 8-15) and u (bits 0-7). */
2914 and v_0, uv_0, #0xFF00
2915 and v_1, uv_1, #0xFF00
2916
2917 and u_0, uv_0, #0xFF
2918 and u_1, uv_1, #0xFF
2919
/* v_0/v_1 are consumed before r5/r6 are reloaded as uv_2/uv_3. */
2920 add uv_0, u_0, v_0, lsl #2
2921 ldrh uv_2, [ block_ptr, #4 ]
2922
2923 add uv_1, u_1, v_1, lsl #2
2924 ldrh uv_3, [ block_ptr, #6 ]
2925
/* Texel index -> byte offset (16-bit texels). */
2926 add uv_0, uv_0, uv_0
2927 add uv_1, uv_1, uv_1
2928
/* Texels 2/3. */
2929 and v_2, uv_2, #0xFF00
2930 and v_3, uv_3, #0xFF00
2931
2932 and u_2, uv_2, #0xFF
2933 and u_3, uv_3, #0xFF
2934
2935 add uv_2, u_2, v_2, lsl #2
2936 ldrh uv_4, [ block_ptr, #8 ]
2937
2938 add uv_3, u_3, v_3, lsl #2
2939 ldrh uv_5, [ block_ptr, #10 ]
2940
2941 add uv_2, uv_2, uv_2
2942 add uv_3, uv_3, uv_3
2943
/* Texels 4/5. */
2944 and v_4, uv_4, #0xFF00
2945 and v_5, uv_5, #0xFF00
2946
2947 and u_4, uv_4, #0xFF
2948 and u_5, uv_5, #0xFF
2949
2950 add uv_4, u_4, v_4, lsl #2
2951 ldrh uv_6, [ block_ptr, #12 ]
2952
2953 add uv_5, u_5, v_5, lsl #2
2954 ldrh uv_7, [ block_ptr, #14 ]
2955
/* From here on texel fetches are interleaved with the address math of
 * texels 6/7. */
2956 add uv_4, uv_4, uv_4
2957 ldrh pixel_0, [ texture_ptr, uv_0 ]
2958
2959 add uv_5, uv_5, uv_5
2960 ldrh pixel_1, [ texture_ptr, uv_1 ]
2961
2962 and v_6, uv_6, #0xFF00
2963 ldrh pixel_2, [ texture_ptr, uv_2 ]
2964
/* v_7 is r0: psx_gpu is overwritten here and is not read again. */
2965 and v_7, uv_7, #0xFF00
2966 ldrh pixel_3, [ texture_ptr, uv_3 ]
2967
2968 and u_6, uv_6, #0xFF
2969 ldrh pixel_4, [ texture_ptr, uv_4 ]
2970
2971 and u_7, uv_7, #0xFF
2972 ldrh pixel_5, [ texture_ptr, uv_5 ]
2973
2974 add uv_6, u_6, v_6, lsl #2
2975 add uv_7, u_7, v_7, lsl #2
2976
2977 add uv_6, uv_6, uv_6
2978 add uv_7, uv_7, uv_7
2979
/* Pack two 16-bit texels per word. */
2980 orr pixels_a, pixel_0, pixel_1, lsl #16
2981 orr pixels_b, pixel_2, pixel_3, lsl #16
2982
2983 ldrh pixel_6, [ texture_ptr, uv_6 ]
2984 orr pixels_c, pixel_4, pixel_5, lsl #16
2985
2986 ldrh pixel_7, [ texture_ptr, uv_7 ]
2987 orr pixels_d, pixel_6, pixel_7, lsl #16
2988
/* r3, r5, r7, r9 -> pixels 0..7 in memory order; blocks are 64 bytes apart. */
2989 stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
2990 add block_ptr, block_ptr, #64
2991
2992 bne 0b
2993
2994 ldmia sp!, { r3 - r11, pc }
2995
2996
/*
 * Drop the aliases of the preceding routines and bind the names used by the
 * shade_blocks_*_textured_modulated_* family.  Many names share a register;
 * the notes say which.
 */
2997#undef num_blocks
2998
2999#undef test_mask
3000#undef texels
3001#undef pixels_b
3002#undef pixels
3003#undef d64_1
3004#undef d64_4
3005#undef d64_128
3006#undef draw_mask
3007#undef msb_mask
3008#undef msb_mask_low
3009#undef msb_mask_high
3010#undef fb_pixels
3011
3012#undef c_32
3013#undef fb_ptr
3014#undef mask_msb_ptr
3015
3016#define psx_gpu r0
3017#define num_blocks r1
/* color_ptr, colors_scalar, mask_msb_ptr and c_32 are all r2; each is a
 * short-lived use and c_32 is the value that finally stays in r2. */
3018#define color_ptr r2
3867c6ef
E
3019#define colors_scalar r2
/* r3; also used as block_ptr_store further down. */
3020#define colors_scalar_compare r3
75e28f62
E
3021#define mask_msb_ptr r2
3022
/* block_ptr_load_a is r0: setting it up overwrites psx_gpu. */
3023#define block_ptr_load_a r0
3024#define block_ptr_store r3
3025#define block_ptr_load_b r12
3026#define c_32 r2
3027
3028#define c_48 r4
/* fb_ptr is r14 (lr); the builder pushes lr before using it. */
3029#define fb_ptr r14
3030#define draw_mask_bits_scalar r5
3031
3032#define d128_0x07 q0
3033#define d128_0x1F q1
3034#define d128_0x8000 q2
3035#define test_mask q3
/* texels and zero_mask (below) are both q4. */
3036#define texels q4
/* q5 = colors_r (d10) : colors_g (d11). */
3037#define colors_rg q5
/* q6 = colors_b (d12) : draw_mask_bits (d13). */
3038#define colors_b_dm_bits q6
/* q7 = texels_r (d14) : texels_g (d15). */
3039#define texels_rg q7
3040#define pixels_r q8
3041#define pixels_g q9
3042#define pixels_b q10
3043#define pixels q11
3044#define zero_mask q4
3045#define draw_mask q12
3046#define msb_mask q13
3047
/* fb_pixels shares q8 with pixels_r. */
3048#define fb_pixels q8
3049
/* q9 = pixels_g_low (d18) : pixels_b_low (d19); it shares q9 with
 * pixels_g, so the narrowed g/b results overwrite the wide g product. */
3050#define pixels_gb_low q9
3051
3052#define colors_r d10
3053#define colors_g d11
3054#define colors_b d12
3055#define draw_mask_bits d13
3056#define texels_r d14
3057#define texels_g d15
3058#define pixels_r_low d16
3059#define pixels_g_low d18
3060#define pixels_b_low d19
3061#define msb_mask_low d26
3062#define msb_mask_high d27
3063
3064#define d64_1 d28
3065#define d64_4 d29
3066#define d64_128 d30
3067#define texels_b d31
3068
/*
 * Building blocks for shade_blocks_textured_modulated_builder().  Each
 * family has one macro per variant; the builder selects a member by
 * token-pasting its shading / dithering / target argument into the name.
 */

/* Target prologue, indirect: results are written back into the block
 * buffer; c_48 is the post-increment applied after each pixel store. */
3069#define shade_blocks_textured_modulated_prologue_indirect() \
3070 mov c_48, #48; \
3071 add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset \
3072
/* Target prologue, direct: load the 16-bit mask_msb value and replicate it
 * into every lane of msb_mask. */
3073#define shade_blocks_textured_modulated_prologue_direct() \
3074 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
3075 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ] \
3076
75e28f62 3077
3867c6ef
E
/* Shading prologue, shaded: empty (colours are loaded per block). */
3078#define shade_blocks_textured_modulated_prologue_shaded(dithering, target) \
3079
/* Undithered only: if the 32-bit triangle colour equals 0x00808080, branch
 * to the unmodulated shader for the same target.  This runs before the
 * builder pushes anything, so lr is still the caller's return address. */
3080#define shade_blocks_textured_false_modulation_check_undithered(target) \
3081 ldr colors_scalar, [ psx_gpu, #psx_gpu_triangle_color_offset ]; \
3082 movw colors_scalar_compare, #0x8080; \
3083 \
3084 movt colors_scalar_compare, #0x80; \
3085 cmp colors_scalar, colors_scalar_compare; \
3086 beq shade_blocks_textured_unmodulated_##target \
3087
/* Dithered: no shortcut, the check is empty. */
3088#define shade_blocks_textured_false_modulation_check_dithered(target) \
3089
/* Shading prologue, unshaded: optional shortcut check, then broadcast
 * bytes 0, 1 and 2 of the triangle colour to all lanes of colors_r,
 * colors_g and colors_b.  colors_r is written last because it is the
 * source of the other two. */
3090#define shade_blocks_textured_modulated_prologue_unshaded(dithering, target) \
3091 shade_blocks_textured_false_modulation_check_##dithering(target); \
75e28f62
E
3092 add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset; \
3093 vld1.u32 { colors_r[] }, [ color_ptr, :32 ]; \
3094 vdup.u8 colors_g, colors_r[1]; \
3095 vdup.u8 colors_b, colors_r[2]; \
3096 vdup.u8 colors_r, colors_r[0] \
3097
3098
/* Dithered: load 16 bytes from block_ptr_load_b into `target`; the
 * modulate step accumulates on top of that value.  Pointer not advanced. */
3099#define shade_blocks_textured_modulated_load_dithered(target) \
3100 vld1.u32 { target }, [ block_ptr_load_b, :128 ] \
3101
/* Same load, then advance block_ptr_load_b by c_32. */
3102#define shade_blocks_textured_modulated_load_last_dithered(target) \
3103 vld1.u32 { target }, [ block_ptr_load_b, :128 ], c_32 \
3104
/* Undithered: nothing to load ... */
3105#define shade_blocks_textured_modulated_load_undithered(target) \
3106
/* ... but the pointer still has to advance by the same 32 bytes. */
3107#define shade_blocks_textured_modulated_load_last_undithered(target) \
3108 add block_ptr_load_b, block_ptr_load_b, #32 \
3109
/* Dithered: multiply-accumulate texel * colour onto the loaded value. */
3110#define shade_blocks_textured_modulate_dithered(channel) \
3111 vmlal.u8 pixels_##channel, texels_##channel, colors_##channel \
3112
/* Undithered: plain widening multiply texel * colour. */
3113#define shade_blocks_textured_modulate_undithered(channel) \
3114 vmull.u8 pixels_##channel, texels_##channel, colors_##channel \
3115
3116
/* Indirect: store the draw mask into the block (offset is unused). */
3117#define shade_blocks_textured_modulated_store_draw_mask_indirect(offset) \
3118 vst1.u32 { draw_mask }, [ block_ptr_store, :128 ]! \
3119
/* Direct: fetch the framebuffer pointer stored in the block (`offset` is
 * relative to block_ptr_load_b + 64), read the current framebuffer pixels
 * and copy them over `pixels` wherever draw_mask bits are set. */
3120#define shade_blocks_textured_modulated_store_draw_mask_direct(offset) \
3121 ldr fb_ptr, [ block_ptr_load_b, #(offset - 64) ]; \
3122 vld1.u32 { fb_pixels }, [ fb_ptr ]; \
3123 vbit.u16 pixels, fb_pixels, draw_mask \
3124
/* Indirect: pixels go into the block; 16 (draw mask store) + 48 = 64
 * bytes, i.e. block_ptr_store ends up on the next block. */
3125#define shade_blocks_textured_modulated_store_pixels_indirect() \
3126 vst1.u32 { pixels }, [ block_ptr_store, :128 ], c_48 \
3127
/* Direct: pixels go to the framebuffer address loaded above. */
3128#define shade_blocks_textured_modulated_store_pixels_direct() \
3129 vst1.u32 { pixels }, [ fb_ptr ] \
3130
3131
/* Shaded: per-block r and g colours come from the block. */
3132#define shade_blocks_textured_modulated_load_rg_shaded() \
3133 vld1.u32 { colors_r, colors_g }, [ block_ptr_load_b, :128 ], c_32 \
3134
/* Unshaded: colours are constant, only advance the pointer. */
3135#define shade_blocks_textured_modulated_load_rg_unshaded() \
3136 add block_ptr_load_b, block_ptr_load_b, #32 \
3137
/* Shaded: load per-block b colours together with the draw mask bits. */
3138#define shade_blocks_textured_modulated_load_bdm_shaded() \
3139 vld1.u32 { colors_b, draw_mask_bits }, [ block_ptr_load_a, :128 ], c_32 \
3140
/* Unshaded: only the draw mask bits are needed; read them as a scalar
 * from offset 8 and advance the pointer by the same 32 bytes. */
3141#define shade_blocks_textured_modulated_load_bdm_unshaded() \
3142 ldr draw_mask_bits_scalar, [ block_ptr_load_a, #8 ]; \
3143 add block_ptr_load_a, block_ptr_load_a, #32 \
3144
/* Broadcast the 16-bit draw mask bit field to all eight lanes. */
3145#define shade_blocks_textured_modulated_expand_draw_mask_shaded() \
3146 vdup.u16 draw_mask, draw_mask_bits[0] \
3147
3148#define shade_blocks_textured_modulated_expand_draw_mask_unshaded() \
3149 vdup.u16 draw_mask, draw_mask_bits_scalar \
3150
3151
/* Indirect: the mask MSB is not applied at this stage. */
3152#define shade_blocks_textured_modulated_apply_msb_mask_indirect() \
3153
/* Direct: OR the mask MSB into the pixels that go to the framebuffer. */
3154#define shade_blocks_textured_modulated_apply_msb_mask_direct() \
3155 vorr.u16 pixels, pixels, msb_mask \
3156
3157
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Expands to shade_blocks_<shading>_textured_modulated_<dithering>_<target>.
 * In: r0 = psx_gpu.
 *
 * For every block (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) the 16-bit texels are split into 5-bit
 * r/g/b, multiplied by the per-block (shaded) or per-triangle (unshaded)
 * colour, saturated, and repacked to 16 bits; bit 15 of the texel is
 * carried over.  The draw mask is the block's mask bits tested against
 * test_mask (loaded from the start of psx_gpu), OR-ed with a mask of the
 * texels that are exactly zero.
 *   indirect: draw mask and pixels are written back into the block.
 *   direct:   pixels are merged with the framebuffer contents under the
 *             draw mask and written to the framebuffer pointer stored in
 *             the block.
 *
 * The code is software-pipelined: the body at 0: finishes packing and
 * storing block N while loading and modulating block N + 1; 1: drains the
 * last block.  num_blocks is decremented before it is tested, so at least
 * one block is always processed; presumably callers guarantee >= 1.
 *
 * The shading prologue runs before the push because the unshaded,
 * undithered variant may tail-branch to the unmodulated shader from it.
 *
 * NOTE(review): texels, colors_* and texels_rg live in q4-q7 (d8-d15),
 * which the AAPCS treats as callee-saved; only r4, r5 and lr are saved
 * here - confirm that no caller relies on d8-d15 surviving.
 */
3158#define shade_blocks_textured_modulated_builder(shading, dithering, target) \
3159.align 3; \
3160 \
3161function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
3867c6ef 3162 shade_blocks_textured_modulated_prologue_##shading(dithering, target); \
75e28f62
E
3163 stmdb sp!, { r4 - r5, lr }; \
3164 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
3165 \
3166 vld1.u32 { test_mask }, [ psx_gpu, :128 ]; \
3167 \
3168 shade_blocks_textured_modulated_prologue_##target(); \
75e28f62
E
3169 \
3170 add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset; /* r0: psx_gpu is gone from here on */ \
3171 mov c_32, #32; \
3172 \
3173 add block_ptr_load_b, block_ptr_load_a, #16; \
3174 vmov.u8 d64_1, #1; \
3175 vmov.u8 d64_4, #4; \
3176 vmov.u8 d64_128, #128; \
3177 \
3178 vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \
3179 vmov.u8 d128_0x07, #0x07; \
3180 \
3181 shade_blocks_textured_modulated_load_rg_##shading(); \
3182 vmov.u8 d128_0x1F, #0x1F; \
3183 \
3184 shade_blocks_textured_modulated_load_bdm_##shading(); \
3185 vmov.u16 d128_0x8000, #0x8000; \
3186 \
3187 vmovn.u16 texels_r, texels; /* low byte: r in bits 0-4 */ \
3188 vshrn.u16 texels_g, texels, #5; /* g in bits 0-4 */ \
3189 \
3190 vshrn.u16 texels_b, texels, #7; /* b in bits 3-7, shifted down below */ \
3191 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3192 \
3193 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3194 vtst.u16 draw_mask, draw_mask, test_mask; \
3195 \
3196 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3197 vand.u8 texels_rg, texels_rg, d128_0x1F; /* keep 5 bits of r and g */ \
3198 \
3199 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3200 vshr.u8 texels_b, texels_b, #3; \
3201 \
3202 shade_blocks_textured_modulate_##dithering(r); \
3203 shade_blocks_textured_modulate_##dithering(g); \
3204 shade_blocks_textured_modulate_##dithering(b); \
3205 \
3206 vand.u16 pixels, texels, d128_0x8000; /* carry texel bit 15 over; must precede the vceq, which overwrites q4 */ \
3207 vceq.u16 zero_mask, texels, #0; /* all-zero texels are not drawn */ \
3208 \
3209 vqshrun.s16 pixels_r_low, pixels_r, #4; /* >> 4 with unsigned saturation to 8 bits */ \
3210 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3211 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3212 \
3213 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3214 vorr.u16 draw_mask, draw_mask, zero_mask; \
3215 vshr.u8 pixels_r_low, pixels_r_low, #3; /* r: 8 -> 5 bits */ \
3216 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; /* g, b: clear the low 3 bits */ \
3217 \
3218 subs num_blocks, num_blocks, #1; \
3219 beq 1f; /* single block: go straight to the drain */ \
3220 \
3221 .align 3; \
3222 \
3223 0: \
3224 vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32; \
3225 shade_blocks_textured_modulated_load_rg_##shading(); \
3226 vshrn.u16 texels_g, texels, #5; \
3227 \
3228 shade_blocks_textured_modulated_load_bdm_##shading(); \
3229 vshrn.u16 texels_b, texels, #7; \
3230 \
59d15d23 3231 pld [ block_ptr_load_a ]; \
75e28f62
E
3232 vmovn.u16 texels_r, texels; \
3233 vmlal.u8 pixels, pixels_r_low, d64_1; /* previous block: r -> bits 0-4 */ \
3234 \
3235 vmlal.u8 pixels, pixels_g_low, d64_4; /* g * 4 -> bits 5-9 */ \
3236 vmlal.u8 pixels, pixels_b_low, d64_128; /* b * 128 -> bits 10-14 */ \
3237 shade_blocks_textured_modulated_store_draw_mask_##target(-4); \
3238 \
3239 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3240 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3241 \
3242 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3243 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3244 \
3245 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3246 vtst.u16 draw_mask, draw_mask, test_mask; \
3247 \
3248 shade_blocks_textured_modulated_store_pixels_##target(); \
3249 vshr.u8 texels_b, texels_b, #3; \
3250 \
3251 shade_blocks_textured_modulate_##dithering(r); \
3252 shade_blocks_textured_modulate_##dithering(g); \
3253 shade_blocks_textured_modulate_##dithering(b); \
3254 \
3255 vand.u16 pixels, texels, d128_0x8000; \
3256 vceq.u16 zero_mask, texels, #0; \
3257 \
3258 subs num_blocks, num_blocks, #1; /* flags feed the bne at the bottom; the NEON ops in between leave them alone */ \
3259 \
3260 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3261 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3262 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3263 \
3264 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3265 vorr.u16 draw_mask, draw_mask, zero_mask; \
3266 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3267 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3268 \
3269 bne 0b; \
3270 \
3271 1: \
3272 vmlal.u8 pixels, pixels_r_low, d64_1; /* drain: pack and store the last block */ \
3273 vmlal.u8 pixels, pixels_g_low, d64_4; \
3274 vmlal.u8 pixels, pixels_b_low, d64_128; \
3275 \
3276 shade_blocks_textured_modulated_store_draw_mask_##target(28); /* block_ptr_load_b was not advanced for another block here, hence 28 instead of -4 */ \
3277 shade_blocks_textured_modulated_store_pixels_##target(); \
3278 \
3279 ldmia sp!, { r4 - r5, pc } \
3280
3281
3282shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3283shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3284shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3285shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3286
3287shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3288shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3289shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3290shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3291
3292
/*
 * Drop the aliases of the modulated shaders and bind the names used by
 * shade_blocks_textured_unmodulated_indirect / _direct.
 */
3293#undef c_64
3294#undef fb_ptr
3295#undef color_ptr
3296
3297#undef color_r
3298#undef color_g
3299#undef color_b
3300
3301#undef test_mask
3302#undef pixels
3303#undef draw_mask
3304#undef zero_mask
3305#undef fb_pixels
3306#undef msb_mask
3307#undef msb_mask_low
3308#undef msb_mask_high
3309
3310#define psx_gpu r0
3311#define num_blocks r1
3312#define mask_msb_ptr r2
3313#define color_ptr r3
3314
/* block_ptr_load is r0: setting it up overwrites psx_gpu. */
3315#define block_ptr_load r0
3316#define draw_mask_store_ptr r3
/* Two names for r12. */
3317#define draw_mask_bits_ptr r12
3318#define draw_mask_ptr r12
/* r14 is lr: the return address must be saved before this is written. */
3319#define pixel_store_ptr r14
3320
3321#define fb_ptr_cmp r4
3322
/* fb_ptr shares r3 with draw_mask_store_ptr / color_ptr; fb_ptr_next
 * shares r14 with pixel_store_ptr. */
3323#define fb_ptr r3
3324#define fb_ptr_next r14
3325
/* c_64 shares r2 with mask_msb_ptr. */
3326#define c_64 r2
3327
3328#define test_mask q0
3329#define pixels q1
3330#define draw_mask q2
3331#define zero_mask q3
3332#define draw_mask_combined q4
3333#define fb_pixels q5
3334#define fb_pixels_next q6
3335#define msb_mask q7
3336
/* d4,d5 are the halves of q2 (draw_mask); d14,d15 of q7 (msb_mask). */
3337#define draw_mask_low d4
3338#define draw_mask_high d5
3339#define msb_mask_low d14
3340#define msb_mask_high d15
3341
/*
 * shade_blocks_textured_unmodulated_indirect(psx_gpu)
 *
 * In:  r0 = psx_gpu
 * Out: nothing; r0-r3, r12 and q0-q4 are clobbered.
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - the eight 16-bit texels at block + 0 are copied unchanged to
 *     block + 16,
 *   - the 16-bit draw mask bit field at block + 40 is broadcast to all
 *     lanes and tested against test_mask (the first 16 bytes of psx_gpu),
 *   - that mask is OR-ed with a mask of the texels that are exactly zero
 *     and the result is written to block + 0.
 *
 * The loop is software-pipelined: the combined mask of block N is stored
 * while block N + 1 is being loaded; 1: drains the last block.
 * num_blocks is decremented before it is tested, so at least one block is
 * always processed; callers are expected to pass num_blocks >= 1.
 *
 * The modulated unshaded/undithered shader tail-branches here with lr
 * still holding its caller's return address.
 *
 * NOTE(review): draw_mask_combined is q4 (d8-d9), which the AAPCS treats
 * as callee-saved and which is not preserved here - confirm that no caller
 * relies on it.
 */
.align 3
function(shade_blocks_textured_unmodulated_indirect)
  /* r14 doubles as pixel_store_ptr, so the return address has to be
   * spilled.  It is pushed properly instead of being parked below sp:
   * memory below sp may be overwritten asynchronously (e.g. by signal
   * delivery).  r4 is pushed along only to keep sp 8-byte aligned. */
  stmdb sp!, { r4, r14 }
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  vld1.u32 { test_mask }, [ psx_gpu, :128 ]
  add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset

  mov c_64, #64
  /* Last use of psx_gpu: block_ptr_load is the same register (r0). */
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  /* Prime the pipeline with the first block. */
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
    [ draw_mask_bits_ptr, :16 ], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64

  subs num_blocks, num_blocks, #1
  beq 1f

 0:
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  /* Combine the masks of the previous block before they are replaced. */
  vorr.u16 draw_mask_combined, draw_mask, zero_mask

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
    [ draw_mask_bits_ptr, :16 ], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64

  vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
  subs num_blocks, num_blocks, #1

  bne 0b

 1:
  /* Drain: store the combined mask of the last block. */
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64

  ldmia sp!, { r4, pc }
3388
3389
.align 3

/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In:  psx_gpu = pointer to the GPU state.
 *
 * Writes the queued textured blocks straight to the framebuffer. For
 * each 64-byte block: the eight texels at block + 0 are ORed with the
 * broadcast mask_msb value and merged into the eight framebuffer pixels
 * addressed by the pointer at block + 44. A pixel is left untouched
 * when its draw mask bit (block + 40, tested against test_mask) is set
 * or when the texel is zero.
 *
 * The loop is software pipelined: the framebuffer pixels of block i + 1
 * are normally fetched before block i is stored. If the two framebuffer
 * pointers are within 14 bytes of each other the spans overlap, so
 * path 4 stores first and reloads afterwards.
 *
 * The first block is handled unconditionally, so num_blocks must be at
 * least 1.
 */
function(shade_blocks_textured_unmodulated_direct)
  stmdb sp!, { r4, r14 }
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
  mov c_64, #64

  vld1.u32 { test_mask }, [ psx_gpu, :128 ]
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [ draw_mask_bits_ptr, :16 ], c_64
  ldr fb_ptr_next, [ block_ptr_load, #44 ]

  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  vceq.u16 zero_mask, pixels, #0
  vtst.u16 draw_mask, draw_mask, test_mask

  subs num_blocks, num_blocks, #1
  beq 1f

 0:
  mov fb_ptr, fb_ptr_next
  ldr fb_ptr_next, [ block_ptr_load, #44 ]

  vorr.u16 pixels, pixels, msb_mask

  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vmov fb_pixels, fb_pixels_next

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [ draw_mask_bits_ptr, :16 ], c_64
  vbif.u16 fb_pixels, pixels, draw_mask_combined

  sub fb_ptr_cmp, fb_ptr_next, fb_ptr
  pld [ fb_ptr_next, #64 ]

  add fb_ptr_cmp, fb_ptr_cmp, #14
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64

  /* Unsigned (next - cur + 14) <= 28 means the two spans overlap. */
  cmp fb_ptr_cmp, #28
  bls 4f

  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  vceq.u16 zero_mask, pixels, #0

  vst1.u16 { fb_pixels }, [ fb_ptr ]
  vtst.u16 draw_mask, draw_mask, test_mask

 3:
  subs num_blocks, num_blocks, #1
  bne 0b

 1:
  /*
   * Final block. The loop applies msb_mask at the top of the following
   * iteration, so the last texels still need it here; without this the
   * last (or only) block was written without the mask bit.
   */
  vorr.u16 pixels, pixels, msb_mask
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vbif.u16 fb_pixels_next, pixels, draw_mask_combined

  vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]

  ldmia sp!, { r4, pc }

 4:
  /* Overlapping spans: store first, then reload the framebuffer. */
  vst1.u16 { fb_pixels }, [ fb_ptr ]
  vceq.u16 zero_mask, pixels, #0

  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  vtst.u16 draw_mask, draw_mask, test_mask

  bal 3b
3465
3466
/* Nothing to do for the indirect unshaded/untextured case: return at once. */
function(shade_blocks_unshaded_untextured_indirect)
  bx lr
3469
.align 3

/*
 * shade_blocks_unshaded_untextured_direct
 *
 * In:  psx_gpu = pointer to the GPU state.
 *
 * Fills the queued blocks with one flat colour. The eight colour values
 * are read once from the first block (+16) and ORed with the broadcast
 * mask_msb value. Each block then supplies its own draw mask (+0) and
 * framebuffer pointer (+44); pixels whose draw mask lane is set keep
 * the framebuffer value.
 *
 * The loop is pipelined one block ahead. Framebuffer spans that lie
 * within 14 bytes of each other take path 4, which stores before it
 * reloads.
 */
function(shade_blocks_unshaded_untextured_direct)
  push { r4, lr }
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
  add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
  vld1.u16 { pixels }, [ color_ptr, :128 ]

  mov c_64, #64
  vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64

  vorr.u16 pixels, pixels, msb_mask
  subs num_blocks, num_blocks, #1

  /* The loads below leave the flags from subs intact for the beq. */
  ldr fb_ptr_next, [ block_ptr_load ], #64

  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  beq 1f

 0:
  vmov fb_pixels, fb_pixels_next
  mov fb_ptr, fb_ptr_next
  ldr fb_ptr_next, [ block_ptr_load ], #64

  vbif.u16 fb_pixels, pixels, draw_mask
  vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64

  /* Unsigned (next - cur + 14) <= 28 means the two spans overlap. */
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr
  add fb_ptr_cmp, fb_ptr_cmp, #14
  cmp fb_ptr_cmp, #28
  bls 4f

  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  vst1.u16 { fb_pixels }, [ fb_ptr ]

 3:
  subs num_blocks, num_blocks, #1
  bne 0b

 1:
  /* Final block. */
  vbif.u16 fb_pixels_next, pixels, draw_mask
  vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]

  pop { r4, pc }

 4:
  /* Overlapping spans: store first, then reload the framebuffer. */
  vst1.u16 { fb_pixels }, [ fb_ptr ]
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
  b 3b
3526
3527
/*
 * Register aliases for the blend_blocks_* routines.
 *
 * draw_mask_ptr shares r0 with psx_gpu, so psx_gpu must not be read
 * once draw_mask_ptr has been set up. Several NEON aliases below share
 * a register as well (for example pixels_rb and pixels_no_msb are both
 * q5); each blend mode only uses a non-conflicting subset.
 */
#undef draw_mask_ptr
#undef c_64
#undef fb_ptr
#undef fb_ptr_next
#undef fb_ptr_cmp

/* Core registers. */
#define psx_gpu r0
#define num_blocks r1
#define msb_mask_ptr r2
#define pixel_ptr r3
#define draw_mask_ptr r0
#define c_64 r2
#define fb_ptr r12
#define fb_ptr_next r14
#define fb_ptr_cmp r4

#undef msb_mask
#undef draw_mask
#undef pixels
#undef fb_pixels
#undef d128_0x8000
#undef msb_mask_low
#undef msb_mask_high
#undef draw_mask_next
#undef pixels_g
#undef blend_pixels
#undef fb_pixels_next

/* NEON registers: primary assignment. */
#define msb_mask q0
#define draw_mask q1
#define pixels q2
#define fb_pixels q3
#define blend_pixels q4
#define pixels_no_msb q5
#define blend_mask q6
#define fb_pixels_no_msb q7
#define d128_0x8000 q8
#define d128_0x0421 q9
#define fb_pixels_next q10
#define blend_pixels_next q11
#define pixels_next q12
#define draw_mask_next q13
#define write_mask q14

/* NEON registers: overlays used by the add / subtract / add-fourth modes. */
#define pixels_rb q5
#define pixels_mg q7
#define pixels_g q7
#define d128_0x7C1F q8
#define d128_0x03E0 q9
#define fb_pixels_rb q10
#define fb_pixels_g q11
#define fb_pixels_masked q11
#define d128_0x83E0 q15
#define pixels_fourth q7
#define d128_0x1C07 q12
#define d128_0x00E0 q13
#define d128_0x80E0 q13

/* Double-register halves of msb_mask (q0). */
#define msb_mask_low d0
#define msb_mask_high d1
3588
/*
 * Building blocks for blend_blocks_average_builder. The textured /
 * untextured and on / off suffixes are selected by token pasting; the
 * empty variants expand to nothing.
 */

/* Textured: lanes whose source texel has bit 15 set are the ones to blend. */
#define blend_blocks_average_set_blend_mask_textured(source) \
  vclt.s16 blend_mask, source, #0

/* Textured: force bit 15 on in the blended result. */
#define blend_blocks_average_set_stp_bit_textured() \
  vorr.u16 blend_pixels, #0x8000

/* Textured: lanes that are not blended take the source texel unchanged. */
#define blend_blocks_average_combine_textured(source) \
  vbif.u16 blend_pixels, source, blend_mask

/* Untextured: every lane is blended, so these three are no-ops. */
#define blend_blocks_average_set_blend_mask_untextured(source)
#define blend_blocks_average_set_stp_bit_untextured()
#define blend_blocks_average_combine_untextured(source)

/* Mask evaluation on: framebuffer pixels with bit 15 set are protected. */
#define blend_blocks_average_mask_set_on() \
  vclt.s16 write_mask, fb_pixels_next, #0

#define blend_blocks_average_mask_copy_on() \
  vorr.u16 draw_mask, draw_mask_next, write_mask

#define blend_blocks_average_mask_copy_b_on() \
  vorr.u16 draw_mask_next, draw_mask_next, write_mask

/* Mask evaluation off: only the per-block draw mask applies. */
#define blend_blocks_average_mask_set_off()

#define blend_blocks_average_mask_copy_off() \
  vmov draw_mask, draw_mask_next

#define blend_blocks_average_mask_copy_b_off()
3619
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>(psx_gpu).
 *
 * Each queued 64-byte block holds a draw mask at +0, eight source
 * pixels at +16 and a framebuffer pointer at +44 (pixel_ptr + 28). The
 * result is the per-channel average of source and framebuffer,
 * computed as
 *   (fb + src - ((src ^ fb) & 0x0421)) >> 1
 * on values with bit 15 cleared; subtracting the XOR of the channel low
 * bits keeps a carry from leaking into the neighbouring channel. The
 * broadcast mask_msb value is then ORed in and the pixels are merged
 * into the framebuffer wherever the draw mask lane is clear.
 *
 * The loop is pipelined one block ahead. Framebuffer spans within 14
 * bytes of each other overlap and take path 2, which stores the current
 * block before reloading the next one.
 *
 * Fix: path 2 now re-evaluates write_mask after the reload. Previously
 * only the non-overlapping path did, so with mask evaluation on the
 * next block was merged with a write mask derived from a different
 * framebuffer read.
 *
 * The first block is handled unconditionally, so num_blocks must be at
 * least 1.
 */
#define blend_blocks_average_builder(texturing, mask_evaluate) \
.align 3; \
 \
function(blend_blocks_##texturing##_average_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  /* draw_mask_ptr is r0 as well: last read of psx_gpu. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x8000, #0x8000; \
  vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vmov.u16 d128_0x0421, #0x0400; \
  vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
 \
  vorr.u16 d128_0x0421, #0x0021; \
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
 \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vmov pixels, pixels_next; \
  vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
 \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_mask_copy_##mask_evaluate(); \
  vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  vmov fb_pixels, fb_pixels_next; \
  blend_blocks_average_combine_##texturing(pixels); \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two spans overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
 \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
 \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vbif.u16 fb_pixels, blend_pixels, draw_mask; \
 \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vst1.u16 { fb_pixels }, [ fb_ptr ]; \
 \
 3: \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block. */ \
  blend_blocks_average_mask_copy_b_##mask_evaluate(); \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels_next); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  blend_blocks_average_combine_##texturing(pixels_next); \
 \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next; \
  vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: store first, then reload and redo the setup. */ \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbif.u16 fb_pixels, blend_pixels, draw_mask; \
  vst1.u16 { fb_pixels }, [ fb_ptr ]; \
 \
  vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  bal 3b

blend_blocks_average_builder(textured, off)
blend_blocks_average_builder(untextured, off)
blend_blocks_average_builder(textured, on)
blend_blocks_average_builder(untextured, on)
3726
3727
/*
 * Mask-evaluation hooks shared by the add and add-fourth builders.
 * "on": framebuffer pixels with bit 15 set are folded into draw_mask so
 * they are not overwritten. "off": both hooks expand to nothing.
 */
#define blend_blocks_add_mask_set_on() \
  vclt.s16 write_mask, fb_pixels, #0

#define blend_blocks_add_mask_copy_on() \
  vorr.u16 draw_mask, draw_mask, write_mask

#define blend_blocks_add_mask_set_off()

#define blend_blocks_add_mask_copy_off()
3737
3738
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>(psx_gpu).
 *
 * Additive blend for textured blocks. The framebuffer pixels are first
 * ANDed with blend_mask (source bit 15 set), so lanes that are not
 * blended add zero and come out as the source pixel. Red/blue (0x7C1F)
 * and green (0x03E0) are added separately and clamped with vmin. The
 * green half also carries bit 15 of (source | msb_mask), which is why
 * it is masked and clamped with 0x83E0.
 *
 * Block layout, pipelining and the overlap path (2) are the same as in
 * the other blend builders: draw mask at +0, pixels at +16, framebuffer
 * pointer at +44, 64 bytes per block.
 *
 * Change: fb_ptr_cmp = fb_ptr_next - fb_ptr used to be computed twice
 * in a row with only a pld in between; the redundant copy is gone.
 *
 * The first block is handled unconditionally, so num_blocks must be at
 * least 1.
 */
#define blend_blocks_add_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  /* draw_mask_ptr is r0 as well: last read of psx_gpu. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vmov.u16 d128_0x83E0, #0x8000; \
  vorr.u16 d128_0x03E0, #0x00E0; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0; \
 \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vorr.u16 pixels, pixels, msb_mask; \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  vand.u16 pixels_mg, pixels, d128_0x83E0; \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
 \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
 \
  vorr.u16 pixels, pixels, msb_mask; \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vand.u16 pixels_mg, pixels, d128_0x83E0; \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two spans overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  pld [ fb_ptr_next, #64 ]; \
 \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
 \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
 \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
 \
 3: \
  vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: store first, then reload the framebuffer. */ \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  bal 3b
3841
3842
/*
 * blend_blocks_add_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_<mask_evaluate>(psx_gpu).
 *
 * Additive blend for untextured blocks: every lane is blended.
 * Red/blue (0x7C1F) and green (0x03E0) are added to the framebuffer
 * value separately and clamped with vmin; the broadcast mask_msb value
 * is ORed in and lanes with the draw mask set keep the framebuffer
 * pixel.
 *
 * Block layout: draw mask at +0, pixels at +16, framebuffer pointer at
 * +44, 64 bytes per block. Framebuffer spans within 14 bytes of each
 * other take the store-then-reload path 2.
 */
#define blend_blocks_add_untextured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_untextured_add_##mask_evaluate) \
  push { r4, lr }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  /* draw_mask_ptr is r0 as well: last read of psx_gpu. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x03E0, #0x00E0; \
 \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels, d128_0x03E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
 \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
 \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vand.u16 pixels_g, pixels, d128_0x03E0; \
 \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two spans overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
 \
 3: \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
 \
  pop { r4, pc }; \
 \
 2: \
  /* Overlapping spans: store first, then reload the framebuffer. */ \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  b 3b
3934
3935
/* Emit the additive blend routines, with and without mask evaluation. */
blend_blocks_add_textured_builder(off)
blend_blocks_add_textured_builder(on)
blend_blocks_add_untextured_builder(off)
blend_blocks_add_untextured_builder(on)
3940
/*
 * Building blocks for blend_blocks_subtract_builder, selected by token
 * pasting. The empty variants expand to nothing.
 */

/* Textured: lanes whose upcoming texel has bit 15 set are to be blended. */
#define blend_blocks_subtract_set_blend_mask_textured() \
  vclt.s16 blend_mask, pixels_next, #0

/* Textured: lanes that are not blended take the (msb-masked) texel. */
#define blend_blocks_subtract_combine_textured() \
  vbif.u16 blend_pixels, pixels, blend_mask

/* Textured: force bit 15 on in the blended result. */
#define blend_blocks_subtract_set_stb_textured() \
  vorr.u16 blend_pixels, #0x8000

/* Textured: keep the current texels, ORed with msb_mask, in pixels. */
#define blend_blocks_subtract_msb_mask_textured() \
  vorr.u16 pixels, pixels_next, msb_mask

/* Untextured: every lane is blended; only msb_mask is applied. */
#define blend_blocks_subtract_set_blend_mask_untextured()

#define blend_blocks_subtract_combine_untextured()

#define blend_blocks_subtract_set_stb_untextured() \
  vorr.u16 blend_pixels, blend_pixels, msb_mask

#define blend_blocks_subtract_msb_mask_untextured()

/* Mask evaluation on: framebuffer pixels with bit 15 set are protected. */
#define blend_blocks_subtract_mask_set_on() \
  vclt.s16 write_mask, fb_pixels, #0

#define blend_blocks_subtract_mask_copy_on() \
  vorr.u16 draw_mask, draw_mask_next, write_mask

/* Mask evaluation off: only the per-block draw mask applies. */
#define blend_blocks_subtract_mask_set_off()

#define blend_blocks_subtract_mask_copy_off() \
  vmov draw_mask, draw_mask_next
3973
3974
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_subtract_<mask_evaluate>(psx_gpu).
 *
 * Subtractive blend: the source red/blue (0x7C1F) and green (0x03E0)
 * fields are subtracted from the framebuffer fields with saturating
 * vqsub, so each channel bottoms out at zero. The texturing hooks
 * decide which lanes are blended and how bit 15 is set; the
 * mask_evaluate hooks fold framebuffer bit 15 into the draw mask.
 *
 * Block layout: draw mask at +0, pixels at +16, framebuffer pointer at
 * +44, 64 bytes per block. Framebuffer spans within 14 bytes of each
 * other take the store-then-reload path 2.
 */
#define blend_blocks_subtract_builder(texturing, mask_evaluate) \
.align 3; \
 \
function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
  push { r4, lr }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  /* draw_mask_ptr is r0 as well: last read of psx_gpu. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x03E0, #0x00E0; \
 \
  vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
  blend_blocks_subtract_set_blend_mask_##texturing(); \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
 \
  vand.u16 pixels_g, pixels_next, d128_0x03E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  blend_blocks_subtract_mask_copy_##mask_evaluate(); \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64; \
  blend_blocks_subtract_msb_mask_##texturing(); \
 \
  vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64; \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
  blend_blocks_subtract_set_stb_##texturing(); \
  vand.u16 pixels_g, pixels_next, d128_0x03E0; \
  blend_blocks_subtract_combine_##texturing(); \
  blend_blocks_subtract_set_blend_mask_##texturing(); \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two spans overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
 \
 3: \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block. */ \
  blend_blocks_subtract_mask_copy_##mask_evaluate(); \
 \
  blend_blocks_subtract_msb_mask_##texturing(); \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  blend_blocks_subtract_set_stb_##texturing(); \
  blend_blocks_subtract_combine_##texturing(); \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
 \
  pop { r4, pc }; \
 \
 2: \
  /* Overlapping spans: store first, then reload the framebuffer. */ \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  b 3b

/* Emit the subtractive blend routines. */
blend_blocks_subtract_builder(textured, off)
blend_blocks_subtract_builder(textured, on)
blend_blocks_subtract_builder(untextured, off)
blend_blocks_subtract_builder(untextured, on)
4072
4073
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_fourth_<mask_evaluate>(psx_gpu).
 *
 * "Add one quarter" blend for textured blocks: the source is shifted
 * right by two, its red/blue (0x1C07) and green (0x00E0) fields are
 * added to the framebuffer fields and clamped to 0x7C1F / 0x03E0. Lanes
 * whose source texel has bit 15 clear are not blended and take the
 * source texel instead (vbif with blend_mask). The broadcast mask_msb
 * value is ORed into every written lane, and lanes with the draw mask
 * set keep the framebuffer pixel.
 *
 * Block layout: draw mask at +0, pixels at +16, framebuffer pointer at
 * +44, 64 bytes per block. Framebuffer spans within 14 bytes of each
 * other take the store-then-reload path 2.
 *
 * Fix: the epilogue used to OR in msb_mask before re-inserting the
 * unblended texels, so those texels lost the mask bit in the final
 * block. It now uses the same order as the loop body (vbif first, then
 * msb_mask).
 *
 * The first block is handled unconditionally, so num_blocks must be at
 * least 1.
 */
#define blend_blocks_add_fourth_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_fourth_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  /* draw_mask_ptr is r0 as well: last read of psx_gpu. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vmov.u16 d128_0x1C07, #0x1C00; \
  vmov.u16 d128_0x00E0, #0x00E0; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x03E0, #0x00E0; \
  vorr.u16 d128_0x1C07, #0x0007; \
 \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vshr.s16 pixels_fourth, pixels, #2; \
  vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  /* Finish block i before its pixels and blend_mask are overwritten. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbif.u16 blend_pixels, pixels, blend_mask; \
 \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
  vshr.s16 pixels_fourth, pixels, #2; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two spans overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
 \
 3: \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block: same order as the loop body, vbif before msb_mask. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbif.u16 blend_pixels, pixels, blend_mask; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping spans: store first, then reload the framebuffer. */ \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  bal 3b
4174
d1c75d1e 4175
75e28f62
E
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_fourth_<mask_evaluate>(psx_gpu).
 *
 * "Add one quarter" blend for untextured blocks: every lane is blended.
 * The source is shifted right by two, its red/blue (0x1C07) and green
 * (0x00E0) fields are added to the framebuffer fields and clamped to
 * 0x7C1F / 0x03E0. The broadcast mask_msb value is ORed in and lanes
 * with the draw mask set keep the framebuffer pixel.
 *
 * Block layout: draw mask at +0, pixels at +16, framebuffer pointer at
 * +44, 64 bytes per block. Framebuffer spans within 14 bytes of each
 * other take the store-then-reload path 2.
 */
#define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
  push { r4, lr }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]; \
 \
  /* draw_mask_ptr is r0 as well: last read of psx_gpu. */ \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vmov.u16 d128_0x1C07, #0x1C00; \
  vmov.u16 d128_0x00E0, #0x00E0; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x03E0, #0x00E0; \
  vorr.u16 d128_0x1C07, #0x0007; \
 \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vshr.s16 pixels_fourth, pixels, #2; \
  vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [ pixel_ptr, #28 ]; \
 \
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64; \
 \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vshr.s16 pixels_fourth, pixels, #2; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64; \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two spans overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
 \
 3: \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [ fb_ptr_next ]; \
 \
  pop { r4, pc }; \
 \
 2: \
  /* Overlapping spans: store first, then reload the framebuffer. */ \
  vst1.u16 { blend_pixels }, [ fb_ptr ]; \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
 \
  vld1.u16 { fb_pixels }, [ fb_ptr_next ]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  b 3b
4271
4272
/* Emit the add-one-quarter blend routines, with and without mask evaluation. */
blend_blocks_add_fourth_textured_builder(off)
blend_blocks_add_fourth_textured_builder(on)
blend_blocks_add_fourth_untextured_builder(off)
blend_blocks_add_fourth_untextured_builder(on)
4277
// TODO: Optimize this more. Need a scene that actually uses it for
// confirmation..

.align 3

/*
 * blend_blocks_textured_unblended_on
 *
 * In:  psx_gpu = pointer to the GPU state.
 *
 * Writes textured blocks without blending, with mask evaluation on. For
 * each 64-byte block the texels at +16 are ORed with the broadcast
 * mask_msb value and stored through the framebuffer pointer at +44. A
 * lane keeps the framebuffer pixel if its draw mask (+0) is set or if
 * the framebuffer pixel already has bit 15 set.
 *
 * Unlike the other blend routines this loop is not pipelined: each
 * block is loaded, merged and stored before the next one is fetched, so
 * no overlap handling is needed.
 *
 * The first block is handled unconditionally, so num_blocks must be at
 * least 1.
 */
function(blend_blocks_textured_unblended_on)
  push { r4, lr }
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]

  /* draw_mask_ptr is r0 as well: last read of psx_gpu. */
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
  mov c_64, #64

  /* Load the first block. */
  ldr fb_ptr, [ pixel_ptr, #28 ]
  vld1.u16 { fb_pixels }, [ fb_ptr ]
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
  vclt.s16 write_mask, fb_pixels, #0
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64

  subs num_blocks, num_blocks, #1
  beq 1f

 0:
  /* Merge and store the current block. */
  vorr.u16 pixels, pixels, msb_mask
  vorr.u16 draw_mask, draw_mask, write_mask
  vbif.u16 fb_pixels, pixels, draw_mask
  vst1.u16 { fb_pixels }, [ fb_ptr ]

  /* Load the next block. */
  ldr fb_ptr, [ pixel_ptr, #28 ]
  vld1.u16 { fb_pixels }, [ fb_ptr ]
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
  vclt.s16 write_mask, fb_pixels, #0
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64

  subs num_blocks, num_blocks, #1
  bne 0b

 1:
  /* Final block. */
  vorr.u16 pixels, pixels, msb_mask
  vorr.u16 draw_mask, draw_mask, write_mask
  vbif.u16 fb_pixels, pixels, draw_mask
  vst1.u16 { fb_pixels }, [ fb_ptr ]

  pop { r4, pc }
4325
4326
// blend_blocks_textured_unblended_off
// Returns immediately without touching any register or memory.
function(blend_blocks_textured_unblended_off)
  bx lr


// warmup
//  r0: number of loads to issue (returns at once when zero)
//  r1: start address, 16-byte aligned (":128"); advanced by 64 per load
// Reads memory into u_whole_8 / v_whole_8 and discards the data.
// Clobbers r0, r1, r3, those two NEON registers and the flags.
// NOTE(review): looks like a cache-priming helper - confirm with callers.
function(warmup)
  mov r3, #64
  cmp r0, #0
  bxeq lr

 0:
  vld1.u32 { u_whole_8, v_whole_8 }, [ r1, :128 ], r3

  subs r0, r0, #1
  bne 0b

  bx lr

/* Register aliases for render_block_fill_body. */
#undef vram_ptr
#undef color
#undef width
#undef height
#undef pitch

#define vram_ptr r0
#define color r1
#define width r2
#define height r3

/* pitch shares r1 with color; color is consumed before pitch is written. */
#define pitch r1

#define num_width r12

#undef colors_a
#undef colors_b

#define colors_a q0
#define colors_b q1

.align 3

// render_block_fill_body
// Fills a width x height rectangle of 16-bit pixels with a single color.
//  r0: vram_ptr - first destination pixel, 32-byte aligned (":256" stores)
//  r1: color    - low 16 bits are replicated into every pixel
//  r2: width    - in pixels; each store covers 16 pixels
//  r3: height   - in rows; consecutive rows are 2048 bytes apart
// Both counters run down to exactly zero, so width has to be a non-zero
// multiple of 16 and height has to be non-zero.
// Clobbers r0, r1, r3, r12, q0, q1 and the flags.

function(render_block_fill_body)
  vdup.u16 colors_a, color
  mov pitch, #2048

  vmov colors_b, colors_a
  // pitch = bytes from the end of one filled row to the start of the next.
  sub pitch, pitch, width, lsl #1

  mov num_width, width

 0:
  vst1.u32 { colors_a, colors_b }, [ vram_ptr, :256 ]!

  subs num_width, num_width, #16
  bne 0b

  add vram_ptr, vram_ptr, pitch
  mov num_width, width

  // The row loop re-enters at the same label as the store loop.
  subs height, height, #1
  bne 0b

  bx lr


/*
 * Register aliases for the sprite setup code that follows. Several
 * aliases share one physical register, so a value is only valid until
 * the next alias of the same register is written.
 * NOTE(review): d8 - d15 are callee-saved under the AAPCS, and the sprite
 * entry point in this file only pushes core registers - confirm that the
 * callers tolerate these registers being overwritten.
 */
#undef x
#undef y
#undef width
#undef height
#undef fb_ptr
#undef texture_mask
#undef num_blocks
#undef temp
#undef dirty_textures_mask
#undef clut_ptr
#undef current_texture_mask

/* Aliases used while the arguments are being decoded. */
#define psx_gpu r0
#define x r1
#define y r2
#define u r3
#define v r4
#define width r5
#define height r6
#define offset_u r8
#define offset_v r9
#define offset_u_right r10
#define width_rounded r11
#define height_rounded r12

/* Aliases used by the tile loops. */
#define texture_offset_base r1
#define tile_width r2
#define tile_height r3
#define num_blocks r4
#define block r5
#define sub_tile_height r6
#define fb_ptr r7
#define texture_mask r8
#define column_data r9
#define texture_offset r10
#define tiles_remaining r11
#define fb_ptr_advance_column r12
#define texture_block_ptr r14

#define temp r14

#define texture_page_ptr r3
#define left_block_mask r4
#define right_block_mask r5
#define texture_mask_rev r10
#define control_mask r11

#define dirty_textures_mask r4
#define clut_ptr r5
#define current_texture_mask r6


#undef texels
#undef clut_low_a
#undef clut_low_b
#undef clut_high_a
#undef clut_high_b
#undef clut_a
#undef clut_b
#undef texels_low
#undef texels_high

#define texels d0
#define draw_masks_fb_ptrs q1

#define draw_mask_fb_ptr_left d2
#define draw_mask_fb_ptr_right d3

/* The 4x code uses four mask/pointer registers; _left_a/_left_b alias d2/d3. */
#define draw_mask_fb_ptr_left_a d2
#define draw_mask_fb_ptr_left_b d3
#define draw_mask_fb_ptr_right_a d10
#define draw_mask_fb_ptr_right_b d11
#define draw_masks_fb_ptrs2 q5

/* clut_low_* / clut_high_* are the two halves of clut_a / clut_b. */
#define clut_low_a d4
#define clut_low_b d5
#define clut_high_a d6
#define clut_high_b d7

#define block_masks d8
#define block_masks_shifted d9

#define clut_a q2
#define clut_b q3

#define texels_low d12
#define texels_high d13

/* texels_wide_low / texels_wide_high are the two halves of texels_wide. */
#define texels_wide_low d14
#define texels_wide_high d15
#define texels_wide q7


// setup_sprite_flush_blocks
// Calls flush_render_block_buffer on behalf of the tile macros and then
// points block back at the start of the block buffer. q1 - q5, r0 - r3,
// r12 and r14 are preserved across the call.
// NOTE(review): EXTRA_UNSAVED_REGS is defined outside this view; it is
// presumably an optional, comma-terminated list of further registers.
setup_sprite_flush_blocks:
  vpush { q1 - q5 }

  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
  bl flush_render_block_buffer
  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }

  vpop { q1 - q5 }

  add block, psx_gpu, #psx_gpu_blocks_offset
  bx lr


// setup_sprite_update_texture_4bpp_cache
// Wrapper around update_texture_4bpp_cache that preserves r0 - r3 and
// returns to the caller by popping the saved link register into pc.
setup_sprite_update_texture_4bpp_cache:
  stmdb sp!, { r0 - r3, r14 }
  bl update_texture_4bpp_cache
  ldmia sp!, { r0 - r3, pc }


// setup_sprite_update_texture_8bpp_cache
// Wrapper around update_texture_8bpp_cache that preserves r0 - r3 (plus
// whatever EXTRA_UNSAVED_REGS names) and returns by popping the saved
// link register into pc.
setup_sprite_update_texture_8bpp_cache:
  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 }
  bl update_texture_8bpp_cache
  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc }


/*
 * 4bpp prologue: loads 32 bytes of CLUT into clut_a / clut_b, splits them
 * into even bytes (clut_a) and odd bytes (clut_b) for the vtbl lookups,
 * and refreshes the 4bpp texture cache when the current texture mask
 * overlaps the dirty mask.
 */
#define setup_sprite_tiled_initialize_4bpp() \
  ldr dirty_textures_mask, \
   [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]; \
  ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \
  \
  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
  vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \
  \
  tst current_texture_mask, dirty_textures_mask; \
  vuzp.u8 clut_a, clut_b; \
  \
  blne setup_sprite_update_texture_4bpp_cache

/*
 * 8bpp prologue: refreshes the 8bpp texture cache when the current
 * texture mask overlaps the dirty mask. No CLUT is loaded here.
 */
#define setup_sprite_tiled_initialize_8bpp() \
  ldr dirty_textures_mask, \
   [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]; \
  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
  \
  tst current_texture_mask, dirty_textures_mask; \
  blne setup_sprite_update_texture_8bpp_cache


/* Operand text: one block per row of the sub tile. */
#define setup_sprite_block_count_single() \
  sub_tile_height

/* Operand text: two blocks per row of the sub tile. */
#define setup_sprite_block_count_double() \
  sub_tile_height, lsl #1

/*
 * Reserves the blocks for the next sub tile. When the running total
 * would exceed MAX_BLOCKS, the queued blocks are flushed first and the
 * count restarts at the size of this sub tile.
 */
#define setup_sprite_tile_add_blocks(type) \
  add num_blocks, num_blocks, setup_sprite_block_count_##type(); \
  cmp num_blocks, #MAX_BLOCKS; \
  \
  movgt num_blocks, setup_sprite_block_count_##type(); \
  blgt setup_sprite_flush_blocks


/*
 * Emits two blocks (left and right 8 pixels) per row for sub_tile_height
 * rows of a 4bpp tile. For each block, 8 texel bytes are fetched from the
 * texture page, looked up in the split CLUT tables, and stored
 * interleaved as 16-bit pixels at block + 0; the draw mask / framebuffer
 * pointer pair goes to block + 40. Blocks are 64 bytes apart.
 * The edge argument is not used by this variant.
 */
#define setup_sprite_tile_full_4bpp(edge) \
  setup_sprite_tile_add_blocks(double); \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
  \
  pld [ fb_ptr ]; \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
  add texture_block_ptr, texture_offset, #8; \
  \
  and texture_block_ptr, texture_block_ptr, texture_mask; \
  add block, block, #40; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  add fb_ptr, fb_ptr, #16; \
  \
  vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
  add block, block, #24; \
  \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  \
  pld [ fb_ptr ]; \
  vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
  add block, block, #40; \
  \
  add texture_offset, texture_offset, #0x10; \
  /* Back to the left block, one framebuffer row (2048 bytes) down. */ \
  add fb_ptr, fb_ptr, #(2048 - 16); \
  \
  vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
  add block, block, #24; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]


/*
 * Emits one block per row for sub_tile_height rows of a 4bpp tile, using
 * the draw mask / framebuffer pointer register selected by edge (left or
 * right). Pixels go to block + 0 and the mask / pointer pair to
 * block + 40; blocks are 64 bytes apart.
 * The second "add texture_block_ptr, texture_page_ptr, ..." that used to
 * follow the store was removed: its result was never read, because
 * texture_block_ptr is recomputed at the top of every iteration.
 */
#define setup_sprite_tile_half_4bpp(edge) \
  setup_sprite_tile_add_blocks(single); \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
  \
  pld [ fb_ptr ]; \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vst2.u8 { texels_low, texels_high }, [ block, :128 ]; \
  add block, block, #40; \
  \
  vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
  \
  add block, block, #24; \
  add texture_offset, texture_offset, #0x10; \
  \
  add fb_ptr, fb_ptr, #2048; \
  subs sub_tile_height, sub_tile_height, #1; \
  \
  bne 4b; \
  \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]


/*
 * Emits two blocks (left and right 8 texels) per row for sub_tile_height
 * rows of an 8bpp tile. The 8 raw texel bytes are stored at block + 16
 * and the draw mask / framebuffer pointer pair at block + 40. block is
 * biased by 16 for the duration of the loop and restored afterwards.
 * The edge argument is not used by this variant.
 */
#define setup_sprite_tile_full_8bpp(edge) \
  setup_sprite_tile_add_blocks(double); \
  add block, block, #16; \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
  \
  pld [ fb_ptr ]; \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  add texture_block_ptr, texture_offset, #8; \
  vst1.u32 { texels }, [ block, :64 ]; \
  \
  and texture_block_ptr, texture_block_ptr, texture_mask; \
  add block, block, #24; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  \
  add fb_ptr, fb_ptr, #16; \
  vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ]; \
  \
  add block, block, #40; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  pld [ fb_ptr ]; \
  \
  vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
  vst1.u32 { texels }, [ block, :64 ]; \
  add block, block, #24; \
  \
  add texture_offset, texture_offset, #0x10; \
  /* Back to the left block, one framebuffer row (2048 bytes) down. */ \
  add fb_ptr, fb_ptr, #(2048 - 16); \
  \
  vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ]; \
  add block, block, #40; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  sub block, block, #16; \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]


/*
 * Emits one block per row for sub_tile_height rows of an 8bpp tile, using
 * the draw mask / framebuffer pointer register selected by edge (left or
 * right). Raw texel bytes go to block + 16 and the mask / pointer pair to
 * block + 40; block is biased by 16 inside the loop and restored after.
 */
#define setup_sprite_tile_half_8bpp(edge) \
  setup_sprite_tile_add_blocks(single); \
  add block, block, #16; \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
  pld [ fb_ptr ]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  vst1.u32 { texels }, [ block, :64 ]; \
  add block, block, #24; \
  \
  vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ]; \
  add block, block, #40; \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #2048; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  sub block, block, #16; \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]


/*
 * Per-column adjustments made before and after a column of tiles.
 * A "half right" column starts 8 texels into the tile and 16 bytes
 * (8 pixels) further right in the framebuffer; the post adjustment takes
 * the 16 bytes off again. Every other case only copies the column base
 * offset into texture_offset.
 */
#define setup_sprite_tile_column_edge_pre_adjust_half_right() \
  add texture_offset, texture_offset_base, #8; \
  add fb_ptr, fb_ptr, #16

#define setup_sprite_tile_column_edge_pre_adjust_half_left() \
  mov texture_offset, texture_offset_base

#define setup_sprite_tile_column_edge_pre_adjust_half(edge) \
  setup_sprite_tile_column_edge_pre_adjust_half_##edge()

#define setup_sprite_tile_column_edge_pre_adjust_full(edge) \
  mov texture_offset, texture_offset_base

#define setup_sprite_tile_column_edge_post_adjust_half_right() \
  sub fb_ptr, fb_ptr, #16

#define setup_sprite_tile_column_edge_post_adjust_half_left()

#define setup_sprite_tile_column_edge_post_adjust_half(edge) \
  setup_sprite_tile_column_edge_post_adjust_half_##edge()

#define setup_sprite_tile_column_edge_post_adjust_full(edge)


/*
 * One column that fits in a single tile vertically: column_data holds
 * the row count directly.
 */
#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode, \
 x4mode) \
  mov sub_tile_height, column_data; \
  setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
  setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)

/*
 * One column spanning several tiles vertically. column_data is packed:
 *   bits  0 -  7: rows in the first tile
 *   bits  8 - 15: rows in the last tile
 *   bits 16 and up: number of tiles that follow the first one
 * Tiles between the first and the last are always 16 rows high.
 */
#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode, \
 x4mode) \
  and sub_tile_height, column_data, #0xFF; \
  mov tiles_remaining, column_data, lsr #16; \
  setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
  \
  subs tiles_remaining, tiles_remaining, #1; \
  beq 2f; \
  \
 3: \
  mov sub_tile_height, #16; \
  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
  subs tiles_remaining, tiles_remaining, #1; \
  bne 3b; \
  \
 2: \
  uxtb sub_tile_height, column_data, ror #8; \
  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
  setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)


/* Single-tile-high sprite: column_data is simply the height in rows. */
#define setup_sprite_column_data_single() \
  mov column_data, height; \
  ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]

/*
 * Multi-tile-high sprite: packs into column_data
 *   bits  0 -  7: 16 - offset_v                 (rows in the first tile)
 *   bits  8 - 15: (height_rounded & 0xF) + 1    (rows in the last tile)
 *   bits 16 and up: tile_height - 1             (tiles after the first)
 * texture_page_ptr shares a register with tile_height, so it is loaded
 * only after tile_height has been merged in.
 */
#define setup_sprite_column_data_multi() \
  and height_rounded, height_rounded, #0xF; \
  rsb column_data, offset_v, #16; \
  \
  add height_rounded, height_rounded, #1; \
  sub tile_height, tile_height, #1; \
  \
  orr column_data, column_data, tile_height, lsl #16; \
  ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]; \
  \
  orr column_data, column_data, height_rounded, lsl #8

/*
 * Broadcast one mask byte into every byte of each draw mask register.
 * Bytes 0 and 1 of block_masks belong to the leftmost column, bytes 4
 * and 5 to the rightmost column.
 */
#define setup_sprite_setup_left_draw_mask_fb_ptr() \
  vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
  vdup.u8 draw_mask_fb_ptr_right, block_masks[1]

/*
 * Same as above, and also computes fb_ptr_advance_column =
 * 32 - height * 2048: the byte step from just below one finished column
 * to the top of the next column 16 pixels to the right.
 */
#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column() \
  mov fb_ptr_advance_column, #32; \
  vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
  \
  sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11; \
  vdup.u8 draw_mask_fb_ptr_right, block_masks[1]

#define setup_sprite_setup_right_draw_mask_fb_ptr() \
  vdup.u8 draw_mask_fb_ptr_left, block_masks[4]; \
  vdup.u8 draw_mask_fb_ptr_right, block_masks[5]

/*
 * Defines the handler labelled
 *   setup_sprite_<tm>_single_<multi_height>_<edge_mode>_<edge><x4mode>
 * for sprites one tile column wide. The two 32-bit halves of block_masks
 * are ORed together so that the single column gets both the left and the
 * right edge mask. Ends by popping r4 - r11 and returning to the caller
 * of the sprite entry point.
 */
#define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, \
 edge, x4mode) \
 setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode: \
  setup_sprite_column_data_##multi_height(); \
  vext.32 block_masks_shifted, block_masks, block_masks, #1; \
  vorr.u32 block_masks, block_masks, block_masks_shifted; \
  setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \
  \
  setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
  ldmia sp!, { r4 - r11, pc }

/*
 * Steps bits 8 - 11 of texture_offset_base to the next tile column. When
 * the field wraps to zero, 0x1000 is subtracted again so that the carry
 * does not leak into the bits above it.
 */
#define setup_sprite_tiled_advance_column() \
  add texture_offset_base, texture_offset_base, #0x100; \
  tst texture_offset_base, #0xF00; \
  subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00)

/*
 * Defines the handler labelled
 *   setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode><x4mode>
 * for sprites two or more tile columns wide:
 *   - the leftmost column uses left_mode and the left edge masks,
 *   - interior columns (tile_width - 2 of them) are drawn full with all
 *     draw masks cleared,
 *   - the rightmost column uses right_mode and the right edge masks.
 * Ends by popping r4 - r11 and returning to the caller of the sprite
 * entry point.
 */
#define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, \
 right_mode, x4mode) \
 setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
  setup_sprite_column_data_##multi_height(); \
  \
  setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode(); \
  \
  setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
  \
  subs tile_width, tile_width, #2; \
  add fb_ptr, fb_ptr, fb_ptr_advance_column; \
  \
  /* Flags still come from the subs above: no interior columns. */ \
  beq 1f; \
  \
  vmov.u8 draw_masks_fb_ptrs, #0; \
  vmov.u8 draw_masks_fb_ptrs2, #0; \
  \
 0: \
  setup_sprite_tiled_advance_column(); \
  setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode); \
  add fb_ptr, fb_ptr, fb_ptr_advance_column; \
  subs tile_width, tile_width, #1; \
  bne 0b; \
  \
 1: \
  setup_sprite_setup_right_draw_mask_fb_ptr##x4mode(); \
  \
  setup_sprite_tiled_advance_column(); \
  setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
  ldmia sp!, { r4 - r11, pc }


/*
 * Helpers for the 1x sprite entry point. The "get" macros isolate one
 * mask byte (bits 0 - 7 of the left mask, bits 8 - 15 of the right mask);
 * the "compare" macros set Z when that byte is 0xFF.
 */
#define setup_sprite_offset_u_adjust()

#define setup_sprite_get_left_block_mask() \
  and left_block_mask, left_block_mask, #0xFF

#define setup_sprite_compare_left_block_mask() \
  cmp left_block_mask, #0xFF

#define setup_sprite_get_right_block_mask() \
  uxtb right_block_mask, right_block_mask, ror #8

#define setup_sprite_compare_right_block_mask() \
  cmp right_block_mask, #0xFF



/* 4x stuff */
/* fb_ptr2 reuses the column_data register; the 4x tile macros save and */
/* restore column_data on the stack around their loops. */
#define fb_ptr2 column_data

/*
 * Takes another offset_u * 2 bytes off fb_ptr, then replaces offset_u
 * with offset_u * 2 and offset_u_right with offset_u_right * 2 + 1.
 */
#define setup_sprite_offset_u_adjust_4x() \
  sub fb_ptr, fb_ptr, offset_u, lsl #1; \
  lsl offset_u_right, #1; \
  lsl offset_u, #1; \
  add offset_u_right, #1

/*
 * The 4x "get" macros sign-extend a 16-bit mask (bits 0 - 15 of the left
 * mask, bits 16 - 31 of the right mask); the "compare" macros set Z when
 * all 16 bits are ones.
 */
#define setup_sprite_get_left_block_mask_4x() \
  sxth left_block_mask, left_block_mask

#define setup_sprite_compare_left_block_mask_4x() \
  cmp left_block_mask, #0xFFFFFFFF

#define setup_sprite_get_right_block_mask_4x() \
  sxth right_block_mask, right_block_mask, ror #16

#define setup_sprite_compare_right_block_mask_4x() \
  cmp right_block_mask, #0xFFFFFFFF


/* texels_wide = every 16-bit element of texels_ repeated twice. */
#define widen_texels_16bpp(texels_) \
  vmov texels_wide_low, texels_; \
  vmov texels_wide_high, texels_; \
  vzip.16 texels_wide_low, texels_wide_high

/* texels_wide = every byte of texels_ repeated twice. */
#define widen_texels_8bpp(texels_) \
  vmov texels_wide_low, texels_; \
  vmov texels_wide_high, texels_; \
  vzip.8 texels_wide_low, texels_wide_high

/*
 * Stores one block: texels_ at block_ + 0, then the draw mask /
 * framebuffer pointer pair at block_ + 40 with lane 1 set to fb_ptr_.
 * Leaves block_ advanced by 64.
 */
#define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
  vst1.u32 { texels_ }, [ block_, :128 ]; \
  add block_, block_, #40; \
  \
  vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
  vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ]; \
  add block_, block_, #24

/* assumes 16-byte offset already added to block_ */
#define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
  vst1.u32 { texels_ }, [ block_, :64 ]; \
  add block_, block_, #24; \
  \
  vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
  vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ]; \
  add block_, block_, #40

/*
 * Turns 8 source pixels (texels_low / texels_high, 16 bits each) into
 * four blocks covering 16 pixels by 2 rows: every pixel is doubled
 * horizontally, and each widened half is written once at the current row
 * and once 2048 bytes further on. fb_ptr_tmp is a scratch register.
 */
#define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
 draw_mask_fb_ptr_b_) \
  widen_texels_16bpp(texels_low); \
  add fb_ptr_tmp, fb_ptr, #1024*2; \
  \
  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr); \
  \
  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
  widen_texels_16bpp(texels_high); \
  \
  add fb_ptr_tmp, fb_ptr, #8*2; \
  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
  \
  add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)

/*
 * 8bpp counterpart: the 8 texel bytes are doubled once, then the low and
 * high halves of the result are each written for both rows.
 */
#define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
 draw_mask_fb_ptr_b_) \
  widen_texels_8bpp(texels); \
  add fb_ptr_tmp, fb_ptr, #1024*2; \
  \
  write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr); \
  write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
  \
  add fb_ptr_tmp, fb_ptr, #8*2; \
  write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
  \
  add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
  write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)


/*
 * 4x / 4bpp prologue: loads and splits the CLUT only. Unlike the 1x
 * prologue it does not test the dirty texture mask.
 */
#define setup_sprite_tiled_initialize_4bpp_4x() \
  ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]; \
  vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]; \
  \
  vuzp.u8 clut_a, clut_b

/* 4x / 8bpp prologue: intentionally empty. */
#define setup_sprite_tiled_initialize_8bpp_4x()


/* Operand text: four blocks per source row. */
#define setup_sprite_block_count_single_4x() \
  sub_tile_height, lsl #2

/* Operand text: eight blocks per source row. */
#define setup_sprite_block_count_double_4x() \
  sub_tile_height, lsl #(1+2)

/*
 * 4x version of the full 4bpp tile: each source row yields eight blocks
 * (left and right 8 texels, each doubled to 16 pixels by 2 rows).
 * column_data is pushed for the duration of the loop because its
 * register doubles as the fb_ptr2 scratch pointer. fb_ptr advances by two
 * framebuffer rows per source row. The edge argument is not used.
 */
#define setup_sprite_tile_full_4bpp_4x(edge) \
  setup_sprite_tile_add_blocks(double_4x); \
  str column_data, [sp, #-8]!; /* fb_ptr2 */ \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  pld [ fb_ptr ]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  add texture_block_ptr, texture_offset, #8; \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  \
  and texture_block_ptr, texture_block_ptr, texture_mask; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vzip.8 texels_low, texels_high; \
  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
   draw_mask_fb_ptr_left_b); \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  pld [ fb_ptr, #2048 ]; \
  \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  add fb_ptr, fb_ptr, #16*2; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vzip.8 texels_low, texels_high; \
  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
   draw_mask_fb_ptr_right_b); \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  ldr column_data, [sp], #8; /* fb_ptr2 */ \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]


/*
 * 4x version of the half 4bpp tile: each source row yields four blocks,
 * using the mask / pointer registers selected by edge (left or right).
 * column_data is pushed for the duration of the loop because its
 * register doubles as the fb_ptr2 scratch pointer.
 * The second "add texture_block_ptr, texture_page_ptr, ..." that used to
 * follow the load was removed: its result was never read, because
 * texture_block_ptr is recomputed at the top of every iteration.
 */
#define setup_sprite_tile_half_4bpp_4x(edge) \
  setup_sprite_tile_add_blocks(single_4x); \
  str column_data, [sp, #-8]!; /* fb_ptr2 */ \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  pld [ fb_ptr ]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  add texture_offset, texture_offset, #0x10; \
  \
  vzip.8 texels_low, texels_high; \
  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
   draw_mask_fb_ptr_##edge##_b); \
  \
  pld [ fb_ptr, #2048 ]; \
  add fb_ptr, fb_ptr, #2048 * 2; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  ldr column_data, [sp], #8; /* fb_ptr2 */ \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]


/*
 * 4x version of the full 8bpp tile: each source row yields eight blocks.
 * block is biased by 16 inside the loop (write_block_8bpp expects that)
 * and column_data is pushed because its register doubles as fb_ptr2.
 * The edge argument is not used.
 */
#define setup_sprite_tile_full_8bpp_4x(edge) \
  setup_sprite_tile_add_blocks(double_4x); \
  add block, block, #16; \
  str column_data, [sp, #-8]!; /* fb_ptr2 */ \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  pld [ fb_ptr ]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  add texture_block_ptr, texture_offset, #8; \
  do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
   draw_mask_fb_ptr_left_b); \
  \
  pld [ fb_ptr, #2048 ]; \
  and texture_block_ptr, texture_block_ptr, texture_mask; \
  \
  add fb_ptr, fb_ptr, #16*2; \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
   draw_mask_fb_ptr_right_b); \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  sub block, block, #16; \
  ldr column_data, [sp], #8; /* fb_ptr2 */ \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]


/*
 * 4x version of the half 8bpp tile: each source row yields four blocks,
 * using the mask / pointer registers selected by edge (left or right).
 * block is biased by 16 inside the loop and column_data is pushed
 * because its register doubles as fb_ptr2.
 */
#define setup_sprite_tile_half_8bpp_4x(edge) \
  setup_sprite_tile_add_blocks(single_4x); \
  add block, block, #16; \
  str column_data, [sp, #-8]!; /* fb_ptr2 */ \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  pld [ fb_ptr ]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [ texture_block_ptr, :64 ]; \
  \
  pld [ fb_ptr, #2048 ]; \
  do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
   draw_mask_fb_ptr_##edge##_b); \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #2048 * 2; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  sub block, block, #16; \
  ldr column_data, [sp], #8; /* fb_ptr2 */ \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]


/*
 * 4x counterparts of the column edge adjustments: identical to the 1x
 * macros except that the framebuffer step for a "half right" column is
 * doubled to 32 bytes.
 */
#define setup_sprite_tile_column_edge_pre_adjust_half_right_4x() \
  add texture_offset, texture_offset_base, #8; \
  add fb_ptr, fb_ptr, #16 * 2

#define setup_sprite_tile_column_edge_pre_adjust_half_left_4x() \
  mov texture_offset, texture_offset_base

#define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge) \
  setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x()

#define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge) \
  mov texture_offset, texture_offset_base

#define setup_sprite_tile_column_edge_post_adjust_half_right_4x() \
  sub fb_ptr, fb_ptr, #16 * 2

#define setup_sprite_tile_column_edge_post_adjust_half_left_4x()

#define setup_sprite_tile_column_edge_post_adjust_half_4x(edge) \
  setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x()

#define setup_sprite_tile_column_edge_post_adjust_full_4x(edge)


/*
 * 4x draw mask setup: four mask registers per column instead of two.
 * Bytes 0 - 3 of block_masks belong to the leftmost column, bytes 4 - 7
 * to the rightmost column.
 */
#define setup_sprite_setup_left_draw_mask_fb_ptr_4x() \
  vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
  vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
  vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
  vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]

/*
 * Same as above, and also computes fb_ptr_advance_column =
 * 64 - height * 4096: both the column width and the row count are
 * doubled relative to the 1x version.
 */
#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x() \
  mov fb_ptr_advance_column, #32 * 2; \
  vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
  vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
  sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1; \
  vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
  vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]

#define setup_sprite_setup_right_draw_mask_fb_ptr_4x() \
  vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4]; \
  vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5]; \
  vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6]; \
  vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7]


// r0: psx_gpu
// r1: x
// r2: y
// r3: u
// [ sp ]: v
// [ sp + 4 ]: width
// [ sp + 8 ]: height
// [ sp + 12 ]: color (unused)
//
// setup_sprite_tiled_builder(texture_mode, x4mode) emits the fourteen
// column handlers followed by the entry point
// setup_sprite_<texture_mode><x4mode>. The entry point pushes r4 - r11
// and lr (36 bytes, hence the stack arguments at sp + 36 / 40 / 44),
// derives the tile counts, edge masks and texture offsets, and jumps
// through a table indexed by control_mask:
//   0x1: exactly one tile column     0x2: exactly one tile row
//   0x4: leftmost column is "half"   0x8: rightmost column is "half"
// The handlers return directly to the caller of the entry point.
// NOTE(review): index 13 is a null word and index 15 lies past the end of
// the table (one tile column with both halves masked). This is presumably
// unreachable for width >= 1 - confirm that callers never pass width 0.
// NOTE(review): JT_OP_REL, JT_OP and JTE are defined outside this view;
// they presumably select between an absolute and a relative jump table.

#define setup_sprite_tiled_builder(texture_mode, x4mode) \
  \
setup_sprite_tile_column_width_multi(texture_mode, multi, full, full, \
 x4mode); \
setup_sprite_tile_column_width_single(texture_mode, multi, full, none, \
 x4mode); \
setup_sprite_tile_column_width_multi(texture_mode, single, full, full, \
 x4mode); \
setup_sprite_tile_column_width_single(texture_mode, single, full, none, \
 x4mode); \
setup_sprite_tile_column_width_multi(texture_mode, multi, half, full, \
 x4mode); \
setup_sprite_tile_column_width_single(texture_mode, multi, half, right, \
 x4mode); \
setup_sprite_tile_column_width_multi(texture_mode, single, half, full, \
 x4mode); \
setup_sprite_tile_column_width_single(texture_mode, single, half, right, \
 x4mode); \
setup_sprite_tile_column_width_multi(texture_mode, multi, full, half, \
 x4mode); \
setup_sprite_tile_column_width_single(texture_mode, multi, half, left, \
 x4mode); \
setup_sprite_tile_column_width_multi(texture_mode, single, full, half, \
 x4mode); \
setup_sprite_tile_column_width_single(texture_mode, single, half, left, \
 x4mode); \
setup_sprite_tile_column_width_multi(texture_mode, multi, half, half, \
 x4mode); \
setup_sprite_tile_column_width_multi(texture_mode, single, half, half, \
 x4mode); \
  \
.align 4; \
  \
function(setup_sprite_##texture_mode##x4mode) \
  stmdb sp!, { r4 - r11, r14 }; \
  setup_sprite_tiled_initialize_##texture_mode##x4mode(); \
  \
  ldr v, [ sp, #36 ]; \
  and offset_u, u, #0xF; \
  \
  ldr width, [ sp, #40 ]; \
  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]; \
  \
  ldr height, [ sp, #44 ]; \
  add fb_ptr, fb_ptr, y, lsl #11; \
  \
  add fb_ptr, fb_ptr, x, lsl #1; \
  and offset_v, v, #0xF; \
  \
  /* fb_ptr now addresses the left edge of the first tile, not of x. */ \
  sub fb_ptr, fb_ptr, offset_u, lsl #1; \
  add width_rounded, offset_u, width; \
  \
  add height_rounded, offset_v, height; \
  add width_rounded, width_rounded, #15; \
  \
  add height_rounded, height_rounded, #15; \
  mov tile_width, width_rounded, lsr #4; \
  \
  /* texture_offset_base = VH-VL-00-00 */ \
  mov texture_offset_base, v, lsl #8; \
  and offset_u_right, width_rounded, #0xF; \
  \
  /* texture_offset_base = VH-UH-UL-00 */ \
  bfi texture_offset_base, u, #4, #8; \
  mov right_block_mask, #0xFFFFFFFE; \
  \
  setup_sprite_offset_u_adjust##x4mode(); \
  \
  /* texture_offset_base = VH-UH-VL-00 */ \
  bfi texture_offset_base, v, #4, #4; \
  mov left_block_mask, #0xFFFFFFFF; \
  \
  mov tile_height, height_rounded, lsr #4; \
  /* left mask: the low offset_u bits are set. */ \
  mvn left_block_mask, left_block_mask, lsl offset_u; \
  \
  /* texture_mask = HH-HL-WH-WL */ \
  ldrh texture_mask, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]; \
  /* right mask: every bit above offset_u_right is set. */ \
  mov right_block_mask, right_block_mask, lsl offset_u_right; \
  \
  /* texture_mask_rev = WH-WL-HH-HL */ \
  rev16 texture_mask_rev, texture_mask; \
  vmov block_masks, left_block_mask, right_block_mask; \
  \
  /* texture_mask = HH-HL-HL-WL */ \
  bfi texture_mask, texture_mask_rev, #4, #4; \
  /* texture_mask_rev = 00-00-00-WH */ \
  mov texture_mask_rev, texture_mask_rev, lsr #12; \
  \
  /* texture_mask = HH-WH-HL-WL */ \
  bfi texture_mask, texture_mask_rev, #8, #4; \
  setup_sprite_get_left_block_mask##x4mode(); \
  \
  mov control_mask, #0; \
  setup_sprite_compare_left_block_mask##x4mode(); \
  \
  setup_sprite_get_right_block_mask##x4mode(); \
  orreq control_mask, control_mask, #0x4; \
  \
  /* num_blocks reuses the left_block_mask register from here on. */ \
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]; \
  setup_sprite_compare_right_block_mask##x4mode(); \
  \
  orreq control_mask, control_mask, #0x8; \
  cmp tile_width, #1; \
  \
  /* block reuses the right_block_mask register from here on. */ \
  add block, psx_gpu, #psx_gpu_blocks_offset; \
  orreq control_mask, control_mask, #0x1; \
  \
  cmp tile_height, #1; \
  /* Continue after the blocks already queued (64 bytes each). */ \
  add block, block, num_blocks, lsl #6; \
  \
  orreq control_mask, control_mask, #0x2; \
  JT_OP_REL(9f, control_mask, temp); \
  JT_OP(ldr pc, [ pc, control_mask, lsl #2 ]); \
  nop; \
  \
 9: \
  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode); \
  .word 0x00000000; \
  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode);


/* Instantiate the tiled sprite setup code for 4bpp and 8bpp textures.
 * The second macro argument is pasted onto label names as the x4mode
 * suffix: empty for the first pair, _4x for the second pair. */
5274setup_sprite_tiled_builder(4bpp,);
5275setup_sprite_tiled_builder(8bpp,);
75e28f62 5276
/* These two aliases are dropped before the _4x instantiations. */
59d15d23 5277#undef draw_mask_fb_ptr_left
5278#undef draw_mask_fb_ptr_right
75e28f62 5279
59d15d23 5280setup_sprite_tiled_builder(4bpp, _4x);
5281setup_sprite_tiled_builder(8bpp, _4x);
75e28f62
E
5282
5283
/* Register aliases for texture_sprite_blocks_8bpp.
 * psx_gpu and block_ptr share r0; texels_01..texels_67 reuse the
 * registers of texel_0, texel_2, texel_4 and texel_6. */
5284#undef block_ptr
5285#undef num_blocks
5286#undef clut_ptr
5287
5288#define psx_gpu r0
5289#define block_ptr r0
5290#define num_blocks r1
5291#define clut_ptr r2
5292#define texel_shift_mask r3
5293#define block_pixels_a r4
5294#define block_pixels_b r5
5295#define texel_0 r6
5296#define texel_2 r7
5297#define texel_4 r8
5298#define texel_6 r9
5299#define texel_1 r10
5300#define texel_3 r11
5301#define texel_5 r12
5302#define texel_7 r14
5303#define texels_01 r6
5304#define texels_23 r7
5305#define texels_45 r8
5306#define texels_67 r9
5307
/*
 * texture_sprite_blocks_8bpp
 * In:  r0 = psx_gpu
 * For each of the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) this reads the eight index bytes at
 * block offsets 16..23, looks each one up as a halfword in the table at
 * clut_ptr, and writes the eight resulting halfwords to block offsets 0..15.
 * Clobbers r0-r3, r12 and flags; r4-r11 and the return address are
 * saved and restored.
 * NOTE(review): the loop is do-while shaped, so a num_blocks of 0 would
 * wrap the counter - presumably callers guarantee at least one block.
 */
5308function(texture_sprite_blocks_8bpp)
5309 stmdb sp!, { r4 - r11, r14 }
/* Mask that turns a shifted index byte into a halfword offset (index * 2). */
5310 movw texel_shift_mask, #(0xFF << 1)
5311
5312 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5313 ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
5314
/* block_ptr aliases psx_gpu (r0), so psx_gpu is no longer available below. */
5315 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
5316 ldr block_pixels_a, [ block_ptr, #16 ]
5317
5318 0:
/* Index extraction and CLUT loads are interleaved; texel_N first holds the
 * table offset and is then overwritten by the looked-up halfword. */
5319 and texel_0, texel_shift_mask, block_pixels_a, lsl #1
5320 ldr block_pixels_b, [ block_ptr, #20 ]
5321
5322 and texel_1, texel_shift_mask, block_pixels_a, lsr #7
5323 ldrh texel_0, [ clut_ptr, texel_0 ]
5324
5325 and texel_2, texel_shift_mask, block_pixels_a, lsr #15
5326 ldrh texel_1, [ clut_ptr, texel_1 ]
5327
5328 and texel_3, texel_shift_mask, block_pixels_a, lsr #23
/* Preload the first four indices of the next block; this load is also
 * issued on the final iteration (one block past the last one processed). */
5329 ldr block_pixels_a, [ block_ptr, #(64 + 16) ]
5330
5331 ldrh texel_2, [ clut_ptr, texel_2 ]
5332 and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5333
5334 ldrh texel_3, [ clut_ptr, texel_3 ]
5335 and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5336
5337 ldrh texel_4, [ clut_ptr, texel_4 ]
5338 and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5339
5340 ldrh texel_5, [ clut_ptr, texel_5 ]
5341 and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5342
/* Pack pairs of 16-bit texels into words (even texel in the low half). */
5343 ldrh texel_6, [ clut_ptr, texel_6 ]
5344 orr texels_01, texel_0, texel_1, lsl #16
5345
5346 ldrh texel_7, [ clut_ptr, texel_7 ]
5347 orr texels_23, texel_2, texel_3, lsl #16
5348
5349 orr texels_45, texel_4, texel_5, lsl #16
5350 str texels_01, [ block_ptr, #0 ]
5351
5352 orr texels_67, texel_6, texel_7, lsl #16
5353 str texels_23, [ block_ptr, #4 ]
5354
/* The flags set here are consumed by the bne below; str/add leave them intact. */
5355 subs num_blocks, num_blocks, #1
5356 str texels_45, [ block_ptr, #8 ]
5357
5358 str texels_67, [ block_ptr, #12 ]
5359 add block_ptr, block_ptr, #64
5360
5361 bne 0b
5362
5363 ldmia sp!, { r4 - r11, pc }
5364
5365
/* Register aliases for setup_sprites_16bpp_flush, setup_sprite_16bpp and
 * setup_sprite_16bpp_4x. Several aliases deliberately share a register
 * (for example x / texture_offset_base in r1, v / num_blocks /
 * left_mask_bits in r4, width / block / right_mask_bits in r5), so an alias
 * is only valid until the register is rewritten under another name. */
5366#undef width_rounded
5367#undef texture_mask
5368#undef num_blocks
5369#undef texture_offset
59d15d23 5370#undef texels_low
5371#undef texels_high
5372#undef texels_wide_low
5373#undef texels_wide_high
5374#undef texels_wide
5375#undef fb_ptr2
8184d7c5 5376#undef temp
75e28f62
E
5377
/* Incoming arguments and values derived directly from them. */
5378#define psx_gpu r0
5379#define x r1
5380#define y r2
5381#define u r3
5382#define v r4
5383#define width r5
5384#define height r6
5385#define left_offset r8
5386#define width_rounded r9
5387#define right_width r10
59d15d23 5388
75e28f62
E
5389#define block_width r11
5390
/* Working registers used once the arguments have been consumed. */
5391#define texture_offset_base r1
5392#define texture_mask r2
5393#define texture_page_ptr r3
5394#define num_blocks r4
5395#define block r5
5396#define fb_ptr r7
5397#define texture_offset r8
5398#define blocks_remaining r9
59d15d23 5399#define fb_ptr2 r10
75e28f62
E
5400#define fb_ptr_pitch r12
5401#define texture_block_ptr r14
5402
5403#define texture_mask_width r2
5404#define texture_mask_height r3
5405#define left_mask_bits r4
5406#define right_mask_bits r5
5407
5408
5409#undef block_masks
5410#undef block_masks_shifted
5411#undef texels
5412
/* NEON aliases; draw_mask_fb_ptr and draw_mask_fb_ptr_a are both d2. */
5413#define block_masks d0
5414#define block_masks_shifted d1
5415#define draw_mask_fb_ptr d2
5416#define texels q2
5417
59d15d23 5418#define draw_mask_fb_ptr_a d2
5419#define draw_mask_fb_ptr_b d3
5420#define texels_low d4
5421#define texels_high d5
5422#define texels_wide_low d6
5423#define texels_wide_high d7
5424#define texels_wide q3
75e28f62 5425
75e28f62 5426
/*
 * setup_sprites_16bpp_flush - file-local helper reached with "blgt" from the
 * sprite setup loops when the block count would exceed MAX_BLOCKS.
 * Saves d0-d3 and r0-r3, EXTRA_UNSAVED_REGS, r12, r14 around a call to
 * flush_render_block_buffer, then rewinds "block" to the start of the block
 * array and sets num_blocks to block_width before returning with bx lr.
 * The caller must already hold psx_gpu in r0 and block_width in r11.
 */
59d15d23 5427setup_sprites_16bpp_flush:
5428 vpush { d0 - d3 }
75e28f62 5429
4d646738 5430 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 5431 bl flush_render_block_buffer
4d646738 5432 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 5433
59d15d23 5434 vpop { d0 - d3 }
75e28f62
E
5435
/* Restart block output at the first slot; the current row's blocks are the
 * only ones counted after the flush. */
5436 add block, psx_gpu, #psx_gpu_blocks_offset
5437 mov num_blocks, block_width
5438
5439 bx lr
5440
/*
 * setup_sprite_16bpp
 * In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u; v, width and height are read
 *      from the stack at [sp, #36], [sp, #40], [sp, #44] after the 9-register
 *      push, i.e. the first three stack words on entry.
 * Emits one 64-byte block per 8-texel group of each sprite row: 16 bytes of
 * texels at block offset 0 and an 8-byte { draw mask, fb_ptr } pair at block
 * offset 40. The block count is written back to psx_gpu after every row and
 * setup_sprites_16bpp_flush is called when it would exceed MAX_BLOCKS.
 */
5441function(setup_sprite_16bpp)
5442 stmdb sp!, { r4 - r11, r14 }
c1817bd9 5443 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
75e28f62
E
5444
/* fb_ptr = vram_out_ptr + y * 2048 + x * 2 */
5445 ldr v, [ sp, #36 ]
5446 add fb_ptr, fb_ptr, y, lsl #11
5447
5448 ldr width, [ sp, #40 ]
5449 add fb_ptr, fb_ptr, x, lsl #1
5450
5451 ldr height, [ sp, #44 ]
/* left_offset = u & 7: texels skipped at the start of the first block */
5452 and left_offset, u, #0x7
5453
/* texture_offset_base = u * 2 + v * 2048 (overwrites x, which shares r1) */
5454 add texture_offset_base, u, u
5455 add width_rounded, width, #7
5456
ed0fd81d 5457 add texture_offset_base, texture_offset_base, v, lsl #11
75e28f62
E
5458 mov left_mask_bits, #0xFF
5459
5460 ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
5461 add width_rounded, width_rounded, left_offset
5462
5463 ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
/* Step fb_ptr back to the start of the first (partially masked) block. */
5464 sub fb_ptr, fb_ptr, left_offset, lsl #1
5465
/* texture_mask = mask_width * 2 + mask_height * 2048 */
5466 add texture_mask, texture_mask_width, texture_mask_width
5467 mov right_mask_bits, #0xFE
5468
5469 and right_width, width_rounded, #0x7
/* left mask = ~(0xFF << left_offset); only its low byte is used later */
5470 mvn left_mask_bits, left_mask_bits, lsl left_offset
5471
ed0fd81d 5472 add texture_mask, texture_mask, texture_mask_height, lsl #11
75e28f62
E
/* block_width = (width + 7 + left_offset) / 8 blocks per row */
5473 mov block_width, width_rounded, lsr #3
5474
5475 mov right_mask_bits, right_mask_bits, lsl right_width
5476 movw fb_ptr_pitch, #(2048 + 16)
5477
/* fb_ptr_pitch = 2048 + 16 - block_width * 16: advance from the last block
 * of one row to the first block of the next. */
5478 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
/* block_masks (d0) = { left mask word, right mask word } */
5479 vmov block_masks, left_mask_bits, right_mask_bits
5480
5481 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5482 add block, psx_gpu, #psx_gpu_blocks_offset
5483
/* Align the texture offset down to a 16-byte (8-texel) boundary. */
6ea0f7bf 5484 bic texture_offset_base, texture_offset_base, #0xF
75e28f62
E
/* The flags from this cmp are consumed by "bne 0f" below. */
5485 cmp block_width, #1
5486
5487 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
/* Continue after the blocks that are already queued. */
5488 add block, block, num_blocks, lsl #6
5489
5490 bne 0f
5491
/* Single block per row: OR the left and right masks into one draw mask. */
5492 vext.32 block_masks_shifted, block_masks, block_masks, #1
5493 vorr.u32 block_masks, block_masks, block_masks_shifted
5494 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5495
5496 1:
5497 add num_blocks, num_blocks, #1
5498 cmp num_blocks, #MAX_BLOCKS
59d15d23 5499 blgt setup_sprites_16bpp_flush
75e28f62
E
5500
5501 and texture_block_ptr, texture_offset_base, texture_mask
/* Row counter; the flags are consumed by "bne 1b" at the end of the loop. */
5502 subs height, height, #1
5503
5504 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5505 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5506
5507 vst1.u32 { texels }, [ block, :128 ]
5508 add block, block, #40
5509
5510 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5511 pld [ fb_ptr ]
5512
5513 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5514
/* 40 + 24 = 64 bytes per block */
5515 add block, block, #24
5516 add texture_offset_base, texture_offset_base, #2048
5517 add fb_ptr, fb_ptr, #2048
5518 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5519 bne 1b
5520
5521 ldmia sp!, { r4 - r11, pc }
5522
/* Multi-block rows: left-masked block, unmasked interior blocks, then a
 * right-masked block. */
5523 0:
5524 add num_blocks, num_blocks, block_width
5525 mov texture_offset, texture_offset_base
5526
5527 cmp num_blocks, #MAX_BLOCKS
59d15d23 5528 blgt setup_sprites_16bpp_flush
75e28f62
E
5529
5530 add texture_offset_base, texture_offset_base, #2048
5531 and texture_block_ptr, texture_offset, texture_mask
5532
5533 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5534 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5535
5536 vst1.u32 { texels }, [ block, :128 ]
5537 add block, block, #40
5538
/* First block of the row uses the left mask byte. */
5539 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5540 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5541 pld [ fb_ptr ]
5542
5543 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
/* Interior block count; zero means go straight to the right-edge block.
 * The flags survive the add/vmov instructions up to "beq 2f". */
5544 subs blocks_remaining, block_width, #2
5545
5546 add texture_offset, texture_offset, #16
5547 add fb_ptr, fb_ptr, #16
5548
/* Interior blocks draw every pixel: mask of zero. */
5549 vmov.u8 draw_mask_fb_ptr, #0
5550
5551 add block, block, #24
5552 beq 2f
5553
5554 1:
5555 and texture_block_ptr, texture_offset, texture_mask
5556 subs blocks_remaining, blocks_remaining, #1
5557
5558 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5559 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5560
5561 vst1.u32 { texels }, [ block, :128 ]
5562 add block, block, #40
5563
5564 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5565 pld [ fb_ptr ]
5566
5567 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5568
5569 add texture_offset, texture_offset, #16
5570 add fb_ptr, fb_ptr, #16
5571
5572 add block, block, #24
5573 bne 1b
5574
5575 2:
5576 and texture_block_ptr, texture_offset, texture_mask
5577 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5578
5579 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
/* Byte 4 of block_masks is the low byte of the right mask word. */
5580 vdup.u8 draw_mask_fb_ptr, block_masks[4]
5581
5582 vst1.u32 { texels }, [ block, :128 ]
5583 add block, block, #40
5584
5585 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5586 vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5587
5588 add block, block, #24
5589 subs height, height, #1
5590
5591 add fb_ptr, fb_ptr, fb_ptr_pitch
5592 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5593
5594 bne 0b
5595
5596 ldmia sp!, { r4 - r11, pc }
5597
5598
59d15d23 5599// 4x version
5600// FIXME: duplicate code with normal version :(
5601#undef draw_mask_fb_ptr
5602
/*
 * setup_sprite_16bpp_4x
 * Same register and stack interface as setup_sprite_16bpp (r0 = psx_gpu,
 * r1 = x, r2 = y, r3 = u; v, width, height at [sp, #36/#40/#44] after the
 * push). Differences visible here: the masks are 16 bits wide, framebuffer
 * steps are doubled, block_width is multiplied by 4 before it is added to
 * num_blocks, and block output is delegated to the do_texture_block_16bpp_4x
 * macro, which is defined outside this section.
 * NOTE(review): the loops branch on flags set before that macro is expanded,
 * so the macro presumably leaves the flags untouched - confirm at its
 * definition.
 */
5603function(setup_sprite_16bpp_4x)
5604 stmdb sp!, { r4 - r11, r14 }
5605 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
5606
5607 ldr v, [ sp, #36 ]
5608 add fb_ptr, fb_ptr, y, lsl #11
5609
5610 ldr width, [ sp, #40 ]
5611 add fb_ptr, fb_ptr, x, lsl #1
5612
5613 ldr height, [ sp, #44 ]
5614 and left_offset, u, #0x7
5615
/* texture_offset_base = u * 2 + v * 2048 */
5616 add texture_offset_base, u, u
5617 add width_rounded, width, #7
5618
ed0fd81d 5619 add texture_offset_base, texture_offset_base, v, lsl #11
59d15d23 5620 movw left_mask_bits, #0xFFFF
5621
5622 ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
5623 add width_rounded, width_rounded, left_offset
5624
/* left_offset is doubled from here on: two mask bits per source texel. */
5625 lsl left_offset, #1
5626
5627 ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
/* Subtracts (u & 7) * 4 bytes, because left_offset was already doubled. */
5628 sub fb_ptr, fb_ptr, left_offset, lsl #1
5629
5630 add texture_mask, texture_mask_width, texture_mask_width
5631 movw right_mask_bits, #0xFFFC
5632
5633 and right_width, width_rounded, #0x7
5634 mvn left_mask_bits, left_mask_bits, lsl left_offset
5635
5636 lsl right_width, #1
5637
ed0fd81d 5638 add texture_mask, texture_mask, texture_mask_height, lsl #11
59d15d23 5639 mov block_width, width_rounded, lsr #3
5640
5641 mov right_mask_bits, right_mask_bits, lsl right_width
5642 movw fb_ptr_pitch, #(2048 + 16) * 2
5643
/* fb_ptr_pitch = (2048 + 16) * 2 - block_width * 32 */
5644 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
5645 vmov block_masks, left_mask_bits, right_mask_bits
5646
5647 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5648 add block, psx_gpu, #psx_gpu_blocks_offset
5649
5650 bic texture_offset_base, texture_offset_base, #0xF
/* This compares the unscaled block_width; "bne 0f" below uses these flags
 * because the intervening ldr/add/lsl do not set them. */
5651 cmp block_width, #1
5652
5653 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5654 add block, block, num_blocks, lsl #6
5655
/* From here on block_width counts output blocks: 4 per source block. */
5656 lsl block_width, #2
5657 bne 0f
5658
/* Single source block per row: merge the left and right masks. */
5659 vext.32 block_masks_shifted, block_masks, block_masks, #1
5660 vorr.u32 block_masks, block_masks, block_masks_shifted
5661 vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
5662 vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5663
5664 1:
5665 add num_blocks, num_blocks, block_width
5666 cmp num_blocks, #MAX_BLOCKS
5667 blgt setup_sprites_16bpp_flush
5668
5669 and texture_block_ptr, texture_offset_base, texture_mask
5670 subs height, height, #1
5671
5672 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5673 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5674
5675 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5676
5677 add texture_offset_base, texture_offset_base, #2048
5678 add fb_ptr, fb_ptr, #2048*2
5679 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5680 bne 1b
5681
5682 ldmia sp!, { r4 - r11, pc }
5683
/* Multi-block rows: left-masked block, unmasked interior, right-masked block. */
5684 0:
5685 add num_blocks, num_blocks, block_width
5686 mov texture_offset, texture_offset_base
5687
5688 vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
5689 vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5690
5691 cmp num_blocks, #MAX_BLOCKS
5692 blgt setup_sprites_16bpp_flush
5693
5694 add texture_offset_base, texture_offset_base, #2048
5695 and texture_block_ptr, texture_offset, texture_mask
5696
5697 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5698 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5699
5700 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5701
/* Interior count in output blocks (block_width is already multiplied by 4). */
5702 subs blocks_remaining, block_width, #2*4
5703 add texture_offset, texture_offset, #16
5704
5705 vmov.u8 draw_mask_fb_ptr_a, #0
5706 vmov.u8 draw_mask_fb_ptr_b, #0
5707
5708 add fb_ptr, fb_ptr, #16*2
5709 beq 2f
5710
5711 1:
5712 and texture_block_ptr, texture_offset, texture_mask
5713 subs blocks_remaining, blocks_remaining, #4
5714
5715 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5716 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5717
5718 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5719 add texture_offset, texture_offset, #16
5720
5721 add fb_ptr, fb_ptr, #16*2
5722 bgt 1b
5723
5724 2:
5725 vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
5726 vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
5727
5728 and texture_block_ptr, texture_offset, texture_mask
5729 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5730
5731 vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5732
5733 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5734 subs height, height, #1
5735
5736 add fb_ptr, fb_ptr, fb_ptr_pitch
5737 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5738
5739 bne 0b
5740
5741 ldmia sp!, { r4 - r11, pc }
5742
5743
/* Register aliases for setup_sprite_untextured. width is r3 here, and
 * right_width / block share r5, right_mask_bits / blocks_remaining share r6,
 * color / color_b share r8. */
f0931e56 5744#undef width
5745#undef right_width
5746#undef right_mask_bits
5747#undef color
5748#undef height
5749#undef blocks_remaining
5750#undef colors
5751#undef right_mask
5752#undef test_mask
5753#undef draw_mask
5754
5755#define psx_gpu r0
5756#define x r1
5757#define y r2
5758#define width r3
5759#define right_width r5
5760#define right_mask_bits r6
5761#define fb_ptr r7
5762#define color r8
5763#define height r9
5764#define fb_ptr_pitch r12
5765
/* These three must stay in r4, r5 and r11 because the shared flush helper
 * writes num_blocks and block and reads block_width. */
5766// referenced by setup_sprites_16bpp_flush
5767#define num_blocks r4
5768#define block r5
5769#define block_width r11
5770
5771#define color_r r1
5772#define color_g r2
5773#define color_b r8
5774#define blocks_remaining r6
5775
/* NEON aliases; test_mask and draw_mask are both q2. */
5776#define colors q0
5777#define right_mask q1
5778#define test_mask q2
5779#define draw_mask q2
5780#define draw_mask_bits_fb_ptr d6
5781
5782
/* 8-byte alignment for the code that follows. */
5783.align 3
5784
/*
 * setup_sprite_untextured
 * In:  r0 = psx_gpu, r1 = x, r2 = y; width, height and color are read from
 *      the stack at [sp, #40], [sp, #44], [sp, #48] after the 9-register
 *      push.
 * Tail-branches to setup_sprite_untextured_simple when none of
 * RENDER_STATE_MASK_EVALUATE / RENDER_FLAGS_MODULATE_TEXELS /
 * RENDER_FLAGS_BLEND is set in the render-state halfword and
 * RENDER_INTERLACE_ENABLED is clear in the render-mode byte.
 * Otherwise emits one 64-byte block per 8 pixels of each row: a 16-byte
 * draw mask at offset 0, eight copies of the 15-bit colour at offset 16 and
 * an 8-byte { draw_mask_bits, fb_ptr } pair at offset 40.
 */
5785function(setup_sprite_untextured)
5786 ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ]
5787 tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \
5788 | RENDER_FLAGS_BLEND)
/* The second test only runs when the first one found no flag set. */
ed0fd81d 5789 ldrbeq r12, [ psx_gpu, #psx_gpu_render_mode_offset ]
d5c08ed3 5790 tsteq r12, #RENDER_INTERLACE_ENABLED
f0931e56 5791 beq setup_sprite_untextured_simple
5792
5793 stmdb sp!, { r4 - r11, r14 }
5794
5795 ldr width, [ sp, #40 ]
5796 ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
5797
5798 ldr height, [ sp, #44 ]
/* fb_ptr = vram_out_ptr + y * 2048 + x * 2 */
5799 add fb_ptr, fb_ptr, y, lsl #11
5800
5801 add fb_ptr, fb_ptr, x, lsl #1
/* right_width = ((width - 1) & 7) + 1: pixels used in the last block (1..8) */
5802 sub right_width, width, #1
5803
5804 ldr color, [ sp, #48 ]
5805 and right_width, #7
5806
/* block_width = (width + 7) / 8 */
5807 add block_width, width, #7
5808 add right_width, #1
5809
5810 lsr block_width, #3
5811 mov right_mask_bits, #0xff
5812
5813 sub fb_ptr_pitch, block_width, #1
5814 lsl right_mask_bits, right_width
5815
/* fb_ptr_pitch = 2048 - (block_width - 1) * 16: from the last block of a
 * row to the first block of the next row. */
5816 lsl fb_ptr_pitch, #3+1
/* Take the top 5 bits of each 8-bit channel (bits 3-7, 11-15, 19-23). */
5817 ubfx color_r, color, #3, #5
5818
5819 rsb fb_ptr_pitch, #1024*2
5820 ubfx color_g, color, #11, #5
5821
/* test_mask is loaded from the first 16 bytes of the psx_gpu structure. */
5822 vld1.u32 { test_mask }, [ psx_gpu, :128 ]
5823 ubfx color_b, color, #19, #5
5824
5825 vdup.u16 right_mask, right_mask_bits
/* color = r | g << 5 | b << 10 */
5826 orr color, color_r, color_b, lsl #10
5827
5828 ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5829 orr color, color, color_g, lsl #5
5830
/* Expand the right-edge bit mask into a per-lane 16-bit mask. */
5831 vtst.u16 right_mask, right_mask, test_mask
5832 add block, psx_gpu, #psx_gpu_blocks_offset
5833
5834 vdup.u16 colors, color
/* block aliases right_width (r5); right_width is no longer needed here. */
5835 add block, block, num_blocks, lsl #6
5836
5837
5838setup_sprite_untextured_height_loop:
5839 add num_blocks, block_width
5840 sub blocks_remaining, block_width, #1
5841
5842 cmp num_blocks, #MAX_BLOCKS
5843 blgt setup_sprites_16bpp_flush
5844
5845 cmp blocks_remaining, #0
5846 ble 1f
5847
/* NOTE(review): when block_width is 1 this zeroing is skipped on every row,
 * so the low word of draw_mask_bits_fb_ptr stored at label 1 holds whatever
 * d6 contained on entry - confirm that consumers ignore that field. */
5848 vmov.u8 draw_mask, #0 /* zero_mask */
5849 vmov.u8 draw_mask_bits_fb_ptr, #0
5850
/* Interior blocks: zero draw mask. */
5851 0:
5852 vst1.u32 { draw_mask }, [ block, :128 ]!
5853 subs blocks_remaining, #1
5854
5855 vst1.u32 { colors }, [ block, :128 ]
5856 add block, block, #24
5857
5858 vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5859 vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
5860
5861 add block, block, #24
5862 add fb_ptr, #8*2
5863 bgt 0b
5864
/* Last block of the row uses the right-edge mask. */
5865 1:
5866 vst1.u32 { right_mask }, [ block, :128 ]!
/* Row counter; the flags are consumed by the bgt at the end of the loop. */
5867 subs height, #1
5868
5869 vst1.u32 { colors }, [ block, :128 ]
5870 add block, block, #24
5871
5872 vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5873 vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
5874
5875 add block, block, #24
5876 add fb_ptr, fb_ptr_pitch
5877
5878 strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5879 bgt setup_sprite_untextured_height_loop
5880
5881 ldmia sp!, { r4 - r11, pc }
5882
5883
5884
75e28f62
E
/* Register aliases for update_texture_4bpp_cache.
 * texel_block_expanded_ab shares q2 with _b and texel_block_expanded_cd
 * shares q3 with _c; texel_block_expanded_d is q4. */
5885#undef texture_page_ptr
5886#undef vram_ptr
5887#undef dirty_textures_mask
5888#undef current_texture_mask
5889
5890#define psx_gpu r0
5891#define current_texture_page r1
5892#define texture_page_ptr r2
5893#define vram_ptr_a r3
5894#define current_texture_page_x r12
5895#define current_texture_page_y r4
5896#define dirty_textures_mask r5
5897#define tile_y r6
5898#define tile_x r7
5899#define sub_y r8
5900#define current_texture_mask r9
5901#define c_4096 r10
5902#define vram_ptr_b r11
5903
5904#define texel_block_a d0
5905#define texel_block_b d1
5906#define texel_block_expanded_a q1
5907#define texel_block_expanded_b q2
5908#define texel_block_expanded_ab q2
5909#define texel_block_expanded_c q3
5910#define texel_block_expanded_d q4
5911#define texel_block_expanded_cd q3
5912
/*
 * update_texture_4bpp_cache
 * In:  r0 = psx_gpu
 * Clears the current_texture_mask bits from the dirty 4bpp texture mask and
 * rebuilds the expanded 4bpp texture page at texture_page_base from VRAM.
 * The source starts at vram_ptr + page_y * 2^19 + page_x * 128, where
 * page_x = current_texture_page & 0xF and page_y = current_texture_page >> 4.
 * The page is walked as 16 tile rows of 16 tiles; each tile takes 8 inner
 * iterations that read 8 bytes from each of two VRAM rows (2048 bytes
 * apart), split every byte into its low and high nibble (one byte each, low
 * nibble first) and append the resulting 32 bytes to the destination.
 * Preserves r4-r11 and q0-q4; clobbers r1-r3, r12 and flags.
 */
function(update_texture_4bpp_cache)
  stmdb sp!, { r4 - r11, r14 }
  /* texel_block_expanded_d lives in q4 (d8-d9), which AAPCS requires a
   * callee to preserve, so the save range has to extend through q4. */
  vpush { q0 - q4 }

  ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]

  ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
  ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]

  and current_texture_page_x, current_texture_page, #0xF
  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]

  mov current_texture_page_y, current_texture_page, lsr #4
  ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]

  add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
  mov tile_y, #16

  add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7
  /* This page is being refreshed, so drop its bits from the dirty mask. */
  bic dirty_textures_mask, current_texture_mask

  mov tile_x, #16
  str dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]

  mov sub_y, #8
  movw c_4096, #4096

  /* vram_ptr_b tracks the VRAM row directly below vram_ptr_a. */
  add vram_ptr_b, vram_ptr_a, #2048

0:
  /* Each pointer advances two VRAM rows (4096 bytes) per iteration. */
  vld1.u32 { texel_block_a }, [ vram_ptr_a, :64 ], c_4096
  vld1.u32 { texel_block_b }, [ vram_ptr_b, :64 ], c_4096

  /* Byte 0xXY becomes halfword 0x00XY (a, c) and 0x0XY0 (b, d). */
  vmovl.u8 texel_block_expanded_a, texel_block_a
  vshll.u8 texel_block_expanded_b, texel_block_a, #4
  vmovl.u8 texel_block_expanded_c, texel_block_b
  vshll.u8 texel_block_expanded_d, texel_block_b, #4

  /* Clear bits 4-7, leaving 0x000Y and 0x0X00. */
  vbic.u16 texel_block_expanded_a, #0x00F0
  vbic.u16 texel_block_expanded_b, #0x00F0
  vbic.u16 texel_block_expanded_c, #0x00F0
  vbic.u16 texel_block_expanded_d, #0x00F0

  /* 0x0X0Y: low nibble in the low byte, high nibble in the high byte. */
  vorr.u16 texel_block_expanded_ab, texel_block_expanded_a, \
   texel_block_expanded_b
  vorr.u16 texel_block_expanded_cd, texel_block_expanded_c, \
   texel_block_expanded_d

  vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd }, \
   [ texture_page_ptr, :256 ]!

  subs sub_y, sub_y, #1
  bne 0b

  /* Next tile in this tile row: 8 bytes right, back up 16 VRAM rows. */
  mov sub_y, #8
  add vram_ptr_a, vram_ptr_a, #8
  add vram_ptr_b, vram_ptr_b, #8

  sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
  sub vram_ptr_b, vram_ptr_b, #(16 * 2048)

  subs tile_x, tile_x, #1
  bne 0b

  /* Next tile row: 16 VRAM rows down, back to the left edge of the page. */
  mov tile_x, #16
  add vram_ptr_a, vram_ptr_a, #(16 * 2048)
  add vram_ptr_b, vram_ptr_b, #(16 * 2048)

  sub vram_ptr_a, vram_ptr_a, #(8 * 16)
  sub vram_ptr_b, vram_ptr_b, #(8 * 16)

  subs tile_y, tile_y, #1
  bne 0b

  vpop { q0 - q4 }
  ldmia sp!, { r4 - r11, pc }
5989
5990
/* Register aliases for update_texture_8bpp_cache_slice.
 * texture_page (r1) is an incoming argument; current_texture_page moves to
 * r5 for this function. */
5991#undef current_texture_page
5992
5993#define psx_gpu r0
5994#define texture_page r1
5995#define texture_page_ptr r2
5996#define vram_ptr_a r3
5997#define texture_page_x r12
5998#define texture_page_y r4
5999#define current_texture_page r5
6000#define tile_y r6
6001#define tile_x r7
6002#define sub_y r8
6003#define c_4096 r10
6004#define vram_ptr_b r11
6005
6006
6007#undef texels_a
6008#undef texels_b
6009
6010#define texels_a q0
6011#define texels_b q1
6012#define texels_c q2
6013#define texels_d q3
6014
6015
/*
 * update_texture_8bpp_cache_slice
 * In:  r0 = psx_gpu, r1 = texture_page
 * Copies one slice of VRAM into the texture page buffer at texture_page_base.
 * The source starts at vram_ptr + page_y * 2^19 + page_x * 128, where
 * page_x = texture_page & 0xF and page_y = texture_page >> 4.
 * The slice is walked as 16 tile rows of 8 tiles; each tile takes 4 inner
 * iterations that copy 16 bytes from each of four consecutive VRAM rows.
 * The destination starts (8 * 16 * 16) bytes into the buffer when bit 0 of
 * texture_page differs from bit 0 of the current texture page, and skips a
 * further (8 * 16 * 16) bytes after every tile row.
 * Preserves r4-r11 and q0-q3; clobbers r1-r3, r12 and flags.
 */
6016function(update_texture_8bpp_cache_slice)
6017 stmdb sp!, { r4 - r11, r14 }
6018 vpush { q0 - q3 }
6019
6020 ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
6021 ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
6022
3867c6ef 6023 ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
75e28f62
E
6024 mov tile_y, #16
6025
6026 and texture_page_x, texture_page, #0xF
6027 mov texture_page_y, texture_page, lsr #4
6028
6029 add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7
6030 mov tile_x, #8
6031
6032 add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
/* Bit 0 of (current page XOR requested page) selects the destination half. */
6033 eor current_texture_page, current_texture_page, texture_page
6034
6035 ands current_texture_page, current_texture_page, #0x1
6036 mov sub_y, #4
6037
6038 addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6039 movw c_4096, #4096
6040
/* vram_ptr_b tracks the VRAM row directly below vram_ptr_a. */
6041 add vram_ptr_b, vram_ptr_a, #2048
6042
6043 0:
/* Rows n, n+1, n+2 and n+3: each pointer advances 4096 bytes per load. */
6044 vld1.u32 { texels_a }, [ vram_ptr_a, :128 ], c_4096
6045 vld1.u32 { texels_b }, [ vram_ptr_b, :128 ], c_4096
6046 vld1.u32 { texels_c }, [ vram_ptr_a, :128 ], c_4096
6047 vld1.u32 { texels_d }, [ vram_ptr_b, :128 ], c_4096
6048
6049 vst1.u32 { texels_a, texels_b }, [ texture_page_ptr, :256 ]!
6050 vst1.u32 { texels_c, texels_d }, [ texture_page_ptr, :256 ]!
6051
6052 subs sub_y, sub_y, #1
6053 bne 0b
6054
6055 mov sub_y, #4
6056
/* Next tile in this tile row: 16 bytes right, back up 16 VRAM rows. */
6057 add vram_ptr_a, vram_ptr_a, #16
6058 add vram_ptr_b, vram_ptr_b, #16
6059
6060 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6061 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6062
6063 subs tile_x, tile_x, #1
6064 bne 0b
6065
6066 mov tile_x, #8
6067
/* Next tile row: 16 VRAM rows down, back to the left edge of the slice. */
6068 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6069 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6070
6071 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6072 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6073
/* The add does not touch the flags, so bne still tests the tile_y count. */
6074 subs tile_y, tile_y, #1
6075 add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6076
6077 bne 0b
6078
6079 vpop { q0 - q3 }
6080 ldmia sp!, { r4 - r11, pc }
6081
50f9355a 6082
/*
 * void scale2x_tiles8(void *dst, const void *src, int w8, int h)
 *
 * In:  r0 = dst, r1 = src, r2 = w8 (number of 8-pixel tiles per row),
 *      r3 = h (number of source rows)
 * Doubles 16-bit pixels horizontally and vertically: every source tile of
 * 8 pixels becomes 16 pixels, written to two consecutive destination rows
 * (2048 bytes apart). Source rows are 2048 bytes apart, destination row
 * pairs 4096 bytes apart.
 * Two tiles are loaded per inner iteration; when w8 is odd the second tile
 * of the last iteration is loaded but not stored.
 * Clobbers r0-r3, r12, q0-q3 and flags.
 */
function(scale2x_tiles8)
  push { r4, lr }

  add r12, r0, #2048          /* r12 = second destination row */
  mov lr, r2                  /* lr  = tiles left in this row */
  mov r4, r1                  /* r4  = start of current source row */

2:
  vld1.u16 { q0 }, [ r1, :128 ]!
  vld1.u16 { q2 }, [ r1, :128 ]!
  vmov q1, q0
  vzip.16 q0, q1              /* q0:q1 = each pixel of tile 0 twice */
  vmov q3, q2
  vzip.16 q2, q3              /* q2:q3 = each pixel of tile 1 twice */
  subs lr, lr, #2
  vst1.u16 { q0, q1 }, [ r0, :128 ]!
  vst1.u16 { q0, q1 }, [ r12, :128 ]!
  blt 3f                      /* odd tile count: drop the second tile */
  vst1.u16 { q2, q3 }, [ r0, :128 ]!
  vst1.u16 { q2, q3 }, [ r12, :128 ]!
  bgt 2b

3:
  add r4, r4, #2048           /* next source row */
  mov r1, r4
  add r0, r0, #4096           /* skip both destination rows just written ... */
  sub r0, r0, r2, lsl #5      /* ... and rewind the w8 * 32 bytes stored */
  add r12, r0, #2048
  mov lr, r2
  subs r3, r3, #1
  bgt 2b
  nop

  pop { r4, pc }
59d15d23 6117
6118// vim:filetype=armasm