gpu_neon: fix apparent missing msb setting in blend_blocks_textured_add_fourth
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
CommitLineData
75e28f62
E
1/*
2 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
59d15d23 3 * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
75e28f62
E
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of
8 * the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */
15
16#define MAX_SPANS 512
17#define MAX_BLOCKS 64
18#define MAX_BLOCKS_PER_ROW 128
19
f0931e56 20#define RENDER_STATE_MASK_EVALUATE 0x20
21#define RENDER_FLAGS_MODULATE_TEXELS 0x1
22#define RENDER_FLAGS_BLEND 0x2
d5c08ed3 23#define RENDER_INTERLACE_ENABLED 0x1
f0931e56 24
cb88320b 25#include "psx_gpu_offsets.h"
75e28f62 26
cb88320b 27#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
75e28f62 28
75e28f62
E
29#define edge_data_left_x_offset 0
30#define edge_data_num_blocks_offset 2
31#define edge_data_right_mask_offset 4
32#define edge_data_y_offset 6
33
ed0fd81d 34.syntax unified
35.text
75e28f62
E
36
37#define psx_gpu r0
38#define v_a r1
39#define v_b r2
40#define v_c r3
41
42#define x0 r4
43#define x1 r5
44#define x2 r6
45#define x0_x1 r5
46#define x1_x2 r6
47#define y0 r7
48#define y1 r8
49#define y2 r9
50#define y0_y1 r7
51#define y1_y2 r8
52#define b0 r9
53#define b1 r10
54#define b2 r11
55#define b0_b1 r10
56#define b1_b2 r11
57
58
59#define area_r_s r5
60
61#define g_bx0 r2
62#define g_bx r3
63#define g_bx2 r4
64#define g_bx3 r5
65#define b_base r6
66#define g_by r8
67
68#define gs_bx r7
69#define gs_by r10
70
71#define ga_bx g_bx
72#define ga_by g_by
73
74#define gw_bx_h g_bx
75#define gw_by_h g_by
76
77#define gw_bx_l r11
78#define gw_by_l gw_bx_l
79
80#define store_a r0
81#define store_b r1
82#define store_inc r5
83
84
85#define v0 q0
86#define uvrgb0 d0
87#define x0_y0 d1
88
89#define v1 q1
90#define uvrgb1 d2
91#define x1_y1 d3
92
93#define v2 q2
94#define uvrgb2 d4
95#define x2_y2 d5
96
97#define x0_ab q3
98#define uvrg_xxxx0 q3
99#define uvrg0 d6
100#define xxxx0 d7
101
102#define x1_ab q4
103#define uvrg_xxxx1 q4
104#define uvrg1 d8
105#define xxxx1 d9
106
107#define x2_ab q5
108#define uvrg_xxxx2 q5
109#define uvrg2 d10
110#define xxxx2 d11
111
112#define y0_ab q6
113#define yyyy_uvrg0 q6
114#define yyyy0 d12
115#define uvrg0b d13
116
117#define y1_ab q7
118#define yyyy_uvrg1 q7
119#define yyyy1 d14
120#define uvrg1b d15
121
122#define y2_ab q8
123#define yyyy_uvrg2 q8
124#define yyyy2 d16
125#define uvrg2b d17
126
127#define d0_ab q9
128#define d0_a d18
129#define d0_b d19
130
131#define d1_ab q10
132#define d1_a d20
133#define d1_b d21
134
135#define d2_ab q11
136#define d2_a d22
137#define d2_b d23
138
139#define d3_ab q12
140#define d3_a d24
141#define d3_b d25
142
143#define ga_uvrg_x q1
144#define ga_uvrg_y q4
145
146#define dx x0_x1
147#define dy y0_y1
148#define db b0_b1
149
150#define uvrg_base q11
151
152#define gs_uvrg_x q5
153#define gs_uvrg_y q6
154
155#define g_uvrg_x q1
156#define ga_uv_x d2
157#define g_uv_x d2
158#define ga_rg_x d3
159#define g_rg_x d3
160
161#define g_uvrg_y q4
162#define ga_uv_y d8
163#define g_uv_y d8
164#define ga_rg_y d9
165#define g_rg_y d9
166
167#define gw_uv_x q1
168#define gw_rg_x q2
169#define gw_uv_y q4
170#define gw_rg_y q3
171
172#define w_mask q9
173#define w_mask_l d18
174
175#define r_shift q10
176
177#define uvrg_dx0 q0
178#define uvrg_dx0l d0
179#define uvrg_dx0h d1
180
181#define uvrg_dx1 q1
182#define uvrg_dx1l d2
183#define uvrg_dx1h d3
184
185#define uvrg_dx2 q2
186#define uvrg_dx2l d4
187#define uvrg_dx2h d5
188
189#define uvrg_dx3 q3
190#define uvrg_dx3l d6
191#define uvrg_dx3h d7
192
c6063f89 193#define uvrgb_phase q13
75e28f62
E
194
195.align 4
196
0e4ad319 197#include "arm_features.h"
8184d7c5 198
// Emits the entry label of an assembly routine. FUNCTION() is not defined in
// this file; presumably it is provided by arm_features.h - TODO confirm.
0e4ad319 199#define function(name) FUNCTION(name):
200
// Jump table helpers, selected by TEXRELS_FORBIDDEN:
//  - not defined: JT_OP_REL() expands to nothing, JT_OP() passes its arguments
//    through unchanged and JTE() yields the target itself.
//  - defined: JT_OP_REL() takes the address of table_label, loads the word at
//    index index_reg (scaled by 4) into temp and adds it to pc; JT_OP()
//    expands to nothing and JTE() yields the difference (target - start).
//    NOTE(review): the start label passed to JTE() has to match the pc value
//    read by the add instruction - verify at the use sites.
201#ifndef TEXRELS_FORBIDDEN
75e28f62 202
8184d7c5 203#define JT_OP_REL(table_label, index_reg, temp)
204#define JT_OP(x...) x
205#define JTE(start, target) target
206
207#else
208
8184d7c5 209#define JT_OP_REL(table_label, index_reg, temp) \
210 adr temp, table_label; \
e1f6de8f 211 ldr temp, [temp, index_reg, lsl #2]; \
8184d7c5 212 add pc, pc, temp \
213
214#define JT_OP(x...)
215#define JTE(start, target) (target - start)
216
0e4ad319 217#endif
4d646738 218
// On Mach-O targets these three symbols are referenced with a leading
// underscore.
0e4ad319 219#ifdef __MACH__
8184d7c5 220#define flush_render_block_buffer _flush_render_block_buffer
221#define setup_sprite_untextured_simple _setup_sprite_untextured_simple
222#define update_texture_8bpp_cache _update_texture_8bpp_cache
8184d7c5 223#endif
224
75e28f62
E
// compute_all_gradients(psx_gpu, v_a, v_b, v_c)
//
// Computes the x and y gradients of the u, v, r, g channels (on NEON) and of
// the b channel (on the integer core) over the triangle v_a / v_b / v_c, using
// the reciprocal of psx_gpu->triangle_area, and stores the results into
// psx_gpu.
//
// Data written, as byte offsets from psx_gpu + psx_gpu_uvrg_offset:
//   +0    uvrg_base (uvrg0 << 16, plus uvrgb_phase, minus g_uvrg_x * x0)
//   +16   uvrg_dx1 (the same register as g_uvrg_x)
//   +32   g_uvrg_y
//   +48   vst4 of the low halves of uvrg_dx0 .. uvrg_dx3 (0, 1, 2, 3 times dx)
//   +80   vst4 of the high halves of uvrg_dx0 .. uvrg_dx3
//   +112  six words: g_bx0 (0), g_bx, g_bx2, g_bx3, b_base, g_by
//
// r4 - r11 and lr are saved on entry and restored on exit; r0 - r3, r12,
// q0 - q13, d30 and d31 are overwritten.
// NOTE(review): q4 - q7 (d8 - d15) are written here without being saved, while
// the AAPCS lists d8 - d15 as callee-saved - confirm that callers tolerate it.
225@ r0: psx_gpu
226@ r1: v_a
227@ r2: v_b
228@ r3: v_c
229
230function(compute_all_gradients)
231 // First compute the triangle area reciprocal and shift. The division will
232 // happen concurrently with much of the work which follows.
233 @ r12 = psx_gpu->triangle_area
e1f6de8f 234 ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
75e28f62
E
235 stmdb sp!, { r4 - r11, lr }
236
237 @ load exponent of 62 into upper half of double
238 movw r4, #0
239 clz r14, r12 @ r14 = shift
240
241 movt r4, #((62 + 1023) << 4)
242 mov r12, r12, lsl r14 @ r12 = triangle_area_normalized
243
244 @ load area normalized into lower half of double
245 mov r5, r12, lsr #10
246 vmov.f64 d30, r5, r4 @ d30 = (1 << 62) + ta_n
247
248 movt r4, #((1022 + 31) << 4)
249 mov r5, r12, lsl #20
250
251 add r4, r4, r12, lsr #11
252 vmov.f64 d31, r5, r4
253
254 vdiv.f64 d30, d30, d31 @ d30 = ((1 << 62) + ta_n) / ta_n
255
256 // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
257 // ( d0 * d1 ) - ( d2 * d3 ) =
258 // ( m0 ) - ( m1 ) = gradient
259
260 // This is split to do 12 elements at a time over three sets: a, b, and c.
261 // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
262 // two of the slots are unused.
263
264 // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
265 // is g.
266
267 // First type is: uvrg bxxx xxxx
268 // Second type is: yyyy ybyy uvrg
269 // Since x_a and y_c are the same, the same variable is used for both.
270
e1f6de8f 271 vld1.u32 { v0 }, [v_a, :128] @ v0 = { uvrg0, b0, x0, y0 }
272 ldrsh x0, [v_a, #8] @ load x0
75e28f62 273
e1f6de8f 274 vld1.u32 { v1 }, [v_b, :128] @ v1 = { uvrg1, b1, x1, y1}
275 ldrh x1, [v_b, #8] @ load x1
75e28f62 276
e1f6de8f 277 vld1.u32 { v2 }, [v_c, :128] @ v2 = { uvrg2, b2, x2, y2 }
278 ldrh x2, [v_c, #8] @ load x2
75e28f62
E
279
280 vmovl.u8 uvrg_xxxx0, uvrgb0 @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
e1f6de8f 281 ldrh y0, [v_a, #10] @ load y0
75e28f62
E
282
283 vmovl.u8 uvrg_xxxx1, uvrgb1 @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
e1f6de8f 284 ldrh y1, [v_b, #10] @ load y1
75e28f62
E
285
286 vmovl.u8 uvrg_xxxx2, uvrgb2 @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
e1f6de8f 287 ldrh y2, [v_c, #10] @ load y2
75e28f62
E
288
289 vmov.u8 uvrg0b, uvrg0 @ uvrg0b = { uv0, rg0 }
290 vdup.u16 xxxx0, x0_y0[0] @ xxxx0 = { xx0, xx0 }
291
292 orr x1_x2, x1, x2, lsl #16 @ x1_x2 = { x1, x2 }
293 pkhbt x0_x1, x0, x1, lsl #16 @ x0_x1 = { x0, x1 }
294
295 vmov.u8 uvrg1b, uvrg1 @ uvrg1b = { uv1, rg1 }
296 vdup.u16 xxxx1, x1_y1[0] @ xxxx1 = { xx1, xx1 }
297
298 vmov.u8 uvrg2b, uvrg2 @ uvrg2b = { uv2, rg2 }
299 vdup.u16 xxxx2, x2_y2[0] @ xxxx2 = { xx2, xx2 }
300
e1f6de8f 301 ldrb b2, [v_c, #4] @ load b2
75e28f62
E
302 orr y0_y1, y0, y1, lsl #16 @ y0_y1 = { y0, y1 }
303
e1f6de8f 304 ldrb b1, [v_b, #4] @ load b1
75e28f62
E
305 orr y1_y2, y1, y2, lsl #16 @ y1_y2 = { y1, y2 }
306
307 vdup.u16 yyyy0, x0_y0[1] @ yyyy0 = { yy0, yy0 }
308 vsub.s16 d0_ab, x1_ab, x0_ab
309
e1f6de8f 310 ldrb b0, [v_a, #4] @ load b0
75e28f62
E
311 orr b1_b2, b1, b2, lsl #16 @ b1_b2 = { b1, b2 }
312
313 vdup.u16 yyyy1, x1_y1[1] @ yyyy1 = { yy1, yy1 }
314 vsub.s16 d2_ab, x2_ab, x1_ab
315
316 vdup.u16 yyyy2, x2_y2[1] @ yyyy2 = { yy2, yy2 }
317 vsub.s16 d1_ab, y2_ab, y1_ab
318
319 orr b0_b1, b0, b1, lsl #16 @ b0_b1 = { b0, b1 }
320 ssub16 dx, x1_x2, x0_x1 @ dx = { x1 - x0, x2 - x1 }
321
322 ssub16 dy, y1_y2, y0_y1 @ dy = { y1 - y0, y2 - y1 }
323 ssub16 db, b1_b2, b0_b1 @ db = { b1 - b0, b2 - b1 }
324
325 vsub.s16 d3_ab, y1_ab, y0_ab
326 smusdx ga_by, dx, db @ ga_by = ((x1 - x0) * (b2 - b1)) -
327 @ ((x2 - x1) * (b1 - b0))
328 vmull.s16 ga_uvrg_x, d0_a, d1_a
329 smusdx ga_bx, db, dy @ ga_bx = ((b1 - b0) * (y2 - y1)) -
330 @ ((b2 - b1) * (y1 - y0))
331 vmlsl.s16 ga_uvrg_x, d2_a, d3_a
332 movs gs_bx, ga_bx, asr #31
333
334 vmull.s16 ga_uvrg_y, d0_b, d1_b
335 rsbmi ga_bx, ga_bx, #0
336
c6063f89 337 @ r12 = psx_gpu->uvrgb_phase
e1f6de8f 338 ldr r12, [psx_gpu, #psx_gpu_uvrgb_phase_offset]
c6063f89 339
75e28f62
E
340 vmlsl.s16 ga_uvrg_y, d2_b, d3_b
341 movs gs_by, ga_by, asr #31
342
343 vshr.u64 d0, d30, #22
c6063f89 344 add b_base, r12, b0, lsl #16
345
346 vdup.u32 uvrgb_phase, r12
75e28f62
E
347
348 rsbmi ga_by, ga_by, #0
349 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0
350
351 @ r12 = psx_gpu->triangle_winding
e1f6de8f 352 ldrb r12, [psx_gpu, #psx_gpu_triangle_winding_offset]
75e28f62
E
353 vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0
354
75e28f62
E
355 rsb r12, r12, #0 @ r12 = -(triangle->winding)
356
357 vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w }
358 sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS)
359
360 vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16
361 vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift }
362
c6063f89 363 vadd.u32 uvrg_base, uvrgb_phase
75e28f62
E
364 vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x)
365
366 vmov area_r_s, s0 @ area_r_s = triangle_reciprocal
367 vabs.s32 ga_uvrg_y, ga_uvrg_y @ ga_uvrg_y = abs(ga_uvrg_y)
368
369 vmull.u32 gw_rg_x, ga_rg_x, d0[0]
370 vmull.u32 gw_uv_x, ga_uv_x, d0[0]
371 vmull.u32 gw_rg_y, ga_rg_y, d0[0]
372 vmull.u32 gw_uv_y, ga_uv_y, d0[0]
373
374 vshl.u64 gw_rg_x, gw_rg_x, r_shift
375 vshl.u64 gw_uv_x, gw_uv_x, r_shift
376 vshl.u64 gw_rg_y, gw_rg_y, r_shift
377 vshl.u64 gw_uv_y, gw_uv_y, r_shift
378
 @ The sign masks are combined with the winding mask; the eor / sub pairs
 @ below then negate the narrowed magnitudes wherever the mask is all ones.
379 veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
380 vmovn.u64 g_uv_x, gw_uv_x
381
382 veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
383 vmovn.u64 g_rg_x, gw_rg_x
384
385 veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
386 vmovn.u64 g_uv_y, gw_uv_y
387
388 vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
389 vmovn.u64 g_rg_y, gw_rg_y
390
391 veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
392 mov ga_bx, ga_bx, lsl #13
393
394 vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
395 mov ga_by, ga_by, lsl #13
396
397 vdup.u32 x0_y0, x0
398 umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
399
400 vshl.u32 g_uvrg_x, g_uvrg_x, #4
401 vshl.u32 g_uvrg_y, g_uvrg_y, #4
402
403 umull gw_by_l, gw_by_h, ga_by, area_r_s
404 vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
405
406 eor gs_bx, gs_bx, r12
407 vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
408
409 veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
410 eor gs_by, gs_by, r12
411
 @ store_a reuses r0 and store_b reuses r1, so psx_gpu and v_a are no longer
 @ available from here on.
412 rsb r11, r14, #0 @ r11 = negative shift for scalar lsr
413 add store_a, psx_gpu, #psx_gpu_uvrg_offset
414
415 sub r11, r11, #(32 - 13)
416
417 add store_b, store_a, #16
418 mov store_inc, #32
419
420 vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
e1f6de8f 421 vst1.u32 { uvrg_base }, [store_a, :128], store_inc
75e28f62 422
e1f6de8f 423 vst1.u32 { uvrg_dx1 }, [store_b, :128], store_inc
75e28f62
E
424 mov g_bx, gw_bx_h, lsr r11
425
e1f6de8f 426 vst1.u32 { g_uvrg_y }, [store_a, :128], store_inc
75e28f62
E
427 mov g_by, gw_by_h, lsr r11
428
429 vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l }, \
e1f6de8f 430 [store_b, :128], store_inc
75e28f62
E
431 eor g_bx, g_bx, gs_bx
432
433 vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h }, \
e1f6de8f 434 [store_b, :128], store_inc
75e28f62
E
435 sub g_bx, g_bx, gs_bx
436
437 lsl g_bx, g_bx, #4
438 eor g_by, g_by, gs_by
439
440 mls b_base, g_bx, x0, b_base
441 sub g_by, g_by, gs_by
442
443 lsl g_by, g_by, #4
444 mov g_bx0, #0
445
446 add g_bx2, g_bx, g_bx
447 add g_bx3, g_bx, g_bx2
448
449 stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
450
451 ldmia sp!, { r4 - r11, pc }
452
453
454#define psx_gpu r0
455#define v_a r1
456#define v_b r2
457#define v_c r3
458
459#define temp r14
460
461#define x_a r4
462#define x_b r5
463#define x_c r6
464#define y_a r1
465#define y_b r2
466#define y_c r3
467
468#define height_minor_a r7
469#define height_minor_b r8
470#define height_major r9
471#define height r9
472
473#define reciprocal_table_ptr r10
474
475#define edge_alt_low r4
476#define edge_alt_high r5
477#define edge_dx_dy_alt r6
478#define edge_shift_alt r10
479
480#define edge_dx_dy_alt_low r4
481#define edge_dx_dy_alt_high r5
482
483#define span_edge_data r4
484#define span_uvrg_offset r5
485#define span_b_offset r6
486
487#define clip r14
488
489#define b r11
490#define b_dy r12
491
492
493#define alternate_x q0
494#define alternate_dx_dy q1
495#define alternate_x_32 q2
496
497#define alternate_x_low d0
498#define alternate_x_high d1
499#define alternate_dx_dy_low d2
500#define alternate_dx_dy_high d3
501#define alternate_x_32_low d4
502#define alternate_x_32_high d5
503
504#define left_x q3
505#define right_x q4
506#define left_dx_dy q5
507#define right_dx_dy q6
508#define left_edge q7
509#define right_edge q8
510
511#define left_x_low d6
512#define left_x_high d7
513#define right_x_low d8
514#define right_x_high d9
515#define left_dx_dy_low d10
516#define left_dx_dy_high d11
517#define right_dx_dy_low d12
518#define right_dx_dy_high d13
519#define left_edge_low d14
520#define left_edge_high d15
521#define right_edge_low d16
522#define right_edge_high d17
523
524#define y_mid_point d18
525#define c_0x0004 d19
526
527#define left_right_x_16 q11
528#define span_shifts_y q12
529#define c_0x0001 q13
530
531#define span_shifts d24
532#define y_x4 d25
533#define c_0xFFFE d26
534#define c_0x0007 d27
535
536#define left_right_x_16_low d22
537#define left_right_x_16_high d23
538
539#define uvrg q14
540#define uvrg_dy q15
541
542#define alternate_x_16 d4
543
544#define v_clip q3
545#define v_clip_low d6
546
547#define right_x_32 q10
548#define left_x_32 q11
549#define alternate_select d24
550
551#define right_x_32_low d20
552#define right_x_32_high d21
553#define left_x_32_low d22
554#define left_x_32_high d23
555
556#define edges_xy q0
557#define edges_dx_dy d2
558#define edge_shifts d3
559#define edge_shifts_64 q2
560
561#define edges_xy_left d0
562#define edges_xy_right d1
563
564#define height_reciprocals d6
565#define heights d7
566
567#define widths d8
568#define c_0x01 d9
569#define x_starts d10
570#define x_ends d11
571
572#define heights_b d12
573#define edges_dx_dy_64 q10
574
575#define edges_dx_dy_64_left d20
576#define edges_dx_dy_64_right d21
577
578
// Common entry sequence of the setup_spans_* routines: saves r4 - r11 and lr,
// loads the signed 16-bit x (offset 8) and y (offset 10) of the three
// vertices, loads uvrg and uvrg_dy from psx_gpu, fetches the reciprocal table
// pointer and sets every 32-bit lane of c_0x01 to 1.
// y_a / y_b / y_c share r1 / r2 / r3 with v_a / v_b / v_c, so each y load
// overwrites the vertex pointer it was read through; the x loads come first.
579#define setup_spans_prologue() \
580 stmdb sp!, { r4 - r11, lr }; \
581 \
e1f6de8f 582 ldrsh x_a, [v_a, #8]; \
583 ldrsh x_b, [v_b, #8]; \
584 ldrsh x_c, [v_c, #8]; \
585 ldrsh y_a, [v_a, #10]; \
586 ldrsh y_b, [v_b, #10]; \
587 ldrsh y_c, [v_c, #10]; \
75e28f62
E
588 \
589 add temp, psx_gpu, #psx_gpu_uvrg_offset; \
e1f6de8f 590 vld1.32 { uvrg }, [temp]; \
75e28f62 591 add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
e1f6de8f 592 vld1.32 { uvrg_dy }, [temp]; \
593 ldr reciprocal_table_ptr, [psx_gpu, #psx_gpu_reciprocal_table_ptr_offset]; \
75e28f62
E
594 \
595 vmov.u32 c_0x01, #0x01 \
596
// Loads the scalar b interpolant and its per-row step (b_dy) from psx_gpu
// into r11 and r12.
597#define setup_spans_load_b() \
e1f6de8f 598 ldr b, [psx_gpu, #psx_gpu_b_offset]; \
599 ldr b_dy, [psx_gpu, #psx_gpu_b_dy_offset] \
75e28f62
E
600
// Prepares the span emission loop: points span_uvrg_offset, span_edge_data
// and span_b_offset at their arrays inside psx_gpu, broadcasts
// viewport_start_x into left_edge and viewport_end_x + 1 into right_edge, and
// loads the 16-bit lane constants.
// The three span pointers share r4 - r6 with x_a / x_b / x_c and the
// edge_alt_* scalars, so those values are overwritten here.
// c_0x0001 is q13, the same storage as c_0xFFFE (d26) and c_0x0007 (d27): it
// is only valid for the vadd on right_edge and is replaced by the two
// constants set at the end of the macro.
601#define setup_spans_prologue_b() \
602 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
603 add temp, psx_gpu, #psx_gpu_viewport_start_x_offset; \
604 \
605 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
606 vmov.u16 c_0x0004, #0x0004; \
607 \
608 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
609 vmov.u16 c_0x0001, #0x0001; \
610 \
e1f6de8f 611 vld1.u16 { left_edge_low[], left_edge_high[] }, [temp]; \
75e28f62
E
612 add temp, psx_gpu, #psx_gpu_viewport_end_x_offset; \
613 \
e1f6de8f 614 vld1.u16 { right_edge_low[], right_edge_high[] }, [temp]; \
75e28f62
E
615 vadd.u16 right_edge, right_edge, c_0x0001; \
616 \
617 vmov.u16 c_0x0007, #0x0007; \
618 vmvn.u16 c_0xFFFE, #0x0001 \
619
620
// Edge setup for two edges that share one height (register "height").
// Expects x_starts and x_ends to hold the two start / end x values.
// The reciprocal table word for that height is broadcast into edge_shifts;
// height_reciprocals is that word shifted right by 10, and the vbic clears
// bits 5 - 7 of each 16-bit lane of edge_shifts.
// NOTE(review): this suggests the table word packs a reciprocal above bit 10
// and a shift count in the low 5 bits - confirm against the table generator.
// Results:
//   edges_dx_dy = (x_ends - x_starts) * height_reciprocals       (32-bit)
//   edges_xy    = ((heights - 1) + x_starts * heights)
//                 * height_reciprocals                            (64-bit)
621#define compute_edge_delta_x2() \
e1f6de8f 622 ldr temp, [reciprocal_table_ptr, height, lsl #2]; \
75e28f62
E
623 \
624 vdup.u32 heights, height; \
625 vsub.u32 widths, x_ends, x_starts; \
626 \
627 vdup.u32 edge_shifts, temp; \
628 vsub.u32 heights_b, heights, c_0x01; \
7d5140f5 629 vshr.u32 height_reciprocals, edge_shifts, #10; \
75e28f62
E
630 \
631 vmla.s32 heights_b, x_starts, heights; \
632 vbic.u16 edge_shifts, #0xE0; \
633 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
634 vmull.s32 edges_xy, heights_b, height_reciprocals \
635
// Scalar scratch registers used by compute_edge_delta_x3 for the third
// ("alternate") edge.
636#define width_alt r6
637#define height_reciprocal_alt r11
638#define height_b_alt r12
639

// Edge setup for three edges. The two NEON lanes use height_a / height_b with
// x_starts / x_ends, exactly as compute_edge_delta_x2 does for one height.
// The third edge is computed on the integer core from height_minor_b, with
// width x_c - start_c:
//   edge_dx_dy_alt              = width_alt * height_reciprocal_alt
//   edge_alt_high:edge_alt_low  = (height_minor_b - 1 + height_minor_b *
//                                 start_c) * height_reciprocal_alt (64-bit)
//   edge_shift_alt              = table word & 0x1F
// Registers overwritten besides temp: r4 / r5 (edge_alt_low / edge_alt_high,
// shared with x_a / x_b), r6 (width_alt and edge_dx_dy_alt, shared with x_c),
// r10 (edge_shift_alt, shared with reciprocal_table_ptr), r11 and r12.
640#define compute_edge_delta_x3(start_c, height_a, height_b) \
ed0fd81d 641 vmov heights, height_a, height_b; \
e1f6de8f 642 ldr temp, [reciprocal_table_ptr, height_a, lsl #2]; \
75e28f62 643 vmov.u32 edge_shifts[0], temp; \
e1f6de8f 644 ldr temp, [reciprocal_table_ptr, height_b, lsl #2]; \
75e28f62 645 vmov.u32 edge_shifts[1], temp; \
e1f6de8f 646 ldr edge_shift_alt, [reciprocal_table_ptr, height_minor_b, lsl #2]; \
75e28f62
E
647 \
648 vsub.u32 widths, x_ends, x_starts; \
649 sub width_alt, x_c, start_c; \
650 \
651 vsub.u32 heights_b, heights, c_0x01; \
652 sub height_b_alt, height_minor_b, #1; \
653 \
7d5140f5
E
654 vshr.u32 height_reciprocals, edge_shifts, #10; \
655 lsr height_reciprocal_alt, edge_shift_alt, #10; \
75e28f62
E
656 \
657 vmla.s32 heights_b, x_starts, heights; \
658 mla height_b_alt, height_minor_b, start_c, height_b_alt; \
659 \
660 vbic.u16 edge_shifts, #0xE0; \
661 and edge_shift_alt, edge_shift_alt, #0x1F; \
662 \
663 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
664 mul edge_dx_dy_alt, width_alt, height_reciprocal_alt; \
665 \
666 vmull.s32 edges_xy, heights_b, height_reciprocals; \
667 smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt \
668
669
// Per-iteration stepping helpers, selected by token pasting on the direction
// name (up / down).
// setup_spans_adjust_y_*: subtract (up) or add (down) c_0x0004 to y_x4.
670#define setup_spans_adjust_y_up() \
671 vsub.u32 y_x4, y_x4, c_0x0004 \
672
673#define setup_spans_adjust_y_down() \
674 vadd.u32 y_x4, y_x4, c_0x0004 \
675
// setup_spans_adjust_interpolants_*: step uvrg by uvrg_dy and b by b_dy,
// backwards (up) or forwards (down).
676#define setup_spans_adjust_interpolants_up() \
677 vsub.u32 uvrg, uvrg, uvrg_dy; \
678 sub b, b, b_dy \
679
680#define setup_spans_adjust_interpolants_down() \
681 vadd.u32 uvrg, uvrg, uvrg_dy; \
682 add b, b, b_dy \
683
684
// Helpers that skip "clip" rows at the start of an edge walk.
// setup_spans_clip_interpolants_*: move b and uvrg by clip times their
// per-row step, forwards (increment) or backwards (decrement).
685#define setup_spans_clip_interpolants_increment() \
686 mla b, b_dy, clip, b; \
687 vmla.s32 uvrg, uvrg_dy, v_clip \
688
689#define setup_spans_clip_interpolants_decrement() \
690 mls b, b_dy, clip, b; \
691 vmls.s32 uvrg, uvrg_dy, v_clip \
692
// setup_spans_clip_alternate_*: when the alternate edge is active, advance its
// 64-bit position by edge_dx_dy_alt * clip; the "no" variant is empty.
693#define setup_spans_clip_alternate_yes() \
694 smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip \
695
696#define setup_spans_clip_alternate_no() \
697
// setup_spans_clip(direction, alternate_active):
//   direction        - increment or decrement (pasted into the macro name)
//   alternate_active - yes or no (pasted into the macro name)
// Broadcasts clip into v_clip, applies the two helpers above and adds
// edges_dx_dy * clip to edges_xy.
// v_clip is q3, the same storage as height_reciprocals (d6) and heights (d7),
// so both of those are overwritten.
698#define setup_spans_clip(direction, alternate_active) \
699 vdup.u32 v_clip, clip; \
700 setup_spans_clip_alternate_##alternate_active(); \
701 setup_spans_clip_interpolants_##direction(); \
702 vmlal.s32 edges_xy, edges_dx_dy, v_clip_low \
703
704
// setup_spans_adjust_edges_alternate_no(left_index, right_index)
//   left_index / right_index - "left" or "right": which lane of edges_xy and
//   edges_dx_dy_64 feeds the left and the right edge (token pasted).
// Widens edge_shifts and edges_dx_dy to 64 bits, shifts edges_xy and the
// widened steps by edge_shifts, then fills the left / right vectors:
//   *_x low lane  = edge position of the first row
//   *_x high lane = that position plus one step
//   *_dx_dy       = the step in both lanes, doubled
// so that one vector add advances both lanes by two rows.
705#define setup_spans_adjust_edges_alternate_no(left_index, right_index) \
706 vmovl.s32 edge_shifts_64, edge_shifts; \
707 vmovl.s32 edges_dx_dy_64, edges_dx_dy; \
708 \
709 vshl.s64 edges_xy, edges_xy, edge_shifts_64; \
710 vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64; \
711 \
712 vmov left_x_low, edges_xy_##left_index; \
713 vmov right_x_low, edges_xy_##right_index; \
714 \
715 vmov left_dx_dy_low, edges_dx_dy_64_##left_index; \
716 vmov left_dx_dy_high, edges_dx_dy_64_##left_index; \
717 vmov right_dx_dy_low, edges_dx_dy_64_##right_index; \
718 vmov right_dx_dy_high, edges_dx_dy_64_##right_index; \
719 \
720 vadd.u64 left_x_high, left_x_low, left_dx_dy_low; \
721 vadd.u64 right_x_high, right_x_low, right_dx_dy_low; \
722 \
723 vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy; \
724 vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy \
725
726
// setup_spans_adjust_edges_alternate_yes(left_index, right_index)
// Does everything setup_spans_adjust_edges_alternate_no does, then prepares
// the alternate edge in the same two-rows-per-vector form:
//   - y_mid_point gets y_b in every 16-bit lane,
//   - the 64-bit pair edge_alt_high:edge_alt_low is shifted left by
//     edge_shift_alt (temp = 32 - edge_shift_alt carries bits across words),
//   - edge_dx_dy_alt is widened to 64 bits and shifted the same way,
//   - alternate_x high lane = low lane + one step, alternate_dx_dy is doubled.
// alternate_x and alternate_dx_dy are q0 / q1, the storage that held edges_xy
// and edges_dx_dy, which is why the "no" part has to run first.
727#define setup_spans_adjust_edges_alternate_yes(left_index, right_index) \
728 setup_spans_adjust_edges_alternate_no(left_index, right_index); \
729 \
730 vdup.u16 y_mid_point, y_b; \
731 rsb temp, edge_shift_alt, #32; \
732 \
733 lsl edge_alt_high, edge_alt_high, edge_shift_alt; \
734 orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp; \
735 lsl edge_alt_low, edge_alt_low, edge_shift_alt; \
736 vmov alternate_x_low, edge_alt_low, edge_alt_high; \
737 \
738 asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp; \
739 lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt; \
740 vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high; \
741 vmov alternate_dx_dy_high, alternate_dx_dy_low; \
742 \
743 vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low; \
744 vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy \
745
746
// setup_spans_y_select_*: build the per-row mask alternate_select from a
// signed 16-bit compare of y_x4 against y_mid_point
// (up: y < mid point, down: y > mid point).
747#define setup_spans_y_select_up() \
748 vclt.s16 alternate_select, y_x4, y_mid_point \
749
750#define setup_spans_y_select_down() \
751 vcgt.s16 alternate_select, y_x4, y_mid_point \
752
753
// setup_spans_alternate_select_*: where alternate_select is set, replace the
// left (low half) or right (high half) x of left_right_x_16 with
// alternate_x_16.
754#define setup_spans_alternate_select_left() \
755 vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select \
756
757#define setup_spans_alternate_select_right() \
758 vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select \
759
760
// setup_spans_set_x4_alternate_yes(alternate, direction)
//   alternate - left or right: which side takes the alternate edge
//   direction - up or down: stepping direction of y and the interpolants
// Emits four spans per expansion:
//   - the upper 32 bits of the 64-bit left / right / alternate x positions
//     are taken for rows 0-1, the edges are stepped, the same is done for
//     rows 2-3, and the values are narrowed to 16 bits,
//   - rows selected by setup_spans_y_select_* take the alternate x,
//   - uvrg and b are stored four times, stepping once after every store,
//   - x is clamped to [left_edge, right_edge],
//   - with width = right - left: span_shifts = (width + 7) & 7, the right
//     mask is c_0xFFFE shifted left by span_shifts, and the second field
//     becomes (width + 7) >> 3,
//   - vst4.u16 interleaves the four registers into four 8-byte records, in
//     the field order of the edge_data_*_offset constants at the top of the
//     file (left_x 0, num_blocks 2, right_mask 4, y 6).
// Finally y_x4 is stepped by four rows.
761#define setup_spans_set_x4_alternate_yes(alternate, direction) \
762 vshrn.s64 alternate_x_32_low, alternate_x, #32; \
763 vshrn.s64 left_x_32_low, left_x, #32; \
764 vshrn.s64 right_x_32_low, right_x, #32; \
765 \
766 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
767 vadd.u64 left_x, left_x, left_dx_dy; \
768 vadd.u64 right_x, right_x, right_dx_dy; \
769 \
770 vshrn.s64 alternate_x_32_high, alternate_x, #32; \
771 vshrn.s64 left_x_32_high, left_x, #32; \
772 vshrn.s64 right_x_32_high, right_x, #32; \
773 \
774 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
775 vadd.u64 left_x, left_x, left_dx_dy; \
776 vadd.u64 right_x, right_x, right_dx_dy; \
777 \
778 vmovn.u32 alternate_x_16, alternate_x_32; \
779 setup_spans_y_select_##direction(); \
780 vmovn.u32 left_right_x_16_low, left_x_32; \
781 \
782 vmovn.u32 left_right_x_16_high, right_x_32; \
783 setup_spans_alternate_select_##alternate(); \
784 \
e1f6de8f 785 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
786 str b, [span_b_offset], #4; \
75e28f62
E
787 setup_spans_adjust_interpolants_##direction(); \
788 \
789 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
790 \
e1f6de8f 791 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
792 str b, [span_b_offset], #4; \
75e28f62
E
793 setup_spans_adjust_interpolants_##direction(); \
794 \
795 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
796 \
e1f6de8f 797 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
798 str b, [span_b_offset], #4; \
75e28f62
E
799 setup_spans_adjust_interpolants_##direction(); \
800 \
801 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
802 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
803 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
804 \
e1f6de8f 805 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
806 str b, [span_b_offset], #4; \
75e28f62
E
807 setup_spans_adjust_interpolants_##direction(); \
808 \
809 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
810 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
811 \
e1f6de8f 812 vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!; \
75e28f62
E
813 \
814 setup_spans_adjust_y_##direction() \
815
816
// setup_spans_set_x4_alternate_no(alternate, direction)
//   alternate - unused by this variant
//   direction - up or down: stepping direction of y and the interpolants
// Emits four spans per expansion from the left and right edges only:
//   - the upper 32 bits of the 64-bit x positions are taken for rows 0-1, the
//     edges are stepped, the same is done for rows 2-3, and the values are
//     narrowed to 16 bits,
//   - uvrg and b are stored four times, stepping once after every store,
//   - x is clamped to [left_edge, right_edge],
//   - with width = right - left: span_shifts = (width + 7) & 7, the right
//     mask is c_0xFFFE shifted left by span_shifts, and the second field
//     becomes (width + 7) >> 3,
//   - vst4.u16 interleaves the four registers into four 8-byte records, in
//     the field order of the edge_data_*_offset constants at the top of the
//     file (left_x 0, num_blocks 2, right_mask 4, y 6).
// Finally y_x4 is stepped by four rows.
817#define setup_spans_set_x4_alternate_no(alternate, direction) \
818 vshrn.s64 left_x_32_low, left_x, #32; \
819 vshrn.s64 right_x_32_low, right_x, #32; \
820 \
821 vadd.u64 left_x, left_x, left_dx_dy; \
822 vadd.u64 right_x, right_x, right_dx_dy; \
823 \
824 vshrn.s64 left_x_32_high, left_x, #32; \
825 vshrn.s64 right_x_32_high, right_x, #32; \
826 \
827 vadd.u64 left_x, left_x, left_dx_dy; \
828 vadd.u64 right_x, right_x, right_dx_dy; \
829 \
830 vmovn.u32 left_right_x_16_low, left_x_32; \
831 vmovn.u32 left_right_x_16_high, right_x_32; \
832 \
e1f6de8f 833 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
834 str b, [span_b_offset], #4; \
75e28f62
E
835 setup_spans_adjust_interpolants_##direction(); \
836 \
837 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
838 \
e1f6de8f 839 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
840 str b, [span_b_offset], #4; \
75e28f62
E
841 setup_spans_adjust_interpolants_##direction(); \
842 \
843 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
844 \
e1f6de8f 845 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
846 str b, [span_b_offset], #4; \
75e28f62
E
847 setup_spans_adjust_interpolants_##direction(); \
848 \
849 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
850 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
851 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
852 \
e1f6de8f 853 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
854 str b, [span_b_offset], #4; \
75e28f62
E
855 setup_spans_adjust_interpolants_##direction(); \
856 \
857 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
858 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
859 \
e1f6de8f 860 vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!; \
75e28f62
E
861 \
862 setup_spans_adjust_y_##direction() \
863
864
// Scratch pair for the 64-bit product below. r11 / r12 are also b / b_dy, so
// setup_spans_load_b has to run after this adjustment.
865#define edge_adjust_low r11
866#define edge_adjust_high r12
867
// setup_spans_alternate_adjust_yes: subtracts the 64-bit product
// edge_dx_dy_alt * height_minor_a from edge_alt_high:edge_alt_low.
// The "no" variant is empty.
868#define setup_spans_alternate_adjust_yes() \
869 smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a; \
870 subs edge_alt_low, edge_alt_low, edge_adjust_low; \
871 sbc edge_alt_high, edge_alt_high, edge_adjust_high \
872
873#define setup_spans_alternate_adjust_no() \
874
875
// setup_spans_down(left_index, right_index, alternate, alternate_active)
//   left_index / right_index - lanes used for the left / right edge
//   alternate                - side that takes the alternate edge
//   alternate_active         - yes or no: whether an alternate edge exists
// Walks the edges in increasing y and emits the spans:
//   1. optionally rewinds the alternate edge, then loads b / b_dy,
//   2. if y_c is below viewport_end_y, height becomes
//      height - (y_c - viewport_end_y) + 1,
//   3. if y_a is above viewport_start_y, the rows in between (clip) are
//      skipped: height and y_a are adjusted and setup_spans_clip advances the
//      edges and interpolants (local label 0 is the join point),
//   4. with height <= 0 nothing is emitted (branch to local label 1),
//   5. y_x4 is built as the 16-bit lanes y_a, y_a + 1, y_a + 2, y_a + 3,
//   6. the edges are put into two-rows-per-vector form, the span pointers and
//      constants are set up, height is stored as the span count,
//   7. the loop at local label 2 emits four spans per pass until height has
//      been reduced to zero or below, so up to three spans beyond the count
//      may be written.
// The macro uses numeric local labels 0, 1 and 2.
876#define setup_spans_down(left_index, right_index, alternate, alternate_active) \
877 setup_spans_alternate_adjust_##alternate_active(); \
878 setup_spans_load_b(); \
879 \
e1f6de8f 880 ldrsh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]; \
75e28f62
E
881 subs y_c, y_c, temp; \
882 subgt height, height, y_c; \
883 addgt height, height, #1; \
884 \
e1f6de8f 885 ldrsh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]; \
75e28f62
E
886 subs clip, temp, y_a; \
887 ble 0f; \
888 \
889 sub height, height, clip; \
890 add y_a, y_a, clip; \
891 setup_spans_clip(increment, alternate_active); \
892 \
893 0: \
894 cmp height, #0; \
895 ble 1f; \
896 \
897 orr temp, y_a, y_a, lsl #16; \
898 add temp, temp, #(1 << 16); \
899 add y_a, temp, #2; \
900 add y_a, y_a, #(2 << 16); \
ed0fd81d 901 vmov y_x4, temp, y_a; \
75e28f62
E
902 \
903 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
904 right_index); \
905 setup_spans_prologue_b(); \
906 \
e1f6de8f 907 strh height, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
908 \
909 2: \
910 setup_spans_set_x4_alternate_##alternate_active(alternate, down); \
911 subs height, height, #4; \
912 bhi 2b; \
913 \
914 1: \
915
916
// setup_spans_alternate_pre_increment_yes: adds edge_dx_dy_alt, sign-extended
// to 64 bits, to edge_alt_high:edge_alt_low. The "no" variant is empty.
917#define setup_spans_alternate_pre_increment_yes() \
918 adds edge_alt_low, edge_alt_low, edge_dx_dy_alt; \
919 adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31 \
920
921#define setup_spans_alternate_pre_increment_no() \
922
923
// setup_spans_up_decrement_yes: decrements height when the preceding flag
// setting instruction left a "less than or equal" condition; it depends on the
// flags at the expansion site. The "no" variant is empty.
924#define setup_spans_up_decrement_yes() \
925 suble height, height, #1 \
926
927#define setup_spans_up_decrement_no() \
928
929
// setup_spans_up(left_index, right_index, alternate, alternate_active)
//   left_index / right_index - lanes used for the left / right edge
//   alternate                - side that takes the alternate edge
//   alternate_active         - yes or no: whether an alternate edge exists
// Walks the edges in decreasing y, starting one row above y_a:
//   1. optionally rewinds the alternate edge, loads b / b_dy, y_a -= 1,
//   2. if y_c is above viewport_start_y, height is reduced by the difference;
//      otherwise, with an active alternate edge, height is reduced by one
//      (setup_spans_up_decrement_yes uses the flags of that subtraction),
//   3. if y_a is below viewport_end_y, the rows in between (clip) are skipped:
//      height and y_a are adjusted and setup_spans_clip moves the edges and
//      interpolants (local label 0 is the join point),
//   4. with height <= 0 nothing is emitted (branch to local label 1),
//   5. y_x4 is built as the 16-bit lanes y_a, y_a - 1, y_a - 2, y_a - 3,
//   6. the edges are advanced by one step, put into two-rows-per-vector form,
//      the interpolants are stepped up once, the span pointers and constants
//      are set up, height is stored as the span count,
//   7. the loop at local label 2 emits four spans per pass until height has
//      been reduced to zero or below, so up to three spans beyond the count
//      may be written.
// NOTE(review): the viewport y values are loaded with ldrh here but with ldrsh
// in setup_spans_down; the result differs only when bit 15 of the stored value
// is set - confirm this is intended.
// The macro uses numeric local labels 0, 1 and 2.
930#define setup_spans_up(left_index, right_index, alternate, alternate_active) \
931 setup_spans_alternate_adjust_##alternate_active(); \
932 setup_spans_load_b(); \
933 sub y_a, y_a, #1; \
934 \
e1f6de8f 935 ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]; \
75e28f62
E
936 subs temp, temp, y_c; \
937 subgt height, height, temp; \
938 setup_spans_up_decrement_##alternate_active(); \
939 \
e1f6de8f 940 ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]; \
75e28f62
E
941 subs clip, y_a, temp; \
942 ble 0f; \
943 \
944 sub height, height, clip; \
945 sub y_a, y_a, clip; \
946 setup_spans_clip(decrement, alternate_active); \
947 \
948 0: \
949 cmp height, #0; \
950 ble 1f; \
951 \
952 orr temp, y_a, y_a, lsl #16; \
953 sub temp, temp, #(1 << 16); \
954 sub y_a, temp, #2; \
955 sub y_a, y_a, #(2 << 16); \
ed0fd81d 956 vmov y_x4, temp, y_a; \
75e28f62
E
957 \
958 vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
959 \
960 setup_spans_alternate_pre_increment_##alternate_active(); \
961 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
962 right_index); \
963 setup_spans_adjust_interpolants_up(); \
964 setup_spans_prologue_b(); \
965 \
e1f6de8f 966 strh height, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
967 \
968 2: \
969 setup_spans_set_x4_alternate_##alternate_active(alternate, up); \
970 subs height, height, #4; \
971 bhi 2b; \
972 \
973 1: \
974
975
// Restores r4 - r11 saved by setup_spans_prologue and returns by loading the
// saved lr into pc.
976#define setup_spans_epilogue() \
977 ldmia sp!, { r4 - r11, pc } \
978
979
// setup_spans_up_up(minor, major)
//   minor / major - left or right: side of the two-segment (minor) edge and
//   of the single long (major) edge.
// Body of the routines for a triangle walked upwards from v_a through v_b to
// v_c: height_minor_a = y_a - y_b, height_minor_b = y_b - y_c,
// height = y_a - y_c. Both NEON edges start at x_a; their ends are x_c (lane
// 0, paired with height_major) and x_b (lane 1, paired with height_minor_a).
// The alternate edge runs from x_b to x_c over height_minor_b.
980#define setup_spans_up_up(minor, major) \
981 setup_spans_prologue(); \
982 sub height_minor_a, y_a, y_b; \
983 sub height_minor_b, y_b, y_c; \
984 sub height, y_a, y_c; \
985 \
986 vdup.u32 x_starts, x_a; \
ed0fd81d 987 vmov x_ends, x_c, x_b; \
75e28f62
E
988 \
989 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
990 setup_spans_up(major, minor, minor, yes); \
991 setup_spans_epilogue() \
992
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c. Minor edge on the left.
993function(setup_spans_up_left)
994 setup_spans_up_up(left, right)
995
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c. Minor edge on the right.
996function(setup_spans_up_right)
997 setup_spans_up_up(right, left)
998
75e28f62
E
// setup_spans_down_down(minor, major)
//   minor / major - left or right: side of the two-segment (minor) edge and
//   of the single long (major) edge.
// Body of the routines for a triangle walked downwards from v_a through v_b
// to v_c: height_minor_a = y_b - y_a, height_minor_b = y_c - y_b,
// height = y_c - y_a. Both NEON edges start at x_a; their ends are x_c (lane
// 0, paired with height_major) and x_b (lane 1, paired with height_minor_a).
// The alternate edge runs from x_b to x_c over height_minor_b.
999#define setup_spans_down_down(minor, major) \
1000 setup_spans_prologue(); \
1001 sub height_minor_a, y_b, y_a; \
1002 sub height_minor_b, y_c, y_b; \
1003 sub height, y_c, y_a; \
1004 \
1005 vdup.u32 x_starts, x_a; \
ed0fd81d 1006 vmov x_ends, x_c, x_b; \
75e28f62
E
1007 \
1008 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
1009 setup_spans_down(major, minor, minor, yes); \
1010 setup_spans_epilogue() \
1011
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c. Minor edge on the left.
1012function(setup_spans_down_left)
1013 setup_spans_down_down(left, right)
1014
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c. Minor edge on the right.
1015function(setup_spans_down_right)
1016 setup_spans_down_down(right, left)
1017
1017
1018
// setup_spans_up_flat: shared tail of the two routines below. Both edges span
// the same height (y_a - y_c), so the two-edge setup is used and no alternate
// edge is involved. The caller must already have run setup_spans_prologue and
// filled x_starts / x_ends.
1019#define setup_spans_up_flat() \
1020 sub height, y_a, y_c; \
1021 \
1022 compute_edge_delta_x2(); \
1023 setup_spans_up(left, right, none, no); \
1024 setup_spans_epilogue() \
1025
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c.
// Upward walk whose edges start at x_a (left) and x_b (right) and both end
// at x_c.
1026function(setup_spans_up_a)
1027 setup_spans_prologue()
1028
ed0fd81d 1029 vmov x_starts, x_a, x_b
75e28f62
E
1030 vdup.u32 x_ends, x_c
1031
1032 setup_spans_up_flat()
1033
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c.
// Upward walk whose edges both start at x_a and end at x_b (left) and
// x_c (right).
1034function(setup_spans_up_b)
1035 setup_spans_prologue()
1036
1037 vdup.u32 x_starts, x_a
ed0fd81d 1038 vmov x_ends, x_b, x_c
75e28f62
E
1039
1040 setup_spans_up_flat()
1041
// setup_spans_down_flat: shared tail of the two routines below. Both edges
// span the same height (y_c - y_a), so the two-edge setup is used and no
// alternate edge is involved. The caller must already have run
// setup_spans_prologue and filled x_starts / x_ends.
1042#define setup_spans_down_flat() \
1043 sub height, y_c, y_a; \
1044 \
1045 compute_edge_delta_x2(); \
1046 setup_spans_down(left, right, none, no); \
1047 setup_spans_epilogue() \
1048
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c.
// Downward walk whose edges start at x_a (left) and x_b (right) and both end
// at x_c.
1049function(setup_spans_down_a)
1050 setup_spans_prologue()
1051
ed0fd81d 1052 vmov x_starts, x_a, x_b
75e28f62
E
1053 vdup.u32 x_ends, x_c
1054
1055 setup_spans_down_flat()
1056
// r0: psx_gpu, r1: v_a, r2: v_b, r3: v_c.
// Downward walk whose edges both start at x_a and end at x_b (left) and
// x_c (right).
1057function(setup_spans_down_b)
1058 setup_spans_prologue()
1059
1060 vdup.u32 x_starts, x_a
ed0fd81d 1061 vmov x_ends, x_b, x_c
75e28f62
E
1062
1063 setup_spans_down_flat()
1064
1065
1066#define middle_y r9
1067
1068#define edges_xy_b q11
1069#define edges_dx_dy_b d26
1070#define edge_shifts_b d27
1071#define edges_dx_dy_and_shifts_b q13
1072#define height_increment d20
1073
1074#define edges_dx_dy_and_shifts q1
1075
1076#define edges_xy_b_left d22
1077#define edges_xy_b_right d23
1078
1079#define setup_spans_up_down_load_edge_set_b() \
1080 vmov edges_xy, edges_xy_b; \
1081 vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b \
1082
1083
/*
 * setup_spans_up_down: fills the span list in two passes that both start
 * from the row middle_y = y_a.  The first pass covers
 * height_minor_a = y_a - y_b rows with decreasing row numbers, the second
 * covers height_minor_b = y_c - y_a rows with increasing row numbers.
 * A second edge set (the *_b registers) is prepared before the first pass
 * and swapped in before the second.  Both passes are clipped against
 * psx_gpu viewport_start_y / viewport_end_y, and the total span count is
 * written to psx_gpu num_spans.
 */
1084function(setup_spans_up_down)
1085 setup_spans_prologue()
1086
1087 // s32 middle_y = y_a;
// minor_a = y_a - y_b, minor_b = y_c - y_a, major = y_c - y_b
1088 sub height_minor_a, y_a, y_b
1089 sub height_minor_b, y_c, y_a
1090 sub height_major, y_c, y_b
1091
ed0fd81d 1092 vmov x_starts, x_a, x_c
75e28f62
E
1093 vdup.u32 x_ends, x_b
1094
1095 compute_edge_delta_x3(x_a, height_minor_a, height_major)
1096
// height_increment = { 0, height_minor_b }, so the widening
// multiply-accumulate advances only the second lane of edges_xy.
1097 mov temp, #0
ed0fd81d 1098 vmov height_increment, temp, height_minor_b
75e28f62
E
1099 vmlal.s32 edges_xy, edges_dx_dy, height_increment
1100
// Build edge set b: the left position comes from edge_alt_low/high, the
// right position is copied from the (just advanced) edges_xy_right.
1101 vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1102 vmov edges_xy_b_right, edges_xy_right
1103
// Shifts for set b: same as set a except lane 0, which takes the alternate.
1104 vmov edge_shifts_b, edge_shifts
1105 vmov.u32 edge_shifts_b[0], edge_shift_alt
1106
// Slopes for set b: negated set a slopes, with lane 0 replaced by the
// alternate slope.
1107 vneg.s32 edges_dx_dy_b, edges_dx_dy
1108 vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1109
// Remember the shared row; y_a itself is modified by the first pass.
1110 mov middle_y, y_a
1111
1112 setup_spans_load_b()
1113 sub y_a, y_a, #1
1114
// If viewport_start_y lies past y_b, shorten the first pass by the excess.
e1f6de8f 1115 ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
75e28f62
E
1116 subs temp, temp, y_b
1117 subgt height_minor_a, height_minor_a, temp
1118
// clip = y_a - viewport_end_y; when positive, skip that many rows at the
// beginning of the first pass.
e1f6de8f 1119 ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
75e28f62
E
1120 subs clip, y_a, temp
1121 ble 0f
1122
1123 sub height_minor_a, height_minor_a, clip
1124 sub y_a, y_a, clip
1125 setup_spans_clip(decrement, no)
1126
1127 0:
// Nothing left for the first pass: label 3 only loads edge set b and then
// joins the second pass at label 4.
1128 cmp height_minor_a, #0
1129 ble 3f
1130
// Pack four row numbers as halfwords into y_x4: temp holds y_a (low half)
// and y_a - 1 (high half); y_a is then rewritten to hold the next two rows
// (each half 2 lower).
1131 orr temp, y_a, y_a, lsl #16
1132 sub temp, temp, #(1 << 16)
1133 sub y_a, temp, #2
1134 sub y_a, y_a, #(2 << 16)
ed0fd81d 1135 vmov y_x4, temp, y_a
75e28f62
E
1136
1137 vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1138
// The first pass contributes height_minor_a spans.
e1f6de8f 1139 strh height_minor_a, [psx_gpu, #psx_gpu_num_spans_offset]
75e28f62
E
1140
1141 setup_spans_adjust_edges_alternate_no(left, right);
1142 setup_spans_adjust_interpolants_up()
1143 setup_spans_up_down_load_edge_set_b()
1144
1145 setup_spans_prologue_b()
1146
1147
// First pass loop: the counter drops by 4 per iteration.
1148 2:
1149 setup_spans_set_x4_alternate_no(none, up)
1150 subs height_minor_a, height_minor_a, #4
1151 bhi 2b
1152
// height_minor_a is now zero or negative (the loop overshoot).  Pull the
// three output cursors back by that many entries: 8 bytes per edge entry,
// 16 per uvrg entry, 4 per b entry.
1153 add span_edge_data, span_edge_data, height_minor_a, lsl #3
1154 add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1155 add span_b_offset, span_b_offset, height_minor_a, lsl #2
1156
// Second pass: reload uvrg from psx_gpu and restart from the shared row.
1157 4:
1158 add temp, psx_gpu, #psx_gpu_uvrg_offset
e1f6de8f 1159 vld1.32 { uvrg }, [temp]
75e28f62
E
1160 mov y_a, middle_y
1161
1162 setup_spans_load_b()
1163
// y_c -= viewport_end_y; when positive, height_minor_b -= (y_c - 1).
e1f6de8f 1164 ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
75e28f62
E
1165 subs y_c, y_c, temp
1166 subgt height_minor_b, height_minor_b, y_c
1167 addgt height_minor_b, height_minor_b, #1
1168
// clip = viewport_start_y - y_a; when positive, skip that many rows at the
// beginning of the second pass.
e1f6de8f 1169 ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
75e28f62
E
1170 subs clip, temp, y_a
1171 ble 0f
1172
1173 sub height_minor_b, height_minor_b, clip
1174 add y_a, y_a, clip
1175 setup_spans_clip(increment, no)
1176
1177 0:
1178 cmp height_minor_b, #0
1179 ble 1f
1180
// Same halfword packing as in the first pass, but with increasing rows:
// y_a, y_a + 1 in temp and the next two rows in y_a.
1181 orr temp, y_a, y_a, lsl #16
1182 add temp, temp, #(1 << 16)
1183 add y_a, temp, #2
1184 add y_a, y_a, #(2 << 16)
ed0fd81d 1185 vmov y_x4, temp, y_a
75e28f62
E
1186
1187 setup_spans_adjust_edges_alternate_no(left, right)
1188
// num_spans += height_minor_b; a total of exactly MAX_SPANS is diverted to
// the special case at label 5.
e1f6de8f 1189 ldrh temp, [psx_gpu, #psx_gpu_num_spans_offset]
75e28f62 1190 add temp, temp, height_minor_b
b7569147 1191
1192 cmp temp, #MAX_SPANS
1193 beq 5f
1194
e1f6de8f 1195 strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
75e28f62
E
1196
// Second pass loop: the counter drops by 4 per iteration.
1197 2:
1198 setup_spans_set_x4_alternate_no(none, down)
1199 subs height_minor_b, height_minor_b, #4
1200 bhi 2b
1201
1202 1:
1203 setup_spans_epilogue()
1204
// Reached when the first pass is empty: still switch to edge set b, then
// run the second pass.
1205 3:
1206 setup_spans_up_down_load_edge_set_b()
1207 setup_spans_prologue_b()
1208 bal 4b
1209
b7569147 1210 5:
1211 // FIXME: overflow corner case
// Undo the addition, round height_minor_b down to a multiple of 4, and store
// the reduced total.  The Z flag from bics survives the add/strh below, so
// the loop at label 2 is entered only when rows remain.
1212 sub temp, temp, height_minor_b
1213 bics height_minor_b, #3
1214 add temp, temp, height_minor_b
e1f6de8f 1215 strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
b7569147 1216 bne 2b
1217 bal 1b
1218
75e28f62
E
/*
 * Register names for the setup_blocks_* routines that follow.
 *
 * Several names share one core register, so only one of them can be live at
 * a time:
 *   r2  span_uvrg_offset / uvrg_dx_ptr / color
 *   r3  span_edge_data / texture_mask_ptr / color_r
 *   r4  span_b_offset / color_g
 *   r5  b_dx / color_b
 *   r7  y / c_32
 *   r8  left_x / dither_shift / b_dx4 / right_mask
 *   r9  b / b_dx8
 *   r10 dither_offset_ptr / dither_row / block_ptr_b / block_span_ptr
 */
1219#undef span_uvrg_offset
1220#undef span_edge_data
1221#undef span_b_offset
1222#undef left_x
1223#undef b
1224
1225#define psx_gpu r0
1226#define num_spans r1
1227#define span_uvrg_offset r2
1228#define span_edge_data r3
1229#define span_b_offset r4
1230#define b_dx r5
1231#define span_num_blocks r6
1232#define y r7
1233#define left_x r8
1234#define b r9
1235#define dither_offset_ptr r10
1236#define block_ptr_a r11
1237#define fb_ptr r12
1238#define num_blocks r14
1239
1240#define uvrg_dx_ptr r2
1241#define texture_mask_ptr r3
1242#define dither_shift r8
1243#define dither_row r10
1244
1245#define c_32 r7
1246#define b_dx4 r8
1247#define b_dx8 r9
1248#define block_ptr_b r10
1249
1250#define block_span_ptr r10
1251#define right_mask r8
1252
1253#define color r2
1254#define color_r r3
1255#define color_g r4
1256#define color_b r5
1257
1258#undef uvrg
1259
/* 32-bit per-lane accumulators, four lanes each. */
1260#define u_block q0
1261#define v_block q1
1262#define r_block q2
1263#define g_block q3
1264#define b_block q4
1265
/* uvrg_dx4 (q5) is the pair uv_dx4/rg_dx4 (d10/d11); uvrg_dx8 (q6) is the
 * pair uv_dx8/rg_dx8 (d12/d13).  uv_dx8 and rg_dx8 are defined twice below
 * with identical values. */
1266#define uv_dx4 d10
1267#define rg_dx4 d11
1268#define uv_dx8 d12
1269#define rg_dx8 d13
1270#define b_whole_8 d14
1271#define fb_mask_ptrs d15
1272
1273#define uvrg_dx4 q5
1274#define uvrg_dx8 q6
1275#define uv_dx8 d12
1276#define rg_dx8 d13
1277
/* 16-bit per-lane values, eight lanes each; the _low/_high names below are
 * the two halves of these quad registers. */
1278#define u_whole q8
1279#define v_whole q9
1280#define r_whole q10
1281#define g_whole q11
1282#define b_whole q12
1283
1284#define u_whole_low d16
1285#define u_whole_high d17
1286#define v_whole_low d18
1287#define v_whole_high d19
1288#define r_whole_low d20
1289#define r_whole_high d21
1290#define g_whole_low d22
1291#define g_whole_high d23
1292#define b_whole_low d24
1293#define b_whole_high d25
1294
/* q13 is used both as the dx4/dx8 scratch and, via d26/d27, as the 8-bit u/v
 * results (uv_whole_8). */
1295#define dx4 q13
1296#define dx8 q13
1297
/* d24/d25 (the halves of b_whole) are reused for u_whole_8b, r_whole_8 and
 * g_whole_8. */
1298#define u_whole_8 d26
1299#define v_whole_8 d27
1300#define u_whole_8b d24
1301#define r_whole_8 d24
1302#define g_whole_8 d25
1303
1304#define uv_whole_8 q13
1305#define uv_whole_8b q14
1306
1307#define dither_offsets q14
1308#define texture_mask q15
1309#define texture_mask_u d30
1310#define texture_mask_v d31
1311
1312#define dither_offsets_short d28
1313
/* The names below reuse registers already named above (q8, q9, q10, q1, q13,
 * q0, q3, q2). */
1314#define v_left_x q8
1315#define uvrg q9
1316#define block_span q10
1317
1318#define uv d18
1319#define rg d19
1320
1321#define draw_mask q1
1322#define draw_mask_edge q13
1323#define test_mask q0
1324
1325#define uvrg_dx q3
1326
1327#define colors q2
1328
/* Rearranges the 4-bit halves of each u/v byte pair:
 *   u_whole_8b = u & texture_mask_u       (scratch, d24)
 *   u          = (v << 4) | (u & 0x0f)
 *   v          = (v & 0xf0) | (u_whole_8b >> 4)
 * Clobbers u_whole_8b. */
1329#define setup_blocks_texture_swizzled() \
1330 vand.u8 u_whole_8b, u_whole_8, texture_mask_u; \
1331 vsli.u8 u_whole_8, v_whole_8, #4; \
1332 vsri.u8 v_whole_8, u_whole_8b, #4 \
1333
/* Unswizzled variant: expands to nothing, so u and v are left as they are. */
1334#define setup_blocks_texture_unswizzled() \
1335
1336
/*
 * Generates setup_blocks_shaded_textured_dithered_<swizzling>_indirect.
 * On entry r0 = psx_gpu.
 *
 * For every span recorded in psx_gpu (num_spans entries) the routine appends
 * span_num_blocks 64-byte records to the psx_gpu blocks array.  One record
 * corresponds to 16 bytes of framebuffer (fb_ptr advances by 16 per record)
 * and is laid out as:
 *   +0   interleaved u/v bytes (16 bytes)
 *   +16  r bytes, g bytes (8 + 8)
 *   +32  b bytes (8), then fb_mask_ptrs: word 0 = right mask (0 except for
 *        the last record of a span), word 1 = fb_ptr
 *   +48  dither offsets (16 bytes)
 * When adding a span would make num_blocks exceed MAX_BLOCKS,
 * flush_render_block_buffer is called first and filling restarts at the
 * beginning of the array.
 */
1337#define setup_blocks_shaded_textured_builder(swizzling) \
1338.align 3; \
1339 \
1340function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \
e1f6de8f 1341 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
1342 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1343 \
e1f6de8f 1344 vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128]; \
75e28f62
E
1345 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1346 /* no spans: return before anything is pushed */ \
1347 cmp num_spans, #0; \
1348 bxeq lr; \
1349 /* uvrg_dx4 and uvrg_dx8 are the deltas scaled by 4 and by 8 */ \
1350 stmdb sp!, { r4 - r11, r14 }; \
1351 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1352 \
e1f6de8f 1353 ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
75e28f62
E
1354 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1355 /* two consecutive bytes are replicated across texture_mask_u and _v */ \
e1f6de8f 1356 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16]; \
75e28f62
E
1357 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1358 \
e1f6de8f 1359 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
1360 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1361 \
1362 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
1363 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1364 /* block_ptr_a = blocks + num_blocks * 64 */ \
1365 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1366 /* 0: per-span loop */ \
1367 0: \
1368 vmov.u8 fb_mask_ptrs, #0; \
1369 \
e1f6de8f 1370 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
75e28f62
E
1371 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1372 \
e1f6de8f 1373 ldrh y, [span_edge_data, #edge_data_y_offset]; \
1374 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
75e28f62
E
1375 /* a span with zero blocks only advances the span cursors (label 1) */ \
1376 cmp span_num_blocks, #0; \
1377 beq 1f; \
1378 \
e1f6de8f 1379 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
75e28f62
E
1380 add num_blocks, span_num_blocks, num_blocks; \
1381 /* flush first (label 2) when the new total exceeds MAX_BLOCKS */ \
1382 cmp num_blocks, #MAX_BLOCKS; \
1383 bgt 2f; \
1384 /* 3: set up the first block of the span */ \
1385 3: \
e1f6de8f 1386 ldr b, [span_b_offset]; \
75e28f62
E
1387 add fb_ptr, fb_ptr, y, lsl #11; \
1388 /* fb_ptr = vram_out_ptr + (y << 11) + (left_x << 1), finished below */ \
1389 vdup.u32 v_left_x, left_x; \
1390 and y, y, #0x3; \
1391 /* the dither table row is selected by y & 3 */ \
e1f6de8f 1392 ldr dither_row, [dither_offset_ptr, y, lsl #2]; \
75e28f62
E
1393 add fb_ptr, fb_ptr, left_x, lsl #1; \
1394 /* b += b_dx * left_x; dither_shift then reuses the left_x register */ \
1395 mla b, b_dx, left_x, b; \
1396 and dither_shift, left_x, #0x03; \
1397 /* uvrg_dx shares q3 with g_block, so it is rebuilt from uvrg_dx4 here */ \
e1f6de8f 1398 vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
75e28f62
E
1399 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1400 /* uvrg += uvrg_dx * left_x */ \
1401 mov dither_shift, dither_shift, lsl #3; \
1402 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1403 /* flags from this subs are consumed by the beq 5f before label 4 */ \
1404 mov c_32, #32; \
1405 subs span_num_blocks, span_num_blocks, #1; \
1406 /* rotate the dither row right by (left_x & 3) * 8 bits */ \
1407 mov dither_row, dither_row, ror dither_shift; \
1408 mov b_dx4, b_dx, lsl #2; \
1409 \
1410 vdup.u32 dither_offsets_short, dither_row; \
1411 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1412 /* dither_offsets = dither bytes sign-extended to 16 bits, times 16 */ \
1413 vdup.u32 b_block, b; \
1414 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1415 \
1416 vdup.u32 u_block, uv[0]; \
1417 mov b_dx8, b_dx, lsl #3; \
1418 \
1419 vdup.u32 v_block, uv[1]; \
1420 vdup.u32 r_block, rg[0]; \
1421 vdup.u32 g_block, rg[1]; \
1422 /* add five consecutive 16-byte vectors, starting at u_block_span */ \
e1f6de8f 1423 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1424 \
1425 vadd.u32 u_block, u_block, block_span; \
e1f6de8f 1426 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1427 \
1428 vadd.u32 v_block, v_block, block_span; \
e1f6de8f 1429 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1430 \
1431 vadd.u32 r_block, r_block, block_span; \
e1f6de8f 1432 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1433 \
1434 vadd.u32 g_block, g_block, block_span; \
e1f6de8f 1435 vld1.u32 { block_span }, [block_span_ptr, :128]; \
75e28f62
E
1436 \
1437 vadd.u32 b_block, b_block, block_span; \
1438 add block_ptr_b, block_ptr_a, #16; \
1439 /* low halves: accumulator >> 16; high halves: (accumulator + dx4) >> 16 */ \
1440 vshrn.u32 u_whole_low, u_block, #16; \
1441 vshrn.u32 v_whole_low, v_block, #16; \
1442 vshrn.u32 r_whole_low, r_block, #16; \
1443 vshrn.u32 g_whole_low, g_block, #16; \
1444 \
1445 vdup.u32 dx4, uv_dx4[0]; \
1446 vshrn.u32 b_whole_low, b_block, #16; \
1447 \
1448 vaddhn.u32 u_whole_high, u_block, dx4; \
1449 vdup.u32 dx4, uv_dx4[1]; \
1450 \
1451 vaddhn.u32 v_whole_high, v_block, dx4; \
1452 vdup.u32 dx4, rg_dx4[0]; \
1453 \
1454 vaddhn.u32 r_whole_high, r_block, dx4; \
1455 vdup.u32 dx4, rg_dx4[1]; \
1456 \
1457 vaddhn.u32 g_whole_high, g_block, dx4; \
1458 vdup.u32 dx4, b_dx4; \
1459 \
1460 vaddhn.u32 b_whole_high, b_block, dx4; \
1461 vdup.u32 dx8, uv_dx8[0]; \
1462 /* advance every accumulator by its dx8 for the following block */ \
1463 vadd.u32 u_block, u_block, dx8; \
1464 vdup.u32 dx8, uv_dx8[1]; \
1465 \
1466 vadd.u32 v_block, v_block, dx8; \
1467 vdup.u32 dx8, rg_dx8[0]; \
1468 \
1469 vadd.u32 r_block, r_block, dx8; \
1470 vdup.u32 dx8, rg_dx8[1]; \
1471 \
1472 vadd.u32 g_block, g_block, dx8; \
1473 vdup.u32 dx8, b_dx8; \
1474 \
1475 vadd.u32 b_block, b_block, dx8; \
1476 vmovn.u16 u_whole_8, u_whole; \
1477 /* keep the low byte of every 16-bit lane */ \
1478 vmovn.u16 v_whole_8, v_whole; \
1479 \
1480 vmovn.u16 b_whole_8, b_whole; \
e1f6de8f 1481 pld [fb_ptr]; \
75e28f62
E
1482 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1483 /* mask u/v, then apply the optional swizzle */ \
1484 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1485 setup_blocks_texture_##swizzling(); \
1486 /* single-block span: go straight to the last-block code */ \
1487 vmovn.u16 r_whole_8, r_whole; \
1488 beq 5f; \
1489 /* 4: store the current block while computing the next one */ \
1490 4: \
1491 vmovn.u16 g_whole_8, g_whole; \
1492 vshrn.u32 u_whole_low, u_block, #16; \
1493 \
e1f6de8f 1494 vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32; \
75e28f62
E
1495 vshrn.u32 v_whole_low, v_block, #16; \
1496 \
e1f6de8f 1497 vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32; \
75e28f62
E
1498 vshrn.u32 r_whole_low, r_block, #16; \
1499 \
e1f6de8f 1500 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
75e28f62
E
1501 vshrn.u32 g_whole_low, g_block, #16; \
1502 \
1503 vdup.u32 dx4, uv_dx4[0]; \
1504 vshrn.u32 b_whole_low, b_block, #16; \
1505 \
1506 vaddhn.u32 u_whole_high, u_block, dx4; \
1507 vdup.u32 dx4, uv_dx4[1]; \
1508 \
1509 vaddhn.u32 v_whole_high, v_block, dx4; \
1510 vdup.u32 dx4, rg_dx4[0]; \
1511 \
1512 vaddhn.u32 r_whole_high, r_block, dx4; \
1513 vdup.u32 dx4, rg_dx4[1]; \
1514 \
1515 vaddhn.u32 g_whole_high, g_block, dx4; \
1516 vdup.u32 dx4, b_dx4; \
1517 \
1518 vaddhn.u32 b_whole_high, b_block, dx4; \
1519 vdup.u32 dx8, uv_dx8[0]; \
1520 \
1521 vadd.u32 u_block, u_block, dx8; \
1522 vdup.u32 dx8, uv_dx8[1]; \
1523 \
1524 vadd.u32 v_block, v_block, dx8; \
1525 vdup.u32 dx8, rg_dx8[0]; \
1526 \
1527 vadd.u32 r_block, r_block, dx8; \
1528 vdup.u32 dx8, rg_dx8[1]; \
1529 \
1530 vadd.u32 g_block, g_block, dx8; \
1531 vdup.u32 dx8, b_dx8; \
1532 \
1533 vadd.u32 b_block, b_block, dx8; \
1534 vmovn.u16 u_whole_8, u_whole; \
1535 /* next block starts 16 bytes further along the framebuffer row */ \
1536 add fb_ptr, fb_ptr, #16; \
1537 vmovn.u16 v_whole_8, v_whole; \
1538 \
e1f6de8f 1539 vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
75e28f62
E
1540 vmovn.u16 b_whole_8, b_whole; \
1541 \
e1f6de8f 1542 pld [fb_ptr]; \
75e28f62
E
1543 \
1544 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1545 subs span_num_blocks, span_num_blocks, #1; \
1546 \
1547 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1548 setup_blocks_texture_##swizzling(); \
1549 \
1550 vmovn.u16 r_whole_8, r_whole; \
1551 bne 4b; \
1552 /* 5: last block of the span, which also carries the right edge mask */ \
1553 5: \
1554 vmovn.u16 g_whole_8, g_whole; \
e1f6de8f 1555 ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
75e28f62 1556 \
e1f6de8f 1557 vld1.u32 { test_mask }, [psx_gpu, :128]; \
75e28f62
E
1558 vdup.u8 draw_mask, right_mask; \
1559 /* draw_mask lane = all ones where right_mask and test_mask share a bit */ \
1560 vmov.u32 fb_mask_ptrs[0], right_mask; \
1561 vtst.u16 draw_mask, draw_mask, test_mask; \
1562 vzip.u8 u_whole_8, v_whole_8; \
1563 /* clear the interleaved u/v of the lanes selected by draw_mask */ \
1564 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
e1f6de8f 1565 vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32; \
1566 vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32; \
1567 vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
1568 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
75e28f62
E
1569 /* 1: advance the per-span cursors and store the running block count */ \
1570 1: \
1571 add span_uvrg_offset, span_uvrg_offset, #16; \
1572 add span_b_offset, span_b_offset, #4; \
1573 \
1574 add span_edge_data, span_edge_data, #8; \
1575 subs num_spans, num_spans, #1; \
1576 \
e1f6de8f 1577 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
1578 bne 0b; \
1579 \
1580 ldmia sp!, { r4 - r11, pc }; \
1581 /* 2: flush the block buffer, then refill it from its first entry */ \
1582 2: \
1583 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1584 vpush { texture_mask }; \
1585 vpush { uvrg_dx4 }; \
1586 \
4d646738 1587 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62 1588 bl flush_render_block_buffer; \
4d646738 1589 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62
E
1590 \
1591 vpop { uvrg_dx4 }; \
1592 vpop { texture_mask }; \
1593 /* uvrg_dx8 is recomputed as 2 * uvrg_dx4 instead of being saved */ \
1594 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1595 vmov.u8 fb_mask_ptrs, #0; \
1596 /* after the flush only this span counts toward num_blocks */ \
1597 mov num_blocks, span_num_blocks; \
1598 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1599 bal 3b \
1600
1601
1602setup_blocks_shaded_textured_builder(swizzled)
1603setup_blocks_shaded_textured_builder(unswizzled)
1604
1605
/*
 * Generates setup_blocks_unshaded_textured_dithered_<swizzling>_indirect.
 * On entry r0 = psx_gpu.
 *
 * Same structure as the shaded/textured builder, but only u and v are
 * interpolated.  Per 64-byte block record this routine writes:
 *   +0   interleaved u/v bytes (16 bytes)
 *   +32  b_whole_8 (d14, not computed by this routine), then fb_mask_ptrs:
 *        word 0 = right mask (0 except for the last record of a span),
 *        word 1 = fb_ptr
 *   +48  dither offsets (16 bytes)
 * Bytes +16..+31 are skipped.  When adding a span would make num_blocks
 * exceed MAX_BLOCKS, flush_render_block_buffer is called first and filling
 * restarts at the beginning of the blocks array.
 */
1606#define setup_blocks_unshaded_textured_builder(swizzling) \
1607.align 3; \
1608 \
1609function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \
e1f6de8f 1610 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
1611 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1612 \
e1f6de8f 1613 vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128]; \
75e28f62
E
1614 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1615 /* no spans: return before anything is pushed */ \
1616 cmp num_spans, #0; \
1617 bxeq lr; \
1618 /* uvrg_dx4 and uvrg_dx8 are the deltas scaled by 4 and by 8 */ \
1619 stmdb sp!, { r4 - r11, r14 }; \
1620 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1621 \
1622 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1623 /* two consecutive bytes are replicated across texture_mask_u and _v */ \
e1f6de8f 1624 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16]; \
75e28f62
E
1625 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1626 \
e1f6de8f 1627 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
1628 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1629 \
1630 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1631 /* block_ptr_a = blocks + num_blocks * 64 */ \
1632 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1633 /* 0: per-span loop */ \
1634 0: \
1635 vmov.u8 fb_mask_ptrs, #0; \
1636 \
e1f6de8f 1637 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
75e28f62
E
1638 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1639 \
e1f6de8f 1640 ldrh y, [span_edge_data, #edge_data_y_offset]; \
1641 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
75e28f62
E
1642 /* a span with zero blocks only advances the span cursors (label 1) */ \
1643 cmp span_num_blocks, #0; \
1644 beq 1f; \
1645 \
e1f6de8f 1646 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
75e28f62
E
1647 add num_blocks, span_num_blocks, num_blocks; \
1648 /* flush first (label 2) when the new total exceeds MAX_BLOCKS */ \
1649 cmp num_blocks, #MAX_BLOCKS; \
1650 bgt 2f; \
1651 /* 3: set up the first block of the span */ \
1652 3: \
1653 add fb_ptr, fb_ptr, y, lsl #11; \
1654 /* fb_ptr = vram_out_ptr + (y << 11) + (left_x << 1), finished below */ \
1655 vdup.u32 v_left_x, left_x; \
1656 and y, y, #0x3; \
1657 /* the dither table row is selected by y & 3 */ \
e1f6de8f 1658 ldr dither_row, [dither_offset_ptr, y, lsl #2]; \
75e28f62
E
1659 add fb_ptr, fb_ptr, left_x, lsl #1; \
1660 /* dither_shift reuses the left_x register from here on */ \
1661 and dither_shift, left_x, #0x03; \
1662 /* uvrg_dx shares q3 with other names, so it is rebuilt from uvrg_dx4 */ \
e1f6de8f 1663 vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
75e28f62
E
1664 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1665 /* uvrg += uvrg_dx * left_x */ \
1666 mov dither_shift, dither_shift, lsl #3; \
1667 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1668 /* flags from this subs are consumed by the beq 5f before label 4 */ \
1669 mov c_32, #32; \
1670 subs span_num_blocks, span_num_blocks, #1; \
1671 /* rotate the dither row right by (left_x & 3) * 8 bits */ \
1672 mov dither_row, dither_row, ror dither_shift; \
1673 \
1674 vdup.u32 dither_offsets_short, dither_row; \
1675 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1676 /* dither_offsets = dither bytes sign-extended to 16 bits, times 16 */ \
1677 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1678 \
1679 vdup.u32 u_block, uv[0]; \
1680 /* add two consecutive 16-byte vectors, starting at u_block_span */ \
1681 vdup.u32 v_block, uv[1]; \
e1f6de8f 1682 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1683 \
1684 vadd.u32 u_block, u_block, block_span; \
e1f6de8f 1685 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1686 \
1687 vadd.u32 v_block, v_block, block_span; \
1688 add block_ptr_b, block_ptr_a, #16; \
1689 /* low halves: accumulator >> 16; high halves: (accumulator + dx4) >> 16 */ \
1690 vshrn.u32 u_whole_low, u_block, #16; \
1691 vshrn.u32 v_whole_low, v_block, #16; \
1692 \
1693 vdup.u32 dx4, uv_dx4[0]; \
1694 \
1695 vaddhn.u32 u_whole_high, u_block, dx4; \
1696 vdup.u32 dx4, uv_dx4[1]; \
1697 \
1698 vaddhn.u32 v_whole_high, v_block, dx4; \
1699 vdup.u32 dx8, uv_dx8[0]; \
1700 /* advance both accumulators by dx8 for the following block */ \
1701 vadd.u32 u_block, u_block, dx8; \
1702 vdup.u32 dx8, uv_dx8[1]; \
1703 \
1704 vadd.u32 v_block, v_block, dx8; \
1705 vmovn.u16 u_whole_8, u_whole; \
1706 /* keep the low byte of every 16-bit lane */ \
1707 vmovn.u16 v_whole_8, v_whole; \
1708 \
e1f6de8f 1709 pld [fb_ptr]; \
75e28f62
E
1710 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1711 /* mask u/v, then apply the optional swizzle */ \
1712 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1713 setup_blocks_texture_##swizzling(); \
1714 /* single-block span: go straight to the last-block code */ \
1715 beq 5f; \
1716 /* 4: store the current block while computing the next one */ \
1717 4: \
1718 vshrn.u32 u_whole_low, u_block, #16; \
1719 \
e1f6de8f 1720 vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32; \
75e28f62
E
1721 vshrn.u32 v_whole_low, v_block, #16; \
1722 /* skip record bytes +16..+31; block_ptr_b now points at +48 */ \
1723 add block_ptr_b, block_ptr_b, #32; \
e1f6de8f 1724 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
75e28f62
E
1725 \
1726 vdup.u32 dx4, uv_dx4[0]; \
1727 vaddhn.u32 u_whole_high, u_block, dx4; \
1728 vdup.u32 dx4, uv_dx4[1]; \
1729 \
1730 vaddhn.u32 v_whole_high, v_block, dx4; \
1731 vdup.u32 dx8, uv_dx8[0]; \
1732 \
1733 vadd.u32 u_block, u_block, dx8; \
1734 vdup.u32 dx8, uv_dx8[1]; \
1735 \
1736 vadd.u32 v_block, v_block, dx8; \
1737 vmovn.u16 u_whole_8, u_whole; \
1738 /* next block starts 16 bytes further along the framebuffer row */ \
1739 add fb_ptr, fb_ptr, #16; \
1740 vmovn.u16 v_whole_8, v_whole; \
1741 \
e1f6de8f 1742 vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
1743 pld [fb_ptr]; \
75e28f62
E
1744 \
1745 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1746 subs span_num_blocks, span_num_blocks, #1; \
1747 \
1748 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1749 setup_blocks_texture_##swizzling(); \
1750 \
1751 bne 4b; \
1752 /* 5: last block of the span, which also carries the right edge mask */ \
1753 5: \
e1f6de8f 1754 ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
75e28f62 1755 \
e1f6de8f 1756 vld1.u32 { test_mask }, [psx_gpu, :128]; \
75e28f62
E
1757 vdup.u8 draw_mask, right_mask; \
1758 /* draw_mask lane = all ones where right_mask and test_mask share a bit */ \
1759 vmov.u32 fb_mask_ptrs[0], right_mask; \
1760 vtst.u16 draw_mask, draw_mask, test_mask; \
1761 vzip.u8 u_whole_8, v_whole_8; \
1762 /* clear the interleaved u/v of the lanes selected by draw_mask */ \
1763 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1764 add block_ptr_b, block_ptr_b, #32; \
e1f6de8f 1765 vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32; \
1766 vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
1767 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
75e28f62
E
1768 /* 1: advance the per-span cursors and store the running block count */ \
1769 1: \
1770 add span_uvrg_offset, span_uvrg_offset, #16; \
1771 add span_edge_data, span_edge_data, #8; \
1772 subs num_spans, num_spans, #1; \
1773 \
e1f6de8f 1774 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
1775 bne 0b; \
1776 \
1777 ldmia sp!, { r4 - r11, pc }; \
1778 /* 2: flush the block buffer, then refill it from its first entry */ \
1779 2: \
1780 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1781 vpush { texture_mask }; \
1782 vpush { uvrg_dx4 }; \
1783 \
4d646738 1784 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62 1785 bl flush_render_block_buffer; \
4d646738 1786 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62
E
1787 \
1788 vpop { uvrg_dx4 }; \
1789 vpop { texture_mask }; \
1790 /* uvrg_dx8 is recomputed as 2 * uvrg_dx4 instead of being saved */ \
1791 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1792 vmov.u8 fb_mask_ptrs, #0; \
1793 /* after the flush only this span counts toward num_blocks */ \
1794 mov num_blocks, span_num_blocks; \
1795 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1796 bal 3b \
1797
1798
1799setup_blocks_unshaded_textured_builder(swizzled)
1800setup_blocks_unshaded_textured_builder(unswizzled)
1801
1802
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 * On entry r0 = psx_gpu.
 *
 * Converts psx_gpu triangle_color to a 15-bit pixel (5 bits per channel) and
 * appends one 64-byte block record per 16 bytes of framebuffer for every
 * span.  Per record this routine writes:
 *   +0   draw mask (all zero, except in the last record of a span where it
 *        is derived from the span right mask)
 *   +16  the pixel value replicated into eight halfwords
 *   +32  b_whole_8 (d14, not written by this routine) and fb_mask_ptrs, of
 *        which only word 1 (fb_ptr) is set here
 * When adding a span would make num_blocks exceed MAX_BLOCKS,
 * flush_render_block_buffer is called first.
 */
1803.align 3
1804
1805function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
e1f6de8f 1806 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
75e28f62
E
1807 veor.u32 draw_mask, draw_mask, draw_mask
1808
// No spans: return before anything is pushed.
1809 cmp num_spans, #0
1810 bxeq lr
1811
1812 stmdb sp!, { r4 - r11, r14 }
e1f6de8f 1813 vld1.u32 { test_mask }, [psx_gpu, :128]
75e28f62 1814
e1f6de8f 1815 ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
75e28f62
E
1816
// Take the top five bits of each of the three low bytes of color.
1817 ubfx color_r, color, #3, #5
1818 ubfx color_g, color, #11, #5
1819 ubfx color_b, color, #19, #5
1820
// color = r | (g << 5) | (b << 10)
1821 orr color, color_r, color_b, lsl #10
1822 orr color, color, color_g, lsl #5
1823
1824 vdup.u16 colors, color
1825
e1f6de8f 1826 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
1827 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1828
// block_ptr_a = blocks + num_blocks * 64
1829 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1830 add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1831
// 0: per-span loop
1832 0:
e1f6de8f 1833 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1834 ldrh y, [span_edge_data, #edge_data_y_offset]
75e28f62 1835
e1f6de8f 1836 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
75e28f62
E
1837
// A span with zero blocks only advances the span cursor (label 1).
1838 cmp span_num_blocks, #0
1839 beq 1f
1840
e1f6de8f 1841 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
75e28f62
E
1842 add num_blocks, span_num_blocks, num_blocks
1843
// Flush first (label 2) when the new total exceeds MAX_BLOCKS.
1844 cmp num_blocks, #MAX_BLOCKS
1845 bgt 2f
1846
// 3: fb_ptr = vram_out_ptr + (y << 11) + (left_x << 1)
1847 3:
1848 add fb_ptr, fb_ptr, y, lsl #11
// NOTE(review): y and c_32 are both r7, so the masked y produced here is
// overwritten by "mov c_32, #32" below without being read.
1849 and y, y, #0x3
1850
1851 add fb_ptr, fb_ptr, left_x, lsl #1
1852 mov c_32, #32
1853
// Flags from this subs are consumed by the beq 5f below; zero means the
// span consists of a single block.
1854 subs span_num_blocks, span_num_blocks, #1
1855
1856 add block_ptr_b, block_ptr_a, #16
e1f6de8f 1857 pld [fb_ptr]
75e28f62
E
1858
1859 vmov.u32 fb_mask_ptrs[1], fb_ptr
1860 beq 5f
1861
// 4: every block except the last one of the span, stored with a zero mask.
1862 4:
e1f6de8f 1863 vst1.u32 { draw_mask }, [block_ptr_a, :128], c_32
1864 vst1.u32 { colors }, [block_ptr_b, :128], c_32
1865 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
75e28f62
E
1866
// Next block starts 16 bytes further along the framebuffer row; block_ptr_b
// is stepped to +16 of the next record.
1867 add fb_ptr, fb_ptr, #16
1868 add block_ptr_b, block_ptr_b, #32
1869
e1f6de8f 1870 pld [fb_ptr]
75e28f62
E
1871
1872 vmov.u32 fb_mask_ptrs[1], fb_ptr
1873 subs span_num_blocks, span_num_blocks, #1
1874
1875 bne 4b
1876
// 5: last block of the span; its mask comes from the span right mask.
1877 5:
e1f6de8f 1878 ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]
75e28f62
E
1879
// Lane = all ones where the replicated right_mask byte and test_mask share
// a set bit.
1880 vdup.u8 draw_mask_edge, right_mask
1881 vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1882
e1f6de8f 1883 vst1.u32 { colors }, [block_ptr_b, :128], c_32
1884 vst1.u32 { draw_mask_edge }, [block_ptr_a, :128], c_32
75e28f62 1885 add block_ptr_b, block_ptr_b, #32
e1f6de8f 1886 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
75e28f62
E
1887
// 1: advance to the next span and store the running block count.
1888 1:
1889 add span_edge_data, span_edge_data, #8
1890 subs num_spans, num_spans, #1
1891
e1f6de8f 1892 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
1893 bne 0b
1894
1895 ldmia sp!, { r4 - r11, pc }
1896
// 2: flush the block buffer.  colors is saved across the call; test_mask
// and draw_mask are rebuilt afterwards, and filling restarts at the first
// entry of the blocks array with only this span counted.
1897 2:
1898 vpush { colors }
1899
4d646738 1900 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 1901 bl flush_render_block_buffer
4d646738 1902 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62
E
1903
1904 vpop { colors }
1905
e1f6de8f 1906 vld1.u32 { test_mask }, [psx_gpu, :128]
75e28f62
E
1907 veor.u32 draw_mask, draw_mask, draw_mask
1908
1909 mov num_blocks, span_num_blocks
1910 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1911 bal 3b
1912
1913
/* Names for the mask-msb handling.  mask_msb_scalar is r14 (the register
 * also named num_blocks); msb_mask (q15) is the pair
 * msb_mask_low/msb_mask_high (d30/d31). */
1914#define mask_msb_scalar r14
1915
1916#define msb_mask q15
1917
1918#define pixels_low d16
1919
1920#define msb_mask_low d30
1921#define msb_mask_high d31
1922
1923
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 * On entry r0 = psx_gpu.
 *
 * Fills every span straight into the framebuffer instead of emitting block
 * records.  The pixel is the 15-bit conversion of psx_gpu triangle_color
 * ORed with psx_gpu mask_msb.  Each block except the last of a span is
 * written as one 16-byte store; the last block writes only the pixels
 * allowed by the low byte of the span right mask.
 */
1924.align 3
1925
1926function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
e1f6de8f 1927 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
75e28f62
E
1928
// No spans: return before anything is pushed.
1929 cmp num_spans, #0
1930 bxeq lr
1931
1932 stmdb sp!, { r4 - r11, r14 }
1933
e1f6de8f 1934 ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
75e28f62
E
1935
// Take the top five bits of each of the three low bytes of color.
1936 ubfx color_r, color, #3, #5
1937 ubfx color_g, color, #11, #5
1938
e1f6de8f 1939 ldrh mask_msb_scalar, [psx_gpu, #psx_gpu_mask_msb_offset]
75e28f62
E
1940 ubfx color_b, color, #19, #5
1941
// color = r | (g << 5) | (b << 10) | mask_msb
1942 orr color, color_r, color_b, lsl #10
1943 orr color, color, color_g, lsl #5
1944 orr color, color, mask_msb_scalar
1945
1946 vdup.u16 colors, color
1947
// After the vdup, duplicate the pixel into both halves of the core register
// so that a 32-bit str writes two pixels.
1948 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
ed0fd81d 1949 orr color, color, color, lsl #16
3867c6ef 1950
75e28f62
E
1951
// 0: per-span loop
1952 0:
e1f6de8f 1953 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1954 ldrh y, [span_edge_data, #edge_data_y_offset]
75e28f62 1955
e1f6de8f 1956 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
75e28f62
E
1957
1958 cmp span_num_blocks, #0
1959 beq 1f
1960
e1f6de8f 1961 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
75e28f62
E
1962
// fb_ptr = vram_out_ptr + (y << 11) + (left_x << 1).  The flags set by the
// subs are still intact at the beq, since the add does not touch them.
1963 add fb_ptr, fb_ptr, y, lsl #11
1964 subs span_num_blocks, span_num_blocks, #1
1965
1966 add fb_ptr, fb_ptr, left_x, lsl #1
1967 beq 3f
1968
// 2: all blocks but the last: one 16-byte store each.
1969 2:
e1f6de8f 1970 vst1.u32 { colors }, [fb_ptr]!
75e28f62
E
1971 subs span_num_blocks, span_num_blocks, #1
1972
1973 bne 2b
1974
// 3: last block.  Only the low byte of the right mask is read here.
1975 3:
e1f6de8f 1976 ldrb right_mask, [span_edge_data, #edge_data_right_mask_offset]
75e28f62 1977
3867c6ef
E
// A zero mask means the whole block is written (label 5).
1978 cmp right_mask, #0x0
1979 beq 5f
1980
// Low four mask bits clear: write four pixels (two words) and drop those
// bits from the mask.
1981 tst right_mask, #0xF
e1f6de8f 1982 streq color, [fb_ptr], #4
3867c6ef 1983 moveq right_mask, right_mask, lsr #4
e1f6de8f 1984 streq color, [fb_ptr], #4
3867c6ef
E
1985
// Next two mask bits clear: write two pixels (one word).
1986 tst right_mask, #0x3
e1f6de8f 1987 streq color, [fb_ptr], #4
3867c6ef
E
1988 moveq right_mask, right_mask, lsr #2
1989
// Next mask bit clear: write a single pixel (one halfword).
1990 tst right_mask, #0x1
e1f6de8f 1991 strheq color, [fb_ptr]
75e28f62
E
1992
// 1: advance to the next span.
1993 1:
1994 add span_edge_data, span_edge_data, #8
1995 subs num_spans, num_spans, #1
75e28f62
E
1996 bne 0b
1997
1998 ldmia sp!, { r4 - r11, pc }
1999
// 5: unmasked last block: one full 16-byte store.
3867c6ef 2000 5:
e1f6de8f 2001 vst1.u32 { colors }, [fb_ptr]
3867c6ef 2002 bal 1b
75e28f62
E
2003
2004
/*
 * Register names for the shaded/untextured routines that follow.  The NEON
 * names used by the textured routines are dropped and reassigned.
 * c_64 is r7 and rg_dx_ptr is r2.
 */
2005#undef c_64
2006
2007#define c_64 r7
2008#define rg_dx_ptr r2
2009
2010
2011#undef r_block
2012#undef g_block
2013#undef b_block
2014#undef r_whole
2015#undef g_whole
2016#undef b_whole
2017#undef r_whole_low
2018#undef r_whole_high
2019#undef g_whole_low
2020#undef g_whole_high
2021#undef b_whole_low
2022#undef b_whole_high
2023#undef r_whole_8
2024#undef g_whole_8
2025#undef b_whole_8
2026#undef dither_offsets
2027#undef rg_dx4
2028#undef rg_dx8
2029#undef dx4
2030#undef dx8
2031#undef v_left_x
2032#undef uvrg
2033#undef block_span
2034#undef rg
2035#undef draw_mask
2036#undef test_mask
2037
/* 32-bit per-lane accumulators. */
2038#define r_block q0
2039#define g_block q1
2040#define b_block q2
2041
/* 16-bit per-lane values; the _low/_high names are the halves of these. */
2042#define r_whole q3
2043#define g_whole q4
2044#define b_whole q5
2045
2046#define r_whole_low d6
2047#define r_whole_high d7
2048#define g_whole_low d8
2049#define g_whole_high d9
2050#define b_whole_low d10
2051#define b_whole_high d11
2052
/* gb_whole_8 (q6) is the pair g_whole_8/b_whole_8 (d12/d13). */
2053#define gb_whole_8 q6
2054
2055#define g_whole_8 d12
2056#define b_whole_8 d13
2057
2058#define r_whole_8 d14
2059
2060#define pixels q8
2061
2062#define rg_dx4 d18
2063#define rg_dx8 d19
2064
2065#define dx4 q10
2066#define dx8 q10
2067
/* The next four names reuse registers named above: v_left_x is d6
 * (r_whole_low), uvrg is q4 (g_whole), block_span is q5 (b_whole) and rg is
 * d9 (g_whole_high). */
2068#define v_left_x d6
2069#define uvrg q4
2070#define block_span q5
2071
2072#define rg d9
2073
/* Constant holders; d64_4 (d24) is the low half of d128_4 (q12). */
2074#define d64_1 d22
2075#define d64_128 d23
2076
2077#define d128_4 q12
2078#define d128_0x7 q13
2079
2080#define d64_4 d24
2081
2082#define dither_offsets q14
2083#define draw_mask q15
2084
2085#define dither_offsets_low d28
2086
/* rg_dx is d0 (the low half of r_block); test_mask is q10 (also dx4/dx8). */
2087#define rg_dx d0
2088#define test_mask q10
2089
2090
/* Dither step "a", dithered variant: unsigned saturating add of the dither
 * offsets to the 8-bit r values and to the 8-bit g/b pair. */
2091#define setup_blocks_shaded_untextured_dither_a_dithered() \
2092 vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low; \
2093 vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets; \
2094
/* Dither step "b", dithered variant: unsigned saturating subtract of
 * d64_4 / d128_4 from the same values. */
2095#define setup_blocks_shaded_untextured_dither_b_dithered() \
2096 vqsub.u8 r_whole_8, r_whole_8, d64_4; \
2097 vqsub.u8 gb_whole_8, gb_whole_8, d128_4 \
2098
/* Undithered variants: both steps expand to nothing. */
2099#define setup_blocks_shaded_untextured_dither_a_undithered() \
2100
2101#define setup_blocks_shaded_untextured_dither_b_undithered() \
2102
2103
2104#define setup_blocks_shaded_untextured_indirect_builder(dithering) \
2105.align 3; \
2106 \
2107function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \
e1f6de8f 2108 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
2109 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2110 \
e1f6de8f 2111 vld1.u32 { rg_dx }, [rg_dx_ptr, :64]; \
75e28f62
E
2112 \
2113 cmp num_spans, #0; \
2114 bxeq lr; \
2115 \
2116 stmdb sp!, { r4 - r11, r14 }; \
2117 vshl.u32 rg_dx4, rg_dx, #2; \
2118 \
e1f6de8f 2119 ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
75e28f62
E
2120 vshl.u32 rg_dx8, rg_dx, #3; \
2121 \
2122 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2123 \
e1f6de8f 2124 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
2125 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2126 \
2127 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2128 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2129 \
2130 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
2131 vmov.u8 d64_1, #1; \
2132 \
2133 vmov.u8 d128_4, #4; \
2134 vmov.u8 d64_128, #128; \
2135 \
2136 vmov.u8 d128_0x7, #0x7; \
2137 \
2138 0: \
e1f6de8f 2139 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
75e28f62
E
2140 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2141 \
e1f6de8f 2142 ldrh y, [span_edge_data, #edge_data_y_offset]; \
2143 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
75e28f62
E
2144 \
2145 cmp span_num_blocks, #0; \
2146 beq 1f; \
2147 \
e1f6de8f 2148 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
75e28f62
E
2149 add num_blocks, span_num_blocks, num_blocks; \
2150 \
2151 cmp num_blocks, #MAX_BLOCKS; \
2152 bgt 2f; \
2153 \
2154 3: \
e1f6de8f 2155 ldr b, [span_b_offset]; \
75e28f62
E
2156 add fb_ptr, fb_ptr, y, lsl #11; \
2157 \
2158 vdup.u32 v_left_x, left_x; \
2159 and y, y, #0x3; \
2160 \
e1f6de8f 2161 ldr dither_row, [dither_offset_ptr, y, lsl #2]; \
75e28f62
E
2162 add fb_ptr, fb_ptr, left_x, lsl #1; \
2163 \
2164 mla b, b_dx, left_x, b; \
2165 and dither_shift, left_x, #0x03; \
2166 \
e1f6de8f 2167 vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
75e28f62
E
2168 vshr.u32 rg_dx, rg_dx4, #2; \
2169 \
2170 mov dither_shift, dither_shift, lsl #3; \
2171 vmla.u32 rg, rg_dx, v_left_x; \
2172 \
2173 mov c_64, #64; \
2174 subs span_num_blocks, span_num_blocks, #1; \
2175 \
2176 mov dither_row, dither_row, ror dither_shift; \
2177 mov b_dx4, b_dx, lsl #2; \
2178 \
2179 vdup.u32 dither_offsets, dither_row; \
2180 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2181 \
2182 vdup.u32 b_block, b; \
2183 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2184 \
2185 mov b_dx8, b_dx, lsl #3; \
2186 vdup.u32 r_block, rg[0]; \
2187 vdup.u32 g_block, rg[1]; \
2188 \
e1f6de8f 2189 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
2190 \
2191 vadd.u32 r_block, r_block, block_span; \
e1f6de8f 2192 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
2193 \
2194 vadd.u32 g_block, g_block, block_span; \
e1f6de8f 2195 vld1.u32 { block_span }, [block_span_ptr, :128]; \
75e28f62
E
2196 \
2197 vadd.u32 b_block, b_block, block_span; \
2198 add block_ptr_b, block_ptr_a, #16; \
2199 \
2200 vshrn.u32 r_whole_low, r_block, #16; \
2201 vshrn.u32 g_whole_low, g_block, #16; \
2202 vshrn.u32 b_whole_low, b_block, #16; \
2203 vdup.u32 dx4, rg_dx4[0]; \
2204 \
2205 vaddhn.u32 r_whole_high, r_block, dx4; \
2206 vdup.u32 dx4, rg_dx4[1]; \
2207 \
2208 vaddhn.u32 g_whole_high, g_block, dx4; \
2209 vdup.u32 dx4, b_dx4; \
2210 \
2211 vaddhn.u32 b_whole_high, b_block, dx4; \
2212 vdup.u32 dx8, rg_dx8[0]; \
2213 \
2214 vadd.u32 r_block, r_block, dx8; \
2215 vdup.u32 dx8, rg_dx8[1]; \
2216 \
2217 vadd.u32 g_block, g_block, dx8; \
2218 vdup.u32 dx8, b_dx8; \
2219 \
2220 vadd.u32 b_block, b_block, dx8; \
2221 \
2222 vmovn.u16 r_whole_8, r_whole; \
2223 vmovn.u16 g_whole_8, g_whole; \
2224 vmovn.u16 b_whole_8, b_whole; \
2225 \
2226 beq 5f; \
2227 veor.u32 draw_mask, draw_mask, draw_mask; \
2228 \
2229 4: \
2230 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2231 vshrn.u32 r_whole_low, r_block, #16; \
2232 \
2233 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2234 vshrn.u32 g_whole_low, g_block, #16; \
2235 \
2236 vshrn.u32 b_whole_low, b_block, #16; \
e1f6de8f 2237 str fb_ptr, [block_ptr_a, #44]; \
75e28f62
E
2238 \
2239 vdup.u32 dx4, rg_dx4[0]; \
2240 vshr.u8 r_whole_8, r_whole_8, #3; \
2241 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2242 \
2243 vaddhn.u32 r_whole_high, r_block, dx4; \
2244 vdup.u32 dx4, rg_dx4[1]; \
2245 \
2246 vaddhn.u32 g_whole_high, g_block, dx4; \
2247 vdup.u32 dx4, b_dx4; \
2248 \
2249 vaddhn.u32 b_whole_high, b_block, dx4; \
2250 vdup.u32 dx8, rg_dx8[0]; \
2251 \
2252 vmull.u8 pixels, r_whole_8, d64_1; \
2253 vmlal.u8 pixels, g_whole_8, d64_4; \
2254 vmlal.u8 pixels, b_whole_8, d64_128; \
2255 \
2256 vadd.u32 r_block, r_block, dx8; \
2257 vdup.u32 dx8, rg_dx8[1]; \
2258 \
2259 vadd.u32 g_block, g_block, dx8; \
2260 vdup.u32 dx8, b_dx8; \
2261 \
2262 vadd.u32 b_block, b_block, dx8; \
2263 add fb_ptr, fb_ptr, #16; \
2264 \
2265 vmovn.u16 r_whole_8, r_whole; \
2266 vmovn.u16 g_whole_8, g_whole; \
2267 vmovn.u16 b_whole_8, b_whole; \
2268 \
e1f6de8f 2269 vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64; \
2270 vst1.u32 { pixels }, [block_ptr_b, :128], c_64; \
75e28f62 2271 \
e1f6de8f 2272 pld [fb_ptr]; \
75e28f62
E
2273 \
2274 subs span_num_blocks, span_num_blocks, #1; \
2275 bne 4b; \
2276 \
2277 5: \
e1f6de8f 2278 str fb_ptr, [block_ptr_a, #44]; \
75e28f62
E
2279 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2280 \
e1f6de8f 2281 ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
75e28f62
E
2282 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2283 \
2284 vshr.u8 r_whole_8, r_whole_8, #3; \
2285 vdup.u8 draw_mask, right_mask; \
2286 \
2287 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
e1f6de8f 2288 vld1.u32 { test_mask }, [psx_gpu, :128]; \
75e28f62
E
2289 \
2290 vtst.u16 draw_mask, draw_mask, test_mask; \
2291 \
2292 vmull.u8 pixels, r_whole_8, d64_1; \
2293 vmlal.u8 pixels, g_whole_8, d64_4; \
2294 vmlal.u8 pixels, b_whole_8, d64_128; \
2295 \
e1f6de8f 2296 vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64; \
2297 vst1.u32 { pixels }, [block_ptr_b, :128], c_64; \
75e28f62
E
2298 \
2299 1: \
2300 add span_uvrg_offset, span_uvrg_offset, #16; \
2301 add span_b_offset, span_b_offset, #4; \
2302 \
2303 add span_edge_data, span_edge_data, #8; \
2304 subs num_spans, num_spans, #1; \
2305 \
e1f6de8f 2306 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
2307 bne 0b; \
2308 \
2309 ldmia sp!, { r4 - r11, pc }; \
2310 \
2311 2: \
2312 /* TODO: Load from psx_gpu instead of saving/restoring these */\
2313 vpush { rg_dx4 }; \
2314 \
4d646738 2315 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62 2316 bl flush_render_block_buffer; \
4d646738 2317 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62
E
2318 \
2319 vpop { rg_dx4 }; \
2320 \
2321 vmov.u8 d64_1, #1; \
2322 vmov.u8 d128_4, #4; \
2323 vmov.u8 d64_128, #128; \
2324 vmov.u8 d128_0x7, #0x7; \
2325 \
2326 vadd.u32 rg_dx8, rg_dx4, rg_dx4; \
2327 \
2328 mov num_blocks, span_num_blocks; \
2329 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2330 bal 3b \
2331
2332
/* Expand the indirect span renderer once for each dithering variant. */
2333setup_blocks_shaded_untextured_indirect_builder(undithered)
2334setup_blocks_shaded_untextured_indirect_builder(dithered)
2335
2336
/*
 * Register aliases used by the direct (framebuffer-writing) builder below.
 * pixels_low / pixels_high name the two 64-bit halves that the partial-block
 * stores address individually.
 */
2337#undef draw_mask
2338
2339#define mask_msb_ptr r14
2340
2341#define draw_mask q0
2342#define pixels_low d16
3867c6ef 2343#define pixels_high d17
75e28f62
E
2344
2345
2346
/*
 * setup_blocks_shaded_untextured_direct_builder(dithering)
 *
 * Expands to setup_blocks_shaded_untextured_<dithering>_unswizzled_direct.
 * The generated function walks the span list stored in psx_gpu and, for each
 * span, interpolates r/g/b across the span and stores the packed 16-bit
 * pixels straight to the framebuffer, eight pixels per block.
 *
 * dithering: "undithered" or "dithered"; selects which
 *            setup_blocks_shaded_untextured_dither_{a,b}_* helper macros
 *            (defined elsewhere in this file) are spliced into the loop.
 *
 * In:   psx_gpu (r0 per the alias in the file head)
 * Out:  nothing; returns immediately when num_spans is zero.
 * Saves r4-r11 and lr on the stack for the duration of the call.
 *
 * Pixel packing, as performed by the vshr/vbic/vmlal sequence:
 *   pixel = msb_mask + (r >> 3) + (g & ~7) * 4 + (b & ~7) * 128
 */
2347#define setup_blocks_shaded_untextured_direct_builder(dithering) \
2348.align 3; \
2349 \
2350function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \
e1f6de8f 2351 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
2352 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2353 \
e1f6de8f 2354 vld1.u32 { rg_dx }, [rg_dx_ptr, :64]; \
75e28f62
E
2355 \
  /* Empty span list: return before anything is pushed. */ \
2356 cmp num_spans, #0; \
2357 bxeq lr; \
2358 \
2359 stmdb sp!, { r4 - r11, r14 }; \
  /* Pre-scaled r/g steps: x4 for the upper half of a block, x8 per block. */ \
2360 vshl.u32 rg_dx4, rg_dx, #2; \
2361 \
e1f6de8f 2362 ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
75e28f62
E
2363 vshl.u32 rg_dx8, rg_dx, #3; \
2364 \
  /* Cursors into the three parallel per-span arrays. */ \
2365 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2366 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2367 \
2368 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
  /* Byte constants used as multipliers/masks when packing pixels. */ \
  /* NOTE(review): d64_4 is read below but only d128_4 is set here; */ \
  /* presumably d64_4 is one half of d128_4 - confirm against the aliases. */ \
2369 vmov.u8 d64_1, #1; \
2370 \
2371 vmov.u8 d128_4, #4; \
2372 vmov.u8 d64_128, #128; \
2373 \
2374 vmov.u8 d128_0x7, #0x7; \
  /* Broadcast the 16-bit mask_msb value into every lane of msb_mask. */ \
2375 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
e1f6de8f 2376 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
75e28f62
E
2377 \
  /* ---- per-span loop ---- */ \
2378 0: \
e1f6de8f 2379 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
75e28f62
E
2380 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2381 \
e1f6de8f 2382 ldrh y, [span_edge_data, #edge_data_y_offset]; \
2383 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
75e28f62
E
2384 \
  /* Spans with zero blocks are skipped entirely. */ \
2385 cmp span_num_blocks, #0; \
2386 beq 1f; \
2387 \
e1f6de8f 2388 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
75e28f62
E
  /* fb_ptr = vram_out_ptr + y * 2048 (+ left_x * 2, added further down). */ \
2389 add fb_ptr, fb_ptr, y, lsl #11; \
2390 \
e1f6de8f 2391 ldr b, [span_b_offset]; \
75e28f62
E
2392 vdup.u32 v_left_x, left_x; \
  /* The dither table row is chosen by the low two bits of y. */ \
2393 and y, y, #0x3; \
2394 \
e1f6de8f 2395 ldr dither_row, [dither_offset_ptr, y, lsl #2]; \
75e28f62
E
2396 add fb_ptr, fb_ptr, left_x, lsl #1; \
2397 \
  /* b = span_b + b_dx * left_x */ \
2398 mla b, b_dx, left_x, b; \
2399 and dither_shift, left_x, #0x03; \
2400 \
e1f6de8f 2401 vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
75e28f62
E
  /* rg_dx is recomputed from rg_dx4 on every span. */ \
  /* NOTE(review): presumably its register is reused as scratch in the loop - confirm. */ \
2402 vshr.u32 rg_dx, rg_dx4, #2; \
2403 \
2404 mov dither_shift, dither_shift, lsl #3; \
  /* rg += rg_dx * left_x */ \
2405 vmla.u32 rg, rg_dx, v_left_x; \
2406 \
  /* Reserve the last block for the partial-store tail. The Z flag set here */ \
  /* is consumed by "beq 3f" below; nothing in between writes the flags. */ \
2407 subs span_num_blocks, span_num_blocks, #1; \
2408 \
  /* Rotate the dither row by (left_x & 3) bytes. */ \
2409 mov dither_row, dither_row, ror dither_shift; \
2410 mov b_dx4, b_dx, lsl #2; \
2411 \
2412 vdup.u32 dither_offsets, dither_row; \
2413 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2414 \
2415 vdup.u32 b_block, b; \
2416 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2417 \
2418 mov b_dx8, b_dx, lsl #3; \
2419 vdup.u32 r_block, rg[0]; \
2420 vdup.u32 g_block, rg[1]; \
2421 \
  /* Add the three consecutive 128-bit block_span vectors (r, g, b in that */ \
  /* order, starting at psx_gpu_r_block_span_offset) to the broadcast bases. */ \
e1f6de8f 2422 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
2423 \
2424 vadd.u32 r_block, r_block, block_span; \
e1f6de8f 2425 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
2426 \
2427 vadd.u32 g_block, g_block, block_span; \
e1f6de8f 2428 vld1.u32 { block_span }, [block_span_ptr, :128]; \
75e28f62
E
2429 \
2430 vadd.u32 b_block, b_block, block_span; \
  /* NOTE(review): block_ptr_a is never set in this macro and block_ptr_b is */ \
  /* not read again here - looks like a leftover from the indirect variant; confirm. */ \
2431 add block_ptr_b, block_ptr_a, #16; \
2432 \
  /* *_whole_low  = high 16 bits of each 32-bit lane of *_block. */ \
  /* *_whole_high = high 16 bits of (*_block + 4 steps). */ \
2433 vshrn.u32 r_whole_low, r_block, #16; \
2434 vshrn.u32 g_whole_low, g_block, #16; \
2435 vshrn.u32 b_whole_low, b_block, #16; \
2436 vdup.u32 dx4, rg_dx4[0]; \
2437 \
2438 vaddhn.u32 r_whole_high, r_block, dx4; \
2439 vdup.u32 dx4, rg_dx4[1]; \
2440 \
2441 vaddhn.u32 g_whole_high, g_block, dx4; \
2442 vdup.u32 dx4, b_dx4; \
2443 \
2444 vaddhn.u32 b_whole_high, b_block, dx4; \
2445 vdup.u32 dx8, rg_dx8[0]; \
2446 \
  /* Step the accumulators to the next block (8 steps). */ \
2447 vadd.u32 r_block, r_block, dx8; \
2448 vdup.u32 dx8, rg_dx8[1]; \
2449 \
2450 vadd.u32 g_block, g_block, dx8; \
2451 vdup.u32 dx8, b_dx8; \
2452 \
2453 vadd.u32 b_block, b_block, dx8; \
2454 \
2455 vmovn.u16 r_whole_8, r_whole; \
2456 vmovn.u16 g_whole_8, g_whole; \
2457 vmovn.u16 b_whole_8, b_whole; \
2458 \
  /* Only one block in this span: go straight to the tail. */ \
2459 beq 3f; \
2460 \
  /* ---- full-block loop: packs the current block while computing the */ \
  /* next block's channel values, then stores 8 pixels ---- */ \
2461 2: \
2462 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2463 vshrn.u32 r_whole_low, r_block, #16; \
2464 \
2465 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2466 vshrn.u32 g_whole_low, g_block, #16; \
2467 \
2468 vshrn.u32 b_whole_low, b_block, #16; \
2469 \
2470 vdup.u32 dx4, rg_dx4[0]; \
  /* Reduce each channel to its top five bits. */ \
2471 vshr.u8 r_whole_8, r_whole_8, #3; \
2472 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2473 \
2474 vaddhn.u32 r_whole_high, r_block, dx4; \
2475 vdup.u32 dx4, rg_dx4[1]; \
2476 \
  /* Start from msb_mask, then accumulate r*1 + g*4 + b*128. */ \
2477 vmov pixels, msb_mask; \
2478 vaddhn.u32 g_whole_high, g_block, dx4; \
2479 vdup.u32 dx4, b_dx4; \
2480 \
2481 vaddhn.u32 b_whole_high, b_block, dx4; \
2482 vdup.u32 dx8, rg_dx8[0]; \
2483 \
2484 vmlal.u8 pixels, r_whole_8, d64_1; \
2485 vmlal.u8 pixels, g_whole_8, d64_4; \
2486 vmlal.u8 pixels, b_whole_8, d64_128; \
2487 \
2488 vadd.u32 r_block, r_block, dx8; \
2489 vdup.u32 dx8, rg_dx8[1]; \
2490 \
2491 vadd.u32 g_block, g_block, dx8; \
2492 vdup.u32 dx8, b_dx8; \
2493 \
2494 vadd.u32 b_block, b_block, dx8; \
2495 \
2496 vmovn.u16 r_whole_8, r_whole; \
2497 vmovn.u16 g_whole_8, g_whole; \
2498 vmovn.u16 b_whole_8, b_whole; \
2499 \
  /* Store 8 pixels and advance fb_ptr past them. */ \
e1f6de8f 2500 vst1.u32 { pixels }, [fb_ptr]!; \
75e28f62
E
2501 subs span_num_blocks, span_num_blocks, #1; \
2502 bne 2b; \
2503 \
  /* ---- tail: last block of the span, stored partially ---- */ \
2504 3: \
2505 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2506 \
e1f6de8f 2507 ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
75e28f62
E
2508 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2509 \
2510 vshr.u8 r_whole_8, r_whole_8, #3; \
  /* rbit followed by clz yields the number of trailing zero bits in */ \
  /* right_mask; that count indexes the 8-entry jump table below. */ \
3867c6ef 2511 rbit right_mask, right_mask; \
75e28f62
E
2512 vmov pixels, msb_mask; \
2513 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
3867c6ef 2514 clz right_mask, right_mask; \
75e28f62
E
2515 \
2516 vmlal.u8 pixels, r_whole_8, d64_1; \
2517 vmlal.u8 pixels, g_whole_8, d64_4; \
2518 vmlal.u8 pixels, b_whole_8, d64_128; \
2519 \
  /* Dispatch on the count: entry k (0..7) stores k + 1 pixels. */ \
  /* JT_OP_REL / JT_OP / JTE are defined elsewhere; presumably they select */ \
  /* between a relative and an absolute table encoding - confirm. */ \
8184d7c5 2520 JT_OP_REL(100f, right_mask, temp); \
e1f6de8f 2521 JT_OP(ldr pc, [pc, right_mask, lsl #2]); \
3867c6ef 2522 nop; \
8184d7c5 2523 100: \
3867c6ef 2524 nop; \
8184d7c5 2525 .word JTE(100b, 4f); \
2526 .word JTE(100b, 5f); \
2527 .word JTE(100b, 6f); \
2528 .word JTE(100b, 7f); \
2529 .word JTE(100b, 8f); \
2530 .word JTE(100b, 9f); \
2531 .word JTE(100b, 10f); \
2532 .word JTE(100b, 11f); \
3867c6ef 2533 \
  /* 1 pixel */ \
75e28f62 2534 4: \
e1f6de8f 2535 vst1.u16 { pixels_low[0] }, [fb_ptr]; \
3867c6ef
E
2536 bal 1f; \
2537 \
  /* 2 pixels */ \
2538 5: \
e1f6de8f 2539 vst1.u32 { pixels_low[0] }, [fb_ptr]; \
3867c6ef
E
2540 bal 1f; \
2541 \
  /* 3 pixels */ \
2542 6: \
e1f6de8f 2543 vst1.u32 { pixels_low[0] }, [fb_ptr]!; \
2544 vst1.u16 { pixels_low[2] }, [fb_ptr]; \
3867c6ef
E
2545 bal 1f; \
2546 \
  /* 4 pixels */ \
2547 7: \
e1f6de8f 2548 vst1.u32 { pixels_low }, [fb_ptr]; \
3867c6ef
E
2549 bal 1f; \
2550 \
  /* 5 pixels */ \
2551 8: \
e1f6de8f 2552 vst1.u32 { pixels_low }, [fb_ptr]!; \
2553 vst1.u16 { pixels_high[0] }, [fb_ptr]; \
3867c6ef
E
2554 bal 1f; \
2555 \
  /* 6 pixels */ \
2556 9: \
e1f6de8f 2557 vst1.u32 { pixels_low }, [fb_ptr]!; \
2558 vst1.u32 { pixels_high[0] }, [fb_ptr]!; \
3867c6ef
E
2559 bal 1f; \
2560 \
  /* 7 pixels */ \
2561 10: \
e1f6de8f 2562 vst1.u32 { pixels_low }, [fb_ptr]!; \
2563 vst1.u32 { pixels_high[0] }, [fb_ptr]!; \
2564 vst1.u16 { pixels_high[2] }, [fb_ptr]; \
3867c6ef
E
2565 bal 1f; \
2566 \
  /* 8 pixels (whole block) */ \
2567 11: \
e1f6de8f 2568 vst1.u32 { pixels }, [fb_ptr]; \
3867c6ef 2569 bal 1f; \
75e28f62
E
2570 \
  /* ---- advance to the next span ---- */ \
2571 1: \
2572 add span_uvrg_offset, span_uvrg_offset, #16; \
2573 add span_b_offset, span_b_offset, #4; \
2574 \
2575 add span_edge_data, span_edge_data, #8; \
2576 subs num_spans, num_spans, #1; \
2577 \
2578 bne 0b; \
2579 \
2580 ldmia sp!, { r4 - r11, pc } \
2581
/* Expand the direct-to-framebuffer span renderer for both dithering variants. */
2582setup_blocks_shaded_untextured_direct_builder(undithered)
2583setup_blocks_shaded_untextured_direct_builder(dithered)
2584
2585
/*
 * Register aliases for texture_blocks_4bpp / texture_blocks_8bpp.
 *
 * Several names deliberately share a register:
 *   - uv_N and pixel_N use the same register, so each texel load overwrites
 *     the address it was fetched from;
 *   - uv_01/uv_23/uv_45/uv_67 share registers with uv_1/uv_3/uv_5/uv_7;
 *   - c_64 is r0, the same register as psx_gpu;
 *   - current_texture_mask / dirty_textures_mask share r5 / r6 with the
 *     uv/pixel aliases.
 */
2586#undef psx_gpu
2587#undef num_blocks
2588#undef triangle
2589#undef c_64
2590
2591#define psx_gpu r0
2592#define block_ptr r1
2593#define num_blocks r2
2594#define uv_01 r3
2595#define uv_23 r4
2596#define uv_45 r5
2597#define uv_67 r6
2598#define uv_0 r7
2599#define uv_1 r3
2600#define uv_2 r8
2601#define uv_3 r4
2602#define uv_4 r9
2603#define uv_5 r5
2604#define uv_6 r10
2605#define uv_7 r6
2606#define texture_ptr r11
2607
2608#define pixel_0 r7
2609#define pixel_1 r3
2610#define pixel_2 r8
2611#define pixel_3 r4
2612#define pixel_4 r9
2613#define pixel_5 r5
2614#define pixel_6 r10
2615#define pixel_7 r6
2616
/* Packed pixel words. Note the register numbers: a=r7, c=r8, b=r9, d=r10. */
2617#define pixels_a r7
2618#define pixels_b r9
2619#define pixels_c r8
2620#define pixels_d r10
2621
2622#define c_64 r0
2623
2624#define clut_ptr r12
2625#define current_texture_mask r5
2626#define dirty_textures_mask r6
2627
2628#define texels d0
2629
/* clut_a = q1 = { d2, d3 }, clut_b = q2 = { d4, d5 }. */
2630#define clut_low_a d2
2631#define clut_low_b d3
2632#define clut_high_a d4
2633#define clut_high_b d5
2634
2635#define clut_a q1
2636#define clut_b q2
2637
2638#define texels_low d6
2639#define texels_high d7
2640
2641.align 3
2642
/* texture_blocks_untextured: no-op; returns to the caller immediately. */
2643function(texture_blocks_untextured)
2644 bx lr
2645
2646
2647.align 3
2648
/*
 * texture_blocks_4bpp
 *
 * In: r0 = psx_gpu.
 *
 * For each of psx_gpu->num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) this reads eight 16-bit offsets from the
 * start of the block, fetches one byte per offset relative to texture_ptr,
 * translates the eight bytes through the 16-entry CLUT with vtbl, and writes
 * the resulting eight 16-bit texels back over the first 16 bytes of the block.
 *
 * If dirty_textures_4bpp_mask & current_texture_mask is non-zero,
 * update_texture_4bpp_cache is called first (label 1).
 *
 * Saves/restores r3-r11 and lr. The loop assumes num_blocks is non-zero on
 * entry (the count is decremented before it is tested).
 */
2649function(texture_blocks_4bpp)
2650 stmdb sp!, { r3 - r11, r14 }
2651 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2652
e1f6de8f 2653 ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2654 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62 2655
 /* Load 32 bytes of CLUT (sixteen 16-bit entries) into clut_a:clut_b. */
e1f6de8f 2656 ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2657 vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]
75e28f62 2658
e1f6de8f 2659 ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
75e28f62
E
 /* De-interleave: clut_a receives the even bytes (the low byte of each */
 /* entry), clut_b the odd bytes (the high byte of each entry). */
2660 vuzp.u8 clut_a, clut_b
2661
e1f6de8f 2662 ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
75e28f62
E
2663 tst dirty_textures_mask, current_texture_mask
2664
 /* Dirty: refresh the cache first. Note the branch is taken before c_64 */
 /* is written, so r0 still holds psx_gpu at label 1. */
2665 bne 1f
 /* c_64 is r0, so psx_gpu is no longer available past this point. */
2666 mov c_64, #64
2667
26680:
 /* Four words = eight packed 16-bit texel offsets. */
2669 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2670
 /* uxtah adds the low halfword (or, with ror #16, the high halfword) of */
 /* the packed pair to texture_ptr, giving one byte address per texel. */
2671 uxtah uv_0, texture_ptr, uv_01
2672 uxtah uv_1, texture_ptr, uv_01, ror #16
2673
2674 uxtah uv_2, texture_ptr, uv_23
2675 uxtah uv_3, texture_ptr, uv_23, ror #16
2676
2677 uxtah uv_4, texture_ptr, uv_45
e1f6de8f 2678 ldrb pixel_0, [uv_0]
75e28f62
E
2679
2680 uxtah uv_5, texture_ptr, uv_45, ror #16
e1f6de8f 2681 ldrb pixel_1, [uv_1]
75e28f62
E
2682
2683 uxtah uv_6, texture_ptr, uv_67
e1f6de8f 2684 ldrb pixel_2, [uv_2]
75e28f62
E
2685
2686 uxtah uv_7, texture_ptr, uv_67, ror #16
e1f6de8f 2687 ldrb pixel_3, [uv_3]
75e28f62 2688
e1f6de8f 2689 ldrb pixel_4, [uv_4]
75e28f62
E
 /* Loop counter; the flags survive until "bne 0b" because nothing below */
 /* uses a flag-setting form. */
2690 subs num_blocks, num_blocks, #1
2691
 /* Pack texels 0-3 into pixels_a and 4-7 into pixels_b, one byte each. */
e1f6de8f 2692 ldrb pixel_5, [uv_5]
75e28f62
E
2693 orr pixels_a, pixel_0, pixel_1, lsl #8
2694
e1f6de8f 2695 ldrb pixel_6, [uv_6]
75e28f62
E
2696 orr pixels_b, pixel_4, pixel_5, lsl #8
2697
e1f6de8f 2698 ldrb pixel_7, [uv_7]
75e28f62
E
2699 orr pixels_a, pixels_a, pixel_2, lsl #16
2700
2701 orr pixels_b, pixels_b, pixel_6, lsl #16
2702 orr pixels_a, pixels_a, pixel_3, lsl #24
2703
2704 orr pixels_b, pixels_b, pixel_7, lsl #24
ed0fd81d 2705 vmov texels, pixels_a, pixels_b
75e28f62
E
2706
 /* Table lookup: each texel byte indexes the 16-byte low/high CLUT halves. */
2707 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
2708 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
2709
 /* vst2 re-interleaves low/high bytes into eight 16-bit texels, then */
 /* advances block_ptr by 64 to the next block. */
e1f6de8f 2710 vst2.u8 { texels_low, texels_high }, [block_ptr, :128], c_64
75e28f62
E
2711 bne 0b
2712
2713 ldmia sp!, { r3 - r11, pc }
2714
27151:
 /* Slow path: preserve block_ptr / num_blocks across the call. */
 /* NOTE(review): the loop relies on clut_a/clut_b (q1/q2) still holding */
 /* the CLUT after this call - confirm update_texture_4bpp_cache leaves */
 /* them intact. */
2716 stmdb sp!, { r1 - r2 }
2717 bl update_texture_4bpp_cache
2718
2719 mov c_64, #64
2720 ldmia sp!, { r1 - r2 }
2721 bal 0b
2722
2723
2724.align 3
2725
/*
 * texture_blocks_8bpp
 *
 * In: r0 = psx_gpu.
 *
 * For each of psx_gpu->num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) this reads eight 16-bit offsets from the
 * start of the block, fetches one byte per offset relative to texture_ptr,
 * uses that byte (doubled) as a byte offset into the CLUT to load a 16-bit
 * texel, and writes the eight texels back over the first 16 bytes of the
 * block.
 *
 * If dirty_textures_8bpp_mask & current_texture_mask is non-zero,
 * update_texture_8bpp_cache is called first (label 1).
 *
 * Saves/restores r3-r11 and lr. The loop assumes num_blocks is non-zero on
 * entry (the count is decremented before it is tested).
 */
2726function(texture_blocks_8bpp)
2727 stmdb sp!, { r3 - r11, r14 }
2728 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2729
e1f6de8f 2730 ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2731 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62 2732
e1f6de8f 2733 ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2734 ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
75e28f62 2735
e1f6de8f 2736 ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset]
75e28f62
E
2737 tst dirty_textures_mask, current_texture_mask
2738
 /* Dirty: refresh the cache first (r0 still holds psx_gpu). */
2739 bne 1f
 /* NOTE(review): presumably padding so the loop head lands on an aligned */
 /* address - confirm. */
2740 nop
2741
27420:
 /* Four words = eight packed 16-bit texel offsets. */
2743 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2744
 /* uxtah adds the low halfword (or, with ror #16, the high halfword) of */
 /* the packed pair to texture_ptr, giving one byte address per texel. */
2745 uxtah uv_0, texture_ptr, uv_01
2746 uxtah uv_1, texture_ptr, uv_01, ror #16
2747
2748 uxtah uv_2, texture_ptr, uv_23
2749 uxtah uv_3, texture_ptr, uv_23, ror #16
2750
2751 uxtah uv_4, texture_ptr, uv_45
e1f6de8f 2752 ldrb pixel_0, [uv_0]
75e28f62
E
2753
2754 uxtah uv_5, texture_ptr, uv_45, ror #16
e1f6de8f 2755 ldrb pixel_1, [uv_1]
75e28f62
E
2756
2757 uxtah uv_6, texture_ptr, uv_67
e1f6de8f 2758 ldrb pixel_2, [uv_2]
75e28f62
E
2759
2760 uxtah uv_7, texture_ptr, uv_67, ror #16
e1f6de8f 2761 ldrb pixel_3, [uv_3]
75e28f62 2762
 /* Double each 8-bit index to turn it into a byte offset into the table */
 /* of 16-bit CLUT entries. */
e1f6de8f 2763 ldrb pixel_4, [uv_4]
75e28f62
E
2764 add pixel_0, pixel_0, pixel_0
2765
e1f6de8f 2766 ldrb pixel_5, [uv_5]
75e28f62
E
2767 add pixel_1, pixel_1, pixel_1
2768
e1f6de8f 2769 ldrb pixel_6, [uv_6]
75e28f62
E
2770 add pixel_2, pixel_2, pixel_2
2771
e1f6de8f 2772 ldrb pixel_7, [uv_7]
75e28f62
E
2773 add pixel_3, pixel_3, pixel_3
2774
 /* CLUT lookups, interleaved with the remaining doublings and packing. */
e1f6de8f 2775 ldrh pixel_0, [clut_ptr, pixel_0]
75e28f62
E
2776 add pixel_4, pixel_4, pixel_4
2777
e1f6de8f 2778 ldrh pixel_1, [clut_ptr, pixel_1]
75e28f62
E
2779 add pixel_5, pixel_5, pixel_5
2780
e1f6de8f 2781 ldrh pixel_2, [clut_ptr, pixel_2]
75e28f62
E
2782 add pixel_6, pixel_6, pixel_6
2783
e1f6de8f 2784 ldrh pixel_3, [clut_ptr, pixel_3]
75e28f62
E
2785 add pixel_7, pixel_7, pixel_7
2786
e1f6de8f 2787 ldrh pixel_4, [clut_ptr, pixel_4]
75e28f62
E
2788 orr pixels_a, pixel_0, pixel_1, lsl #16
2789
e1f6de8f 2790 ldrh pixel_5, [clut_ptr, pixel_5]
75e28f62
E
2791 orr pixels_c, pixel_2, pixel_3, lsl #16
2792
e1f6de8f 2793 ldrh pixel_6, [clut_ptr, pixel_6]
75e28f62
E
 /* Loop counter; the flags survive until "bne 0b" because nothing below */
 /* uses a flag-setting form. */
2794 subs num_blocks, num_blocks, #1
2795
e1f6de8f 2796 ldrh pixel_7, [clut_ptr, pixel_7]
75e28f62
E
2797 orr pixels_b, pixel_4, pixel_5, lsl #16
2798
2799 orr pixels_d, pixel_6, pixel_7, lsl #16
 /* The list is written a, c, b, d because those aliases are r7, r8, r9, */
 /* r10: in ascending register order that is texels 0-1, 2-3, 4-5, 6-7. */
2800 stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d }
2801
2802 add block_ptr, block_ptr, #64
2803 bne 0b
2804
2805 ldmia sp!, { r3 - r11, pc }
2806
28071:
 /* Slow path: preserve block_ptr, num_blocks and clut_ptr (r12) across */
 /* the call. EXTRA_UNSAVED_REGS is defined elsewhere in the file. */
4d646738 2808 stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
75e28f62
E
2809
2810 bl update_texture_8bpp_cache
2811
4d646738 2812 ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
75e28f62
E
2813 bal 0b
2814
2815
/*
 * Register aliases for texture_blocks_16bpp.
 *
 * Registers are reused in a rolling fashion: uv_N, u_N and pixel_N share a
 * register, and v_N uses the register that uv_(N+2) is loaded into later.
 * v_7 is r0, the same register as psx_gpu.
 */
2816#undef uv_0
2817#undef uv_1
2818#undef uv_2
2819#undef uv_3
2820#undef uv_4
2821#undef uv_5
2822#undef uv_6
2823#undef uv_7
2824
2825#undef pixel_0
2826#undef pixel_1
2827#undef pixel_2
2828#undef pixel_3
2829#undef pixel_4
2830#undef pixel_5
2831#undef pixel_6
2832#undef pixel_7
2833
2834#undef texture_ptr
2835
2836#undef pixels_a
2837#undef pixels_b
2838#undef pixels_c
2839#undef pixels_d
2840
2841#define psx_gpu r0
2842#define block_ptr r1
2843#define num_blocks r2
2844
2845#define uv_0 r3
2846#define uv_1 r4
2847#define u_0 r3
2848#define u_1 r4
2849#define v_0 r5
2850#define v_1 r6
2851
2852#define uv_2 r5
2853#define uv_3 r6
2854#define u_2 r5
2855#define u_3 r6
2856#define v_2 r7
2857#define v_3 r8
2858
2859#define uv_4 r7
2860#define uv_5 r8
2861#define u_4 r7
2862#define u_5 r8
2863#define v_4 r9
2864#define v_5 r10
2865
2866#define uv_6 r9
2867#define uv_7 r10
2868#define u_6 r9
2869#define u_7 r10
2870#define v_6 r11
2871#define v_7 r0
2872
2873#define pixel_0 r3
2874#define pixel_1 r4
2875#define pixel_2 r5
2876#define pixel_3 r6
2877#define pixel_4 r7
2878#define pixel_5 r8
2879#define pixel_6 r9
2880#define pixel_7 r10
2881
/* Packed pixel words, in ascending register order a, b, c, d. */
2882#define pixels_a r3
2883#define pixels_b r5
2884#define pixels_c r7
2885#define pixels_d r9
2886
2887#define texture_ptr r12
2888
2889
2890.align 3
2891
/*
 * texture_blocks_16bpp
 *
 * In: r0 = psx_gpu.
 *
 * For each of psx_gpu->num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) this reads eight packed 16-bit uv values
 * from the start of the block, converts each to a byte offset
 *     offset = 2 * ((uv & 0xFF) + ((uv & 0xFF00) << 2))
 * i.e. 2 * u + 2048 * v with u in the low byte and v in the high byte,
 * loads the 16-bit texel at texture_ptr + offset, and writes the eight
 * texels back over the first 16 bytes of the block.
 *
 * Saves/restores r3-r11 and lr. r0 (psx_gpu) is reused as v_7 inside the
 * loop. The loop assumes num_blocks is non-zero on entry (the count is
 * decremented before it is tested).
 */
2892function(texture_blocks_16bpp)
2893 stmdb sp!, { r3 - r11, r14 }
2894 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2895
e1f6de8f 2896 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2897 ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
75e28f62
E
2898
28990:
e1f6de8f 2900 ldrh uv_0, [block_ptr]
75e28f62
E
 /* Loop counter; the flags survive until "bne 0b" because nothing below */
 /* uses a flag-setting form. */
2901 subs num_blocks, num_blocks, #1
2902
e1f6de8f 2903 ldrh uv_1, [block_ptr, #2]
75e28f62
E
2904
 /* Texels 0 and 1: split into v (high byte, kept in place) and u (low byte). */
2905 and v_0, uv_0, #0xFF00
2906 and v_1, uv_1, #0xFF00
2907
2908 and u_0, uv_0, #0xFF
2909 and u_1, uv_1, #0xFF
2910
 /* u + (v << 2); v_0's register is free afterwards and is reused for uv_2. */
2911 add uv_0, u_0, v_0, lsl #2
e1f6de8f 2912 ldrh uv_2, [block_ptr, #4]
75e28f62
E
2913
2914 add uv_1, u_1, v_1, lsl #2
e1f6de8f 2915 ldrh uv_3, [block_ptr, #6]
75e28f62
E
2916
 /* Double to convert the texel index into a byte offset. */
2917 add uv_0, uv_0, uv_0
2918 add uv_1, uv_1, uv_1
2919
 /* Texels 2 and 3. */
2920 and v_2, uv_2, #0xFF00
2921 and v_3, uv_3, #0xFF00
2922
2923 and u_2, uv_2, #0xFF
2924 and u_3, uv_3, #0xFF
2925
2926 add uv_2, u_2, v_2, lsl #2
e1f6de8f 2927 ldrh uv_4, [block_ptr, #8]
75e28f62
E
2928
2929 add uv_3, u_3, v_3, lsl #2
e1f6de8f 2930 ldrh uv_5, [block_ptr, #10]
75e28f62
E
2931
2932 add uv_2, uv_2, uv_2
2933 add uv_3, uv_3, uv_3
2934
 /* Texels 4 and 5. */
2935 and v_4, uv_4, #0xFF00
2936 and v_5, uv_5, #0xFF00
2937
2938 and u_4, uv_4, #0xFF
2939 and u_5, uv_5, #0xFF
2940
2941 add uv_4, u_4, v_4, lsl #2
e1f6de8f 2942 ldrh uv_6, [block_ptr, #12]
75e28f62
E
2943
2944 add uv_5, u_5, v_5, lsl #2
e1f6de8f 2945 ldrh uv_7, [block_ptr, #14]
75e28f62
E
2946
 /* Texel fetches for 0-5 are interleaved with the address math for 6-7. */
2947 add uv_4, uv_4, uv_4
e1f6de8f 2948 ldrh pixel_0, [texture_ptr, uv_0]
75e28f62
E
2949
2950 add uv_5, uv_5, uv_5
e1f6de8f 2951 ldrh pixel_1, [texture_ptr, uv_1]
75e28f62
E
2952
2953 and v_6, uv_6, #0xFF00
e1f6de8f 2954 ldrh pixel_2, [texture_ptr, uv_2]
75e28f62
E
2955
 /* v_7 is r0: psx_gpu is overwritten here (it is not read again). */
2956 and v_7, uv_7, #0xFF00
e1f6de8f 2957 ldrh pixel_3, [texture_ptr, uv_3]
75e28f62
E
2958
2959 and u_6, uv_6, #0xFF
e1f6de8f 2960 ldrh pixel_4, [texture_ptr, uv_4]
75e28f62
E
2961
2962 and u_7, uv_7, #0xFF
e1f6de8f 2963 ldrh pixel_5, [texture_ptr, uv_5]
75e28f62
E
2964
2965 add uv_6, u_6, v_6, lsl #2
2966 add uv_7, u_7, v_7, lsl #2
2967
2968 add uv_6, uv_6, uv_6
2969 add uv_7, uv_7, uv_7
2970
 /* Pack pairs of 16-bit texels into 32-bit words. */
2971 orr pixels_a, pixel_0, pixel_1, lsl #16
2972 orr pixels_b, pixel_2, pixel_3, lsl #16
2973
e1f6de8f 2974 ldrh pixel_6, [texture_ptr, uv_6]
75e28f62
E
2975 orr pixels_c, pixel_4, pixel_5, lsl #16
2976
e1f6de8f 2977 ldrh pixel_7, [texture_ptr, uv_7]
75e28f62
E
2978 orr pixels_d, pixel_6, pixel_7, lsl #16
2979
 /* Overwrite the uv values with the eight fetched texels; next block. */
2980 stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
2981 add block_ptr, block_ptr, #64
2982
2983 bne 0b
2984
2985 ldmia sp!, { r3 - r11, pc }
2986
2987
/*
 * Register aliases for the shade_blocks_*_textured_modulated_* functions.
 *
 * Several aliases overlap on purpose and are live at different times:
 *   - r2: color_ptr / colors_scalar / mask_msb_ptr / c_32
 *   - r3: colors_scalar_compare / block_ptr_store
 *   - r0: psx_gpu / block_ptr_load_a
 *   - q4: texels / zero_mask
 *   - q8: pixels_r / fb_pixels
 *   - q9: pixels_g / pixels_gb_low (d18 = pixels_g_low, d19 = pixels_b_low)
 */
2988#undef num_blocks
2989
2990#undef test_mask
2991#undef texels
2992#undef pixels_b
2993#undef pixels
2994#undef d64_1
2995#undef d64_4
2996#undef d64_128
2997#undef draw_mask
2998#undef msb_mask
2999#undef msb_mask_low
3000#undef msb_mask_high
3001#undef fb_pixels
3002
3003#undef c_32
3004#undef fb_ptr
3005#undef mask_msb_ptr
3006
3007#define psx_gpu r0
3008#define num_blocks r1
3009#define color_ptr r2
3867c6ef
E
3010#define colors_scalar r2
3011#define colors_scalar_compare r3
75e28f62
E
3012#define mask_msb_ptr r2
3013
3014#define block_ptr_load_a r0
3015#define block_ptr_store r3
3016#define block_ptr_load_b r12
3017#define c_32 r2
3018
/* r4, r5 and r14 are the registers the generated functions push on entry. */
3019#define c_48 r4
3020#define fb_ptr r14
3021#define draw_mask_bits_scalar r5
3022
3023#define d128_0x07 q0
3024#define d128_0x1F q1
3025#define d128_0x8000 q2
3026#define test_mask q3
3027#define texels q4
3028#define colors_rg q5
3029#define colors_b_dm_bits q6
3030#define texels_rg q7
3031#define pixels_r q8
3032#define pixels_g q9
3033#define pixels_b q10
3034#define pixels q11
3035#define zero_mask q4
3036#define draw_mask q12
3037#define msb_mask q13
3038
3039#define fb_pixels q8
3040
3041#define pixels_gb_low q9
3042
/* 64-bit views: d10/d11 = colors_rg, d12/d13 = colors_b_dm_bits, */
/* d14/d15 = texels_rg, d26/d27 = msb_mask. */
3043#define colors_r d10
3044#define colors_g d11
3045#define colors_b d12
3046#define draw_mask_bits d13
3047#define texels_r d14
3048#define texels_g d15
3049#define pixels_r_low d16
3050#define pixels_g_low d18
3051#define pixels_b_low d19
3052#define msb_mask_low d26
3053#define msb_mask_high d27
3054
3055#define d64_1 d28
3056#define d64_4 d29
3057#define d64_128 d30
3058#define texels_b d31
3059
/*
 * Building blocks for shade_blocks_textured_modulated_builder. Each hook
 * exists in two flavours and is selected by token pasting on the builder's
 * shading (shaded/unshaded), dithering (dithered/undithered) or target
 * (direct/indirect) argument.
 */

/* Target prologue, indirect: results are written back into the block */
/* buffer; c_48 is the post-increment used after the pixel store. */
3060#define shade_blocks_textured_modulated_prologue_indirect() \
3061 mov c_48, #48; \
3062 add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset \
3063
/* Target prologue, direct: broadcast the 16-bit mask_msb value into every */
/* lane of msb_mask. */
3064#define shade_blocks_textured_modulated_prologue_direct() \
3065 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
e1f6de8f 3066 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16] \
75e28f62 3067
75e28f62 3068
3867c6ef
E
/* Shading prologue, shaded: nothing to set up (colours come per block). */
3069#define shade_blocks_textured_modulated_prologue_shaded(dithering, target) \
3070
/* Undithered shortcut: when the 32-bit triangle colour word equals */
/* 0x00808080, tail-branch to shade_blocks_textured_unmodulated_<target>. */
/* With a factor of 0x80 the builder's multiply / >>4 / >>3 sequence */
/* returns the 5-bit texel channel unchanged. */
3071#define shade_blocks_textured_false_modulation_check_undithered(target) \
e1f6de8f 3072 ldr colors_scalar, [psx_gpu, #psx_gpu_triangle_color_offset]; \
3867c6ef
E
3073 movw colors_scalar_compare, #0x8080; \
3074 \
3075 movt colors_scalar_compare, #0x80; \
3076 cmp colors_scalar, colors_scalar_compare; \
3077 beq shade_blocks_textured_unmodulated_##target \
3078
/* Dithered: no shortcut is taken. */
3079#define shade_blocks_textured_false_modulation_check_dithered(target) \
3080
/* Shading prologue, unshaded: run the shortcut check, then broadcast bytes */
/* 0, 1 and 2 of the triangle colour into colors_r, colors_g and colors_b. */
/* colors_r is overwritten last because the other two read from it. */
3081#define shade_blocks_textured_modulated_prologue_unshaded(dithering, target) \
3082 shade_blocks_textured_false_modulation_check_##dithering(target); \
75e28f62 3083 add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset; \
e1f6de8f 3084 vld1.u32 { colors_r[] }, [color_ptr, :32]; \
75e28f62
E
3085 vdup.u8 colors_g, colors_r[1]; \
3086 vdup.u8 colors_b, colors_r[2]; \
3087 vdup.u8 colors_r, colors_r[0] \
3088
3089
/* Dithered: preload an accumulator with the 16 bytes at block_ptr_load_b */
/* (no pointer advance). */
3090#define shade_blocks_textured_modulated_load_dithered(target) \
e1f6de8f 3091 vld1.u32 { target }, [block_ptr_load_b, :128] \
75e28f62
E
3092
/* Dithered, last channel: same load, then advance block_ptr_load_b by c_32. */
3093#define shade_blocks_textured_modulated_load_last_dithered(target) \
e1f6de8f 3094 vld1.u32 { target }, [block_ptr_load_b, :128], c_32 \
75e28f62
E
3095
/* Undithered: no accumulator preload. */
3096#define shade_blocks_textured_modulated_load_undithered(target) \
3097
/* Undithered, last channel: only keep block_ptr_load_b in step. */
3098#define shade_blocks_textured_modulated_load_last_undithered(target) \
3099 add block_ptr_load_b, block_ptr_load_b, #32 \
3100
/* Dithered: multiply-accumulate onto the preloaded accumulator. */
3101#define shade_blocks_textured_modulate_dithered(channel) \
3102 vmlal.u8 pixels_##channel, texels_##channel, colors_##channel \
3103
/* Undithered: plain widening multiply. */
3104#define shade_blocks_textured_modulate_undithered(channel) \
3105 vmull.u8 pixels_##channel, texels_##channel, colors_##channel \
3106
3107
/* Indirect: write draw_mask to the block buffer (offset is unused). */
3108#define shade_blocks_textured_modulated_store_draw_mask_indirect(offset) \
e1f6de8f 3109 vst1.u32 { draw_mask }, [block_ptr_store, :128]! \
75e28f62
E
3110
/* Direct: fetch the block's framebuffer pointer (addressed relative to */
/* block_ptr_load_b), read the existing pixels, and keep them wherever */
/* draw_mask is set. */
3111#define shade_blocks_textured_modulated_store_draw_mask_direct(offset) \
e1f6de8f 3112 ldr fb_ptr, [block_ptr_load_b, #(offset - 64)]; \
3113 vld1.u32 { fb_pixels }, [fb_ptr]; \
75e28f62
E
3114 vbit.u16 pixels, fb_pixels, draw_mask \
3115
/* Indirect: store pixels, then skip 48 bytes; with the 16-byte draw_mask */
/* store above that is 64 bytes per block. */
3116#define shade_blocks_textured_modulated_store_pixels_indirect() \
e1f6de8f 3117 vst1.u32 { pixels }, [block_ptr_store, :128], c_48 \
75e28f62
E
3118
/* Direct: store the merged pixels to the framebuffer. */
3119#define shade_blocks_textured_modulated_store_pixels_direct() \
e1f6de8f 3120 vst1.u32 { pixels }, [fb_ptr] \
75e28f62
E
3121
3122
/* Shaded: load per-block r and g colours, advance block_ptr_load_b by c_32. */
3123#define shade_blocks_textured_modulated_load_rg_shaded() \
e1f6de8f 3124 vld1.u32 { colors_r, colors_g }, [block_ptr_load_b, :128], c_32 \
75e28f62
E
3125
/* Unshaded: colours are constant; only keep block_ptr_load_b in step. */
3126#define shade_blocks_textured_modulated_load_rg_unshaded() \
3127 add block_ptr_load_b, block_ptr_load_b, #32 \
3128
/* Shaded: load per-block b colours together with the draw-mask bits. */
3129#define shade_blocks_textured_modulated_load_bdm_shaded() \
e1f6de8f 3130 vld1.u32 { colors_b, draw_mask_bits }, [block_ptr_load_a, :128], c_32 \
75e28f62
E
3131
/* Unshaded: only the draw-mask bits are needed; they sit 8 bytes into the */
/* same 16-byte slot (the shaded variant reads them as its second d register). */
3132#define shade_blocks_textured_modulated_load_bdm_unshaded() \
e1f6de8f 3133 ldr draw_mask_bits_scalar, [block_ptr_load_a, #8]; \
75e28f62
E
3134 add block_ptr_load_a, block_ptr_load_a, #32 \
3135
/* Broadcast the low 16 draw-mask bits to all eight lanes. */
3136#define shade_blocks_textured_modulated_expand_draw_mask_shaded() \
3137 vdup.u16 draw_mask, draw_mask_bits[0] \
3138
3139#define shade_blocks_textured_modulated_expand_draw_mask_unshaded() \
3140 vdup.u16 draw_mask, draw_mask_bits_scalar \
3141
3142
/* Indirect: msb_mask is not applied here. */
3143#define shade_blocks_textured_modulated_apply_msb_mask_indirect() \
3144
/* Direct: OR msb_mask into the pixels before they reach the framebuffer. */
3145#define shade_blocks_textured_modulated_apply_msb_mask_direct() \
3146 vorr.u16 pixels, pixels, msb_mask \
3147
3148
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Expands to shade_blocks_<shading>_textured_modulated_<dithering>_<target>.
 * The generated function takes psx_gpu in r0 and processes
 * psx_gpu->num_blocks blocks of eight texels each, multiplying every 5-bit
 * texel channel by a colour factor and repacking the result as 16-bit pixels.
 *
 *   shading:   shaded   - per-block colours are loaded from the block buffer
 *              unshaded - one colour from psx_gpu's triangle colour is used
 *   dithering: dithered   - accumulators are preloaded from the block buffer
 *              undithered - plain multiply
 *   target:    indirect - draw mask and pixels are written back to the blocks
 *              direct   - pixels are merged into the framebuffer under the
 *                         draw mask, with msb_mask ORed in
 *
 * Per-lane arithmetic, as written below:
 *   channel = saturate_u8((acc + texel5 * colour) >> 4)
 *   pixel   = (texel & 0x8000) [| msb_mask] + (r >> 3) + (g & ~7) * 4
 *             + (b & ~7) * 128
 *   draw_mask gets all-ones in lanes where the source texel is 0.
 *
 * The loop is software pipelined: the packing and store of block N are
 * interleaved with the loads and multiplies of block N + 1, and label 1
 * flushes the final block. The first block is processed unconditionally, so
 * num_blocks is assumed to be non-zero.
 *
 * Pushes r4, r5 and lr. The unshaded/undithered prologue may tail-branch to
 * the unmodulated variant before anything is pushed.
 */
3149#define shade_blocks_textured_modulated_builder(shading, dithering, target) \
3150.align 3; \
3151 \
3152function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
3867c6ef 3153 shade_blocks_textured_modulated_prologue_##shading(dithering, target); \
75e28f62 3154 stmdb sp!, { r4 - r5, lr }; \
e1f6de8f 3155 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62 3156 \
  /* test_mask is the 16 bytes at the very start of psx_gpu. */ \
e1f6de8f 3157 vld1.u32 { test_mask }, [psx_gpu, :128]; \
75e28f62
E
3158 \
3159 shade_blocks_textured_modulated_prologue_##target(); \
75e28f62
E
3160 \
  /* block_ptr_load_a is r0, so psx_gpu is gone after this add; c_32 reuses */ \
  /* r2, which the prologues no longer need. */ \
3161 add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset; \
3162 mov c_32, #32; \
3163 \
  /* Two read cursors, 16 bytes apart, each advanced 32 bytes at a time. */ \
3164 add block_ptr_load_b, block_ptr_load_a, #16; \
3165 vmov.u8 d64_1, #1; \
3166 vmov.u8 d64_4, #4; \
3167 vmov.u8 d64_128, #128; \
3168 \
  /* ---- prime the pipeline with block 0 ---- */ \
e1f6de8f 3169 vld1.u32 { texels }, [block_ptr_load_a, :128], c_32; \
75e28f62
E
3170 vmov.u8 d128_0x07, #0x07; \
3171 \
3172 shade_blocks_textured_modulated_load_rg_##shading(); \
3173 vmov.u8 d128_0x1F, #0x1F; \
3174 \
3175 shade_blocks_textured_modulated_load_bdm_##shading(); \
3176 vmov.u16 d128_0x8000, #0x8000; \
3177 \
  /* Split each 16-bit texel: r = low byte, g = bits 5.., b = bits 7.. */ \
  /* (masked / shifted down to five bits each just below). */ \
3178 vmovn.u16 texels_r, texels; \
3179 vshrn.u16 texels_g, texels, #5; \
3180 \
3181 vshrn.u16 texels_b, texels, #7; \
3182 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3183 \
3184 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
  /* Turn the broadcast mask bits into a per-lane all-ones/all-zeros mask. */ \
3185 vtst.u16 draw_mask, draw_mask, test_mask; \
3186 \
3187 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3188 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3189 \
3190 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3191 vshr.u8 texels_b, texels_b, #3; \
3192 \
3193 shade_blocks_textured_modulate_##dithering(r); \
3194 shade_blocks_textured_modulate_##dithering(g); \
3195 shade_blocks_textured_modulate_##dithering(b); \
3196 \
  /* Keep texel bit 15 in the output; zero_mask shares q4 with texels, so */ \
  /* texels is consumed by the vceq. */ \
3197 vand.u16 pixels, texels, d128_0x8000; \
3198 vceq.u16 zero_mask, texels, #0; \
3199 \
  /* Narrow to 8 bits with unsigned saturation. pixels_g_low / pixels_b_low */ \
  /* are the two halves of q9 (pixels_gb_low). */ \
3200 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3201 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3202 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3203 \
3204 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
  /* Texels equal to 0 are masked out as well. */ \
3205 vorr.u16 draw_mask, draw_mask, zero_mask; \
3206 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3207 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3208 \
  /* A single block goes straight to the flush at label 1. */ \
3209 subs num_blocks, num_blocks, #1; \
3210 beq 1f; \
3211 \
3212 .align 3; \
3213 \
  /* ---- steady state: finish block N while starting block N + 1 ---- */ \
3214 0: \
e1f6de8f 3215 vld1.u32 { texels }, [block_ptr_load_a, :128], c_32; \
75e28f62
E
3216 shade_blocks_textured_modulated_load_rg_##shading(); \
3217 vshrn.u16 texels_g, texels, #5; \
3218 \
3219 shade_blocks_textured_modulated_load_bdm_##shading(); \
3220 vshrn.u16 texels_b, texels, #7; \
3221 \
e1f6de8f 3222 pld [block_ptr_load_a]; \
75e28f62
E
3223 vmovn.u16 texels_r, texels; \
  /* Pack block N: pixels += r * 1 + g * 4 + b * 128. */ \
3224 vmlal.u8 pixels, pixels_r_low, d64_1; \
3225 \
3226 vmlal.u8 pixels, pixels_g_low, d64_4; \
3227 vmlal.u8 pixels, pixels_b_low, d64_128; \
  /* The offset argument is only used by the direct variant, which */ \
  /* addresses the block's framebuffer pointer relative to block_ptr_load_b. */ \
3228 shade_blocks_textured_modulated_store_draw_mask_##target(-4); \
3229 \
3230 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3231 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3232 \
3233 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3234 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3235 \
3236 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3237 vtst.u16 draw_mask, draw_mask, test_mask; \
3238 \
  /* Block N's pixels must be stored before "pixels" is rebuilt below. */ \
3239 shade_blocks_textured_modulated_store_pixels_##target(); \
3240 vshr.u8 texels_b, texels_b, #3; \
3241 \
3242 shade_blocks_textured_modulate_##dithering(r); \
3243 shade_blocks_textured_modulate_##dithering(g); \
3244 shade_blocks_textured_modulate_##dithering(b); \
3245 \
3246 vand.u16 pixels, texels, d128_0x8000; \
3247 vceq.u16 zero_mask, texels, #0; \
3248 \
  /* Flags from this subs are consumed by "bne 0b"; only NEON instructions */ \
  /* follow, and they do not touch the flags. */ \
3249 subs num_blocks, num_blocks, #1; \
3250 \
3251 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3252 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3253 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3254 \
3255 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3256 vorr.u16 draw_mask, draw_mask, zero_mask; \
3257 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3258 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3259 \
3260 bne 0b; \
3261 \
  /* ---- flush the last block ---- */ \
3262 1: \
3263 vmlal.u8 pixels, pixels_r_low, d64_1; \
3264 vmlal.u8 pixels, pixels_g_low, d64_4; \
3265 vmlal.u8 pixels, pixels_b_low, d64_128; \
3266 \
  /* Offset differs from the in-loop call (28 vs -4) because */ \
  /* block_ptr_load_b has not been advanced by another load_rg here. */ \
3267 shade_blocks_textured_modulated_store_draw_mask_##target(28); \
3268 shade_blocks_textured_modulated_store_pixels_##target(); \
3269 \
3270 ldmia sp!, { r4 - r5, pc } \
3271
3272
/*
 * Emit one modulated textured shader per combination of
 * shading (shaded/unshaded) x dithering (dithered/undithered) x
 * target (direct/indirect).
 */
shade_blocks_textured_modulated_builder(shaded, dithered, direct);
shade_blocks_textured_modulated_builder(shaded, undithered, direct);
shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
shade_blocks_textured_modulated_builder(unshaded, undithered, direct);

shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3282
3283
/*
 * Register aliases for the unmodulated textured and the untextured
 * shade_blocks routines that follow.
 *
 * Several aliases deliberately share one physical register, so their live
 * ranges must not overlap inside a routine:
 *   r0  : psx_gpu, block_ptr_load
 *   r2  : mask_msb_ptr, c_64
 *   r3  : color_ptr, draw_mask_store_ptr, fb_ptr
 *   r12 : draw_mask_bits_ptr, draw_mask_ptr
 *   r14 : pixel_store_ptr, fb_ptr_next (this is lr, so it must be saved)
 *
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS VFP rules and
 * the routines visible in this part of the file use them without saving them;
 * confirm that the callers tolerate this.
 */
#undef c_64
#undef fb_ptr
#undef color_ptr

#undef color_r
#undef color_g
#undef color_b

#undef test_mask
#undef pixels
#undef draw_mask
#undef zero_mask
#undef fb_pixels
#undef msb_mask
#undef msb_mask_low
#undef msb_mask_high

#define psx_gpu                                  r0
#define num_blocks                               r1
#define mask_msb_ptr                             r2
#define color_ptr                                r3

#define block_ptr_load                           r0
#define draw_mask_store_ptr                      r3
#define draw_mask_bits_ptr                       r12
#define draw_mask_ptr                            r12
#define pixel_store_ptr                          r14

#define fb_ptr_cmp                               r4

#define fb_ptr                                   r3
#define fb_ptr_next                              r14

#define c_64                                     r2

#define test_mask                                q0
#define pixels                                   q1
#define draw_mask                                q2
#define zero_mask                                q3
#define draw_mask_combined                       q4
#define fb_pixels                                q5
#define fb_pixels_next                           q6
#define msb_mask                                 q7

/* D-register halves of draw_mask (q2) and msb_mask (q7). */
#define draw_mask_low                            d4
#define draw_mask_high                           d5
#define msb_mask_low                             d14
#define msb_mask_high                            d15
3332
.align 3
/*
 * shade_blocks_textured_unmodulated_indirect(psx_gpu)
 *
 * In:    r0 = psx_gpu
 * Clobb: r0-r3, r12, q0-q4, flags (r4 and lr are saved and restored)
 *
 * For each of the num_blocks 64-byte block records starting at
 * psx_gpu + psx_gpu_blocks_offset:
 *   - copies the 8 texels at record+0 unchanged to record+16,
 *   - expands the 16-bit draw-mask bits at record+40 into a per-pixel mask
 *     by testing them against test_mask (loaded from psx_gpu+0),
 *   - ORs in a mask of the texels that are zero, and
 *   - stores the combined mask over record+0.
 *
 * The loop is software-pipelined: the mask of block N is stored while the
 * texels of block N+1 are already loaded, and label 1 flushes the last mask.
 * num_blocks is assumed to be at least 1 (a count of 0 would wrap).
 *
 * lr doubles as pixel_store_ptr, so it is pushed onto the stack. The push
 * moves sp (same stmdb/ldmia pair as the sibling routines) instead of storing
 * below sp, which AAPCS does not allow and which an asynchronous signal frame
 * could overwrite.
 */
function(shade_blocks_textured_unmodulated_indirect)
  stmdb sp!, { r4, r14 }
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  vld1.u32 { test_mask }, [psx_gpu, :128]
  add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset

  mov c_64, #64
  /* block_ptr_load aliases psx_gpu (r0); psx_gpu is dead from here on. */
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  vld1.u32 { pixels }, [block_ptr_load, :128], c_64
  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [draw_mask_bits_ptr, :16], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64

  subs num_blocks, num_blocks, #1
  beq 1f

 0:
  vld1.u32 { pixels }, [block_ptr_load, :128], c_64
  /* Combine the previous block's masks before they are overwritten below. */
  vorr.u16 draw_mask_combined, draw_mask, zero_mask

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [draw_mask_bits_ptr, :16], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64

  vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
  subs num_blocks, num_blocks, #1

  bne 0b

 1:
  /* Flush the mask of the last block. */
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64

  ldmia sp!, { r4, pc }
75e28f62
E
3379
3380
.align 3

/*
 * shade_blocks_textured_unmodulated_direct(psx_gpu)
 *
 * In:    r0 = psx_gpu
 * Clobb: r0-r3, r12, q0-q7, flags (r4 and lr are saved and restored)
 *
 * For each of the num_blocks 64-byte block records starting at
 * psx_gpu + psx_gpu_blocks_offset, writes the 8 texels at record+0 to the
 * framebuffer address stored at record+44. A pixel is left untouched when its
 * draw-mask bit (record+40, tested against test_mask from psx_gpu+0) is set
 * or when the texel is zero. Written pixels get msb_mask ORed in.
 *
 * The loop is software-pipelined: the framebuffer pixels of the next block
 * are normally loaded before the current block is stored. When the two
 * framebuffer addresses are within 14 bytes of each other (the 16-byte
 * accesses would overlap, assuming halfword-aligned pointers) label 4 stores
 * first and loads afterwards.
 * num_blocks is assumed to be at least 1 (a count of 0 would wrap).
 */
function(shade_blocks_textured_unmodulated_direct)
  stmdb sp!, { r4, r14 }
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  /* mask_msb_ptr shares r2 with c_64: load through it before c_64 is set. */
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
  mov c_64, #64

  vld1.u32 { test_mask }, [psx_gpu, :128]
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [draw_mask_bits_ptr, :16], c_64
  ldr fb_ptr_next, [block_ptr_load, #44]

  vld1.u32 { pixels }, [block_ptr_load, :128], c_64
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  vceq.u16 zero_mask, pixels, #0
  vtst.u16 draw_mask, draw_mask, test_mask

  subs num_blocks, num_blocks, #1
  beq 1f

 0:
  mov fb_ptr, fb_ptr_next
  ldr fb_ptr_next, [block_ptr_load, #44]

  vorr.u16 pixels, pixels, msb_mask

  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vmov fb_pixels, fb_pixels_next

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, \
   [draw_mask_bits_ptr, :16], c_64
  /* Keep framebuffer pixels where the mask is set, take texels elsewhere. */
  vbif.u16 fb_pixels, pixels, draw_mask_combined

  sub fb_ptr_cmp, fb_ptr_next, fb_ptr
  pld [fb_ptr_next, #64]

  add fb_ptr_cmp, fb_ptr_cmp, #14
  vld1.u32 { pixels }, [block_ptr_load, :128], c_64

  /* Unsigned (next - cur + 14) <= 28  <=>  -14 <= next - cur <= 14. */
  cmp fb_ptr_cmp, #28
  bls 4f

  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  vceq.u16 zero_mask, pixels, #0

  vst1.u16 { fb_pixels }, [fb_ptr]
  vtst.u16 draw_mask, draw_mask, test_mask

 3:
  subs num_blocks, num_blocks, #1
  bne 0b

 1:
  /*
   * Final block. Apply msb_mask here as well: the loop above only does so
   * for the blocks it stores itself, so without this the last block (or the
   * only block) would be written without the mask bit.
   */
  vorr.u16 pixels, pixels, msb_mask
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vbif.u16 fb_pixels_next, pixels, draw_mask_combined

  vst1.u16 { fb_pixels_next }, [fb_ptr_next]

  ldmia sp!, { r4, pc }

 4:
  /* Overlapping destinations: store the current block before loading. */
  vst1.u16 { fb_pixels }, [fb_ptr]
  vceq.u16 zero_mask, pixels, #0

  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  vtst.u16 draw_mask, draw_mask, test_mask

  bal 3b
3456
3457
/* No work is done for this shading variant; return immediately. */
function(shade_blocks_unshaded_untextured_indirect)
  bx lr
3460
.align 3

/*
 * shade_blocks_unshaded_untextured_direct(psx_gpu)
 *
 * In:    r0 = psx_gpu
 * Clobb: r0-r3, r12, q1, q2, q5-q7, flags (r4 and lr are saved and restored)
 *
 * Loads one 8-pixel color vector from the first block record (+16), ORs
 * msb_mask into it once, and writes it to the framebuffer address of every
 * block (record+44), keeping the framebuffer pixel wherever the per-pixel
 * draw mask at record+0 is set.
 *
 * The next block's framebuffer pixels are normally loaded before the current
 * block is stored. When the two addresses are within 14 bytes of each other
 * label 4 stores first and loads afterwards.
 * num_blocks is assumed to be at least 1 (a count of 0 would wrap).
 */
function(shade_blocks_unshaded_untextured_direct)
  stmdb sp!, { r4, r14 }
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
  add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
  vld1.u16 { pixels }, [color_ptr, :128]

  mov c_64, #64
  vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64

  vorr.u16 pixels, pixels, msb_mask
  subs num_blocks, num_blocks, #1

  ldr fb_ptr_next, [block_ptr_load], #64

  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  /* Flags still come from the subs above; ldr and vld1 do not touch them. */
  beq 1f

 0:
  vmov fb_pixels, fb_pixels_next
  mov fb_ptr, fb_ptr_next
  ldr fb_ptr_next, [block_ptr_load], #64

  vbif.u16 fb_pixels, pixels, draw_mask
  vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64

  /* Unsigned (next - cur + 14) <= 28  <=>  -14 <= next - cur <= 14. */
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr
  add fb_ptr_cmp, fb_ptr_cmp, #14
  cmp fb_ptr_cmp, #28
  bls 4f

  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  vst1.u16 { fb_pixels }, [fb_ptr]

 3:
  subs num_blocks, num_blocks, #1
  bne 0b

 1:
  /* Final block. */
  vbif.u16 fb_pixels_next, pixels, draw_mask
  vst1.u16 { fb_pixels_next }, [fb_ptr_next]

  ldmia sp!, { r4, pc }

 4:
  /* Overlapping destinations: store the current block before loading. */
  vst1.u16 { fb_pixels }, [fb_ptr]
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  bal 3b
3517
3518
/*
 * Register aliases for the blend_blocks_* builders that follow.
 *
 * Aliases sharing a physical register belong to different builders or have
 * disjoint live ranges inside one builder:
 *   r0  : psx_gpu, draw_mask_ptr
 *   r2  : msb_mask_ptr, c_64 (and mask_msb_ptr, still defined from above)
 *   q5  : pixels_no_msb, pixels_rb
 *   q7  : fb_pixels_no_msb, pixels_mg, pixels_g, pixels_fourth
 *   q8  : d128_0x8000, d128_0x7C1F
 *   q9  : d128_0x0421, d128_0x03E0
 *   q10 : fb_pixels_next, fb_pixels_rb
 *   q11 : blend_pixels_next, fb_pixels_g, fb_pixels_masked
 *   q12 : pixels_next, d128_0x1C07
 *   q13 : draw_mask_next, d128_0x00E0, d128_0x80E0
 */
#undef draw_mask_ptr
#undef c_64
#undef fb_ptr
#undef fb_ptr_next
#undef fb_ptr_cmp

#define psx_gpu                                  r0
#define num_blocks                               r1
#define msb_mask_ptr                             r2
#define pixel_ptr                                r3
#define draw_mask_ptr                            r0
#define c_64                                     r2
#define fb_ptr                                   r12
#define fb_ptr_next                              r14
#define fb_ptr_cmp                               r4

#undef msb_mask
#undef draw_mask
#undef pixels
#undef fb_pixels
#undef d128_0x8000
#undef msb_mask_low
#undef msb_mask_high
#undef draw_mask_next
#undef pixels_g
#undef blend_pixels
#undef fb_pixels_next

#define msb_mask                                 q0
#define draw_mask                                q1
#define pixels                                   q2
#define fb_pixels                                q3
#define blend_pixels                             q4
#define pixels_no_msb                            q5
#define blend_mask                               q6
#define fb_pixels_no_msb                         q7
#define d128_0x8000                              q8
#define d128_0x0421                              q9
#define fb_pixels_next                           q10
#define blend_pixels_next                        q11
#define pixels_next                              q12
#define draw_mask_next                           q13
#define write_mask                               q14

#define pixels_rb                                q5
#define pixels_mg                                q7
#define pixels_g                                 q7
#define d128_0x7C1F                              q8
#define d128_0x03E0                              q9
#define fb_pixels_rb                             q10
#define fb_pixels_g                              q11
#define fb_pixels_masked                         q11
#define d128_0x83E0                              q15
#define pixels_fourth                            q7
#define d128_0x1C07                              q12
#define d128_0x00E0                              q13
#define d128_0x80E0                              q13

/* D-register halves of msb_mask (q0). */
#define msb_mask_low                             d0
#define msb_mask_high                            d1
3579
/*
 * Per-variant fragments used by blend_blocks_average_builder.
 *
 * texturing = textured:   only pixels whose bit 15 is set are blended; the
 *                         others are written as-is, and blended pixels get
 *                         bit 15 set.
 * texturing = untextured: every pixel is blended; the fragments are empty.
 */

/* blend_mask = all-ones in lanes where source has bit 15 set. */
#define blend_blocks_average_set_blend_mask_textured(source) \
  vclt.s16 blend_mask, source, #0

#define blend_blocks_average_set_stp_bit_textured() \
  vorr.u16 blend_pixels, #0x8000

/* Where blend_mask is clear, replace the blended value with source. */
#define blend_blocks_average_combine_textured(source) \
  vbif.u16 blend_pixels, source, blend_mask

#define blend_blocks_average_set_blend_mask_untextured(source)

#define blend_blocks_average_set_stp_bit_untextured()

#define blend_blocks_average_combine_untextured(source)

/*
 * mask_evaluate = on:  framebuffer pixels with bit 15 set are write-protected;
 *                      write_mask marks them and is ORed into the draw mask.
 * mask_evaluate = off: the draw mask is used unchanged.
 */
#define blend_blocks_average_mask_set_on() \
  vclt.s16 write_mask, fb_pixels_next, #0

#define blend_blocks_average_mask_copy_on() \
  vorr.u16 draw_mask, draw_mask_next, write_mask

#define blend_blocks_average_mask_copy_b_on() \
  vorr.u16 draw_mask_next, draw_mask_next, write_mask

#define blend_blocks_average_mask_set_off()

#define blend_blocks_average_mask_copy_off() \
  vmov draw_mask, draw_mask_next

#define blend_blocks_average_mask_copy_b_off()
3610
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>(psx_gpu).
 *
 * In:    r0 = psx_gpu
 * Clobb: r0-r3, r12, NEON q0-q14 as aliased above, flags
 *        (r4 and lr are saved and restored)
 *
 * For each 64-byte block record (draw mask at +0, 8 pixels at +16,
 * framebuffer pointer at +44) the routine writes (pixel + framebuffer) / 2 per
 * 5-bit channel to the framebuffer, skipping pixels whose draw-mask lane is
 * set. The average is computed as
 *   vhadd(fb & ~0x8000, (pixel & ~0x8000) - ((pixel ^ fb) & 0x0421))
 * where 0x0421 holds the lowest bit of each channel, so the halving add does
 * not carry a half from one channel into the next.
 *
 * The loop is software-pipelined: the operands of block N+1 are prepared
 * while block N is finished. When consecutive framebuffer addresses are
 * within 14 bytes of each other label 2 stores block N before loading the
 * framebuffer pixels of block N+1.
 * num_blocks is assumed to be at least 1 (a count of 0 would wrap).
 */
#define blend_blocks_average_builder(texturing, mask_evaluate) \
.align 3; \
 \
function(blend_blocks_##texturing##_average_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x8000, #0x8000; \
  vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
 \
  vmov.u16 d128_0x0421, #0x0400; \
  vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64; \
 \
  vorr.u16 d128_0x0421, #0x0021; \
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]; \
 \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
 \
  vmov pixels, pixels_next; \
  vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64; \
 \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_mask_copy_##mask_evaluate(); \
  vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  vmov fb_pixels, fb_pixels_next; \
  blend_blocks_average_combine_##texturing(pixels); \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two blocks overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]; \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
 \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
 \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vbif.u16 fb_pixels, blend_pixels, draw_mask; \
 \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vst1.u16 { fb_pixels }, [fb_ptr]; \
 \
 3: \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block. */ \
  blend_blocks_average_mask_copy_b_##mask_evaluate(); \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels_next); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  blend_blocks_average_combine_##texturing(pixels_next); \
 \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next; \
  vst1.u16 { fb_pixels_next }, [fb_ptr_next]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping destinations: store the current block before loading. */ \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbif.u16 fb_pixels, blend_pixels, draw_mask; \
  vst1.u16 { fb_pixels }, [fb_ptr]; \
 \
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]; \
  /* Refresh write_mask from the reloaded pixels, as the fast path does; */ \
  /* otherwise the next block would use the previous block's mask. */ \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  bal 3b

blend_blocks_average_builder(textured, off)
blend_blocks_average_builder(untextured, off)
blend_blocks_average_builder(textured, on)
blend_blocks_average_builder(untextured, on)
3717
3718
/*
 * Mask-evaluation fragments shared by the add and add_fourth builders.
 * on:  write_mask marks framebuffer pixels with bit 15 set and is ORed into
 *      draw_mask so those pixels are not overwritten.
 * off: both fragments are empty.
 */
#define blend_blocks_add_mask_set_on() \
  vclt.s16 write_mask, fb_pixels, #0

#define blend_blocks_add_mask_copy_on() \
  vorr.u16 draw_mask, draw_mask, write_mask

#define blend_blocks_add_mask_set_off()

#define blend_blocks_add_mask_copy_off()
3728
3729
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>(psx_gpu).
 *
 * In:    r0 = psx_gpu
 * Clobb: r0-r3, r12, NEON registers as aliased above, flags
 *        (r4 and lr are saved and restored)
 *
 * For each 64-byte block record (draw mask at +0, 8 pixels at +16,
 * framebuffer pointer at +44) the routine adds the pixel to the framebuffer
 * pixel with per-channel saturation and writes the result, skipping pixels
 * whose draw-mask lane is set. The framebuffer operand is first ANDed with
 * blend_mask (pixel bit 15), so pixels without that bit are written unblended.
 * Red/blue are handled together in the 0x7C1F lanes with a byte-wise min;
 * green is handled together with bit 15 (pixel | msb_mask) in the 0x83E0
 * lanes.
 *
 * When consecutive framebuffer addresses are within 14 bytes of each other
 * label 2 stores the current block before loading the next framebuffer
 * pixels.
 * num_blocks is assumed to be at least 1 (a count of 0 would wrap).
 */
#define blend_blocks_add_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vmov.u16 d128_0x83E0, #0x8000; \
  vorr.u16 d128_0x03E0, #0x00E0; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0; \
 \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vorr.u16 pixels, pixels, msb_mask; \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  vand.u16 pixels_mg, pixels, d128_0x83E0; \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
 \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
 \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
 \
  vorr.u16 pixels, pixels, msb_mask; \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vand.u16 pixels_mg, pixels, d128_0x83E0; \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two blocks overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  pld [fb_ptr_next, #64]; \
 \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
 \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
 \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
 \
 3: \
  /* fb_pixels_g aliases fb_pixels_masked; rb was extracted before this. */ \
  vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [fb_ptr_next]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping destinations: store the current block before loading. */ \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  bal 3b
3832
3833
/*
 * blend_blocks_add_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_<mask_evaluate>(psx_gpu).
 *
 * In:    r0 = psx_gpu
 * Clobb: r0-r3, r12, NEON registers as aliased above, flags
 *        (r4 and lr are saved and restored)
 *
 * For each 64-byte block record (draw mask at +0, 8 pixels at +16,
 * framebuffer pointer at +44) the routine adds the pixel to the framebuffer
 * pixel with per-channel saturation (red/blue in the 0x7C1F lanes with a
 * byte-wise min, green in the 0x03E0 lanes), ORs in msb_mask and writes the
 * result, skipping pixels whose draw-mask lane is set.
 *
 * When consecutive framebuffer addresses are within 14 bytes of each other
 * label 2 stores the current block before loading the next framebuffer
 * pixels.
 * num_blocks is assumed to be at least 1 (a count of 0 would wrap).
 */
#define blend_blocks_add_untextured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_untextured_add_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x03E0, #0x00E0; \
 \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels, d128_0x03E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
 \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
 \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
 \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vand.u16 pixels_g, pixels, d128_0x03E0; \
 \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two blocks overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
 \
 3: \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [fb_ptr_next]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping destinations: store the current block before loading. */ \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
  vand.u16 pixels_rb, pixels, d128_0x7C1F; \
 \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  bal 3b
3925
3926
/* Emit the additive blenders with mask evaluation off and on. */
blend_blocks_add_textured_builder(off)
blend_blocks_add_textured_builder(on)
blend_blocks_add_untextured_builder(off)
blend_blocks_add_untextured_builder(on)
3931
/*
 * Per-variant fragments used by blend_blocks_subtract_builder.
 *
 * texturing = textured:   pixels with bit 15 set are blended and get bit 15
 *                         set in the result; the others are written as
 *                         (pixel | msb_mask) without blending.
 * texturing = untextured: every pixel is blended and msb_mask is ORed into
 *                         the result.
 */

/* blend_mask = all-ones in lanes where pixels_next has bit 15 set. */
#define blend_blocks_subtract_set_blend_mask_textured() \
  vclt.s16 blend_mask, pixels_next, #0

/* Where blend_mask is clear, replace the blended value with pixels. */
#define blend_blocks_subtract_combine_textured() \
  vbif.u16 blend_pixels, pixels, blend_mask

#define blend_blocks_subtract_set_stp_textured() \
  vorr.u16 blend_pixels, #0x8000

#define blend_blocks_subtract_msb_mask_textured() \
  vorr.u16 pixels, pixels_next, msb_mask

#define blend_blocks_subtract_set_blend_mask_untextured()

#define blend_blocks_subtract_combine_untextured()

#define blend_blocks_subtract_set_stp_untextured() \
  vorr.u16 blend_pixels, blend_pixels, msb_mask

#define blend_blocks_subtract_msb_mask_untextured()


/*
 * mask_evaluate = on:  write_mask marks framebuffer pixels with bit 15 set
 *                      and is ORed into the draw mask.
 * mask_evaluate = off: the draw mask is copied unchanged.
 */
#define blend_blocks_subtract_mask_set_on() \
  vclt.s16 write_mask, fb_pixels, #0

#define blend_blocks_subtract_mask_copy_on() \
  vorr.u16 draw_mask, draw_mask_next, write_mask

#define blend_blocks_subtract_mask_set_off()

#define blend_blocks_subtract_mask_copy_off() \
  vmov draw_mask, draw_mask_next
3964
3965
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_subtract_<mask_evaluate>(psx_gpu).
 *
 * In:    r0 = psx_gpu
 * Clobb: r0-r3, r12, NEON registers as aliased above, flags
 *        (r4 and lr are saved and restored)
 *
 * For each 64-byte block record (draw mask at +0, 8 pixels at +16,
 * framebuffer pointer at +44) the routine subtracts the pixel from the
 * framebuffer pixel with per-channel saturation at zero (red/blue in the
 * 0x7C1F lanes with a byte-wise vqsub, green in the 0x03E0 lanes) and writes
 * the result, skipping pixels whose draw-mask lane is set.
 *
 * The loop is software-pipelined: pixels/draw_mask describe the block being
 * written while pixels_next/draw_mask_next already hold the following one.
 * When consecutive framebuffer addresses are within 14 bytes of each other
 * label 2 stores the current block before loading the next framebuffer
 * pixels.
 * num_blocks is assumed to be at least 1 (a count of 0 would wrap).
 */
#define blend_blocks_subtract_builder(texturing, mask_evaluate) \
.align 3; \
 \
function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x03E0, #0x00E0; \
 \
  vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64; \
  blend_blocks_subtract_set_blend_mask_##texturing(); \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
 \
  vand.u16 pixels_g, pixels_next, d128_0x03E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  blend_blocks_subtract_mask_copy_##mask_evaluate(); \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
 \
  vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
  blend_blocks_subtract_msb_mask_##texturing(); \
 \
  vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64; \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vand.u16 pixels_rb, pixels_next, d128_0x7C1F; \
  blend_blocks_subtract_set_stp_##texturing(); \
  vand.u16 pixels_g, pixels_next, d128_0x03E0; \
  /* combine uses the current blend_mask; it is refreshed right after. */ \
  blend_blocks_subtract_combine_##texturing(); \
  blend_blocks_subtract_set_blend_mask_##texturing(); \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two blocks overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
 \
 3: \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block. */ \
  blend_blocks_subtract_mask_copy_##mask_evaluate(); \
 \
  blend_blocks_subtract_msb_mask_##texturing(); \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  blend_blocks_subtract_set_stp_##texturing(); \
  blend_blocks_subtract_combine_##texturing(); \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [fb_ptr_next]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping destinations: store the current block before loading. */ \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  bal 3b


blend_blocks_subtract_builder(textured, off)
blend_blocks_subtract_builder(textured, on)
blend_blocks_subtract_builder(untextured, off)
blend_blocks_subtract_builder(untextured, on)
4063
4064
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_fourth_<mask_evaluate>(psx_gpu).
 *
 * In:    r0 = psx_gpu
 * Clobb: r0-r3, r12, NEON registers as aliased above, flags
 *        (r4 and lr are saved and restored)
 *
 * For each 64-byte block record (draw mask at +0, 8 pixels at +16,
 * framebuffer pointer at +44) the routine adds one quarter of the pixel to
 * the framebuffer pixel with per-channel saturation. The quarter is the pixel
 * shifted right by 2 and masked to the top three bits of each channel
 * (0x1C07 for red/blue, 0x00E0 for green). Pixels with bit 15 set receive the
 * blended value with bit 15 set; the others are written unblended. msb_mask
 * is ORed into every written pixel, and pixels whose draw-mask lane is set
 * are skipped.
 *
 * pixels_fourth and pixels_g share q7: pixels_rb must be extracted from
 * pixels_fourth before pixels_g overwrites it.
 *
 * When consecutive framebuffer addresses are within 14 bytes of each other
 * label 2 stores the current block before loading the next framebuffer
 * pixels.
 * num_blocks is assumed to be at least 1 (a count of 0 would wrap).
 */
#define blend_blocks_add_fourth_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_fourth_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
 \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vmov.u16 d128_0x1C07, #0x1C00; \
  vmov.u16 d128_0x00E0, #0x00E0; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x03E0, #0x00E0; \
  vorr.u16 d128_0x1C07, #0x0007; \
 \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vshr.s16 pixels_fourth, pixels, #2; \
  vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
 \
 0: \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
 \
  /* Finish the current block before pixels/blend_mask are reloaded. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16 blend_pixels, #0x8000; /* stp */ \
  vbif.u16 blend_pixels, pixels, blend_mask; \
 \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
  vclt.s16 blend_mask, pixels, #0; \
  vshr.s16 pixels_fourth, pixels, #2; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
 \
  /* Unsigned (next - cur + 14) <= 28 means the two blocks overlap. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
 \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
 \
 3: \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
 \
 1: \
  /* Final block. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16 blend_pixels, #0x8000; /* stp */ \
  vbif.u16 blend_pixels, pixels, blend_mask; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [fb_ptr_next]; \
 \
  ldmia sp!, { r4, pc }; \
 \
 2: \
  /* Overlapping destinations: store the current block before loading. */ \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
 \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  bal 3b
4166
4167
d1c75d1e 4168
75e28f62
E
/*
 * Builds blend_blocks_untextured_add_fourth_<mask_evaluate>.
 *
 * For each of the num_blocks queued blocks the routine adds the block's
 * pixels, arithmetically shifted right by two and masked per channel
 * (0x1C07 for red/blue, 0x00E0 for green), to the framebuffer pixels.
 * Red/blue are clamped with a per-byte vmin against 0x7C1F, green with a
 * 16-bit vmin against 0x03E0. msb_mask is ORed into the result and the old
 * framebuffer value is kept wherever draw_mask bits are set.
 *
 * Block records are walked with a 64-byte stride (c_64); each block's
 * destination pointer is read from [pixel_ptr, #28] before pixel_ptr is
 * post-incremented. The loop is software-pipelined: the next block is
 * loaded while the previous result is still waiting to be stored.
 * The blend_blocks_add_mask_set/copy_<mask_evaluate> helpers are defined
 * outside this macro.
 * NOTE(review): there is no zero check on num_blocks before the first
 * block is loaded - presumably callers guarantee at least one block.
 */
4169#define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \
4170.align 3; \
4171 \
4172function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
4173 stmdb sp!, { r4, r14 }; \
4174 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
e1f6de8f 4175 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
4176 \
4177 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
/* broadcast the 16-bit value at mask_msb_ptr to every lane of msb_mask */ \
e1f6de8f 4178 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
75e28f62
E
4179 \
4180 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
4181 mov c_64, #64; \
4182 \
/* each 16-bit constant is assembled from a high-byte vmov and a low-byte vorr */ \
4183 vmov.u16 d128_0x7C1F, #0x7C00; \
4184 vmov.u16 d128_0x03E0, #0x0300; \
75e28f62
E
4185 vmov.u16 d128_0x1C07, #0x1C00; \
4186 vmov.u16 d128_0x00E0, #0x00E0; \
4187 vorr.u16 d128_0x7C1F, #0x001F; \
4188 vorr.u16 d128_0x03E0, #0x00E0; \
75e28f62
E
4189 vorr.u16 d128_0x1C07, #0x0007; \
4190 \
/* prologue: load and blend the first block; its store is deferred */ \
e1f6de8f 4191 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
4192 ldr fb_ptr_next, [pixel_ptr, #28]; \
4193 vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
4194 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
4195 blend_blocks_add_mask_set_##mask_evaluate(); \
4196 vshr.s16 pixels_fourth, pixels, #2; \
4197 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4198 \
4199 blend_blocks_add_mask_copy_##mask_evaluate(); \
4200 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4201 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4202 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4203 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4204 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4205 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4206 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4207 \
4208 subs num_blocks, num_blocks, #1; \
4209 beq 1f; \
4210 \
4211 0: \
4212 mov fb_ptr, fb_ptr_next; \
e1f6de8f 4213 ldr fb_ptr_next, [pixel_ptr, #28]; \
75e28f62 4214 \
e1f6de8f 4215 vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
75e28f62
E
4216 \
4217 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4218 vshr.s16 pixels_fourth, pixels, #2; \
4219 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4220 vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
4221 \
/* where draw_mask bits are set, keep the original framebuffer bits */ \
4222 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
e1f6de8f 4223 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
75e28f62
E
4224 \
/* unsigned (fb_ptr_next - fb_ptr + 14) <= 28, i.e. the two destinations */ \
/* are within 14 bytes of each other: take path 2, which stores first */ \
4225 sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
4226 add fb_ptr_cmp, fb_ptr_cmp, #14; \
4227 cmp fb_ptr_cmp, #28; \
4228 bls 2f; \
4229 \
/* common path: next framebuffer load is issued before the pending store */ \
e1f6de8f 4230 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
4231 blend_blocks_add_mask_set_##mask_evaluate(); \
4232 blend_blocks_add_mask_copy_##mask_evaluate(); \
4233 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4234 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
e1f6de8f 4235 vst1.u16 { blend_pixels }, [fb_ptr]; \
75e28f62
E
4236 \
4237 3: \
4238 vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
4239 vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
4240 vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
4241 vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
4242 vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
4243 \
4244 subs num_blocks, num_blocks, #1; \
4245 bne 0b; \
4246 \
/* epilogue: finish and store the last block, then return */ \
4247 1: \
4248 vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
4249 vorr.u16 blend_pixels, blend_pixels, msb_mask; \
4250 vbit.u16 blend_pixels, fb_pixels, draw_mask; \
e1f6de8f 4251 vst1.u16 { blend_pixels }, [fb_ptr_next]; \
75e28f62
E
4252 \
4253 ldmia sp!, { r4, pc }; \
4254 \
/* nearby-destination path: store the pending result before reloading */ \
4255 2: \
e1f6de8f 4256 vst1.u16 { blend_pixels }, [fb_ptr]; \
75e28f62
E
4257 vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
4258 \
e1f6de8f 4259 vld1.u16 { fb_pixels }, [fb_ptr_next]; \
75e28f62
E
4260 blend_blocks_add_mask_set_##mask_evaluate(); \
4261 blend_blocks_add_mask_copy_##mask_evaluate(); \
4262 vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
4263 bal 3b \
4264
4265
4266blend_blocks_add_fourth_textured_builder(off)
4267blend_blocks_add_fourth_textured_builder(on)
4268blend_blocks_add_fourth_untextured_builder(off)
4269blend_blocks_add_fourth_untextured_builder(on)
4270
4271// TODO: Optimize this more. Need a scene that actually uses it for
4272// confirmation..
4273
4274.align 3
4275
/*
 * blend_blocks_textured_unblended_on: copies each queued block's pixels
 * (with msb_mask ORed in) to its framebuffer destination, skipping lanes
 * selected by draw_mask and lanes whose existing framebuffer value is
 * negative as a signed 16-bit number (bit 15 set).
 *
 * Reads num_blocks and the mask MSB value from psx_gpu; block records are
 * walked with a 64-byte stride and the destination pointer of each block
 * is taken from [pixel_ptr, #28].
 * NOTE(review): the first block is loaded before num_blocks is tested, so
 * presumably callers guarantee num_blocks >= 1 - confirm.
 */
4276function(blend_blocks_textured_unblended_on)
4277 stmdb sp!, { r4, r14 }
4278 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
e1f6de8f 4279 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
4280
4281 add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
/* broadcast the 16-bit mask MSB value to all lanes of msb_mask */
e1f6de8f 4282 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
75e28f62
E
4283
4284 add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
4285 mov c_64, #64
4286
/* preload the first block: destination pixels, draw mask and source pixels */
e1f6de8f 4287 ldr fb_ptr, [pixel_ptr, #28]
4288 vld1.u16 { fb_pixels }, [fb_ptr]
4289 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
/* write_mask = all ones in lanes where the framebuffer pixel has bit 15 set */
75e28f62 4290 vclt.s16 write_mask, fb_pixels, #0
e1f6de8f 4291 vld1.u32 { pixels }, [pixel_ptr, :128], c_64
75e28f62
E
4292
4293 subs num_blocks, num_blocks, #1
4294 beq 1f
4295
4296 0:
134f81ec 4297 vorr.u16 pixels, pixels, msb_mask
75e28f62
E
4298 vorr.u16 draw_mask, draw_mask, write_mask
/* insert source bits only where the combined mask is clear */
4299 vbif.u16 fb_pixels, pixels, draw_mask
e1f6de8f 4300 vst1.u16 { fb_pixels }, [fb_ptr]
75e28f62 4301
/* load the next block */
e1f6de8f 4302 ldr fb_ptr, [pixel_ptr, #28]
4303 vld1.u16 { fb_pixels }, [fb_ptr]
4304 vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
75e28f62 4305 vclt.s16 write_mask, fb_pixels, #0
e1f6de8f 4306 vld1.u32 { pixels }, [pixel_ptr, :128], c_64
75e28f62
E
4307
4308 subs num_blocks, num_blocks, #1
4309 bne 0b
4310
/* final block: same merge and store as the loop body */
4311 1:
134f81ec 4312 vorr.u16 pixels, pixels, msb_mask
75e28f62
E
4313 vorr.u16 draw_mask, draw_mask, write_mask
4314 vbif.u16 fb_pixels, pixels, draw_mask
e1f6de8f 4315 vst1.u16 { fb_pixels }, [fb_ptr]
75e28f62
E
4316
4317 ldmia sp!, { r4, pc }
4318
4319
/*
 * blend_blocks_textured_unblended_off: returns immediately without
 * reading or modifying any state.
 */
4320function(blend_blocks_textured_unblended_off)
4321 bx lr
4322
4323
/*
 * warmup: r0 = iteration count, r1 = start address (used with a 128-bit
 * alignment hint). Returns at once when r0 is zero; otherwise performs r0
 * NEON loads from [r1], advancing r1 by 64 bytes after each one. The loaded
 * values are not used inside this function.
 * Clobbers r0, r1, r3, the flags, and the u_whole_8 / v_whole_8 registers.
 */
4324function(warmup)
4325 mov r3, #64
4326 cmp r0, #0
4327 bxeq lr
4328
4329 0:
e1f6de8f 4330 vld1.u32 { u_whole_8, v_whole_8 }, [r1, :128], r3
75e28f62
E
4331
4332 subs r0, r0, #1
4333 bne 0b
4334
4335 bx lr
4336
6c4a10c4 4337#undef vram_ptr
75e28f62 4338#undef color
6c4a10c4 4339#undef width
75e28f62 4340#undef height
6c4a10c4 4341#undef pitch
75e28f62
E
4342
4343#define vram_ptr r0
6c4a10c4
E
4344#define color r1
4345#define width r2
4346#define height r3
75e28f62 4347
6c4a10c4 4348#define pitch r1
75e28f62 4349
6c4a10c4 4350#define num_width r12
75e28f62 4351
87c45ad1
E
4352#undef colors_a
4353#undef colors_b
75e28f62 4354
87c45ad1
E
4355#define colors_a q0
4356#define colors_b q1
75e28f62
E
4357
4358.align 3
4359
/*
 * render_block_fill_body: fills a width x height rectangle of 16-bit
 * pixels with a single color.
 *
 * In (per the #defines above): vram_ptr = r0, color = r1, width = r2,
 * height = r3. pitch shares r1 with color, so color is broadcast into the
 * vector registers before r1 is reused.
 *
 * Each inner iteration stores 32 bytes (16 pixels) with a 256-bit
 * alignment hint and subtracts 16 from the column counter; rows are 2048
 * bytes apart.
 * NOTE(review): both loops exit only when their counter reaches exactly
 * zero, so presumably width is a non-zero multiple of 16 and height is
 * non-zero - confirm against callers.
 */
4360function(render_block_fill_body)
87c45ad1 4361 vdup.u16 colors_a, color
6c4a10c4 4362 mov pitch, #2048
75e28f62 4363
87c45ad1 4364 vmov colors_b, colors_a
/* pitch = 2048 - width * 2: bytes from the end of one row to the next row */
75e28f62 4365 sub pitch, pitch, width, lsl #1
75e28f62 4366
6c4a10c4 4367 mov num_width, width
75e28f62 4368
/* label 0 is shared by the inner (column) and outer (row) loops */
6c4a10c4 4369 0:
e1f6de8f 4370 vst1.u32 { colors_a, colors_b }, [vram_ptr, :256]!
75e28f62 4371
d1c75d1e 4372 subs num_width, num_width, #16
6c4a10c4 4373 bne 0b
75e28f62 4374
/* end of row: step to the next row and reset the column counter */
75e28f62 4375 add vram_ptr, vram_ptr, pitch
6c4a10c4
E
4376 mov num_width, width
4377
75e28f62
E
4378 subs height, height, #1
4379 bne 0b
75e28f62 4380
6c4a10c4
E
4381 bx lr
4382
75e28f62
E
4383
4384#undef x
4385#undef y
4386#undef width
4387#undef height
4388#undef fb_ptr
4389#undef texture_mask
4390#undef num_blocks
4391#undef temp
4392#undef dirty_textures_mask
4393#undef clut_ptr
4394#undef current_texture_mask
4395
4396#define psx_gpu r0
4397#define x r1
4398#define y r2
4399#define u r3
4400#define v r4
4401#define width r5
4402#define height r6
4403#define offset_u r8
4404#define offset_v r9
4405#define offset_u_right r10
4406#define width_rounded r11
4407#define height_rounded r12
4408
4409#define texture_offset_base r1
4410#define tile_width r2
4411#define tile_height r3
4412#define num_blocks r4
4413#define block r5
4414#define sub_tile_height r6
4415#define fb_ptr r7
4416#define texture_mask r8
4417#define column_data r9
4418#define texture_offset r10
4419#define tiles_remaining r11
4420#define fb_ptr_advance_column r12
4421#define texture_block_ptr r14
4422
8184d7c5 4423#define temp r14
4424
75e28f62
E
4425#define texture_page_ptr r3
4426#define left_block_mask r4
4427#define right_block_mask r5
4428#define texture_mask_rev r10
4429#define control_mask r11
4430
4431#define dirty_textures_mask r4
4432#define clut_ptr r5
4433#define current_texture_mask r6
4434
4435
4436#undef texels
4437#undef clut_low_a
4438#undef clut_low_b
4439#undef clut_high_a
4440#undef clut_high_b
4441#undef clut_a
4442#undef clut_b
4443#undef texels_low
4444#undef texels_high
4445
4446#define texels d0
4447#define draw_masks_fb_ptrs q1
4448
4449#define draw_mask_fb_ptr_left d2
4450#define draw_mask_fb_ptr_right d3
4451
59d15d23 4452#define draw_mask_fb_ptr_left_a d2
4453#define draw_mask_fb_ptr_left_b d3
4454#define draw_mask_fb_ptr_right_a d10
4455#define draw_mask_fb_ptr_right_b d11
4456#define draw_masks_fb_ptrs2 q5
4457
75e28f62
E
4458#define clut_low_a d4
4459#define clut_low_b d5
4460#define clut_high_a d6
4461#define clut_high_b d7
4462
4463#define block_masks d8
4464#define block_masks_shifted d9
4465
4466#define clut_a q2
4467#define clut_b q3
4468
59d15d23 4469#define texels_low d12
4470#define texels_high d13
75e28f62 4471
59d15d23 4472#define texels_wide_low d14
4473#define texels_wide_high d15
4474#define texels_wide q7
75e28f62
E
4475
4476
/*
 * setup_sprite_flush_blocks: calls flush_render_block_buffer while
 * preserving q1-q5, r0-r3, EXTRA_UNSAVED_REGS, r12 and r14 across the
 * call, then resets `block` to the start of the block array
 * (psx_gpu + psx_gpu_blocks_offset) and returns through lr.
 */
59d15d23 4477setup_sprite_flush_blocks:
4478 vpush { q1 - q5 }
75e28f62 4479
4d646738 4480 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 4481 bl flush_render_block_buffer
4d646738 4482 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 4483
59d15d23 4484 vpop { q1 - q5 }
75e28f62
E
4485
/* the buffer was flushed, so new blocks start again at the array base */
4486 add block, psx_gpu, #psx_gpu_blocks_offset
75e28f62
E
4487 bx lr
4488
4489
/*
 * Wrapper that calls update_texture_4bpp_cache with r0-r3 preserved.
 * The saved return address is popped straight into pc to return.
 */
4490setup_sprite_update_texture_4bpp_cache:
4491 stmdb sp!, { r0 - r3, r14 }
4492 bl update_texture_4bpp_cache
4493 ldmia sp!, { r0 - r3, pc }
4494
4495
/*
 * Wrapper that calls update_texture_8bpp_cache with r0-r3 and
 * EXTRA_UNSAVED_REGS preserved. The saved return address is popped
 * straight into pc to return.
 */
4496setup_sprite_update_texture_8bpp_cache:
4d646738 4497 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 }
75e28f62 4498 bl update_texture_8bpp_cache
4d646738 4499 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc }
75e28f62
E
4500
4501
/*
 * setup_sprite_tiled_initialize_4bpp(): loads 32 bytes from clut_ptr into
 * clut_a/clut_b and de-interleaves them by byte (vuzp.u8), so clut_a ends
 * up with the even-numbered bytes and clut_b with the odd-numbered bytes.
 * Also calls setup_sprite_update_texture_4bpp_cache when
 * current_texture_mask and the 4bpp dirty mask share any set bit.
 * The flags from tst are still valid at blne: only a NEON instruction
 * sits in between.
 */
4502#define setup_sprite_tiled_initialize_4bpp() \
4503 ldr dirty_textures_mask, \
e1f6de8f 4504 [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]; \
4505 ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]; \
75e28f62 4506 \
e1f6de8f 4507 ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]; \
4508 vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]; \
75e28f62
E
4509 \
4510 tst current_texture_mask, dirty_textures_mask; \
4511 vuzp.u8 clut_a, clut_b; \
4512 \
4513 blne setup_sprite_update_texture_4bpp_cache \
4514
/*
 * setup_sprite_tiled_initialize_8bpp(): calls
 * setup_sprite_update_texture_8bpp_cache when current_texture_mask and
 * the 8bpp dirty mask share any set bit; otherwise does nothing further.
 */
4515#define setup_sprite_tiled_initialize_8bpp() \
4516 ldr dirty_textures_mask, \
e1f6de8f 4517 [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset]; \
4518 ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]; \
75e28f62
E
4519 \
4520 tst current_texture_mask, dirty_textures_mask; \
4521 blne setup_sprite_update_texture_8bpp_cache \
4522
4523
75e28f62
E
4524#define setup_sprite_block_count_single() \
4525 sub_tile_height \
4526
4527#define setup_sprite_block_count_double() \
4528 sub_tile_height, lsl #1 \
4529
/*
 * setup_sprite_tile_add_blocks(type): reserves room for the blocks of the
 * next tile. `type` selects setup_sprite_block_count_<type>(), which
 * expands to the operand giving the number of blocks to add.
 * If the running total would exceed MAX_BLOCKS (signed greater-than), the
 * total is restarted at just this tile's count and
 * setup_sprite_flush_blocks is called to flush what was queued.
 */
4530#define setup_sprite_tile_add_blocks(type) \
4531 add num_blocks, num_blocks, setup_sprite_block_count_##type(); \
4532 cmp num_blocks, #MAX_BLOCKS; \
4533 \
59d15d23 4534 movgt num_blocks, setup_sprite_block_count_##type(); \
4535 blgt setup_sprite_flush_blocks \
75e28f62
E
4536
4537
4538#define setup_sprite_tile_full_4bpp(edge) \
4539 setup_sprite_tile_add_blocks(double); \
4540 \
4541 4: \
4542 and texture_block_ptr, texture_offset, texture_mask; \
4543 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4544 \
e1f6de8f 4545 pld [fb_ptr]; \
75e28f62 4546 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 4547 vld1.u32 { texels }, [texture_block_ptr, :64]; \
75e28f62
E
4548 \
4549 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4550 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4551 \
e1f6de8f 4552 vst2.u8 { texels_low, texels_high }, [block, :128]; \
75e28f62
E
4553 add texture_block_ptr, texture_offset, #8; \
4554 \
4555 and texture_block_ptr, texture_block_ptr, texture_mask; \
4556 add block, block, #40; \
4557 \
4558 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4559 add fb_ptr, fb_ptr, #16; \
4560 \
e1f6de8f 4561 vst1.u32 { draw_mask_fb_ptr_left }, [block, :64]; \
75e28f62
E
4562 add block, block, #24; \
4563 \
e1f6de8f 4564 vld1.u32 { texels }, [texture_block_ptr, :64]; \
75e28f62
E
4565 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4566 \
e1f6de8f 4567 pld [fb_ptr]; \
75e28f62
E
4568 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
4569 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4570 \
e1f6de8f 4571 vst2.u8 { texels_low, texels_high }, [block, :128]; \
75e28f62
E
4572 add block, block, #40; \
4573 \
4574 add texture_offset, texture_offset, #0x10; \
4575 add fb_ptr, fb_ptr, #(2048 - 16); \
4576 \
e1f6de8f 4577 vst1.u32 { draw_mask_fb_ptr_right }, [block, :64]; \
75e28f62
E
4578 add block, block, #24; \
4579 \
4580 subs sub_tile_height, sub_tile_height, #1; \
4581 bne 4b; \
4582 \
4583 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 4584 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
75e28f62
E
4585
4586
/*
 * setup_sprite_tile_half_4bpp(edge): queues sub_tile_height blocks for a
 * half-width tile column of a 4bpp texture, one block per row.
 *
 * Per row: 8 texel bytes are loaded from
 * texture_page_ptr + (texture_offset & texture_mask), each byte is looked
 * up in the low-byte and high-byte CLUT tables (vtbl), and the two results
 * are interleaved (vst2.u8) into 8 16-bit pixels at `block`. fb_ptr is
 * placed in lane 1 of draw_mask_fb_ptr_<edge> and that register is stored
 * at block + 40. Each block record is 64 bytes (40 + 24); texture_offset
 * advances by 0x10 and fb_ptr by 2048 bytes per row.
 *
 * After the loop texture_offset is advanced by a further 0xF00 and the
 * updated num_blocks is written back to psx_gpu.
 *
 * The original recomputed texture_block_ptr a second time after the vst2;
 * that value was never read (label 4 rebuilds it from texture_offset), so
 * the dead add has been dropped. No flags or live registers are affected.
 */
#define setup_sprite_tile_half_4bpp(edge) \
  setup_sprite_tile_add_blocks(single); \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
  \
  pld [fb_ptr]; \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vst2.u8 { texels_low, texels_high }, [block, :128]; \
  add block, block, #40; \
  \
  vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64]; \
  \
  add block, block, #24; \
  add texture_offset, texture_offset, #0x10; \
  \
  add fb_ptr, fb_ptr, #2048; \
  subs sub_tile_height, sub_tile_height, #1; \
  \
  bne 4b; \
  \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
4617
4618
4619#define setup_sprite_tile_full_8bpp(edge) \
4620 setup_sprite_tile_add_blocks(double); \
4621 add block, block, #16; \
4622 \
4623 4: \
4624 and texture_block_ptr, texture_offset, texture_mask; \
4625 vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
4626 \
e1f6de8f 4627 pld [fb_ptr]; \
75e28f62 4628 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 4629 vld1.u32 { texels }, [texture_block_ptr, :64]; \
75e28f62
E
4630 \
4631 add texture_block_ptr, texture_offset, #8; \
e1f6de8f 4632 vst1.u32 { texels }, [block, :64]; \
75e28f62
E
4633 \
4634 and texture_block_ptr, texture_block_ptr, texture_mask; \
4635 add block, block, #24; \
4636 \
4637 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
4638 \
4639 add fb_ptr, fb_ptr, #16; \
e1f6de8f 4640 vst1.u32 { draw_mask_fb_ptr_left }, [block, :64]; \
75e28f62
E
4641 \
4642 add block, block, #40; \
e1f6de8f 4643 vld1.u32 { texels }, [texture_block_ptr, :64]; \
4644 pld [fb_ptr]; \
75e28f62
E
4645 \
4646 vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
e1f6de8f 4647 vst1.u32 { texels }, [block, :64]; \
75e28f62
E
4648 add block, block, #24; \
4649 \
4650 add texture_offset, texture_offset, #0x10; \
4651 add fb_ptr, fb_ptr, #(2048 - 16); \
4652 \
e1f6de8f 4653 vst1.u32 { draw_mask_fb_ptr_right }, [block, :64]; \
75e28f62
E
4654 add block, block, #40; \
4655 \
4656 subs sub_tile_height, sub_tile_height, #1; \
4657 bne 4b; \
4658 \
4659 sub block, block, #16; \
4660 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 4661 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
75e28f62
E
4662
4663
4664#define setup_sprite_tile_half_8bpp(edge) \
4665 setup_sprite_tile_add_blocks(single); \
4666 add block, block, #16; \
4667 \
4668 4: \
4669 and texture_block_ptr, texture_offset, texture_mask; \
4670 vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
e1f6de8f 4671 pld [fb_ptr]; \
75e28f62
E
4672 \
4673 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 4674 vld1.u32 { texels }, [texture_block_ptr, :64]; \
75e28f62 4675 \
e1f6de8f 4676 vst1.u32 { texels }, [block, :64]; \
75e28f62
E
4677 add block, block, #24; \
4678 \
e1f6de8f 4679 vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64]; \
75e28f62
E
4680 add block, block, #40; \
4681 \
4682 add texture_offset, texture_offset, #0x10; \
4683 add fb_ptr, fb_ptr, #2048; \
4684 \
4685 subs sub_tile_height, sub_tile_height, #1; \
4686 bne 4b; \
4687 \
4688 sub block, block, #16; \
4689 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 4690 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
75e28f62
E
4691
4692
4693#define setup_sprite_tile_column_edge_pre_adjust_half_right() \
4694 add texture_offset, texture_offset_base, #8; \
4695 add fb_ptr, fb_ptr, #16 \
4696
4697#define setup_sprite_tile_column_edge_pre_adjust_half_left() \
4698 mov texture_offset, texture_offset_base \
4699
4700#define setup_sprite_tile_column_edge_pre_adjust_half(edge) \
4701 setup_sprite_tile_column_edge_pre_adjust_half_##edge() \
4702
4703#define setup_sprite_tile_column_edge_pre_adjust_full(edge) \
4704 mov texture_offset, texture_offset_base \
4705
4706#define setup_sprite_tile_column_edge_post_adjust_half_right() \
4707 sub fb_ptr, fb_ptr, #16 \
4708
4709#define setup_sprite_tile_column_edge_post_adjust_half_left() \
4710
4711#define setup_sprite_tile_column_edge_post_adjust_half(edge) \
4712 setup_sprite_tile_column_edge_post_adjust_half_##edge() \
4713
4714#define setup_sprite_tile_column_edge_post_adjust_full(edge) \
4715
4716
/*
 * setup_sprite_tile_column_height_single: emits one tile column that fits
 * in a single tile vertically. column_data is used directly as the row
 * count. The tile macro invoked is
 * setup_sprite_tile_<edge_mode>_<texture_mode><x4mode>, bracketed by the
 * matching edge pre/post adjust macros.
 */
59d15d23 4717#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode, \
4718 x4mode) \
75e28f62 4719 mov sub_tile_height, column_data; \
59d15d23 4720 setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
4721 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
4722 setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge) \
75e28f62 4723
/*
 * setup_sprite_tile_column_height_multi: emits one tile column spanning
 * several tiles vertically. column_data is unpacked as:
 *   bits  0-7  : row count of the first tile
 *   bits  8-15 : row count of the last tile
 *   bits 16+   : count held in tiles_remaining
 * The first tile is emitted, then tiles_remaining - 1 tiles of 16 rows
 * (loop at label 3, skipped when the decrement reaches zero), then the
 * last tile at label 2. The edge pre/post adjust macros run once around
 * the whole column.
 */
59d15d23 4724#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode, \
4725 x4mode) \
75e28f62
E
4726 and sub_tile_height, column_data, #0xFF; \
4727 mov tiles_remaining, column_data, lsr #16; \
59d15d23 4728 setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
4729 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
75e28f62
E
4730 \
4731 subs tiles_remaining, tiles_remaining, #1; \
4732 beq 2f; \
4733 \
4734 3: \
4735 mov sub_tile_height, #16; \
59d15d23 4736 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
75e28f62
E
4737 subs tiles_remaining, tiles_remaining, #1; \
4738 bne 3b; \
4739 \
4740 2: \
/* extract bits 8-15 of column_data: rotate right by 8, zero-extend a byte */ \
4741 uxtb sub_tile_height, column_data, ror #8; \
59d15d23 4742 setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
4743 setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge) \
75e28f62
E
4744
4745
4746#define setup_sprite_column_data_single() \
4747 mov column_data, height; \
e1f6de8f 4748 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset] \
75e28f62
E
4749
/*
 * setup_sprite_column_data_multi(): packs the per-column row counts into
 * column_data and loads texture_page_ptr from psx_gpu.
 *   bits  0-7  : 16 - offset_v
 *   bits  8-15 : (height_rounded & 0xF) + 1
 *   bits 16+   : tile_height - 1
 * height_rounded and tile_height are modified in place.
 */
4750#define setup_sprite_column_data_multi() \
4751 and height_rounded, height_rounded, #0xF; \
4752 rsb column_data, offset_v, #16; \
4753 \
4754 add height_rounded, height_rounded, #1; \
4755 sub tile_height, tile_height, #1; \
4756 \
4757 orr column_data, column_data, tile_height, lsl #16; \
e1f6de8f 4758 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]; \
75e28f62
E
4759 \
4760 orr column_data, column_data, height_rounded, lsl #8 \
4761
59d15d23 4762#define setup_sprite_setup_left_draw_mask_fb_ptr() \
4763 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4764 vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
4765
4766#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column() \
4767 mov fb_ptr_advance_column, #32; \
4768 vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
4769 \
ed0fd81d 4770 sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11; \
59d15d23 4771 vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
4772
4773#define setup_sprite_setup_right_draw_mask_fb_ptr() \
4774 vdup.u8 draw_mask_fb_ptr_left, block_masks[4]; \
4775 vdup.u8 draw_mask_fb_ptr_right, block_masks[5] \
4776
4777#define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, \
4778 edge, x4mode) \
4779 setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode: \
75e28f62
E
4780 setup_sprite_column_data_##multi_height(); \
4781 vext.32 block_masks_shifted, block_masks, block_masks, #1; \
4782 vorr.u32 block_masks, block_masks, block_masks_shifted; \
59d15d23 4783 setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \
75e28f62 4784 \
59d15d23 4785 setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
75e28f62
E
4786 ldmia sp!, { r4 - r11, pc } \
4787
/*
 * setup_sprite_tiled_advance_column(): adds 0x100 to texture_offset_base.
 * If bits 8-11 (mask 0xF00) are all zero afterwards, 0x1000
 * (0x100 + 0xF00) is subtracted again.
 */
4788#define setup_sprite_tiled_advance_column() \
4789 add texture_offset_base, texture_offset_base, #0x100; \
4790 tst texture_offset_base, #0xF00; \
4791 subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00) \
4792
4793#define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, \
59d15d23 4794 right_mode, x4mode) \
4795 setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
75e28f62 4796 setup_sprite_column_data_##multi_height(); \
75e28f62 4797 \
59d15d23 4798 setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode(); \
75e28f62 4799 \
59d15d23 4800 setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
75e28f62
E
4801 \
4802 subs tile_width, tile_width, #2; \
4803 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4804 \
75e28f62
E
4805 beq 1f; \
4806 \
59d15d23 4807 vmov.u8 draw_masks_fb_ptrs, #0; \
4808 vmov.u8 draw_masks_fb_ptrs2, #0; \
4809 \
75e28f62
E
4810 0: \
4811 setup_sprite_tiled_advance_column(); \
59d15d23 4812 setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode); \
75e28f62
E
4813 add fb_ptr, fb_ptr, fb_ptr_advance_column; \
4814 subs tile_width, tile_width, #1; \
4815 bne 0b; \
4816 \
4817 1: \
59d15d23 4818 setup_sprite_setup_right_draw_mask_fb_ptr##x4mode(); \
75e28f62
E
4819 \
4820 setup_sprite_tiled_advance_column(); \
59d15d23 4821 setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
75e28f62
E
4822 ldmia sp!, { r4 - r11, pc } \
4823
4824
59d15d23 4825#define setup_sprite_offset_u_adjust() \
4826
4827#define setup_sprite_get_left_block_mask() \
4828 and left_block_mask, left_block_mask, #0xFF \
4829
4830#define setup_sprite_compare_left_block_mask() \
4831 cmp left_block_mask, #0xFF \
4832
4833#define setup_sprite_get_right_block_mask() \
4834 uxtb right_block_mask, right_block_mask, ror #8 \
4835
4836#define setup_sprite_compare_right_block_mask() \
4837 cmp right_block_mask, #0xFF \
4838
4839
4840
4841/* 4x stuff */
4842#define fb_ptr2 column_data
4843
4844#define setup_sprite_offset_u_adjust_4x() \
4845 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
4846 lsl offset_u_right, #1; \
4847 lsl offset_u, #1; \
4848 add offset_u_right, #1 \
4849
4850#define setup_sprite_get_left_block_mask_4x() \
4851 sxth left_block_mask, left_block_mask \
4852
4853#define setup_sprite_compare_left_block_mask_4x() \
4854 cmp left_block_mask, #0xFFFFFFFF \
4855
4856#define setup_sprite_get_right_block_mask_4x() \
4857 sxth right_block_mask, right_block_mask, ror #16 \
4858
4859#define setup_sprite_compare_right_block_mask_4x() \
4860 cmp right_block_mask, #0xFFFFFFFF \
4861
4862
/*
 * widen_texels_16bpp(texels_): duplicates every 16-bit element of texels_
 * side by side. Both halves of texels_wide are set to texels_ and then
 * zipped, so texels_wide_low holds elements 0,0,1,1 and texels_wide_high
 * holds 2,2,3,3.
 */
4863#define widen_texels_16bpp(texels_) \
4864 vmov texels_wide_low, texels_; \
4865 vmov texels_wide_high, texels_; \
4866 vzip.16 texels_wide_low, texels_wide_high \
4867
/*
 * widen_texels_8bpp(texels_): duplicates every byte of texels_ side by
 * side. Both halves of texels_wide are set to texels_ and then zipped, so
 * texels_wide_low holds bytes 0,0,1,1,2,2,3,3 and texels_wide_high holds
 * bytes 4,4,5,5,6,6,7,7.
 */
4868#define widen_texels_8bpp(texels_) \
4869 vmov texels_wide_low, texels_; \
4870 vmov texels_wide_high, texels_; \
4871 vzip.8 texels_wide_low, texels_wide_high \
4872
/*
 * write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_): writes
 * one block record. texels_ is stored at block_ (128-bit alignment hint).
 * fb_ptr_ is placed in lane 1 of draw_mask_fb_ptr_, which is then stored
 * at block_ + 40. block_ advances by 64 bytes in total (40 + 24).
 */
4873#define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
e1f6de8f 4874 vst1.u32 { texels_ }, [block_, :128]; \
59d15d23 4875 add block_, block_, #40; \
4876 \
4877 vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
e1f6de8f 4878 vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64]; \
59d15d23 4879 add block_, block_, #24 \
4880
/*
 * write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_): writes
 * one block record for 8bpp texels. texels_ is stored at block_ (64-bit
 * alignment hint). fb_ptr_ is placed in lane 1 of draw_mask_fb_ptr_, which
 * is then stored at block_ + 24. block_ advances by 64 bytes in total
 * (24 + 40).
 */
4881/* assumes 16-byte offset already added to block_ */
4882#define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
e1f6de8f 4883 vst1.u32 { texels_ }, [block_, :64]; \
59d15d23 4884 add block_, block_, #24; \
4885 \
4886 vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
e1f6de8f 4887 vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64]; \
59d15d23 4888 add block_, block_, #40 \
4889
/*
 * do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,
 * draw_mask_fb_ptr_b_): emits four block records from texels_low and
 * texels_high, each widened by widen_texels_16bpp. Destinations:
 *   widened texels_low  -> fb_ptr            and fb_ptr + 2048
 *   widened texels_high -> fb_ptr + 16       and fb_ptr + 16 + 2048
 * The first pair uses draw_mask_fb_ptr_a_, the second draw_mask_fb_ptr_b_.
 * fb_ptr_tmp is a scratch register; fb_ptr itself is not modified.
 */
4890#define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
4891 draw_mask_fb_ptr_b_) \
4892 widen_texels_16bpp(texels_low); \
4893 add fb_ptr_tmp, fb_ptr, #1024*2; \
4894 \
4895 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr); \
4896 \
4897 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
4898 widen_texels_16bpp(texels_high); \
4899 \
4900 add fb_ptr_tmp, fb_ptr, #8*2; \
4901 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
4902 \
4903 add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
4904 write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp) \
4905
/*
 * do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,
 * draw_mask_fb_ptr_b_): widens `texels` once with widen_texels_8bpp and
 * emits four block records. Destinations:
 *   texels_wide_low  -> fb_ptr            and fb_ptr + 2048
 *   texels_wide_high -> fb_ptr + 16       and fb_ptr + 16 + 2048
 * The first pair uses draw_mask_fb_ptr_a_, the second draw_mask_fb_ptr_b_.
 * fb_ptr_tmp is a scratch register; fb_ptr itself is not modified.
 */
4906#define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
4907 draw_mask_fb_ptr_b_) \
4908 widen_texels_8bpp(texels); \
4909 add fb_ptr_tmp, fb_ptr, #1024*2; \
4910 \
4911 write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr); \
4912 write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
4913 \
4914 add fb_ptr_tmp, fb_ptr, #8*2; \
4915 write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
4916 \
4917 add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
4918 write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp) \
4919
4920
4921#define setup_sprite_tiled_initialize_4bpp_4x() \
e1f6de8f 4922 ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]; \
4923 vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]; \
59d15d23 4924 \
4925 vuzp.u8 clut_a, clut_b \
4926
4927#define setup_sprite_tiled_initialize_8bpp_4x() \
4928
4929
4930#define setup_sprite_block_count_single_4x() \
4931 sub_tile_height, lsl #2 \
4932
4933#define setup_sprite_block_count_double_4x() \
4934 sub_tile_height, lsl #(1+2) \
4935
4936#define setup_sprite_tile_full_4bpp_4x(edge) \
4937 setup_sprite_tile_add_blocks(double_4x); \
4938 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
4939 \
4940 4: \
4941 and texture_block_ptr, texture_offset, texture_mask; \
e1f6de8f 4942 pld [fb_ptr]; \
59d15d23 4943 \
4944 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 4945 vld1.u32 { texels }, [texture_block_ptr, :64]; \
59d15d23 4946 \
4947 add texture_block_ptr, texture_offset, #8; \
4948 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
4949 \
4950 and texture_block_ptr, texture_block_ptr, texture_mask; \
4951 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4952 \
4953 vzip.8 texels_low, texels_high; \
4954 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
4955 draw_mask_fb_ptr_left_b); \
4956 \
4957 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 4958 pld [fb_ptr, #2048]; \
59d15d23 4959 \
e1f6de8f 4960 vld1.u32 { texels }, [texture_block_ptr, :64]; \
8438c3c7 4961 add fb_ptr, fb_ptr, #16*2; \
59d15d23 4962 \
8438c3c7 4963 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
59d15d23 4964 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
4965 \
4966 vzip.8 texels_low, texels_high; \
4967 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
4968 draw_mask_fb_ptr_right_b); \
4969 \
4970 add texture_offset, texture_offset, #0x10; \
4971 add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
4972 \
4973 subs sub_tile_height, sub_tile_height, #1; \
4974 bne 4b; \
4975 \
4976 ldr column_data, [sp], #8; /* fb_ptr2 */ \
4977 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 4978 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
59d15d23 4979
4980
/*
 * setup_sprite_tile_half_4bpp_4x(edge): 4x variant of the half-width 4bpp
 * tile column. Each texture row of 8 texels is converted through the CLUT
 * (vtbl on the low-byte and high-byte tables, then vzip.8 to rebuild
 * 16-bit pixels) and handed to do_texture_block_16bpp_4x, which emits four
 * block records for it. fb_ptr advances by 2 * 2048 bytes per texture row
 * and texture_offset by 0x10.
 *
 * fb_ptr2 is an alias of column_data and is used as scratch by
 * do_texture_block_16bpp_4x, so column_data is saved on the stack
 * (8-byte slot) for the duration of the loop and restored afterwards.
 *
 * After the loop texture_offset is advanced by a further 0xF00 and the
 * updated num_blocks is written back to psx_gpu.
 *
 * The original recomputed texture_block_ptr a second time right after the
 * texel load; that value was never read (label 4 rebuilds it from
 * texture_offset), so the dead add has been dropped. No flags or live
 * registers are affected.
 */
#define setup_sprite_tile_half_4bpp_4x(edge) \
  setup_sprite_tile_add_blocks(single_4x); \
  str column_data, [sp, #-8]!; /* fb_ptr2 */ \
  \
 4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  pld [fb_ptr]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  add texture_offset, texture_offset, #0x10; \
  \
  vzip.8 texels_low, texels_high; \
  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
    draw_mask_fb_ptr_##edge##_b); \
  \
  pld [fb_ptr, #2048]; \
  add fb_ptr, fb_ptr, #2048 * 2; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  ldr column_data, [sp], #8; /* fb_ptr2 */ \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
59d15d23 5011
5012
5013#define setup_sprite_tile_full_8bpp_4x(edge) \
5014 setup_sprite_tile_add_blocks(double_4x); \
5015 add block, block, #16; \
5016 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
5017 \
5018 4: \
5019 and texture_block_ptr, texture_offset, texture_mask; \
e1f6de8f 5020 pld [fb_ptr]; \
59d15d23 5021 \
5022 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 5023 vld1.u32 { texels }, [texture_block_ptr, :64]; \
59d15d23 5024 \
5025 add texture_block_ptr, texture_offset, #8; \
5026 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
5027 draw_mask_fb_ptr_left_b); \
5028 \
e1f6de8f 5029 pld [fb_ptr, #2048]; \
59d15d23 5030 and texture_block_ptr, texture_block_ptr, texture_mask; \
5031 \
5032 add fb_ptr, fb_ptr, #16*2; \
5033 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
5034 \
e1f6de8f 5035 vld1.u32 { texels }, [texture_block_ptr, :64]; \
59d15d23 5036 \
5037 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
5038 draw_mask_fb_ptr_right_b); \
5039 \
5040 add texture_offset, texture_offset, #0x10; \
5041 add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
5042 \
5043 subs sub_tile_height, sub_tile_height, #1; \
5044 bne 4b; \
5045 \
5046 sub block, block, #16; \
5047 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5048 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 5049 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
59d15d23 5050
5051
// setup_sprite_tile_half_8bpp_4x(edge): 8bpp, 4x path, half-width tile.
// Same frame as the full variant (block biased by +16, column_data spilled to
// the stack slot noted as fb_ptr2) but each pass of the 4: loop performs a
// single texel load from texture_page_ptr + (texture_offset & texture_mask)
// and a single do_texture_block_8bpp_4x call. The mask pair is selected by
// token pasting: draw_mask_fb_ptr_<edge>_a / draw_mask_fb_ptr_<edge>_b.
// Per pass: texture_offset += 0x10, fb_ptr += 2048*2. On exit texture_offset
// is bumped by 0xF00 and num_blocks is written back to psx_gpu.
5052#define setup_sprite_tile_half_8bpp_4x(edge) \
5053 setup_sprite_tile_add_blocks(single_4x); \
5054 add block, block, #16; \
5055 str column_data, [sp, #-8]!; /* fb_ptr2 */ \
5056 \
5057 4: \
5058 and texture_block_ptr, texture_offset, texture_mask; \
e1f6de8f 5059 pld [fb_ptr]; \
59d15d23 5060 \
5061 add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
e1f6de8f 5062 vld1.u32 { texels }, [texture_block_ptr, :64]; \
59d15d23 5063 \
e1f6de8f 5064 pld [fb_ptr, #2048]; \
59d15d23 5065 do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
5066 draw_mask_fb_ptr_##edge##_b); \
5067 \
5068 add texture_offset, texture_offset, #0x10; \
5069 add fb_ptr, fb_ptr, #2048 * 2; \
5070 \
5071 subs sub_tile_height, sub_tile_height, #1; \
5072 bne 4b; \
5073 \
5074 sub block, block, #16; \
5075 ldr column_data, [sp], #8; /* fb_ptr2 */ \
5076 add texture_offset, texture_offset, #0xF00; \
e1f6de8f 5077 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset] \
59d15d23 5078
5079
// Column-edge adjust helpers for the 4x sprite path.
//   pre_adjust_half_right : texture_offset = texture_offset_base + 8 and
//                           fb_ptr += 16*2
//   pre_adjust_half_left  : texture_offset = texture_offset_base
//   pre_adjust_half(edge) : dispatches to the _right / _left form by pasting
//                           the edge token
//   pre_adjust_full(edge) : texture_offset = texture_offset_base (edge unused)
//   post_adjust_half_right: fb_ptr -= 16*2, undoing the pre-adjust shift
//   post_adjust_half_left / post_adjust_full: expand to nothing
5080#define setup_sprite_tile_column_edge_pre_adjust_half_right_4x() \
5081 add texture_offset, texture_offset_base, #8; \
5082 add fb_ptr, fb_ptr, #16 * 2 \
5083
5084#define setup_sprite_tile_column_edge_pre_adjust_half_left_4x() \
5085 mov texture_offset, texture_offset_base \
5086
5087#define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge) \
5088 setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x() \
5089
5090#define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge) \
5091 mov texture_offset, texture_offset_base \
5092
5093#define setup_sprite_tile_column_edge_post_adjust_half_right_4x() \
5094 sub fb_ptr, fb_ptr, #16 * 2 \
5095
5096#define setup_sprite_tile_column_edge_post_adjust_half_left_4x() \
5097
5098#define setup_sprite_tile_column_edge_post_adjust_half_4x(edge) \
5099 setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x() \
5100
5101#define setup_sprite_tile_column_edge_post_adjust_full_4x(edge) \
5102
5103
// Draw-mask setup helpers for the 4x sprite path. Each one broadcasts single
// byte lanes of block_masks into the four draw_mask_fb_ptr_{left,right}_{a,b}
// registers with vdup.u8:
//   left  variants use byte lanes 0..3 of block_masks,
//   right variant  uses byte lanes 4..7.
// The _advance_column variant additionally computes
//   fb_ptr_advance_column = 32*2 - (height << (11 + 1)).
5104#define setup_sprite_setup_left_draw_mask_fb_ptr_4x() \
5105 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
5106 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
5107 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
5108 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
5109
5110#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x() \
5111 mov fb_ptr_advance_column, #32 * 2; \
5112 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
5113 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
ed0fd81d 5114 sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1; \
59d15d23 5115 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
5116 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
5117
5118#define setup_sprite_setup_right_draw_mask_fb_ptr_4x() \
5119 vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4]; \
5120 vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5]; \
5121 vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6]; \
5122 vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7] \
5123
5124
75e28f62
E
// setup_sprite_tiled_builder(texture_mode, x4mode)
//
// Expands to the 14 setup_sprite_tile_column_width_{multi,single} helper
// bodies for the given texture mode, followed by the entry point
// setup_sprite_<texture_mode><x4mode>.
//
// The entry point pushes r4-r11 and r14 (36 bytes), which is why the stack
// arguments listed below are read at [sp, #36], [sp, #40] and [sp, #44].
// It then:
//   - positions fb_ptr at vram_out_ptr + (y << 11) + (x << 1), backed up by
//     (u & 0xF) pixels so that work starts on a 16-texel tile boundary;
//   - computes tile_width / tile_height as the rounded-up counts of 16-texel
//     tiles covering (offset + size);
//   - packs texture_offset_base and texture_mask into the nibble layouts
//     spelled out by the inline notes (VH-UH-VL-00 and HH-WH-HL-WL);
//   - builds left / right block masks and moves them into block_masks;
//   - assembles control_mask and jumps through the table at label 9.
//
// control_mask bits as set below:
//   0x1  tile_width  == 1
//   0x2  tile_height == 1
//   0x4  EQ after the left block-mask compare macro
//   0x8  EQ after the right block-mask compare macro
// The table holds 15 words; the word for index 13 is zero and there is no
// word for index 15 (presumably those combinations cannot occur - not
// verifiable from this chunk).
//
// Arguments of the generated entry point:
5125// r0: psx_gpu
5126// r1: x
5127// r2: y
5128// r3: u
e1f6de8f 5129// [sp]: v
5130// [sp + 4]: width
5131// [sp + 8]: height
5132// [sp + 12]: color (unused)
75e28f62 5133
59d15d23 5134#define setup_sprite_tiled_builder(texture_mode, x4mode) \
5135 \
5136setup_sprite_tile_column_width_multi(texture_mode, multi, full, full, \
5137 x4mode); \
5138setup_sprite_tile_column_width_single(texture_mode, multi, full, none, \
5139 x4mode); \
5140setup_sprite_tile_column_width_multi(texture_mode, single, full, full, \
5141 x4mode); \
5142setup_sprite_tile_column_width_single(texture_mode, single, full, none, \
5143 x4mode); \
5144setup_sprite_tile_column_width_multi(texture_mode, multi, half, full, \
5145 x4mode); \
5146setup_sprite_tile_column_width_single(texture_mode, multi, half, right, \
5147 x4mode); \
5148setup_sprite_tile_column_width_multi(texture_mode, single, half, full, \
5149 x4mode); \
5150setup_sprite_tile_column_width_single(texture_mode, single, half, right, \
5151 x4mode); \
5152setup_sprite_tile_column_width_multi(texture_mode, multi, full, half, \
5153 x4mode); \
5154setup_sprite_tile_column_width_single(texture_mode, multi, half, left, \
5155 x4mode); \
5156setup_sprite_tile_column_width_multi(texture_mode, single, full, half, \
5157 x4mode); \
5158setup_sprite_tile_column_width_single(texture_mode, single, half, left, \
5159 x4mode); \
5160setup_sprite_tile_column_width_multi(texture_mode, multi, half, half, \
5161 x4mode); \
5162setup_sprite_tile_column_width_multi(texture_mode, single, half, half, \
5163 x4mode); \
75e28f62
E
5164 \
5165.align 4; \
5166 \
59d15d23 5167function(setup_sprite_##texture_mode##x4mode) \
75e28f62 5168 stmdb sp!, { r4 - r11, r14 }; \
59d15d23 5169 setup_sprite_tiled_initialize_##texture_mode##x4mode(); \
75e28f62 5170 \
e1f6de8f 5171 ldr v, [sp, #36]; \
75e28f62
E
5172 and offset_u, u, #0xF; \
5173 \
e1f6de8f 5174 ldr width, [sp, #40]; \
5175 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
75e28f62 5176 \
e1f6de8f 5177 ldr height, [sp, #44]; \
75e28f62
E
5178 add fb_ptr, fb_ptr, y, lsl #11; \
5179 \
5180 add fb_ptr, fb_ptr, x, lsl #1; \
5181 and offset_v, v, #0xF; \
5182 \
5183 sub fb_ptr, fb_ptr, offset_u, lsl #1; \
5184 add width_rounded, offset_u, width; \
5185 \
5186 add height_rounded, offset_v, height; \
5187 add width_rounded, width_rounded, #15; \
5188 \
5189 add height_rounded, height_rounded, #15; \
5190 mov tile_width, width_rounded, lsr #4; \
5191 \
5192 /* texture_offset_base = VH-VL-00-00 */\
5193 mov texture_offset_base, v, lsl #8; \
5194 and offset_u_right, width_rounded, #0xF; \
5195 \
5196 /* texture_offset_base = VH-UH-UL-00 */\
5197 bfi texture_offset_base, u, #4, #8; \
59d15d23 5198 mov right_block_mask, #0xFFFFFFFE; \
5199 \
5200 setup_sprite_offset_u_adjust##x4mode(); \
75e28f62
E
5201 \
5202 /* texture_offset_base = VH-UH-VL-00 */\
5203 bfi texture_offset_base, v, #4, #4; \
59d15d23 5204 mov left_block_mask, #0xFFFFFFFF; \
75e28f62
E
5205 \
5206 mov tile_height, height_rounded, lsr #4; \
5207 mvn left_block_mask, left_block_mask, lsl offset_u; \
5208 \
5209 /* texture_mask = HH-HL-WH-WL */\
e1f6de8f 5210 ldrh texture_mask, [psx_gpu, #psx_gpu_texture_mask_width_offset]; \
75e28f62
E
5211 mov right_block_mask, right_block_mask, lsl offset_u_right; \
5212 \
5213 /* texture_mask_rev = WH-WL-HH-HL */\
5214 rev16 texture_mask_rev, texture_mask; \
5215 vmov block_masks, left_block_mask, right_block_mask; \
5216 \
5217 /* texture_mask = HH-HL-HL-WL */\
5218 bfi texture_mask, texture_mask_rev, #4, #4; \
5219 /* texture_mask_rev = 00-00-00-WH */\
5220 mov texture_mask_rev, texture_mask_rev, lsr #12; \
5221 \
5222 /* texture_mask = HH-WH-HL-WL */\
5223 bfi texture_mask, texture_mask_rev, #8, #4; \
59d15d23 5224 setup_sprite_get_left_block_mask##x4mode(); \
75e28f62
E
5225 \
5226 mov control_mask, #0; \
59d15d23 5227 setup_sprite_compare_left_block_mask##x4mode(); \
75e28f62 5228 \
59d15d23 5229 setup_sprite_get_right_block_mask##x4mode(); \
75e28f62
E
5230 orreq control_mask, control_mask, #0x4; \
5231 \
e1f6de8f 5232 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
59d15d23 5233 setup_sprite_compare_right_block_mask##x4mode(); \
75e28f62
E
5234 \
5235 orreq control_mask, control_mask, #0x8; \
5236 cmp tile_width, #1; \
5237 \
5238 add block, psx_gpu, #psx_gpu_blocks_offset; \
5239 orreq control_mask, control_mask, #0x1; \
5240 \
5241 cmp tile_height, #1; \
5242 add block, block, num_blocks, lsl #6; \
5243 \
5244 orreq control_mask, control_mask, #0x2; \
8184d7c5 5245 JT_OP_REL(9f, control_mask, temp); \
e1f6de8f 5246 JT_OP(ldr pc, [pc, control_mask, lsl #2]); \
75e28f62
E
5247 nop; \
5248 \
8184d7c5 5249 9: \
5250 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode); \
5251 .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode); \
5252 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode); \
5253 .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
5254 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode); \
5255 .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
5256 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode); \
5257 .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
5258 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode); \
5259 .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode); \
5260 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode); \
5261 .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
5262 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode); \
75e28f62 5263 .word 0x00000000; \
8184d7c5 5264 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode); \
59d15d23 5265
5266
// Builder instances with an empty x4mode suffix: these define
// setup_sprite_4bpp and setup_sprite_8bpp.
5267setup_sprite_tiled_builder(4bpp,);
5268setup_sprite_tiled_builder(8bpp,);
75e28f62 5269

// The two aliases below are dropped before the _4x instances are expanded;
// the 4x helper macros refer to the *_left_a/_b and *_right_a/_b names.
59d15d23 5270#undef draw_mask_fb_ptr_left
5271#undef draw_mask_fb_ptr_right
75e28f62 5272

// Builder instances with the _4x suffix: these define setup_sprite_4bpp_4x
// and setup_sprite_8bpp_4x.
59d15d23 5273setup_sprite_tiled_builder(4bpp, _4x);
5274setup_sprite_tiled_builder(8bpp, _4x);
75e28f62
E
5275
5276
5277#undef block_ptr
5278#undef num_blocks
5279#undef clut_ptr
5280
5281#define psx_gpu r0
5282#define block_ptr r0
5283#define num_blocks r1
5284#define clut_ptr r2
5285#define texel_shift_mask r3
5286#define block_pixels_a r4
5287#define block_pixels_b r5
5288#define texel_0 r6
5289#define texel_2 r7
5290#define texel_4 r8
5291#define texel_6 r9
5292#define texel_1 r10
5293#define texel_3 r11
5294#define texel_5 r12
5295#define texel_7 r14
5296#define texels_01 r6
5297#define texels_23 r7
5298#define texels_45 r8
5299#define texels_67 r9
5300
// texture_sprite_blocks_8bpp
// In:  r0 = psx_gpu
// Walks num_blocks blocks (read from psx_gpu) of 64 bytes each, starting at
// psx_gpu + psx_gpu_blocks_offset. For every block the eight 8-bit indices
// held at block + 16 .. block + 23 are translated through the halfword table
// at clut_ptr and the eight 16-bit results are written to
// block + 0 .. block + 15.
//
// Notes:
//   - block_ptr shares r0 with psx_gpu, so psx_gpu is no longer available
//     once block_ptr has been computed.
//   - texel_shift_mask is 0xFF << 1: each index is extracted already
//     multiplied by two, giving a byte offset into the halfword table.
//   - The first word of the NEXT block (offset 64 + 16) is fetched while the
//     current block is still being processed, so the final pass also reads
//     one word from the block following the last one.
//   - The loop is do/while shaped: it tests num_blocks only after the first
//     pass (callers presumably guarantee num_blocks > 0 - not verifiable
//     from this chunk).
5301function(texture_sprite_blocks_8bpp)
5302 stmdb sp!, { r4 - r11, r14 }
5303 movw texel_shift_mask, #(0xFF << 1)
5304
e1f6de8f 5305 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5306 ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
75e28f62
E
5307
5308 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
e1f6de8f 5309 ldr block_pixels_a, [block_ptr, #16]
75e28f62
E
5310
5311 0:
// Indices 0..3 come from block_pixels_a, 4..7 from block_pixels_b.
5312 and texel_0, texel_shift_mask, block_pixels_a, lsl #1
e1f6de8f 5313 ldr block_pixels_b, [block_ptr, #20]
75e28f62
E
5314
5315 and texel_1, texel_shift_mask, block_pixels_a, lsr #7
e1f6de8f 5316 ldrh texel_0, [clut_ptr, texel_0]
75e28f62
E
5317
5318 and texel_2, texel_shift_mask, block_pixels_a, lsr #15
e1f6de8f 5319 ldrh texel_1, [clut_ptr, texel_1]
75e28f62
E
5320
5321 and texel_3, texel_shift_mask, block_pixels_a, lsr #23
// Prefetch of the next block's first index word (see header note).
e1f6de8f 5322 ldr block_pixels_a, [block_ptr, #(64 + 16)]
75e28f62 5323
e1f6de8f 5324 ldrh texel_2, [clut_ptr, texel_2]
75e28f62
E
5325 and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5326
e1f6de8f 5327 ldrh texel_3, [clut_ptr, texel_3]
75e28f62
E
5328 and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5329
e1f6de8f 5330 ldrh texel_4, [clut_ptr, texel_4]
75e28f62
E
5331 and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5332
e1f6de8f 5333 ldrh texel_5, [clut_ptr, texel_5]
75e28f62
E
5334 and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5335
e1f6de8f 5336 ldrh texel_6, [clut_ptr, texel_6]
75e28f62
E
// Pack pairs of 16-bit results into words (even texel in the low half).
5337 orr texels_01, texel_0, texel_1, lsl #16
5338
e1f6de8f 5339 ldrh texel_7, [clut_ptr, texel_7]
75e28f62
E
5340 orr texels_23, texel_2, texel_3, lsl #16
5341
5342 orr texels_45, texel_4, texel_5, lsl #16
e1f6de8f 5343 str texels_01, [block_ptr, #0]
75e28f62
E
5344
5345 orr texels_67, texel_6, texel_7, lsl #16
e1f6de8f 5346 str texels_23, [block_ptr, #4]
75e28f62
E
5347
// Flags set here are consumed by the bne at the bottom of the loop; the
// instructions in between do not update flags.
5348 subs num_blocks, num_blocks, #1
e1f6de8f 5349 str texels_45, [block_ptr, #8]
75e28f62 5350
e1f6de8f 5351 str texels_67, [block_ptr, #12]
75e28f62
E
5352 add block_ptr, block_ptr, #64
5353
5354 bne 0b
5355
5356 ldmia sp!, { r4 - r11, pc }
5357
5358
5359#undef width_rounded
5360#undef texture_mask
5361#undef num_blocks
5362#undef texture_offset
59d15d23 5363#undef texels_low
5364#undef texels_high
5365#undef texels_wide_low
5366#undef texels_wide_high
5367#undef texels_wide
5368#undef fb_ptr2
8184d7c5 5369#undef temp
75e28f62
E
5370
5371#define psx_gpu r0
5372#define x r1
5373#define y r2
5374#define u r3
5375#define v r4
5376#define width r5
5377#define height r6
5378#define left_offset r8
5379#define width_rounded r9
5380#define right_width r10
59d15d23 5381
75e28f62
E
5382#define block_width r11
5383
5384#define texture_offset_base r1
5385#define texture_mask r2
5386#define texture_page_ptr r3
5387#define num_blocks r4
5388#define block r5
5389#define fb_ptr r7
5390#define texture_offset r8
5391#define blocks_remaining r9
59d15d23 5392#define fb_ptr2 r10
75e28f62
E
5393#define fb_ptr_pitch r12
5394#define texture_block_ptr r14
5395
5396#define texture_mask_width r2
5397#define texture_mask_height r3
5398#define left_mask_bits r4
5399#define right_mask_bits r5
5400
5401
5402#undef block_masks
5403#undef block_masks_shifted
5404#undef texels
5405
5406#define block_masks d0
5407#define block_masks_shifted d1
5408#define draw_mask_fb_ptr d2
5409#define texels q2
5410
59d15d23 5411#define draw_mask_fb_ptr_a d2
5412#define draw_mask_fb_ptr_b d3
5413#define texels_low d4
5414#define texels_high d5
5415#define texels_wide_low d6
5416#define texels_wide_high d7
5417#define texels_wide q3
75e28f62 5418
75e28f62 5419
// setup_sprites_16bpp_flush: local helper reached with "blgt" after a caller
// has added to num_blocks and compared the result against MAX_BLOCKS.
// It preserves d0-d3 and r0-r3, r12, r14 (plus whatever EXTRA_UNSAVED_REGS
// expands to) around the call to flush_render_block_buffer, then restarts
// block at the beginning of the block array (psx_gpu + psx_gpu_blocks_offset)
// and sets num_blocks to block_width before returning with bx lr.
// Callers must therefore keep block_width, block and num_blocks in the
// registers this helper was assembled with.
59d15d23 5420setup_sprites_16bpp_flush:
5421 vpush { d0 - d3 }
75e28f62 5422
4d646738 5423 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 5424 bl flush_render_block_buffer
4d646738 5425 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 5426
59d15d23 5427 vpop { d0 - d3 }
75e28f62
E
5428
5429 add block, psx_gpu, #psx_gpu_blocks_offset
5430 mov num_blocks, block_width
5431
5432 bx lr
5433
// setup_sprite_16bpp
// In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u
//      stack (read after the 36-byte register push): v at [sp, #36],
//      width at [sp, #40], height at [sp, #44]
//
// Emits one 64-byte block per group of 8 pixels for every sprite row:
//   block + 0  : 16 bytes of texels copied from the texture page
//   block + 40 : 8 bytes - draw-mask bytes in the low word, fb_ptr in the
//                high word
//
// Derived values:
//   left_offset         = u & 7; fb_ptr is backed up by that many pixels
//   width_rounded       = width + 7 + left_offset
//   block_width         = width_rounded >> 3
//   right_width         = width_rounded & 7
//   left_mask_bits      = ~(0xFF << left_offset)
//   right_mask_bits     = 0xFE << right_width
//   texture_offset_base = (u * 2 + (v << 11)) with the low 4 bits cleared
//   texture_mask        = texture_mask_width * 2 + (texture_mask_height << 11)
//   fb_ptr_pitch        = 2048 + 16 - (block_width << 4)
//
// Several register names share a physical register (for example x and
// texture_offset_base are both r1); the instruction order below relies on
// each incoming value being consumed before its register is reused.
5434function(setup_sprite_16bpp)
5435 stmdb sp!, { r4 - r11, r14 }
e1f6de8f 5436 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
75e28f62 5437
e1f6de8f 5438 ldr v, [sp, #36]
75e28f62
E
5439 add fb_ptr, fb_ptr, y, lsl #11
5440
e1f6de8f 5441 ldr width, [sp, #40]
75e28f62
E
5442 add fb_ptr, fb_ptr, x, lsl #1
5443
e1f6de8f 5444 ldr height, [sp, #44]
75e28f62
E
5445 and left_offset, u, #0x7
5446
5447 add texture_offset_base, u, u
5448 add width_rounded, width, #7
5449
ed0fd81d 5450 add texture_offset_base, texture_offset_base, v, lsl #11
75e28f62
E
5451 mov left_mask_bits, #0xFF
5452
e1f6de8f 5453 ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
75e28f62
E
5454 add width_rounded, width_rounded, left_offset
5455
e1f6de8f 5456 ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
75e28f62
E
5457 sub fb_ptr, fb_ptr, left_offset, lsl #1
5458
5459 add texture_mask, texture_mask_width, texture_mask_width
5460 mov right_mask_bits, #0xFE
5461
5462 and right_width, width_rounded, #0x7
5463 mvn left_mask_bits, left_mask_bits, lsl left_offset
5464
ed0fd81d 5465 add texture_mask, texture_mask, texture_mask_height, lsl #11
75e28f62
E
5466 mov block_width, width_rounded, lsr #3
5467
5468 mov right_mask_bits, right_mask_bits, lsl right_width
5469 movw fb_ptr_pitch, #(2048 + 16)
5470
5471 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
5472 vmov block_masks, left_mask_bits, right_mask_bits
5473
e1f6de8f 5474 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
5475 add block, psx_gpu, #psx_gpu_blocks_offset
5476
6ea0f7bf 5477 bic texture_offset_base, texture_offset_base, #0xF
75e28f62
E
// The flags from this compare are consumed by the "bne 0f" below; the ldr
// and add in between do not modify them.
5478 cmp block_width, #1
5479
e1f6de8f 5480 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
75e28f62
E
5481 add block, block, num_blocks, lsl #6
5482
5483 bne 0f
5484
// Single-column case: one block per row, so the left and right masks are
// OR-ed together (vext swaps the two words, vorr merges them) and byte 0 of
// the result is broadcast.
5485 vext.32 block_masks_shifted, block_masks, block_masks, #1
5486 vorr.u32 block_masks, block_masks, block_masks_shifted
5487 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5488
5489 1:
5490 add num_blocks, num_blocks, #1
5491 cmp num_blocks, #MAX_BLOCKS
59d15d23 5492 blgt setup_sprites_16bpp_flush
75e28f62
E
5493
5494 and texture_block_ptr, texture_offset_base, texture_mask
// Row counter; the resulting flags are tested by the "bne 1b" at the end of
// this loop body.
5495 subs height, height, #1
5496
5497 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5498 vld1.u32 { texels }, [texture_block_ptr, :128]
75e28f62 5499
e1f6de8f 5500 vst1.u32 { texels }, [block, :128]
75e28f62
E
5501 add block, block, #40
5502
5503 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
e1f6de8f 5504 pld [fb_ptr]
75e28f62 5505
e1f6de8f 5506 vst1.u32 { draw_mask_fb_ptr }, [block, :64]
75e28f62
E
5507
5508 add block, block, #24
5509 add texture_offset_base, texture_offset_base, #2048
5510 add fb_ptr, fb_ptr, #2048
e1f6de8f 5511 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
5512 bne 1b
5513
5514 ldmia sp!, { r4 - r11, pc }
5515
// Multi-column case: per row, a left block (left mask), block_width - 2
// middle blocks (zero mask) and a right block (right mask).
5516 0:
5517 add num_blocks, num_blocks, block_width
5518 mov texture_offset, texture_offset_base
5519
5520 cmp num_blocks, #MAX_BLOCKS
59d15d23 5521 blgt setup_sprites_16bpp_flush
75e28f62
E
5522
5523 add texture_offset_base, texture_offset_base, #2048
5524 and texture_block_ptr, texture_offset, texture_mask
5525
5526 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5527 vld1.u32 { texels }, [texture_block_ptr, :128]
75e28f62 5528
e1f6de8f 5529 vst1.u32 { texels }, [block, :128]
75e28f62
E
5530 add block, block, #40
5531
5532 vdup.u8 draw_mask_fb_ptr, block_masks[0]
5533 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
e1f6de8f 5534 pld [fb_ptr]
75e28f62 5535
e1f6de8f 5536 vst1.u32 { draw_mask_fb_ptr }, [block, :64]
75e28f62
E
// Number of middle blocks; zero means go straight to the right block.
5537 subs blocks_remaining, block_width, #2
5538
5539 add texture_offset, texture_offset, #16
5540 add fb_ptr, fb_ptr, #16
5541
5542 vmov.u8 draw_mask_fb_ptr, #0
5543
5544 add block, block, #24
5545 beq 2f
5546
5547 1:
5548 and texture_block_ptr, texture_offset, texture_mask
5549 subs blocks_remaining, blocks_remaining, #1
5550
5551 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5552 vld1.u32 { texels }, [texture_block_ptr, :128]
75e28f62 5553
e1f6de8f 5554 vst1.u32 { texels }, [block, :128]
75e28f62
E
5555 add block, block, #40
5556
5557 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
e1f6de8f 5558 pld [fb_ptr]
75e28f62 5559
e1f6de8f 5560 vst1.u32 { draw_mask_fb_ptr }, [block, :64]
75e28f62
E
5561
5562 add texture_offset, texture_offset, #16
5563 add fb_ptr, fb_ptr, #16
5564
5565 add block, block, #24
5566 bne 1b
5567
5568 2:
5569 and texture_block_ptr, texture_offset, texture_mask
5570 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5571
e1f6de8f 5572 vld1.u32 { texels }, [texture_block_ptr, :128]
75e28f62
E
// Byte lane 4 of block_masks is the low byte of right_mask_bits.
5573 vdup.u8 draw_mask_fb_ptr, block_masks[4]
5574
e1f6de8f 5575 vst1.u32 { texels }, [block, :128]
75e28f62
E
5576 add block, block, #40
5577
5578 vmov.u32 draw_mask_fb_ptr[1], fb_ptr
e1f6de8f 5579 vst1.u32 { draw_mask_fb_ptr }, [block, :64]
75e28f62
E
5580
5581 add block, block, #24
5582 subs height, height, #1
5583
// fb_ptr_pitch moves fb_ptr from the last block of this row to the first
// block of the next row.
5584 add fb_ptr, fb_ptr, fb_ptr_pitch
e1f6de8f 5585 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
5586
5587 bne 0b
5588
5589 ldmia sp!, { r4 - r11, pc }
5590
5591
59d15d23 5592// 4x version
5593// FIXME: duplicate code with normal version :(
5594#undef draw_mask_fb_ptr
5595
// setup_sprite_16bpp_4x
// In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u
//      stack (read after the 36-byte register push): v at [sp, #36],
//      width at [sp, #40], height at [sp, #44]
//
// 4x counterpart of setup_sprite_16bpp. Differences visible in this body:
//   - masks are 16 bits wide (0xFFFF / 0xFFFC seeds) and both left_offset and
//     right_width are doubled before being used as shift amounts;
//   - block_width is multiplied by 4 after the single-column compare, so
//     num_blocks grows by four per 8-texel group;
//   - fb_ptr steps are doubled (16*2 per group, 2048*2 per row) and
//     fb_ptr_pitch = (2048 + 16) * 2 - (groups << 5);
//   - block writes are delegated to do_texture_block_16bpp_4x with the mask
//     pair draw_mask_fb_ptr_a / draw_mask_fb_ptr_b.
// texture_offset_base and texture_mask are computed exactly as in the
// non-4x routine.
5596function(setup_sprite_16bpp_4x)
5597 stmdb sp!, { r4 - r11, r14 }
e1f6de8f 5598 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
59d15d23 5599
e1f6de8f 5600 ldr v, [sp, #36]
59d15d23 5601 add fb_ptr, fb_ptr, y, lsl #11
5602
e1f6de8f 5603 ldr width, [sp, #40]
59d15d23 5604 add fb_ptr, fb_ptr, x, lsl #1
5605
e1f6de8f 5606 ldr height, [sp, #44]
59d15d23 5607 and left_offset, u, #0x7
5608
5609 add texture_offset_base, u, u
5610 add width_rounded, width, #7
5611
ed0fd81d 5612 add texture_offset_base, texture_offset_base, v, lsl #11
59d15d23 5613 movw left_mask_bits, #0xFFFF
5614
e1f6de8f 5615 ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
59d15d23 5616 add width_rounded, width_rounded, left_offset
5617
// From here on left_offset holds twice the texel offset.
5618 lsl left_offset, #1
5619
e1f6de8f 5620 ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
59d15d23 5621 sub fb_ptr, fb_ptr, left_offset, lsl #1
5622
5623 add texture_mask, texture_mask_width, texture_mask_width
5624 movw right_mask_bits, #0xFFFC
5625
5626 and right_width, width_rounded, #0x7
5627 mvn left_mask_bits, left_mask_bits, lsl left_offset
5628
5629 lsl right_width, #1
5630
ed0fd81d 5631 add texture_mask, texture_mask, texture_mask_height, lsl #11
59d15d23 5632 mov block_width, width_rounded, lsr #3
5633
5634 mov right_mask_bits, right_mask_bits, lsl right_width
5635 movw fb_ptr_pitch, #(2048 + 16) * 2
5636
5637 sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
5638 vmov block_masks, left_mask_bits, right_mask_bits
5639
e1f6de8f 5640 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
59d15d23 5641 add block, psx_gpu, #psx_gpu_blocks_offset
5642
5643 bic texture_offset_base, texture_offset_base, #0xF
// Compare on the unscaled group count; the following ldr, add and the
// non-flag-setting lsl leave the flags intact for "bne 0f".
5644 cmp block_width, #1
5645
e1f6de8f 5646 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
59d15d23 5647 add block, block, num_blocks, lsl #6
5648
5649 lsl block_width, #2
5650 bne 0f
5651
// Single-column case: merge left and right masks, then broadcast byte
// lanes 0 and 1 of the merged word.
5652 vext.32 block_masks_shifted, block_masks, block_masks, #1
5653 vorr.u32 block_masks, block_masks, block_masks_shifted
5654 vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
5655 vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5656
5657 1:
5658 add num_blocks, num_blocks, block_width
5659 cmp num_blocks, #MAX_BLOCKS
5660 blgt setup_sprites_16bpp_flush
5661
5662 and texture_block_ptr, texture_offset_base, texture_mask
// Row counter; the loop branch at the bottom depends on these flags
// surviving do_texture_block_16bpp_4x (defined outside this chunk).
5663 subs height, height, #1
5664
5665 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5666 vld1.u32 { texels }, [texture_block_ptr, :128]
59d15d23 5667
5668 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5669
5670 add texture_offset_base, texture_offset_base, #2048
5671 add fb_ptr, fb_ptr, #2048*2
e1f6de8f 5672 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
59d15d23 5673 bne 1b
5674
5675 ldmia sp!, { r4 - r11, pc }
5676
// Multi-column case: left group, middle groups with zero masks, right group.
5677 0:
5678 add num_blocks, num_blocks, block_width
5679 mov texture_offset, texture_offset_base
5680
5681 vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
5682 vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5683
5684 cmp num_blocks, #MAX_BLOCKS
5685 blgt setup_sprites_16bpp_flush
5686
5687 add texture_offset_base, texture_offset_base, #2048
5688 and texture_block_ptr, texture_offset, texture_mask
5689
5690 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5691 vld1.u32 { texels }, [texture_block_ptr, :128]
59d15d23 5692
5693 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5694
// block_width is already scaled by 4, hence the 2*4 here and the step of 4
// in the middle loop.
5695 subs blocks_remaining, block_width, #2*4
5696 add texture_offset, texture_offset, #16
5697
5698 vmov.u8 draw_mask_fb_ptr_a, #0
5699 vmov.u8 draw_mask_fb_ptr_b, #0
5700
5701 add fb_ptr, fb_ptr, #16*2
5702 beq 2f
5703
5704 1:
5705 and texture_block_ptr, texture_offset, texture_mask
5706 subs blocks_remaining, blocks_remaining, #4
5707
5708 add texture_block_ptr, texture_page_ptr, texture_block_ptr
e1f6de8f 5709 vld1.u32 { texels }, [texture_block_ptr, :128]
59d15d23 5710
5711 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5712 add texture_offset, texture_offset, #16
5713
5714 add fb_ptr, fb_ptr, #16*2
5715 bgt 1b
5716
5717 2:
5718 vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
5719 vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
5720
5721 and texture_block_ptr, texture_offset, texture_mask
5722 add texture_block_ptr, texture_page_ptr, texture_block_ptr
5723
e1f6de8f 5724 vld1.u32 { texels }, [texture_block_ptr, :128]
59d15d23 5725
5726 do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5727 subs height, height, #1
5728
5729 add fb_ptr, fb_ptr, fb_ptr_pitch
e1f6de8f 5730 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
59d15d23 5731
5732 bne 0b
5733
5734 ldmia sp!, { r4 - r11, pc }
5735
5736
f0931e56 5737#undef width
5738#undef right_width
5739#undef right_mask_bits
5740#undef color
5741#undef height
5742#undef blocks_remaining
5743#undef colors
5744#undef right_mask
5745#undef test_mask
5746#undef draw_mask
5747
5748#define psx_gpu r0
5749#define x r1
5750#define y r2
5751#define width r3
5752#define right_width r5
5753#define right_mask_bits r6
5754#define fb_ptr r7
5755#define color r8
5756#define height r9
5757#define fb_ptr_pitch r12
5758
5759// referenced by setup_sprites_16bpp_flush
5760#define num_blocks r4
5761#define block r5
5762#define block_width r11
5763
5764#define color_r r1
5765#define color_g r2
5766#define color_b r8
5767#define blocks_remaining r6
5768
5769#define colors q0
5770#define right_mask q1
5771#define test_mask q2
5772#define draw_mask q2
5773#define draw_mask_bits_fb_ptr d6
5774
5775
5776.align 3
5777
// setup_sprite_untextured
// In:  r0 = psx_gpu, r1 = x, r2 = y
//      stack (read after the 36-byte register push): width at [sp, #40],
//      height at [sp, #44], color at [sp, #48]
//
// Fast-path check: if none of RENDER_STATE_MASK_EVALUATE,
// RENDER_FLAGS_MODULATE_TEXELS, RENDER_FLAGS_BLEND is set in the halfword at
// psx_gpu_render_state_offset AND RENDER_INTERLACE_ENABLED is clear in
// render_mode, control transfers to setup_sprite_untextured_simple before
// anything has been pushed.
//
// Otherwise one 64-byte block per group of 8 pixels is emitted per row:
//   block + 0  : 16-byte mask (all zero for interior blocks, right_mask for
//                the last block of the row)
//   block + 16 : 16 bytes of the replicated 15-bit colour
//   block + 40 : 8 bytes from draw_mask_bits_fb_ptr (fb_ptr in the high word)
//
// Derived values:
//   block_width     = (width + 7) >> 3
//   right_width     = ((width - 1) & 7) + 1
//   right_mask_bits = 0xFF << right_width
//   fb_ptr_pitch    = 1024*2 - ((block_width - 1) << 4)
//   color           = r5 | (g5 << 5) | (b5 << 10), taken from bits 3..7,
//                     11..15 and 19..23 of the incoming colour word
//   right_mask      = vtst(dup16(right_mask_bits), test_mask), where test_mask
//                     is the first 16 bytes of psx_gpu
//
// num_blocks, block and block_width use the registers that
// setup_sprites_16bpp_flush expects, as the existing define comment notes.
5778function(setup_sprite_untextured)
e1f6de8f 5779 ldrh r12, [psx_gpu, #psx_gpu_render_state_offset]
f0931e56 5780 tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \
5781 | RENDER_FLAGS_BLEND)
e1f6de8f 5782 ldrbeq r12, [psx_gpu, #psx_gpu_render_mode_offset]
d5c08ed3 5783 tsteq r12, #RENDER_INTERLACE_ENABLED
f0931e56 5784 beq setup_sprite_untextured_simple
5785
5786 stmdb sp!, { r4 - r11, r14 }
5787
e1f6de8f 5788 ldr width, [sp, #40]
5789 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
f0931e56 5790
e1f6de8f 5791 ldr height, [sp, #44]
f0931e56 5792 add fb_ptr, fb_ptr, y, lsl #11
5793
5794 add fb_ptr, fb_ptr, x, lsl #1
5795 sub right_width, width, #1
5796
e1f6de8f 5797 ldr color, [sp, #48]
f0931e56 5798 and right_width, #7
5799
5800 add block_width, width, #7
5801 add right_width, #1
5802
5803 lsr block_width, #3
5804 mov right_mask_bits, #0xff
5805
5806 sub fb_ptr_pitch, block_width, #1
5807 lsl right_mask_bits, right_width
5808
5809 lsl fb_ptr_pitch, #3+1
5810 ubfx color_r, color, #3, #5
5811
5812 rsb fb_ptr_pitch, #1024*2
5813 ubfx color_g, color, #11, #5
5814
e1f6de8f 5815 vld1.u32 { test_mask }, [psx_gpu, :128]
f0931e56 5816 ubfx color_b, color, #19, #5
5817
5818 vdup.u16 right_mask, right_mask_bits
5819 orr color, color_r, color_b, lsl #10
5820
e1f6de8f 5821 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
f0931e56 5822 orr color, color, color_g, lsl #5
5823
5824 vtst.u16 right_mask, right_mask, test_mask
5825 add block, psx_gpu, #psx_gpu_blocks_offset
5826
5827 vdup.u16 colors, color
5828 add block, block, num_blocks, lsl #6
5829
5830
// One pass per sprite row.
5831setup_sprite_untextured_height_loop:
5832 add num_blocks, block_width
5833 sub blocks_remaining, block_width, #1
5834
5835 cmp num_blocks, #MAX_BLOCKS
5836 blgt setup_sprites_16bpp_flush
5837
// Skip the interior loop when the row consists of the right block only.
5838 cmp blocks_remaining, #0
5839 ble 1f
5840
// draw_mask shares q2 with test_mask, which is no longer needed here.
5841 vmov.u8 draw_mask, #0 /* zero_mask */
5842 vmov.u8 draw_mask_bits_fb_ptr, #0
5843
5844 0:
e1f6de8f 5845 vst1.u32 { draw_mask }, [block, :128]!
f0931e56 5846 subs blocks_remaining, #1
5847
e1f6de8f 5848 vst1.u32 { colors }, [block, :128]
f0931e56 5849 add block, block, #24
5850
5851 vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
e1f6de8f 5852 vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
f0931e56 5853
5854 add block, block, #24
5855 add fb_ptr, #8*2
5856 bgt 0b
5857
// Last block of the row.
// NOTE(review): the low word of draw_mask_bits_fb_ptr is only zeroed on the
// path through the interior loop above; when block_width is 1 it is stored
// with whatever it held before - confirm that consumers ignore that word.
5858 1:
e1f6de8f 5859 vst1.u32 { right_mask }, [block, :128]!
f0931e56 5860 subs height, #1
5861
e1f6de8f 5862 vst1.u32 { colors }, [block, :128]
f0931e56 5863 add block, block, #24
5864
5865 vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
e1f6de8f 5866 vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
f0931e56 5867
5868 add block, block, #24
5869 add fb_ptr, fb_ptr_pitch
5870
e1f6de8f 5871 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
f0931e56 5872 bgt setup_sprite_untextured_height_loop
5873
5874 ldmia sp!, { r4 - r11, pc }
5875
5876
5877
75e28f62
E
5878#undef texture_page_ptr
5879#undef vram_ptr
5880#undef dirty_textures_mask
5881#undef current_texture_mask
5882
5883#define psx_gpu r0
5884#define current_texture_page r1
5885#define texture_page_ptr r2
5886#define vram_ptr_a r3
5887#define current_texture_page_x r12
5888#define current_texture_page_y r4
5889#define dirty_textures_mask r5
5890#define tile_y r6
5891#define tile_x r7
5892#define sub_y r8
5893#define current_texture_mask r9
5894#define c_4096 r10
5895#define vram_ptr_b r11
5896
5897#define texel_block_a d0
5898#define texel_block_b d1
5899#define texel_block_expanded_a q1
5900#define texel_block_expanded_b q2
5901#define texel_block_expanded_ab q2
5902#define texel_block_expanded_c q3
5903#define texel_block_expanded_d q4
5904#define texel_block_expanded_cd q3
5905
// update_texture_4bpp_cache
// In:  r0 = psx_gpu
//
// Rebuilds the expanded 4bpp texture page at psx_gpu->texture_page_base from
// VRAM. The source starts at
//   vram_ptr + (page_y << 19) + (page_x << 7)
// where page_x / page_y are the low / high nibble of current_texture_page.
// Bits of current_texture_mask are cleared from dirty_textures_4bpp_mask and
// the result is written back.
//
// The page is walked as 16 x 16 tiles. For each tile, 8 passes each convert
// two VRAM rows (vram_ptr_a and vram_ptr_b, 2048 bytes apart, both stepping
// by 4096): 8 source bytes per row hold 16 packed 4-bit texels, which are
// widened to 16 bytes with one texel per byte (low nibble first). Output is
// written linearly through texture_page_ptr, 32 bytes per pass.
//
// Register save: the routine writes q1-q4. q4 (d8/d9) is callee-saved under
// the ARM procedure call standard, so the save/restore range covers q0-q4
// rather than only q0-q3.
5906function(update_texture_4bpp_cache)
5907 stmdb sp!, { r4 - r11, r14 }
5908 vpush { q0 - q4 }
5909
e1f6de8f 5910 ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
75e28f62 5911
e1f6de8f 5912 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
5913 ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
75e28f62
E
5914
5915 and current_texture_page_x, current_texture_page, #0xF
e1f6de8f 5916 ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
75e28f62
E
5917
5918 mov current_texture_page_y, current_texture_page, lsr #4
e1f6de8f 5919 ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
75e28f62
E
5920
5921 add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5922 mov tile_y, #16
5923
5924 add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7
5925 bic dirty_textures_mask, current_texture_mask
5926
5927 mov tile_x, #16
e1f6de8f 5928 str dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
75e28f62
E
5929
5930 mov sub_y, #8
5931 movw c_4096, #4096
5932
5933 add vram_ptr_b, vram_ptr_a, #2048
5934
5935 0:
e1f6de8f 5936 vld1.u32 { texel_block_a }, [vram_ptr_a, :64], c_4096
5937 vld1.u32 { texel_block_b }, [vram_ptr_b, :64], c_4096
75e28f62
E
5938
// Widen each byte to 16 bits twice: once as-is, once shifted left by 4.
5939 vmovl.u8 texel_block_expanded_a, texel_block_a
5940 vshll.u8 texel_block_expanded_b, texel_block_a, #4
5941 vmovl.u8 texel_block_expanded_c, texel_block_b
5942 vshll.u8 texel_block_expanded_d, texel_block_b, #4
5943
// Clearing bits 4..7 leaves the low nibble in byte 0 of the unshifted copy
// and the high nibble in byte 1 of the shifted copy.
5944 vbic.u16 texel_block_expanded_a, #0x00F0
5945 vbic.u16 texel_block_expanded_b, #0x00F0
5946 vbic.u16 texel_block_expanded_c, #0x00F0
5947 vbic.u16 texel_block_expanded_d, #0x00F0
5948
5949 vorr.u16 texel_block_expanded_ab, texel_block_expanded_a, \
5950 texel_block_expanded_b
5951 vorr.u16 texel_block_expanded_cd, texel_block_expanded_c, \
5952 texel_block_expanded_d
5953
5954 vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd }, \
e1f6de8f 5955 [texture_page_ptr, :256]!
75e28f62
E
5956
5957 subs sub_y, sub_y, #1
5958 bne 0b
5959
// Next tile in this tile row: 8 source bytes to the right, back up 16 rows.
5960 mov sub_y, #8
5961 add vram_ptr_a, vram_ptr_a, #8
5962 add vram_ptr_b, vram_ptr_b, #8
5963
5964 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
5965 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
5966
5967 subs tile_x, tile_x, #1
5968 bne 0b
5969
// Next tile row: down 16 rows, back to the left edge of the page.
5970 mov tile_x, #16
5971 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
5972 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
5973
5974 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
5975 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
5976
5977 subs tile_y, tile_y, #1
5978 bne 0b
5979
5980 vpop { q0 - q4 }
5981 ldmia sp!, { r4 - r11, pc }
5982
5983
5984#undef current_texture_page
5985
5986#define psx_gpu r0
5987#define texture_page r1
5988#define texture_page_ptr r2
5989#define vram_ptr_a r3
5990#define texture_page_x r12
5991#define texture_page_y r4
5992#define current_texture_page r5
5993#define tile_y r6
5994#define tile_x r7
5995#define sub_y r8
5996#define c_4096 r10
5997#define vram_ptr_b r11
5998
5999
6000#undef texels_a
6001#undef texels_b
6002
6003#define texels_a q0
6004#define texels_b q1
6005#define texels_c q2
6006#define texels_d q3
6007
6008
// update_texture_8bpp_cache_slice
// In:  r0 = psx_gpu, r1 = texture_page
//
// Copies one slice of an 8bpp texture page from VRAM into the texture cache
// at psx_gpu->texture_page_base. The source starts at
//   vram_ptr + ((texture_page & 0xF) << 7) + ((texture_page >> 4) << 19).
// If bit 0 of (current_texture_page ^ texture_page) is set, the destination
// starts 8*16*16 bytes into the cache page.
//
// The slice is walked as 16 tile rows of 8 tiles. For each tile, 4 passes
// each copy four 16-byte VRAM rows (vram_ptr_a and vram_ptr_b are 2048 bytes
// apart and both step by 4096) to 64 consecutive destination bytes. After
// each tile row the destination skips a further 8*16*16 bytes.
6009function(update_texture_8bpp_cache_slice)
6010 stmdb sp!, { r4 - r11, r14 }
6011 vpush { q0 - q3 }
6012
e1f6de8f 6013 ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
6014 ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
75e28f62 6015
e1f6de8f 6016 ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
75e28f62
E
6017 mov tile_y, #16
6018
6019 and texture_page_x, texture_page, #0xF
6020 mov texture_page_y, texture_page, lsr #4
6021
6022 add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7
6023 mov tile_x, #8
6024
6025 add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
6026 eor current_texture_page, current_texture_page, texture_page
6027
// Z is clear when the requested page and the current page differ in bit 0;
// the addne below consumes it (mov does not touch the flags).
6028 ands current_texture_page, current_texture_page, #0x1
6029 mov sub_y, #4
6030
6031 addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6032 movw c_4096, #4096
6033
6034 add vram_ptr_b, vram_ptr_a, #2048
6035
6036 0:
e1f6de8f 6037 vld1.u32 { texels_a }, [vram_ptr_a, :128], c_4096
6038 vld1.u32 { texels_b }, [vram_ptr_b, :128], c_4096
6039 vld1.u32 { texels_c }, [vram_ptr_a, :128], c_4096
6040 vld1.u32 { texels_d }, [vram_ptr_b, :128], c_4096
75e28f62 6041
e1f6de8f 6042 vst1.u32 { texels_a, texels_b }, [texture_page_ptr, :256]!
6043 vst1.u32 { texels_c, texels_d }, [texture_page_ptr, :256]!
75e28f62
E
6044
6045 subs sub_y, sub_y, #1
6046 bne 0b
6047
// Next tile in this tile row: 16 source bytes to the right, back up 16 rows.
6048 mov sub_y, #4
6049
6050 add vram_ptr_a, vram_ptr_a, #16
6051 add vram_ptr_b, vram_ptr_b, #16
6052
6053 sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6054 sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6055
6056 subs tile_x, tile_x, #1
6057 bne 0b
6058
// Next tile row: down 16 rows, back to the left edge of the slice.
6059 mov tile_x, #8
6060
6061 add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6062 add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6063
6064 sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6065 sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6066
6067 subs tile_y, tile_y, #1
6068 add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6069
6070 bne 0b
6071
6072 vpop { q0 - q3 }
6073 ldmia sp!, { r4 - r11, pc }
6074
50f9355a 6075
6076/* void scale2x_tiles8(void *dst, const void *src, int w8, int h) */
// scale2x_tiles8
// In:  r0 = dst, r1 = src, r2 = w8 (number of 8-pixel groups per row),
//      r3 = h (rows)
//
// Doubles a 16-bit-per-pixel image in both directions. Every source pixel is
// duplicated horizontally (vzip of a register with a copy of itself) and each
// expanded row is stored twice: at r0 and at r12 = r0 + 1024*2.
// Row strides: the source advances by 1024*2 bytes per row, the destination
// by 1024*2*2 bytes per source row.
//
// The inner loop handles two 8-pixel groups per pass. Both groups are always
// loaded; when w8 is odd the final pass stores only the first one, so 16
// bytes past the last group of the row are read but never written out.
// r4 keeps the start of the current source row, r14 counts remaining groups.
6077function(scale2x_tiles8)
6078 push { r4, r14 }
6079
6080 mov r4, r1
6081 add r12, r0, #1024*2
6082 mov r14, r2
6083
60840:
e1f6de8f 6085 vld1.u16 { q0 }, [r1, :128]!
6086 vld1.u16 { q2 }, [r1, :128]!
50f9355a 6087 vmov q1, q0
6088 vmov q3, q2
6089 vzip.16 q0, q1
6090 vzip.16 q2, q3
// Flags from this subs drive both the blt and the bgt below; the NEON stores
// in between do not change them.
6091 subs r14, #2
e1f6de8f 6092 vst1.u16 { q0, q1 }, [r0, :128]!
6093 vst1.u16 { q0, q1 }, [r12, :128]!
50f9355a 6094 blt 1f
e1f6de8f 6095 vst1.u16 { q2, q3 }, [r0, :128]!
6096 vst1.u16 { q2, q3 }, [r12, :128]!
50f9355a 6097 bgt 0b
60981:
// End of row: rewind r0 by the w8 * 32 bytes just written and step down two
// output rows; restart the source pointer one row further down.
6099 subs r3, #1
6100 mov r14, r2
6101 add r0, #1024*2*2
6102 add r4, #1024*2
ed0fd81d 6103 sub r0, r0, r2, lsl #4+1
50f9355a 6104 mov r1, r4
6105 add r12, r0, #1024*2
6106 bgt 0b
6107 nop
6108
6109 pop { r4, pc }
59d15d23 6110
6111// vim:filetype=armasm