Merge pull request #123 from gameblabla/diablofix_hack
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
CommitLineData
75e28f62
E
1/*
2 * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
59d15d23 3 * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
75e28f62
E
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of
8 * the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */
15
16#define MAX_SPANS 512
17#define MAX_BLOCKS 64
18#define MAX_BLOCKS_PER_ROW 128
19
f0931e56 20#define RENDER_STATE_MASK_EVALUATE 0x20
21#define RENDER_FLAGS_MODULATE_TEXELS 0x1
22#define RENDER_FLAGS_BLEND 0x2
d5c08ed3 23#define RENDER_INTERLACE_ENABLED 0x1
f0931e56 24
cb88320b 25#include "psx_gpu_offsets.h"
75e28f62 26
cb88320b 27#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
75e28f62 28
75e28f62
E
// Byte offsets of the four 16-bit fields of one span edge record. The
// setup_spans_set_x4_alternate_* macros write these records with
// vst4.u16 { left x, block count, right mask, y }, and setup_spans_up_down
// steps span_edge_data by 8 bytes per span.
29#define edge_data_left_x_offset 0
30#define edge_data_num_blocks_offset 2
31#define edge_data_right_mask_offset 4
32#define edge_data_y_offset 6
33
ed0fd81d 34.syntax unified
35.text
75e28f62
E
36
// ARM core register aliases used by compute_all_gradients.
// Several names map to the same physical register, so an alias is only
// meaningful while the value it names is live:
//   r0: psx_gpu, store_a          r1: v_a, store_b
//   r5: x1, x0_x1, area_r_s, g_bx3, store_inc
//   r9: y2, b0                    r11: b2, b1_b2, gw_bx_l, gw_by_l
37#define psx_gpu r0
38#define v_a r1
39#define v_b r2
40#define v_c r3
41
// Vertex coordinates and blue components; the *_* names hold two 16-bit
// values packed into one register.
42#define x0 r4
43#define x1 r5
44#define x2 r6
45#define x0_x1 r5
46#define x1_x2 r6
47#define y0 r7
48#define y1 r8
49#define y2 r9
50#define y0_y1 r7
51#define y1_y2 r8
52#define b0 r9
53#define b1 r10
54#define b2 r11
55#define b0_b1 r10
56#define b1_b2 r11
57
58
59#define area_r_s r5
60
// Blue gradient results as stored by the final stmia.
61#define g_bx0 r2
62#define g_bx r3
63#define g_bx2 r4
64#define g_bx3 r5
65#define b_base r6
66#define g_by r8
67
// Sign masks of the blue gradients.
68#define gs_bx r7
69#define gs_by r10
70
71#define ga_bx g_bx
72#define ga_by g_by
73
// High and low words of the 64-bit umull products.
74#define gw_bx_h g_bx
75#define gw_by_h g_by
76
77#define gw_bx_l r11
78#define gw_by_l gw_bx_l
79
// Output pointers and post-increment used for the result stores.
80#define store_a r0
81#define store_b r1
82#define store_inc r5
83
84
// NEON register aliases used by compute_all_gradients.
// A q-register name and the d-register names of its two halves overlap
// (v0 is q0, uvrgb0 is d0, x0_y0 is d1), and names of later phases reuse
// registers of earlier phases (v1, ga_uvrg_x, g_uvrg_x, gw_uv_x and
// uvrg_dx1 are all q1).
85#define v0 q0
86#define uvrgb0 d0
87#define x0_y0 d1
88
89#define v1 q1
90#define uvrgb1 d2
91#define x1_y1 d3
92
93#define v2 q2
94#define uvrgb2 d4
95#define x2_y2 d5
96
97#define x0_ab q3
98#define uvrg_xxxx0 q3
99#define uvrg0 d6
100#define xxxx0 d7
101
102#define x1_ab q4
103#define uvrg_xxxx1 q4
104#define uvrg1 d8
105#define xxxx1 d9
106
107#define x2_ab q5
108#define uvrg_xxxx2 q5
109#define uvrg2 d10
110#define xxxx2 d11
111
112#define y0_ab q6
113#define yyyy_uvrg0 q6
114#define yyyy0 d12
115#define uvrg0b d13
116
117#define y1_ab q7
118#define yyyy_uvrg1 q7
119#define yyyy1 d14
120#define uvrg1b d15
121
122#define y2_ab q8
123#define yyyy_uvrg2 q8
124#define yyyy2 d16
125#define uvrg2b d17
126
// Differences d0..d3 of the gradient formula (see the comment inside
// compute_all_gradients).
127#define d0_ab q9
128#define d0_a d18
129#define d0_b d19
130
131#define d1_ab q10
132#define d1_a d20
133#define d1_b d21
134
135#define d2_ab q11
136#define d2_a d22
137#define d2_b d23
138
139#define d3_ab q12
140#define d3_a d24
141#define d3_b d25
142
143#define ga_uvrg_x q1
144#define ga_uvrg_y q4
145
146#define dx x0_x1
147#define dy y0_y1
148#define db b0_b1
149
150#define uvrg_base q11
151
// Sign masks of the x and y gradients.
152#define gs_uvrg_x q5
153#define gs_uvrg_y q6
154
155#define g_uvrg_x q1
156#define ga_uv_x d2
157#define g_uv_x d2
158#define ga_rg_x d3
159#define g_rg_x d3
160
161#define g_uvrg_y q4
162#define ga_uv_y d8
163#define g_uv_y d8
164#define ga_rg_y d9
165#define g_rg_y d9
166
// 64-bit products of the absolute gradients and the area reciprocal.
167#define gw_uv_x q1
168#define gw_rg_x q2
169#define gw_uv_y q4
170#define gw_rg_y q3
171
172#define w_mask q9
173#define w_mask_l d18
174
175#define r_shift q10
176
// x gradient multiplied by 0, 1, 2 and 3.
177#define uvrg_dx0 q0
178#define uvrg_dx0l d0
179#define uvrg_dx0h d1
180
181#define uvrg_dx1 q1
182#define uvrg_dx1l d2
183#define uvrg_dx1h d3
184
185#define uvrg_dx2 q2
186#define uvrg_dx2l d4
187#define uvrg_dx2h d5
188
189#define uvrg_dx3 q3
190#define uvrg_dx3l d6
191#define uvrg_dx3h d7
192
c6063f89 193#define uvrgb_phase q13
75e28f62
E
194
195.align 4
196
0e4ad319 197#include "arm_features.h"
8184d7c5 198
// function(name) opens a function by expanding to FUNCTION(name) followed by
// a colon. FUNCTION is not defined in this file; presumably it comes from
// arm_features.h - verify.
0e4ad319 199#define function(name) FUNCTION(name):
200
// Jump table helpers, in two flavours selected by TEXRELS_FORBIDDEN.
//
// TEXRELS_FORBIDDEN not defined: JT_OP(x) passes its argument through,
// JTE(start, target) is the target itself and JT_OP_REL expands to nothing.
201#ifndef TEXRELS_FORBIDDEN
75e28f62 202
8184d7c5 203#define JT_OP_REL(table_label, index_reg, temp)
204#define JT_OP(x...) x
205#define JTE(start, target) target
206
207#else
208
// TEXRELS_FORBIDDEN defined: table entries are the differences
// (target - start). JT_OP_REL loads entry index_reg of table_label into temp
// and adds it to pc; JT_OP expands to nothing.
// NOTE(review): start presumably has to be the address that pc reads as in
// the final add - confirm at the use sites.
8184d7c5 209#define JT_OP_REL(table_label, index_reg, temp) \
210 adr temp, table_label; \
e1f6de8f 211 ldr temp, [temp, index_reg, lsl #2]; \
8184d7c5 212 add pc, pc, temp \
213
214#define JT_OP(x...)
215#define JTE(start, target) (target - start)
216
0e4ad319 217#endif
4d646738 218
// When __MACH__ is defined, refer to these three symbols with a leading
// underscore.
0e4ad319 219#ifdef __MACH__
8184d7c5 220#define flush_render_block_buffer _flush_render_block_buffer
221#define setup_sprite_untextured_simple _setup_sprite_untextured_simple
222#define update_texture_8bpp_cache _update_texture_8bpp_cache
8184d7c5 223#endif
224
75e28f62
E
@ compute_all_gradients
@
@ Computes the x and y gradients of the u, v, r, g and b interpolants of the
@ triangle (v_a, v_b, v_c) and stores them, together with the interpolant
@ base values, into psx_gpu.
@
@ Reads psx_gpu at psx_gpu_triangle_area_offset, psx_gpu_uvrgb_phase_offset
@ and psx_gpu_triangle_winding_offset.
@
@ Writes (byte offsets from psx_gpu + psx_gpu_uvrg_offset):
@   +0    uvrg_base = (uvrg0 << 16) + phase - x gradient * x0
@   +16   uvrg x gradient
@   +32   uvrg y gradient
@   +48   vst4 of the low halves of { 0, dx, 2 * dx, 3 * dx }
@   +80   vst4 of the high halves of { 0, dx, 2 * dx, 3 * dx }
@   +112  { 0, g_bx, 2 * g_bx, 3 * g_bx, b_base, g_by }
@
@ Saves and restores r4 - r11 and returns by popping pc. r0 - r3, r12 and r14
@ are overwritten, as are q0 - q13, d30 and d31.
@ NOTE(review): q4 - q7 are written without being saved, while the AAPCS
@ lists d8 - d15 as callee-saved - confirm that callers tolerate this.
@
225@ r0: psx_gpu
226@ r1: v_a
227@ r2: v_b
228@ r3: v_c
229
230function(compute_all_gradients)
231 // First compute the triangle area reciprocal and shift. The division will
232 // happen concurrently with much of the work which follows.
233 @ r12 = psx_gpu->triangle_area
e1f6de8f 234 ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
75e28f62
E
235 stmdb sp!, { r4 - r11, lr }
236
@ d30 and d31 are assembled as raw double bit patterns in core registers:
@ r5 supplies the low word and r4 the high word of each vmov.f64 below.
237 @ load exponent of 62 into upper half of double
238 movw r4, #0
239 clz r14, r12 @ r14 = shift
240
241 movt r4, #((62 + 1023) << 4)
242 mov r12, r12, lsl r14 @ r12 = triangle_area_normalized
243
244 @ load area normalized into lower half of double
245 mov r5, r12, lsr #10
246 vmov.f64 d30, r5, r4 @ d30 = (1 << 62) + ta_n
247
@ d31 = ta_n as a double. For a non-zero area the top bit of ta_n is set, so
@ the add below carries one into the exponent field loaded by the movt.
248 movt r4, #((1022 + 31) << 4)
249 mov r5, r12, lsl #20
250
251 add r4, r4, r12, lsr #11
252 vmov.f64 d31, r5, r4
253
254 vdiv.f64 d30, d30, d31 @ d30 = ((1 << 62) + ta_n) / ta_n
255
256 // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
257 // ( d0 * d1 ) - ( d2 * d3 ) =
258 // ( m0 ) - ( m1 ) = gradient
259
260 // This is split to do 12 elements at a time over three sets: a, b, and c.
261 // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
262 // two of the slots are unused.
263
264 // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
265 // is g.
266
267 // First type is: uvrg bxxx xxxx
268 // Second type is: yyyy ybyy uvrg
269 // Since x_a and y_c are the same the same variable is used for both.
270
e1f6de8f 271 vld1.u32 { v0 }, [v_a, :128] @ v0 = { uvrg0, b0, x0, y0 }
272 ldrsh x0, [v_a, #8] @ load x0
75e28f62 273
e1f6de8f 274 vld1.u32 { v1 }, [v_b, :128] @ v1 = { uvrg1, b1, x1, y1}
275 ldrh x1, [v_b, #8] @ load x1
75e28f62 276
e1f6de8f 277 vld1.u32 { v2 }, [v_c, :128] @ v2 = { uvrg2, b2, x2, y2 }
278 ldrh x2, [v_c, #8] @ load x2
75e28f62
E
279
280 vmovl.u8 uvrg_xxxx0, uvrgb0 @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
e1f6de8f 281 ldrh y0, [v_a, #10] @ load y0
75e28f62
E
282
283 vmovl.u8 uvrg_xxxx1, uvrgb1 @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
e1f6de8f 284 ldrh y1, [v_b, #10] @ load y1
75e28f62
E
285
286 vmovl.u8 uvrg_xxxx2, uvrgb2 @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
e1f6de8f 287 ldrh y2, [v_c, #10] @ load y2
75e28f62
E
288
289 vmov.u8 uvrg0b, uvrg0 @ uvrg0b = { uv0, rg0 }
290 vdup.u16 xxxx0, x0_y0[0] @ xxxx0 = { xx0, xx0 }
291
292 orr x1_x2, x1, x2, lsl #16 @ x1_x2 = { x1, x2 }
293 pkhbt x0_x1, x0, x1, lsl #16 @ x0_x1 = { x0, x1 }
294
295 vmov.u8 uvrg1b, uvrg1 @ uvrg1b = { uv1, rg1 }
296 vdup.u16 xxxx1, x1_y1[0] @ xxxx1 = { xx1, xx1 }
297
298 vmov.u8 uvrg2b, uvrg2 @ uvrg2b = { uv2, rg2 }
299 vdup.u16 xxxx2, x2_y2[0] @ xxxx2 = { xx2, xx2 }
300
e1f6de8f 301 ldrb b2, [v_c, #4] @ load b2
75e28f62
E
302 orr y0_y1, y0, y1, lsl #16 @ y0_y1 = { y0, y1 }
303
e1f6de8f 304 ldrb b1, [v_b, #4] @ load b1
75e28f62
E
305 orr y1_y2, y1, y2, lsl #16 @ y1_y2 = { y1, y2 }
306
307 vdup.u16 yyyy0, x0_y0[1] @ yyyy0 = { yy0, yy0 }
308 vsub.s16 d0_ab, x1_ab, x0_ab
309
@ b0 shares r9 with y2, which was consumed by the orr above.
e1f6de8f 310 ldrb b0, [v_a, #4] @ load b0
75e28f62
E
311 orr b1_b2, b1, b2, lsl #16 @ b1_b2 = { b1, b2 }
312
313 vdup.u16 yyyy1, x1_y1[1] @ yyyy1 = { yy1, yy1 }
314 vsub.s16 d2_ab, x2_ab, x1_ab
315
316 vdup.u16 yyyy2, x2_y2[1] @ yyyy2 = { yy2, yy2 }
317 vsub.s16 d1_ab, y2_ab, y1_ab
318
319 orr b0_b1, b0, b1, lsl #16 @ b0_b1 = { b0, b1 }
320 ssub16 dx, x1_x2, x0_x1 @ dx = { x1 - x0, x2 - x1 }
321
322 ssub16 dy, y1_y2, y0_y1 @ dy = { y1 - y0, y2 - y1 }
323 ssub16 db, b1_b2, b0_b1 @ db = { b1 - b0, b2 - b1 }
324
325 vsub.s16 d3_ab, y1_ab, y0_ab
326 smusdx ga_by, dx, db @ ga_by = ((x1 - x0) * (b2 - b1)) -
327 @ ((x2 - x1) * (b1 - b0))
328 vmull.s16 ga_uvrg_x, d0_a, d1_a
329 smusdx ga_bx, db, dy @ ga_bx = ((b1 - b0) * (y2 - y1)) -
330 @ ((b2 - b1) * (y1 - y0))
331 vmlsl.s16 ga_uvrg_x, d2_a, d3_a
@ gs_bx = sign mask of ga_bx; the rsbmi below turns ga_bx into its absolute
@ value. The blue y gradient is handled the same way further down.
332 movs gs_bx, ga_bx, asr #31
333
334 vmull.s16 ga_uvrg_y, d0_b, d1_b
335 rsbmi ga_bx, ga_bx, #0
336
c6063f89 337 @ r12 = psx_gpu->uvrgb_phase
e1f6de8f 338 ldr r12, [psx_gpu, #psx_gpu_uvrgb_phase_offset]
c6063f89 339
75e28f62
E
340 vmlsl.s16 ga_uvrg_y, d2_b, d3_b
341 movs gs_by, ga_by, asr #31
342
343 vshr.u64 d0, d30, #22
c6063f89 344 add b_base, r12, b0, lsl #16
345
346 vdup.u32 uvrgb_phase, r12
75e28f62
E
347
348 rsbmi ga_by, ga_by, #0
349 vclt.s32 gs_uvrg_x, ga_uvrg_x, #0 @ gs_uvrg_x = ga_uvrg_x < 0
350
351 @ r12 = psx_gpu->triangle_winding
e1f6de8f 352 ldrb r12, [psx_gpu, #psx_gpu_triangle_winding_offset]
75e28f62
E
353 vclt.s32 gs_uvrg_y, ga_uvrg_y, #0 @ gs_uvrg_y = ga_uvrg_y < 0
354
75e28f62
E
355 rsb r12, r12, #0 @ r12 = -(triangle->winding)
356
357 vdup.u32 w_mask, r12 @ w_mask = { -w, -w, -w, -w }
358 sub r14, r14, #(62 - 12) @ r14 = shift - (62 - FIXED_BITS)
359
360 vshll.u16 uvrg_base, uvrg0, #16 @ uvrg_base = uvrg0 << 16
361 vdup.u32 r_shift, r14 @ r_shift = { shift, shift, shift, shift }
362
c6063f89 363 vadd.u32 uvrg_base, uvrgb_phase
75e28f62
E
364 vabs.s32 ga_uvrg_x, ga_uvrg_x @ ga_uvrg_x = abs(ga_uvrg_x)
365
366 vmov area_r_s, s0 @ area_r_s = triangle_reciprocal
367 vabs.s32 ga_uvrg_y, ga_uvrg_y @ ga_uvrg_y = abs(ga_uvrg_y)
368
@ Multiply the absolute gradients by the reciprocal (64-bit products), apply
@ the shift and narrow back to 32 bits.
369 vmull.u32 gw_rg_x, ga_rg_x, d0[0]
370 vmull.u32 gw_uv_x, ga_uv_x, d0[0]
371 vmull.u32 gw_rg_y, ga_rg_y, d0[0]
372 vmull.u32 gw_uv_y, ga_uv_y, d0[0]
373
374 vshl.u64 gw_rg_x, gw_rg_x, r_shift
375 vshl.u64 gw_uv_x, gw_uv_x, r_shift
376 vshl.u64 gw_rg_y, gw_rg_y, r_shift
377 vshl.u64 gw_uv_y, gw_uv_y, r_shift
378
@ Combine each sign mask with the winding mask, then apply it as
@ (g ^ mask) - mask, which negates g where the mask is all ones.
379 veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
380 vmovn.u64 g_uv_x, gw_uv_x
381
382 veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
383 vmovn.u64 g_rg_x, gw_rg_x
384
385 veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
386 vmovn.u64 g_uv_y, gw_uv_y
387
388 vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
389 vmovn.u64 g_rg_y, gw_rg_y
390
391 veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
392 mov ga_bx, ga_bx, lsl #13
393
394 vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
395 mov ga_by, ga_by, lsl #13
396
397 vdup.u32 x0_y0, x0
398 umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
399
400 vshl.u32 g_uvrg_x, g_uvrg_x, #4
401 vshl.u32 g_uvrg_y, g_uvrg_y, #4
402
403 umull gw_by_l, gw_by_h, ga_by, area_r_s
404 vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
405
406 eor gs_bx, gs_bx, r12
407 vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
408
409 veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
410 eor gs_by, gs_by, r12
411
@ r11 held the low words of the umull products, which are not used; it is
@ reused here as the scalar shift count.
412 rsb r11, r14, #0 @ r11 = negative shift for scalar lsr
413 add store_a, psx_gpu, #psx_gpu_uvrg_offset
414
415 sub r11, r11, #(32 - 13)
416
@ store_a and store_b are 16 bytes apart and both advance by 32 per store.
417 add store_b, store_a, #16
418 mov store_inc, #32
419
420 vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
e1f6de8f 421 vst1.u32 { uvrg_base }, [store_a, :128], store_inc
75e28f62 422
e1f6de8f 423 vst1.u32 { uvrg_dx1 }, [store_b, :128], store_inc
75e28f62
E
424 mov g_bx, gw_bx_h, lsr r11
425
e1f6de8f 426 vst1.u32 { g_uvrg_y }, [store_a, :128], store_inc
75e28f62
E
427 mov g_by, gw_by_h, lsr r11
428
429 vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l }, \
e1f6de8f 430 [store_b, :128], store_inc
75e28f62
E
431 eor g_bx, g_bx, gs_bx
432
433 vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h }, \
e1f6de8f 434 [store_b, :128], store_inc
75e28f62
E
435 sub g_bx, g_bx, gs_bx
436
437 lsl g_bx, g_bx, #4
438 eor g_by, g_by, gs_by
439
@ b_base = b_base - g_bx * x0; x0 (r4) must be read before g_bx2 reuses r4.
440 mls b_base, g_bx, x0, b_base
441 sub g_by, g_by, gs_by
442
443 lsl g_by, g_by, #4
444 mov g_bx0, #0
445
446 add g_bx2, g_bx, g_bx
447 add g_bx3, g_bx, g_bx2
448
449 stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
450
451 ldmia sp!, { r4 - r11, pc }
452
453
// Register aliases used by the setup_spans_* macros and functions.
// As in compute_all_gradients, many names share a physical register:
//   r1 - r3: v_a / v_b / v_c and y_a / y_b / y_c
//   r4: x_a, edge_alt_low, edge_dx_dy_alt_low, span_edge_data
//   r5: x_b, edge_alt_high, edge_dx_dy_alt_high, span_uvrg_offset
//   r6: x_c, edge_dx_dy_alt, span_b_offset
//   r9: height_major, height     r10: reciprocal_table_ptr, edge_shift_alt
//   r14: temp, clip
454#define psx_gpu r0
455#define v_a r1
456#define v_b r2
457#define v_c r3
458
459#define temp r14
460
461#define x_a r4
462#define x_b r5
463#define x_c r6
464#define y_a r1
465#define y_b r2
466#define y_c r3
467
468#define height_minor_a r7
469#define height_minor_b r8
470#define height_major r9
471#define height r9
472
473#define reciprocal_table_ptr r10
474
// Third (alternate) edge, kept in core registers as a 64-bit position
// (edge_alt_high:edge_alt_low) plus a 32-bit per-row step.
475#define edge_alt_low r4
476#define edge_alt_high r5
477#define edge_dx_dy_alt r6
478#define edge_shift_alt r10
479
480#define edge_dx_dy_alt_low r4
481#define edge_dx_dy_alt_high r5
482
// Output pointers used by the span loop.
483#define span_edge_data r4
484#define span_uvrg_offset r5
485#define span_b_offset r6
486
487#define clip r14
488
489#define b r11
490#define b_dy r12
491
492
493#define alternate_x q0
494#define alternate_dx_dy q1
495#define alternate_x_32 q2
496
497#define alternate_x_low d0
498#define alternate_x_high d1
499#define alternate_dx_dy_low d2
500#define alternate_dx_dy_high d3
501#define alternate_x_32_low d4
502#define alternate_x_32_high d5
503
504#define left_x q3
505#define right_x q4
506#define left_dx_dy q5
507#define right_dx_dy q6
508#define left_edge q7
509#define right_edge q8
510
511#define left_x_low d6
512#define left_x_high d7
513#define right_x_low d8
514#define right_x_high d9
515#define left_dx_dy_low d10
516#define left_dx_dy_high d11
517#define right_dx_dy_low d12
518#define right_dx_dy_high d13
519#define left_edge_low d14
520#define left_edge_high d15
521#define right_edge_low d16
522#define right_edge_high d17
523
524#define y_mid_point d18
525#define c_0x0004 d19
526
// c_0x0001 (q13) overlaps c_0xFFFE (d26) and c_0x0007 (d27), and
// span_shifts_y (q12) is the pair span_shifts (d24) / y_x4 (d25).
527#define left_right_x_16 q11
528#define span_shifts_y q12
529#define c_0x0001 q13
530
531#define span_shifts d24
532#define y_x4 d25
533#define c_0xFFFE d26
534#define c_0x0007 d27
535
536#define left_right_x_16_low d22
537#define left_right_x_16_high d23
538
539#define uvrg q14
540#define uvrg_dy q15
541
542#define alternate_x_16 d4
543
544#define v_clip q3
545#define v_clip_low d6
546
547#define right_x_32 q10
548#define left_x_32 q11
549#define alternate_select d24
550
551#define right_x_32_low d20
552#define right_x_32_high d21
553#define left_x_32_low d22
554#define left_x_32_high d23
555
// Edge setup values: lane 0 / lane 1 of these registers are what the
// left / right suffixes below refer to.
556#define edges_xy q0
557#define edges_dx_dy d2
558#define edge_shifts d3
559#define edge_shifts_64 q2
560
561#define edges_xy_left d0
562#define edges_xy_right d1
563
564#define height_reciprocals d6
565#define heights d7
566
567#define widths d8
568#define c_0x01 d9
569#define x_starts d10
570#define x_ends d11
571
572#define heights_b d12
573#define edges_dx_dy_64 q10
574
575#define edges_dx_dy_64_left d20
576#define edges_dx_dy_64_right d21
577
578
// setup_spans_prologue: common entry code of the setup_spans_* functions.
//   - pushes r4 - r11 and lr
//   - loads the signed 16-bit x (byte offset 8) and y (byte offset 10) of
//     the three vertices; the y loads overwrite the vertex pointers because
//     y_a / y_b / y_c are the same registers as v_a / v_b / v_c
//   - loads uvrg and uvrg_dy from psx_gpu and the reciprocal table pointer
//   - sets both 32-bit lanes of c_0x01 to 1
// Clobbers temp.
579#define setup_spans_prologue() \
580 stmdb sp!, { r4 - r11, lr }; \
581 \
e1f6de8f 582 ldrsh x_a, [v_a, #8]; \
583 ldrsh x_b, [v_b, #8]; \
584 ldrsh x_c, [v_c, #8]; \
585 ldrsh y_a, [v_a, #10]; \
586 ldrsh y_b, [v_b, #10]; \
587 ldrsh y_c, [v_c, #10]; \
75e28f62
E
588 \
589 add temp, psx_gpu, #psx_gpu_uvrg_offset; \
e1f6de8f 590 vld1.32 { uvrg }, [temp]; \
75e28f62 591 add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
e1f6de8f 592 vld1.32 { uvrg_dy }, [temp]; \
593 ldr reciprocal_table_ptr, [psx_gpu, #psx_gpu_reciprocal_table_ptr_offset]; \
75e28f62
E
594 \
595 vmov.u32 c_0x01, #0x01 \
596
// setup_spans_load_b: loads the blue interpolant b and its per-row step b_dy
// from psx_gpu into core registers.
597#define setup_spans_load_b() \
e1f6de8f 598 ldr b, [psx_gpu, #psx_gpu_b_offset]; \
599 ldr b_dy, [psx_gpu, #psx_gpu_b_dy_offset] \
75e28f62
E
600
// setup_spans_prologue_b: sets up the output pointers and the 16-bit
// constants used by the span loop.
//   - span_uvrg_offset, span_edge_data and span_b_offset are pointed at the
//     matching areas of psx_gpu; they reuse r5, r4 and r6, so x_a / x_b / x_c
//     and the scalar alternate edge registers are dead from here on
//   - left_edge  = viewport_start_x replicated to every 16-bit lane
//   - right_edge = viewport_end_x + 1 replicated to every 16-bit lane
//   - c_0x0004, c_0x0007 and c_0xFFFE are filled with the values in their
//     names
// c_0x0001 (q13) shares its register with c_0xFFFE / c_0x0007 (d26 / d27),
// so it is only valid up to the vadd that increments right_edge.
// Clobbers temp.
601#define setup_spans_prologue_b() \
602 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
603 add temp, psx_gpu, #psx_gpu_viewport_start_x_offset; \
604 \
605 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
606 vmov.u16 c_0x0004, #0x0004; \
607 \
608 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
609 vmov.u16 c_0x0001, #0x0001; \
610 \
e1f6de8f 611 vld1.u16 { left_edge_low[], left_edge_high[] }, [temp]; \
75e28f62
E
612 add temp, psx_gpu, #psx_gpu_viewport_end_x_offset; \
613 \
e1f6de8f 614 vld1.u16 { right_edge_low[], right_edge_high[] }, [temp]; \
75e28f62
E
615 vadd.u16 right_edge, right_edge, c_0x0001; \
616 \
617 vmov.u16 c_0x0007, #0x0007; \
618 vmvn.u16 c_0xFFFE, #0x0001 \
619
620
// compute_edge_delta_x2: sets up two edges that share one height.
// In:  height, x_starts and x_ends (two 32-bit lanes each), c_0x01,
//      reciprocal_table_ptr
// Out: edge_shifts = reciprocal_table_ptr[height] in both lanes, with bits
//                    5 - 7 of every 16-bit lane cleared
//      edges_dx_dy = (x_ends - x_starts) * (table entry >> 10)
//      edges_xy    = (x_starts * height + height - 1) * (table entry >> 10),
//                    as two 64-bit products
// Clobbers temp, heights, widths, heights_b and height_reciprocals.
621#define compute_edge_delta_x2() \
e1f6de8f 622 ldr temp, [reciprocal_table_ptr, height, lsl #2]; \
75e28f62
E
623 \
624 vdup.u32 heights, height; \
625 vsub.u32 widths, x_ends, x_starts; \
626 \
627 vdup.u32 edge_shifts, temp; \
628 vsub.u32 heights_b, heights, c_0x01; \
7d5140f5 629 vshr.u32 height_reciprocals, edge_shifts, #10; \
75e28f62
E
630 \
631 vmla.s32 heights_b, x_starts, heights; \
632 vbic.u16 edge_shifts, #0xE0; \
633 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
634 vmull.s32 edges_xy, heights_b, height_reciprocals \
635
// Scratch registers of the scalar (alternate) edge in compute_edge_delta_x3.
// width_alt is the same register as x_c and edge_dx_dy_alt (r6).
636#define width_alt r6
637#define height_reciprocal_alt r11
638#define height_b_alt r12
639
// compute_edge_delta_x3(start_c, height_a, height_b): same computation as
// compute_edge_delta_x2, but with a separate height per NEON lane (height_a
// for lane 0, height_b for lane 1) plus a third edge computed in core
// registers from height_minor_b:
//   width_alt         = x_c - start_c (overwrites x_c)
//   edge_shift_alt    = low 5 bits of reciprocal_table_ptr[height_minor_b]
//   edge_dx_dy_alt    = width_alt * (table entry >> 10)
//   edge_alt_high:low = (height_minor_b * start_c + height_minor_b - 1)
//                       * (table entry >> 10)
// The final smull writes r4 and r5, so x_a and x_b are overwritten.
// Clobbers temp, r11 and r12 in addition to the NEON scratch registers.
640#define compute_edge_delta_x3(start_c, height_a, height_b) \
ed0fd81d 641 vmov heights, height_a, height_b; \
e1f6de8f 642 ldr temp, [reciprocal_table_ptr, height_a, lsl #2]; \
75e28f62 643 vmov.u32 edge_shifts[0], temp; \
e1f6de8f 644 ldr temp, [reciprocal_table_ptr, height_b, lsl #2]; \
75e28f62 645 vmov.u32 edge_shifts[1], temp; \
e1f6de8f 646 ldr edge_shift_alt, [reciprocal_table_ptr, height_minor_b, lsl #2]; \
75e28f62
E
647 \
648 vsub.u32 widths, x_ends, x_starts; \
649 sub width_alt, x_c, start_c; \
650 \
651 vsub.u32 heights_b, heights, c_0x01; \
652 sub height_b_alt, height_minor_b, #1; \
653 \
7d5140f5
E
654 vshr.u32 height_reciprocals, edge_shifts, #10; \
655 lsr height_reciprocal_alt, edge_shift_alt, #10; \
75e28f62
E
656 \
657 vmla.s32 heights_b, x_starts, heights; \
658 mla height_b_alt, height_minor_b, start_c, height_b_alt; \
659 \
660 vbic.u16 edge_shifts, #0xE0; \
661 and edge_shift_alt, edge_shift_alt, #0x1F; \
662 \
663 vmul.s32 edges_dx_dy, widths, height_reciprocals; \
664 mul edge_dx_dy_alt, width_alt, height_reciprocal_alt; \
665 \
666 vmull.s32 edges_xy, heights_b, height_reciprocals; \
667 smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt \
668
669
// Step the packed 16-bit row numbers in y_x4 by four rows, upwards or
// downwards. The constant has 4 in every 16-bit lane but the add / subtract
// is done on 32-bit lanes.
// NOTE(review): a carry or borrow out of a low halfword would therefore
// reach the high halfword - confirm the y range makes this harmless.
670#define setup_spans_adjust_y_up() \
671 vsub.u32 y_x4, y_x4, c_0x0004 \
672
673#define setup_spans_adjust_y_down() \
674 vadd.u32 y_x4, y_x4, c_0x0004 \
675
// Step the uvrg and b interpolants by one row, upwards or downwards.
676#define setup_spans_adjust_interpolants_up() \
677 vsub.u32 uvrg, uvrg, uvrg_dy; \
678 sub b, b, b_dy \
679
680#define setup_spans_adjust_interpolants_down() \
681 vadd.u32 uvrg, uvrg, uvrg_dy; \
682 add b, b, b_dy \
683
684
// Step the interpolants by clip rows at once: b and uvrg get clip times
// their per-row step added (increment) or subtracted (decrement).
685#define setup_spans_clip_interpolants_increment() \
686 mla b, b_dy, clip, b; \
687 vmla.s32 uvrg, uvrg_dy, v_clip \
688
689#define setup_spans_clip_interpolants_decrement() \
690 mls b, b_dy, clip, b; \
691 vmls.s32 uvrg, uvrg_dy, v_clip \
692
// Advance the 64-bit scalar alternate edge by clip rows; the _no variant
// expands to nothing.
693#define setup_spans_clip_alternate_yes() \
694 smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip \
695
696#define setup_spans_clip_alternate_no() \
697
// setup_spans_clip(direction, alternate_active): skips clip rows.
// direction is increment or decrement, alternate_active is yes or no.
// Broadcasts clip into v_clip, advances the alternate edge if it is active,
// steps the interpolants in the given direction and adds
// edges_dx_dy * clip to both 64-bit lanes of edges_xy.
698#define setup_spans_clip(direction, alternate_active) \
699 vdup.u32 v_clip, clip; \
700 setup_spans_clip_alternate_##alternate_active(); \
701 setup_spans_clip_interpolants_##direction(); \
702 vmlal.s32 edges_xy, edges_dx_dy, v_clip_low \
703
704
// setup_spans_adjust_edges_alternate_no(left_index, right_index): converts
// the edge setup values into the form used by the span loop.
//   - widens edge_shifts and edges_dx_dy to 64 bits and shifts edges_xy and
//     the widened steps left by the per-lane shift
//   - left_index / right_index (left or right) are pasted onto edges_xy_ and
//     edges_dx_dy_64_ to choose which lane feeds the left and right side
//   - left_x / right_x end up holding the position for two consecutive rows
//     (low half = row 0, high half = row 0 + step)
//   - left_dx_dy / right_dx_dy hold the step in both halves, doubled so that
//     one add advances two rows
705#define setup_spans_adjust_edges_alternate_no(left_index, right_index) \
706 vmovl.s32 edge_shifts_64, edge_shifts; \
707 vmovl.s32 edges_dx_dy_64, edges_dx_dy; \
708 \
709 vshl.s64 edges_xy, edges_xy, edge_shifts_64; \
710 vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64; \
711 \
712 vmov left_x_low, edges_xy_##left_index; \
713 vmov right_x_low, edges_xy_##right_index; \
714 \
715 vmov left_dx_dy_low, edges_dx_dy_64_##left_index; \
716 vmov left_dx_dy_high, edges_dx_dy_64_##left_index; \
717 vmov right_dx_dy_low, edges_dx_dy_64_##right_index; \
718 vmov right_dx_dy_high, edges_dx_dy_64_##right_index; \
719 \
720 vadd.u64 left_x_high, left_x_low, left_dx_dy_low; \
721 vadd.u64 right_x_high, right_x_low, right_dx_dy_low; \
722 \
723 vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy; \
724 vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy \
725
726
// setup_spans_adjust_edges_alternate_yes(left_index, right_index): does the
// work of setup_spans_adjust_edges_alternate_no and then moves the scalar
// alternate edge into NEON registers in the same two-row layout.
//   - y_mid_point = y_b replicated to every 16-bit lane
//   - edge_alt_high:edge_alt_low is shifted left by edge_shift_alt as one
//     64-bit value and becomes alternate_x_low
//   - edge_dx_dy_alt is sign-extended to 64 bits and shifted the same way;
//     this overwrites r4 / r5 (edge_dx_dy_alt_low / _high), which is why the
//     edge position is moved to NEON first
//   - alternate_x_high = position one row later, alternate_dx_dy = doubled step
// Clobbers temp.
727#define setup_spans_adjust_edges_alternate_yes(left_index, right_index) \
728 setup_spans_adjust_edges_alternate_no(left_index, right_index); \
729 \
730 vdup.u16 y_mid_point, y_b; \
731 rsb temp, edge_shift_alt, #32; \
732 \
733 lsl edge_alt_high, edge_alt_high, edge_shift_alt; \
734 orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp; \
735 lsl edge_alt_low, edge_alt_low, edge_shift_alt; \
736 vmov alternate_x_low, edge_alt_low, edge_alt_high; \
737 \
738 asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp; \
739 lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt; \
740 vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high; \
741 vmov alternate_dx_dy_high, alternate_dx_dy_low; \
742 \
743 vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low; \
744 vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy \
745
746
// Build the per-row mask that selects the alternate edge: rows whose y is
// below y_mid_point when walking up, above it when walking down (signed
// 16-bit compares).
747#define setup_spans_y_select_up() \
748 vclt.s16 alternate_select, y_x4, y_mid_point \
749
750#define setup_spans_y_select_down() \
751 vcgt.s16 alternate_select, y_x4, y_mid_point \
752
753
// Insert the alternate x into the left (low half) or right (high half) side
// of left_right_x_16 for the rows selected by alternate_select.
754#define setup_spans_alternate_select_left() \
755 vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select \
756
757#define setup_spans_alternate_select_right() \
758 vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select \
759
760
// setup_spans_set_x4_alternate_yes(alternate, direction): emits four spans
// for a triangle part that has an alternate edge.
// alternate is left or right, direction is up or down.
//   1. Takes the upper 32 bits of the 64-bit left, right and alternate edge
//      positions for four rows (each q register holds two rows and is
//      advanced by its two-row step after each read) and narrows them to
//      16 bits.
//   2. On the rows chosen by setup_spans_y_select_<direction> the
//      <alternate> side is replaced by the alternate edge.
//   3. Stores uvrg and b once per row, stepping the interpolants by one row
//      after every store.
//   4. Clamps both sides to the range left_edge .. right_edge and stores,
//      per row, { left x, (width + 7) >> 3, 0xFFFE << ((width + 7) & 7), y }
//      with width = right x - left x, then steps y_x4 by four rows.
// The NEON work is interleaved with the stores, so statement order matters.
761#define setup_spans_set_x4_alternate_yes(alternate, direction) \
762 vshrn.s64 alternate_x_32_low, alternate_x, #32; \
763 vshrn.s64 left_x_32_low, left_x, #32; \
764 vshrn.s64 right_x_32_low, right_x, #32; \
765 \
766 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
767 vadd.u64 left_x, left_x, left_dx_dy; \
768 vadd.u64 right_x, right_x, right_dx_dy; \
769 \
770 vshrn.s64 alternate_x_32_high, alternate_x, #32; \
771 vshrn.s64 left_x_32_high, left_x, #32; \
772 vshrn.s64 right_x_32_high, right_x, #32; \
773 \
774 vadd.u64 alternate_x, alternate_x, alternate_dx_dy; \
775 vadd.u64 left_x, left_x, left_dx_dy; \
776 vadd.u64 right_x, right_x, right_dx_dy; \
777 \
778 vmovn.u32 alternate_x_16, alternate_x_32; \
779 setup_spans_y_select_##direction(); \
780 vmovn.u32 left_right_x_16_low, left_x_32; \
781 \
782 vmovn.u32 left_right_x_16_high, right_x_32; \
783 setup_spans_alternate_select_##alternate(); \
784 \
e1f6de8f 785 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
786 str b, [span_b_offset], #4; \
75e28f62
E
787 setup_spans_adjust_interpolants_##direction(); \
788 \
789 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
790 \
e1f6de8f 791 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
792 str b, [span_b_offset], #4; \
75e28f62
E
793 setup_spans_adjust_interpolants_##direction(); \
794 \
795 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
796 \
e1f6de8f 797 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
798 str b, [span_b_offset], #4; \
75e28f62
E
799 setup_spans_adjust_interpolants_##direction(); \
800 \
801 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
802 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
803 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
804 \
e1f6de8f 805 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
806 str b, [span_b_offset], #4; \
75e28f62
E
807 setup_spans_adjust_interpolants_##direction(); \
808 \
809 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
810 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
811 \
e1f6de8f 812 vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!; \
75e28f62
E
813 \
814 setup_spans_adjust_y_##direction() \
815
816
// setup_spans_set_x4_alternate_no(alternate, direction): emits four spans
// for a triangle part bounded by the left and right edge only. The alternate
// argument is accepted but not used; direction is up or down.
//   1. Takes the upper 32 bits of the 64-bit left and right edge positions
//      for four rows (each q register holds two rows and is advanced by its
//      two-row step after each read) and narrows them to 16 bits.
//   2. Stores uvrg and b once per row, stepping the interpolants by one row
//      after every store.
//   3. Clamps both sides to the range left_edge .. right_edge and stores,
//      per row, { left x, (width + 7) >> 3, 0xFFFE << ((width + 7) & 7), y }
//      with width = right x - left x, then steps y_x4 by four rows.
817#define setup_spans_set_x4_alternate_no(alternate, direction) \
818 vshrn.s64 left_x_32_low, left_x, #32; \
819 vshrn.s64 right_x_32_low, right_x, #32; \
820 \
821 vadd.u64 left_x, left_x, left_dx_dy; \
822 vadd.u64 right_x, right_x, right_dx_dy; \
823 \
824 vshrn.s64 left_x_32_high, left_x, #32; \
825 vshrn.s64 right_x_32_high, right_x, #32; \
826 \
827 vadd.u64 left_x, left_x, left_dx_dy; \
828 vadd.u64 right_x, right_x, right_dx_dy; \
829 \
830 vmovn.u32 left_right_x_16_low, left_x_32; \
831 vmovn.u32 left_right_x_16_high, right_x_32; \
832 \
e1f6de8f 833 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
834 str b, [span_b_offset], #4; \
75e28f62
E
835 setup_spans_adjust_interpolants_##direction(); \
836 \
837 vmax.s16 left_right_x_16, left_right_x_16, left_edge; \
838 \
e1f6de8f 839 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
840 str b, [span_b_offset], #4; \
75e28f62
E
841 setup_spans_adjust_interpolants_##direction(); \
842 \
843 vmin.s16 left_right_x_16, left_right_x_16, right_edge; \
844 \
e1f6de8f 845 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
846 str b, [span_b_offset], #4; \
75e28f62
E
847 setup_spans_adjust_interpolants_##direction(); \
848 \
849 vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low; \
850 vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007; \
851 vand.u16 span_shifts, left_right_x_16_high, c_0x0007; \
852 \
e1f6de8f 853 vst1.u32 { uvrg }, [span_uvrg_offset, :128]!; \
854 str b, [span_b_offset], #4; \
75e28f62
E
855 setup_spans_adjust_interpolants_##direction(); \
856 \
857 vshl.u16 span_shifts, c_0xFFFE, span_shifts; \
858 vshr.u16 left_right_x_16_high, left_right_x_16_high, #3; \
859 \
e1f6de8f 860 vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!; \
75e28f62
E
861 \
862 setup_spans_adjust_y_##direction() \
863
864
// Scratch pair for the 64-bit product below. r11 / r12 are also b / b_dy,
// so this has to run before setup_spans_load_b.
865#define edge_adjust_low r11
866#define edge_adjust_high r12
867
// setup_spans_alternate_adjust_yes: subtracts edge_dx_dy_alt * height_minor_a
// from the 64-bit alternate edge position (edge_alt_high:edge_alt_low).
// The _no variant expands to nothing.
868#define setup_spans_alternate_adjust_yes() \
869 smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a; \
870 subs edge_alt_low, edge_alt_low, edge_adjust_low; \
871 sbc edge_alt_high, edge_alt_high, edge_adjust_high \
872
873#define setup_spans_alternate_adjust_no() \
874
875
// setup_spans_down(left_index, right_index, alternate, alternate_active):
// generates the spans of a triangle part walked from y_a downwards.
//   - runs setup_spans_alternate_adjust_<alternate_active>, then loads b and
//     b_dy
//   - if y_c is greater than viewport_end_y, height becomes
//     height - (y_c - viewport_end_y) + 1
//   - if y_a is less than viewport_start_y, the rows in between are skipped:
//     height and y_a are adjusted and setup_spans_clip advances the edges
//     and interpolants
//   - nothing is emitted when the remaining height is <= 0
//   - y_x4 = { y_a, y_a + 1, y_a + 2, y_a + 3 } as 16-bit lanes
//   - stores height as a halfword at psx_gpu_num_spans_offset and runs the
//     four-span macro until height is used up; spans are always written four
//     at a time, so up to three records beyond that count can be written
// left_index / right_index pick the edge lane used for each side, alternate
// is left, right or none, alternate_active is yes or no.
// Uses the numeric local labels 0, 1 and 2 and clobbers temp / clip.
876#define setup_spans_down(left_index, right_index, alternate, alternate_active) \
877 setup_spans_alternate_adjust_##alternate_active(); \
878 setup_spans_load_b(); \
879 \
e1f6de8f 880 ldrsh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]; \
75e28f62
E
881 subs y_c, y_c, temp; \
882 subgt height, height, y_c; \
883 addgt height, height, #1; \
884 \
e1f6de8f 885 ldrsh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]; \
75e28f62
E
886 subs clip, temp, y_a; \
887 ble 0f; \
888 \
889 sub height, height, clip; \
890 add y_a, y_a, clip; \
891 setup_spans_clip(increment, alternate_active); \
892 \
893 0: \
894 cmp height, #0; \
895 ble 1f; \
896 \
897 orr temp, y_a, y_a, lsl #16; \
898 add temp, temp, #(1 << 16); \
899 add y_a, temp, #2; \
900 add y_a, y_a, #(2 << 16); \
ed0fd81d 901 vmov y_x4, temp, y_a; \
75e28f62
E
902 \
903 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
904 right_index); \
905 setup_spans_prologue_b(); \
906 \
e1f6de8f 907 strh height, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
908 \
909 2: \
910 setup_spans_set_x4_alternate_##alternate_active(alternate, down); \
911 subs height, height, #4; \
912 bhi 2b; \
913 \
914 1: \
915
916
// setup_spans_alternate_pre_increment_yes: adds the sign-extended 32-bit step
// edge_dx_dy_alt to the 64-bit alternate edge position, advancing it by one
// row. The _no variant expands to nothing.
917#define setup_spans_alternate_pre_increment_yes() \
918 adds edge_alt_low, edge_alt_low, edge_dx_dy_alt; \
919 adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31 \
920
921#define setup_spans_alternate_pre_increment_no() \
922
923
// setup_spans_up_decrement_yes: conditionally (le) decrements height; it
// relies on the flags of the instruction placed just before it by
// setup_spans_up. The _no variant expands to nothing.
924#define setup_spans_up_decrement_yes() \
925 suble height, height, #1 \
926
927#define setup_spans_up_decrement_no() \
928
929
// setup_spans_up(left_index, right_index, alternate, alternate_active):
// generates the spans of a triangle part walked upwards, starting at the row
// y_a - 1.
//   - runs setup_spans_alternate_adjust_<alternate_active>, loads b and b_dy
//     and decrements y_a
//   - if viewport_start_y is greater than y_c, height is reduced by the
//     difference; otherwise, when the alternate edge is active, height is
//     reduced by one
//   - if y_a is greater than viewport_end_y, the rows in between are skipped:
//     height and y_a are adjusted and setup_spans_clip steps the edges and
//     interpolants
//   - nothing is emitted when the remaining height is <= 0
//   - y_x4 = { y_a, y_a - 1, y_a - 2, y_a - 3 } as 16-bit lanes
//   - edges_xy, the alternate edge (if active) and the interpolants are
//     stepped by one row before the loop
//   - stores height as a halfword at psx_gpu_num_spans_offset and runs the
//     four-span macro until height is used up; spans are always written four
//     at a time, so up to three records beyond that count can be written
// Uses the numeric local labels 0, 1 and 2 and clobbers temp / clip.
// NOTE(review): the viewport bounds are loaded with ldrh here but with ldrsh
// in setup_spans_down - confirm the bounds can never be negative.
930#define setup_spans_up(left_index, right_index, alternate, alternate_active) \
931 setup_spans_alternate_adjust_##alternate_active(); \
932 setup_spans_load_b(); \
933 sub y_a, y_a, #1; \
934 \
e1f6de8f 935 ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]; \
75e28f62
E
936 subs temp, temp, y_c; \
937 subgt height, height, temp; \
938 setup_spans_up_decrement_##alternate_active(); \
939 \
e1f6de8f 940 ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]; \
75e28f62
E
941 subs clip, y_a, temp; \
942 ble 0f; \
943 \
944 sub height, height, clip; \
945 sub y_a, y_a, clip; \
946 setup_spans_clip(decrement, alternate_active); \
947 \
948 0: \
949 cmp height, #0; \
950 ble 1f; \
951 \
952 orr temp, y_a, y_a, lsl #16; \
953 sub temp, temp, #(1 << 16); \
954 sub y_a, temp, #2; \
955 sub y_a, y_a, #(2 << 16); \
ed0fd81d 956 vmov y_x4, temp, y_a; \
75e28f62
E
957 \
958 vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
959 \
960 setup_spans_alternate_pre_increment_##alternate_active(); \
961 setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
962 right_index); \
963 setup_spans_adjust_interpolants_up(); \
964 setup_spans_prologue_b(); \
965 \
e1f6de8f 966 strh height, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
967 \
968 2: \
969 setup_spans_set_x4_alternate_##alternate_active(alternate, up); \
970 subs height, height, #4; \
971 bhi 2b; \
972 \
973 1: \
974
975
// setup_spans_epilogue: restores the registers pushed by
// setup_spans_prologue and returns by loading the saved lr into pc.
976#define setup_spans_epilogue() \
977 ldmia sp!, { r4 - r11, pc } \
978
979
// setup_spans_up_up(minor, major): complete body of a setup_spans function
// for a triangle walked upwards from vertex a, with vertex b in between and
// vertex c at the far end.
//   height_minor_a = y_a - y_b, height_minor_b = y_b - y_c,
//   height = y_a - y_c
//   NEON lane 0: edge from x_a to x_c over height_major
//   NEON lane 1: edge from x_a to x_b over height_minor_a
//   alternate:   edge from x_b to x_c over height_minor_b
// minor / major are left or right and say which lane name is used for which
// side; the alternate edge takes over the minor side.
980#define setup_spans_up_up(minor, major) \
981 setup_spans_prologue(); \
982 sub height_minor_a, y_a, y_b; \
983 sub height_minor_b, y_b, y_c; \
984 sub height, y_a, y_c; \
985 \
986 vdup.u32 x_starts, x_a; \
ed0fd81d 987 vmov x_ends, x_c, x_b; \
75e28f62
E
988 \
989 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
990 setup_spans_up(major, minor, minor, yes); \
991 setup_spans_epilogue() \
992
@ setup_spans_up_left: r0 = psx_gpu, r1 - r3 = v_a, v_b, v_c.
@ Upward walk with the alternate edge on the left side.
993function(setup_spans_up_left)
994 setup_spans_up_up(left, right)
995
@ setup_spans_up_right: r0 = psx_gpu, r1 - r3 = v_a, v_b, v_c.
@ Upward walk with the alternate edge on the right side.
996function(setup_spans_up_right)
997 setup_spans_up_up(right, left)
998
75e28f62
E
// setup_spans_down_down(minor, major): complete body of a setup_spans
// function for a triangle walked downwards from vertex a, with vertex b in
// between and vertex c at the far end.
//   height_minor_a = y_b - y_a, height_minor_b = y_c - y_b,
//   height = y_c - y_a
//   NEON lane 0: edge from x_a to x_c over height_major
//   NEON lane 1: edge from x_a to x_b over height_minor_a
//   alternate:   edge from x_b to x_c over height_minor_b
// minor / major are left or right and say which lane name is used for which
// side; the alternate edge takes over the minor side.
999#define setup_spans_down_down(minor, major) \
1000 setup_spans_prologue(); \
1001 sub height_minor_a, y_b, y_a; \
1002 sub height_minor_b, y_c, y_b; \
1003 sub height, y_c, y_a; \
1004 \
1005 vdup.u32 x_starts, x_a; \
ed0fd81d 1006 vmov x_ends, x_c, x_b; \
75e28f62
E
1007 \
1008 compute_edge_delta_x3(x_b, height_major, height_minor_a); \
1009 setup_spans_down(major, minor, minor, yes); \
1010 setup_spans_epilogue() \
1011
@ setup_spans_down_left: r0 = psx_gpu, r1 - r3 = v_a, v_b, v_c.
@ Downward walk with the alternate edge on the left side.
1012function(setup_spans_down_left)
1013 setup_spans_down_down(left, right)
1014
@ setup_spans_down_right: r0 = psx_gpu, r1 - r3 = v_a, v_b, v_c.
@ Downward walk with the alternate edge on the right side.
1015function(setup_spans_down_right)
1016 setup_spans_down_down(right, left)
1017
1018
// setup_spans_up_flat: shared tail of setup_spans_up_a / setup_spans_up_b.
// Expects x_starts and x_ends to be set by the caller; uses the single
// height y_a - y_c for both edges, emits the spans upwards with no alternate
// edge and returns.
1019#define setup_spans_up_flat() \
1020 sub height, y_a, y_c; \
1021 \
1022 compute_edge_delta_x2(); \
1023 setup_spans_up(left, right, none, no); \
1024 setup_spans_epilogue() \
1025
@ setup_spans_up_a: r0 = psx_gpu, r1 - r3 = v_a, v_b, v_c.
@ The left edge starts at x_a, the right edge at x_b, and both end at x_c.
1026function(setup_spans_up_a)
1027 setup_spans_prologue()
1028
ed0fd81d 1029 vmov x_starts, x_a, x_b
75e28f62
E
1030 vdup.u32 x_ends, x_c
1031
1032 setup_spans_up_flat()
1033
@ setup_spans_up_b: r0 = psx_gpu, r1 - r3 = v_a, v_b, v_c.
@ Both edges start at x_a; the left edge ends at x_b, the right edge at x_c.
1034function(setup_spans_up_b)
1035 setup_spans_prologue()
1036
1037 vdup.u32 x_starts, x_a
ed0fd81d 1038 vmov x_ends, x_b, x_c
75e28f62
E
1039
1040 setup_spans_up_flat()
1041
// setup_spans_down_flat: shared tail of setup_spans_down_a /
// setup_spans_down_b. Expects x_starts and x_ends to be set by the caller;
// uses the single height y_c - y_a for both edges, emits the spans downwards
// with no alternate edge and returns.
1042#define setup_spans_down_flat() \
1043 sub height, y_c, y_a; \
1044 \
1045 compute_edge_delta_x2(); \
1046 setup_spans_down(left, right, none, no); \
1047 setup_spans_epilogue() \
1048
@ setup_spans_down_a: r0 = psx_gpu, r1 - r3 = v_a, v_b, v_c.
@ The left edge starts at x_a, the right edge at x_b, and both end at x_c.
1049function(setup_spans_down_a)
1050 setup_spans_prologue()
1051
ed0fd81d 1052 vmov x_starts, x_a, x_b
75e28f62
E
1053 vdup.u32 x_ends, x_c
1054
1055 setup_spans_down_flat()
1056
@ setup_spans_down_b: r0 = psx_gpu, r1 - r3 = v_a, v_b, v_c.
@ Both edges start at x_a; the left edge ends at x_b, the right edge at x_c.
1057function(setup_spans_down_b)
1058 setup_spans_prologue()
1059
1060 vdup.u32 x_starts, x_a
ed0fd81d 1061 vmov x_ends, x_b, x_c
75e28f62
E
1062
1063 setup_spans_down_flat()
1064
1065
// Extra aliases for setup_spans_up_down. The names ending in _b hold a
// second set of edge values (position, step and shift) in q11 and q13;
// edges_dx_dy_and_shifts (q1) covers edges_dx_dy (d2) and edge_shifts (d3),
// and edges_dx_dy_and_shifts_b (q13) covers the two _b counterparts.
// middle_y is the same register as height / height_major (r9).
1066#define middle_y r9
1067
1068#define edges_xy_b q11
1069#define edges_dx_dy_b d26
1070#define edge_shifts_b d27
1071#define edges_dx_dy_and_shifts_b q13
1072#define height_increment d20
1073
1074#define edges_dx_dy_and_shifts q1
1075
1076#define edges_xy_b_left d22
1077#define edges_xy_b_right d23
1078
// setup_spans_up_down_load_edge_set_b: copies edge set b into the working
// registers edges_xy, edges_dx_dy and edge_shifts.
1079#define setup_spans_up_down_load_edge_set_b() \
1080 vmov edges_xy, edges_xy_b; \
1081 vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b \
1082
1083
// Entry point setup_spans_up_down: builds spans in two passes that both
// start from row y_a (saved in middle_y). The first pass walks upward
// for height_minor_a rows, the second walks downward for height_minor_b
// rows. Both passes are clipped against viewport_start_y and
// viewport_end_y read from psx_gpu. The span count is accumulated in
// psx_gpu num_spans.
// Helper macros (setup_spans_prologue, compute_edge_delta_x3,
// setup_spans_load_b, setup_spans_clip, setup_spans_set_x4_alternate_no,
// ...) are defined outside this excerpt.
1084function(setup_spans_up_down)
1085 setup_spans_prologue()
1086
1087 // s32 middle_y = y_a;
// height_minor_a = y_a - y_b, height_minor_b = y_c - y_a,
// height_major = y_c - y_b.
1088 sub height_minor_a, y_a, y_b
1089 sub height_minor_b, y_c, y_a
1090 sub height_major, y_c, y_b
1091
// x_starts = { x_a, x_c }; x_ends = x_b in every 32-bit lane.
ed0fd81d 1092 vmov x_starts, x_a, x_c
75e28f62
E
1093 vdup.u32 x_ends, x_b
1094
1095 compute_edge_delta_x3(x_a, height_minor_a, height_major)
1096
// height_increment = { 0, height_minor_b }, then widening
// multiply-accumulate: edges_xy += edges_dx_dy * height_increment.
// Lane 0 is multiplied by 0, so only the second lane is advanced.
1097 mov temp, #0
ed0fd81d 1098 vmov height_increment, temp, height_minor_b
75e28f62
E
1099 vmlal.s32 edges_xy, edges_dx_dy, height_increment
1100
// Build edge set b: left position from edge_alt_low/high, right
// position copied from the current right edge.
1101 vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1102 vmov edges_xy_b_right, edges_xy_right
1103
// Shifts for set b: copy of the current shifts with lane 0 replaced by
// edge_shift_alt.
1104 vmov edge_shifts_b, edge_shifts
1105 vmov.u32 edge_shifts_b[0], edge_shift_alt
1106
// Deltas for set b: negated current deltas with lane 0 replaced by
// edge_dx_dy_alt.
1107 vneg.s32 edges_dx_dy_b, edges_dx_dy
1108 vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1109
// Remember the starting row for the second pass.
1110 mov middle_y, y_a
1111
1112 setup_spans_load_b()
// The upward pass starts one row above y_a.
1113 sub y_a, y_a, #1
1114
// Clip against viewport_start_y: if viewport_start_y - y_b > 0, drop
// that many rows from height_minor_a.
e1f6de8f 1115 ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
75e28f62
E
1116 subs temp, temp, y_b
1117 subgt height_minor_a, height_minor_a, temp
1118
// Clip against viewport_end_y: clip = y_a - viewport_end_y. When it is
// positive, skip clip rows (shorten the pass, move y_a up) and let
// setup_spans_clip(decrement, no) adjust the remaining state.
e1f6de8f 1119 ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
75e28f62
E
1120 subs clip, y_a, temp
1121 ble 0f
1122
1123 sub height_minor_a, height_minor_a, clip
1124 sub y_a, y_a, clip
1125 setup_spans_clip(decrement, no)
1126
1127 0:
// Nothing left in the upward pass: branch to 3:, which loads edge set b
// and continues with the downward pass.
1128 cmp height_minor_a, #0
1129 ble 3f
1130
// Pack four consecutive descending row numbers as 16-bit fields:
// temp = (y_a, y_a - 1), y_a = (y_a - 2, y_a - 3), low halfword first.
// Assumes y_a fits in 16 bits with no borrow between halfwords.
1131 orr temp, y_a, y_a, lsl #16
1132 sub temp, temp, #(1 << 16)
1133 sub y_a, temp, #2
1134 sub y_a, y_a, #(2 << 16)
ed0fd81d 1135 vmov y_x4, temp, y_a
75e28f62
E
1136
1137 vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1138
// The upward pass contributes height_minor_a spans.
e1f6de8f 1139 strh height_minor_a, [psx_gpu, #psx_gpu_num_spans_offset]
75e28f62
E
1140
1141 setup_spans_adjust_edges_alternate_no(left, right);
1142 setup_spans_adjust_interpolants_up()
1143 setup_spans_up_down_load_edge_set_b()
1144
1145 setup_spans_prologue_b()
1146
1147
1148 2:
// Upward loop: the counter drops by 4 per iteration and the loop
// repeats while it stays above zero.
1149 setup_spans_set_x4_alternate_no(none, up)
1150 subs height_minor_a, height_minor_a, #4
1151 bhi 2b
1152
// height_minor_a is now zero or negative (rounding overshoot). Adding
// it scaled by the per-span strides (8, 16 and 4 bytes) steps the three
// output pointers back by the overshoot.
1153 add span_edge_data, span_edge_data, height_minor_a, lsl #3
1154 add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1155 add span_b_offset, span_b_offset, height_minor_a, lsl #2
1156
1157 4:
// Second pass: reload uvrg from psx_gpu and restart from the saved
// middle row.
1158 add temp, psx_gpu, #psx_gpu_uvrg_offset
e1f6de8f 1159 vld1.32 { uvrg }, [temp]
75e28f62
E
1160 mov y_a, middle_y
1161
1162 setup_spans_load_b()
1163
// Clip against viewport_end_y: when y_c - viewport_end_y > 0, shorten
// height_minor_b by that amount minus one.
e1f6de8f 1164 ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
75e28f62
E
1165 subs y_c, y_c, temp
1166 subgt height_minor_b, height_minor_b, y_c
1167 addgt height_minor_b, height_minor_b, #1
1168
// Clip against viewport_start_y: clip = viewport_start_y - y_a. When it
// is positive, skip clip rows (shorten the pass, move y_a down) and let
// setup_spans_clip(increment, no) adjust the remaining state.
e1f6de8f 1169 ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
75e28f62
E
1170 subs clip, temp, y_a
1171 ble 0f
1172
1173 sub height_minor_b, height_minor_b, clip
1174 add y_a, y_a, clip
1175 setup_spans_clip(increment, no)
1176
1177 0:
// Nothing left in the downward pass: go straight to the epilogue.
1178 cmp height_minor_b, #0
1179 ble 1f
1180
// Pack four consecutive ascending row numbers as 16-bit fields:
// temp = (y_a, y_a + 1), y_a = (y_a + 2, y_a + 3), low halfword first.
1181 orr temp, y_a, y_a, lsl #16
1182 add temp, temp, #(1 << 16)
1183 add y_a, temp, #2
1184 add y_a, y_a, #(2 << 16)
ed0fd81d 1185 vmov y_x4, temp, y_a
75e28f62
E
1186
1187 setup_spans_adjust_edges_alternate_no(left, right)
1188
// New span total = stored num_spans + height_minor_b. A total of
// exactly MAX_SPANS is diverted to the special case at 5:.
e1f6de8f 1189 ldrh temp, [psx_gpu, #psx_gpu_num_spans_offset]
75e28f62 1190 add temp, temp, height_minor_b
b7569147 1191
1192 cmp temp, #MAX_SPANS
1193 beq 5f
1194
e1f6de8f 1195 strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
75e28f62
E
1196
1197 2:
// Downward loop: the counter drops by 4 per iteration and the loop
// repeats while it stays above zero.
1198 setup_spans_set_x4_alternate_no(none, down)
1199 subs height_minor_b, height_minor_b, #4
1200 bhi 2b
1201
1202 1:
1203 setup_spans_epilogue()
1204
1205 3:
// Reached when the upward pass is empty: load edge set b, run the
// second prologue and join the downward pass at 4:.
1206 setup_spans_up_down_load_edge_set_b()
1207 setup_spans_prologue_b()
1208 bal 4b
1209
b7569147 1210 5:
1211 // FIXME: overflow corner case
// Total hit MAX_SPANS: undo the add, round height_minor_b down to a
// multiple of 4 (bics also sets the flags), re-add and store the
// reduced total. The flags from bics survive the add/strh, so bne
// re-enters the downward loop only when rows remain; otherwise the
// code branches to the epilogue.
1212 sub temp, temp, height_minor_b
1213 bics height_minor_b, #3
1214 add temp, temp, height_minor_b
e1f6de8f 1215 strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
b7569147 1216 bne 2b
1217 bal 1b
1218
75e28f62
E
// Register aliases for the setup_blocks_* routines that follow.
// Names carried over from the span-setup code are dropped first and
// then re-bound to new registers below.
1219#undef span_uvrg_offset
1220#undef span_edge_data
1221#undef span_b_offset
1222#undef left_x
1223#undef b
1224
// Core registers. Several names deliberately share a register and are
// only valid in different phases of a routine, e.g. r10 is
// dither_offset_ptr, dither_row, block_ptr_b and block_span_ptr, and
// r8 is left_x, dither_shift, b_dx4 and right_mask.
1225#define psx_gpu r0
1226#define num_spans r1
1227#define span_uvrg_offset r2
1228#define span_edge_data r3
1229#define span_b_offset r4
1230#define b_dx r5
1231#define span_num_blocks r6
1232#define y r7
1233#define left_x r8
1234#define b r9
1235#define dither_offset_ptr r10
1236#define block_ptr_a r11
1237#define fb_ptr r12
1238#define num_blocks r14
1239
1240#define uvrg_dx_ptr r2
1241#define texture_mask_ptr r3
1242#define dither_shift r8
1243#define dither_row r10
1244
1245#define c_32 r7
1246#define b_dx4 r8
1247#define b_dx8 r9
1248#define block_ptr_b r10
1249
1250#define block_span_ptr r10
1251#define right_mask r8
1252
1253#define color r2
1254#define color_r r3
1255#define color_g r4
1256#define color_b r5
1257
1258#undef uvrg
1259
// NEON registers: 32-bit per-pixel accumulators, one q register per
// channel.
1260#define u_block q0
1261#define v_block q1
1262#define r_block q2
1263#define g_block q3
1264#define b_block q4
1265
// Per-4-pixel and per-8-pixel deltas. uv_dx8 and rg_dx8 are defined
// twice in this list with identical values.
1266#define uv_dx4 d10
1267#define rg_dx4 d11
1268#define uv_dx8 d12
1269#define rg_dx8 d13
1270#define b_whole_8 d14
1271#define fb_mask_ptrs d15
1272
1273#define uvrg_dx4 q5
1274#define uvrg_dx8 q6
1275#define uv_dx8 d12
1276#define rg_dx8 d13
1277
// 16-bit narrowed channel values (q view and its two d halves).
1278#define u_whole q8
1279#define v_whole q9
1280#define r_whole q10
1281#define g_whole q11
1282#define b_whole q12
1283
1284#define u_whole_low d16
1285#define u_whole_high d17
1286#define v_whole_low d18
1287#define v_whole_high d19
1288#define r_whole_low d20
1289#define r_whole_high d21
1290#define g_whole_low d22
1291#define g_whole_high d23
1292#define b_whole_low d24
1293#define b_whole_high d25
1294
// q13 is used as scratch for the broadcast deltas and also holds the
// final 8-bit u/v pair (d26/d27).
1295#define dx4 q13
1296#define dx8 q13
1297
1298#define u_whole_8 d26
1299#define v_whole_8 d27
1300#define u_whole_8b d24
1301#define r_whole_8 d24
1302#define g_whole_8 d25
1303
1304#define uv_whole_8 q13
1305#define uv_whole_8b q14
1306
1307#define dither_offsets q14
1308#define texture_mask q15
1309#define texture_mask_u d30
1310#define texture_mask_v d31
1311
1312#define dither_offsets_short d28
1313
// Per-span temporaries; these overlap the *_whole registers above
// (q8, q9, q10).
1314#define v_left_x q8
1315#define uvrg q9
1316#define block_span q10
1317
1318#define uv d18
1319#define rg d19
1320
// Mask registers; these overlap u_block / v_block (q0, q1) and q13.
1321#define draw_mask q1
1322#define draw_mask_edge q13
1323#define test_mask q0
1324
1325#define uvrg_dx q3
1326
1327#define colors q2
1328
// Texture coordinate swizzle applied per byte lane to the 8-bit u/v
// values. The masked u is first saved in u_whole_8b, then:
//   u = (v << 4) | (u & 0x0F)          (vsli: shift left and insert)
//   v = (v & 0xF0) | (saved_u >> 4)    (vsri: shift right and insert)
1329#define setup_blocks_texture_swizzled() \
1330 vand.u8 u_whole_8b, u_whole_8, texture_mask_u; \
1331 vsli.u8 u_whole_8, v_whole_8, #4; \
1332 vsri.u8 v_whole_8, u_whole_8b, #4 \
1333
// Unswizzled variant: expands to nothing, u/v are used as they are.
1334#define setup_blocks_texture_unswizzled() \
1335
1336
// Generates setup_blocks_shaded_textured_dithered_<swizzling>_indirect.
//
// In:  r0 = psx_gpu. Returns immediately when num_spans is 0.
// Saves/restores r4-r11 and returns through the saved lr.
//
// For every span (8 bytes of span_edge_data, 16 bytes of
// span_uvrg_offset and 4 bytes of span_b_offset per span) the routine
// appends span_num_blocks 64-byte blocks, 8 pixels each, to the psx_gpu
// blocks buffer, starting at entry num_blocks:
//   +0   interleaved 8-bit u/v (masked with texture_mask, optionally
//        swizzled by setup_blocks_texture_<swizzling>)
//   +16  r_whole_8, g_whole_8
//   +32  b_whole_8, fb_mask_ptrs (lane 1 = framebuffer address of the
//        block, lane 0 = right_mask on the last block of a span, else 0)
//   +48  dither_offsets
// u, v, r, g and b are started at value + delta * left_x and advanced
// by the x8 deltas after every block. The framebuffer address is
// vram_out_ptr + (y << 11) + (left_x << 1), plus 16 bytes per block.
// On the last block of a span, right_mask is expanded against the 16
// bytes at the start of psx_gpu (test_mask) and the u/v entries
// selected by it are cleared.
//
// Label 2: handles num_blocks exceeding MAX_BLOCKS. It calls
// flush_render_block_buffer with the caller-saved state preserved,
// rebuilds uvrg_dx8 from uvrg_dx4, and restarts writing at the first
// block with num_blocks = span_num_blocks.
//
// Local labels: 0 = per-span loop, 3 = span setup, 4 = block loop,
// 5 = last block of the span, 1 = advance to the next span.
1337#define setup_blocks_shaded_textured_builder(swizzling) \
1338.align 3; \
1339 \
1340function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect) \
e1f6de8f 1341 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
1342 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1343 \
e1f6de8f 1344 vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128]; \
75e28f62
E
1345 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1346 \
1347 cmp num_spans, #0; \
1348 bxeq lr; \
1349 \
1350 stmdb sp!, { r4 - r11, r14 }; \
1351 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1352 \
e1f6de8f 1353 ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
75e28f62
E
1354 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1355 \
e1f6de8f 1356 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16]; \
75e28f62
E
1357 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1358 \
e1f6de8f 1359 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
1360 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1361 \
1362 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
1363 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1364 \
1365 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1366 \
1367 0: \
1368 vmov.u8 fb_mask_ptrs, #0; \
1369 \
e1f6de8f 1370 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
75e28f62
E
1371 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1372 \
e1f6de8f 1373 ldrh y, [span_edge_data, #edge_data_y_offset]; \
1374 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
75e28f62
E
1375 \
1376 cmp span_num_blocks, #0; \
1377 beq 1f; \
1378 \
e1f6de8f 1379 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
75e28f62
E
1380 add num_blocks, span_num_blocks, num_blocks; \
1381 \
1382 cmp num_blocks, #MAX_BLOCKS; \
1383 bgt 2f; \
1384 \
1385 3: \
e1f6de8f 1386 ldr b, [span_b_offset]; \
75e28f62
E
1387 add fb_ptr, fb_ptr, y, lsl #11; \
1388 \
1389 vdup.u32 v_left_x, left_x; \
1390 and y, y, #0x3; \
1391 \
e1f6de8f 1392 ldr dither_row, [dither_offset_ptr, y, lsl #2]; \
75e28f62
E
1393 add fb_ptr, fb_ptr, left_x, lsl #1; \
1394 \
1395 mla b, b_dx, left_x, b; \
1396 and dither_shift, left_x, #0x03; \
1397 \
e1f6de8f 1398 vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
75e28f62
E
1399 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1400 \
1401 mov dither_shift, dither_shift, lsl #3; \
1402 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1403 \
1404 mov c_32, #32; \
1405 subs span_num_blocks, span_num_blocks, #1; \
1406 \
1407 mov dither_row, dither_row, ror dither_shift; \
1408 mov b_dx4, b_dx, lsl #2; \
1409 \
1410 vdup.u32 dither_offsets_short, dither_row; \
1411 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1412 \
1413 vdup.u32 b_block, b; \
1414 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1415 \
1416 vdup.u32 u_block, uv[0]; \
1417 mov b_dx8, b_dx, lsl #3; \
1418 \
1419 vdup.u32 v_block, uv[1]; \
1420 vdup.u32 r_block, rg[0]; \
1421 vdup.u32 g_block, rg[1]; \
1422 \
e1f6de8f 1423 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1424 \
1425 vadd.u32 u_block, u_block, block_span; \
e1f6de8f 1426 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1427 \
1428 vadd.u32 v_block, v_block, block_span; \
e1f6de8f 1429 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1430 \
1431 vadd.u32 r_block, r_block, block_span; \
e1f6de8f 1432 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1433 \
1434 vadd.u32 g_block, g_block, block_span; \
e1f6de8f 1435 vld1.u32 { block_span }, [block_span_ptr, :128]; \
75e28f62
E
1436 \
1437 vadd.u32 b_block, b_block, block_span; \
1438 add block_ptr_b, block_ptr_a, #16; \
1439 \
1440 vshrn.u32 u_whole_low, u_block, #16; \
1441 vshrn.u32 v_whole_low, v_block, #16; \
1442 vshrn.u32 r_whole_low, r_block, #16; \
1443 vshrn.u32 g_whole_low, g_block, #16; \
1444 \
1445 vdup.u32 dx4, uv_dx4[0]; \
1446 vshrn.u32 b_whole_low, b_block, #16; \
1447 \
1448 vaddhn.u32 u_whole_high, u_block, dx4; \
1449 vdup.u32 dx4, uv_dx4[1]; \
1450 \
1451 vaddhn.u32 v_whole_high, v_block, dx4; \
1452 vdup.u32 dx4, rg_dx4[0]; \
1453 \
1454 vaddhn.u32 r_whole_high, r_block, dx4; \
1455 vdup.u32 dx4, rg_dx4[1]; \
1456 \
1457 vaddhn.u32 g_whole_high, g_block, dx4; \
1458 vdup.u32 dx4, b_dx4; \
1459 \
1460 vaddhn.u32 b_whole_high, b_block, dx4; \
1461 vdup.u32 dx8, uv_dx8[0]; \
1462 \
1463 vadd.u32 u_block, u_block, dx8; \
1464 vdup.u32 dx8, uv_dx8[1]; \
1465 \
1466 vadd.u32 v_block, v_block, dx8; \
1467 vdup.u32 dx8, rg_dx8[0]; \
1468 \
1469 vadd.u32 r_block, r_block, dx8; \
1470 vdup.u32 dx8, rg_dx8[1]; \
1471 \
1472 vadd.u32 g_block, g_block, dx8; \
1473 vdup.u32 dx8, b_dx8; \
1474 \
1475 vadd.u32 b_block, b_block, dx8; \
1476 vmovn.u16 u_whole_8, u_whole; \
1477 \
1478 vmovn.u16 v_whole_8, v_whole; \
1479 \
1480 vmovn.u16 b_whole_8, b_whole; \
e1f6de8f 1481 pld [fb_ptr]; \
75e28f62
E
1482 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1483 \
1484 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1485 setup_blocks_texture_##swizzling(); \
1486 \
1487 vmovn.u16 r_whole_8, r_whole; \
1488 beq 5f; \
1489 \
1490 4: \
1491 vmovn.u16 g_whole_8, g_whole; \
1492 vshrn.u32 u_whole_low, u_block, #16; \
1493 \
e1f6de8f 1494 vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32; \
75e28f62
E
1495 vshrn.u32 v_whole_low, v_block, #16; \
1496 \
e1f6de8f 1497 vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32; \
75e28f62
E
1498 vshrn.u32 r_whole_low, r_block, #16; \
1499 \
e1f6de8f 1500 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
75e28f62
E
1501 vshrn.u32 g_whole_low, g_block, #16; \
1502 \
1503 vdup.u32 dx4, uv_dx4[0]; \
1504 vshrn.u32 b_whole_low, b_block, #16; \
1505 \
1506 vaddhn.u32 u_whole_high, u_block, dx4; \
1507 vdup.u32 dx4, uv_dx4[1]; \
1508 \
1509 vaddhn.u32 v_whole_high, v_block, dx4; \
1510 vdup.u32 dx4, rg_dx4[0]; \
1511 \
1512 vaddhn.u32 r_whole_high, r_block, dx4; \
1513 vdup.u32 dx4, rg_dx4[1]; \
1514 \
1515 vaddhn.u32 g_whole_high, g_block, dx4; \
1516 vdup.u32 dx4, b_dx4; \
1517 \
1518 vaddhn.u32 b_whole_high, b_block, dx4; \
1519 vdup.u32 dx8, uv_dx8[0]; \
1520 \
1521 vadd.u32 u_block, u_block, dx8; \
1522 vdup.u32 dx8, uv_dx8[1]; \
1523 \
1524 vadd.u32 v_block, v_block, dx8; \
1525 vdup.u32 dx8, rg_dx8[0]; \
1526 \
1527 vadd.u32 r_block, r_block, dx8; \
1528 vdup.u32 dx8, rg_dx8[1]; \
1529 \
1530 vadd.u32 g_block, g_block, dx8; \
1531 vdup.u32 dx8, b_dx8; \
1532 \
1533 vadd.u32 b_block, b_block, dx8; \
1534 vmovn.u16 u_whole_8, u_whole; \
1535 \
1536 add fb_ptr, fb_ptr, #16; \
1537 vmovn.u16 v_whole_8, v_whole; \
1538 \
e1f6de8f 1539 vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
75e28f62
E
1540 vmovn.u16 b_whole_8, b_whole; \
1541 \
e1f6de8f 1542 pld [fb_ptr]; \
75e28f62
E
1543 \
1544 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1545 subs span_num_blocks, span_num_blocks, #1; \
1546 \
1547 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1548 setup_blocks_texture_##swizzling(); \
1549 \
1550 vmovn.u16 r_whole_8, r_whole; \
1551 bne 4b; \
1552 \
1553 5: \
1554 vmovn.u16 g_whole_8, g_whole; \
e1f6de8f 1555 ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
75e28f62 1556 \
e1f6de8f 1557 vld1.u32 { test_mask }, [psx_gpu, :128]; \
75e28f62
E
1558 vdup.u8 draw_mask, right_mask; \
1559 \
1560 vmov.u32 fb_mask_ptrs[0], right_mask; \
1561 vtst.u16 draw_mask, draw_mask, test_mask; \
1562 vzip.u8 u_whole_8, v_whole_8; \
1563 \
1564 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
e1f6de8f 1565 vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32; \
1566 vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32; \
1567 vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
1568 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
75e28f62
E
1569 \
1570 1: \
1571 add span_uvrg_offset, span_uvrg_offset, #16; \
1572 add span_b_offset, span_b_offset, #4; \
1573 \
1574 add span_edge_data, span_edge_data, #8; \
1575 subs num_spans, num_spans, #1; \
1576 \
e1f6de8f 1577 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
1578 bne 0b; \
1579 \
1580 ldmia sp!, { r4 - r11, pc }; \
1581 \
1582 2: \
1583 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1584 vpush { texture_mask }; \
1585 vpush { uvrg_dx4 }; \
1586 \
4d646738 1587 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62 1588 bl flush_render_block_buffer; \
4d646738 1589 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62
E
1590 \
1591 vpop { uvrg_dx4 }; \
1592 vpop { texture_mask }; \
1593 \
1594 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1595 vmov.u8 fb_mask_ptrs, #0; \
1596 \
1597 mov num_blocks, span_num_blocks; \
1598 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1599 bal 3b \
1600
1601
// Instantiate the swizzled and unswizzled variants.
1602setup_blocks_shaded_textured_builder(swizzled)
1603setup_blocks_shaded_textured_builder(unswizzled)
1604
1605
// Generates setup_blocks_unshaded_textured_dithered_<swizzling>_indirect.
//
// In:  r0 = psx_gpu. Returns immediately when num_spans is 0.
// Saves/restores r4-r11 and returns through the saved lr.
//
// Same structure as the shaded textured builder, but only u and v are
// interpolated per span; no r/g/b values are computed. Each 64-byte,
// 8-pixel block appended to the psx_gpu blocks buffer receives:
//   +0   interleaved 8-bit u/v (masked with texture_mask, optionally
//        swizzled by setup_blocks_texture_<swizzling>)
//   +16  not written by this routine (block_ptr_b is stepped over it)
//   +32  b_whole_8, fb_mask_ptrs (lane 1 = framebuffer address of the
//        block, lane 0 = right_mask on the last block of a span, else 0)
//   +48  dither_offsets
// NOTE(review): b_whole_8 is stored but never computed in this routine,
// so that half of the +32 slot looks like a don't-care - confirm
// against the consumer of the block.
//
// Label 2: handles num_blocks exceeding MAX_BLOCKS. It calls
// flush_render_block_buffer with the caller-saved state preserved,
// rebuilds uvrg_dx8 from uvrg_dx4, and restarts writing at the first
// block with num_blocks = span_num_blocks.
//
// Local labels: 0 = per-span loop, 3 = span setup, 4 = block loop,
// 5 = last block of the span, 1 = advance to the next span.
1606#define setup_blocks_unshaded_textured_builder(swizzling) \
1607.align 3; \
1608 \
1609function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect) \
e1f6de8f 1610 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
1611 add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset; \
1612 \
e1f6de8f 1613 vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128]; \
75e28f62
E
1614 add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset; \
1615 \
1616 cmp num_spans, #0; \
1617 bxeq lr; \
1618 \
1619 stmdb sp!, { r4 - r11, r14 }; \
1620 vshl.u32 uvrg_dx4, uvrg_dx, #2; \
1621 \
1622 vshl.u32 uvrg_dx8, uvrg_dx, #3; \
1623 \
e1f6de8f 1624 vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16]; \
75e28f62
E
1625 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
1626 \
e1f6de8f 1627 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
1628 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
1629 \
1630 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1631 \
1632 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
1633 \
1634 0: \
1635 vmov.u8 fb_mask_ptrs, #0; \
1636 \
e1f6de8f 1637 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
75e28f62
E
1638 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
1639 \
e1f6de8f 1640 ldrh y, [span_edge_data, #edge_data_y_offset]; \
1641 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
75e28f62
E
1642 \
1643 cmp span_num_blocks, #0; \
1644 beq 1f; \
1645 \
e1f6de8f 1646 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
75e28f62
E
1647 add num_blocks, span_num_blocks, num_blocks; \
1648 \
1649 cmp num_blocks, #MAX_BLOCKS; \
1650 bgt 2f; \
1651 \
1652 3: \
1653 add fb_ptr, fb_ptr, y, lsl #11; \
1654 \
1655 vdup.u32 v_left_x, left_x; \
1656 and y, y, #0x3; \
1657 \
e1f6de8f 1658 ldr dither_row, [dither_offset_ptr, y, lsl #2]; \
75e28f62
E
1659 add fb_ptr, fb_ptr, left_x, lsl #1; \
1660 \
1661 and dither_shift, left_x, #0x03; \
1662 \
e1f6de8f 1663 vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
75e28f62
E
1664 vshr.u32 uvrg_dx, uvrg_dx4, #2; \
1665 \
1666 mov dither_shift, dither_shift, lsl #3; \
1667 vmla.u32 uvrg, uvrg_dx, v_left_x; \
1668 \
1669 mov c_32, #32; \
1670 subs span_num_blocks, span_num_blocks, #1; \
1671 \
1672 mov dither_row, dither_row, ror dither_shift; \
1673 \
1674 vdup.u32 dither_offsets_short, dither_row; \
1675 add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset; \
1676 \
1677 vshll.s8 dither_offsets, dither_offsets_short, #4; \
1678 \
1679 vdup.u32 u_block, uv[0]; \
1680 \
1681 vdup.u32 v_block, uv[1]; \
e1f6de8f 1682 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1683 \
1684 vadd.u32 u_block, u_block, block_span; \
e1f6de8f 1685 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
1686 \
1687 vadd.u32 v_block, v_block, block_span; \
1688 add block_ptr_b, block_ptr_a, #16; \
1689 \
1690 vshrn.u32 u_whole_low, u_block, #16; \
1691 vshrn.u32 v_whole_low, v_block, #16; \
1692 \
1693 vdup.u32 dx4, uv_dx4[0]; \
1694 \
1695 vaddhn.u32 u_whole_high, u_block, dx4; \
1696 vdup.u32 dx4, uv_dx4[1]; \
1697 \
1698 vaddhn.u32 v_whole_high, v_block, dx4; \
1699 vdup.u32 dx8, uv_dx8[0]; \
1700 \
1701 vadd.u32 u_block, u_block, dx8; \
1702 vdup.u32 dx8, uv_dx8[1]; \
1703 \
1704 vadd.u32 v_block, v_block, dx8; \
1705 vmovn.u16 u_whole_8, u_whole; \
1706 \
1707 vmovn.u16 v_whole_8, v_whole; \
1708 \
e1f6de8f 1709 pld [fb_ptr]; \
75e28f62
E
1710 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1711 \
1712 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1713 setup_blocks_texture_##swizzling(); \
1714 \
1715 beq 5f; \
1716 \
1717 4: \
1718 vshrn.u32 u_whole_low, u_block, #16; \
1719 \
e1f6de8f 1720 vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32; \
75e28f62
E
1721 vshrn.u32 v_whole_low, v_block, #16; \
1722 \
1723 add block_ptr_b, block_ptr_b, #32; \
e1f6de8f 1724 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
75e28f62
E
1725 \
1726 vdup.u32 dx4, uv_dx4[0]; \
1727 vaddhn.u32 u_whole_high, u_block, dx4; \
1728 vdup.u32 dx4, uv_dx4[1]; \
1729 \
1730 vaddhn.u32 v_whole_high, v_block, dx4; \
1731 vdup.u32 dx8, uv_dx8[0]; \
1732 \
1733 vadd.u32 u_block, u_block, dx8; \
1734 vdup.u32 dx8, uv_dx8[1]; \
1735 \
1736 vadd.u32 v_block, v_block, dx8; \
1737 vmovn.u16 u_whole_8, u_whole; \
1738 \
1739 add fb_ptr, fb_ptr, #16; \
1740 vmovn.u16 v_whole_8, v_whole; \
1741 \
e1f6de8f 1742 vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
1743 pld [fb_ptr]; \
75e28f62
E
1744 \
1745 vmov.u32 fb_mask_ptrs[1], fb_ptr; \
1746 subs span_num_blocks, span_num_blocks, #1; \
1747 \
1748 vand.u8 uv_whole_8, uv_whole_8, texture_mask; \
1749 setup_blocks_texture_##swizzling(); \
1750 \
1751 bne 4b; \
1752 \
1753 5: \
e1f6de8f 1754 ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
75e28f62 1755 \
e1f6de8f 1756 vld1.u32 { test_mask }, [psx_gpu, :128]; \
75e28f62
E
1757 vdup.u8 draw_mask, right_mask; \
1758 \
1759 vmov.u32 fb_mask_ptrs[0], right_mask; \
1760 vtst.u16 draw_mask, draw_mask, test_mask; \
1761 vzip.u8 u_whole_8, v_whole_8; \
1762 \
1763 vbic.u16 uv_whole_8, uv_whole_8, draw_mask; \
1764 add block_ptr_b, block_ptr_b, #32; \
e1f6de8f 1765 vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32; \
1766 vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
1767 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
75e28f62
E
1768 \
1769 1: \
1770 add span_uvrg_offset, span_uvrg_offset, #16; \
1771 add span_edge_data, span_edge_data, #8; \
1772 subs num_spans, num_spans, #1; \
1773 \
e1f6de8f 1774 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
1775 bne 0b; \
1776 \
1777 ldmia sp!, { r4 - r11, pc }; \
1778 \
1779 2: \
1780 /* TODO: Load from psx_gpu instead of saving/restoring these */\
1781 vpush { texture_mask }; \
1782 vpush { uvrg_dx4 }; \
1783 \
4d646738 1784 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62 1785 bl flush_render_block_buffer; \
4d646738 1786 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62
E
1787 \
1788 vpop { uvrg_dx4 }; \
1789 vpop { texture_mask }; \
1790 \
1791 vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
1792 vmov.u8 fb_mask_ptrs, #0; \
1793 \
1794 mov num_blocks, span_num_blocks; \
1795 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
1796 bal 3b \
1797
1798
// Instantiate the swizzled and unswizzled variants.
1799setup_blocks_unshaded_textured_builder(swizzled)
1800setup_blocks_unshaded_textured_builder(unswizzled)
1801
1802
// setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
//
// In:  r0 = psx_gpu. Returns immediately when num_spans is 0.
// Saves/restores r4-r11 and returns through the saved lr.
//
// Flat-colour fill via the block buffer. For every span it appends
// span_num_blocks 64-byte blocks (8 pixels each) to the psx_gpu blocks
// buffer:
//   +0   draw mask (all zero, except on the last block of a span, where
//        it is right_mask expanded against test_mask)
//   +16  the 15-bit colour replicated into eight 16-bit lanes
//   +32  b_whole_8, fb_mask_ptrs (lane 1 = framebuffer address)
//   +48  not written by this routine
// NOTE(review): neither b_whole_8 nor lane 0 of fb_mask_ptrs is written
// in this routine, so those stored values look like don't-cares here -
// confirm against the consumer of the block.
1803.align 3
1804
1805function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
e1f6de8f 1806 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
75e28f62
E
// draw_mask = 0: the mask stored for every block except the last one.
1807 veor.u32 draw_mask, draw_mask, draw_mask
1808
1809 cmp num_spans, #0
1810 bxeq lr
1811
1812 stmdb sp!, { r4 - r11, r14 }
// test_mask = the 16 bytes at the very start of psx_gpu.
e1f6de8f 1813 vld1.u32 { test_mask }, [psx_gpu, :128]
75e28f62 1814
e1f6de8f 1815 ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
75e28f62
E
1816
// Take the top 5 bits of each 8-bit channel (bits 3-7, 11-15, 19-23)...
1817 ubfx color_r, color, #3, #5
1818 ubfx color_g, color, #11, #5
1819 ubfx color_b, color, #19, #5
1820
// ...and repack them as color = r | (g << 5) | (b << 10).
1821 orr color, color_r, color_b, lsl #10
1822 orr color, color, color_g, lsl #5
1823
1824 vdup.u16 colors, color
1825
e1f6de8f 1826 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
1827 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1828
// block_ptr_a = first free block (64 bytes per block).
1829 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1830 add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1831
1832 0:
// Per-span loop: fetch the block count and row of this span.
e1f6de8f 1833 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1834 ldrh y, [span_edge_data, #edge_data_y_offset]
75e28f62 1835
e1f6de8f 1836 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
75e28f62
E
1837
// Empty span: nothing to emit.
1838 cmp span_num_blocks, #0
1839 beq 1f
1840
e1f6de8f 1841 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
75e28f62
E
1842 add num_blocks, span_num_blocks, num_blocks
1843
// Flush the block buffer first if this span would exceed MAX_BLOCKS.
1844 cmp num_blocks, #MAX_BLOCKS
1845 bgt 2f
1846
1847 3:
// fb_ptr = vram_out_ptr + y * 2048 + left_x * 2.
// NOTE(review): the masked y is not read again; r7 is overwritten by
// "mov c_32, #32" just below (y and c_32 are both r7).
1848 add fb_ptr, fb_ptr, y, lsl #11
1849 and y, y, #0x3
1850
1851 add fb_ptr, fb_ptr, left_x, lsl #1
1852 mov c_32, #32
1853
// The last block is handled separately at 5:, so count one less here.
1854 subs span_num_blocks, span_num_blocks, #1
1855
1856 add block_ptr_b, block_ptr_a, #16
e1f6de8f 1857 pld [fb_ptr]
75e28f62
E
1858
1859 vmov.u32 fb_mask_ptrs[1], fb_ptr
1860 beq 5f
1861
1862 4:
// Full block: zero mask at +0, colours at +16, pointer slot at +32.
// block_ptr_a ends up at the next block (two post-increments of 32).
e1f6de8f 1863 vst1.u32 { draw_mask }, [block_ptr_a, :128], c_32
1864 vst1.u32 { colors }, [block_ptr_b, :128], c_32
1865 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
75e28f62
E
1866
// Advance 8 pixels (16 bytes) in the framebuffer; step block_ptr_b over
// the unused +48 slot.
1867 add fb_ptr, fb_ptr, #16
1868 add block_ptr_b, block_ptr_b, #32
1869
e1f6de8f 1870 pld [fb_ptr]
75e28f62
E
1871
1872 vmov.u32 fb_mask_ptrs[1], fb_ptr
1873 subs span_num_blocks, span_num_blocks, #1
1874
1875 bne 4b
1876
1877 5:
// Last block of the span: expand right_mask into a per-pixel 16-bit
// mask by replicating it to all bytes and testing against test_mask.
e1f6de8f 1878 ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]
75e28f62
E
1879
1880 vdup.u8 draw_mask_edge, right_mask
1881 vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1882
e1f6de8f 1883 vst1.u32 { colors }, [block_ptr_b, :128], c_32
1884 vst1.u32 { draw_mask_edge }, [block_ptr_a, :128], c_32
75e28f62 1885 add block_ptr_b, block_ptr_b, #32
e1f6de8f 1886 vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
75e28f62
E
1887
1888 1:
// Next span (8 bytes of edge data each); write back the running block
// count on every iteration.
1889 add span_edge_data, span_edge_data, #8
1890 subs num_spans, num_spans, #1
1891
e1f6de8f 1892 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62
E
1893 bne 0b
1894
1895 ldmia sp!, { r4 - r11, pc }
1896
1897 2:
// Block buffer full: preserve colors and the caller-saved core
// registers around the call to flush_render_block_buffer.
1898 vpush { colors }
1899
4d646738 1900 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62 1901 bl flush_render_block_buffer
4d646738 1902 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
75e28f62
E
1903
1904 vpop { colors }
1905
// Rebuild the constants that were not saved across the call.
e1f6de8f 1906 vld1.u32 { test_mask }, [psx_gpu, :128]
75e28f62
E
1907 veor.u32 draw_mask, draw_mask, draw_mask
1908
// Restart at the first block; only this span is pending now.
1909 mov num_blocks, span_num_blocks
1910 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1911 bal 3b
1912
1913
// Extra aliases for the direct fill routine below.
// mask_msb_scalar shares r14 with num_blocks.
1914#define mask_msb_scalar r14
1915
1916#define msb_mask q15
1917
1918#define pixels_low d16
1919
// d30 and d31 are the two halves of q15 (msb_mask).
1920#define msb_mask_low d30
1921#define msb_mask_high d31
1922
1923
// setup_blocks_unshaded_untextured_undithered_unswizzled_direct
//
// In:  r0 = psx_gpu. Returns immediately when num_spans is 0.
// Saves/restores r4-r11 and returns through the saved lr.
//
// Flat-colour fill written straight to the framebuffer, bypassing the
// block buffer. Each span writes span_num_blocks - 1 full groups of 8
// pixels, then a final group trimmed according to the low byte of
// right_mask.
1924.align 3
1925
1926function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
e1f6de8f 1927 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
75e28f62
E
1928
1929 cmp num_spans, #0
1930 bxeq lr
1931
1932 stmdb sp!, { r4 - r11, r14 }
1933
e1f6de8f 1934 ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
75e28f62
E
1935
// Take the top 5 bits of each 8-bit channel (bits 3-7, 11-15, 19-23).
1936 ubfx color_r, color, #3, #5
1937 ubfx color_g, color, #11, #5
1938
e1f6de8f 1939 ldrh mask_msb_scalar, [psx_gpu, #psx_gpu_mask_msb_offset]
75e28f62
E
1940 ubfx color_b, color, #19, #5
1941
// color = r | (g << 5) | (b << 10) | mask_msb.
1942 orr color, color_r, color_b, lsl #10
1943 orr color, color, color_g, lsl #5
1944 orr color, color, mask_msb_scalar
1945
// colors = the 16-bit pixel in all eight lanes (used for 8-pixel stores).
1946 vdup.u16 colors, color
1947
1948 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
// color = the pixel duplicated into both halfwords (2-pixel word stores).
ed0fd81d 1949 orr color, color, color, lsl #16
3867c6ef 1950
75e28f62
E
1951
1952 0:
// Per-span loop: fetch the block count and row of this span.
e1f6de8f 1953 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1954 ldrh y, [span_edge_data, #edge_data_y_offset]
75e28f62 1955
e1f6de8f 1956 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
75e28f62
E
1957
// Empty span: nothing to draw.
1958 cmp span_num_blocks, #0
1959 beq 1f
1960
e1f6de8f 1961 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
75e28f62
E
1962
// fb_ptr = vram_out_ptr + y * 2048 + left_x * 2. The last block is
// handled at 3:, so the loop count is one less.
1963 add fb_ptr, fb_ptr, y, lsl #11
1964 subs span_num_blocks, span_num_blocks, #1
1965
1966 add fb_ptr, fb_ptr, left_x, lsl #1
1967 beq 3f
1968
1969 2:
// Full group: store 8 pixels (16 bytes) with post-increment.
e1f6de8f 1970 vst1.u32 { colors }, [fb_ptr]!
75e28f62
E
1971 subs span_num_blocks, span_num_blocks, #1
1972
1973 bne 2b
1974
1975 3:
// Last group: only the low byte of right_mask is read here. A mask of
// zero means all 8 pixels are written (label 5:).
e1f6de8f 1976 ldrb right_mask, [span_edge_data, #edge_data_right_mask_offset]
75e28f62 1977
3867c6ef
E
1978 cmp right_mask, #0x0
1979 beq 5f
1980
// Low 4 mask bits clear: write 4 pixels (two words) and drop those
// bits. The second streq still uses the flags from tst, since moveq
// does not update them.
1981 tst right_mask, #0xF
e1f6de8f 1982 streq color, [fb_ptr], #4
3867c6ef 1983 moveq right_mask, right_mask, lsr #4
e1f6de8f 1984 streq color, [fb_ptr], #4
3867c6ef
E
1985
// Next 2 mask bits clear: write 2 pixels (one word) and drop them.
1986 tst right_mask, #0x3
e1f6de8f 1987 streq color, [fb_ptr], #4
3867c6ef
E
1988 moveq right_mask, right_mask, lsr #2
1989
// Next mask bit clear: write one final pixel (halfword).
1990 tst right_mask, #0x1
e1f6de8f 1991 strheq color, [fb_ptr]
75e28f62
E
1992
1993 1:
// Next span (8 bytes of edge data each).
1994 add span_edge_data, span_edge_data, #8
1995 subs num_spans, num_spans, #1
75e28f62
E
1996 bne 0b
1997
1998 ldmia sp!, { r4 - r11, pc }
1999
3867c6ef 2000 5:
// Unmasked last group: write all 8 pixels, then continue with the next
// span.
e1f6de8f 2001 vst1.u32 { colors }, [fb_ptr]
3867c6ef 2002 bal 1b
75e28f62
E
2003
2004
// Register re-aliasing for the shaded untextured routines that follow.
// The textured-path names are dropped and re-bound to a different
// register layout.
2005#undef c_64
2006
2007#define c_64 r7
2008#define rg_dx_ptr r2
2009
2010
2011#undef r_block
2012#undef g_block
2013#undef b_block
2014#undef r_whole
2015#undef g_whole
2016#undef b_whole
2017#undef r_whole_low
2018#undef r_whole_high
2019#undef g_whole_low
2020#undef g_whole_high
2021#undef b_whole_low
2022#undef b_whole_high
2023#undef r_whole_8
2024#undef g_whole_8
2025#undef b_whole_8
2026#undef dither_offsets
2027#undef rg_dx4
2028#undef rg_dx8
2029#undef dx4
2030#undef dx8
2031#undef v_left_x
2032#undef uvrg
2033#undef block_span
2034#undef rg
2035#undef draw_mask
2036#undef test_mask
2037
// 32-bit per-pixel colour accumulators.
2038#define r_block q0
2039#define g_block q1
2040#define b_block q2
2041
// 16-bit narrowed colour values (q view and its two d halves).
2042#define r_whole q3
2043#define g_whole q4
2044#define b_whole q5
2045
2046#define r_whole_low d6
2047#define r_whole_high d7
2048#define g_whole_low d8
2049#define g_whole_high d9
2050#define b_whole_low d10
2051#define b_whole_high d11
2052
// 8-bit g and b share q6 so they can be processed with one q-sized
// instruction; r is kept separately in d14.
2053#define gb_whole_8 q6
2054
2055#define g_whole_8 d12
2056#define b_whole_8 d13
2057
2058#define r_whole_8 d14
2059
2060#define pixels q8
2061
2062#define rg_dx4 d18
2063#define rg_dx8 d19
2064
2065#define dx4 q10
2066#define dx8 q10
2067
// Per-span temporaries; these overlap the *_whole registers above
// (d6, q4, q5, d9).
2068#define v_left_x d6
2069#define uvrg q4
2070#define block_span q5
2071
2072#define rg d9
2073
// Constant registers; the names suggest byte constants 1, 128, 4 and
// 0x7. d64_4 (d24) is the low half of d128_4 (q12).
2074#define d64_1 d22
2075#define d64_128 d23
2076
2077#define d128_4 q12
2078#define d128_0x7 q13
2079
2080#define d64_4 d24
2081
2082#define dither_offsets q14
2083#define draw_mask q15
2084
2085#define dither_offsets_low d28
2086
// rg_dx is the low half of r_block (q0); test_mask shares q10 with
// dx4/dx8.
2087#define rg_dx d0
2088#define test_mask q10
2089
2090
// Dither helpers selected by name through the ##dithering## argument of
// the shaded untextured builder.
// Dithered step a: unsigned saturating add of the dither offsets to the
// 8-bit r values and to the combined g/b values.
2091#define setup_blocks_shaded_untextured_dither_a_dithered() \
2092 vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low; \
2093 vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets; \
2094
// Dithered step b: unsigned saturating subtract of d64_4 / d128_4 from
// the same values.
// NOTE(review): the builder further down loads 4 into d128_4 and adds it
// to dither_offsets beforehand, so a followed by b presumably applies
// the original dither value with saturation at both ends - confirm.
2095#define setup_blocks_shaded_untextured_dither_b_dithered() \
2096 vqsub.u8 r_whole_8, r_whole_8, d64_4; \
2097 vqsub.u8 gb_whole_8, gb_whole_8, d128_4 \
2098
// Undithered variants: both steps expand to nothing.
2099#define setup_blocks_shaded_untextured_dither_a_undithered() \
2100
2101#define setup_blocks_shaded_untextured_dither_b_undithered() \
2102
2103
2104#define setup_blocks_shaded_untextured_indirect_builder(dithering) \
2105.align 3; \
2106 \
2107function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect) \
e1f6de8f 2108 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
2109 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2110 \
e1f6de8f 2111 vld1.u32 { rg_dx }, [rg_dx_ptr, :64]; \
75e28f62
E
2112 \
2113 cmp num_spans, #0; \
2114 bxeq lr; \
2115 \
2116 stmdb sp!, { r4 - r11, r14 }; \
2117 vshl.u32 rg_dx4, rg_dx, #2; \
2118 \
e1f6de8f 2119 ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
75e28f62
E
2120 vshl.u32 rg_dx8, rg_dx, #3; \
2121 \
2122 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2123 \
e1f6de8f 2124 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
2125 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2126 \
2127 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2128 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2129 \
2130 add block_ptr_a, block_ptr_a, num_blocks, lsl #6; \
2131 vmov.u8 d64_1, #1; \
2132 \
2133 vmov.u8 d128_4, #4; \
2134 vmov.u8 d64_128, #128; \
2135 \
2136 vmov.u8 d128_0x7, #0x7; \
2137 \
2138 0: \
e1f6de8f 2139 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
75e28f62
E
2140 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2141 \
e1f6de8f 2142 ldrh y, [span_edge_data, #edge_data_y_offset]; \
2143 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
75e28f62
E
2144 \
2145 cmp span_num_blocks, #0; \
2146 beq 1f; \
2147 \
e1f6de8f 2148 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
75e28f62
E
2149 add num_blocks, span_num_blocks, num_blocks; \
2150 \
2151 cmp num_blocks, #MAX_BLOCKS; \
2152 bgt 2f; \
2153 \
2154 3: \
e1f6de8f 2155 ldr b, [span_b_offset]; \
75e28f62
E
2156 add fb_ptr, fb_ptr, y, lsl #11; \
2157 \
2158 vdup.u32 v_left_x, left_x; \
2159 and y, y, #0x3; \
2160 \
e1f6de8f 2161 ldr dither_row, [dither_offset_ptr, y, lsl #2]; \
75e28f62
E
2162 add fb_ptr, fb_ptr, left_x, lsl #1; \
2163 \
2164 mla b, b_dx, left_x, b; \
2165 and dither_shift, left_x, #0x03; \
2166 \
e1f6de8f 2167 vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
75e28f62
E
2168 vshr.u32 rg_dx, rg_dx4, #2; \
2169 \
2170 mov dither_shift, dither_shift, lsl #3; \
2171 vmla.u32 rg, rg_dx, v_left_x; \
2172 \
2173 mov c_64, #64; \
2174 subs span_num_blocks, span_num_blocks, #1; \
2175 \
2176 mov dither_row, dither_row, ror dither_shift; \
2177 mov b_dx4, b_dx, lsl #2; \
2178 \
2179 vdup.u32 dither_offsets, dither_row; \
2180 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2181 \
2182 vdup.u32 b_block, b; \
2183 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2184 \
2185 mov b_dx8, b_dx, lsl #3; \
2186 vdup.u32 r_block, rg[0]; \
2187 vdup.u32 g_block, rg[1]; \
2188 \
e1f6de8f 2189 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
2190 \
2191 vadd.u32 r_block, r_block, block_span; \
e1f6de8f 2192 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
2193 \
2194 vadd.u32 g_block, g_block, block_span; \
e1f6de8f 2195 vld1.u32 { block_span }, [block_span_ptr, :128]; \
75e28f62
E
2196 \
2197 vadd.u32 b_block, b_block, block_span; \
2198 add block_ptr_b, block_ptr_a, #16; \
2199 \
2200 vshrn.u32 r_whole_low, r_block, #16; \
2201 vshrn.u32 g_whole_low, g_block, #16; \
2202 vshrn.u32 b_whole_low, b_block, #16; \
2203 vdup.u32 dx4, rg_dx4[0]; \
2204 \
2205 vaddhn.u32 r_whole_high, r_block, dx4; \
2206 vdup.u32 dx4, rg_dx4[1]; \
2207 \
2208 vaddhn.u32 g_whole_high, g_block, dx4; \
2209 vdup.u32 dx4, b_dx4; \
2210 \
2211 vaddhn.u32 b_whole_high, b_block, dx4; \
2212 vdup.u32 dx8, rg_dx8[0]; \
2213 \
2214 vadd.u32 r_block, r_block, dx8; \
2215 vdup.u32 dx8, rg_dx8[1]; \
2216 \
2217 vadd.u32 g_block, g_block, dx8; \
2218 vdup.u32 dx8, b_dx8; \
2219 \
2220 vadd.u32 b_block, b_block, dx8; \
2221 \
2222 vmovn.u16 r_whole_8, r_whole; \
2223 vmovn.u16 g_whole_8, g_whole; \
2224 vmovn.u16 b_whole_8, b_whole; \
2225 \
2226 beq 5f; \
2227 veor.u32 draw_mask, draw_mask, draw_mask; \
2228 \
2229 4: \
2230 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2231 vshrn.u32 r_whole_low, r_block, #16; \
2232 \
2233 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2234 vshrn.u32 g_whole_low, g_block, #16; \
2235 \
2236 vshrn.u32 b_whole_low, b_block, #16; \
e1f6de8f 2237 str fb_ptr, [block_ptr_a, #44]; \
75e28f62
E
2238 \
2239 vdup.u32 dx4, rg_dx4[0]; \
2240 vshr.u8 r_whole_8, r_whole_8, #3; \
2241 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2242 \
2243 vaddhn.u32 r_whole_high, r_block, dx4; \
2244 vdup.u32 dx4, rg_dx4[1]; \
2245 \
2246 vaddhn.u32 g_whole_high, g_block, dx4; \
2247 vdup.u32 dx4, b_dx4; \
2248 \
2249 vaddhn.u32 b_whole_high, b_block, dx4; \
2250 vdup.u32 dx8, rg_dx8[0]; \
2251 \
2252 vmull.u8 pixels, r_whole_8, d64_1; \
2253 vmlal.u8 pixels, g_whole_8, d64_4; \
2254 vmlal.u8 pixels, b_whole_8, d64_128; \
2255 \
2256 vadd.u32 r_block, r_block, dx8; \
2257 vdup.u32 dx8, rg_dx8[1]; \
2258 \
2259 vadd.u32 g_block, g_block, dx8; \
2260 vdup.u32 dx8, b_dx8; \
2261 \
2262 vadd.u32 b_block, b_block, dx8; \
2263 add fb_ptr, fb_ptr, #16; \
2264 \
2265 vmovn.u16 r_whole_8, r_whole; \
2266 vmovn.u16 g_whole_8, g_whole; \
2267 vmovn.u16 b_whole_8, b_whole; \
2268 \
e1f6de8f 2269 vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64; \
2270 vst1.u32 { pixels }, [block_ptr_b, :128], c_64; \
75e28f62 2271 \
e1f6de8f 2272 pld [fb_ptr]; \
75e28f62
E
2273 \
2274 subs span_num_blocks, span_num_blocks, #1; \
2275 bne 4b; \
2276 \
2277 5: \
e1f6de8f 2278 str fb_ptr, [block_ptr_a, #44]; \
75e28f62
E
2279 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2280 \
e1f6de8f 2281 ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
75e28f62
E
2282 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2283 \
2284 vshr.u8 r_whole_8, r_whole_8, #3; \
2285 vdup.u8 draw_mask, right_mask; \
2286 \
2287 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
e1f6de8f 2288 vld1.u32 { test_mask }, [psx_gpu, :128]; \
75e28f62
E
2289 \
2290 vtst.u16 draw_mask, draw_mask, test_mask; \
2291 \
2292 vmull.u8 pixels, r_whole_8, d64_1; \
2293 vmlal.u8 pixels, g_whole_8, d64_4; \
2294 vmlal.u8 pixels, b_whole_8, d64_128; \
2295 \
e1f6de8f 2296 vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64; \
2297 vst1.u32 { pixels }, [block_ptr_b, :128], c_64; \
75e28f62
E
2298 \
2299 1: \
2300 add span_uvrg_offset, span_uvrg_offset, #16; \
2301 add span_b_offset, span_b_offset, #4; \
2302 \
2303 add span_edge_data, span_edge_data, #8; \
2304 subs num_spans, num_spans, #1; \
2305 \
e1f6de8f 2306 strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62
E
2307 bne 0b; \
2308 \
2309 ldmia sp!, { r4 - r11, pc }; \
2310 \
2311 2: \
2312 /* TODO: Load from psx_gpu instead of saving/restoring these */\
2313 vpush { rg_dx4 }; \
2314 \
4d646738 2315 stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62 2316 bl flush_render_block_buffer; \
4d646738 2317 ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
75e28f62
E
2318 \
2319 vpop { rg_dx4 }; \
2320 \
2321 vmov.u8 d64_1, #1; \
2322 vmov.u8 d128_4, #4; \
2323 vmov.u8 d64_128, #128; \
2324 vmov.u8 d128_0x7, #0x7; \
2325 \
2326 vadd.u32 rg_dx8, rg_dx4, rg_dx4; \
2327 \
2328 mov num_blocks, span_num_blocks; \
2329 add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset; \
2330 bal 3b \
2331
2332
2333setup_blocks_shaded_untextured_indirect_builder(undithered)
2334setup_blocks_shaded_untextured_indirect_builder(dithered)
2335
2336
2337#undef draw_mask
2338
2339#define mask_msb_ptr r14
2340
2341#define draw_mask q0
2342#define pixels_low d16
3867c6ef 2343#define pixels_high d17
75e28f62
E
2344
2345
2346
/*
 * Generates setup_blocks_shaded_untextured_<dithering>_unswizzled_direct.
 * For each span it steps r/g/b across the span and stores the packed 16-bit
 * pixels straight to the framebuffer, up to eight pixels per store group.
 * Returns immediately when the span count read from psx_gpu is zero.
 * The dither_a/dither_b sub-macros and the JT_* jump-table helpers are defined
 * outside this macro.
 */
2347#define setup_blocks_shaded_untextured_direct_builder(dithering) \
2348.align 3; \
2349 \
2350function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \
e1f6de8f 2351 ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]; \
75e28f62
E
2352 add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8); \
2353 \
e1f6de8f 2354 vld1.u32 { rg_dx }, [rg_dx_ptr, :64]; \
75e28f62
E
2355 \
2356 cmp num_spans, #0; \
2357 bxeq lr; \
2358 \
2359 stmdb sp!, { r4 - r11, r14 }; \
2360 vshl.u32 rg_dx4, rg_dx, #2; \
2361 \
e1f6de8f 2362 ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset]; \
75e28f62
E
2363 vshl.u32 rg_dx8, rg_dx, #3; \
2364 \
2365 add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
2366 add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset; \
2367 \
2368 add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset; \
2369 vmov.u8 d64_1, #1; \
2370 \
2371 vmov.u8 d128_4, #4; \
2372 vmov.u8 d64_128, #128; \
2373 \
2374 vmov.u8 d128_0x7, #0x7; \
2375 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
e1f6de8f 2376 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
75e28f62
E
2377 \
 /* 0: per-span setup; a span with zero blocks skips to 1: */ \
2378 0: \
e1f6de8f 2379 ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]; \
75e28f62
E
2380 add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset; \
2381 \
e1f6de8f 2382 ldrh y, [span_edge_data, #edge_data_y_offset]; \
2383 ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]; \
75e28f62
E
2384 \
2385 cmp span_num_blocks, #0; \
2386 beq 1f; \
2387 \
e1f6de8f 2388 ldrh left_x, [span_edge_data, #edge_data_left_x_offset]; \
75e28f62
E
 /* fb_ptr += y * 2048 here, then += left_x * 2 below */ \
2389 add fb_ptr, fb_ptr, y, lsl #11; \
2390 \
e1f6de8f 2391 ldr b, [span_b_offset]; \
75e28f62
E
2392 vdup.u32 v_left_x, left_x; \
2393 and y, y, #0x3; \
2394 \
e1f6de8f 2395 ldr dither_row, [dither_offset_ptr, y, lsl #2]; \
75e28f62
E
2396 add fb_ptr, fb_ptr, left_x, lsl #1; \
2397 \
2398 mla b, b_dx, left_x, b; \
2399 and dither_shift, left_x, #0x03; \
2400 \
e1f6de8f 2401 vld1.u32 { uvrg }, [span_uvrg_offset, :128]; \
75e28f62
E
 /* rg_dx is re-derived from rg_dx4 on every span */ \
2402 vshr.u32 rg_dx, rg_dx4, #2; \
2403 \
2404 mov dither_shift, dither_shift, lsl #3; \
2405 vmla.u32 rg, rg_dx, v_left_x; \
2406 \
 /* flags from this subs are consumed by the "beq 3f" further down */ \
2407 subs span_num_blocks, span_num_blocks, #1; \
2408 \
 /* rotate the dither row by (left_x & 3) bytes */ \
2409 mov dither_row, dither_row, ror dither_shift; \
2410 mov b_dx4, b_dx, lsl #2; \
2411 \
2412 vdup.u32 dither_offsets, dither_row; \
2413 add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset; \
2414 \
2415 vdup.u32 b_block, b; \
2416 vadd.u8 dither_offsets, dither_offsets, d128_4; \
2417 \
2418 mov b_dx8, b_dx, lsl #3; \
2419 vdup.u32 r_block, rg[0]; \
2420 vdup.u32 g_block, rg[1]; \
2421 \
 /* three consecutive 16-byte block_span vectors are added to r, g and b in turn */ \
e1f6de8f 2422 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
2423 \
2424 vadd.u32 r_block, r_block, block_span; \
e1f6de8f 2425 vld1.u32 { block_span }, [block_span_ptr, :128]!; \
75e28f62
E
2426 \
2427 vadd.u32 g_block, g_block, block_span; \
e1f6de8f 2428 vld1.u32 { block_span }, [block_span_ptr, :128]; \
75e28f62
E
2429 \
2430 vadd.u32 b_block, b_block, block_span; \
 /* NOTE(review): block_ptr_b is not referenced again in the visible body of */ \
 /* this macro - confirm whether this add is still needed */ \
2431 add block_ptr_b, block_ptr_a, #16; \
2432 \
 /* low halves take bits 16-31 of each lane; high halves take the same after adding dx4 */ \
2433 vshrn.u32 r_whole_low, r_block, #16; \
2434 vshrn.u32 g_whole_low, g_block, #16; \
2435 vshrn.u32 b_whole_low, b_block, #16; \
2436 vdup.u32 dx4, rg_dx4[0]; \
2437 \
2438 vaddhn.u32 r_whole_high, r_block, dx4; \
2439 vdup.u32 dx4, rg_dx4[1]; \
2440 \
2441 vaddhn.u32 g_whole_high, g_block, dx4; \
2442 vdup.u32 dx4, b_dx4; \
2443 \
2444 vaddhn.u32 b_whole_high, b_block, dx4; \
2445 vdup.u32 dx8, rg_dx8[0]; \
2446 \
2447 vadd.u32 r_block, r_block, dx8; \
2448 vdup.u32 dx8, rg_dx8[1]; \
2449 \
2450 vadd.u32 g_block, g_block, dx8; \
2451 vdup.u32 dx8, b_dx8; \
2452 \
2453 vadd.u32 b_block, b_block, dx8; \
2454 \
2455 vmovn.u16 r_whole_8, r_whole; \
2456 vmovn.u16 g_whole_8, g_whole; \
2457 vmovn.u16 b_whole_8, b_whole; \
2458 \
 /* span_num_blocks was 1: only the tail group at 3: remains */ \
2459 beq 3f; \
2460 \
 /* 2: full groups; colours for the next group are computed while this one is packed */ \
2461 2: \
2462 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2463 vshrn.u32 r_whole_low, r_block, #16; \
2464 \
2465 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2466 vshrn.u32 g_whole_low, g_block, #16; \
2467 \
2468 vshrn.u32 b_whole_low, b_block, #16; \
2469 \
2470 vdup.u32 dx4, rg_dx4[0]; \
2471 vshr.u8 r_whole_8, r_whole_8, #3; \
2472 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
2473 \
2474 vaddhn.u32 r_whole_high, r_block, dx4; \
2475 vdup.u32 dx4, rg_dx4[1]; \
2476 \
 /* pixels = msb_mask + r * 1 + g * 4 + b * 128 (multipliers come from d64_1 / d64_4 / d64_128) */ \
2477 vmov pixels, msb_mask; \
2478 vaddhn.u32 g_whole_high, g_block, dx4; \
2479 vdup.u32 dx4, b_dx4; \
2480 \
2481 vaddhn.u32 b_whole_high, b_block, dx4; \
2482 vdup.u32 dx8, rg_dx8[0]; \
2483 \
2484 vmlal.u8 pixels, r_whole_8, d64_1; \
2485 vmlal.u8 pixels, g_whole_8, d64_4; \
2486 vmlal.u8 pixels, b_whole_8, d64_128; \
2487 \
2488 vadd.u32 r_block, r_block, dx8; \
2489 vdup.u32 dx8, rg_dx8[1]; \
2490 \
2491 vadd.u32 g_block, g_block, dx8; \
2492 vdup.u32 dx8, b_dx8; \
2493 \
2494 vadd.u32 b_block, b_block, dx8; \
2495 \
2496 vmovn.u16 r_whole_8, r_whole; \
2497 vmovn.u16 g_whole_8, g_whole; \
2498 vmovn.u16 b_whole_8, b_whole; \
2499 \
 /* store the whole group and advance fb_ptr past it */ \
e1f6de8f 2500 vst1.u32 { pixels }, [fb_ptr]!; \
75e28f62
E
2501 subs span_num_blocks, span_num_blocks, #1; \
2502 bne 2b; \
2503 \
 /* 3: last group of the span, stored partially according to right_mask */ \
2504 3: \
2505 setup_blocks_shaded_untextured_dither_a_##dithering(); \
2506 \
e1f6de8f 2507 ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]; \
75e28f62
E
2508 setup_blocks_shaded_untextured_dither_b_##dithering(); \
2509 \
2510 vshr.u8 r_whole_8, r_whole_8, #3; \
 /* rbit followed by clz below turns right_mask into the index of its lowest set bit */ \
 /* NOTE(review): a right_mask of 0 would give 32, outside the 8-entry table - */ \
 /* presumably never produced by the span setup; confirm */ \
3867c6ef 2511 rbit right_mask, right_mask; \
75e28f62
E
2512 vmov pixels, msb_mask; \
2513 vbic.u8 gb_whole_8, gb_whole_8, d128_0x7; \
3867c6ef 2514 clz right_mask, right_mask; \
75e28f62
E
2515 \
2516 vmlal.u8 pixels, r_whole_8, d64_1; \
2517 vmlal.u8 pixels, g_whole_8, d64_4; \
2518 vmlal.u8 pixels, b_whole_8, d64_128; \
2519 \
 /* jump table indexed by right_mask: entries 4f..11f store 1..8 pixels respectively */ \
8184d7c5 2520 JT_OP_REL(100f, right_mask, temp); \
e1f6de8f 2521 JT_OP(ldr pc, [pc, right_mask, lsl #2]); \
3867c6ef 2522 nop; \
8184d7c5 2523 100: \
3867c6ef 2524 nop; \
8184d7c5 2525 .word JTE(100b, 4f); \
2526 .word JTE(100b, 5f); \
2527 .word JTE(100b, 6f); \
2528 .word JTE(100b, 7f); \
2529 .word JTE(100b, 8f); \
2530 .word JTE(100b, 9f); \
2531 .word JTE(100b, 10f); \
2532 .word JTE(100b, 11f); \
3867c6ef 2533 \
 /* 4: one pixel */ \
75e28f62 2534 4: \
e1f6de8f 2535 vst1.u16 { pixels_low[0] }, [fb_ptr]; \
3867c6ef
E
2536 bal 1f; \
2537 \
 /* 5: two pixels */ \
2538 5: \
e1f6de8f 2539 vst1.u32 { pixels_low[0] }, [fb_ptr]; \
3867c6ef
E
2540 bal 1f; \
2541 \
 /* 6: three pixels */ \
2542 6: \
e1f6de8f 2543 vst1.u32 { pixels_low[0] }, [fb_ptr]!; \
2544 vst1.u16 { pixels_low[2] }, [fb_ptr]; \
3867c6ef
E
2545 bal 1f; \
2546 \
 /* 7: four pixels */ \
2547 7: \
e1f6de8f 2548 vst1.u32 { pixels_low }, [fb_ptr]; \
3867c6ef
E
2549 bal 1f; \
2550 \
 /* 8: five pixels */ \
2551 8: \
e1f6de8f 2552 vst1.u32 { pixels_low }, [fb_ptr]!; \
2553 vst1.u16 { pixels_high[0] }, [fb_ptr]; \
3867c6ef
E
2554 bal 1f; \
2555 \
 /* 9: six pixels */ \
2556 9: \
e1f6de8f 2557 vst1.u32 { pixels_low }, [fb_ptr]!; \
2558 vst1.u32 { pixels_high[0] }, [fb_ptr]!; \
3867c6ef
E
2559 bal 1f; \
2560 \
 /* 10: seven pixels */ \
2561 10: \
e1f6de8f 2562 vst1.u32 { pixels_low }, [fb_ptr]!; \
2563 vst1.u32 { pixels_high[0] }, [fb_ptr]!; \
2564 vst1.u16 { pixels_high[2] }, [fb_ptr]; \
3867c6ef
E
2565 bal 1f; \
2566 \
 /* 11: all eight pixels; the bal below targets the label that immediately follows */ \
2567 11: \
e1f6de8f 2568 vst1.u32 { pixels }, [fb_ptr]; \
3867c6ef 2569 bal 1f; \
75e28f62
E
2570 \
 /* 1: advance the per-span pointers (16, 4 and 8 bytes) and loop while spans remain */ \
2571 1: \
2572 add span_uvrg_offset, span_uvrg_offset, #16; \
2573 add span_b_offset, span_b_offset, #4; \
2574 \
2575 add span_edge_data, span_edge_data, #8; \
2576 subs num_spans, num_spans, #1; \
2577 \
2578 bne 0b; \
2579 \
2580 ldmia sp!, { r4 - r11, pc } \
2581
2582setup_blocks_shaded_untextured_direct_builder(undithered)
2583setup_blocks_shaded_untextured_direct_builder(dithered)
2584
2585
2586#undef psx_gpu
2587#undef num_blocks
2588#undef triangle
2589#undef c_64
2590
/*
 * Register aliases used by texture_blocks_4bpp / texture_blocks_8bpp.
 * Several names deliberately share a register:
 *  - uv_1/uv_3/uv_5/uv_7 reuse the registers of uv_01/uv_23/uv_45/uv_67,
 *  - pixel_N reuses the register of uv_N,
 *  - pixels_a..pixels_d reuse r7, r9, r8, r10 (note pixels_b is r9, pixels_c is r8),
 *  - c_64 is r0, the same register as psx_gpu,
 *  - current_texture_mask / dirty_textures_mask share r5 / r6 with uv_45 / uv_67.
 */
2591#define psx_gpu r0
2592#define block_ptr r1
2593#define num_blocks r2
2594#define uv_01 r3
2595#define uv_23 r4
2596#define uv_45 r5
2597#define uv_67 r6
2598#define uv_0 r7
2599#define uv_1 r3
2600#define uv_2 r8
2601#define uv_3 r4
2602#define uv_4 r9
2603#define uv_5 r5
2604#define uv_6 r10
2605#define uv_7 r6
2606#define texture_ptr r11
2607
2608#define pixel_0 r7
2609#define pixel_1 r3
2610#define pixel_2 r8
2611#define pixel_3 r4
2612#define pixel_4 r9
2613#define pixel_5 r5
2614#define pixel_6 r10
2615#define pixel_7 r6
2616
2617#define pixels_a r7
2618#define pixels_b r9
2619#define pixels_c r8
2620#define pixels_d r10
2621
2622#define c_64 r0
2623
2624#define clut_ptr r12
2625#define current_texture_mask r5
2626#define dirty_textures_mask r6
2627
2628#define texels d0
2629
/* clut_a (q1) is the pair d2/d3 and clut_b (q2) is the pair d4/d5 */
2630#define clut_low_a d2
2631#define clut_low_b d3
2632#define clut_high_a d4
2633#define clut_high_b d5
2634
2635#define clut_a q1
2636#define clut_b q2
2637
2638#define texels_low d6
2639#define texels_high d7
2640
2641.align 3
2642
/* texture_blocks_untextured: no-op texturing stage; returns to the caller immediately. */
2643function(texture_blocks_untextured)
2644 bx lr
2645
2646
2647.align 3
2648
/*
 * texture_blocks_4bpp
 * In:  r0 = psx_gpu.
 * For each of the num_blocks blocks at psx_gpu + psx_gpu_blocks_offset (64 bytes
 * apart), reads eight 16-bit offsets from the start of the block, fetches one
 * byte per offset from the texture page, translates the bytes through the first
 * 16 CLUT entries with vtbl and writes the eight 16-bit results back over the
 * start of the block.
 * If dirty_textures_4bpp_mask and current_texture_mask share a set bit,
 * update_texture_4bpp_cache is called first.
 * NOTE(review): the loop decrements before testing, so num_blocks is presumably
 * never 0 on entry - confirm against callers.
 */
2649function(texture_blocks_4bpp)
2650 stmdb sp!, { r3 - r11, r14 }
2651 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2652
e1f6de8f 2653 ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2654 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62 2655
 /* load 32 bytes (16 halfword CLUT entries) into clut_a/clut_b */
e1f6de8f 2656 ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2657 vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]
75e28f62 2658
e1f6de8f 2659 ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
75e28f62
E
 /* de-interleave: clut_a now holds the low byte of every entry, clut_b the high byte */
2660 vuzp.u8 clut_a, clut_b
2661
e1f6de8f 2662 ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
75e28f62
E
2663 tst dirty_textures_mask, current_texture_mask
2664
2665 bne 1f
 /* c_64 aliases r0: psx_gpu is no longer available from here on */
2666 mov c_64, #64
2667
26680:
2669 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2670
 /* uxtah adds the low (or, with ror #16, the high) halfword of the pair to texture_ptr */
2671 uxtah uv_0, texture_ptr, uv_01
2672 uxtah uv_1, texture_ptr, uv_01, ror #16
2673
2674 uxtah uv_2, texture_ptr, uv_23
2675 uxtah uv_3, texture_ptr, uv_23, ror #16
2676
2677 uxtah uv_4, texture_ptr, uv_45
e1f6de8f 2678 ldrb pixel_0, [uv_0]
75e28f62
E
2679
2680 uxtah uv_5, texture_ptr, uv_45, ror #16
e1f6de8f 2681 ldrb pixel_1, [uv_1]
75e28f62
E
2682
2683 uxtah uv_6, texture_ptr, uv_67
e1f6de8f 2684 ldrb pixel_2, [uv_2]
75e28f62
E
2685
2686 uxtah uv_7, texture_ptr, uv_67, ror #16
e1f6de8f 2687 ldrb pixel_3, [uv_3]
75e28f62 2688
e1f6de8f 2689 ldrb pixel_4, [uv_4]
75e28f62
E
 /* flags set here are consumed by the "bne 0b" at the bottom of the loop */
2690 subs num_blocks, num_blocks, #1
2691
e1f6de8f 2692 ldrb pixel_5, [uv_5]
75e28f62
E
 /* pack bytes 0-3 into pixels_a and bytes 4-7 into pixels_b */
2693 orr pixels_a, pixel_0, pixel_1, lsl #8
2694
e1f6de8f 2695 ldrb pixel_6, [uv_6]
75e28f62
E
2696 orr pixels_b, pixel_4, pixel_5, lsl #8
2697
e1f6de8f 2698 ldrb pixel_7, [uv_7]
75e28f62
E
2699 orr pixels_a, pixels_a, pixel_2, lsl #16
2700
2701 orr pixels_b, pixels_b, pixel_6, lsl #16
2702 orr pixels_a, pixels_a, pixel_3, lsl #24
2703
2704 orr pixels_b, pixels_b, pixel_7, lsl #24
ed0fd81d 2705 vmov texels, pixels_a, pixels_b
75e28f62
E
2706
 /* look up the low and high byte of each CLUT entry separately */
2707 vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
2708 vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
2709
 /* vst2 re-interleaves low/high bytes into eight halfwords; block_ptr += 64 */
e1f6de8f 2710 vst2.u8 { texels_low, texels_high }, [block_ptr, :128], c_64
75e28f62
E
2711 bne 0b
2712
2713 ldmia sp!, { r3 - r11, pc }
2714
 /* 1: texture cache is dirty; r0 still holds psx_gpu because the mov above was skipped */
27151:
2716 stmdb sp!, { r1 - r2 }
2717 bl update_texture_4bpp_cache
2718
2719 mov c_64, #64
2720 ldmia sp!, { r1 - r2 }
2721 bal 0b
2722
2723
2724.align 3
2725
/*
 * texture_blocks_8bpp
 * In:  r0 = psx_gpu.
 * For each of the num_blocks blocks at psx_gpu + psx_gpu_blocks_offset (64 bytes
 * apart), reads eight 16-bit offsets from the start of the block, fetches one
 * byte per offset from the texture page, uses each byte as an index into the
 * halfword CLUT at clut_ptr and writes the eight 16-bit results back over the
 * start of the block.
 * If dirty_textures_8bpp_mask and current_texture_mask share a set bit,
 * update_texture_8bpp_cache is called first.
 * NOTE(review): the loop decrements before testing, so num_blocks is presumably
 * never 0 on entry - confirm against callers.
 */
2726function(texture_blocks_8bpp)
2727 stmdb sp!, { r3 - r11, r14 }
2728 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2729
e1f6de8f 2730 ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2731 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
75e28f62 2732
e1f6de8f 2733 ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2734 ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
75e28f62 2735
e1f6de8f 2736 ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset]
75e28f62
E
2737 tst dirty_textures_mask, current_texture_mask
2738
2739 bne 1f
2740 nop
2741
27420:
2743 ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2744
 /* uxtah adds the low (or, with ror #16, the high) halfword of the pair to texture_ptr */
2745 uxtah uv_0, texture_ptr, uv_01
2746 uxtah uv_1, texture_ptr, uv_01, ror #16
2747
2748 uxtah uv_2, texture_ptr, uv_23
2749 uxtah uv_3, texture_ptr, uv_23, ror #16
2750
2751 uxtah uv_4, texture_ptr, uv_45
e1f6de8f 2752 ldrb pixel_0, [uv_0]
75e28f62
E
2753
2754 uxtah uv_5, texture_ptr, uv_45, ror #16
e1f6de8f 2755 ldrb pixel_1, [uv_1]
75e28f62
E
2756
2757 uxtah uv_6, texture_ptr, uv_67
e1f6de8f 2758 ldrb pixel_2, [uv_2]
75e28f62
E
2759
2760 uxtah uv_7, texture_ptr, uv_67, ror #16
e1f6de8f 2761 ldrb pixel_3, [uv_3]
75e28f62 2762
e1f6de8f 2763 ldrb pixel_4, [uv_4]
75e28f62
E
 /* double each index so it becomes a byte offset into the halfword CLUT */
2764 add pixel_0, pixel_0, pixel_0
2765
e1f6de8f 2766 ldrb pixel_5, [uv_5]
75e28f62
E
2767 add pixel_1, pixel_1, pixel_1
2768
e1f6de8f 2769 ldrb pixel_6, [uv_6]
75e28f62
E
2770 add pixel_2, pixel_2, pixel_2
2771
e1f6de8f 2772 ldrb pixel_7, [uv_7]
75e28f62
E
2773 add pixel_3, pixel_3, pixel_3
2774
e1f6de8f 2775 ldrh pixel_0, [clut_ptr, pixel_0]
75e28f62
E
2776 add pixel_4, pixel_4, pixel_4
2777
e1f6de8f 2778 ldrh pixel_1, [clut_ptr, pixel_1]
75e28f62
E
2779 add pixel_5, pixel_5, pixel_5
2780
e1f6de8f 2781 ldrh pixel_2, [clut_ptr, pixel_2]
75e28f62
E
2782 add pixel_6, pixel_6, pixel_6
2783
e1f6de8f 2784 ldrh pixel_3, [clut_ptr, pixel_3]
75e28f62
E
2785 add pixel_7, pixel_7, pixel_7
2786
e1f6de8f 2787 ldrh pixel_4, [clut_ptr, pixel_4]
75e28f62
E
 /* pack pairs of 16-bit texels into 32-bit words */
2788 orr pixels_a, pixel_0, pixel_1, lsl #16
2789
e1f6de8f 2790 ldrh pixel_5, [clut_ptr, pixel_5]
75e28f62
E
2791 orr pixels_c, pixel_2, pixel_3, lsl #16
2792
e1f6de8f 2793 ldrh pixel_6, [clut_ptr, pixel_6]
75e28f62
E
 /* flags set here are consumed by the "bne 0b" below */
2794 subs num_blocks, num_blocks, #1
2795
e1f6de8f 2796 ldrh pixel_7, [clut_ptr, pixel_7]
75e28f62
E
2797 orr pixels_b, pixel_4, pixel_5, lsl #16
2798
2799 orr pixels_d, pixel_6, pixel_7, lsl #16
 /* stm stores in ascending register order: r7 (texels 0-1), r8 (2-3), r9 (4-5), r10 (6-7) */
2800 stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d }
2801
2802 add block_ptr, block_ptr, #64
2803 bne 0b
2804
2805 ldmia sp!, { r3 - r11, pc }
2806
 /* 1: texture cache is dirty; r0 still holds psx_gpu. block_ptr, num_blocks and */
 /* clut_ptr (r1, r2, r12) are preserved around the call */
28071:
4d646738 2808 stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
75e28f62
E
2809
2810 bl update_texture_8bpp_cache
2811
4d646738 2812 ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
75e28f62
E
2813 bal 0b
2814
2815
2816#undef uv_0
2817#undef uv_1
2818#undef uv_2
2819#undef uv_3
2820#undef uv_4
2821#undef uv_5
2822#undef uv_6
2823#undef uv_7
2824
2825#undef pixel_0
2826#undef pixel_1
2827#undef pixel_2
2828#undef pixel_3
2829#undef pixel_4
2830#undef pixel_5
2831#undef pixel_6
2832#undef pixel_7
2833
2834#undef texture_ptr
2835
2836#undef pixels_a
2837#undef pixels_b
2838#undef pixels_c
2839#undef pixels_d
2840
/*
 * Register aliases used by texture_blocks_16bpp.
 * Each uv_N shares its register with u_N and pixel_N, and the v_N scratch
 * registers overlap the registers of later uv values (for example v_0 / uv_2
 * are both r5). v_7 is r0, the same register as psx_gpu.
 */
2841#define psx_gpu r0
2842#define block_ptr r1
2843#define num_blocks r2
2844
2845#define uv_0 r3
2846#define uv_1 r4
2847#define u_0 r3
2848#define u_1 r4
2849#define v_0 r5
2850#define v_1 r6
2851
2852#define uv_2 r5
2853#define uv_3 r6
2854#define u_2 r5
2855#define u_3 r6
2856#define v_2 r7
2857#define v_3 r8
2858
2859#define uv_4 r7
2860#define uv_5 r8
2861#define u_4 r7
2862#define u_5 r8
2863#define v_4 r9
2864#define v_5 r10
2865
2866#define uv_6 r9
2867#define uv_7 r10
2868#define u_6 r9
2869#define u_7 r10
2870#define v_6 r11
2871#define v_7 r0
2872
2873#define pixel_0 r3
2874#define pixel_1 r4
2875#define pixel_2 r5
2876#define pixel_3 r6
2877#define pixel_4 r7
2878#define pixel_5 r8
2879#define pixel_6 r9
2880#define pixel_7 r10
2881
2882#define pixels_a r3
2883#define pixels_b r5
2884#define pixels_c r7
2885#define pixels_d r9
2886
2887#define texture_ptr r12
2888
2889
2890.align 3
2891
/*
 * texture_blocks_16bpp
 * In:  r0 = psx_gpu.
 * For each of the num_blocks blocks at psx_gpu + psx_gpu_blocks_offset (64 bytes
 * apart), reads eight 16-bit values from the start of the block. Each value is
 * split into its low byte (u) and high byte (v) and turned into the byte offset
 * 2 * (u + v * 1024), i.e. u * 2 + v * 2048. The halfword at texture_ptr + offset
 * is fetched and the eight results are written back over the start of the block.
 * NOTE(review): the loop decrements before testing, so num_blocks is presumably
 * never 0 on entry - confirm against callers.
 */
2892function(texture_blocks_16bpp)
2893 stmdb sp!, { r3 - r11, r14 }
2894 add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2895
e1f6de8f 2896 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2897 ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
75e28f62
E
2898
28990:
e1f6de8f 2900 ldrh uv_0, [block_ptr]
75e28f62
E
 /* flags set here are consumed by the "bne 0b" at the bottom of the loop */
2901 subs num_blocks, num_blocks, #1
2902
e1f6de8f 2903 ldrh uv_1, [block_ptr, #2]
75e28f62
E
2904
 /* v = value & 0xFF00 (still shifted left by 8), u = value & 0xFF */
2905 and v_0, uv_0, #0xFF00
2906 and v_1, uv_1, #0xFF00
2907
2908 and u_0, uv_0, #0xFF
2909 and u_1, uv_1, #0xFF
2910
 /* u + (v << 2): the v byte ends up multiplied by 1024 */
2911 add uv_0, u_0, v_0, lsl #2
e1f6de8f 2912 ldrh uv_2, [block_ptr, #4]
75e28f62
E
2913
2914 add uv_1, u_1, v_1, lsl #2
e1f6de8f 2915 ldrh uv_3, [block_ptr, #6]
75e28f62
E
2916
 /* double to convert the halfword index into a byte offset */
2917 add uv_0, uv_0, uv_0
2918 add uv_1, uv_1, uv_1
2919
2920 and v_2, uv_2, #0xFF00
2921 and v_3, uv_3, #0xFF00
2922
2923 and u_2, uv_2, #0xFF
2924 and u_3, uv_3, #0xFF
2925
2926 add uv_2, u_2, v_2, lsl #2
e1f6de8f 2927 ldrh uv_4, [block_ptr, #8]
75e28f62
E
2928
2929 add uv_3, u_3, v_3, lsl #2
e1f6de8f 2930 ldrh uv_5, [block_ptr, #10]
75e28f62
E
2931
2932 add uv_2, uv_2, uv_2
2933 add uv_3, uv_3, uv_3
2934
2935 and v_4, uv_4, #0xFF00
2936 and v_5, uv_5, #0xFF00
2937
2938 and u_4, uv_4, #0xFF
2939 and u_5, uv_5, #0xFF
2940
2941 add uv_4, u_4, v_4, lsl #2
e1f6de8f 2942 ldrh uv_6, [block_ptr, #12]
75e28f62
E
2943
2944 add uv_5, u_5, v_5, lsl #2
e1f6de8f 2945 ldrh uv_7, [block_ptr, #14]
75e28f62
E
2946
2947 add uv_4, uv_4, uv_4
e1f6de8f 2948 ldrh pixel_0, [texture_ptr, uv_0]
75e28f62
E
2949
2950 add uv_5, uv_5, uv_5
e1f6de8f 2951 ldrh pixel_1, [texture_ptr, uv_1]
75e28f62
E
2952
2953 and v_6, uv_6, #0xFF00
e1f6de8f 2954 ldrh pixel_2, [texture_ptr, uv_2]
75e28f62
E
2955
 /* v_7 is r0: psx_gpu is overwritten here and is not read again */
2956 and v_7, uv_7, #0xFF00
e1f6de8f 2957 ldrh pixel_3, [texture_ptr, uv_3]
75e28f62
E
2958
2959 and u_6, uv_6, #0xFF
e1f6de8f 2960 ldrh pixel_4, [texture_ptr, uv_4]
75e28f62
E
2961
2962 and u_7, uv_7, #0xFF
e1f6de8f 2963 ldrh pixel_5, [texture_ptr, uv_5]
75e28f62
E
2964
2965 add uv_6, u_6, v_6, lsl #2
2966 add uv_7, u_7, v_7, lsl #2
2967
2968 add uv_6, uv_6, uv_6
2969 add uv_7, uv_7, uv_7
2970
 /* pack pairs of 16-bit texels into 32-bit words */
2971 orr pixels_a, pixel_0, pixel_1, lsl #16
2972 orr pixels_b, pixel_2, pixel_3, lsl #16
2973
e1f6de8f 2974 ldrh pixel_6, [texture_ptr, uv_6]
75e28f62
E
2975 orr pixels_c, pixel_4, pixel_5, lsl #16
2976
e1f6de8f 2977 ldrh pixel_7, [texture_ptr, uv_7]
75e28f62
E
2978 orr pixels_d, pixel_6, pixel_7, lsl #16
2979
2980 stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
2981 add block_ptr, block_ptr, #64
2982
2983 bne 0b
2984
2985 ldmia sp!, { r3 - r11, pc }
2986
2987
2988#undef num_blocks
2989
2990#undef test_mask
2991#undef texels
2992#undef pixels_b
2993#undef pixels
2994#undef d64_1
2995#undef d64_4
2996#undef d64_128
2997#undef draw_mask
2998#undef msb_mask
2999#undef msb_mask_low
3000#undef msb_mask_high
3001#undef fb_pixels
3002
3003#undef c_32
3004#undef fb_ptr
3005#undef mask_msb_ptr
3006
/*
 * Register aliases used by the shade_blocks_*_textured_modulated_* functions.
 * Shared registers to be aware of:
 *  - r2: color_ptr, colors_scalar, mask_msb_ptr and c_32,
 *  - r0: psx_gpu and block_ptr_load_a,
 *  - r3: colors_scalar_compare and block_ptr_store,
 *  - q4: texels and zero_mask,
 *  - q8: pixels_r and fb_pixels.
 */
3007#define psx_gpu r0
3008#define num_blocks r1
3009#define color_ptr r2
3867c6ef
E
3010#define colors_scalar r2
3011#define colors_scalar_compare r3
75e28f62
E
3012#define mask_msb_ptr r2
3013
3014#define block_ptr_load_a r0
3015#define block_ptr_store r3
3016#define block_ptr_load_b r12
3017#define c_32 r2
3018
3019#define c_48 r4
3020#define fb_ptr r14
3021#define draw_mask_bits_scalar r5
3022
3023#define d128_0x07 q0
3024#define d128_0x1F q1
3025#define d128_0x8000 q2
3026#define test_mask q3
3027#define texels q4
3028#define colors_rg q5
3029#define colors_b_dm_bits q6
3030#define texels_rg q7
3031#define pixels_r q8
3032#define pixels_g q9
3033#define pixels_b q10
3034#define pixels q11
3035#define zero_mask q4
3036#define draw_mask q12
3037#define msb_mask q13
3038
3039#define fb_pixels q8
3040
/* q9 doubles as the pair (pixels_g_low, pixels_b_low) = (d18, d19) */
3041#define pixels_gb_low q9
3042
/* d10/d11 are the halves of colors_rg (q5); d12/d13 the halves of colors_b_dm_bits (q6); */
/* d14/d15 the halves of texels_rg (q7) */
3043#define colors_r d10
3044#define colors_g d11
3045#define colors_b d12
3046#define draw_mask_bits d13
3047#define texels_r d14
3048#define texels_g d15
3049#define pixels_r_low d16
3050#define pixels_g_low d18
3051#define pixels_b_low d19
3052#define msb_mask_low d26
3053#define msb_mask_high d27
3054
3055#define d64_1 d28
3056#define d64_4 d29
3057#define d64_128 d30
3058#define texels_b d31
3059
/* Indirect target prologue: c_48 = 48 and block_ptr_store points at the block array. */
3060#define shade_blocks_textured_modulated_prologue_indirect() \
3061 mov c_48, #48; \
3062 add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset \
3063
/* Direct target prologue: replicate the 16-bit mask_msb value into every lane of msb_mask. */
3064#define shade_blocks_textured_modulated_prologue_direct() \
3065 add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
e1f6de8f 3066 vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16] \
75e28f62 3067
75e28f62 3068
3867c6ef
E
/* Shaded prologue: intentionally empty. */
3069#define shade_blocks_textured_modulated_prologue_shaded(dithering, target) \
3070
/* Undithered only: if the triangle color word equals 0x00808080, branch to the */
/* unmodulated variant for the same target instead. */
3071#define shade_blocks_textured_false_modulation_check_undithered(target) \
e1f6de8f 3072 ldr colors_scalar, [psx_gpu, #psx_gpu_triangle_color_offset]; \
3867c6ef
E
3073 movw colors_scalar_compare, #0x8080; \
3074 \
3075 movt colors_scalar_compare, #0x80; \
3076 cmp colors_scalar, colors_scalar_compare; \
3077 beq shade_blocks_textured_unmodulated_##target \
3078
/* Dithered: no false-modulation shortcut. */
3079#define shade_blocks_textured_false_modulation_check_dithered(target) \
3080
/* Unshaded prologue: load the 32-bit triangle color and broadcast its bytes 0, 1 and 2 */
/* into colors_r, colors_g and colors_b. colors_r is written last because the other two */
/* read their source bytes from it. */
3081#define shade_blocks_textured_modulated_prologue_unshaded(dithering, target) \
3082 shade_blocks_textured_false_modulation_check_##dithering(target); \
75e28f62 3083 add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset; \
e1f6de8f 3084 vld1.u32 { colors_r[] }, [color_ptr, :32]; \
75e28f62
E
3085 vdup.u8 colors_g, colors_r[1]; \
3086 vdup.u8 colors_b, colors_r[2]; \
3087 vdup.u8 colors_r, colors_r[0] \
3088
3089
/* Dithered: preload an accumulator from block_ptr_load_b (no pointer advance). */
3090#define shade_blocks_textured_modulated_load_dithered(target) \
e1f6de8f 3091 vld1.u32 { target }, [block_ptr_load_b, :128] \
75e28f62
E
3092
/* Dithered, last load for a block: same load, then block_ptr_load_b += c_32. */
3093#define shade_blocks_textured_modulated_load_last_dithered(target) \
e1f6de8f 3094 vld1.u32 { target }, [block_ptr_load_b, :128], c_32 \
75e28f62
E
3095
/* Undithered: nothing to preload. */
3096#define shade_blocks_textured_modulated_load_undithered(target) \
3097
/* Undithered, last "load": only advance block_ptr_load_b by 32 to stay in step. */
3098#define shade_blocks_textured_modulated_load_last_undithered(target) \
3099 add block_ptr_load_b, block_ptr_load_b, #32 \
3100
/* Dithered: multiply-accumulate texel * color onto the preloaded accumulator. */
3101#define shade_blocks_textured_modulate_dithered(channel) \
3102 vmlal.u8 pixels_##channel, texels_##channel, colors_##channel \
3103
/* Undithered: plain widening multiply texel * color. */
3104#define shade_blocks_textured_modulate_undithered(channel) \
3105 vmull.u8 pixels_##channel, texels_##channel, colors_##channel \
3106
3107
/* Indirect: write draw_mask to the block and advance block_ptr_store by 16; offset is unused. */
3108#define shade_blocks_textured_modulated_store_draw_mask_indirect(offset) \
e1f6de8f 3109 vst1.u32 { draw_mask }, [block_ptr_store, :128]! \
75e28f62
E
3110
/* Direct: fetch the block's framebuffer pointer (at offset - 64 from block_ptr_load_b), */
/* load the current framebuffer pixels and copy them into pixels wherever draw_mask is set. */
3111#define shade_blocks_textured_modulated_store_draw_mask_direct(offset) \
e1f6de8f 3112 ldr fb_ptr, [block_ptr_load_b, #(offset - 64)]; \
3113 vld1.u32 { fb_pixels }, [fb_ptr]; \
75e28f62
E
3114 vbit.u16 pixels, fb_pixels, draw_mask \
3115
/* Indirect: write pixels to the block and advance block_ptr_store by c_48. */
3116#define shade_blocks_textured_modulated_store_pixels_indirect() \
e1f6de8f 3117 vst1.u32 { pixels }, [block_ptr_store, :128], c_48 \
75e28f62
E
3118
/* Direct: write pixels to the framebuffer address loaded by the draw-mask step. */
3119#define shade_blocks_textured_modulated_store_pixels_direct() \
e1f6de8f 3120 vst1.u32 { pixels }, [fb_ptr] \
75e28f62
E
3121
3122
/* Shaded: load per-pixel colors_r / colors_g from the block; block_ptr_load_b += c_32. */
3123#define shade_blocks_textured_modulated_load_rg_shaded() \
e1f6de8f 3124 vld1.u32 { colors_r, colors_g }, [block_ptr_load_b, :128], c_32 \
75e28f62
E
3125
/* Unshaded: colors are constant, so only advance block_ptr_load_b by 32. */
3126#define shade_blocks_textured_modulated_load_rg_unshaded() \
3127 add block_ptr_load_b, block_ptr_load_b, #32 \
3128
/* Shaded: load colors_b and draw_mask_bits together; block_ptr_load_a += c_32. */
3129#define shade_blocks_textured_modulated_load_bdm_shaded() \
e1f6de8f 3130 vld1.u32 { colors_b, draw_mask_bits }, [block_ptr_load_a, :128], c_32 \
75e28f62
E
3131
/* Unshaded: load only the draw mask bits (8 bytes past block_ptr_load_a), then advance by 32. */
3132#define shade_blocks_textured_modulated_load_bdm_unshaded() \
e1f6de8f 3133 ldr draw_mask_bits_scalar, [block_ptr_load_a, #8]; \
75e28f62
E
3134 add block_ptr_load_a, block_ptr_load_a, #32 \
3135
/* Broadcast the 16-bit draw mask bits into every lane of draw_mask (NEON source). */
3136#define shade_blocks_textured_modulated_expand_draw_mask_shaded() \
3137 vdup.u16 draw_mask, draw_mask_bits[0] \
3138
/* Broadcast the 16-bit draw mask bits into every lane of draw_mask (core register source). */
3139#define shade_blocks_textured_modulated_expand_draw_mask_unshaded() \
3140 vdup.u16 draw_mask, draw_mask_bits_scalar \
3141
3142
/* Indirect: msb mask is not applied at this stage. */
3143#define shade_blocks_textured_modulated_apply_msb_mask_indirect() \
3144
/* Direct: OR msb_mask into pixels. */
3145#define shade_blocks_textured_modulated_apply_msb_mask_direct() \
3146 vorr.u16 pixels, pixels, msb_mask \
3147
3148
/*
 * Generates shade_blocks_<shading>_textured_modulated_<dithering>_<target>.
 * In:  r0 = psx_gpu.
 * For every block, the 16-bit texels are split into three 5-bit channels, each
 * channel is multiplied by the matching color, narrowed with saturation and
 * repacked into 16-bit pixels that keep the texel's bit 15. Texels equal to 0
 * are added to the draw mask.
 * The loop is software pipelined: each iteration finishes and stores the
 * previous block while starting the next one; label 1: drains the last block.
 * NOTE(review): the loop decrements before testing, so num_blocks is presumably
 * never 0 on entry - confirm against callers.
 */
3149#define shade_blocks_textured_modulated_builder(shading, dithering, target) \
3150.align 3; \
3151 \
3152function(shade_blocks_##shading##_textured_modulated_##dithering##_##target) \
 /* runs before the register save: the unshaded/undithered variant may branch away here */ \
3867c6ef 3153 shade_blocks_textured_modulated_prologue_##shading(dithering, target); \
75e28f62 3154 stmdb sp!, { r4 - r5, lr }; \
e1f6de8f 3155 ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
75e28f62 3156 \
 /* test_mask is the first 16 bytes of psx_gpu */ \
e1f6de8f 3157 vld1.u32 { test_mask }, [psx_gpu, :128]; \
75e28f62
E
3158 \
3159 shade_blocks_textured_modulated_prologue_##target(); \
75e28f62
E
3160 \
 /* block_ptr_load_a aliases r0: psx_gpu is no longer available from here on */ \
3161 add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset; \
3162 mov c_32, #32; \
3163 \
3164 add block_ptr_load_b, block_ptr_load_a, #16; \
3165 vmov.u8 d64_1, #1; \
3166 vmov.u8 d64_4, #4; \
3167 vmov.u8 d64_128, #128; \
3168 \
e1f6de8f 3169 vld1.u32 { texels }, [block_ptr_load_a, :128], c_32; \
75e28f62
E
3170 vmov.u8 d128_0x07, #0x07; \
3171 \
3172 shade_blocks_textured_modulated_load_rg_##shading(); \
3173 vmov.u8 d128_0x1F, #0x1F; \
3174 \
3175 shade_blocks_textured_modulated_load_bdm_##shading(); \
3176 vmov.u16 d128_0x8000, #0x8000; \
3177 \
 /* texels_r = texel bits 0-7, texels_g = bits 5-12, texels_b = bits 7-14 */ \
3178 vmovn.u16 texels_r, texels; \
3179 vshrn.u16 texels_g, texels, #5; \
3180 \
3181 vshrn.u16 texels_b, texels, #7; \
3182 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3183 \
3184 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3185 vtst.u16 draw_mask, draw_mask, test_mask; \
3186 \
3187 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
 /* keep 5 bits of r and g (texels_rg covers both) */ \
3188 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3189 \
3190 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
 /* texels_b >> 3 leaves texel bits 10-14 */ \
3191 vshr.u8 texels_b, texels_b, #3; \
3192 \
3193 shade_blocks_textured_modulate_##dithering(r); \
3194 shade_blocks_textured_modulate_##dithering(g); \
3195 shade_blocks_textured_modulate_##dithering(b); \
3196 \
 /* pixels starts as the texel's bit 15; this must read texels before the vceq */ \
 /* below overwrites it (zero_mask and texels are the same register) */ \
3197 vand.u16 pixels, texels, d128_0x8000; \
3198 vceq.u16 zero_mask, texels, #0; \
3199 \
 /* narrow each product to 8 bits with a saturating shift right by 4 */ \
3200 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3201 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3202 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3203 \
3204 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
 /* texels equal to 0 are added to the draw mask */ \
3205 vorr.u16 draw_mask, draw_mask, zero_mask; \
 /* r is reduced to its top 5 bits; g and b keep theirs in place with the low 3 bits cleared */ \
3206 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3207 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3208 \
3209 subs num_blocks, num_blocks, #1; \
3210 beq 1f; \
3211 \
3212 .align 3; \
3213 \
 /* 0: start loading block N+1 while packing and storing block N */ \
3214 0: \
e1f6de8f 3215 vld1.u32 { texels }, [block_ptr_load_a, :128], c_32; \
75e28f62
E
3216 shade_blocks_textured_modulated_load_rg_##shading(); \
3217 vshrn.u16 texels_g, texels, #5; \
3218 \
3219 shade_blocks_textured_modulated_load_bdm_##shading(); \
3220 vshrn.u16 texels_b, texels, #7; \
3221 \
e1f6de8f 3222 pld [block_ptr_load_a]; \
75e28f62
E
3223 vmovn.u16 texels_r, texels; \
 /* pixels += r * 1 + g * 4 + b * 128: r lands in bits 0-4, g in 5-9, b in 10-14 */ \
3224 vmlal.u8 pixels, pixels_r_low, d64_1; \
3225 \
3226 vmlal.u8 pixels, pixels_g_low, d64_4; \
3227 vmlal.u8 pixels, pixels_b_low, d64_128; \
 /* the offset argument is only used by the direct variant */ \
3228 shade_blocks_textured_modulated_store_draw_mask_##target(-4); \
3229 \
3230 shade_blocks_textured_modulated_load_##dithering(pixels_r); \
3231 shade_blocks_textured_modulated_expand_draw_mask_##shading(); \
3232 \
3233 shade_blocks_textured_modulated_load_##dithering(pixels_g); \
3234 vand.u8 texels_rg, texels_rg, d128_0x1F; \
3235 \
3236 shade_blocks_textured_modulated_load_last_##dithering(pixels_b); \
3237 vtst.u16 draw_mask, draw_mask, test_mask; \
3238 \
3239 shade_blocks_textured_modulated_store_pixels_##target(); \
3240 vshr.u8 texels_b, texels_b, #3; \
3241 \
3242 shade_blocks_textured_modulate_##dithering(r); \
3243 shade_blocks_textured_modulate_##dithering(g); \
3244 shade_blocks_textured_modulate_##dithering(b); \
3245 \
3246 vand.u16 pixels, texels, d128_0x8000; \
3247 vceq.u16 zero_mask, texels, #0; \
3248 \
 /* flags set here are consumed by the "bne 0b" below */ \
3249 subs num_blocks, num_blocks, #1; \
3250 \
3251 vqshrun.s16 pixels_r_low, pixels_r, #4; \
3252 vqshrun.s16 pixels_g_low, pixels_g, #4; \
3253 vqshrun.s16 pixels_b_low, pixels_b, #4; \
3254 \
3255 shade_blocks_textured_modulated_apply_msb_mask_##target(); \
3256 vorr.u16 draw_mask, draw_mask, zero_mask; \
3257 vshr.u8 pixels_r_low, pixels_r_low, #3; \
3258 vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07; \
3259 \
3260 bne 0b; \
3261 \
 /* 1: drain the pipeline - pack and store the final block */ \
3262 1: \
3263 vmlal.u8 pixels, pixels_r_low, d64_1; \
3264 vmlal.u8 pixels, pixels_g_low, d64_4; \
3265 vmlal.u8 pixels, pixels_b_low, d64_128; \
3266 \
 /* offset differs from the in-loop call (28 vs -4) because no load for a */ \
 /* following block has advanced block_ptr_load_b by a further 32 here */ \
3267 shade_blocks_textured_modulated_store_draw_mask_##target(28); \
3268 shade_blocks_textured_modulated_store_pixels_##target(); \
3269 \
3270 ldmia sp!, { r4 - r5, pc } \
3271
3272
/* Emit one modulated textured shader per (shading, dithering) pair, first
 * for the "direct" target and then for the "indirect" target. */
shade_blocks_textured_modulated_builder(shaded,   dithered,   direct);
shade_blocks_textured_modulated_builder(shaded,   undithered, direct);
shade_blocks_textured_modulated_builder(unshaded, dithered,   direct);
shade_blocks_textured_modulated_builder(unshaded, undithered, direct);

shade_blocks_textured_modulated_builder(shaded,   dithered,   indirect);
shade_blocks_textured_modulated_builder(shaded,   undithered, indirect);
shade_blocks_textured_modulated_builder(unshaded, dithered,   indirect);
shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3282
3283
/* Drop the aliases left over from the preceding section before they are
 * rebound for the unmodulated / untextured shaders below. */
#undef c_64
#undef fb_ptr
#undef color_ptr

#undef color_r
#undef color_g
#undef color_b

#undef test_mask
#undef pixels
#undef draw_mask
#undef zero_mask
#undef fb_pixels
#undef msb_mask
#undef msb_mask_low
#undef msb_mask_high

/* Core register aliases.  Several names share one register (r0, r2, r3,
 * r12, r14); each function only uses the later alias once the earlier
 * value is dead. */
#define psx_gpu                 r0
#define num_blocks              r1
#define mask_msb_ptr            r2
#define color_ptr               r3

#define block_ptr_load          r0
#define draw_mask_store_ptr     r3
#define draw_mask_bits_ptr      r12
#define draw_mask_ptr           r12
#define pixel_store_ptr         r14

#define fb_ptr_cmp              r4

#define fb_ptr                  r3
#define fb_ptr_next             r14

#define c_64                    r2

/* NEON quad-register aliases. */
#define test_mask               q0
#define pixels                  q1
#define draw_mask               q2
#define zero_mask               q3
#define draw_mask_combined      q4
#define fb_pixels               q5
#define fb_pixels_next          q6
#define msb_mask                q7

/* Double-register halves of draw_mask (q2) and msb_mask (q7), used by the
 * replicate-to-all-lanes loads. */
#define draw_mask_low           d4
#define draw_mask_high          d5
#define msb_mask_low            d14
#define msb_mask_high           d15
3332
.align 3

/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * In:  r0 = psx_gpu
 *
 * Walks the num_blocks queued blocks (64 bytes apart, the first one at
 * psx_gpu + psx_gpu_blocks_offset).  For every block:
 *   - the eight halfwords at block + 0 are copied unchanged to block + 16;
 *   - block + 0 is then overwritten with a per-lane mask that is all ones
 *     where the source halfword was zero, or where the halfword at
 *     block + 40 has that lane's test_mask bit set.
 * test_mask is the 128-bit vector stored at the very start of psx_gpu.
 *
 * The loop is software pipelined: block N + 1 is loaded while the mask of
 * block N is combined and stored.
 *
 * NOTE(review): num_blocks == 0 is not handled (the counter would wrap);
 * presumably callers never dispatch an empty batch - confirm.
 * NOTE(review): q4 (d8-d9) is written without being saved although AAPCS
 * lists d8-d15 as callee-saved; presumably callers account for this.
 */
function(shade_blocks_textured_unmodulated_indirect)
  /* r14 doubles as pixel_store_ptr, so the return address has to be saved.
   * It is pushed properly instead of being parked below sp, where a signal
   * frame could overwrite it.  r4 is pushed only to keep sp 8-byte aligned
   * and to match the sibling shaders. */
  stmdb     sp!, { r4, r14 }
  add       draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add       pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  vld1.u32  { test_mask }, [psx_gpu, :128]
  add       draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset

  mov       c_64, #64
  /* block_ptr_load aliases psx_gpu (r0): psx_gpu is dead from here on. */
  add       block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  /* Prime the pipeline with the first block. */
  vld1.u32  { pixels }, [block_ptr_load, :128], c_64
  vld1.u16  { draw_mask_low[], draw_mask_high[] }, [draw_mask_bits_ptr, :16], c_64
  vceq.u16  zero_mask, pixels, #0

  vtst.u16  draw_mask, draw_mask, test_mask
  vst1.u32  { pixels }, [pixel_store_ptr, :128], c_64

  subs      num_blocks, num_blocks, #1
  beq       1f

0:
  vld1.u32  { pixels }, [block_ptr_load, :128], c_64
  /* Finish the previous block's mask before its inputs are replaced. */
  vorr.u16  draw_mask_combined, draw_mask, zero_mask

  vld1.u16  { draw_mask_low[], draw_mask_high[] }, [draw_mask_bits_ptr, :16], c_64
  vceq.u16  zero_mask, pixels, #0

  vtst.u16  draw_mask, draw_mask, test_mask
  vst1.u32  { pixels }, [pixel_store_ptr, :128], c_64

  vst1.u32  { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
  subs      num_blocks, num_blocks, #1

  bne       0b

1:
  /* Drain: the last block's mask is still pending. */
  vorr.u16  draw_mask_combined, draw_mask, zero_mask
  vst1.u32  { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64

  ldmia     sp!, { r4, pc }
3379
3380
.align 3

/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In:  r0 = psx_gpu
 *
 * For each of the num_blocks queued blocks (64 bytes apart, the first one
 * at psx_gpu + psx_gpu_blocks_offset) the eight halfwords at block + 0 are
 * OR-ed with the replicated halfword at psx_gpu_mask_msb_offset and written
 * to the address held at block + 44.  A lane is left untouched in memory
 * when its source halfword is zero, or when the halfword at block + 40 has
 * that lane's test_mask bit set (test_mask = first 128 bits of psx_gpu).
 *
 * The loop is software pipelined.  When two consecutive destinations are
 * within 14 bytes of each other the 16-byte accesses overlap, so path 4
 * stores the finished block before loading the next destination.
 *
 * NOTE(review): num_blocks == 0 is not handled; presumably callers never
 * dispatch an empty batch - confirm.
 * NOTE(review): q4-q7 (d8-d15) are written without being saved although
 * AAPCS lists them as callee-saved; presumably callers account for this.
 */
function(shade_blocks_textured_unmodulated_direct)
  stmdb     sp!, { r4, r14 }
  add       draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
  /* c_64 shares r2 with mask_msb_ptr, which is dead after the load above. */
  mov       c_64, #64

  vld1.u32  { test_mask }, [psx_gpu, :128]
  add       block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  vld1.u16  { draw_mask_low[], draw_mask_high[] }, [draw_mask_bits_ptr, :16], c_64
  ldr       fb_ptr_next, [block_ptr_load, #44]

  vld1.u32  { pixels }, [block_ptr_load, :128], c_64
  vld1.u16  { fb_pixels_next }, [fb_ptr_next]
  vceq.u16  zero_mask, pixels, #0
  vtst.u16  draw_mask, draw_mask, test_mask

  subs      num_blocks, num_blocks, #1
  beq       1f

0:
  mov       fb_ptr, fb_ptr_next
  ldr       fb_ptr_next, [block_ptr_load, #44]

  vorr.u16  pixels, pixels, msb_mask

  vorr.u16  draw_mask_combined, draw_mask, zero_mask
  vmov      fb_pixels, fb_pixels_next

  vld1.u16  { draw_mask_low[], draw_mask_high[] }, [draw_mask_bits_ptr, :16], c_64
  /* Keep the framebuffer lane where the combined mask is set. */
  vbif.u16  fb_pixels, pixels, draw_mask_combined

  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr
  pld       [fb_ptr_next, #64]

  add       fb_ptr_cmp, fb_ptr_cmp, #14
  vld1.u32  { pixels }, [block_ptr_load, :128], c_64

  /* Unsigned (next - cur + 14) <= 28  <=>  |next - cur| <= 14 bytes. */
  cmp       fb_ptr_cmp, #28
  bls       4f

  vld1.u16  { fb_pixels_next }, [fb_ptr_next]
  vceq.u16  zero_mask, pixels, #0

  vst1.u16  { fb_pixels }, [fb_ptr]
  vtst.u16  draw_mask, draw_mask, test_mask

3:
  subs      num_blocks, num_blocks, #1
  bne       0b

1:
  /* Drain the last block.  msb_mask has to be merged here as well: the
   * loop only applies it to the block fetched one iteration earlier, so
   * without this the final block would be written without it. */
  vorr.u16  pixels, pixels, msb_mask
  vorr.u16  draw_mask_combined, draw_mask, zero_mask
  vbif.u16  fb_pixels_next, pixels, draw_mask_combined

  vst1.u16  { fb_pixels_next }, [fb_ptr_next]

  ldmia     sp!, { r4, pc }

4:
  /* Overlapping destinations: store first, then read the next one. */
  vst1.u16  { fb_pixels }, [fb_ptr]
  vceq.u16  zero_mask, pixels, #0

  vld1.u16  { fb_pixels_next }, [fb_ptr_next]
  vtst.u16  draw_mask, draw_mask, test_mask

  bal       3b
3456
3457
/* Indirect variant for unshaded, untextured blocks: an immediate return,
 * nothing is read or written. */
function(shade_blocks_unshaded_untextured_indirect)
  bx        lr
3460
.align 3

/*
 * shade_blocks_unshaded_untextured_direct
 *
 * In:  r0 = psx_gpu
 *
 * Loads one 8-halfword colour vector from the first block (block + 16),
 * ORs in the replicated halfword at psx_gpu_mask_msb_offset, and writes that
 * same vector to the destination of every queued block (pointer at
 * block + 44).  Lanes whose bit is set in the block's 128-bit mask at
 * block + 0 keep their current framebuffer value.
 *
 * Path 4 handles consecutive destinations that lie within 14 bytes of each
 * other: the store is issued before the next destination is read.
 */
function(shade_blocks_unshaded_untextured_direct)
  push      { r4, lr }
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
  add       color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  /* block_ptr_load aliases psx_gpu (r0); psx_gpu is dead after this. */
  add       block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
  vld1.u16  { pixels }, [color_ptr, :128]

  mov       c_64, #64
  vld1.u16  { draw_mask }, [draw_mask_ptr, :128], c_64

  vorr.u16  pixels, pixels, msb_mask

  ldr       fb_ptr_next, [block_ptr_load], #64
  vld1.u16  { fb_pixels_next }, [fb_ptr_next]

  subs      num_blocks, num_blocks, #1
  beq       1f

0:
  vmov      fb_pixels, fb_pixels_next
  mov       fb_ptr, fb_ptr_next
  ldr       fb_ptr_next, [block_ptr_load], #64

  vbif.u16  fb_pixels, pixels, draw_mask
  vld1.u16  { draw_mask }, [draw_mask_ptr, :128], c_64

  /* Unsigned (next - cur + 14) <= 28  <=>  |next - cur| <= 14 bytes. */
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr
  add       fb_ptr_cmp, fb_ptr_cmp, #14
  cmp       fb_ptr_cmp, #28
  bls       4f

  vld1.u16  { fb_pixels_next }, [fb_ptr_next]
  vst1.u16  { fb_pixels }, [fb_ptr]

3:
  subs      num_blocks, num_blocks, #1
  bne       0b

1:
  /* Drain the last block. */
  vbif.u16  fb_pixels_next, pixels, draw_mask
  vst1.u16  { fb_pixels_next }, [fb_ptr_next]

  pop       { r4, pc }

4:
  /* Overlapping destinations: store before loading. */
  vst1.u16  { fb_pixels }, [fb_ptr]
  vld1.u16  { fb_pixels_next }, [fb_ptr_next]
  b         3b
3517
3518
/* Rebind the core register aliases for the blend_blocks_* routines. */
#undef draw_mask_ptr
#undef c_64
#undef fb_ptr
#undef fb_ptr_next
#undef fb_ptr_cmp

#define psx_gpu                 r0
#define num_blocks              r1
#define msb_mask_ptr            r2
#define pixel_ptr               r3
#define draw_mask_ptr           r0   /* shares r0 with psx_gpu */
#define c_64                    r2
#define fb_ptr                  r12
#define fb_ptr_next             r14
#define fb_ptr_cmp              r4

/* Rebind the NEON aliases. */
#undef msb_mask
#undef draw_mask
#undef pixels
#undef fb_pixels
#undef d128_0x8000
#undef msb_mask_low
#undef msb_mask_high
#undef draw_mask_next
#undef pixels_g
#undef blend_pixels
#undef fb_pixels_next

#define msb_mask                q0
#define draw_mask               q1
#define pixels                  q2
#define fb_pixels               q3
#define blend_pixels            q4
#define pixels_no_msb           q5
#define blend_mask              q6
#define fb_pixels_no_msb        q7
#define d128_0x8000             q8
#define d128_0x0421             q9
#define fb_pixels_next          q10
#define blend_pixels_next       q11
#define pixels_next             q12
#define draw_mask_next          q13
#define write_mask              q14

/* Second set of names mapped onto the same q registers (q5, q7-q13 are
 * shared with the aliases above), so each routine must use only one name
 * of a shared register at a time. */
#define pixels_rb               q5
#define pixels_mg               q7
#define pixels_g                q7
#define d128_0x7C1F             q8
#define d128_0x03E0             q9
#define fb_pixels_rb            q10
#define fb_pixels_g             q11
#define fb_pixels_masked        q11
#define d128_0x83E0             q15
#define pixels_fourth           q7
#define d128_0x1C07             q12
#define d128_0x00E0             q13
#define d128_0x80E0             q13

/* Halves of msb_mask (q0) for the replicate-to-all-lanes load. */
#define msb_mask_low            d0
#define msb_mask_high           d1
3579
/* Per-variant fragments spliced into blend_blocks_average_builder.
 * The "untextured" and "off" fragments that need no work expand to
 * nothing. */

/* Textured: lanes of source with bit 15 set are the ones that blend. */
#define blend_blocks_average_set_blend_mask_textured(source) \
  vclt.s16 blend_mask, source, #0

/* Textured: force bit 15 on the blended result. */
#define blend_blocks_average_set_stp_bit_textured() \
  vorr.u16 blend_pixels, #0x8000

/* Textured: lanes that do not blend take source as-is. */
#define blend_blocks_average_combine_textured(source) \
  vbif.u16 blend_pixels, source, blend_mask

#define blend_blocks_average_set_blend_mask_untextured(source)

#define blend_blocks_average_set_stp_bit_untextured()

#define blend_blocks_average_combine_untextured(source)

/* Mask evaluation on: framebuffer lanes with bit 15 set are protected. */
#define blend_blocks_average_mask_set_on() \
  vclt.s16 write_mask, fb_pixels_next, #0

#define blend_blocks_average_mask_copy_on() \
  vorr.u16 draw_mask, draw_mask_next, write_mask

#define blend_blocks_average_mask_copy_b_on() \
  vorr.u16 draw_mask_next, draw_mask_next, write_mask

#define blend_blocks_average_mask_set_off()

/* Mask evaluation off: the block mask is used unchanged. */
#define blend_blocks_average_mask_copy_off() \
  vmov draw_mask, draw_mask_next

#define blend_blocks_average_mask_copy_b_off()
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>(psx_gpu: r0).
 *
 * For every queued block (64 bytes apart) the eight pixels at block + 16
 * are averaged with the eight framebuffer halfwords at the address stored
 * at block + 44, and the result, OR-ed with msb_mask, is written back.
 * Lanes set in the 128-bit mask at block + 0 keep the framebuffer value.
 *
 * The average is done on whole halfwords with bit 15 cleared on both
 * sides: avg = vhadd(fb, pixel - ((pixel ^ fb) & 0x0421)).  Removing the
 * differing low bit of each 5-bit field first makes every field sum even,
 * so the halving shift cannot move a bit from one field into the next.
 *
 * texturing = textured: only pixels with bit 15 set are blended (and get
 * bit 15 forced on); the others are written unblended.
 * mask_evaluate = on: framebuffer lanes with bit 15 set are also
 * protected (write_mask).
 *
 * Path 2 handles consecutive destinations within 14 bytes of each other by
 * storing the finished block before the next destination is loaded.
 */
#define blend_blocks_average_builder(texturing, mask_evaluate) \
.align 3; \
 \
function(blend_blocks_##texturing##_average_##mask_evaluate) \
  stmdb     sp!, { r4, r14 }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  /* draw_mask_ptr aliases psx_gpu (r0); psx_gpu is dead after this. */ \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  vmov.u16  d128_0x8000, #0x8000; \
  vld1.u32  { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
  ldr       fb_ptr_next, [pixel_ptr, #28]; \
 \
  vmov.u16  d128_0x0421, #0x0400; \
  vld1.u32  { pixels_next }, [pixel_ptr, :128], c_64; \
 \
  vorr.u16  d128_0x0421, #0x0021; \
  vld1.u16  { fb_pixels_next }, [fb_ptr_next]; \
 \
  veor.u16  blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16  pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16  blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16  blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vbic.u16  fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
0: \
  mov       fb_ptr, fb_ptr_next; \
  ldr       fb_ptr_next, [pixel_ptr, #28]; \
 \
  vmov      pixels, pixels_next; \
  vld1.u32  { pixels_next }, [pixel_ptr, :128], c_64; \
 \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_mask_copy_##mask_evaluate(); \
  vld1.u32  { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  vmov      fb_pixels, fb_pixels_next; \
  blend_blocks_average_combine_##texturing(pixels); \
 \
  /* Unsigned (next - cur + 14) <= 28  <=>  |next - cur| <= 14 bytes. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels_next }, [fb_ptr_next]; \
  veor.u16  blend_pixels_next, pixels_next, fb_pixels_next; \
 \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbic.u16  pixels_no_msb, pixels_next, d128_0x8000; \
 \
  vand.u16  blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vbif.u16  fb_pixels, blend_pixels, draw_mask; \
 \
  vbic.u16  fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
  vsub.u16  blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  vst1.u16  { fb_pixels }, [fb_ptr]; \
 \
3: \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
1: \
  /* Drain the last block. */ \
  blend_blocks_average_mask_copy_b_##mask_evaluate(); \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next; \
 \
  blend_blocks_average_set_blend_mask_##texturing(pixels_next); \
  blend_blocks_average_set_stp_bit_##texturing(); \
  blend_blocks_average_combine_##texturing(pixels_next); \
 \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbif.u16  fb_pixels_next, blend_pixels, draw_mask_next; \
  vst1.u16  { fb_pixels_next }, [fb_ptr_next]; \
 \
  ldmia     sp!, { r4, pc }; \
 \
2: \
  /* Overlapping destinations: finish and store this block first. */ \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbif.u16  fb_pixels, blend_pixels, draw_mask; \
  vst1.u16  { fb_pixels }, [fb_ptr]; \
 \
  vld1.u16  { fb_pixels_next }, [fb_ptr_next]; \
  /* write_mask must follow the freshly loaded framebuffer pixels here */ \
  /* too, exactly as on the non-overlapping path; otherwise the "on" */ \
  /* variants would protect lanes based on the previous block. */ \
  blend_blocks_average_mask_set_##mask_evaluate(); \
  veor.u16  blend_pixels_next, pixels_next, fb_pixels_next; \
  vbic.u16  pixels_no_msb, pixels_next, d128_0x8000; \
  vand.u16  blend_pixels_next, blend_pixels_next, d128_0x0421; \
  vsub.u16  blend_pixels_next, pixels_no_msb, blend_pixels_next; \
  vbic.u16  fb_pixels_no_msb, fb_pixels_next, d128_0x8000; \
 \
  bal       3b
/* Emit the four average-blend routines:
 * {textured, untextured} x mask evaluation {off, on}. */
blend_blocks_average_builder(textured,   off)
blend_blocks_average_builder(untextured, off)
blend_blocks_average_builder(textured,   on)
blend_blocks_average_builder(untextured, on)
3717
3718
/* Mask-evaluation fragments shared by the additive blend builders.
 * "on": framebuffer lanes with bit 15 set are added to draw_mask so they
 * are left untouched.  "off": both fragments expand to nothing. */
#define blend_blocks_add_mask_set_on() \
  vclt.s16 write_mask, fb_pixels, #0

#define blend_blocks_add_mask_copy_on() \
  vorr.u16 draw_mask, draw_mask, write_mask

#define blend_blocks_add_mask_set_off()

#define blend_blocks_add_mask_copy_off()
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>(psx_gpu: r0).
 *
 * For every queued block (64 bytes apart) the eight pixels at block + 16
 * are added, per colour field and with saturation, to the framebuffer
 * halfwords at the address stored at block + 44.  The framebuffer term is
 * zeroed for pixels whose bit 15 is clear (blend_mask), so those pixels
 * are written unblended.  msb_mask is OR-ed into every pixel and travels
 * through the green sum via the 0x83E0 mask.  Lanes set in the 128-bit
 * mask at block + 0 keep the framebuffer value.
 *
 * Saturation: vmin.u8 against 0x7C1F clamps the two fields held in
 * separate bytes (low byte 0x1F, high byte 0x7C); vmin.u16 against 0x83E0
 * clamps the middle field together with bit 15.
 *
 * Path 2 handles consecutive destinations within 14 bytes of each other by
 * storing the finished block before the next destination is loaded.
 */
#define blend_blocks_add_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_##mask_evaluate) \
  stmdb     sp!, { r4, r14 }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  /* draw_mask_ptr aliases psx_gpu (r0); psx_gpu is dead after this. */ \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  vmov.u16  d128_0x7C1F, #0x7C00; \
  vmov.u16  d128_0x03E0, #0x0300; \
  vmov.u16  d128_0x83E0, #0x8000; \
  vorr.u16  d128_0x03E0, #0x00E0; \
  vorr.u16  d128_0x7C1F, #0x001F; \
  vorr.u16  d128_0x83E0, d128_0x83E0, d128_0x03E0; \
 \
  vld1.u32  { draw_mask }, [draw_mask_ptr, :128], c_64; \
  ldr       fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32  { pixels }, [pixel_ptr, :128], c_64; \
  vclt.s16  blend_mask, pixels, #0; \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vorr.u16  pixels, pixels, msb_mask; \
  vand.u16  fb_pixels_masked, fb_pixels, blend_mask; \
  vand.u16  pixels_mg, pixels, d128_0x83E0; \
  vand.u16  fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_mg; \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
0: \
  mov       fb_ptr, fb_ptr_next; \
 \
  ldr       fb_ptr_next, [pixel_ptr, #28]; \
 \
  vld1.u32  { pixels }, [pixel_ptr, :128], c_64; \
  vclt.s16  blend_mask, pixels, #0; \
 \
  vorr.u16  pixels, pixels, msb_mask; \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vand.u16  pixels_mg, pixels, d128_0x83E0; \
 \
  /* The pointer difference is computed once; the original computed the */ \
  /* same subtraction a second time right after the pld. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  pld       [fb_ptr_next, #64]; \
 \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
 \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  vld1.u32  { draw_mask }, [draw_mask_ptr, :128], c_64; \
 \
  /* Unsigned (next - cur + 14) <= 28  <=>  |next - cur| <= 14 bytes. */ \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16  fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  vst1.u16  { blend_pixels }, [fb_ptr]; \
 \
3: \
  /* fb_pixels_g and fb_pixels_masked share q11: this must stay the */ \
  /* last read of fb_pixels_masked. */ \
  vand.u16  fb_pixels_g, fb_pixels_masked, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_mg; \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x83E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
1: \
  /* Drain the last block. */ \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vst1.u16  { blend_pixels }, [fb_ptr_next]; \
 \
  ldmia     sp!, { r4, pc }; \
 \
2: \
  /* Overlapping destinations: store before loading. */ \
  vst1.u16  { blend_pixels }, [fb_ptr]; \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
 \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16  fb_pixels_masked, fb_pixels, blend_mask; \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels_masked, d128_0x7C1F; \
  bal       3b
3833
/*
 * blend_blocks_add_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_<mask_evaluate>(psx_gpu: r0).
 *
 * For every queued block (64 bytes apart) the eight pixels at block + 16
 * are added, per colour field and with saturation, to the framebuffer
 * halfwords at the address stored at block + 44; the sum is OR-ed with
 * msb_mask and written back.  Lanes set in the 128-bit mask at block + 0
 * keep the framebuffer value.
 *
 * Path 2 handles consecutive destinations within 14 bytes of each other by
 * storing the finished block before the next destination is loaded.
 */
#define blend_blocks_add_untextured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_untextured_add_##mask_evaluate) \
  push      { r4, lr }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  /* draw_mask_ptr aliases psx_gpu (r0); psx_gpu is dead after this. */ \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  /* Build the 0x7C1F and 0x03E0 field masks in two immediate steps. */ \
  vmov.u16  d128_0x7C1F, #0x7C00; \
  vmov.u16  d128_0x03E0, #0x0300; \
  vorr.u16  d128_0x7C1F, #0x001F; \
  vorr.u16  d128_0x03E0, #0x00E0; \
 \
  vld1.u32  { draw_mask }, [draw_mask_ptr, :128], c_64; \
  ldr       fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32  { pixels }, [pixel_ptr, :128], c_64; \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_g, pixels, d128_0x03E0; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
0: \
  mov       fb_ptr, fb_ptr_next; \
  ldr       fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32  { pixels }, [pixel_ptr, :128], c_64; \
 \
  /* Assemble the previous block from its two clamped partial sums. */ \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vand.u16  pixels_g, pixels, d128_0x03E0; \
 \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vld1.u32  { draw_mask }, [draw_mask_ptr, :128], c_64; \
 \
  /* Unsigned (next - cur + 14) <= 28  <=>  |next - cur| <= 14 bytes. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16  { blend_pixels }, [fb_ptr]; \
 \
3: \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
1: \
  /* Drain the last block. */ \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vst1.u16  { blend_pixels }, [fb_ptr_next]; \
 \
  pop       { r4, pc }; \
 \
2: \
  /* Overlapping destinations: store before loading. */ \
  vst1.u16  { blend_pixels }, [fb_ptr]; \
  vand.u16  pixels_rb, pixels, d128_0x7C1F; \
 \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  b         3b
3926
/* Emit the additive-blend routines, textured and untextured, each with
 * mask evaluation off and on. */
blend_blocks_add_textured_builder(off)
blend_blocks_add_textured_builder(on)

blend_blocks_add_untextured_builder(off)
blend_blocks_add_untextured_builder(on)
3931
/* Per-variant fragments spliced into blend_blocks_subtract_builder.
 * Fragments that need no work expand to nothing. */

/* Textured: lanes of pixels_next with bit 15 set are the ones that blend. */
#define blend_blocks_subtract_set_blend_mask_textured() \
  vclt.s16 blend_mask, pixels_next, #0

/* Textured: lanes that do not blend take pixels as-is. */
#define blend_blocks_subtract_combine_textured() \
  vbif.u16 blend_pixels, pixels, blend_mask

/* Textured: force bit 15 on the blended result. */
#define blend_blocks_subtract_set_stb_textured() \
  vorr.u16 blend_pixels, #0x8000

/* Textured: keep a copy of the current pixels with msb_mask merged in. */
#define blend_blocks_subtract_msb_mask_textured() \
  vorr.u16 pixels, pixels_next, msb_mask

#define blend_blocks_subtract_set_blend_mask_untextured()

#define blend_blocks_subtract_combine_untextured()

/* Untextured: merge msb_mask into the blended result. */
#define blend_blocks_subtract_set_stb_untextured() \
  vorr.u16 blend_pixels, blend_pixels, msb_mask

#define blend_blocks_subtract_msb_mask_untextured()


/* Mask evaluation on: framebuffer lanes with bit 15 set are protected. */
#define blend_blocks_subtract_mask_set_on() \
  vclt.s16 write_mask, fb_pixels, #0

#define blend_blocks_subtract_mask_copy_on() \
  vorr.u16 draw_mask, draw_mask_next, write_mask

#define blend_blocks_subtract_mask_set_off()

/* Mask evaluation off: the block mask is used unchanged. */
#define blend_blocks_subtract_mask_copy_off() \
  vmov draw_mask, draw_mask_next
3965
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_subtract_<mask_evaluate>(psx_gpu: r0).
 *
 * For every queued block (64 bytes apart) the eight pixels at block + 16
 * are subtracted, per colour field and saturating at zero, from the
 * framebuffer halfwords at the address stored at block + 44.  Lanes set in
 * the 128-bit mask at block + 0 keep the framebuffer value.
 *
 * vqsub.u8 handles the two fields selected by 0x7C1F (they sit in separate
 * bytes); vqsub.u16 handles the field selected by 0x03E0.
 *
 * texturing = textured: only pixels with bit 15 set are blended; the
 * others are written as pixel | msb_mask.
 * mask_evaluate = on: framebuffer lanes with bit 15 set are also
 * protected (write_mask).
 *
 * Path 2 handles consecutive destinations within 14 bytes of each other by
 * storing the finished block before the next destination is loaded.
 */
#define blend_blocks_subtract_builder(texturing, mask_evaluate) \
.align 3; \
 \
function(blend_blocks_##texturing##_subtract_##mask_evaluate) \
  push      { r4, lr }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  /* draw_mask_ptr aliases psx_gpu (r0); psx_gpu is dead after this. */ \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  /* Build the 0x7C1F and 0x03E0 field masks in two immediate steps. */ \
  vmov.u16  d128_0x7C1F, #0x7C00; \
  vmov.u16  d128_0x03E0, #0x0300; \
  vorr.u16  d128_0x7C1F, #0x001F; \
  vorr.u16  d128_0x03E0, #0x00E0; \
 \
  vld1.u32  { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
  ldr       fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32  { pixels_next }, [pixel_ptr, :128], c_64; \
  blend_blocks_subtract_set_blend_mask_##texturing(); \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16  pixels_rb, pixels_next, d128_0x7C1F; \
 \
  vand.u16  pixels_g, pixels_next, d128_0x03E0; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
0: \
  blend_blocks_subtract_mask_copy_##mask_evaluate(); \
  mov       fb_ptr, fb_ptr_next; \
  ldr       fb_ptr_next, [pixel_ptr, #28]; \
 \
  vld1.u32  { draw_mask_next }, [draw_mask_ptr, :128], c_64; \
  /* Must run before pixels_next is replaced by the next block. */ \
  blend_blocks_subtract_msb_mask_##texturing(); \
 \
  vld1.u32  { pixels_next }, [pixel_ptr, :128], c_64; \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vand.u16  pixels_rb, pixels_next, d128_0x7C1F; \
  blend_blocks_subtract_set_stb_##texturing(); \
  vand.u16  pixels_g, pixels_next, d128_0x03E0; \
  /* combine uses the blend mask of the block being finished; only */ \
  /* afterwards is the mask recomputed for the freshly loaded block. */ \
  blend_blocks_subtract_combine_##texturing(); \
  blend_blocks_subtract_set_blend_mask_##texturing(); \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
 \
  /* Unsigned (next - cur + 14) <= 28  <=>  |next - cur| <= 14 bytes. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vst1.u16  { blend_pixels }, [fb_ptr]; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
 \
3: \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
1: \
  /* Drain the last block. */ \
  blend_blocks_subtract_mask_copy_##mask_evaluate(); \
 \
  blend_blocks_subtract_msb_mask_##texturing(); \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  blend_blocks_subtract_set_stb_##texturing(); \
  blend_blocks_subtract_combine_##texturing(); \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vst1.u16  { blend_pixels }, [fb_ptr_next]; \
 \
  pop       { r4, pc }; \
 \
2: \
  /* Overlapping destinations: store before loading. */ \
  vst1.u16  { blend_pixels }, [fb_ptr]; \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_subtract_mask_set_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vqsub.u8  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  b         3b
4058
/* Emit the four subtractive-blend routines:
 * {textured, untextured} x mask evaluation {off, on}. */
blend_blocks_subtract_builder(textured,   off)
blend_blocks_subtract_builder(textured,   on)
blend_blocks_subtract_builder(untextured, off)
blend_blocks_subtract_builder(untextured, on)
4063
4064
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_fourth_<mask_evaluate>(psx_gpu: r0).
 *
 * For every queued block (64 bytes apart) one quarter of each pixel at
 * block + 16 is added, per colour field and with saturation, to the
 * framebuffer halfwords at the address stored at block + 44.  The quarter
 * is taken with a shift right by two followed by the masks 0x1C07 and
 * 0x00E0, which keep the top three bits of each 5-bit field.  Pixels whose
 * bit 15 is clear are not blended and are written as-is.  msb_mask is
 * OR-ed into every written pixel, and lanes set in the 128-bit mask at
 * block + 0 keep the framebuffer value.
 *
 * pixels_fourth and pixels_g share q7, so pixels_rb must always be
 * extracted before pixels_g overwrites the shifted value.
 *
 * Path 2 handles consecutive destinations within 14 bytes of each other by
 * storing the finished block before the next destination is loaded.
 */
#define blend_blocks_add_fourth_textured_builder(mask_evaluate) \
.align 3; \
 \
function(blend_blocks_textured_add_fourth_##mask_evaluate) \
  stmdb     sp!, { r4, r14 }; \
  add       mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
 \
  add       pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16  { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
 \
  /* draw_mask_ptr aliases psx_gpu (r0); psx_gpu is dead after this. */ \
  add       draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov       c_64, #64; \
 \
  vmov.u16  d128_0x7C1F, #0x7C00; \
  vmov.u16  d128_0x03E0, #0x0300; \
  vmov.u16  d128_0x1C07, #0x1C00; \
  vmov.u16  d128_0x00E0, #0x00E0; \
  vorr.u16  d128_0x7C1F, #0x001F; \
  vorr.u16  d128_0x03E0, #0x00E0; \
  vorr.u16  d128_0x1C07, #0x0007; \
 \
  vld1.u32  { draw_mask }, [draw_mask_ptr, :128], c_64; \
  ldr       fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32  { pixels }, [pixel_ptr, :128], c_64; \
  vclt.s16  blend_mask, pixels, #0; \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vshr.s16  pixels_fourth, pixels, #2; \
  vand.u16  pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  beq       1f; \
 \
0: \
  mov       fb_ptr, fb_ptr_next; \
  ldr       fb_ptr_next, [pixel_ptr, #28]; \
 \
  /* Finish the previous block while its pixels and blend_mask are */ \
  /* still live: unblended lanes take the original pixel. */ \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbif.u16  blend_pixels, pixels, blend_mask; \
 \
  vld1.u32  { pixels }, [pixel_ptr, :128], c_64; \
  vclt.s16  blend_mask, pixels, #0; \
  vshr.s16  pixels_fourth, pixels, #2; \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vand.u16  pixels_rb, pixels_fourth, d128_0x1C07; \
 \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vld1.u32  { draw_mask }, [draw_mask_ptr, :128], c_64; \
 \
  /* Unsigned (next - cur + 14) <= 28  <=>  |next - cur| <= 14 bytes. */ \
  sub       fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add       fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp       fb_ptr_cmp, #28; \
  bls       2f; \
 \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16  { blend_pixels }, [fb_ptr]; \
 \
3: \
  vand.u16  fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16  fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16  fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8   fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16  fb_pixels_g, fb_pixels_g, d128_0x03E0; \
 \
  subs      num_blocks, num_blocks, #1; \
  bne       0b; \
 \
1: \
  /* Drain the last block in the same order as the loop: select the */ \
  /* unblended pixels first, then OR in msb_mask.  The original tail */ \
  /* applied msb_mask before the select, so unblended pixels of the */ \
  /* final block were stored without it. */ \
  vorr.u16  blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vbif.u16  blend_pixels, pixels, blend_mask; \
  vorr.u16  blend_pixels, blend_pixels, msb_mask; \
  vbit.u16  blend_pixels, fb_pixels, draw_mask; \
  vst1.u16  { blend_pixels }, [fb_ptr_next]; \
 \
  ldmia     sp!, { r4, pc }; \
 \
2: \
  /* Overlapping destinations: store before loading. */ \
  vst1.u16  { blend_pixels }, [fb_ptr]; \
  vand.u16  pixels_g, pixels_fourth, d128_0x00E0; \
 \
  vld1.u16  { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16  fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  bal       3b
4165
d1c75d1e 4166
75e28f62
E
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_fourth_<mask_evaluate>.
 *
 * Walks num_blocks entries of the block array (64-byte stride, c_64).  Per
 * entry it reads the draw mask at offset 0, one 128-bit vector of 16-bit
 * source pixels at offset 16 and the framebuffer pointer at offset 16 + 28.
 * Each framebuffer pixel becomes
 *     min(fb + (src >> 2), field maximum) | msb_mask
 * computed separately for the 0x7C1F fields (bits 0-4 and 10-14) and the
 * 0x03E0 field (bits 5-9); framebuffer bits are kept where draw_mask is set.
 *
 * The loop is software pipelined: the result of block N is stored while block
 * N + 1 is being loaded.  If the two framebuffer spans are within 14 bytes of
 * each other the store must land before the next load, which is what the
 * out-of-line path at label 2 does.
 *
 * num_blocks is decremented before it is tested, so it is presumably
 * non-zero on entry (TODO: confirm against the callers).
 * blend_blocks_add_mask_set/copy_<mask_evaluate> are defined elsewhere.
 */
#define blend_blocks_add_fourth_untextured_builder(mask_evaluate) \
.align 3; \
  \
function(blend_blocks_untextured_add_fourth_##mask_evaluate) \
  stmdb sp!, { r4, r14 }; \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset; \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]; \
  \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16); \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]; \
  \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset; \
  mov c_64, #64; \
  \
  /* Each 16-bit constant is assembled from two byte-wide immediates. */ \
  vmov.u16 d128_0x7C1F, #0x7C00; \
  vmov.u16 d128_0x03E0, #0x0300; \
  vmov.u16 d128_0x1C07, #0x1C00; \
  vmov.u16 d128_0x00E0, #0x00E0; \
  vorr.u16 d128_0x7C1F, #0x001F; \
  vorr.u16 d128_0x03E0, #0x00E0; \
  vorr.u16 d128_0x1C07, #0x0007; \
  \
  /* Prologue: load and blend the first block, store is deferred. */ \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  vshr.s16 pixels_fourth, pixels, #2; \
  vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
  \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  /* Byte-wise min clamps the two 0x7C1F fields independently. */ \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
  \
  subs num_blocks, num_blocks, #1; \
  beq 1f; \
  \
0: \
  mov fb_ptr, fb_ptr_next; \
  ldr fb_ptr_next, [pixel_ptr, #28]; \
  \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64; \
  \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vshr.s16 pixels_fourth, pixels, #2; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vand.u16 pixels_rb, pixels_fourth, d128_0x1C07; \
  \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64; \
  \
  /* Unsigned (next - current + 14) <= 28  <=>  |next - current| <= 14. */ \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr; \
  add fb_ptr_cmp, fb_ptr_cmp, #14; \
  cmp fb_ptr_cmp, #28; \
  bls 2f; \
  \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
  \
3: \
  vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0; \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb; \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g; \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F; \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0; \
  \
  subs num_blocks, num_blocks, #1; \
  bne 0b; \
  \
1: \
  /* Epilogue: finish and store the last block. */ \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g; \
  vorr.u16 blend_pixels, blend_pixels, msb_mask; \
  vbit.u16 blend_pixels, fb_pixels, draw_mask; \
  vst1.u16 { blend_pixels }, [fb_ptr_next]; \
  \
  ldmia sp!, { r4, pc }; \
  \
2: \
  /* Overlapping spans: store the previous result before loading. */ \
  vst1.u16 { blend_pixels }, [fb_ptr]; \
  vand.u16 pixels_g, pixels_fourth, d128_0x00E0; \
  \
  vld1.u16 { fb_pixels }, [fb_ptr_next]; \
  blend_blocks_add_mask_set_##mask_evaluate(); \
  blend_blocks_add_mask_copy_##mask_evaluate(); \
  vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F; \
  bal 3b
4263
/* Instantiate the add-fourth blenders with mask evaluation off and on. */
blend_blocks_add_fourth_textured_builder(off)
blend_blocks_add_fourth_textured_builder(on)
blend_blocks_add_fourth_untextured_builder(off)
blend_blocks_add_fourth_untextured_builder(on)
4268
/*
 * TODO: Optimize this more. Need a scene that actually uses it for
 * confirmation.
 *
 * blend_blocks_textured_unblended_on
 *
 * In:  psx_gpu points at the GPU state; num_blocks is read from it.
 *
 * For every queued block the texel vector (block offset 16) is ORed with
 * msb_mask and written to the framebuffer address stored at block offset
 * 16 + 28.  A framebuffer pixel is left untouched when its draw_mask bit is
 * set or when the existing pixel has bit 15 set (write_mask).
 *
 * num_blocks is decremented before it is tested, so it is presumably
 * non-zero on entry (TODO: confirm against the callers).
 */
.align 3

function(blend_blocks_textured_unblended_on)
  stmdb sp!, { r4, r14 }
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]

  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]

  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
  mov c_64, #64                       /* block stride in bytes */

  /* Load the first block; its store happens at label 0 or 1. */
  ldr fb_ptr, [pixel_ptr, #28]
  vld1.u16 { fb_pixels }, [fb_ptr]
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
  vclt.s16 write_mask, fb_pixels, #0  /* all-ones where bit 15 is set */
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64

  subs num_blocks, num_blocks, #1
  beq 1f

0:
  vorr.u16 pixels, pixels, msb_mask
  vorr.u16 draw_mask, draw_mask, write_mask
  vbif.u16 fb_pixels, pixels, draw_mask /* take texel where mask is clear */
  vst1.u16 { fb_pixels }, [fb_ptr]

  ldr fb_ptr, [pixel_ptr, #28]
  vld1.u16 { fb_pixels }, [fb_ptr]
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
  vclt.s16 write_mask, fb_pixels, #0
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64

  subs num_blocks, num_blocks, #1
  bne 0b

1:
  /* Flush the block that is still held in registers. */
  vorr.u16 pixels, pixels, msb_mask
  vorr.u16 draw_mask, draw_mask, write_mask
  vbif.u16 fb_pixels, pixels, draw_mask
  vst1.u16 { fb_pixels }, [fb_ptr]

  ldmia sp!, { r4, pc }
4316
4317
/* Mask evaluation off: this variant performs no work and returns at once. */
function(blend_blocks_textured_unblended_off)
  bx lr
4320
4321
/*
 * warmup
 *
 * In:  r0 = number of loads to issue (returns immediately when zero)
 *      r1 = start address; the :128 hint requires 16-byte alignment
 *
 * Issues r0 vector loads, advancing r1 by 64 bytes after each one.  The
 * loaded data is discarded.  Clobbers r0, r1, r3, the flags and the
 * u_whole_8 / v_whole_8 registers.
 */
function(warmup)
  mov r3, #64
  cmp r0, #0
  bxeq lr

0:
  vld1.u32 { u_whole_8, v_whole_8 }, [r1, :128], r3

  subs r0, r0, #1
  bne 0b

  bx lr
4334
/* Register aliases for render_block_fill_body. */
#undef vram_ptr
#undef color
#undef width
#undef height
#undef pitch

#define vram_ptr r0
#define color r1
#define width r2
#define height r3

/* pitch shares r1 with color; color is consumed before pitch is written. */
#define pitch r1

#define num_width r12

#undef colors_a
#undef colors_b

#define colors_a q0
#define colors_b q1
E
4355
/*
 * render_block_fill_body
 *
 * In:  vram_ptr (r0) = destination; the :256 hint requires 32-byte alignment
 *      color    (r1) = 16-bit fill value
 *      width    (r2) = pixels per row
 *      height   (r3) = number of rows
 *
 * Fills a width x height rectangle with color, 16 pixels (32 bytes) per
 * store, on a surface whose rows are 2048 bytes apart.
 *
 * Both loops test for exactly zero, so width must be a non-zero multiple of
 * 16 and height must be non-zero; other values never hit the exit condition
 * before the counter wraps.  Clobbers r0, r1, r3, r12, q0, q1 and the flags.
 */
.align 3

function(render_block_fill_body)
  vdup.u16 colors_a, color
  mov pitch, #2048

  vmov colors_b, colors_a
  sub pitch, pitch, width, lsl #1     /* bytes from row end to next row */

  mov num_width, width

0:
  vst1.u32 { colors_a, colors_b }, [vram_ptr, :256]!

  subs num_width, num_width, #16
  bne 0b

  add vram_ptr, vram_ptr, pitch
  mov num_width, width

  subs height, height, #1
  bne 0b                              /* same label: restart the row loop */

  bx lr
4380
75e28f62
E
4381
/*
 * Register aliases for the sprite setup code.  Several aliases deliberately
 * share one register; the groups below are live at different stages.
 */
#undef x
#undef y
#undef width
#undef height
#undef fb_ptr
#undef texture_mask
#undef num_blocks
#undef temp
#undef dirty_textures_mask
#undef clut_ptr
#undef current_texture_mask

#define psx_gpu r0
#define x r1
#define y r2
#define u r3
#define v r4
#define width r5
#define height r6
#define offset_u r8
#define offset_v r9
#define offset_u_right r10
#define width_rounded r11
#define height_rounded r12

#define texture_offset_base r1
#define tile_width r2
#define tile_height r3
#define num_blocks r4
#define block r5
#define sub_tile_height r6
#define fb_ptr r7
#define texture_mask r8
#define column_data r9
#define texture_offset r10
#define tiles_remaining r11
#define fb_ptr_advance_column r12
#define texture_block_ptr r14

/* temp shares r14 (lr) with texture_block_ptr. */
#define temp r14

#define texture_page_ptr r3
#define left_block_mask r4
#define right_block_mask r5
#define texture_mask_rev r10
#define control_mask r11

#define dirty_textures_mask r4
#define clut_ptr r5
#define current_texture_mask r6


#undef texels
#undef clut_low_a
#undef clut_low_b
#undef clut_high_a
#undef clut_high_b
#undef clut_a
#undef clut_b
#undef texels_low
#undef texels_high

#define texels d0
#define draw_masks_fb_ptrs q1

/* Halves of q1 used by the 1x paths. */
#define draw_mask_fb_ptr_left d2
#define draw_mask_fb_ptr_right d3

/* The 4x paths need four mask/pointer pairs: q1 plus q5. */
#define draw_mask_fb_ptr_left_a d2
#define draw_mask_fb_ptr_left_b d3
#define draw_mask_fb_ptr_right_a d10
#define draw_mask_fb_ptr_right_b d11
#define draw_masks_fb_ptrs2 q5

/* clut_a / clut_b (q2 / q3) viewed as their d-register halves. */
#define clut_low_a d4
#define clut_low_b d5
#define clut_high_a d6
#define clut_high_b d7

#define block_masks d8
#define block_masks_shifted d9

#define clut_a q2
#define clut_b q3

#define texels_low d12
#define texels_high d13

/* texels_wide (q7) viewed as its d-register halves. */
#define texels_wide_low d14
#define texels_wide_high d15
#define texels_wide q7
75e28f62
E
4473
4474
/*
 * setup_sprite_flush_blocks
 *
 * Calls flush_render_block_buffer while preserving q1-q5, r0-r3, r12, r14
 * and any EXTRA_UNSAVED_REGS, then resets block to the first entry of the
 * block array.  Returns with bx lr.
 */
setup_sprite_flush_blocks:
  vpush { q1 - q5 }

  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
  bl flush_render_block_buffer
  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }

  vpop { q1 - q5 }

  add block, psx_gpu, #psx_gpu_blocks_offset
  bx lr
4486
4487
/*
 * Calls update_texture_4bpp_cache with r0-r3 preserved across the call and
 * returns by popping the saved lr straight into pc.
 */
setup_sprite_update_texture_4bpp_cache:
  stmdb sp!, { r0 - r3, r14 }
  bl update_texture_4bpp_cache
  ldmia sp!, { r0 - r3, pc }
4492
4493
/*
 * Calls update_texture_8bpp_cache with r0-r3 and any EXTRA_UNSAVED_REGS
 * preserved across the call and returns by popping the saved lr into pc.
 */
setup_sprite_update_texture_8bpp_cache:
  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 }
  bl update_texture_8bpp_cache
  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc }
75e28f62
E
4498
4499
/*
 * Loads the 32-byte CLUT into clut_a / clut_b and de-interleaves it with
 * vuzp.u8 so that one register holds the even bytes and the other the odd
 * bytes.  If the current texture page has its bit set in the dirty 4bpp mask,
 * the 4bpp texture cache is refreshed first.
 */
#define setup_sprite_tiled_initialize_4bpp() \
  ldr dirty_textures_mask, \
   [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]; \
  ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]; \
  \
  ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]; \
  vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]; \
  \
  tst current_texture_mask, dirty_textures_mask; \
  vuzp.u8 clut_a, clut_b; \
  \
  blne setup_sprite_update_texture_4bpp_cache
/*
 * Refreshes the 8bpp texture cache when the current texture page has its bit
 * set in the dirty 8bpp mask.
 */
#define setup_sprite_tiled_initialize_8bpp() \
  ldr dirty_textures_mask, \
   [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset]; \
  ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]; \
  \
  tst current_texture_mask, dirty_textures_mask; \
  blne setup_sprite_update_texture_8bpp_cache
4521
75e28f62
E
/*
 * Operand fragments giving the number of blocks a tile pass emits: one block
 * per row for a half tile, two per row for a full tile.
 */
#define setup_sprite_block_count_single() \
  sub_tile_height

#define setup_sprite_block_count_double() \
  sub_tile_height, lsl #1

/*
 * Reserves room for the upcoming tile pass.  If the running total would
 * exceed MAX_BLOCKS, the queued blocks are flushed and num_blocks restarts at
 * the size of this pass alone.
 */
#define setup_sprite_tile_add_blocks(type) \
  add num_blocks, num_blocks, setup_sprite_block_count_##type(); \
  cmp num_blocks, #MAX_BLOCKS; \
  \
  movgt num_blocks, setup_sprite_block_count_##type(); \
  blgt setup_sprite_flush_blocks
4535
/*
 * Emits two blocks (left and right 8-texel halves) for each of
 * sub_tile_height rows of a full-width 4bpp tile.
 *
 * Per half: 8 index bytes are fetched from the texture page, looked up in
 * the split CLUT with vtbl, and the two result byte vectors are interleaved
 * into 16-bit texels at block offset 0 by vst2.u8.  The draw mask / fb_ptr
 * pair is stored at block offset 40.  Blocks are 64 bytes apart and rows are
 * 2048 bytes apart in the framebuffer.  The edge argument is unused.
 */
#define setup_sprite_tile_full_4bpp(edge) \
  setup_sprite_tile_add_blocks(double); \
  \
4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
  \
  pld [fb_ptr]; \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vst2.u8 { texels_low, texels_high }, [block, :128]; \
  add texture_block_ptr, texture_offset, #8; \
  \
  and texture_block_ptr, texture_block_ptr, texture_mask; \
  add block, block, #40; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  add fb_ptr, fb_ptr, #16; \
  \
  vst1.u32 { draw_mask_fb_ptr_left }, [block, :64]; \
  add block, block, #24; \
  \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  \
  pld [fb_ptr]; \
  vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vst2.u8 { texels_low, texels_high }, [block, :128]; \
  add block, block, #40; \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #(2048 - 16); \
  \
  vst1.u32 { draw_mask_fb_ptr_right }, [block, :64]; \
  add block, block, #24; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
4584
/*
 * Emits one block per row for sub_tile_height rows of a half-width 4bpp
 * tile, using the draw mask / fb_ptr register selected by edge (left or
 * right).
 *
 * The 8 index bytes are looked up in the split CLUT with vtbl and stored as
 * interleaved 16-bit texels at block offset 0; the draw mask / fb_ptr pair
 * goes to block offset 40.  texture_block_ptr is only needed for the single
 * load per row, so it is not recomputed afterwards.
 */
#define setup_sprite_tile_half_4bpp(edge) \
  setup_sprite_tile_add_blocks(single); \
  \
4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
  \
  pld [fb_ptr]; \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vst2.u8 { texels_low, texels_high }, [block, :128]; \
  add block, block, #40; \
  \
  vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64]; \
  \
  add block, block, #24; \
  add texture_offset, texture_offset, #0x10; \
  \
  add fb_ptr, fb_ptr, #2048; \
  subs sub_tile_height, sub_tile_height, #1; \
  \
  bne 4b; \
  \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
4616
/*
 * Emits two blocks (left and right 8-texel halves) for each of
 * sub_tile_height rows of a full-width 8bpp tile.
 *
 * block is biased by 16 for the duration of the loop so the 8 raw texel
 * bytes land at block offset 16 and the draw mask / fb_ptr pair at block
 * offset 40; the bias is removed again on exit.  The edge argument is unused.
 */
#define setup_sprite_tile_full_8bpp(edge) \
  setup_sprite_tile_add_blocks(double); \
  add block, block, #16; \
  \
4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr; \
  \
  pld [fb_ptr]; \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  add texture_block_ptr, texture_offset, #8; \
  vst1.u32 { texels }, [block, :64]; \
  \
  and texture_block_ptr, texture_block_ptr, texture_mask; \
  add block, block, #24; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  \
  add fb_ptr, fb_ptr, #16; \
  vst1.u32 { draw_mask_fb_ptr_left }, [block, :64]; \
  \
  add block, block, #40; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  pld [fb_ptr]; \
  \
  vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr; \
  vst1.u32 { texels }, [block, :64]; \
  add block, block, #24; \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #(2048 - 16); \
  \
  vst1.u32 { draw_mask_fb_ptr_right }, [block, :64]; \
  add block, block, #40; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  sub block, block, #16; \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
4661
/*
 * Emits one block per row for sub_tile_height rows of a half-width 8bpp
 * tile, using the draw mask / fb_ptr register selected by edge (left or
 * right).
 *
 * block is biased by 16 for the duration of the loop so the 8 raw texel
 * bytes land at block offset 16 and the draw mask / fb_ptr pair at block
 * offset 40; the bias is removed again on exit.
 */
#define setup_sprite_tile_half_8bpp(edge) \
  setup_sprite_tile_add_blocks(single); \
  add block, block, #16; \
  \
4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr; \
  pld [fb_ptr]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  vst1.u32 { texels }, [block, :64]; \
  add block, block, #24; \
  \
  vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64]; \
  add block, block, #40; \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #2048; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  sub block, block, #16; \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
4690
/*
 * Per-column adjustments applied before (pre) and after (post) a column of
 * tiles is emitted.  A half column that uses the right half of the tile
 * starts 8 bytes into the texture row and 16 bytes (8 pixels) further right
 * in the framebuffer; the framebuffer offset is undone afterwards.  All
 * other cases only copy texture_offset_base into texture_offset.
 */
#define setup_sprite_tile_column_edge_pre_adjust_half_right() \
  add texture_offset, texture_offset_base, #8; \
  add fb_ptr, fb_ptr, #16

#define setup_sprite_tile_column_edge_pre_adjust_half_left() \
  mov texture_offset, texture_offset_base

#define setup_sprite_tile_column_edge_pre_adjust_half(edge) \
  setup_sprite_tile_column_edge_pre_adjust_half_##edge()

#define setup_sprite_tile_column_edge_pre_adjust_full(edge) \
  mov texture_offset, texture_offset_base

#define setup_sprite_tile_column_edge_post_adjust_half_right() \
  sub fb_ptr, fb_ptr, #16

/* Nothing to undo for a left half column. */
#define setup_sprite_tile_column_edge_post_adjust_half_left()

#define setup_sprite_tile_column_edge_post_adjust_half(edge) \
  setup_sprite_tile_column_edge_post_adjust_half_##edge()

/* Nothing to undo for a full column. */
#define setup_sprite_tile_column_edge_post_adjust_full(edge)
4714
/*
 * Emits one column of tiles.
 *
 *   edge_mode     half or full
 *   edge          left, right or none; selects the half-tile variant
 *   texture_mode  4bpp or 8bpp
 *   x4mode        empty, or _4x for the 4x variants
 *
 * _single: the column lies within one tile; column_data is the row count.
 */
#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode, \
 x4mode) \
  mov sub_tile_height, column_data; \
  setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
  setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)

/*
 * _multi: the column spans several tiles.  column_data is packed as
 *   bits  0-7   rows in the first tile
 *   bits  8-15  rows in the last tile
 *   bits 16+    number of tiles after the first one
 * Tiles between the first and the last are always 16 rows tall.
 */
#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode, \
 x4mode) \
  and sub_tile_height, column_data, #0xFF; \
  mov tiles_remaining, column_data, lsr #16; \
  setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge); \
  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
  \
  subs tiles_remaining, tiles_remaining, #1; \
  beq 2f; \
  \
3: \
  mov sub_tile_height, #16; \
  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
  subs tiles_remaining, tiles_remaining, #1; \
  bne 3b; \
  \
2: \
  uxtb sub_tile_height, column_data, ror #8; \
  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge); \
  setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)
4743
/*
 * Builds column_data for the column emitters and loads texture_page_ptr.
 *
 * _single: column_data is simply height.
 */
#define setup_sprite_column_data_single() \
  mov column_data, height; \
  ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]

/*
 * _multi: packs three fields into column_data:
 *   bits  0-7   16 - offset_v
 *   bits  8-15  (height_rounded & 0xF) + 1
 *   bits 16+    tile_height - 1
 * texture_page_ptr shares r3 with tile_height, so it is loaded only after
 * tile_height has been merged in.
 */
#define setup_sprite_column_data_multi() \
  and height_rounded, height_rounded, #0xF; \
  rsb column_data, offset_v, #16; \
  \
  add height_rounded, height_rounded, #1; \
  sub tile_height, tile_height, #1; \
  \
  orr column_data, column_data, tile_height, lsl #16; \
  ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]; \
  \
  orr column_data, column_data, height_rounded, lsl #8
/*
 * Broadcast per-block draw mask bytes from block_masks into the
 * draw_mask_fb_ptr registers.  Bytes 0 and 1 serve the left sprite edge,
 * bytes 4 and 5 the right sprite edge.
 */
#define setup_sprite_setup_left_draw_mask_fb_ptr() \
  vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
  vdup.u8 draw_mask_fb_ptr_right, block_masks[1]

/*
 * As above, and also sets fb_ptr_advance_column = 32 - height * 2048: the
 * step from the end of one column to the top of the next (16 pixels right,
 * height rows up).
 */
#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column() \
  mov fb_ptr_advance_column, #32; \
  vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
  \
  sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11; \
  vdup.u8 draw_mask_fb_ptr_right, block_masks[1]

#define setup_sprite_setup_right_draw_mask_fb_ptr() \
  vdup.u8 draw_mask_fb_ptr_left, block_masks[4]; \
  vdup.u8 draw_mask_fb_ptr_right, block_masks[5]
/*
 * Defines the entry point
 *   setup_sprite_<tm>_single_<multi_height>_<edge_mode>_<edge><x4mode>
 * for sprites that are one tile column wide.
 *
 * The same column is both the left and the right edge, so the two 32-bit
 * halves of block_masks are ORed together before the masks are broadcast.
 * Ends by popping r4-r11 and the return address.
 */
#define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, \
 edge, x4mode) \
setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode: \
  setup_sprite_column_data_##multi_height(); \
  vext.32 block_masks_shifted, block_masks, block_masks, #1; \
  vorr.u32 block_masks, block_masks, block_masks_shifted; \
  setup_sprite_setup_left_draw_mask_fb_ptr##x4mode(); \
  \
  setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
  ldmia sp!, { r4 - r11, pc }
/*
 * Steps texture_offset_base to the next tile column (+0x100).  When bits
 * 8-11 wrap to zero, 0x1000 is subtracted so the carry into bit 12 is
 * undone and the offset restarts at column 0.
 */
#define setup_sprite_tiled_advance_column() \
  add texture_offset_base, texture_offset_base, #0x100; \
  tst texture_offset_base, #0xF00; \
  subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00)
/*
 * Defines the entry point
 *   setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode><x4mode>
 * for sprites that span two or more tile columns.
 *
 * Order of work: the left edge column (which, when half, uses the right half
 * of its tile), then tile_width - 2 full interior columns emitted with
 * all-zero draw masks, then the right edge column (which, when half, uses
 * the left half of its tile).  Ends by popping r4-r11 and the return address.
 */
#define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode, \
 right_mode, x4mode) \
setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode: \
  setup_sprite_column_data_##multi_height(); \
  \
  setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode(); \
  \
  setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode); \
  \
  subs tile_width, tile_width, #2; \
  add fb_ptr, fb_ptr, fb_ptr_advance_column; \
  \
  beq 1f; \
  \
  vmov.u8 draw_masks_fb_ptrs, #0; \
  vmov.u8 draw_masks_fb_ptrs2, #0; \
  \
0: \
  setup_sprite_tiled_advance_column(); \
  setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode); \
  add fb_ptr, fb_ptr, fb_ptr_advance_column; \
  subs tile_width, tile_width, #1; \
  bne 0b; \
  \
1: \
  setup_sprite_setup_right_draw_mask_fb_ptr##x4mode(); \
  \
  setup_sprite_tiled_advance_column(); \
  setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode); \
  ldmia sp!, { r4 - r11, pc }
4822
/*
 * 1x helpers.  The left block mask is the low byte of its register and the
 * right block mask is the second byte; a value of 0xFF is what the compare
 * helpers test for.
 */

/* No offset_u adjustment is needed at 1x. */
#define setup_sprite_offset_u_adjust()

#define setup_sprite_get_left_block_mask() \
  and left_block_mask, left_block_mask, #0xFF

#define setup_sprite_compare_left_block_mask() \
  cmp left_block_mask, #0xFF

#define setup_sprite_get_right_block_mask() \
  uxtb right_block_mask, right_block_mask, ror #8

#define setup_sprite_compare_right_block_mask() \
  cmp right_block_mask, #0xFF
4837
4838
/* 4x stuff */

/* fb_ptr2 borrows column_data; users spill and reload column_data around it. */
#define fb_ptr2 column_data

/*
 * Moves fb_ptr back by offset_u * 2 bytes, doubles offset_u, and sets
 * offset_u_right to offset_u_right * 2 + 1.
 */
#define setup_sprite_offset_u_adjust_4x() \
  sub fb_ptr, fb_ptr, offset_u, lsl #1; \
  lsl offset_u_right, #1; \
  lsl offset_u, #1; \
  add offset_u_right, #1

/*
 * At 4x the block masks are 16 bits wide: the left mask is the low halfword
 * and the right mask the high halfword.  Both are sign-extended so that an
 * all-ones mask compares equal to 0xFFFFFFFF.
 */
#define setup_sprite_get_left_block_mask_4x() \
  sxth left_block_mask, left_block_mask

#define setup_sprite_compare_left_block_mask_4x() \
  cmp left_block_mask, #0xFFFFFFFF

#define setup_sprite_get_right_block_mask_4x() \
  sxth right_block_mask, right_block_mask, ror #16

#define setup_sprite_compare_right_block_mask_4x() \
  cmp right_block_mask, #0xFFFFFFFF
4860
/*
 * Duplicate every element of the 64-bit texels_ vector into texels_wide:
 * both halves start as a copy of texels_ and vzip interleaves them, so each
 * 16-bit (or 8-bit) element appears twice in a row.
 */
#define widen_texels_16bpp(texels_) \
  vmov texels_wide_low, texels_; \
  vmov texels_wide_high, texels_; \
  vzip.16 texels_wide_low, texels_wide_high

#define widen_texels_8bpp(texels_) \
  vmov texels_wide_low, texels_; \
  vmov texels_wide_high, texels_; \
  vzip.8 texels_wide_low, texels_wide_high

/*
 * Writes one 64-byte block: 16 bytes of texels at offset 0 and the draw
 * mask / framebuffer pointer pair at offset 40.  fb_ptr_ is inserted into
 * lane 1 of draw_mask_fb_ptr_ first.  Leaves block_ at the next block.
 */
#define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
  vst1.u32 { texels_ }, [block_, :128]; \
  add block_, block_, #40; \
  \
  vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
  vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64]; \
  add block_, block_, #24

/* assumes 16-byte offset already added to block_ */
#define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_) \
  vst1.u32 { texels_ }, [block_, :64]; \
  add block_, block_, #24; \
  \
  vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_; \
  vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64]; \
  add block_, block_, #40
/*
 * Emits the four blocks that cover one source row of 8 texels at 4x: every
 * texel is doubled horizontally, and each of the two resulting 8-pixel
 * spans is written to two framebuffer rows 2048 bytes apart.
 *
 *   fb_ptr_tmp            scratch register for the derived addresses
 *   draw_mask_fb_ptr_a_   mask register for the first 8 output pixels
 *   draw_mask_fb_ptr_b_   mask register for the second 8 output pixels
 *
 * 16bpp variant: source texels are in texels_low / texels_high.
 */
#define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
 draw_mask_fb_ptr_b_) \
  widen_texels_16bpp(texels_low); \
  add fb_ptr_tmp, fb_ptr, #1024*2; \
  \
  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr); \
  \
  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
  widen_texels_16bpp(texels_high); \
  \
  add fb_ptr_tmp, fb_ptr, #8*2; \
  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
  \
  add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)

/* 8bpp variant: source texels are in texels; block is already biased by 16. */
#define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_, \
 draw_mask_fb_ptr_b_) \
  widen_texels_8bpp(texels); \
  add fb_ptr_tmp, fb_ptr, #1024*2; \
  \
  write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr); \
  write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp); \
  \
  add fb_ptr_tmp, fb_ptr, #8*2; \
  write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp); \
  \
  add fb_ptr_tmp, fb_ptr_tmp, #1024*2; \
  write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)
4918
/*
 * 4x initialisation: only the CLUT is loaded and split into even / odd
 * bytes; no texture cache check is performed here.
 */
#define setup_sprite_tiled_initialize_4bpp_4x() \
  ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]; \
  vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]; \
  \
  vuzp.u8 clut_a, clut_b

/* Nothing to initialise for 8bpp at 4x. */
#define setup_sprite_tiled_initialize_8bpp_4x()

/* Block counts at 4x: four blocks per half-tile row, eight per full row. */
#define setup_sprite_block_count_single_4x() \
  sub_tile_height, lsl #2

#define setup_sprite_block_count_double_4x() \
  sub_tile_height, lsl #(1+2)
/*
 * 4x version of the full-width 4bpp tile: eight blocks per source row.
 *
 * Each 8-index half is converted through the split CLUT, zipped into 16-bit
 * texels and handed to do_texture_block_16bpp_4x.  column_data is spilled to
 * the stack for the duration of the loop because fb_ptr2 aliases it; the
 * 8-byte slot is released on exit.  fb_ptr advances two framebuffer rows
 * (2 * 2048 bytes) per source row.  The edge argument is unused.
 */
#define setup_sprite_tile_full_4bpp_4x(edge) \
  setup_sprite_tile_add_blocks(double_4x); \
  str column_data, [sp, #-8]!; /* fb_ptr2 */ \
  \
4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  pld [fb_ptr]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  add texture_block_ptr, texture_offset, #8; \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  \
  and texture_block_ptr, texture_block_ptr, texture_mask; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vzip.8 texels_low, texels_high; \
  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
   draw_mask_fb_ptr_left_b); \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  pld [fb_ptr, #2048]; \
  \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  add fb_ptr, fb_ptr, #16*2; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  \
  vzip.8 texels_low, texels_high; \
  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
   draw_mask_fb_ptr_right_b); \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  ldr column_data, [sp], #8; /* fb_ptr2 */ \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
4978
/*
 * 4x version of the half-width 4bpp tile: four blocks per source row, using
 * the mask register pair selected by edge (left or right).
 *
 * The 8 indices are converted through the split CLUT, zipped into 16-bit
 * texels and handed to do_texture_block_16bpp_4x.  column_data is spilled to
 * the stack for the duration of the loop because fb_ptr2 aliases it.
 * texture_block_ptr is only needed for the single load per row, so it is not
 * recomputed afterwards.
 */
#define setup_sprite_tile_half_4bpp_4x(edge) \
  setup_sprite_tile_add_blocks(single_4x); \
  str column_data, [sp, #-8]!; /* fb_ptr2 */ \
  \
4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  pld [fb_ptr]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels; \
  \
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels; \
  add texture_offset, texture_offset, #0x10; \
  \
  vzip.8 texels_low, texels_high; \
  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
   draw_mask_fb_ptr_##edge##_b); \
  \
  pld [fb_ptr, #2048]; \
  add fb_ptr, fb_ptr, #2048 * 2; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  ldr column_data, [sp], #8; /* fb_ptr2 */ \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5010
/*
 * 4x version of the full-width 8bpp tile: eight blocks per source row.
 *
 * block is biased by 16 for the duration of the loop, as write_block_8bpp
 * expects, and column_data is spilled to the stack because fb_ptr2 aliases
 * it.  Both are restored on exit.  The edge argument is unused.
 */
#define setup_sprite_tile_full_8bpp_4x(edge) \
  setup_sprite_tile_add_blocks(double_4x); \
  add block, block, #16; \
  str column_data, [sp, #-8]!; /* fb_ptr2 */ \
  \
4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  pld [fb_ptr]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  add texture_block_ptr, texture_offset, #8; \
  do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a, \
   draw_mask_fb_ptr_left_b); \
  \
  pld [fb_ptr, #2048]; \
  and texture_block_ptr, texture_block_ptr, texture_mask; \
  \
  add fb_ptr, fb_ptr, #16*2; \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a, \
   draw_mask_fb_ptr_right_b); \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #(2048 - 16) * 2; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  sub block, block, #16; \
  ldr column_data, [sp], #8; /* fb_ptr2 */ \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5049
/*
 * 4x version of the half-width 8bpp tile: four blocks per source row, using
 * the mask register pair selected by edge (left or right).
 *
 * block is biased by 16 for the duration of the loop, as write_block_8bpp
 * expects, and column_data is spilled to the stack because fb_ptr2 aliases
 * it.  Both are restored on exit.
 */
#define setup_sprite_tile_half_8bpp_4x(edge) \
  setup_sprite_tile_add_blocks(single_4x); \
  add block, block, #16; \
  str column_data, [sp, #-8]!; /* fb_ptr2 */ \
  \
4: \
  and texture_block_ptr, texture_offset, texture_mask; \
  pld [fb_ptr]; \
  \
  add texture_block_ptr, texture_page_ptr, texture_block_ptr; \
  vld1.u32 { texels }, [texture_block_ptr, :64]; \
  \
  pld [fb_ptr, #2048]; \
  do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a, \
   draw_mask_fb_ptr_##edge##_b); \
  \
  add texture_offset, texture_offset, #0x10; \
  add fb_ptr, fb_ptr, #2048 * 2; \
  \
  subs sub_tile_height, sub_tile_height, #1; \
  bne 4b; \
  \
  sub block, block, #16; \
  ldr column_data, [sp], #8; /* fb_ptr2 */ \
  add texture_offset, texture_offset, #0xF00; \
  strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5077
/*
 * Column-edge adjustment hooks for the 4x tile setup.
 *
 * "pre" hooks load texture_offset from texture_offset_base before a column
 * is emitted; the half/right case additionally skips 8 bytes of texture and
 * 16 * 2 bytes of framebuffer.  "post" hooks undo the framebuffer skip.
 * The *_half_4x(edge) forms dispatch on `edge` by token pasting; the
 * *_full_4x(edge) forms ignore `edge`.
 */

/* half tile, right edge: start 8 texture bytes / 32 framebuffer bytes in */
#define setup_sprite_tile_column_edge_pre_adjust_half_right_4x()               \
  add       texture_offset, texture_offset_base, #8;                           \
  add       fb_ptr, fb_ptr, #(16 * 2)

/* half tile, left edge: start at the column base */
#define setup_sprite_tile_column_edge_pre_adjust_half_left_4x()                \
  mov       texture_offset, texture_offset_base

#define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge)                 \
  setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x()

/* full tile: start at the column base regardless of edge */
#define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge)                 \
  mov       texture_offset, texture_offset_base

/* half tile, right edge: take back the 32-byte framebuffer skip */
#define setup_sprite_tile_column_edge_post_adjust_half_right_4x()              \
  sub       fb_ptr, fb_ptr, #(16 * 2)

/* half tile, left edge: nothing to undo */
#define setup_sprite_tile_column_edge_post_adjust_half_left_4x()

#define setup_sprite_tile_column_edge_post_adjust_half_4x(edge)                \
  setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x()

/* full tile: nothing to undo */
#define setup_sprite_tile_column_edge_post_adjust_full_4x(edge)

5101
/*
 * Draw-mask setup for the 4x tile path.  Each macro broadcasts four
 * consecutive bytes of block_masks into the four draw-mask registers
 * (left_a, left_b, right_a, right_b): bytes 0..3 for the "left" variants,
 * bytes 4..7 for the "right" variant.
 */
#define setup_sprite_setup_left_draw_mask_fb_ptr_4x()                          \
  vdup.u8   draw_mask_fb_ptr_left_a, block_masks[0];                           \
  vdup.u8   draw_mask_fb_ptr_left_b, block_masks[1];                           \
  vdup.u8   draw_mask_fb_ptr_right_a, block_masks[2];                          \
  vdup.u8   draw_mask_fb_ptr_right_b, block_masks[3]

/*
 * Same as above, and also sets
 *   fb_ptr_advance_column = 32 * 2 - (height << (11 + 1)).
 */
#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x()           \
  mov       fb_ptr_advance_column, #(32 * 2);                                  \
  sub       fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #(11 + 1); \
  vdup.u8   draw_mask_fb_ptr_left_a, block_masks[0];                           \
  vdup.u8   draw_mask_fb_ptr_left_b, block_masks[1];                           \
  vdup.u8   draw_mask_fb_ptr_right_a, block_masks[2];                          \
  vdup.u8   draw_mask_fb_ptr_right_b, block_masks[3]

#define setup_sprite_setup_right_draw_mask_fb_ptr_4x()                         \
  vdup.u8   draw_mask_fb_ptr_left_a, block_masks[4];                           \
  vdup.u8   draw_mask_fb_ptr_left_b, block_masks[5];                           \
  vdup.u8   draw_mask_fb_ptr_right_a, block_masks[6];                          \
  vdup.u8   draw_mask_fb_ptr_right_b, block_masks[7]
5121
5122
75e28f62
E
/*
 * setup_sprite_tiled_builder(texture_mode, x4mode)
 *
 * Generates the tiled sprite setup entry point
 * setup_sprite_<texture_mode><x4mode> together with its fourteen
 * specialised column workers.
 *
 * Arguments of the generated entry point:
 *   r0:        psx_gpu
 *   r1:        x
 *   r2:        y
 *   r3:        u
 *   [sp]:      v
 *   [sp + 4]:  width
 *   [sp + 8]:  height
 *   [sp + 12]: color (unused)
 *
 * The entry point derives the tile counts, the texture offset/mask and the
 * left/right block masks, then builds control_mask and jumps through the
 * table at label 9:
 *   0x1  set when tile_width  == 1
 *   0x2  set when tile_height == 1
 *   0x4  set when the left-block-mask compare yields EQ
 *   0x8  set when the right-block-mask compare yields EQ
 * Table slot 13 is a zero placeholder; the table has no slot 15.
 *
 * Instruction order is significant throughout: the register aliases used
 * here are defined elsewhere and may share physical registers, and the
 * orreq instructions consume flags produced by the preceding compare
 * helpers / cmp instructions.
 */
#define setup_sprite_tiled_builder(texture_mode, x4mode)                       \
                                                                               \
setup_sprite_tile_column_width_multi(texture_mode, multi, full, full,          \
 x4mode);                                                                      \
setup_sprite_tile_column_width_single(texture_mode, multi, full, none,         \
 x4mode);                                                                      \
setup_sprite_tile_column_width_multi(texture_mode, single, full, full,         \
 x4mode);                                                                      \
setup_sprite_tile_column_width_single(texture_mode, single, full, none,        \
 x4mode);                                                                      \
setup_sprite_tile_column_width_multi(texture_mode, multi, half, full,          \
 x4mode);                                                                      \
setup_sprite_tile_column_width_single(texture_mode, multi, half, right,        \
 x4mode);                                                                      \
setup_sprite_tile_column_width_multi(texture_mode, single, half, full,         \
 x4mode);                                                                      \
setup_sprite_tile_column_width_single(texture_mode, single, half, right,       \
 x4mode);                                                                      \
setup_sprite_tile_column_width_multi(texture_mode, multi, full, half,          \
 x4mode);                                                                      \
setup_sprite_tile_column_width_single(texture_mode, multi, half, left,         \
 x4mode);                                                                      \
setup_sprite_tile_column_width_multi(texture_mode, single, full, half,         \
 x4mode);                                                                      \
setup_sprite_tile_column_width_single(texture_mode, single, half, left,        \
 x4mode);                                                                      \
setup_sprite_tile_column_width_multi(texture_mode, multi, half, half,          \
 x4mode);                                                                      \
setup_sprite_tile_column_width_multi(texture_mode, single, half, half,         \
 x4mode);                                                                      \
                                                                               \
.align 4;                                                                      \
                                                                               \
function(setup_sprite_##texture_mode##x4mode)                                  \
  stmdb     sp!, { r4 - r11, r14 };                                            \
  setup_sprite_tiled_initialize_##texture_mode##x4mode();                      \
                                                                               \
  /* stack arguments sit above the 9 registers (36 bytes) just pushed */       \
  ldr       v, [sp, #36];                                                      \
  and       offset_u, u, #0xF;                                                 \
                                                                               \
  ldr       width, [sp, #40];                                                  \
  ldr       fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                   \
                                                                               \
  ldr       height, [sp, #44];                                                 \
  add       fb_ptr, fb_ptr, y, lsl #11;                                        \
                                                                               \
  add       fb_ptr, fb_ptr, x, lsl #1;                                         \
  and       offset_v, v, #0xF;                                                 \
                                                                               \
  /* fb_ptr = vram_out + y * 2048 + (x - offset_u) * 2 */                      \
  sub       fb_ptr, fb_ptr, offset_u, lsl #1;                                  \
  add       width_rounded, offset_u, width;                                    \
                                                                               \
  add       height_rounded, offset_v, height;                                  \
  add       width_rounded, width_rounded, #15;                                 \
                                                                               \
  add       height_rounded, height_rounded, #15;                               \
  mov       tile_width, width_rounded, lsr #4;                                 \
                                                                               \
  /* texture_offset_base = VH-VL-00-00 */                                      \
  mov       texture_offset_base, v, lsl #8;                                    \
  and       offset_u_right, width_rounded, #0xF;                               \
                                                                               \
  /* texture_offset_base = VH-UH-UL-00 */                                      \
  bfi       texture_offset_base, u, #4, #8;                                    \
  mov       right_block_mask, #0xFFFFFFFE;                                     \
                                                                               \
  setup_sprite_offset_u_adjust##x4mode();                                      \
                                                                               \
  /* texture_offset_base = VH-UH-VL-00 */                                      \
  bfi       texture_offset_base, v, #4, #4;                                    \
  mov       left_block_mask, #0xFFFFFFFF;                                      \
                                                                               \
  mov       tile_height, height_rounded, lsr #4;                               \
  /* left_block_mask = ~(0xFFFFFFFF << offset_u) */                            \
  mvn       left_block_mask, left_block_mask, lsl offset_u;                    \
                                                                               \
  /* texture_mask = HH-HL-WH-WL */                                             \
  ldrh      texture_mask, [psx_gpu, #psx_gpu_texture_mask_width_offset];       \
  /* right_block_mask = 0xFFFFFFFE << offset_u_right */                        \
  mov       right_block_mask, right_block_mask, lsl offset_u_right;            \
                                                                               \
  /* texture_mask_rev = WH-WL-HH-HL */                                         \
  rev16     texture_mask_rev, texture_mask;                                    \
  vmov      block_masks, left_block_mask, right_block_mask;                    \
                                                                               \
  /* texture_mask = HH-HL-HL-WL */                                             \
  bfi       texture_mask, texture_mask_rev, #4, #4;                            \
  /* texture_mask_rev = 00-00-00-WH */                                         \
  mov       texture_mask_rev, texture_mask_rev, lsr #12;                       \
                                                                               \
  /* texture_mask = HH-WH-HL-WL */                                             \
  bfi       texture_mask, texture_mask_rev, #8, #4;                            \
  setup_sprite_get_left_block_mask##x4mode();                                  \
                                                                               \
  /* build the dispatch index; each orreq uses the flags set just before */    \
  mov       control_mask, #0;                                                  \
  setup_sprite_compare_left_block_mask##x4mode();                              \
                                                                               \
  setup_sprite_get_right_block_mask##x4mode();                                 \
  orreq     control_mask, control_mask, #0x4;                                  \
                                                                               \
  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                 \
  setup_sprite_compare_right_block_mask##x4mode();                             \
                                                                               \
  orreq     control_mask, control_mask, #0x8;                                  \
  cmp       tile_width, #1;                                                    \
                                                                               \
  add       block, psx_gpu, #psx_gpu_blocks_offset;                            \
  orreq     control_mask, control_mask, #0x1;                                  \
                                                                               \
  cmp       tile_height, #1;                                                   \
  /* block = &blocks[num_blocks] (64 bytes per block) */                       \
  add       block, block, num_blocks, lsl #6;                                  \
                                                                               \
  orreq     control_mask, control_mask, #0x2;                                  \
  /* JT_OP_REL / JT_OP / JTE come from elsewhere in the file */                \
  JT_OP_REL(9f, control_mask, temp);                                           \
  JT_OP(ldr pc, [pc, control_mask, lsl #2]);                                   \
  nop;                                                                         \
                                                                               \
 9:                                                                            \
 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode);   \
 .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode);  \
 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode);  \
 .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode);   \
 .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode);  \
 .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode);   \
 .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode);  \
 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode);  \
 .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
 .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode);   \
 .word 0x00000000;                                                             \
 .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode)


/* Native-resolution instantiations. */
setup_sprite_tiled_builder(4bpp,);
setup_sprite_tiled_builder(8bpp,);

#undef draw_mask_fb_ptr_left
#undef draw_mask_fb_ptr_right

/* 4x instantiations. */
setup_sprite_tiled_builder(4bpp, _4x);
setup_sprite_tiled_builder(8bpp, _4x);
75e28f62
E
5273
5274
#undef block_ptr
#undef num_blocks
#undef clut_ptr

/*
 * Register aliases for texture_sprite_blocks_8bpp.
 * psx_gpu and block_ptr deliberately share r0: every psx_gpu field is read
 * before r0 is turned into the block pointer.  The texels_NN aliases reuse
 * the registers of the even-numbered texel_N aliases.
 */
#define psx_gpu                                  r0
#define block_ptr                                r0
#define num_blocks                               r1
#define clut_ptr                                 r2
#define texel_shift_mask                         r3
#define block_pixels_a                           r4
#define block_pixels_b                           r5
#define texel_0                                  r6
#define texel_2                                  r7
#define texel_4                                  r8
#define texel_6                                  r9
#define texel_1                                  r10
#define texel_3                                  r11
#define texel_5                                  r12
#define texel_7                                  r14
#define texels_01                                r6
#define texels_23                                r7
#define texels_45                                r8
#define texels_67                                r9

/*
 * texture_sprite_blocks_8bpp
 *   r0: psx_gpu
 *
 * For each of the num_blocks queued blocks (64 bytes apart), reads the
 * eight 8-bit indices stored at block + 16 .. block + 23, looks each one up
 * as a halfword in the table at clut_ptr, and writes the eight 16-bit
 * results to block + 0 .. block + 15.
 *
 * Each index is extracted pre-multiplied by two (mask 0xFF << 1) so it can
 * be used directly as a byte offset into the halfword table.  The first
 * word of the following block is fetched early, before the current block's
 * results are stored.
 */
function(texture_sprite_blocks_8bpp)
  push      { r4 - r11, lr }
  movw      texel_shift_mask, #(0xFF << 1)

  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  ldr       clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]

  // r0 stops being psx_gpu from here on
  add       block_ptr, psx_gpu, #psx_gpu_blocks_offset
  ldr       block_pixels_a, [block_ptr, #16]

 0:
  // indices 0..3 come from block_pixels_a, 4..7 from block_pixels_b
  and       texel_0, texel_shift_mask, block_pixels_a, lsl #1
  ldr       block_pixels_b, [block_ptr, #20]

  and       texel_1, texel_shift_mask, block_pixels_a, lsr #7
  ldrh      texel_0, [clut_ptr, texel_0]

  and       texel_2, texel_shift_mask, block_pixels_a, lsr #15
  ldrh      texel_1, [clut_ptr, texel_1]

  and       texel_3, texel_shift_mask, block_pixels_a, lsr #23
  // block_pixels_a is fully consumed: prefetch it for the next block
  ldr       block_pixels_a, [block_ptr, #(64 + 16)]

  ldrh      texel_2, [clut_ptr, texel_2]
  and       texel_4, texel_shift_mask, block_pixels_b, lsl #1

  ldrh      texel_3, [clut_ptr, texel_3]
  and       texel_5, texel_shift_mask, block_pixels_b, lsr #7

  ldrh      texel_4, [clut_ptr, texel_4]
  and       texel_6, texel_shift_mask, block_pixels_b, lsr #15

  ldrh      texel_5, [clut_ptr, texel_5]
  and       texel_7, texel_shift_mask, block_pixels_b, lsr #23

  // pack pairs of 16-bit results into words while the last loads complete
  ldrh      texel_6, [clut_ptr, texel_6]
  orr       texels_01, texel_0, texel_1, lsl #16

  ldrh      texel_7, [clut_ptr, texel_7]
  orr       texels_23, texel_2, texel_3, lsl #16

  orr       texels_45, texel_4, texel_5, lsl #16
  str       texels_01, [block_ptr, #0]

  orr       texels_67, texel_6, texel_7, lsl #16
  str       texels_23, [block_ptr, #4]

  // the flags from this subs drive the bne below; str/add leave them alone
  subs      num_blocks, num_blocks, #1
  str       texels_45, [block_ptr, #8]

  str       texels_67, [block_ptr, #12]
  add       block_ptr, block_ptr, #64

  bne       0b

  pop       { r4 - r11, pc }
5355
5356
#undef width_rounded
#undef texture_mask
#undef num_blocks
#undef texture_offset
#undef texels_low
#undef texels_high
#undef texels_wide_low
#undef texels_wide_high
#undef texels_wide
#undef fb_ptr2
#undef temp

/*
 * Register aliases for the 16bpp sprite setup routines.
 *
 * Several aliases share a physical register on purpose; a later alias may
 * only be written once every earlier alias on the same register is dead.
 */

/* arguments and values derived directly from them */
#define psx_gpu                                  r0
#define x                                        r1
#define y                                        r2
#define u                                        r3
#define v                                        r4
#define width                                    r5
#define height                                   r6
#define left_offset                              r8
#define width_rounded                            r9
#define right_width                              r10

#define block_width                              r11

/* loop state (reuses r1-r5, r8-r10) */
#define texture_offset_base                      r1
#define texture_mask                             r2
#define texture_page_ptr                         r3
#define num_blocks                               r4
#define block                                    r5
#define fb_ptr                                   r7
#define texture_offset                           r8
#define blocks_remaining                         r9
#define fb_ptr2                                  r10
#define fb_ptr_pitch                             r12
#define texture_block_ptr                        r14

/* setup-time temporaries (reuses r2-r5) */
#define texture_mask_width                       r2
#define texture_mask_height                      r3
#define left_mask_bits                           r4
#define right_mask_bits                          r5


#undef block_masks
#undef block_masks_shifted
#undef texels

/* NEON aliases shared by the native and 4x variants */
#define block_masks                              d0
#define block_masks_shifted                      d1
#define draw_mask_fb_ptr                         d2
#define texels                                   q2

/* NEON aliases used by the 4x variant */
#define draw_mask_fb_ptr_a                       d2
#define draw_mask_fb_ptr_b                       d3
#define texels_low                               d4
#define texels_high                              d5
#define texels_wide_low                          d6
#define texels_wide_high                         d7
#define texels_wide                              q3
75e28f62 5416
75e28f62 5417
/*
 * setup_sprites_16bpp_flush
 *
 * Local helper, reached with `bl` when the pending block count has gone
 * past MAX_BLOCKS.  Calls flush_render_block_buffer with d0-d3, r0-r3,
 * r12, r14 (and whatever EXTRA_UNSAVED_REGS expands to) preserved around
 * the call, then restarts the block queue for the caller:
 *   block      = psx_gpu + psx_gpu_blocks_offset
 *   num_blocks = block_width
 */
setup_sprites_16bpp_flush:
  vpush     { d0 - d3 }

  stmdb     sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
  bl        flush_render_block_buffer
  ldmia     sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }

  vpop      { d0 - d3 }

  // queue is empty again: only the blocks about to be emitted are pending
  add       block, psx_gpu, #psx_gpu_blocks_offset
  mov       num_blocks, block_width

  bx        lr
5431
/*
 * setup_sprite_16bpp
 *   r0: psx_gpu   r1: x   r2: y   r3: u
 *   [sp]: v   [sp + 4]: width   [sp + 8]: height
 *
 * Queues one block per 8-pixel span of a 16bpp textured sprite.  Each block
 * record is 64 bytes: 16 bytes of texels at +0 and, at +40, an 8-byte pair
 * made of a draw mask (low word) and the framebuffer pointer (high word).
 *
 * Derived values:
 *   left_offset   = u & 7
 *   width_rounded = width + 7 + left_offset
 *   block_width   = width_rounded >> 3
 *   left mask     = ~(0xFF << left_offset)
 *   right mask    = 0xFE << (width_rounded & 7)
 *   fb_ptr_pitch  = 2048 + 16 - block_width * 16
 *
 * Instruction order matters: many aliases share registers (x /
 * texture_offset_base, v / left_mask_bits / num_blocks, width / block /
 * right_mask_bits, ...), and the loop branches rely on flags set several
 * instructions earlier.
 */
function(setup_sprite_16bpp)
  push      { r4 - r11, lr }
  ldr       fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]

  // stack arguments sit above the 36 bytes just pushed
  ldr       v, [sp, #36]
  add       fb_ptr, fb_ptr, y, lsl #11

  ldr       width, [sp, #40]
  add       fb_ptr, fb_ptr, x, lsl #1

  ldr       height, [sp, #44]
  and       left_offset, u, #0x7

  // texture_offset_base = u * 2 + v * 2048 (overwrites x)
  add       texture_offset_base, u, u
  add       width_rounded, width, #7

  add       texture_offset_base, texture_offset_base, v, lsl #11
  mov       left_mask_bits, #0xFF

  ldrb      texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
  add       width_rounded, width_rounded, left_offset

  ldrb      texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
  // start drawing at the 8-texel boundary left of x
  sub       fb_ptr, fb_ptr, left_offset, lsl #1

  // texture_mask = mask_width * 2 + mask_height * 2048
  add       texture_mask, texture_mask_width, texture_mask_width
  mov       right_mask_bits, #0xFE

  and       right_width, width_rounded, #0x7
  mvn       left_mask_bits, left_mask_bits, lsl left_offset

  add       texture_mask, texture_mask, texture_mask_height, lsl #11
  lsr       block_width, width_rounded, #3

  lsl       right_mask_bits, right_mask_bits, right_width
  movw      fb_ptr_pitch, #(2048 + 16)

  sub       fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
  // block_masks = { left mask word, right mask word }
  vmov      block_masks, left_mask_bits, right_mask_bits

  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add       block, psx_gpu, #psx_gpu_blocks_offset

  bic       texture_offset_base, texture_offset_base, #0xF
  cmp       block_width, #1

  ldr       texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
  add       block, block, num_blocks, lsl #6

  // flags still come from the cmp above
  bne       0f

  // --- sprite is one block wide: left and right masks are OR-ed together
  vext.32   block_masks_shifted, block_masks, block_masks, #1
  vorr.u32  block_masks, block_masks, block_masks_shifted
  vdup.u8   draw_mask_fb_ptr, block_masks[0]

 1:
  add       num_blocks, num_blocks, #1
  cmp       num_blocks, #MAX_BLOCKS
  blgt      setup_sprites_16bpp_flush

  and       texture_block_ptr, texture_offset_base, texture_mask
  // row counter; consumed by the bne at the bottom of this loop
  subs      height, height, #1

  add       texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32  { texels }, [texture_block_ptr, :128]

  vst1.u32  { texels }, [block, :128]
  add       block, block, #40

  vmov.u32  draw_mask_fb_ptr[1], fb_ptr
  pld       [fb_ptr]

  vst1.u32  { draw_mask_fb_ptr }, [block, :64]

  add       block, block, #24
  add       texture_offset_base, texture_offset_base, #2048
  add       fb_ptr, fb_ptr, #2048
  strh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  bne       1b

  pop       { r4 - r11, pc }

  // --- sprite is several blocks wide: one pass of this loop per row
 0:
  add       num_blocks, num_blocks, block_width
  mov       texture_offset, texture_offset_base

  cmp       num_blocks, #MAX_BLOCKS
  blgt      setup_sprites_16bpp_flush

  add       texture_offset_base, texture_offset_base, #2048
  and       texture_block_ptr, texture_offset, texture_mask

  add       texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32  { texels }, [texture_block_ptr, :128]

  vst1.u32  { texels }, [block, :128]
  add       block, block, #40

  // leftmost block uses the left mask (byte 0 of block_masks)
  vdup.u8   draw_mask_fb_ptr, block_masks[0]
  vmov.u32  draw_mask_fb_ptr[1], fb_ptr
  pld       [fb_ptr]

  vst1.u32  { draw_mask_fb_ptr }, [block, :64]
  // interior block count; zero means go straight to the rightmost block
  subs      blocks_remaining, block_width, #2

  add       texture_offset, texture_offset, #16
  add       fb_ptr, fb_ptr, #16

  // interior blocks are stored with an all-zero mask
  vmov.u8   draw_mask_fb_ptr, #0

  add       block, block, #24
  beq       2f

 1:
  and       texture_block_ptr, texture_offset, texture_mask
  subs      blocks_remaining, blocks_remaining, #1

  add       texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32  { texels }, [texture_block_ptr, :128]

  vst1.u32  { texels }, [block, :128]
  add       block, block, #40

  vmov.u32  draw_mask_fb_ptr[1], fb_ptr
  pld       [fb_ptr]

  vst1.u32  { draw_mask_fb_ptr }, [block, :64]

  add       texture_offset, texture_offset, #16
  add       fb_ptr, fb_ptr, #16

  add       block, block, #24
  bne       1b

 2:
  and       texture_block_ptr, texture_offset, texture_mask
  add       texture_block_ptr, texture_page_ptr, texture_block_ptr

  vld1.u32  { texels }, [texture_block_ptr, :128]
  // rightmost block uses the right mask (byte 4 of block_masks)
  vdup.u8   draw_mask_fb_ptr, block_masks[4]

  vst1.u32  { texels }, [block, :128]
  add       block, block, #40

  vmov.u32  draw_mask_fb_ptr[1], fb_ptr
  vst1.u32  { draw_mask_fb_ptr }, [block, :64]

  add       block, block, #24
  subs      height, height, #1

  // move to the first block of the next row
  add       fb_ptr, fb_ptr, fb_ptr_pitch
  strh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]

  bne       0b

  pop       { r4 - r11, pc }
5588
5589
// 4x version
// FIXME: duplicate code with normal version :(
#undef draw_mask_fb_ptr

/*
 * setup_sprite_16bpp_4x
 *   r0: psx_gpu   r1: x   r2: y   r3: u
 *   [sp]: v   [sp + 4]: width   [sp + 8]: height
 *
 * 4x counterpart of the native 16bpp sprite setup.  Differences visible
 * here: the masks are 16 bits wide (0xFFFF / 0xFFFC, with the shift counts
 * doubled), block_width is multiplied by four before being added to
 * num_blocks, framebuffer steps are doubled, and each 16-byte texel fetch
 * is handed to do_texture_block_16bpp_4x instead of being stored directly.
 *
 * Derived values:
 *   left mask    = ~(0xFFFF << (2 * (u & 7)))
 *   right mask   = 0xFFFC << (2 * (width_rounded & 7))
 *   fb_ptr_pitch = (2048 + 16) * 2 - (width_rounded >> 3) * 32
 *
 * Instruction order matters for the same reasons as in the native routine
 * (shared registers, long-lived flags); do_texture_block_16bpp_4x is placed
 * between flag-setting instructions and the branches that consume them.
 */
function(setup_sprite_16bpp_4x)
  push      { r4 - r11, lr }
  ldr       fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]

  // stack arguments sit above the 36 bytes just pushed
  ldr       v, [sp, #36]
  add       fb_ptr, fb_ptr, y, lsl #11

  ldr       width, [sp, #40]
  add       fb_ptr, fb_ptr, x, lsl #1

  ldr       height, [sp, #44]
  and       left_offset, u, #0x7

  // texture_offset_base = u * 2 + v * 2048 (overwrites x)
  add       texture_offset_base, u, u
  add       width_rounded, width, #7

  add       texture_offset_base, texture_offset_base, v, lsl #11
  movw      left_mask_bits, #0xFFFF

  ldrb      texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
  add       width_rounded, width_rounded, left_offset

  // from here left_offset holds twice the texel offset
  lsl       left_offset, left_offset, #1

  ldrb      texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
  sub       fb_ptr, fb_ptr, left_offset, lsl #1

  // texture_mask = mask_width * 2 + mask_height * 2048
  add       texture_mask, texture_mask_width, texture_mask_width
  movw      right_mask_bits, #0xFFFC

  and       right_width, width_rounded, #0x7
  mvn       left_mask_bits, left_mask_bits, lsl left_offset

  lsl       right_width, right_width, #1

  add       texture_mask, texture_mask, texture_mask_height, lsl #11
  lsr       block_width, width_rounded, #3

  lsl       right_mask_bits, right_mask_bits, right_width
  movw      fb_ptr_pitch, #(2048 + 16) * 2

  sub       fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #(4 + 1)
  // block_masks = { left mask word, right mask word }
  vmov      block_masks, left_mask_bits, right_mask_bits

  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add       block, psx_gpu, #psx_gpu_blocks_offset

  bic       texture_offset_base, texture_offset_base, #0xF
  cmp       block_width, #1

  ldr       texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
  add       block, block, num_blocks, lsl #6

  // block_width now counts queued blocks per row (4 per 8-texel span);
  // the non-flag-setting lsl keeps the cmp result alive for the bne
  lsl       block_width, block_width, #2
  bne       0f

  // --- sprite is one span wide: left and right masks are OR-ed together
  vext.32   block_masks_shifted, block_masks, block_masks, #1
  vorr.u32  block_masks, block_masks, block_masks_shifted
  vdup.u8   draw_mask_fb_ptr_a, block_masks[0]
  vdup.u8   draw_mask_fb_ptr_b, block_masks[1]

 1:
  add       num_blocks, num_blocks, block_width
  cmp       num_blocks, #MAX_BLOCKS
  blgt      setup_sprites_16bpp_flush

  and       texture_block_ptr, texture_offset_base, texture_mask
  // row counter; consumed by the bne at the bottom of this loop
  subs      height, height, #1

  add       texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32  { texels }, [texture_block_ptr, :128]

  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)

  add       texture_offset_base, texture_offset_base, #2048
  add       fb_ptr, fb_ptr, #(2048 * 2)
  strh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  bne       1b

  pop       { r4 - r11, pc }

  // --- sprite is several spans wide: one pass of this loop per row
 0:
  add       num_blocks, num_blocks, block_width
  mov       texture_offset, texture_offset_base

  vdup.u8   draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
  vdup.u8   draw_mask_fb_ptr_b, block_masks[1]

  cmp       num_blocks, #MAX_BLOCKS
  blgt      setup_sprites_16bpp_flush

  add       texture_offset_base, texture_offset_base, #2048
  and       texture_block_ptr, texture_offset, texture_mask

  add       texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32  { texels }, [texture_block_ptr, :128]

  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)

  // interior count in queued-block units; zero skips the interior loop
  subs      blocks_remaining, block_width, #(2 * 4)
  add       texture_offset, texture_offset, #16

  // interior spans use all-zero masks
  vmov.u8   draw_mask_fb_ptr_a, #0
  vmov.u8   draw_mask_fb_ptr_b, #0

  add       fb_ptr, fb_ptr, #(16 * 2)
  beq       2f

 1:
  and       texture_block_ptr, texture_offset, texture_mask
  subs      blocks_remaining, blocks_remaining, #4

  add       texture_block_ptr, texture_page_ptr, texture_block_ptr
  vld1.u32  { texels }, [texture_block_ptr, :128]

  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
  add       texture_offset, texture_offset, #16

  add       fb_ptr, fb_ptr, #(16 * 2)
  bgt       1b

 2:
  vdup.u8   draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
  vdup.u8   draw_mask_fb_ptr_b, block_masks[5]

  and       texture_block_ptr, texture_offset, texture_mask
  add       texture_block_ptr, texture_page_ptr, texture_block_ptr

  vld1.u32  { texels }, [texture_block_ptr, :128]

  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
  subs      height, height, #1

  // move to the first span of the next row
  add       fb_ptr, fb_ptr, fb_ptr_pitch
  strh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]

  bne       0b

  pop       { r4 - r11, pc }
5733
5734
#undef width
#undef right_width
#undef right_mask_bits
#undef color
#undef height
#undef blocks_remaining
#undef colors
#undef right_mask
#undef test_mask
#undef draw_mask

/* Register aliases for setup_sprite_untextured. */
#define psx_gpu                                  r0
#define x                                        r1
#define y                                        r2
#define width                                    r3
#define right_width                              r5
#define right_mask_bits                          r6
#define fb_ptr                                   r7
#define color                                    r8
#define height                                   r9
#define fb_ptr_pitch                             r12

// referenced by setup_sprites_16bpp_flush
#define num_blocks                               r4
#define block                                    r5
#define block_width                              r11

/* colour components reuse r1/r2/r8; blocks_remaining reuses r6 */
#define color_r                                  r1
#define color_g                                  r2
#define color_b                                  r8
#define blocks_remaining                         r6

#define colors                                   q0
#define right_mask                               q1
#define test_mask                                q2
#define draw_mask                                q2
#define draw_mask_bits_fb_ptr                    d6


.align 3

/*
 * setup_sprite_untextured
 *   r0: psx_gpu   r1: x   r2: y
 *   [sp + 4]: width   [sp + 8]: height   [sp + 12]: color
 *
 * Tail-branches to setup_sprite_untextured_simple when none of
 * RENDER_STATE_MASK_EVALUATE / RENDER_FLAGS_MODULATE_TEXELS /
 * RENDER_FLAGS_BLEND is set in the halfword at psx_gpu_render_state_offset
 * and RENDER_INTERLACE_ENABLED is clear in the byte at
 * psx_gpu_render_mode_offset.  Otherwise queues one 64-byte block per
 * 8-pixel span: 16-byte draw mask at +0, 16 bytes of the replicated colour
 * at +16, and at +40 an 8-byte pair whose high word is fb_ptr.
 *
 * Derived values:
 *   block_width  = (width + 7) >> 3
 *   right_width  = ((width - 1) & 7) + 1
 *   fb_ptr_pitch = 1024 * 2 - (block_width - 1) * 16
 *   color        = r | (g << 5) | (b << 10), with r/g/b taken from bits
 *                  3-7, 11-15 and 19-23 of the 24-bit input colour
 */
function(setup_sprite_untextured)
  ldrh      r12, [psx_gpu, #psx_gpu_render_state_offset]
  tst       r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \
             | RENDER_FLAGS_BLEND)
  ldrbeq    r12, [psx_gpu, #psx_gpu_render_mode_offset]
  tsteq     r12, #RENDER_INTERLACE_ENABLED
  beq       setup_sprite_untextured_simple

  push      { r4 - r11, lr }

  // stack arguments sit above the 36 bytes just pushed
  ldr       width, [sp, #40]
  ldr       fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]

  ldr       height, [sp, #44]
  add       fb_ptr, fb_ptr, y, lsl #11

  add       fb_ptr, fb_ptr, x, lsl #1
  sub       right_width, width, #1

  ldr       color, [sp, #48]
  and       right_width, right_width, #7

  add       block_width, width, #7
  add       right_width, right_width, #1

  lsr       block_width, block_width, #3
  mov       right_mask_bits, #0xff

  sub       fb_ptr_pitch, block_width, #1
  lsl       right_mask_bits, right_mask_bits, right_width

  lsl       fb_ptr_pitch, fb_ptr_pitch, #(3 + 1)
  ubfx      color_r, color, #3, #5

  rsb       fb_ptr_pitch, fb_ptr_pitch, #(1024 * 2)
  ubfx      color_g, color, #11, #5

  // test_mask is the first 16 bytes of the psx_gpu structure
  vld1.u32  { test_mask }, [psx_gpu, :128]
  // color_b shares r8 with color: the raw colour is gone after this
  ubfx      color_b, color, #19, #5

  vdup.u16  right_mask, right_mask_bits
  orr       color, color_r, color_b, lsl #10

  ldrh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  orr       color, color, color_g, lsl #5

  // per-lane right-edge mask: all-ones where right_mask & test_mask != 0
  vtst.u16  right_mask, right_mask, test_mask
  add       block, psx_gpu, #psx_gpu_blocks_offset

  vdup.u16  colors, color
  add       block, block, num_blocks, lsl #6


setup_sprite_untextured_height_loop:
  add       num_blocks, num_blocks, block_width
  sub       blocks_remaining, block_width, #1

  cmp       num_blocks, #MAX_BLOCKS
  blgt      setup_sprites_16bpp_flush

  // a one-block-wide row has no interior blocks
  cmp       blocks_remaining, #0
  ble       1f

  vmov.u8   draw_mask, #0 /* zero_mask */
  vmov.u8   draw_mask_bits_fb_ptr, #0

 0:
  vst1.u32  { draw_mask }, [block, :128]!
  // counter for the bgt below; the stores and adds leave the flags alone
  subs      blocks_remaining, blocks_remaining, #1

  vst1.u32  { colors }, [block, :128]
  add       block, block, #24

  vmov.u32  draw_mask_bits_fb_ptr[1], fb_ptr
  vst1.u32  { draw_mask_bits_fb_ptr }, [block, :64]

  add       block, block, #24
  add       fb_ptr, fb_ptr, #(8 * 2)
  bgt       0b

 1:
  // last block of the row carries the right-edge mask
  vst1.u32  { right_mask }, [block, :128]!
  subs      height, height, #1

  vst1.u32  { colors }, [block, :128]
  add       block, block, #24

  vmov.u32  draw_mask_bits_fb_ptr[1], fb_ptr
  vst1.u32  { draw_mask_bits_fb_ptr }, [block, :64]

  add       block, block, #24
  add       fb_ptr, fb_ptr, fb_ptr_pitch

  strh      num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  bgt       setup_sprite_untextured_height_loop

  pop       { r4 - r11, pc }
5873
5874
5875
75e28f62
E
#undef texture_page_ptr
#undef vram_ptr
#undef dirty_textures_mask
#undef current_texture_mask

/* Register aliases for update_texture_4bpp_cache. */
#define psx_gpu                                  r0
#define current_texture_page                     r1
#define texture_page_ptr                         r2
#define vram_ptr_a                               r3
#define current_texture_page_x                   r12
#define current_texture_page_y                   r4
#define dirty_textures_mask                      r5
#define tile_y                                   r6
#define tile_x                                   r7
#define sub_y                                    r8
#define current_texture_mask                     r9
#define c_4096                                   r10
#define vram_ptr_b                               r11

/* _ab/_cd name the OR-ed results and reuse the _b/_c registers */
#define texel_block_a                            d0
#define texel_block_b                            d1
#define texel_block_expanded_a                   q1
#define texel_block_expanded_b                   q2
#define texel_block_expanded_ab                  q2
#define texel_block_expanded_c                   q3
#define texel_block_expanded_d                   q4
#define texel_block_expanded_cd                  q3

/*
 * update_texture_4bpp_cache
 *   r0: psx_gpu
 *
 * Rebuilds the expanded 4bpp texture page for the current texture page.
 * Clears current_texture_mask out of the dirty-4bpp mask, then walks the
 * page in VRAM (base + page_y << 19 + page_x << 7) as 16 x 16 tiles.  For
 * each tile, 16 rows of 8 bytes are read (two rows per inner pass, 2048
 * bytes apart) and every source byte is split into two output bytes -- low
 * nibble first, high nibble second -- which are written sequentially to
 * texture_page_ptr, 32 bytes per inner pass.
 *
 * q4 (d8-d9) is used as a temporary.  The procedure call standard requires
 * d8-d15 to be preserved across calls, so q4 is saved and restored here
 * together with q0-q3.
 */
function(update_texture_4bpp_cache)
  stmdb     sp!, { r4 - r11, r14 }
  // q4 is callee-saved; q0-q3 are kept in the list as before
  vpush     { q0 - q4 }

  ldrb      current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]

  ldr       texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
  ldr       vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]

  and       current_texture_page_x, current_texture_page, #0xF
  ldr       current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]

  mov       current_texture_page_y, current_texture_page, lsr #4
  ldr       dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]

  add       vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
  mov       tile_y, #16

  add       vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7
  // this page is about to be refreshed: drop it from the dirty set
  bic       dirty_textures_mask, current_texture_mask

  mov       tile_x, #16
  str       dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]

  mov       sub_y, #8
  movw      c_4096, #4096

  // vram_ptr_b tracks the row after vram_ptr_a; both step two rows at a time
  add       vram_ptr_b, vram_ptr_a, #2048

 0:
  vld1.u32  { texel_block_a }, [vram_ptr_a, :64], c_4096
  vld1.u32  { texel_block_b }, [vram_ptr_b, :64], c_4096

  // per source byte B: 16-bit lanes holding B and B << 4
  vmovl.u8  texel_block_expanded_a, texel_block_a
  vshll.u8  texel_block_expanded_b, texel_block_a, #4
  vmovl.u8  texel_block_expanded_c, texel_block_b
  vshll.u8  texel_block_expanded_d, texel_block_b, #4

  // clear bits 4-7 so only the low nibble (bits 0-3) or the high nibble
  // (moved to bits 8-11) survives in each lane
  vbic.u16  texel_block_expanded_a, #0x00F0
  vbic.u16  texel_block_expanded_b, #0x00F0
  vbic.u16  texel_block_expanded_c, #0x00F0
  vbic.u16  texel_block_expanded_d, #0x00F0

  vorr.u16  texel_block_expanded_ab, texel_block_expanded_a, \
   texel_block_expanded_b
  vorr.u16  texel_block_expanded_cd, texel_block_expanded_c, \
   texel_block_expanded_d

  vst1.u32  { texel_block_expanded_ab, texel_block_expanded_cd }, \
   [texture_page_ptr, :256]!

  subs      sub_y, sub_y, #1
  bne       0b

  // next tile to the right: +8 bytes, back up the 16 rows just consumed
  mov       sub_y, #8
  add       vram_ptr_a, vram_ptr_a, #8
  add       vram_ptr_b, vram_ptr_b, #8

  sub       vram_ptr_a, vram_ptr_a, #(16 * 2048)
  sub       vram_ptr_b, vram_ptr_b, #(16 * 2048)

  subs      tile_x, tile_x, #1
  bne       0b

  // next tile row: down 16 rows, back to the left edge of the page
  mov       tile_x, #16
  add       vram_ptr_a, vram_ptr_a, #(16 * 2048)
  add       vram_ptr_b, vram_ptr_b, #(16 * 2048)

  sub       vram_ptr_a, vram_ptr_a, #(8 * 16)
  sub       vram_ptr_b, vram_ptr_b, #(8 * 16)

  subs      tile_y, tile_y, #1
  bne       0b

  vpop      { q0 - q4 }
  ldmia     sp!, { r4 - r11, pc }
5980
5981
#undef current_texture_page

/* Register aliases for update_texture_8bpp_cache_slice. */
#define psx_gpu                                  r0
#define texture_page                             r1
#define texture_page_ptr                         r2
#define vram_ptr_a                               r3
#define texture_page_x                           r12
#define texture_page_y                           r4
#define current_texture_page                     r5
#define tile_y                                   r6
#define tile_x                                   r7
#define sub_y                                    r8
#define c_4096                                   r10
#define vram_ptr_b                               r11


#undef texels_a
#undef texels_b

#define texels_a                                 q0
#define texels_b                                 q1
#define texels_c                                 q2
#define texels_d                                 q3


/*
 * update_texture_8bpp_cache_slice
 *   r0: psx_gpu
 *   r1: texture_page
 *
 * Copies one VRAM texture page (base + page_y << 19 + page_x << 7) into the
 * cached 8bpp texture area as 16 rows of 8 tiles; each tile is 16 rows of
 * 16 bytes and is written out contiguously, 64 bytes per inner pass.
 *
 * If bit 0 of (current_texture_page ^ texture_page) is set, the destination
 * starts 8 * 16 * 16 bytes into the texture area.  After every row of 8
 * tiles the destination skips a further 8 * 16 * 16 bytes, so the slice
 * occupies alternate 2048-byte chunks of the destination.
 */
function(update_texture_8bpp_cache_slice)
  push      { r4 - r11, lr }
  vpush     { q0 - q3 }

  ldrb      current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
  ldr       vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]

  ldr       texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
  mov       tile_y, #16

  and       texture_page_x, texture_page, #0xF
  lsr       texture_page_y, texture_page, #4

  add       vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7
  mov       tile_x, #8

  add       vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
  eor       current_texture_page, current_texture_page, texture_page

  // Z flag from this ands selects the starting half (see addne below)
  ands      current_texture_page, current_texture_page, #0x1
  mov       sub_y, #4

  addne     texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
  movw      c_4096, #4096

  // vram_ptr_b tracks the row after vram_ptr_a; both step two rows at a time
  add       vram_ptr_b, vram_ptr_a, #2048

 0:
  // four consecutive VRAM rows per pass (a, b, a + 4096, b + 4096)
  vld1.u32  { texels_a }, [vram_ptr_a, :128], c_4096
  vld1.u32  { texels_b }, [vram_ptr_b, :128], c_4096
  vld1.u32  { texels_c }, [vram_ptr_a, :128], c_4096
  vld1.u32  { texels_d }, [vram_ptr_b, :128], c_4096

  vst1.u32  { texels_a, texels_b }, [texture_page_ptr, :256]!
  vst1.u32  { texels_c, texels_d }, [texture_page_ptr, :256]!

  subs      sub_y, sub_y, #1
  bne       0b

  // next tile to the right: +16 bytes, back up the 16 rows just consumed
  mov       sub_y, #4

  add       vram_ptr_a, vram_ptr_a, #16
  add       vram_ptr_b, vram_ptr_b, #16

  sub       vram_ptr_a, vram_ptr_a, #(16 * 2048)
  sub       vram_ptr_b, vram_ptr_b, #(16 * 2048)

  subs      tile_x, tile_x, #1
  bne       0b

  // next tile row: down 16 rows, back to the left edge of the slice
  mov       tile_x, #8

  add       vram_ptr_a, vram_ptr_a, #(16 * 2048)
  add       vram_ptr_b, vram_ptr_b, #(16 * 2048)

  sub       vram_ptr_a, vram_ptr_a, #(8 * 16)
  sub       vram_ptr_b, vram_ptr_b, #(8 * 16)

  subs      tile_y, tile_y, #1
  // skip the interleaved half of the destination (add keeps the flags)
  add       texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)

  bne       0b

  vpop      { q0 - q3 }
  pop       { r4 - r11, pc }
6072
50f9355a 6073
/*
 * void scale2x_tiles8(void *dst, const void *src, int w8, int h)
 *
 *   r0: dst   r1: src   r2: w8 (row width in groups of 8 pixels)   r3: h
 *
 * Doubles 16-bit pixels in both directions.  Every source pixel is
 * duplicated horizontally (vzip of a register with a copy of itself) and
 * each expanded row is written twice, to dst and to dst + 1024 * 2.
 * Source rows are 1024 * 2 bytes apart, destination row pairs 1024 * 2 * 2.
 *
 * The inner loop always loads two 8-pixel groups; when w8 is odd the
 * second group's stores are skipped for the final pass.
 *
 * Scratch: r4 = start of the current source row, r12 = second destination
 * row, r14 = groups left in the current row.
 */
function(scale2x_tiles8)
  push      { r4, lr }

  mov       r4, r1
  add       r12, r0, #(1024 * 2)
  mov       lr, r2

0:
  vld1.u16  { q0 }, [r1, :128]!
  vld1.u16  { q2 }, [r1, :128]!
  vmov      q1, q0
  vmov      q3, q2
  // interleaving x with itself yields x0 x0 x1 x1 ...
  vzip.16   q0, q1
  vzip.16   q2, q3
  // flags from this subs serve both blt and bgt; the stores keep them
  subs      lr, lr, #2
  vst1.u16  { q0, q1 }, [r0, :128]!
  vst1.u16  { q0, q1 }, [r12, :128]!
  blt       1f
  vst1.u16  { q2, q3 }, [r0, :128]!
  vst1.u16  { q2, q3 }, [r12, :128]!
  bgt       0b
1:
  // row done: flags from this subs decide the bgt at the bottom
  subs      r3, r3, #1
  mov       lr, r2
  // source: next row
  add       r4, r4, #(1024 * 2)
  mov       r1, r4
  // destination: down two rows, back by the w8 * 32 bytes written
  add       r0, r0, #(1024 * 2 * 2)
  sub       r0, r0, r2, lsl #(4 + 1)
  add       r12, r0, #(1024 * 2)
  bgt       0b
  nop

  pop       { r4, pc }
59d15d23 6108
6109// vim:filetype=armasm