psx_gpu: support relative jumptables
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
1 /*
2  * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
3  * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation; either version 2 of
8  * the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License for more details.
14  */
15
// Capacity limits.
// NOTE(review): presumably these mirror limits defined on the C side of the
// renderer - confirm they are kept in sync.
16 #define MAX_SPANS                                         512
17 #define MAX_BLOCKS                                        64
18 #define MAX_BLOCKS_PER_ROW                                128
19
// Bit values for render state / render flags / interlace fields.
// NOTE(review): the fields these bits belong to are not visible in this part
// of the file - confirm against the C definitions.
20 #define RENDER_STATE_MASK_EVALUATE                        0x20
21 #define RENDER_FLAGS_MODULATE_TEXELS                      0x1
22 #define RENDER_FLAGS_BLEND                                0x2
23 #define RENDER_INTERLACE_ENABLED                          0x1
24
// Provides the psx_gpu_*_offset constants used throughout this file.
25 #include "psx_gpu_offsets.h"
26
// b_dx is addressed 4 bytes past b_block_span instead of having its own
// entry in the offsets header.
27 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
28
// Byte offsets of the fields of one edge_data record (2 bytes apart).
29 #define edge_data_left_x_offset                           0
30 #define edge_data_num_blocks_offset                       2
31 #define edge_data_right_mask_offset                       4
32 #define edge_data_y_offset                                6
33
34 .syntax unified
35 .text
36
// Register aliases used by compute_all_gradients.
//
// Many names below map to the same physical register (for example x1 and
// x0_x1 are both r5, y2 and b0 are both r9). A name is therefore only valid
// between the instruction that writes it and the next write to any other
// name sharing its register.

// Incoming arguments.
37 #define psx_gpu                                           r0
38 #define v_a                                               r1
39 #define v_b                                               r2
40 #define v_c                                               r3
41
// Scalar vertex coordinates / blue components and their packed pairs.
// The *_* names hold two 16-bit values in one register.
42 #define x0                                                r4
43 #define x1                                                r5
44 #define x2                                                r6
45 #define x0_x1                                             r5
46 #define x1_x2                                             r6
47 #define y0                                                r7
48 #define y1                                                r8
49 #define y2                                                r9
50 #define y0_y1                                             r7
51 #define y1_y2                                             r8
52 #define b0                                                r9
53 #define b1                                                r10
54 #define b2                                                r11
55 #define b0_b1                                             r10
56 #define b1_b2                                             r11
57
58
// Triangle area reciprocal, moved from NEON to the integer side.
59 #define area_r_s                                          r5
60
// Blue gradient values produced on the integer side.
61 #define g_bx0                                             r2
62 #define g_bx                                              r3
63 #define g_bx2                                             r4
64 #define g_bx3                                             r5
65 #define b_base                                            r6
66 #define g_by                                              r8
67
// Sign masks of the blue gradients.
68 #define gs_bx                                             r7
69 #define gs_by                                             r10
70
// Absolute values of the blue gradients (same registers as g_bx / g_by).
71 #define ga_bx                                             g_bx
72 #define ga_by                                             g_by
73
// High and low words of the 64-bit umull results; both low words share r11.
74 #define gw_bx_h                                           g_bx
75 #define gw_by_h                                           g_by
76
77 #define gw_bx_l                                           r11
78 #define gw_by_l                                           gw_bx_l
79
// Output pointers and post-increment used for the result stores.
80 #define store_a                                           r0
81 #define store_b                                           r1
82 #define store_inc                                         r5
83
84
// Raw vertices as loaded: low half = packed attribute bytes, high half
// supplies x (16-bit lane 0) and y (16-bit lane 1).
85 #define v0                                                q0
86 #define uvrgb0                                            d0
87 #define x0_y0                                             d1
88
89 #define v1                                                q1
90 #define uvrgb1                                            d2
91 #define x1_y1                                             d3
92
93 #define v2                                                q2
94 #define uvrgb2                                            d4
95 #define x2_y2                                             d5
96
// "a" operand sets: { uvrg, xxxx } per vertex.
97 #define x0_ab                                             q3
98 #define uvrg_xxxx0                                        q3
99 #define uvrg0                                             d6
100 #define xxxx0                                             d7
101
102 #define x1_ab                                             q4
103 #define uvrg_xxxx1                                        q4
104 #define uvrg1                                             d8
105 #define xxxx1                                             d9
106
107 #define x2_ab                                             q5
108 #define uvrg_xxxx2                                        q5
109 #define uvrg2                                             d10
110 #define xxxx2                                             d11
111
// "b" operand sets: { yyyy, uvrg } per vertex.
112 #define y0_ab                                             q6
113 #define yyyy_uvrg0                                        q6
114 #define yyyy0                                             d12
115 #define uvrg0b                                            d13
116
117 #define y1_ab                                             q7
118 #define yyyy_uvrg1                                        q7
119 #define yyyy1                                             d14
120 #define uvrg1b                                            d15
121
122 #define y2_ab                                             q8
123 #define yyyy_uvrg2                                        q8
124 #define yyyy2                                             d16
125 #define uvrg2b                                            d17
126
// The four 16-bit difference vectors d0..d3 of the gradient formula.
127 #define d0_ab                                             q9
128 #define d0_a                                              d18
129 #define d0_b                                              d19
130
131 #define d1_ab                                             q10
132 #define d1_a                                              d20
133 #define d1_b                                              d21
134
135 #define d2_ab                                             q11
136 #define d2_a                                              d22
137 #define d2_b                                              d23
138
139 #define d3_ab                                             q12
140 #define d3_a                                              d24
141 #define d3_b                                              d25
142
// 32-bit gradient accumulators (x and y direction).
143 #define ga_uvrg_x                                         q1
144 #define ga_uvrg_y                                         q4
145
// Packed 16-bit deltas; these reuse the packed coordinate registers.
146 #define dx                                                x0_x1
147 #define dy                                                y0_y1
148 #define db                                                b0_b1
149
150 #define uvrg_base                                         q11
151
// Sign masks of the uvrg gradients.
152 #define gs_uvrg_x                                         q5
153 #define gs_uvrg_y                                         q6
154
// Final gradients and their uv / rg halves (same registers as ga_*).
155 #define g_uvrg_x                                          q1
156 #define ga_uv_x                                           d2
157 #define g_uv_x                                            d2
158 #define ga_rg_x                                           d3
159 #define g_rg_x                                            d3
160
161 #define g_uvrg_y                                          q4
162 #define ga_uv_y                                           d8
163 #define g_uv_y                                            d8
164 #define ga_rg_y                                           d9
165 #define g_rg_y                                            d9
166
// 64-bit wide products of gradient and area reciprocal.
167 #define gw_uv_x                                           q1
168 #define gw_rg_x                                           q2
169 #define gw_uv_y                                           q4
170 #define gw_rg_y                                           q3
171
// Winding mask: every lane holds -(triangle_winding).
172 #define w_mask                                            q9
173 #define w_mask_l                                          d18
174
// Per-lane shift applied to the wide products.
175 #define r_shift                                           q10
176
// Multiples 0..3 of the x gradient, stored interleaved with vst4.
177 #define uvrg_dx0                                          q0
178 #define uvrg_dx0l                                         d0
179 #define uvrg_dx0h                                         d1
180
181 #define uvrg_dx1                                          q1
182 #define uvrg_dx1l                                         d2
183 #define uvrg_dx1h                                         d3
184
185 #define uvrg_dx2                                          q2
186 #define uvrg_dx2l                                         d4
187 #define uvrg_dx2h                                         d5
188
189 #define uvrg_dx3                                          q3
190 #define uvrg_dx3l                                         d6
191 #define uvrg_dx3h                                         d7
192
// psx_gpu->uvrgb_phase broadcast to all four 32-bit lanes.
193 #define uvrgb_phase                                       q13
194
195 .align 4
196
// Platform glue. The first branch is used when __MACH__ is not defined, the
// second one for Mach-O targets.
197 #ifndef __MACH__
198
// function(name): declare name as a global symbol of type %function and
// open its label.
199 #define function(name)                                                         \
200   .global name;                                                                \
201   .type name, %function;                                                       \
202   name:                                                                        \
203
// Jump-table helpers, absolute flavour: JT_OP_REL expands to nothing, JT_OP
// passes its argument through unchanged and JTE yields the target itself.
204 #define JT_OP_REL(table_label, index_reg, temp)
205 #define JT_OP(x...) x
206 #define JTE(start, target) target
207
208 #else
209
// function(name): export the underscore-prefixed symbol and define both the
// plain and the underscore-prefixed label at the same address.
210 #define function(name)                                                         \
211   .globl _##name;                                                              \
212   name:                                                                        \
213   _##name:                                                                     \
214
// Jump-table helpers, relative flavour: JT_OP_REL loads the 32-bit entry
// table_label[index_reg] into temp and adds it to pc; JT_OP expands to
// nothing and JTE yields the offset (target - start).
// NOTE(review): in ARM state pc reads as the address of the add plus 8, so
// start presumably has to label that location - confirm at the use sites.
215 #define JT_OP_REL(table_label, index_reg, temp)                                \
216   adr temp, table_label;                                                       \
217   ldr temp, [ temp, index_reg, lsl #2 ];                                       \
218   add pc, pc, temp                                                             \
219
220 #define JT_OP(x...)
221 #define JTE(start, target) (target - start)
222
// Map external symbols referenced from this file to their underscore-prefixed
// names.
223 #define flush_render_block_buffer _flush_render_block_buffer
224 #define setup_sprite_untextured_simple _setup_sprite_untextured_simple
225 #define update_texture_8bpp_cache _update_texture_8bpp_cache
226
227 #endif
228
@ compute_all_gradients(psx_gpu, v_a, v_b, v_c)
@
@ Computes the x and y gradients of the u, v, r, g and b vertex attributes
@ over the triangle (v_a, v_b, v_c) and stores them, together with the
@ attribute base values, into psx_gpu starting at psx_gpu_uvrg_offset.
@
@ In:
229 @ r0: psx_gpu
230 @ r1: v_a
231 @ r2: v_b
232 @ r3: v_c
@
@ psx_gpu is read for triangle_area, uvrgb_phase and triangle_winding.
@ Each vertex is read as 16 bytes with a 128-bit aligned vld1: attribute
@ bytes u, v, r, g, b at offsets 0-4, 16-bit x at offset 8, 16-bit y at
@ offset 10.
@
@ Out: nothing in registers, results are written to psx_gpu.
@ Clobbers: r0-r3, r12, flags, q0-q3, q8-q13, d30-d31.
@ Preserved: r4-r11 and d8-d15 (saved on entry, restored on exit).
233
234 function(compute_all_gradients)
235   // First compute the triangle area reciprocal and shift. The division will
236   // happen concurrently with much of the work which follows.
237   @ r12 = psx_gpu->triangle_area
238   ldr r12, [ psx_gpu, #psx_gpu_triangle_area_offset ]
239   stmdb sp!, { r4 - r11, lr }
  @ q4 - q7 (d8 - d15) are written below; AAPCS requires the callee to
  @ preserve them, so save them next to the integer registers.
  vpush { d8 - d15 }
240
241   @ load exponent of 62 into upper half of double
242   movw r4, #0
243   clz r14, r12                       @ r14 = shift
244
245   movt r4, #((62 + 1023) << 4)
246   mov r12, r12, lsl r14              @ r12 = triangle_area_normalized
247
248   @ load area normalized into lower half of double
249   mov r5, r12, lsr #10
250   vmov.f64 d30, r5, r4               @ d30 = (1 << 62) + ta_n
251
252   movt r4, #((1022 + 31) << 4)
253   mov r5, r12, lsl #20
254
255   add r4, r4, r12, lsr #11
256   vmov.f64 d31, r5, r4
257
258   vdiv.f64 d30, d30, d31             @ d30 = ((1 << 62) + ta_n) / ta_n
259
260   // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
261   // ( d0       *  d1      ) - ( d2       *  d3      ) =
262   // ( m0                  ) - ( m1                  ) = gradient
263
264   // This is split to do 12 elements at a time over three sets: a, b, and c.
265   // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
266   // two of the slots are unused.
267
268   // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
269   // is g.
270
271   // First type is:  uvrg bxxx xxxx
272   // Second type is: yyyy ybyy uvrg
273   // Since x_a and y_c are the same the same variable is used for both.
274
275   vld1.u32 { v0 }, [ v_a, : 128 ]    @ v0 = { uvrg0, b0, x0, y0 }
276   ldrsh x0, [ v_a, #8 ]              @ load x0
277
278   vld1.u32 { v1 }, [ v_b, : 128 ]    @ v1 = { uvrg1, b1, x1, y1}
279   ldrh x1, [ v_b, #8 ]               @ load x1
280
281   vld1.u32 { v2 }, [ v_c, : 128 ]    @ v2 = { uvrg2, b2, x2, y2 }
282   ldrh x2, [ v_c, #8 ]               @ load x2
283
284   vmovl.u8 uvrg_xxxx0, uvrgb0        @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
285   ldrh y0, [ v_a, #10 ]              @ load y0
286
287   vmovl.u8 uvrg_xxxx1, uvrgb1        @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
288   ldrh y1, [ v_b, #10 ]              @ load y1
289
290   vmovl.u8 uvrg_xxxx2, uvrgb2        @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
291   ldrh y2, [ v_c, #10 ]              @ load y2
292
293   vmov.u8 uvrg0b, uvrg0              @ uvrg0b = { uv0, rg0 }
294   vdup.u16 xxxx0, x0_y0[0]           @ xxxx0 = { xx0, xx0 }
295
296   orr x1_x2, x1, x2, lsl #16         @ x1_x2 = { x1, x2 }
297   pkhbt x0_x1, x0, x1, lsl #16       @ x0_x1 = { x0, x1 }
298
299   vmov.u8 uvrg1b, uvrg1              @ uvrg1b = { uv1, rg1 }
300   vdup.u16 xxxx1, x1_y1[0]           @ xxxx1 = { xx1, xx1 }
301
302   vmov.u8 uvrg2b, uvrg2              @ uvrg2b = { uv2, rg2 }
303   vdup.u16 xxxx2, x2_y2[0]           @ xxxx2 = { xx2, xx2 }
304
305   ldrb b2, [ v_c, #4 ]               @ load b2
306   orr y0_y1, y0, y1, lsl #16         @ y0_y1 = { y0, y1 }
307
308   ldrb b1, [ v_b, #4 ]               @ load b1
309   orr y1_y2, y1, y2, lsl #16         @ y1_y2 = { y1, y2 }
310
311   vdup.u16 yyyy0, x0_y0[1]           @ yyyy0 = { yy0, yy0 }
312   vsub.s16 d0_ab, x1_ab, x0_ab
313
314   ldrb b0, [ v_a, #4 ]               @ load b0
315   orr b1_b2, b1, b2, lsl #16         @ b1_b2 = { b1, b2 }
316
317   vdup.u16 yyyy1, x1_y1[1]           @ yyyy1 = { yy1, yy1 }
318   vsub.s16 d2_ab, x2_ab, x1_ab
319
320   vdup.u16 yyyy2, x2_y2[1]           @ yyyy2 = { yy2, yy2 }
321   vsub.s16 d1_ab, y2_ab, y1_ab
322
323   orr b0_b1, b0, b1, lsl #16         @ b0_b1 = { b0, b1 }
324   ssub16 dx, x1_x2, x0_x1            @ dx = { x1 - x0, x2 - x1 }
325
326   ssub16 dy, y1_y2, y0_y1            @ dy = { y1 - y0, y2 - y1 }
327   ssub16 db, b1_b2, b0_b1            @ db = { b1 - b0, b2 - b1 }
328
329   vsub.s16 d3_ab, y1_ab, y0_ab
330   smusdx ga_by, dx, db               @ ga_by = ((x1 - x0) * (b2 - b1)) -
331                                      @         ((x2 - x1) * (b1 - b0))
332   vmull.s16 ga_uvrg_x, d0_a, d1_a
333   smusdx ga_bx, db, dy               @ ga_bx = ((b1 - b0) * (y2 - y1)) -
334                                      @         ((b2 - b1) * (y1 - y0))
335   vmlsl.s16 ga_uvrg_x, d2_a, d3_a
336   movs gs_bx, ga_bx, asr #31
337
338   vmull.s16 ga_uvrg_y, d0_b, d1_b
339   rsbmi ga_bx, ga_bx, #0
340
341   @ r12 = psx_gpu->uvrgb_phase
342   ldr r12, [ psx_gpu, #psx_gpu_uvrgb_phase_offset ]
343
344   vmlsl.s16 ga_uvrg_y, d2_b, d3_b
345   movs gs_by, ga_by, asr #31
346
347   vshr.u64 d0, d30, #22
348   add b_base, r12, b0, lsl #16
349
350   vdup.u32 uvrgb_phase, r12
351
352   rsbmi ga_by, ga_by, #0
353   vclt.s32 gs_uvrg_x, ga_uvrg_x, #0  @ gs_uvrg_x = ga_uvrg_x < 0
354
355   @ r12 = psx_gpu->triangle_winding
356   ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ]
357   vclt.s32 gs_uvrg_y, ga_uvrg_y, #0  @ gs_uvrg_y = ga_uvrg_y < 0
358
359   rsb r12, r12, #0                   @ r12 = -(triangle->winding)
360
361   vdup.u32 w_mask, r12               @ w_mask = { -w, -w, -w, -w }
362   sub r14, r14, #(62 - 12)           @ r14 = shift - (62 - FIXED_BITS)
363
364   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
365   vdup.u32 r_shift, r14              @ r_shift = { shift, shift, shift, shift }
366
367   vadd.u32 uvrg_base, uvrgb_phase
368   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
369
370   vmov area_r_s, s0                  @ area_r_s = triangle_reciprocal
371   vabs.s32 ga_uvrg_y, ga_uvrg_y      @ ga_uvrg_y = abs(ga_uvrg_y)
372
373   vmull.u32 gw_rg_x, ga_rg_x, d0[0]
374   vmull.u32 gw_uv_x, ga_uv_x, d0[0]
375   vmull.u32 gw_rg_y, ga_rg_y, d0[0]
376   vmull.u32 gw_uv_y, ga_uv_y, d0[0]
377
378   vshl.u64 gw_rg_x, gw_rg_x, r_shift
379   vshl.u64 gw_uv_x, gw_uv_x, r_shift
380   vshl.u64 gw_rg_y, gw_rg_y, r_shift
381   vshl.u64 gw_uv_y, gw_uv_y, r_shift
382
  @ Re-apply the sign: (value ^ mask) - mask negates the lanes whose mask is
  @ all ones. The mask is the gradient sign combined with the winding mask.
383   veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
384   vmovn.u64 g_uv_x, gw_uv_x
385
386   veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
387   vmovn.u64 g_rg_x, gw_rg_x
388
389   veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
390   vmovn.u64 g_uv_y, gw_uv_y
391
392   vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
393   vmovn.u64 g_rg_y, gw_rg_y
394
395   veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
396   mov ga_bx, ga_bx, lsl #13
397
398   vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
399   mov ga_by, ga_by, lsl #13
400
401   vdup.u32 x0_y0, x0
402   umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
403
404   vshl.u32 g_uvrg_x, g_uvrg_x, #4
405   vshl.u32 g_uvrg_y, g_uvrg_y, #4
406
407   umull gw_by_l, gw_by_h, ga_by, area_r_s
408   vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
409
410   eor gs_bx, gs_bx, r12
411   vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
412
413   veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
414   eor gs_by, gs_by, r12
415
416   rsb r11, r14, #0                   @ r11 = negative shift for scalar lsr
417   add store_a, psx_gpu, #psx_gpu_uvrg_offset
418
419   sub r11, r11, #(32 - 13)
420
421   add store_b, store_a, #16
422   mov store_inc, #32
423
424   vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
425   vst1.u32 { uvrg_base }, [ store_a, : 128 ], store_inc
426
427   vst1.u32 { uvrg_dx1 }, [ store_b, : 128 ], store_inc
428   mov g_bx, gw_bx_h, lsr r11
429
430   vst1.u32 { g_uvrg_y }, [ store_a, : 128 ], store_inc
431   mov g_by, gw_by_h, lsr r11
432
433   vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l },                     \
434    [ store_b, : 128 ], store_inc
435   eor g_bx, g_bx, gs_bx
436
437   vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h },                     \
438    [ store_b, : 128 ], store_inc
439   sub g_bx, g_bx, gs_bx
440
441   lsl g_bx, g_bx, #4
442   eor g_by, g_by, gs_by
443
444   mls b_base, g_bx, x0, b_base
445   sub g_by, g_by, gs_by
446
447   lsl g_by, g_by, #4
448   mov g_bx0, #0
449
450   add g_bx2, g_bx, g_bx
451   add g_bx3, g_bx, g_bx2
452
453   stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
454
  @ Restore the callee-saved NEON registers pushed on entry, then return.
  vpop { d8 - d15 }
455   ldmia sp!, { r4 - r11, pc }
456
457
// Register aliases used by the setup_spans macros that follow.
//
// As above, several names share one physical register (for example y_a/y_b/
// y_c reuse the v_a/v_b/v_c argument registers, temp and clip are both r14,
// and the edge_alt_* names reuse the x_a/x_b/x_c registers). A name is only
// valid until another name on the same register is written.

// Incoming arguments (same assignment as for compute_all_gradients).
458 #define psx_gpu                                  r0
459 #define v_a                                      r1
460 #define v_b                                      r2
461 #define v_c                                      r3
462
463 #define temp                                     r14
464
// Vertex coordinates; the y values replace the vertex pointers.
465 #define x_a                                      r4
466 #define x_b                                      r5
467 #define x_c                                      r6
468 #define y_a                                      r1
469 #define y_b                                      r2
470 #define y_c                                      r3
471
// Edge heights; height_major and height share r9.
472 #define height_minor_a                           r7
473 #define height_minor_b                           r8
474 #define height_major                             r9
475 #define height                                   r9
476
477 #define reciprocal_table_ptr                     r10
478
// Scalar state of the alternate edge (computed on the integer side).
479 #define edge_alt_low                             r4
480 #define edge_alt_high                            r5
481 #define edge_dx_dy_alt                           r6
482 #define edge_shift_alt                           r10
483
484 #define edge_dx_dy_alt_low                       r4
485 #define edge_dx_dy_alt_high                      r5
486
// Addresses of the per-span output arrays inside psx_gpu.
487 #define span_edge_data                           r4
488 #define span_uvrg_offset                         r5
489 #define span_b_offset                            r6
490
491 #define clip                                     r14
492
// Blue interpolant and its per-line step.
493 #define b                                        r11
494 #define b_dy                                     r12
495
496
// Alternate edge state on the NEON side.
497 #define alternate_x                              q0
498 #define alternate_dx_dy                          q1
499 #define alternate_x_32                           q2
500
501 #define alternate_x_low                          d0
502 #define alternate_x_high                         d1
503 #define alternate_dx_dy_low                      d2
504 #define alternate_dx_dy_high                     d3
505 #define alternate_x_32_low                       d4
506 #define alternate_x_32_high                      d5
507
// Left / right edge positions, steps and viewport limits.
508 #define left_x                                   q3
509 #define right_x                                  q4
510 #define left_dx_dy                               q5
511 #define right_dx_dy                              q6
512 #define left_edge                                q7
513 #define right_edge                               q8
514
515 #define left_x_low                               d6
516 #define left_x_high                              d7
517 #define right_x_low                              d8
518 #define right_x_high                             d9
519 #define left_dx_dy_low                           d10
520 #define left_dx_dy_high                          d11
521 #define right_dx_dy_low                          d12
522 #define right_dx_dy_high                         d13
523 #define left_edge_low                            d14
524 #define left_edge_high                           d15
525 #define right_edge_low                           d16
526 #define right_edge_high                          d17
527
528 #define y_mid_point                              d18
529 #define c_0x0004                                 d19
530
// Note: c_0x0001 (q13) covers d26/d27, which are also c_0xFFFE and c_0x0007.
531 #define left_right_x_16                          q11
532 #define span_shifts_y                            q12
533 #define c_0x0001                                 q13
534
535 #define span_shifts                              d24
536 #define y_x4                                     d25
537 #define c_0xFFFE                                 d26
538 #define c_0x0007                                 d27
539
540 #define left_right_x_16_low                      d22
541 #define left_right_x_16_high                     d23
542
// Interpolant vector and its per-line step.
543 #define uvrg                                     q14
544 #define uvrg_dy                                  q15
545
546 #define alternate_x_16                           d4
547
// Clip amount broadcast to NEON lanes (shares q3 with left_x).
548 #define v_clip                                   q3
549 #define v_clip_low                               d6
550
551 #define right_x_32                               q10
552 #define left_x_32                                q11
553 #define alternate_select                         d24
554
555 #define right_x_32_low                           d20
556 #define right_x_32_high                          d21
557 #define left_x_32_low                            d22
558 #define left_x_32_high                           d23
559
// Working set of the compute_edge_delta_* macros.
560 #define edges_xy                                 q0
561 #define edges_dx_dy                              d2
562 #define edge_shifts                              d3
563 #define edge_shifts_64                           q2
564
565 #define edges_xy_left                            d0
566 #define edges_xy_right                           d1
567
568 #define height_reciprocals                       d6
569 #define heights                                  d7
570
571 #define widths                                   d8
572 #define c_0x01                                   d9
573 #define x_starts                                 d10
574 #define x_ends                                   d11
575
576 #define heights_b                                d12
577 #define edges_dx_dy_64                           q10
578
579 #define edges_dx_dy_64_left                      d20
580 #define edges_dx_dy_64_right                     d21
581
582
// setup_spans_prologue(): entry sequence shared by the setup_spans code.
//  - pushes r4-r11 and lr
//  - loads the signed 16-bit x (offset 8) and y (offset 10) of v_a, v_b, v_c.
//    y_a/y_b/y_c are r1/r2/r3, i.e. the vertex pointers themselves, so each
//    y load destroys its pointer; all x loads are issued first.
//  - loads uvrg and uvrg_dy (128 bits each) from psx_gpu, using temp (r14)
//    as the address register
//  - loads reciprocal_table_ptr from psx_gpu
//  - sets both 32-bit lanes of c_0x01 (d9) to 1
// NOTE(review): d9 lies in the AAPCS callee-saved range d8-d15 and nothing is
// saved here - confirm whether the matching epilogue or the callers handle it.
583 #define setup_spans_prologue()                                                 \
584   stmdb sp!, { r4 - r11, lr };                                                 \
585                                                                                \
586   ldrsh x_a, [ v_a, #8 ];                                                      \
587   ldrsh x_b, [ v_b, #8 ];                                                      \
588   ldrsh x_c, [ v_c, #8 ];                                                      \
589   ldrsh y_a, [ v_a, #10 ];                                                     \
590   ldrsh y_b, [ v_b, #10 ];                                                     \
591   ldrsh y_c, [ v_c, #10 ];                                                     \
592                                                                                \
593   add temp, psx_gpu, #psx_gpu_uvrg_offset;                                     \
594   vld1.32 { uvrg }, [ temp ];                                                  \
595   add temp, psx_gpu, #psx_gpu_uvrg_dy_offset;                                  \
596   vld1.32 { uvrg_dy }, [ temp ];                                               \
597   ldr reciprocal_table_ptr, [ psx_gpu, #psx_gpu_reciprocal_table_ptr_offset ]; \
598                                                                                \
599   vmov.u32 c_0x01, #0x01                                                       \
600
// setup_spans_load_b(): loads the blue interpolant b (r11) and its per-line
// step b_dy (r12) from psx_gpu. r11/r12 are also used as scratch
// (height_reciprocal_alt / height_b_alt) by compute_edge_delta_x3, so the
// values loaded here do not survive that macro.
601 #define setup_spans_load_b()                                                   \
602   ldr b, [ psx_gpu, #psx_gpu_b_offset ];                                       \
603   ldr b_dy, [ psx_gpu, #psx_gpu_b_dy_offset ]                                  \
604
// setup_spans_prologue_b(): second part of the setup.
//  - computes the addresses of span_uvrg_offset, span_edge_data and
//    span_b_offset inside psx_gpu (r5, r4, r6; this replaces x_b, x_a, x_c)
//  - broadcasts the 16-bit viewport_start_x into every lane of left_edge and
//    viewport_end_x + 1 into every lane of right_edge
//  - builds the 16-bit lane constants c_0x0004, c_0x0007 and c_0xFFFE
// c_0x0001 (q13) occupies d26/d27, the same registers as c_0xFFFE and
// c_0x0007, so it is only valid up to the right_edge increment; the last two
// instructions overwrite it.
// NOTE(review): left_edge (q7) is in the AAPCS callee-saved range d8-d15 -
// confirm that it is saved somewhere outside this macro.
605 #define setup_spans_prologue_b()                                               \
606   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
607   add temp, psx_gpu, #psx_gpu_viewport_start_x_offset;                         \
608                                                                                \
609   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
610   vmov.u16 c_0x0004, #0x0004;                                                  \
611                                                                                \
612   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
613   vmov.u16 c_0x0001, #0x0001;                                                  \
614                                                                                \
615   vld1.u16 { left_edge_low[], left_edge_high[] }, [ temp ];                    \
616   add temp, psx_gpu, #psx_gpu_viewport_end_x_offset;                           \
617                                                                                \
618   vld1.u16 { right_edge_low[], right_edge_high[] }, [ temp ];                  \
619   vadd.u16 right_edge, right_edge, c_0x0001;                                   \
620                                                                                \
621   vmov.u16 c_0x0007, #0x0007;                                                  \
622   vmvn.u16 c_0xFFFE, #0x0001                                                   \
623
624
// compute_edge_delta_x2(): start position and per-line step for two edges
// that share the same height.
//
// Expects x_starts / x_ends (two 32-bit lanes each), height, c_0x01 and
// reciprocal_table_ptr to be set up by the caller.
//
//   entry              = reciprocal_table_ptr[height]      (32-bit word)
//   height_reciprocals = entry >> 10                        (both lanes)
//   edge_shifts        = entry with bits 5-7 of each 16-bit lane cleared
//   widths             = x_ends - x_starts
//   heights_b          = (heights - 1) + x_starts * heights
//   edges_dx_dy        = widths * height_reciprocals        (32-bit result)
//   edges_xy           = heights_b * height_reciprocals     (64-bit result)
//
// Clobbers temp (r14). The shift in edge_shifts is not applied here.
625 #define compute_edge_delta_x2()                                                \
626   ldr temp, [ reciprocal_table_ptr, height, lsl #2 ];                          \
627                                                                                \
628   vdup.u32 heights, height;                                                    \
629   vsub.u32 widths, x_ends, x_starts;                                           \
630                                                                                \
631   vdup.u32 edge_shifts, temp;                                                  \
632   vsub.u32 heights_b, heights, c_0x01;                                         \
633   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
634                                                                                \
635   vmla.s32 heights_b, x_starts, heights;                                       \
636   vbic.u16 edge_shifts, #0xE0;                                                 \
637   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
638   vmull.s32 edges_xy, heights_b, height_reciprocals                            \
639
// Scratch registers for the alternate edge in compute_edge_delta_x3.
// width_alt shares r6 with x_c and edge_dx_dy_alt; the other two share
// r11/r12 with b and b_dy.
640 #define width_alt                 r6
641 #define height_reciprocal_alt     r11
642 #define height_b_alt              r12
643
// compute_edge_delta_x3(start_c, height_a, height_b): like the x2 variant,
// but the two NEON lanes use different heights (height_a in lane 0, height_b
// in lane 1) and a third, "alternate" edge is computed on the integer side.
// NEON and integer instructions are interleaved pairwise.
//
// NEON side (per lane, using that lane's table entry):
//   height_reciprocals = entry >> 10
//   edge_shifts        = entry with bits 5-7 of each 16-bit lane cleared
//   widths             = x_ends - x_starts
//   heights_b          = (heights - 1) + x_starts * heights
//   edges_dx_dy        = widths * height_reciprocals
//   edges_xy           = heights_b * height_reciprocals     (64-bit result)
//
// Integer side (alternate edge, always uses height_minor_b):
//   entry                 = reciprocal_table_ptr[height_minor_b]
//   height_reciprocal_alt = entry >> 10
//   edge_shift_alt        = entry & 0x1F
//   width_alt             = x_c - start_c
//   height_b_alt          = (height_minor_b - 1) + height_minor_b * start_c
//   edge_dx_dy_alt        = width_alt * height_reciprocal_alt
//   edge_alt_high:low     = height_b_alt * height_reciprocal_alt (signed 64)
//
// Register side effects that follow from the aliases: edge_shift_alt is r10,
// so the table pointer is overwritten by the third ldr; x_c (r6), r11 and r12
// are overwritten; the final smull overwrites r4/r5 (x_a / x_b). temp (r14)
// is clobbered.
644 #define compute_edge_delta_x3(start_c, height_a, height_b)                     \
645   vmov heights, height_a, height_b;                                            \
646   ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ];                        \
647   vmov.u32 edge_shifts[0], temp;                                               \
648   ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ];                        \
649   vmov.u32 edge_shifts[1], temp;                                               \
650   ldr edge_shift_alt, [ reciprocal_table_ptr, height_minor_b, lsl #2 ];        \
651                                                                                \
652   vsub.u32 widths, x_ends, x_starts;                                           \
653   sub width_alt, x_c, start_c;                                                 \
654                                                                                \
655   vsub.u32 heights_b, heights, c_0x01;                                         \
656   sub height_b_alt, height_minor_b, #1;                                        \
657                                                                                \
658   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
659   lsr height_reciprocal_alt, edge_shift_alt, #10;                              \
660                                                                                \
661   vmla.s32 heights_b, x_starts, heights;                                       \
662   mla height_b_alt, height_minor_b, start_c, height_b_alt;                     \
663                                                                                \
664   vbic.u16 edge_shifts, #0xE0;                                                 \
665   and edge_shift_alt, edge_shift_alt, #0x1F;                                   \
666                                                                                \
667   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
668   mul edge_dx_dy_alt, width_alt, height_reciprocal_alt;                        \
669                                                                                \
670   vmull.s32 edges_xy, heights_b, height_reciprocals;                           \
671   smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt       \
672
673
674 #define setup_spans_adjust_y_up()                                              \
675   vsub.u32 y_x4, y_x4, c_0x0004                                                \
676
677 #define setup_spans_adjust_y_down()                                            \
678   vadd.u32 y_x4, y_x4, c_0x0004                                                \
679
680 #define setup_spans_adjust_interpolants_up()                                   \
681   vsub.u32 uvrg, uvrg, uvrg_dy;                                                \
682   sub b, b, b_dy                                                               \
683
/*
 * Step the per-scanline interpolants one line in the "down" direction:
 * uvrg += uvrg_dy (NEON, 32-bit lanes) and b += b_dy (ARM register).
 */
684 #define setup_spans_adjust_interpolants_down()                                 \
685   vadd.u32 uvrg, uvrg, uvrg_dy;                                                \
686   add b, b, b_dy                                                               \
687
688
/*
 * Skip `clip` scanlines forward in the interpolants:
 *   b    += b_dy * clip        (ARM mla)
 *   uvrg += uvrg_dy * v_clip   (NEON, v_clip holds clip in every lane; it is
 *                               set up by setup_spans_clip())
 */
689 #define setup_spans_clip_interpolants_increment()                              \
690   mla b, b_dy, clip, b;                                                        \
691   vmla.s32 uvrg, uvrg_dy, v_clip                                               \
692
/*
 * Skip `clip` scanlines backward in the interpolants:
 *   b    -= b_dy * clip        (ARM mls)
 *   uvrg -= uvrg_dy * v_clip   (NEON, v_clip holds clip in every lane; it is
 *                               set up by setup_spans_clip())
 */
693 #define setup_spans_clip_interpolants_decrement()                              \
694   mls b, b_dy, clip, b;                                                        \
695   vmls.s32 uvrg, uvrg_dy, v_clip                                               \
696
/*
 * Advance the alternate edge past the clipped scanlines: the 64-bit value
 * held in edge_alt_high:edge_alt_low is incremented by the signed product
 * edge_dx_dy_alt * clip.
 */
697 #define setup_spans_clip_alternate_yes()                                       \
698   smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip                      \
699
/* No alternate edge in use: nothing to advance when clipping. */
700 #define setup_spans_clip_alternate_no()                                        \
701
/*
 * setup_spans_clip(direction, alternate_active)
 *
 * Skip `clip` scanlines that fall outside the viewport.
 *   direction        - increment / decrement; pasted onto
 *                      setup_spans_clip_interpolants_ to pick the sign used
 *                      for the uvrg / b interpolants.
 *   alternate_active - yes / no; pasted onto setup_spans_clip_alternate_ to
 *                      advance (or ignore) the alternate edge.
 *
 * Broadcasts clip into v_clip, then advances the two main edges with a
 * widening multiply-accumulate: edges_xy += edges_dx_dy * v_clip_low.
 * The edges are always advanced with a positive multiply; only the
 * interpolants depend on `direction`.
 */
702 #define setup_spans_clip(direction, alternate_active)                          \
703   vdup.u32 v_clip, clip;                                                       \
704   setup_spans_clip_alternate_##alternate_active();                             \
705   setup_spans_clip_interpolants_##direction();                                 \
706   vmlal.s32 edges_xy, edges_dx_dy, v_clip_low                                  \
707
708
/*
 * setup_spans_adjust_edges_alternate_no(left_index, right_index)
 *
 * Convert the two main edges into the form used by the span loop.
 *   left_index / right_index - suffixes pasted onto edges_xy_ and
 *                              edges_dx_dy_64_ to choose which lane feeds
 *                              the left and the right edge.
 *
 * Steps:
 *   1. Sign-extend edge_shifts and edges_dx_dy to 64-bit lanes.
 *   2. Shift edges_xy and the widened dx/dy by the per-edge shift (vshl with
 *      a register shift count).
 *   3. Copy the selected lanes into left_x / right_x and replicate the step
 *      into both halves of left_dx_dy / right_dx_dy.
 *   4. Make the high half of each x register one step ahead of the low half
 *      and double the step, so each register pair covers two consecutive
 *      scanlines per vadd in the span loop.
 */
709 #define setup_spans_adjust_edges_alternate_no(left_index, right_index)         \
710   vmovl.s32 edge_shifts_64, edge_shifts;                                       \
711   vmovl.s32 edges_dx_dy_64, edges_dx_dy;                                       \
712                                                                                \
713   vshl.s64 edges_xy, edges_xy, edge_shifts_64;                                 \
714   vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64;                     \
715                                                                                \
716   vmov left_x_low, edges_xy_##left_index;                                      \
717   vmov right_x_low, edges_xy_##right_index;                                    \
718                                                                                \
719   vmov left_dx_dy_low, edges_dx_dy_64_##left_index;                            \
720   vmov left_dx_dy_high, edges_dx_dy_64_##left_index;                           \
721   vmov right_dx_dy_low, edges_dx_dy_64_##right_index;                          \
722   vmov right_dx_dy_high, edges_dx_dy_64_##right_index;                         \
723                                                                                \
724   vadd.u64 left_x_high, left_x_low, left_dx_dy_low;                            \
725   vadd.u64 right_x_high, right_x_low, right_dx_dy_low;                         \
726                                                                                \
727   vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy;                                 \
728   vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy                               \
729
730
/*
 * setup_spans_adjust_edges_alternate_yes(left_index, right_index)
 *
 * Same as setup_spans_adjust_edges_alternate_no(), and additionally prepares
 * the alternate edge, which lives in ARM registers up to this point:
 *   - y_mid_point gets y_b in every 16-bit lane (compared against y_x4 by
 *     setup_spans_y_select_*).
 *   - edge_alt_high:edge_alt_low is shifted left by edge_shift_alt as one
 *     64-bit value (temp = 32 - edge_shift_alt carries bits across the
 *     word boundary) and moved into alternate_x_low.
 *   - edge_dx_dy_alt is widened to a signed 64-bit value shifted left by
 *     edge_shift_alt and replicated into both halves of alternate_dx_dy.
 *   - As for the main edges, alternate_x_high is set one step ahead of
 *     alternate_x_low and the step is doubled.
 */
731 #define setup_spans_adjust_edges_alternate_yes(left_index, right_index)        \
732   setup_spans_adjust_edges_alternate_no(left_index, right_index);              \
733                                                                                \
734   vdup.u16 y_mid_point, y_b;                                                   \
735   rsb temp, edge_shift_alt, #32;                                               \
736                                                                                \
737   lsl edge_alt_high, edge_alt_high, edge_shift_alt;                            \
738   orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp;                    \
739   lsl edge_alt_low, edge_alt_low, edge_shift_alt;                              \
740   vmov alternate_x_low, edge_alt_low, edge_alt_high;                           \
741                                                                                \
742   asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp;                               \
743   lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt;                      \
744   vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high;           \
745   vmov alternate_dx_dy_high, alternate_dx_dy_low;                              \
746                                                                                \
747   vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low;             \
748   vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy                   \
749
750
/*
 * Build the per-scanline mask for the "up" walk: a lane of alternate_select
 * is all ones where the scanline in y_x4 is less than y_mid_point (signed
 * 16-bit compare), i.e. where the alternate edge should be used.
 */
751 #define setup_spans_y_select_up()                                              \
752   vclt.s16 alternate_select, y_x4, y_mid_point                                 \
753
/*
 * Build the per-scanline mask for the "down" walk: a lane of
 * alternate_select is all ones where the scanline in y_x4 is greater than
 * y_mid_point (signed 16-bit compare), i.e. where the alternate edge should
 * be used.
 */
754 #define setup_spans_y_select_down()                                            \
755   vcgt.s16 alternate_select, y_x4, y_mid_point                                 \
756
757
/*
 * Replace the left x of every scanline flagged in alternate_select with the
 * alternate edge's x (vbit inserts alternate_x_16 bits where the mask is 1).
 */
758 #define setup_spans_alternate_select_left()                                    \
759   vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select               \
760
/*
 * Replace the right x of every scanline flagged in alternate_select with the
 * alternate edge's x (vbit inserts alternate_x_16 bits where the mask is 1).
 */
761 #define setup_spans_alternate_select_right()                                   \
762   vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select              \
763
764
/*
 * setup_spans_set_x4_alternate_yes(alternate, direction)
 *
 * Emit the records for four consecutive scanlines when an alternate edge is
 * in use.
 *   alternate - left / right; pasted onto setup_spans_alternate_select_ to
 *               choose which side the alternate edge overrides.
 *   direction - up / down; pasted onto setup_spans_y_select_,
 *               setup_spans_adjust_interpolants_ and setup_spans_adjust_y_.
 *
 * Per invocation:
 *   - The upper 32 bits of each 64-bit edge position are taken (vshrn #32)
 *     for two scanlines, the edges are stepped, and the same is done for the
 *     next two scanlines; results are narrowed to 16 bits.
 *   - The alternate x replaces the selected side on the scanlines flagged by
 *     the y_mid_point comparison.
 *   - Both x values are clamped to [left_edge, right_edge].
 *   - uvrg (16 bytes) and b (4 bytes) are stored once per scanline, stepping
 *     the interpolants after each store.  The stores are interleaved with
 *     the x arithmetic; keep the instruction order as is.
 *   - From width = right - left: the high half becomes (width + 7) >> 3 and
 *     span_shifts becomes 0xFFFE << ((width + 7) & 7).
 *   - vst4.u16 writes four 8-byte records to span_edge_data.  This matches
 *     the edge_data_*_offset layout (left_x, num_blocks, right_mask, y),
 *     assuming span_shifts_y pairs span_shifts with y_x4 - the register
 *     aliases are defined outside this section, TODO confirm.
 *   - y_x4 is moved on to the next group of four scanlines.
 */
765 #define setup_spans_set_x4_alternate_yes(alternate, direction)                 \
766   vshrn.s64 alternate_x_32_low, alternate_x, #32;                              \
767   vshrn.s64 left_x_32_low, left_x, #32;                                        \
768   vshrn.s64 right_x_32_low, right_x, #32;                                      \
769                                                                                \
770   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
771   vadd.u64 left_x, left_x, left_dx_dy;                                         \
772   vadd.u64 right_x, right_x, right_dx_dy;                                      \
773                                                                                \
774   vshrn.s64 alternate_x_32_high, alternate_x, #32;                             \
775   vshrn.s64 left_x_32_high, left_x, #32;                                       \
776   vshrn.s64 right_x_32_high, right_x, #32;                                     \
777                                                                                \
778   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
779   vadd.u64 left_x, left_x, left_dx_dy;                                         \
780   vadd.u64 right_x, right_x, right_dx_dy;                                      \
781                                                                                \
782   vmovn.u32 alternate_x_16, alternate_x_32;                                    \
783   setup_spans_y_select_##direction();                                          \
784   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
785                                                                                \
786   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
787   setup_spans_alternate_select_##alternate();                                  \
788                                                                                \
789   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
790   str b, [ span_b_offset ], #4;                                                \
791   setup_spans_adjust_interpolants_##direction();                               \
792                                                                                \
793   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
794                                                                                \
795   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
796   str b, [ span_b_offset ], #4;                                                \
797   setup_spans_adjust_interpolants_##direction();                               \
798                                                                                \
799   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
800                                                                                \
801   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
802   str b, [ span_b_offset ], #4;                                                \
803   setup_spans_adjust_interpolants_##direction();                               \
804                                                                                \
805   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
806   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
807   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
808                                                                                \
809   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
810   str b, [ span_b_offset ], #4;                                                \
811   setup_spans_adjust_interpolants_##direction();                               \
812                                                                                \
813   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
814   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
815                                                                                \
816   vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!;            \
817                                                                                \
818   setup_spans_adjust_y_##direction()                                           \
819
820
/*
 * setup_spans_set_x4_alternate_no(alternate, direction)
 *
 * Emit the records for four consecutive scanlines using only the two main
 * edges.
 *   alternate - unused by this variant (callers pass `none`).
 *   direction - up / down; pasted onto setup_spans_adjust_interpolants_ and
 *               setup_spans_adjust_y_.
 *
 * Per invocation:
 *   - The upper 32 bits of the 64-bit left/right positions are taken
 *     (vshrn #32) for two scanlines, the edges are stepped, the same is done
 *     for the next two scanlines, and the results are narrowed to 16 bits.
 *   - Both x values are clamped to [left_edge, right_edge].
 *   - uvrg (16 bytes) and b (4 bytes) are stored once per scanline, stepping
 *     the interpolants after each store.  The stores are interleaved with
 *     the x arithmetic; keep the instruction order as is.
 *   - From width = right - left: the high half becomes (width + 7) >> 3 and
 *     span_shifts becomes 0xFFFE << ((width + 7) & 7).
 *   - vst4.u16 writes four 8-byte records to span_edge_data.  This matches
 *     the edge_data_*_offset layout (left_x, num_blocks, right_mask, y),
 *     assuming span_shifts_y pairs span_shifts with y_x4 - the register
 *     aliases are defined outside this section, TODO confirm.
 *   - y_x4 is moved on to the next group of four scanlines.
 */
821 #define setup_spans_set_x4_alternate_no(alternate, direction)                  \
822   vshrn.s64 left_x_32_low, left_x, #32;                                        \
823   vshrn.s64 right_x_32_low, right_x, #32;                                      \
824                                                                                \
825   vadd.u64 left_x, left_x, left_dx_dy;                                         \
826   vadd.u64 right_x, right_x, right_dx_dy;                                      \
827                                                                                \
828   vshrn.s64 left_x_32_high, left_x, #32;                                       \
829   vshrn.s64 right_x_32_high, right_x, #32;                                     \
830                                                                                \
831   vadd.u64 left_x, left_x, left_dx_dy;                                         \
832   vadd.u64 right_x, right_x, right_dx_dy;                                      \
833                                                                                \
834   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
835   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
836                                                                                \
837   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
838   str b, [ span_b_offset ], #4;                                                \
839   setup_spans_adjust_interpolants_##direction();                               \
840                                                                                \
841   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
842                                                                                \
843   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
844   str b, [ span_b_offset ], #4;                                                \
845   setup_spans_adjust_interpolants_##direction();                               \
846                                                                                \
847   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
848                                                                                \
849   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
850   str b, [ span_b_offset ], #4;                                                \
851   setup_spans_adjust_interpolants_##direction();                               \
852                                                                                \
853   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
854   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
855   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
856                                                                                \
857   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
858   str b, [ span_b_offset ], #4;                                                \
859   setup_spans_adjust_interpolants_##direction();                               \
860                                                                                \
861   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
862   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
863                                                                                \
864   vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!;            \
865                                                                                \
866   setup_spans_adjust_y_##direction()                                           \
867
868
/*
 * Scratch register pair (r11 = low word, r12 = high word) that receives the
 * 64-bit product computed by setup_spans_alternate_adjust_yes().
 */
869 #define edge_adjust_low           r11
870 #define edge_adjust_high          r12
871
/*
 * Move the alternate edge's 64-bit position back by height_minor_a steps:
 *   edge_alt_high:edge_alt_low -= (s64)edge_dx_dy_alt * height_minor_a
 * The subs/sbc pair propagates the borrow from the low to the high word.
 * Clobbers edge_adjust_low/edge_adjust_high (r11/r12) and the flags.
 */
872 #define setup_spans_alternate_adjust_yes()                                     \
873   smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a;     \
874   subs edge_alt_low, edge_alt_low, edge_adjust_low;                            \
875   sbc edge_alt_high, edge_alt_high, edge_adjust_high                           \
876
/* No alternate edge in use: nothing to adjust. */
877 #define setup_spans_alternate_adjust_no()                                      \
878
879
/*
 * setup_spans_down(left_index, right_index, alternate, alternate_active)
 *
 * Generate span records walking down the screen from y_a for `height`
 * scanlines.
 *   left_index / right_index - lane suffixes forwarded to
 *                              setup_spans_adjust_edges_alternate_*.
 *   alternate                - left / right / none; side overridden by the
 *                              alternate edge (forwarded to set_x4).
 *   alternate_active         - yes / no; selects the alternate-edge variants
 *                              of the helper macros.
 *
 * Flow:
 *   - Bottom clip: if y_c is below viewport_end_y, height is reduced by
 *     (y_c - viewport_end_y) and then incremented by one.
 *   - Top clip: clip = viewport_start_y - y_a; if positive, height and y_a
 *     are adjusted and setup_spans_clip(increment, ...) skips those lines.
 *   - Label 0: nothing is emitted when height <= 0 (branch to label 1).
 *   - y_x4 is built as four 16-bit lanes y_a, y_a + 1, y_a + 2, y_a + 3.
 *   - height is stored to psx_gpu->num_spans (halfword).
 *   - Label 2: set_x4 emits four scanlines per iteration until height is
 *     used up.  Because output is produced in groups of four, up to three
 *     records beyond num_spans may be written.
 *
 * setup_spans_load_b() and setup_spans_prologue_b() are defined outside this
 * section.  Local labels 0, 1 and 2 are used; y_a, y_c, temp, clip and
 * height are modified.
 */
880 #define setup_spans_down(left_index, right_index, alternate, alternate_active) \
881   setup_spans_alternate_adjust_##alternate_active();                           \
882   setup_spans_load_b();                                                        \
883                                                                                \
884   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ];                     \
885   subs y_c, y_c, temp;                                                         \
886   subgt height, height, y_c;                                                   \
887   addgt height, height, #1;                                                    \
888                                                                                \
889   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ];                   \
890   subs clip, temp, y_a;                                                        \
891   ble 0f;                                                                      \
892                                                                                \
893   sub height, height, clip;                                                    \
894   add y_a, y_a, clip;                                                          \
895   setup_spans_clip(increment, alternate_active);                               \
896                                                                                \
897  0:                                                                            \
898   cmp height, #0;                                                              \
899   ble 1f;                                                                      \
900                                                                                \
901   orr temp, y_a, y_a, lsl #16;                                                 \
902   add temp, temp, #(1 << 16);                                                  \
903   add y_a, temp, #2;                                                           \
904   add y_a, y_a, #(2 << 16);                                                    \
905   vmov y_x4, temp, y_a;                                                        \
906                                                                                \
907   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
908    right_index);                                                               \
909   setup_spans_prologue_b();                                                    \
910                                                                                \
911   strh height, [ psx_gpu, #psx_gpu_num_spans_offset ];                         \
912                                                                                \
913  2:                                                                            \
914   setup_spans_set_x4_alternate_##alternate_active(alternate, down);            \
915   subs height, height, #4;                                                     \
916   bhi 2b;                                                                      \
917                                                                                \
918  1:                                                                            \
919
920
/*
 * Advance the alternate edge by one step before the "up" walk starts:
 *   edge_alt_high:edge_alt_low += (s64)edge_dx_dy_alt
 * `asr #31` supplies the sign extension of edge_dx_dy_alt for the high word;
 * adds/adc carry from the low to the high word.
 */
921 #define setup_spans_alternate_pre_increment_yes()                              \
922   adds edge_alt_low, edge_alt_low, edge_dx_dy_alt;                             \
923   adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31                    \
924
/* No alternate edge in use: nothing to pre-increment. */
925 #define setup_spans_alternate_pre_increment_no()                               \
926
927
/*
 * Conditionally drop one scanline from height.  Executes only when the
 * flags are "le"; in setup_spans_up() those flags come from the preceding
 * `subs temp, temp, y_c` (viewport_start_y - y_c <= 0).
 */
928 #define setup_spans_up_decrement_yes()                                         \
929   suble height, height, #1                                                     \
930
/* Variant without an alternate edge: height is left unchanged. */
931 #define setup_spans_up_decrement_no()                                          \
932
933
/*
 * setup_spans_up(left_index, right_index, alternate, alternate_active)
 *
 * Generate span records walking up the screen, starting at y_a - 1, for
 * `height` scanlines.
 *   left_index / right_index - lane suffixes forwarded to
 *                              setup_spans_adjust_edges_alternate_*.
 *   alternate                - left / right / none; side overridden by the
 *                              alternate edge (forwarded to set_x4).
 *   alternate_active         - yes / no; selects the alternate-edge variants
 *                              of the helper macros.
 *
 * Flow:
 *   - y_a is decremented by one.
 *   - Top clip: temp = viewport_start_y - y_c; if positive, height is
 *     reduced by it; otherwise setup_spans_up_decrement_* may drop one line.
 *   - Bottom clip: clip = y_a - viewport_end_y; if positive, height and y_a
 *     are adjusted and setup_spans_clip(decrement, ...) skips those lines.
 *   - Label 0: nothing is emitted when height <= 0 (branch to label 1).
 *   - y_x4 is built as four 16-bit lanes y_a, y_a - 1, y_a - 2, y_a - 3.
 *   - The main edges are advanced by one step (vaddw) and the alternate edge
 *     by one step (pre_increment) before conversion for the span loop; the
 *     interpolants are stepped up once as well.
 *   - height is stored to psx_gpu->num_spans (halfword).
 *   - Label 2: set_x4 emits four scanlines per iteration until height is
 *     used up, so up to three records beyond num_spans may be written.
 *
 * NOTE(review): the viewport bounds are loaded with ldrh (zero-extended)
 * here, while setup_spans_down() uses ldrsh.  The two agree only if the
 * fields are never negative - confirm against the struct definition.
 *
 * setup_spans_load_b() and setup_spans_prologue_b() are defined outside this
 * section.  Local labels 0, 1 and 2 are used; y_a, temp, clip and height are
 * modified.
 */
934 #define setup_spans_up(left_index, right_index, alternate, alternate_active)   \
935   setup_spans_alternate_adjust_##alternate_active();                           \
936   setup_spans_load_b();                                                        \
937   sub y_a, y_a, #1;                                                            \
938                                                                                \
939   ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ];                    \
940   subs temp, temp, y_c;                                                        \
941   subgt height, height, temp;                                                  \
942   setup_spans_up_decrement_##alternate_active();                               \
943                                                                                \
944   ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ];                      \
945   subs clip, y_a, temp;                                                        \
946   ble 0f;                                                                      \
947                                                                                \
948   sub height, height, clip;                                                    \
949   sub y_a, y_a, clip;                                                          \
950   setup_spans_clip(decrement, alternate_active);                               \
951                                                                                \
952  0:                                                                            \
953   cmp height, #0;                                                              \
954   ble 1f;                                                                      \
955                                                                                \
956   orr temp, y_a, y_a, lsl #16;                                                 \
957   sub temp, temp, #(1 << 16);                                                  \
958   sub y_a, temp, #2;                                                           \
959   sub y_a, y_a, #(2 << 16);                                                    \
960   vmov y_x4, temp, y_a;                                                        \
961                                                                                \
962   vaddw.s32 edges_xy, edges_xy, edges_dx_dy;                                   \
963                                                                                \
964   setup_spans_alternate_pre_increment_##alternate_active();                    \
965   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
966    right_index);                                                               \
967   setup_spans_adjust_interpolants_up();                                        \
968   setup_spans_prologue_b();                                                    \
969                                                                                \
970   strh height, [ psx_gpu, #psx_gpu_num_spans_offset ];                         \
971                                                                                \
972  2:                                                                            \
973   setup_spans_set_x4_alternate_##alternate_active(alternate, up);              \
974   subs height, height, #4;                                                     \
975   bhi 2b;                                                                      \
976                                                                                \
977  1:                                                                            \
978
979
/*
 * Return from a setup_spans_* routine: pops r4-r11 and loads pc from the
 * stack.  Presumably pairs with a push of r4-r11 and lr in
 * setup_spans_prologue(), which is defined outside this section.
 */
980 #define setup_spans_epilogue()                                                 \
981   ldmia sp!, { r4 - r11, pc }                                                  \
982
983
/*
 * setup_spans_up_up(minor, major)
 *
 * Complete body of a routine that walks upward with an alternate edge.
 *   minor / major - left / right; passed to setup_spans_up() as
 *                   (left_index = major, right_index = minor,
 *                    alternate = minor, alternate_active = yes).
 *
 * Computes height_minor_a = y_a - y_b, height_minor_b = y_b - y_c and
 * height = y_a - y_c, loads x_starts with x_a in both lanes and x_ends with
 * x_c / x_b, then runs compute_edge_delta_x3() (defined outside this
 * section), the span walk and the epilogue.
 */
984 #define setup_spans_up_up(minor, major)                                        \
985   setup_spans_prologue();                                                      \
986   sub height_minor_a, y_a, y_b;                                                \
987   sub height_minor_b, y_b, y_c;                                                \
988   sub height, y_a, y_c;                                                        \
989                                                                                \
990   vdup.u32 x_starts, x_a;                                                      \
991   vmov x_ends, x_c, x_b;                                                       \
992                                                                                \
993   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
994   setup_spans_up(major, minor, minor, yes);                                    \
995   setup_spans_epilogue()                                                       \
996
// Entry point: upward walk with the alternate edge on the left
// (setup_spans_up_up with minor = left, major = right).
997 function(setup_spans_up_left)
998   setup_spans_up_up(left, right)
999
// Entry point: upward walk with the alternate edge on the right
// (setup_spans_up_up with minor = right, major = left).
1000 function(setup_spans_up_right)
1001   setup_spans_up_up(right, left)
1002
/*
 * setup_spans_down_down(minor, major)
 *
 * Complete body of a routine that walks downward with an alternate edge.
 *   minor / major - left / right; passed to setup_spans_down() as
 *                   (left_index = major, right_index = minor,
 *                    alternate = minor, alternate_active = yes).
 *
 * Computes height_minor_a = y_b - y_a, height_minor_b = y_c - y_b and
 * height = y_c - y_a, loads x_starts with x_a in both lanes and x_ends with
 * x_c / x_b, then runs compute_edge_delta_x3() (defined outside this
 * section), the span walk and the epilogue.
 */
1003 #define setup_spans_down_down(minor, major)                                    \
1004   setup_spans_prologue();                                                      \
1005   sub height_minor_a, y_b, y_a;                                                \
1006   sub height_minor_b, y_c, y_b;                                                \
1007   sub height, y_c, y_a;                                                        \
1008                                                                                \
1009   vdup.u32 x_starts, x_a;                                                      \
1010   vmov x_ends, x_c, x_b;                                                       \
1011                                                                                \
1012   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1013   setup_spans_down(major, minor, minor, yes);                                  \
1014   setup_spans_epilogue()                                                       \
1015
// Entry point: downward walk with the alternate edge on the left
// (setup_spans_down_down with minor = left, major = right).
1016 function(setup_spans_down_left)
1017   setup_spans_down_down(left, right)
1018
// Entry point: downward walk with the alternate edge on the right
// (setup_spans_down_down with minor = right, major = left).
1019 function(setup_spans_down_right)
1020   setup_spans_down_down(right, left)
1021
1022
/*
 * Shared tail of setup_spans_up_a / setup_spans_up_b: upward walk using only
 * two edges (no alternate edge).  Expects x_starts / x_ends to be loaded by
 * the caller; sets height = y_a - y_c, runs compute_edge_delta_x2() (defined
 * outside this section), the span walk and the epilogue.
 */
1023 #define setup_spans_up_flat()                                                  \
1024   sub height, y_a, y_c;                                                        \
1025                                                                                \
1026   compute_edge_delta_x2();                                                     \
1027   setup_spans_up(left, right, none, no);                                       \
1028   setup_spans_epilogue()                                                       \
1029
// Entry point: two-edge upward walk where the edges start at x_a / x_b and
// both end at x_c.
1030 function(setup_spans_up_a)
1031   setup_spans_prologue()
1032
  // x_starts = { x_a, x_b }, x_ends = { x_c, x_c }
1033   vmov x_starts, x_a, x_b
1034   vdup.u32 x_ends, x_c
1035
1036   setup_spans_up_flat()
1037
// Entry point: two-edge upward walk where both edges start at x_a and end
// at x_b / x_c.
1038 function(setup_spans_up_b)
1039   setup_spans_prologue()
1040
  // x_starts = { x_a, x_a }, x_ends = { x_b, x_c }
1041   vdup.u32 x_starts, x_a
1042   vmov x_ends, x_b, x_c
1043
1044   setup_spans_up_flat()
1045
/*
 * Shared tail of setup_spans_down_a / setup_spans_down_b: downward walk
 * using only two edges (no alternate edge).  Expects x_starts / x_ends to be
 * loaded by the caller; sets height = y_c - y_a, runs
 * compute_edge_delta_x2() (defined outside this section), the span walk and
 * the epilogue.
 */
1046 #define setup_spans_down_flat()                                                \
1047   sub height, y_c, y_a;                                                        \
1048                                                                                \
1049   compute_edge_delta_x2();                                                     \
1050   setup_spans_down(left, right, none, no);                                     \
1051   setup_spans_epilogue()                                                       \
1052
// Entry point: two-edge downward walk where the edges start at x_a / x_b and
// both end at x_c.
1053 function(setup_spans_down_a)
1054   setup_spans_prologue()
1055
  // x_starts = { x_a, x_b }, x_ends = { x_c, x_c }
1056   vmov x_starts, x_a, x_b
1057   vdup.u32 x_ends, x_c
1058
1059   setup_spans_down_flat()
1060
// Entry point: two-edge downward walk where both edges start at x_a and end
// at x_b / x_c.
1061 function(setup_spans_down_b)
1062   setup_spans_prologue()
1063
  // x_starts = { x_a, x_a }, x_ends = { x_b, x_c }
1064   vdup.u32 x_starts, x_a
1065   vmov x_ends, x_b, x_c
1066
1067   setup_spans_down_flat()
1068
1069
/*
 * Extra register aliases used by setup_spans_up_down.
 *   middle_y                  - ARM register keeping the original y_a across
 *                               the upper-half walk.
 *   *_b                       - NEON registers holding the second ("b") edge
 *                               set used for the lower half; q13 overlays
 *                               d26/d27 (edges_dx_dy_b / edge_shifts_b) and
 *                               q11 overlays d22/d23 (edges_xy_b_left/right).
 *   height_increment          - temporary multiplier vector.
 *   edges_dx_dy_and_shifts    - q1; presumably overlays edges_dx_dy and
 *                               edge_shifts, which are defined outside this
 *                               section - TODO confirm.
 */
1070 #define middle_y                                          r9
1071
1072 #define edges_xy_b                                        q11
1073 #define edges_dx_dy_b                                     d26
1074 #define edge_shifts_b                                     d27
1075 #define edges_dx_dy_and_shifts_b                          q13
1076 #define height_increment                                  d20
1077
1078 #define edges_dx_dy_and_shifts                            q1
1079
1080 #define edges_xy_b_left                                   d22
1081 #define edges_xy_b_right                                  d23
1082
/*
 * Make the saved "b" edge set current: copies edges_xy_b into edges_xy and
 * edges_dx_dy_and_shifts_b into edges_dx_dy_and_shifts.
 */
1083 #define setup_spans_up_down_load_edge_set_b()                                  \
1084   vmov edges_xy, edges_xy_b;                                                   \
1085   vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b                        \
1086
1087
// Entry point: generate spans for a shape that is walked upward from y_a to
// y_b (height_minor_a lines) and then downward from y_a to y_c
// (height_minor_b lines).  Both halves use the two-edge (no alternate) span
// loop; the edge set for the lower half is prepared up front in the *_b
// registers and loaded once the upper half is finished.
//
// Local labels: 0 = after clipping (used once per half), 2 = span loop
// (used once per half), 4 = start of lower half, 3 = upper half empty,
// 5 = span-count overflow corner case, 1 = return.
//
// NOTE(review): the viewport bounds are loaded with ldrh (zero-extended)
// throughout this routine, while setup_spans_down() uses ldrsh - confirm the
// fields can never be negative.
1088 function(setup_spans_up_down)
1089   setup_spans_prologue()
1090
1091   // s32 middle_y = y_a;
1092   sub height_minor_a, y_a, y_b
1093   sub height_minor_b, y_c, y_a
1094   sub height_major, y_c, y_b
1095
  // x_starts = { x_a, x_c }, x_ends = { x_b, x_b }
1096   vmov x_starts, x_a, x_c
1097   vdup.u32 x_ends, x_b
1098
1099   compute_edge_delta_x3(x_a, height_minor_a, height_major)
1100
  // Advance only the second lane of edges_xy by height_minor_b steps
  // (multiplier vector is { 0, height_minor_b }).
1101   mov temp, #0
1102   vmov height_increment, temp, height_minor_b
1103   vmlal.s32 edges_xy, edges_dx_dy, height_increment
1104
  // Build edge set b: left position from the alternate edge, right position
  // from the current right edge.
1105   vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1106   vmov edges_xy_b_right, edges_xy_right
1107
  // Shifts for set b: copy, then replace lane 0 with the alternate shift.
1108   vmov edge_shifts_b, edge_shifts
1109   vmov.u32 edge_shifts_b[0], edge_shift_alt
1110
  // Steps for set b: negated main steps, lane 0 replaced by the alternate
  // edge's step.
1111   vneg.s32 edges_dx_dy_b, edges_dx_dy
1112   vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1113
1114   mov middle_y, y_a
1115   
  // ---- Upper half: walk up from y_a - 1 ----
1116   setup_spans_load_b()
1117   sub y_a, y_a, #1
1118
  // Top clip: drop the lines above viewport_start_y.
1119   ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1120   subs temp, temp, y_b
1121   subgt height_minor_a, height_minor_a, temp
1122
  // Bottom clip: skip the lines below viewport_end_y.
1123   ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1124   subs clip, y_a, temp
1125   ble 0f
1126
1127   sub height_minor_a, height_minor_a, clip
1128   sub y_a, y_a, clip
1129   setup_spans_clip(decrement, no)
1130
1131  0:                                                                
  // Nothing visible in the upper half: go load edge set b (label 3).
1132   cmp height_minor_a, #0
1133   ble 3f
1134
  // y_x4 = { y_a, y_a - 1, y_a - 2, y_a - 3 } as 16-bit lanes.
1135   orr temp, y_a, y_a, lsl #16
1136   sub temp, temp, #(1 << 16)
1137   sub y_a, temp, #2
1138   sub y_a, y_a, #(2 << 16)
1139   vmov y_x4, temp, y_a
1140
1141   vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1142
1143   strh height_minor_a, [ psx_gpu, #psx_gpu_num_spans_offset ]
1144
  // Convert the current edges for the span loop, then immediately stage edge
  // set b in edges_xy / edges_dx_dy_and_shifts for the lower half (the span
  // loop works from left_x / right_x and their steps).
1145   setup_spans_adjust_edges_alternate_no(left, right); 
1146   setup_spans_adjust_interpolants_up()
1147   setup_spans_up_down_load_edge_set_b()
1148
1149   setup_spans_prologue_b()
1150
1151
1152  2: 
1153   setup_spans_set_x4_alternate_no(none, up)
1154   subs height_minor_a, height_minor_a, #4
1155   bhi 2b
1156
  // height_minor_a is now 0 or negative (-1..-3): pull the output pointers
  // back over the records written past the end of the upper half
  // (8 bytes per edge record, 16 per uvrg, 4 per b).
1157   add span_edge_data, span_edge_data, height_minor_a, lsl #3
1158   add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1159   add span_b_offset, span_b_offset, height_minor_a, lsl #2
1160
1161  4:
  // ---- Lower half: walk down from middle_y ----
  // Reload the starting interpolants and restore y_a.
1162   add temp, psx_gpu, #psx_gpu_uvrg_offset
1163   vld1.32 { uvrg }, [ temp ]
1164   mov y_a, middle_y
1165   
1166   setup_spans_load_b()
1167
  // Bottom clip against viewport_end_y.
1168   ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1169   subs y_c, y_c, temp
1170   subgt height_minor_b, height_minor_b, y_c
1171   addgt height_minor_b, height_minor_b, #1
1172
  // Top clip against viewport_start_y.
1173   ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1174   subs clip, temp, y_a
1175   ble 0f
1176
1177   sub height_minor_b, height_minor_b, clip
1178   add y_a, y_a, clip
1179   setup_spans_clip(increment, no)
1180
1181  0:
1182   cmp height_minor_b, #0
1183   ble 1f
1184
  // y_x4 = { y_a, y_a + 1, y_a + 2, y_a + 3 } as 16-bit lanes.
1185   orr temp, y_a, y_a, lsl #16
1186   add temp, temp, #(1 << 16) 
1187   add y_a, temp, #2
1188   add y_a, y_a, #(2 << 16)
1189   vmov y_x4, temp, y_a
1190
1191   setup_spans_adjust_edges_alternate_no(left, right)
1192
  // num_spans += height_minor_b.
  // NOTE(review): when the upper half was empty (path through label 3) this
  // routine has not stored num_spans, so the value read here must have been
  // initialised by the caller - confirm.
1193   ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1194   add temp, temp, height_minor_b
1195
  // A total of exactly MAX_SPANS is handled separately at label 5.
1196   cmp temp, #MAX_SPANS
1197   beq 5f
1198
1199   strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1200
1201  2:                                                     
1202   setup_spans_set_x4_alternate_no(none, down)
1203   subs height_minor_b, height_minor_b, #4
1204   bhi 2b
1205
1206  1:
1207   setup_spans_epilogue()
1208
1209  3:
  // Upper half produced no spans: load edge set b, run the b prologue and
  // continue with the lower half.
1210   setup_spans_up_down_load_edge_set_b()
1211   setup_spans_prologue_b()
1212   bal 4b
1213
1214  5:
1215   // FIXME: overflow corner case
  // Round height_minor_b down to a multiple of 4 and store the
  // correspondingly reduced total.  The flags from bics survive the add and
  // strh, so bne loops only if any lines remain.
1216   sub temp, temp, height_minor_b
1217   bics height_minor_b, #3
1218   add temp, temp, height_minor_b
1219   strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1220   bne 2b
1221   bal 1b
1222
/*
 * Re-assign register aliases for the code that follows.  The aliases used by
 * the setup_spans_* code above are dropped first, then a new mapping is
 * defined.  Several names share one ARM register (for example r8 is left_x,
 * dither_shift, b_dx4 and right_mask; r10 is dither_offset_ptr, dither_row,
 * block_ptr_b and block_span_ptr), so aliases of the same register cannot be
 * live at the same time.
 */
1223 #undef span_uvrg_offset
1224 #undef span_edge_data
1225 #undef span_b_offset
1226 #undef left_x
1227 #undef b
1228
1229 #define psx_gpu                                  r0
1230 #define num_spans                                r1
1231 #define span_uvrg_offset                         r2
1232 #define span_edge_data                           r3
1233 #define span_b_offset                            r4
1234 #define b_dx                                     r5
1235 #define span_num_blocks                          r6
1236 #define y                                        r7
1237 #define left_x                                   r8
1238 #define b                                        r9
1239 #define dither_offset_ptr                        r10
1240 #define block_ptr_a                              r11
1241 #define fb_ptr                                   r12
1242 #define num_blocks                               r14
1243
1244 #define uvrg_dx_ptr                              r2
1245 #define texture_mask_ptr                         r3
1246 #define dither_shift                             r8
1247 #define dither_row                               r10
1248
1249 #define c_32                                     r7
1250 #define b_dx4                                    r8
1251 #define b_dx8                                    r9
1252 #define block_ptr_b                              r10
1253
1254 #define block_span_ptr                           r10
1255 #define right_mask                               r8
1256
1257 #define color                                    r2
1258 #define color_r                                  r3
1259 #define color_g                                  r4
1260 #define color_b                                  r5
1261
1262 #undef uvrg
1263
1264 #define u_block                                  q0
1265 #define v_block                                  q1
1266 #define r_block                                  q2
1267 #define g_block                                  q3
1268 #define b_block                                  q4
1269
1270 #define uv_dx4                                   d10
1271 #define rg_dx4                                   d11
1272 #define uv_dx8                                   d12
1273 #define rg_dx8                                   d13
1274 #define b_whole_8                                d14
1275 #define fb_mask_ptrs                             d15
1276
1277 #define uvrg_dx4                                 q5
1278 #define uvrg_dx8                                 q6
1279 #define uv_dx8                                   d12
1280 #define rg_dx8                                   d13
1281
1282 #define u_whole                                  q8
1283 #define v_whole                                  q9
1284 #define r_whole                                  q10
1285 #define g_whole                                  q11
1286 #define b_whole                                  q12
1287
1288 #define u_whole_low                              d16
1289 #define u_whole_high                             d17
1290 #define v_whole_low                              d18
1291 #define v_whole_high                             d19
1292 #define r_whole_low                              d20
1293 #define r_whole_high                             d21
1294 #define g_whole_low                              d22
1295 #define g_whole_high                             d23
1296 #define b_whole_low                              d24
1297 #define b_whole_high                             d25
1298
1299 #define dx4                                      q13
1300 #define dx8                                      q13
1301
1302 #define u_whole_8                                d26
1303 #define v_whole_8                                d27
1304 #define u_whole_8b                               d24
1305 #define r_whole_8                                d24
1306 #define g_whole_8                                d25
1307
1308 #define uv_whole_8                               q13
1309 #define uv_whole_8b                              q14
1310
1311 #define dither_offsets                           q14
1312 #define texture_mask                             q15
1313 #define texture_mask_u                           d30
1314 #define texture_mask_v                           d31
1315
1316 #define dither_offsets_short                     d28
1317
1318 #define v_left_x                                 q8
1319 #define uvrg                                     q9
1320 #define block_span                               q10
1321
1322 #define uv                                       d18
1323 #define rg                                       d19
1324
1325 #define draw_mask                                q1
1326 #define draw_mask_edge                           q13
1327 #define test_mask                                q0
1328
1329 #define uvrg_dx                                  q3
1330
1331 #define colors                                   q2
1332
/*
 * setup_blocks_texture_swizzled(): exchange nibbles between the 8-bit u and
 * v coordinates of a block.
 *   u_whole_8b = u_whole_8 & texture_mask_u      (copy taken before u changes)
 *   u_whole_8  = (v_whole_8 << 4) | (u_whole_8 & 0x0f)
 *   v_whole_8  = (v_whole_8 & 0xf0) | (u_whole_8b >> 4)
 * The three instructions must stay in this order: vsri reads the copy made
 * by vand, and vsli must see v_whole_8 before vsri rewrites it.
 * Clobbers u_whole_8b (d24). The last instruction has no ';' of its own.
 */
1333 #define setup_blocks_texture_swizzled()                                        \
1334   vand.u8 u_whole_8b, u_whole_8, texture_mask_u;                               \
1335   vsli.u8 u_whole_8, v_whole_8, #4;                                            \
1336   vsri.u8 v_whole_8, u_whole_8b, #4                                            \
1337
/*
 * setup_blocks_texture_unswizzled(): expands to nothing, so u_whole_8 and
 * v_whole_8 are left exactly as the caller computed them.
 */
1338 #define setup_blocks_texture_unswizzled()                                      \
1339
1340
/*
 * setup_blocks_shaded_textured_builder(swizzling)
 *
 * Emits setup_blocks_shaded_textured_dithered_<swizzling>_indirect, where
 * <swizzling> also selects which setup_blocks_texture_<swizzling>() macro is
 * expanded right after u/v are masked with texture_mask.
 *
 * Input:  r0 = psx_gpu; everything else is read through the psx_gpu_*_offset
 *         fields (num_spans, uvrg_dx, b_dx, texture mask bytes, num_blocks,
 *         the per-span uvrg/b/edge-data arrays, dither table, vram_out_ptr
 *         and the five consecutive 16-byte vectors starting at
 *         psx_gpu_u_block_span_offset).
 * Output: blocks appended at psx_gpu_blocks_offset and the running count
 *         stored back to psx_gpu_num_blocks_offset after every span.
 * Saves:  r4-r11 and lr are pushed/popped. No NEON register is saved at
 *         entry; only texture_mask and uvrg_dx4 are preserved around the
 *         call to flush_render_block_buffer.
 *
 * Per span (label 0): spans with zero blocks are skipped. Otherwise the
 * span's block count is added to num_blocks and, if the total exceeds
 * MAX_BLOCKS, label 2 calls flush_render_block_buffer and restarts the
 * buffer at its base with this span alone. Label 3 then evaluates u, v, r,
 * g and b at left_x (start + delta * left_x), adds the block_span vectors
 * (presumably per-lane offsets - confirm against where they are written),
 * and forms the framebuffer pointer as vram_out_ptr + y * 2048 + left_x * 2.
 * The dither word for row (y & 3) is rotated right by (left_x & 3) * 8 bits
 * and sign-extended to 16-bit lanes shifted left by 4.
 *
 * Per block: lanes 0-3 take the top 16 bits of the accumulators (vshrn) and
 * lanes 4-7 the top 16 bits of accumulator + 4 * delta (vaddhn); the
 * accumulators then advance by 8 * delta and fb_ptr by 16 bytes. Each value
 * is narrowed to 8 bits. Label 4 is software-pipelined: it stores the block
 * computed on the previous pass while computing the next one, so the
 * instruction order and the register aliasing (q3 = uvrg_dx/g_block,
 * q13 = dx4/dx8/uv_whole_8, d24 = b_whole_low/u_whole_8b/r_whole_8, and the
 * r7-r10 role changes) must be kept intact.
 *
 * Block layout as written by the stores (64 bytes):
 *   +0   u/v bytes interleaved (16 bytes)
 *   +16  r (8 bytes), g (8 bytes)
 *   +32  b (8 bytes), right mask word, fb_ptr word
 *   +48  dither_offsets (8 x 16 bits)
 * The mask word is 0 for every block except the last one of a span
 * (label 5), which stores the edge data's right_mask there and clears the
 * u/v pairs of the lanes where right_mask, replicated per byte, has a bit in
 * common with the 128-bit test mask loaded from the start of psx_gpu.
 */
1341 #define setup_blocks_shaded_textured_builder(swizzling)                        \
1342 .align 3;                                                                      \
1343                                                                                \
1344 function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
1345   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
1346   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1347                                                                                \
1348   vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ];                                 \
1349   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1350                                                                                \
1351   cmp num_spans, #0;                                                           \
1352   bxeq lr;  /* no spans: return before touching the stack */                   \
1353                                                                                \
1354   stmdb sp!, { r4 - r11, r14 };                                                \
1355   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1356                                                                                \
1357   ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ];                                 \
1358   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1359                                                                                \
1360   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ];   \
1361   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1362                                                                                \
1363   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1364   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1365                                                                                \
1366   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
1367   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1368                                                                                \
1369   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;  /* 64 bytes per block */  \
1370                                                                                \
1371  0:  /* per-span loop: one iteration per span_edge_data entry */               \
1372   vmov.u8 fb_mask_ptrs, #0;                                                    \
1373                                                                                \
1374   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
1375   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1376                                                                                \
1377   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
1378   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
1379                                                                                \
1380   cmp span_num_blocks, #0;                                                     \
1381   beq 1f;                                                                      \
1382                                                                                \
1383   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
1384   add num_blocks, span_num_blocks, num_blocks;                                 \
1385                                                                                \
1386   cmp num_blocks, #MAX_BLOCKS;                                                 \
1387   bgt 2f;                                                                      \
1388                                                                                \
1389  3:  /* emit this span (also re-entered from 2: after a flush) */              \
1390   ldr b, [ span_b_offset ];                                                    \
1391   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1392                                                                                \
1393   vdup.u32 v_left_x, left_x;                                                   \
1394   and y, y, #0x3;                                                              \
1395                                                                                \
1396   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
1397   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1398                                                                                \
1399   mla b, b_dx, left_x, b;                                                      \
1400   and dither_shift, left_x, #0x03;                                             \
1401                                                                                \
1402   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
1403   vshr.u32 uvrg_dx, uvrg_dx4, #2;  /* q3 is shared with g_block */             \
1404                                                                                \
1405   mov dither_shift, dither_shift, lsl #3;                                      \
1406   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1407                                                                                \
1408   mov c_32, #32;                                                               \
1409   subs span_num_blocks, span_num_blocks, #1;  /* Z consumed by beq 5f below */ \
1410                                                                                \
1411   mov dither_row, dither_row, ror dither_shift;                                \
1412   mov b_dx4, b_dx, lsl #2;                                                     \
1413                                                                                \
1414   vdup.u32 dither_offsets_short, dither_row;                                   \
1415   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1416                                                                                \
1417   vdup.u32 b_block, b;                                                         \
1418   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1419                                                                                \
1420   vdup.u32 u_block, uv[0];                                                     \
1421   mov b_dx8, b_dx, lsl #3;                                                     \
1422                                                                                \
1423   vdup.u32 v_block, uv[1];                                                     \
1424   vdup.u32 r_block, rg[0];                                                     \
1425   vdup.u32 g_block, rg[1];                                                     \
1426                                                                                \
1427   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1428                                                                                \
1429   vadd.u32 u_block, u_block, block_span;                                       \
1430   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1431                                                                                \
1432   vadd.u32 v_block, v_block, block_span;                                       \
1433   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1434                                                                                \
1435   vadd.u32 r_block, r_block, block_span;                                       \
1436   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1437                                                                                \
1438   vadd.u32 g_block, g_block, block_span;                                       \
1439   vld1.u32 { block_span }, [ block_span_ptr, :128 ];                           \
1440                                                                                \
1441   vadd.u32 b_block, b_block, block_span;                                       \
1442   add block_ptr_b, block_ptr_a, #16;                                           \
1443                                                                                \
1444   vshrn.u32 u_whole_low, u_block, #16;                                         \
1445   vshrn.u32 v_whole_low, v_block, #16;                                         \
1446   vshrn.u32 r_whole_low, r_block, #16;                                         \
1447   vshrn.u32 g_whole_low, g_block, #16;                                         \
1448                                                                                \
1449   vdup.u32 dx4, uv_dx4[0];                                                     \
1450   vshrn.u32 b_whole_low, b_block, #16;                                         \
1451                                                                                \
1452   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1453   vdup.u32 dx4, uv_dx4[1];                                                     \
1454                                                                                \
1455   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1456   vdup.u32 dx4, rg_dx4[0];                                                     \
1457                                                                                \
1458   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1459   vdup.u32 dx4, rg_dx4[1];                                                     \
1460                                                                                \
1461   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1462   vdup.u32 dx4, b_dx4;                                                         \
1463                                                                                \
1464   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1465   vdup.u32 dx8, uv_dx8[0];                                                     \
1466                                                                                \
1467   vadd.u32 u_block, u_block, dx8;                                              \
1468   vdup.u32 dx8, uv_dx8[1];                                                     \
1469                                                                                \
1470   vadd.u32 v_block, v_block, dx8;                                              \
1471   vdup.u32 dx8, rg_dx8[0];                                                     \
1472                                                                                \
1473   vadd.u32 r_block, r_block, dx8;                                              \
1474   vdup.u32 dx8, rg_dx8[1];                                                     \
1475                                                                                \
1476   vadd.u32 g_block, g_block, dx8;                                              \
1477   vdup.u32 dx8, b_dx8;                                                         \
1478                                                                                \
1479   vadd.u32 b_block, b_block, dx8;                                              \
1480   vmovn.u16 u_whole_8, u_whole;                                                \
1481                                                                                \
1482   vmovn.u16 v_whole_8, v_whole;                                                \
1483                                                                                \
1484   vmovn.u16 b_whole_8, b_whole;                                                \
1485   pld [ fb_ptr ];                                                              \
1486   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1487                                                                                \
1488   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1489   setup_blocks_texture_##swizzling();                                          \
1490                                                                                \
1491   vmovn.u16 r_whole_8, r_whole;                                                \
1492   beq 5f;  /* single-block span: skip the steady-state loop */                 \
1493                                                                                \
1494  4:  /* store block N while computing block N+1 */                             \
1495   vmovn.u16 g_whole_8, g_whole;                                                \
1496   vshrn.u32 u_whole_low, u_block, #16;                                         \
1497                                                                                \
1498   vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32;               \
1499   vshrn.u32 v_whole_low, v_block, #16;                                         \
1500                                                                                \
1501   vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32;              \
1502   vshrn.u32 r_whole_low, r_block, #16;                                         \
1503                                                                                \
1504   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1505   vshrn.u32 g_whole_low, g_block, #16;                                         \
1506                                                                                \
1507   vdup.u32 dx4, uv_dx4[0];                                                     \
1508   vshrn.u32 b_whole_low, b_block, #16;                                         \
1509                                                                                \
1510   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1511   vdup.u32 dx4, uv_dx4[1];                                                     \
1512                                                                                \
1513   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1514   vdup.u32 dx4, rg_dx4[0];                                                     \
1515                                                                                \
1516   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1517   vdup.u32 dx4, rg_dx4[1];                                                     \
1518                                                                                \
1519   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1520   vdup.u32 dx4, b_dx4;                                                         \
1521                                                                                \
1522   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1523   vdup.u32 dx8, uv_dx8[0];                                                     \
1524                                                                                \
1525   vadd.u32 u_block, u_block, dx8;                                              \
1526   vdup.u32 dx8, uv_dx8[1];                                                     \
1527                                                                                \
1528   vadd.u32 v_block, v_block, dx8;                                              \
1529   vdup.u32 dx8, rg_dx8[0];                                                     \
1530                                                                                \
1531   vadd.u32 r_block, r_block, dx8;                                              \
1532   vdup.u32 dx8, rg_dx8[1];                                                     \
1533                                                                                \
1534   vadd.u32 g_block, g_block, dx8;                                              \
1535   vdup.u32 dx8, b_dx8;                                                         \
1536                                                                                \
1537   vadd.u32 b_block, b_block, dx8;                                              \
1538   vmovn.u16 u_whole_8, u_whole;                                                \
1539                                                                                \
1540   add fb_ptr, fb_ptr, #16;                                                     \
1541   vmovn.u16 v_whole_8, v_whole;                                                \
1542                                                                                \
1543   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1544   vmovn.u16 b_whole_8, b_whole;                                                \
1545                                                                                \
1546   pld [ fb_ptr ];                                                              \
1547                                                                                \
1548   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1549   subs span_num_blocks, span_num_blocks, #1;                                   \
1550                                                                                \
1551   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1552   setup_blocks_texture_##swizzling();                                          \
1553                                                                                \
1554   vmovn.u16 r_whole_8, r_whole;                                                \
1555   bne 4b;                                                                      \
1556                                                                                \
1557  5:  /* last block of the span: apply right_mask */                            \
1558   vmovn.u16 g_whole_8, g_whole;                                                \
1559   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
1560                                                                                \
1561   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
1562   vdup.u8 draw_mask, right_mask;                                               \
1563                                                                                \
1564   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1565   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1566   vzip.u8 u_whole_8, v_whole_8;                                                \
1567                                                                                \
1568   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1569   vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32;              \
1570   vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32;                        \
1571   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1572   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1573                                                                                \
1574  1:  /* advance to the next span */                                            \
1575   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1576   add span_b_offset, span_b_offset, #4;                                        \
1577                                                                                \
1578   add span_edge_data, span_edge_data, #8;                                      \
1579   subs num_spans, num_spans, #1;                                               \
1580                                                                                \
1581   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];  /* flags kept */  \
1582   bne 0b;                                                                      \
1583                                                                                \
1584   ldmia sp!, { r4 - r11, pc };                                                 \
1585                                                                                \
1586  2:  /* block buffer would overflow: flush it first */                         \
1587   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1588   vpush { texture_mask };                                                      \
1589   vpush { uvrg_dx4 };                                                          \
1590                                                                                \
1591   stmdb sp!, { r0 - r3, r12, r14 };                                            \
1592   bl flush_render_block_buffer;                                                \
1593   ldmia sp!, { r0 - r3, r12, r14 };                                            \
1594                                                                                \
1595   vpop { uvrg_dx4 };                                                           \
1596   vpop { texture_mask };                                                       \
1597                                                                                \
1598   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;  /* rebuilt instead of saved */       \
1599   vmov.u8 fb_mask_ptrs, #0;                                                    \
1600                                                                                \
1601   mov num_blocks, span_num_blocks;  /* count restarts with this span */        \
1602   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1603   bal 3b                                                                       \
1604
1605
/*
 * Instantiate the shaded/textured/dithered block setup once per texture
 * layout: this expands to
 * setup_blocks_shaded_textured_dithered_swizzled_indirect and
 * setup_blocks_shaded_textured_dithered_unswizzled_indirect.
 */
1606 setup_blocks_shaded_textured_builder(swizzled)
1607 setup_blocks_shaded_textured_builder(unswizzled)
1608
1609
1610 #define setup_blocks_unshaded_textured_builder(swizzling)                      \
1611 .align 3;                                                                      \
1612                                                                                \
1613 function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
1614   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
1615   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1616                                                                                \
1617   vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ];                                 \
1618   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1619                                                                                \
1620   cmp num_spans, #0;                                                           \
1621   bxeq lr;                                                                     \
1622                                                                                \
1623   stmdb sp!, { r4 - r11, r14 };                                                \
1624   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1625                                                                                \
1626   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1627                                                                                \
1628   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ];   \
1629   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1630                                                                                \
1631   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1632   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1633                                                                                \
1634   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1635                                                                                \
1636   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1637                                                                                \
1638  0:                                                                            \
1639   vmov.u8 fb_mask_ptrs, #0;                                                    \
1640                                                                                \
1641   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
1642   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1643                                                                                \
1644   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
1645   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
1646                                                                                \
1647   cmp span_num_blocks, #0;                                                     \
1648   beq 1f;                                                                      \
1649                                                                                \
1650   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
1651   add num_blocks, span_num_blocks, num_blocks;                                 \
1652                                                                                \
1653   cmp num_blocks, #MAX_BLOCKS;                                                 \
1654   bgt 2f;                                                                      \
1655                                                                                \
1656  3:                                                                            \
1657   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1658                                                                                \
1659   vdup.u32 v_left_x, left_x;                                                   \
1660   and y, y, #0x3;                                                              \
1661                                                                                \
1662   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
1663   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1664                                                                                \
1665   and dither_shift, left_x, #0x03;                                             \
1666                                                                                \
1667   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
1668   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1669                                                                                \
1670   mov dither_shift, dither_shift, lsl #3;                                      \
1671   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1672                                                                                \
1673   mov c_32, #32;                                                               \
1674   subs span_num_blocks, span_num_blocks, #1;                                   \
1675                                                                                \
1676   mov dither_row, dither_row, ror dither_shift;                                \
1677                                                                                \
1678   vdup.u32 dither_offsets_short, dither_row;                                   \
1679   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1680                                                                                \
1681   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1682                                                                                \
1683   vdup.u32 u_block, uv[0];                                                     \
1684                                                                                \
1685   vdup.u32 v_block, uv[1];                                                     \
1686   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1687                                                                                \
1688   vadd.u32 u_block, u_block, block_span;                                       \
1689   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1690                                                                                \
1691   vadd.u32 v_block, v_block, block_span;                                       \
1692   add block_ptr_b, block_ptr_a, #16;                                           \
1693                                                                                \
1694   vshrn.u32 u_whole_low, u_block, #16;                                         \
1695   vshrn.u32 v_whole_low, v_block, #16;                                         \
1696                                                                                \
1697   vdup.u32 dx4, uv_dx4[0];                                                     \
1698                                                                                \
1699   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1700   vdup.u32 dx4, uv_dx4[1];                                                     \
1701                                                                                \
1702   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1703   vdup.u32 dx8, uv_dx8[0];                                                     \
1704                                                                                \
1705   vadd.u32 u_block, u_block, dx8;                                              \
1706   vdup.u32 dx8, uv_dx8[1];                                                     \
1707                                                                                \
1708   vadd.u32 v_block, v_block, dx8;                                              \
1709   vmovn.u16 u_whole_8, u_whole;                                                \
1710                                                                                \
1711   vmovn.u16 v_whole_8, v_whole;                                                \
1712                                                                                \
1713   pld [ fb_ptr ];                                                              \
1714   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1715                                                                                \
1716   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1717   setup_blocks_texture_##swizzling();                                          \
1718                                                                                \
1719   beq 5f;                                                                      \
1720                                                                                \
1721  4:                                                                            \
1722   vshrn.u32 u_whole_low, u_block, #16;                                         \
1723                                                                                \
1724   vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32;               \
1725   vshrn.u32 v_whole_low, v_block, #16;                                         \
1726                                                                                \
1727   add block_ptr_b, block_ptr_b, #32;                                           \
1728   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1729                                                                                \
1730   vdup.u32 dx4, uv_dx4[0];                                                     \
1731   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1732   vdup.u32 dx4, uv_dx4[1];                                                     \
1733                                                                                \
1734   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1735   vdup.u32 dx8, uv_dx8[0];                                                     \
1736                                                                                \
1737   vadd.u32 u_block, u_block, dx8;                                              \
1738   vdup.u32 dx8, uv_dx8[1];                                                     \
1739                                                                                \
1740   vadd.u32 v_block, v_block, dx8;                                              \
1741   vmovn.u16 u_whole_8, u_whole;                                                \
1742                                                                                \
1743   add fb_ptr, fb_ptr, #16;                                                     \
1744   vmovn.u16 v_whole_8, v_whole;                                                \
1745                                                                                \
1746   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1747   pld [ fb_ptr ];                                                              \
1748                                                                                \
1749   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1750   subs span_num_blocks, span_num_blocks, #1;                                   \
1751                                                                                \
1752   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1753   setup_blocks_texture_##swizzling();                                          \
1754                                                                                \
1755   bne 4b;                                                                      \
1756                                                                                \
1757  5:                                                                            \
1758   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
1759                                                                                \
1760   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
1761   vdup.u8 draw_mask, right_mask;                                               \
1762                                                                                \
1763   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1764   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1765   vzip.u8 u_whole_8, v_whole_8;                                                \
1766                                                                                \
1767   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1768   add block_ptr_b, block_ptr_b, #32;                                           \
1769   vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32;                        \
1770   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1771   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1772                                                                                \
1773  1:                                                                            \
1774   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1775   add span_edge_data, span_edge_data, #8;                                      \
1776   subs num_spans, num_spans, #1;                                               \
1777                                                                                \
1778   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1779   bne 0b;                                                                      \
1780                                                                                \
1781   ldmia sp!, { r4 - r11, pc };                                                 \
1782                                                                                \
1783  2:                                                                            \
1784   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1785   vpush { texture_mask };                                                      \
1786   vpush { uvrg_dx4 };                                                          \
1787                                                                                \
1788   stmdb sp!, { r0 - r3, r12, r14 };                                            \
1789   bl flush_render_block_buffer;                                                \
1790   ldmia sp!, { r0 - r3, r12, r14 };                                            \
1791                                                                                \
1792   vpop { uvrg_dx4 };                                                           \
1793   vpop { texture_mask };                                                       \
1794                                                                                \
1795   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1796   vmov.u8 fb_mask_ptrs, #0;                                                    \
1797                                                                                \
1798   mov num_blocks, span_num_blocks;                                             \
1799   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1800   bal 3b                                                                       \
1801
1802
/*
 * Expand the unshaded textured block-setup builder once per texture layout.
 * The argument is token-pasted by the builder into
 * setup_blocks_texture_<argument>().
 */
1803 setup_blocks_unshaded_textured_builder(swizzled)
1804 setup_blocks_unshaded_textured_builder(unswizzled)
1805
1806
1807 .align 3
1808
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * Queues flat-colored render blocks for every span described in psx_gpu's
 * span edge data.
 *
 * In:  psx_gpu - base pointer; every other input is loaded from a
 *      psx_gpu_..._offset field.
 *
 * Each block occupies 64 bytes (block_ptr_a advances by 2 * 32 per block):
 *   +0   draw mask: all zero, except for the last block of a span, which
 *        gets a mask derived from the span's right_mask
 *   +16  the packed 15-bit color replicated into every 16-bit lane
 *   +32  { b_whole_8, fb_mask_ptrs } with fb_mask_ptrs[1] = framebuffer
 *        address of the block
 *
 * When adding a span would push the running block count above MAX_BLOCKS,
 * flush_render_block_buffer is called and filling restarts at the first
 * block slot. Returns immediately when num_spans is 0.
 *
 * Local labels: 0 = per-span loop, 1 = next span, 2 = flush path,
 * 3 = span setup, 4 = full-block loop, 5 = last block of the span.
 */
1809 function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1810   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
       /* draw_mask = 0: interior blocks draw every pixel */
1811   veor.u32 draw_mask, draw_mask, draw_mask
1812
       /* nothing to do; return before anything is pushed */
1813   cmp num_spans, #0
1814   bxeq lr
1815
1816   stmdb sp!, { r4 - r11, r14 }
       /* test_mask is the first 16 bytes of the psx_gpu structure */
1817   vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1818
1819   ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1820
       /* take the top 5 bits of each 8-bit channel: bits 7:3, 15:11, 23:19 */
1821   ubfx color_r, color, #3, #5
1822   ubfx color_g, color, #11, #5
1823   ubfx color_b, color, #19, #5
1824
       /* pack as r | (g << 5) | (b << 10) */
1825   orr color, color_r, color_b, lsl #10
1826   orr color, color, color_g, lsl #5
1827
1828   vdup.u16 colors, color
1829
1830   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1831   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1832
       /* block_ptr_a = first free slot: blocks base + num_blocks * 64 */
1833   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1834   add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1835
1836  0:
1837   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1838   ldrh y, [ span_edge_data, #edge_data_y_offset ]
1839
1840   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
1841
       /* empty span: skip straight to the next edge record */
1842   cmp span_num_blocks, #0
1843   beq 1f
1844
1845   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1846   add num_blocks, span_num_blocks, num_blocks
1847
       /* span does not fit in the block buffer: flush first (label 2) */
1848   cmp num_blocks, #MAX_BLOCKS
1849   bgt 2f
1850
1851  3:
       /* fb_ptr = vram base + y * 2048 + left_x * 2 */
1852   add fb_ptr, fb_ptr, y, lsl #11
       /* NOTE(review): y is not referenced again by name before it is
          reloaded at label 0, so this looks like a leftover - confirm */
1853   and y, y, #0x3
1854
1855   add fb_ptr, fb_ptr, left_x, lsl #1
       /* post-increment stride used by every vst1 below */
1856   mov c_32, #32
1857
       /* Z from this subs is consumed by the "beq 5f" below; the add, pld
          and vmov in between do not write the flags */
1858   subs span_num_blocks, span_num_blocks, #1
1859
1860   add block_ptr_b, block_ptr_a, #16
1861   pld [ fb_ptr ]
1862
1863   vmov.u32 fb_mask_ptrs[1], fb_ptr
       /* single-block span: only the masked last block is emitted */
1864   beq 5f
1865
1866  4:
       /* full block: zero mask at +0, colors at +16, pointers at +32 */
1867   vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_32
1868   vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1869   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1870
       /* advance the framebuffer address by 16 bytes for the next block */
1871   add fb_ptr, fb_ptr, #16
       /* together with the post-increment above, block_ptr_b moves 64 */
1872   add block_ptr_b, block_ptr_b, #32
1873
1874   pld [ fb_ptr ]
1875
1876   vmov.u32 fb_mask_ptrs[1], fb_ptr
1877   subs span_num_blocks, span_num_blocks, #1
1878
1879   bne 4b
1880
1881  5:
1882   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1883
       /* replicate the low byte of right_mask into every byte, then set each
          16-bit lane to all ones where it has a bit in common with test_mask */
1884   vdup.u8 draw_mask_edge, right_mask
1885   vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1886
       /* last block of the span: same layout, edge mask instead of zero */
1887   vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1888   vst1.u32 { draw_mask_edge }, [ block_ptr_a, :128 ], c_32
1889   add block_ptr_b, block_ptr_b, #32
1890   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1891
1892  1:
       /* edge records are 8 bytes apart */
1893   add span_edge_data, span_edge_data, #8
1894   subs num_spans, num_spans, #1
1895
       /* strh does not touch the flags, so bne still tests the subs above */
1896   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1897   bne 0b
1898
1899   ldmia sp!, { r4 - r11, pc }
1900                                                                            
1901  2:
       /* flush path: keep colors and the caller-saved core registers alive
          across the call */
1902   vpush { colors }
1903
       /* NOTE(review): 9 core registers were pushed on entry and 6 more here
          (vpush moves sp by a multiple of 8), so sp is 4 mod 8 at the bl if
          it was 8-byte aligned on entry - confirm the callee does not depend
          on AAPCS stack alignment */
1904   stmdb sp!, { r0 - r3, r12, r14 }
1905   bl flush_render_block_buffer
1906   ldmia sp!, { r0 - r3, r12, r14 }
1907
1908   vpop { colors }
1909
       /* rebuild the NEON constants that were not saved around the call */
1910   vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1911   veor.u32 draw_mask, draw_mask, draw_mask
1912
       /* restart the count with this span and rewind to the first block */
1913   mov num_blocks, span_num_blocks
1914   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1915   bal 3b
1916
1917
/*
 * Register aliases for the mask-MSB handling.
 * mask_msb_scalar is r14 (lr); the direct fill routine that follows only
 * loads it after lr has been pushed. The remaining aliases are not
 * referenced in that routine - presumably they are used further on in the
 * file.
 */
1918 #define mask_msb_scalar                                   r14
1919
1920 #define msb_mask                                          q15
1921
/* d16 is the low half of q8 */
1922 #define pixels_low                                        d16
1923
/* d30 / d31 are the low / high halves of q15 (msb_mask) */
1924 #define msb_mask_low                                      d30
1925 #define msb_mask_high                                     d31
1926
1927
1928 .align 3
1929
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * Fills every span with one flat color by storing straight to the
 * framebuffer; no render blocks are queued and num_blocks is not touched.
 *
 * In:  psx_gpu - base pointer; every other input is loaded from a
 *      psx_gpu_..._offset field.
 *
 * The pixel value is the packed 15-bit color OR-ed with the halfword at
 * psx_gpu_mask_msb_offset. For each span, span_num_blocks - 1 whole
 * "colors" vectors are stored, then the last block is written either whole
 * (right_mask byte == 0) or partially, as decoded from right_mask.
 * Returns immediately when num_spans is 0.
 *
 * Local labels: 0 = per-span loop, 1 = next span, 2 = full-block loop,
 * 3 = last block of the span, 5 = unmasked last block.
 */
1930 function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1931   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1932
       /* nothing to do; return before anything is pushed */
1933   cmp num_spans, #0
1934   bxeq lr
1935
       /* r14 is saved here, which frees it for use as mask_msb_scalar */
1936   stmdb sp!, { r4 - r11, r14 }
1937
1938   ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1939
       /* take the top 5 bits of each 8-bit channel: bits 7:3, 15:11, 23:19 */
1940   ubfx color_r, color, #3, #5
1941   ubfx color_g, color, #11, #5
1942
1943   ldrh mask_msb_scalar, [ psx_gpu, #psx_gpu_mask_msb_offset ]
1944   ubfx color_b, color, #19, #5
1945
       /* pack as r | (g << 5) | (b << 10), then merge in the mask value */
1946   orr color, color_r, color_b, lsl #10
1947   orr color, color, color_g, lsl #5
1948   orr color, color, mask_msb_scalar
1949
       /* vector copy of the pixel for the whole-block stores */
1950   vdup.u16 colors, color
1951
1952   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
       /* scalar copy holds the pixel twice, so one str writes two pixels */
1953   orr color, color, color, lsl #16
1954
1955
1956  0:
1957   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1958   ldrh y, [ span_edge_data, #edge_data_y_offset ]
1959
1960   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
1961
       /* empty span: skip straight to the next edge record */
1962   cmp span_num_blocks, #0
1963   beq 1f
1964
1965   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1966
       /* fb_ptr = vram base + y * 2048 + left_x * 2 */
1967   add fb_ptr, fb_ptr, y, lsl #11
       /* Z from this subs feeds the "beq 3f" below; the add in between does
          not write the flags */
1968   subs span_num_blocks, span_num_blocks, #1
1969
1970   add fb_ptr, fb_ptr, left_x, lsl #1
       /* single-block span: go straight to the edge handling */
1971   beq 3f
1972
1973  2:
       /* whole block: store the colors vector and advance fb_ptr past it */
1974   vst1.u32 { colors }, [ fb_ptr ]!
1975   subs span_num_blocks, span_num_blocks, #1
1976
1977   bne 2b
1978
1979  3:
       /* only the low byte of the right mask is used here */
1980   ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1981
       /* no masked pixels: write the last block whole (label 5) */
1982   cmp right_mask, #0x0
1983   beq 5f
1984
       /* Partial last block: clear mask bits mean "draw", low bits map to
          lower addresses. Only aligned groups of 4, 2 and 1 bits are tested,
          so this presumably relies on the clear bits forming one low-order
          run - TODO confirm against the code that builds right_mask. */
       /* low 4 bits clear: two word stores = 4 pixels, then drop those bits;
          moveq leaves the flags alone, so the second streq uses the same Z */
1985   tst right_mask, #0xF
1986   streq color, [ fb_ptr ], #4
1987   moveq right_mask, right_mask, lsr #4
1988   streq color, [ fb_ptr ], #4
1989
       /* next 2 bits clear: one word store = 2 pixels */
1990   tst right_mask, #0x3
1991   streq color, [ fb_ptr ], #4
1992   moveq right_mask, right_mask, lsr #2
1993
       /* next bit clear: one halfword store = 1 pixel */
1994   tst right_mask, #0x1
1995   strheq color, [ fb_ptr ]
1996
1997  1:
       /* edge records are 8 bytes apart */
1998   add span_edge_data, span_edge_data, #8
1999   subs num_spans, num_spans, #1
2000   bne 0b
2001
2002   ldmia sp!, { r4 - r11, pc }
2003                                                                            
2004  5:
       /* unmasked last block: one whole vector store, no writeback needed */
2005   vst1.u32 { colors }, [ fb_ptr ]
2006   bal 1b
2007
2008
/*
 * Register map for the shaded untextured block-setup builders that follow.
 * Names left over from earlier routines are dropped and re-bound.
 * Several names intentionally share a physical register; on NEON, qN is the
 * pair d(2N) : d(2N+1), so d-register aliases overlap their parent q
 * register.
 */
2009 #undef c_64
2010
/* r2 is also the register named v_b at the top of the file */
2011 #define c_64                                              r7
2012 #define rg_dx_ptr                                         r2
2013
2014
2015 #undef r_block
2016 #undef g_block
2017 #undef b_block
2018 #undef r_whole
2019 #undef g_whole
2020 #undef b_whole
2021 #undef r_whole_low
2022 #undef r_whole_high
2023 #undef g_whole_low
2024 #undef g_whole_high
2025 #undef b_whole_low
2026 #undef b_whole_high
2027 #undef r_whole_8
2028 #undef g_whole_8
2029 #undef b_whole_8
2030 #undef dither_offsets
2031 #undef rg_dx4
2032 #undef rg_dx8
2033 #undef dx4
2034 #undef dx8
2035 #undef v_left_x
2036 #undef uvrg
2037 #undef block_span
2038 #undef rg
2039 #undef draw_mask
2040 #undef test_mask
2041
/* per-channel 32-bit accumulators, one q register each */
2042 #define r_block                                           q0
2043 #define g_block                                           q1
2044 #define b_block                                           q2
2045
/* narrowed per-channel values, one q register each */
2046 #define r_whole                                           q3
2047 #define g_whole                                           q4
2048 #define b_whole                                           q5
2049
/* low / high halves of r_whole (q3), g_whole (q4) and b_whole (q5) */
2050 #define r_whole_low                                       d6
2051 #define r_whole_high                                      d7
2052 #define g_whole_low                                       d8
2053 #define g_whole_high                                      d9
2054 #define b_whole_low                                       d10
2055 #define b_whole_high                                      d11
2056
/* q6 holds the 8-bit g (d12, low half) and b (d13, high half) together */
2057 #define gb_whole_8                                        q6
2058
2059 #define g_whole_8                                         d12
2060 #define b_whole_8                                         d13
2061
2062 #define r_whole_8                                         d14
2063
2064 #define pixels                                            q8
2065
/* d18 / d19 are the two halves of q9 */
2066 #define rg_dx4                                            d18
2067 #define rg_dx8                                            d19
2068
/* dx4 and dx8 name the same register (q10, also test_mask below); the
   builder re-materializes each with vdup right before it is used */
2069 #define dx4                                               q10
2070 #define dx8                                               q10
2071
/* span-setup temporaries that overlap the *_whole registers above:
   v_left_x = d6 (r_whole_low), uvrg = q4 (g_whole), block_span = q5
   (b_whole) */
2072 #define v_left_x                                          d6
2073 #define uvrg                                              q4
2074 #define block_span                                        q5
2075
/* d9 is the high half of uvrg (q4) */
2076 #define rg                                                d9
2077
/* byte constants; d22 / d23 are the two halves of q11 */
2078 #define d64_1                                             d22
2079 #define d64_128                                           d23
2080
2081 #define d128_4                                            q12
2082 #define d128_0x7                                          q13
2083
/* d24 is the low half of d128_4 (q12) */
2084 #define d64_4                                             d24
2085
2086 #define dither_offsets                                    q14
2087 #define draw_mask                                         q15
2088
/* d28 is the low half of dither_offsets (q14) */
2089 #define dither_offsets_low                                d28
2090
/* d0 is the low half of r_block (q0); the builder recomputes rg_dx from
   rg_dx4 for each span */
2091 #define rg_dx                                             d0
2092 #define test_mask                                         q10
2093
2094
/*
 * Dither hooks pasted into the shaded untextured builder as
 * setup_blocks_shaded_untextured_dither_{a,b}_<dithering>().
 *
 * Step a: add the dither offsets to the 8-bit r and g/b values with
 * unsigned saturation.
 */
2095 #define setup_blocks_shaded_untextured_dither_a_dithered()                     \
2096   vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low;                           \
2097   vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets;                             \
2098
/*
 * Step b: subtract the constant held in d64_4 / d128_4 with unsigned
 * saturation. The builder sets d128_4 to 4 in every byte and adds it to
 * dither_offsets beforehand, so this presumably removes that bias again.
 * The last instruction has no ';' - the call site supplies it.
 */
2099 #define setup_blocks_shaded_untextured_dither_b_dithered()                     \
2100   vqsub.u8 r_whole_8, r_whole_8, d64_4;                                        \
2101   vqsub.u8 gb_whole_8, gb_whole_8, d128_4                                      \
2102
/* Undithered variants: both steps expand to nothing. */
2103 #define setup_blocks_shaded_untextured_dither_a_undithered()                   \
2104
2105 #define setup_blocks_shaded_untextured_dither_b_undithered()                   \
2106
2107
2108 #define setup_blocks_shaded_untextured_indirect_builder(dithering)             \
2109 .align 3;                                                                      \
2110                                                                                \
2111 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
2112   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
2113   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2114                                                                                \
2115   vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ];                                      \
2116                                                                                \
2117   cmp num_spans, #0;                                                           \
2118   bxeq lr;                                                                     \
2119                                                                                \
2120   stmdb sp!, { r4 - r11, r14 };                                                \
2121   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2122                                                                                \
2123   ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ];                                 \
2124   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2125                                                                                \
2126   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2127                                                                                \
2128   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
2129   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2130                                                                                \
2131   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2132   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2133                                                                                \
2134   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
2135   vmov.u8 d64_1, #1;                                                           \
2136                                                                                \
2137   vmov.u8 d128_4, #4;                                                          \
2138   vmov.u8 d64_128, #128;                                                       \
2139                                                                                \
2140   vmov.u8 d128_0x7, #0x7;                                                      \
2141                                                                                \
2142  0:                                                                            \
2143   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
2144   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2145                                                                                \
2146   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
2147   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
2148                                                                                \
2149   cmp span_num_blocks, #0;                                                     \
2150   beq 1f;                                                                      \
2151                                                                                \
2152   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
2153   add num_blocks, span_num_blocks, num_blocks;                                 \
2154                                                                                \
2155   cmp num_blocks, #MAX_BLOCKS;                                                 \
2156   bgt 2f;                                                                      \
2157                                                                                \
2158  3:                                                                            \
2159   ldr b, [ span_b_offset ];                                                    \
2160   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2161                                                                                \
2162   vdup.u32 v_left_x, left_x;                                                   \
2163   and y, y, #0x3;                                                              \
2164                                                                                \
2165   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
2166   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2167                                                                                \
2168   mla b, b_dx, left_x, b;                                                      \
2169   and dither_shift, left_x, #0x03;                                             \
2170                                                                                \
2171   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
2172   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2173                                                                                \
2174   mov dither_shift, dither_shift, lsl #3;                                      \
2175   vmla.u32 rg, rg_dx, v_left_x;                                                \
2176                                                                                \
2177   mov c_64, #64;                                                               \
2178   subs span_num_blocks, span_num_blocks, #1;                                   \
2179                                                                                \
2180   mov dither_row, dither_row, ror dither_shift;                                \
2181   mov b_dx4, b_dx, lsl #2;                                                     \
2182                                                                                \
2183   vdup.u32 dither_offsets, dither_row;                                         \
2184   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2185                                                                                \
2186   vdup.u32 b_block, b;                                                         \
2187   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2188                                                                                \
2189   mov b_dx8, b_dx, lsl #3;                                                     \
2190   vdup.u32 r_block, rg[0];                                                     \
2191   vdup.u32 g_block, rg[1];                                                     \
2192                                                                                \
2193   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
2194                                                                                \
2195   vadd.u32 r_block, r_block, block_span;                                       \
2196   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
2197                                                                                \
2198   vadd.u32 g_block, g_block, block_span;                                       \
2199   vld1.u32 { block_span }, [ block_span_ptr, :128 ];                           \
2200                                                                                \
2201   vadd.u32 b_block, b_block, block_span;                                       \
2202   add block_ptr_b, block_ptr_a, #16;                                           \
2203                                                                                \
2204   vshrn.u32 r_whole_low, r_block, #16;                                         \
2205   vshrn.u32 g_whole_low, g_block, #16;                                         \
2206   vshrn.u32 b_whole_low, b_block, #16;                                         \
2207   vdup.u32 dx4, rg_dx4[0];                                                     \
2208                                                                                \
2209   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2210   vdup.u32 dx4, rg_dx4[1];                                                     \
2211                                                                                \
2212   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2213   vdup.u32 dx4, b_dx4;                                                         \
2214                                                                                \
2215   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2216   vdup.u32 dx8, rg_dx8[0];                                                     \
2217                                                                                \
2218   vadd.u32 r_block, r_block, dx8;                                              \
2219   vdup.u32 dx8, rg_dx8[1];                                                     \
2220                                                                                \
2221   vadd.u32 g_block, g_block, dx8;                                              \
2222   vdup.u32 dx8, b_dx8;                                                         \
2223                                                                                \
2224   vadd.u32 b_block, b_block, dx8;                                              \
2225                                                                                \
2226   vmovn.u16 r_whole_8, r_whole;                                                \
2227   vmovn.u16 g_whole_8, g_whole;                                                \
2228   vmovn.u16 b_whole_8, b_whole;                                                \
2229                                                                                \
2230   beq 5f;                                                                      \
2231   veor.u32 draw_mask, draw_mask, draw_mask;                                    \
2232                                                                                \
2233  4:                                                                            \
2234   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2235   vshrn.u32 r_whole_low, r_block, #16;                                         \
2236                                                                                \
2237   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2238   vshrn.u32 g_whole_low, g_block, #16;                                         \
2239                                                                                \
2240   vshrn.u32 b_whole_low, b_block, #16;                                         \
2241   str fb_ptr, [ block_ptr_a, #44 ];                                            \
2242                                                                                \
2243   vdup.u32 dx4, rg_dx4[0];                                                     \
2244   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2245   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2246                                                                                \
2247   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2248   vdup.u32 dx4, rg_dx4[1];                                                     \
2249                                                                                \
2250   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2251   vdup.u32 dx4, b_dx4;                                                         \
2252                                                                                \
2253   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2254   vdup.u32 dx8, rg_dx8[0];                                                     \
2255                                                                                \
2256   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2257   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2258   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2259                                                                                \
2260   vadd.u32 r_block, r_block, dx8;                                              \
2261   vdup.u32 dx8, rg_dx8[1];                                                     \
2262                                                                                \
2263   vadd.u32 g_block, g_block, dx8;                                              \
2264   vdup.u32 dx8, b_dx8;                                                         \
2265                                                                                \
2266   vadd.u32 b_block, b_block, dx8;                                              \
2267   add fb_ptr, fb_ptr, #16;                                                     \
2268                                                                                \
2269   vmovn.u16 r_whole_8, r_whole;                                                \
2270   vmovn.u16 g_whole_8, g_whole;                                                \
2271   vmovn.u16 b_whole_8, b_whole;                                                \
2272                                                                                \
2273   vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64;                         \
2274   vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64;                            \
2275                                                                                \
2276   pld [ fb_ptr ];                                                              \
2277                                                                                \
2278   subs span_num_blocks, span_num_blocks, #1;                                   \
2279   bne 4b;                                                                      \
2280                                                                                \
2281  5:                                                                            \
2282   str fb_ptr, [ block_ptr_a, #44 ];                                            \
2283   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2284                                                                                \
2285   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
2286   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2287                                                                                \
2288   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2289   vdup.u8 draw_mask, right_mask;                                               \
2290                                                                                \
2291   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2292   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
2293                                                                                \
2294   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
2295                                                                                \
2296   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2297   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2298   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2299                                                                                \
2300   vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64;                         \
2301   vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64;                            \
2302                                                                                \
2303  1:                                                                            \
2304   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2305   add span_b_offset, span_b_offset, #4;                                        \
2306                                                                                \
2307   add span_edge_data, span_edge_data, #8;                                      \
2308   subs num_spans, num_spans, #1;                                               \
2309                                                                                \
2310   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
2311   bne 0b;                                                                      \
2312                                                                                \
2313   ldmia sp!, { r4 - r11, pc };                                                 \
2314                                                                                \
2315  2:                                                                            \
2316   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
2317   vpush { rg_dx4 };                                                            \
2318                                                                                \
2319   stmdb sp!, { r0 - r3, r12, r14 };                                            \
2320   bl flush_render_block_buffer;                                                \
2321   ldmia sp!, { r0 - r3, r12, r14 };                                            \
2322                                                                                \
2323   vpop { rg_dx4 };                                                             \
2324                                                                                \
2325   vmov.u8 d64_1, #1;                                                           \
2326   vmov.u8 d128_4, #4;                                                          \
2327   vmov.u8 d64_128, #128;                                                       \
2328   vmov.u8 d128_0x7, #0x7;                                                      \
2329                                                                                \
2330   vadd.u32 rg_dx8, rg_dx4, rg_dx4;                                             \
2331                                                                                \
2332   mov num_blocks, span_num_blocks;                                             \
2333   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2334   bal 3b                                                                       \
2335
2336 /* Instantiate the indirect builder once for each dithering mode. */
2337 setup_blocks_shaded_untextured_indirect_builder(undithered)
2338 setup_blocks_shaded_untextured_indirect_builder(dithered)
2339
2340 /* Rebind draw_mask and add the aliases used by the direct builder below. */
2341 #undef draw_mask
2342
2343 #define mask_msb_ptr                                      r14
2344 /* d16/d17 are the two halves of q8; presumably `pixels` is q8 - confirm. */
2345 #define draw_mask                                         q0
2346 #define pixels_low                                        d16
2347 #define pixels_high                                       d17
2348 /* Emits setup_blocks_shaded_untextured_<dithering>_unswizzled_direct: for
2349  * each span it interpolates r/g/b per 8-pixel block, packs the result into
2350  * 16-bit pixels seeded with msb_mask and stores them directly at fb_ptr. */
2351 #define setup_blocks_shaded_untextured_direct_builder(dithering)               \
2352 .align 3;                                                                      \
2353                                                                                \
2354 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
2355   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
2356   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2357                                                                                \
2358   vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ];                                      \
2359   /* No spans: return before anything is pushed on the stack.                */\
2360   cmp num_spans, #0;                                                           \
2361   bxeq lr;                                                                     \
2362                                                                                \
2363   stmdb sp!, { r4 - r11, r14 };                                                \
2364   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2365                                                                                \
2366   ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ];                                 \
2367   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2368   /* rg_dx4 / rg_dx8: rg_dx scaled by 4 and by 8.                            */\
2369   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2370   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2371                                                                                \
2372   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2373   vmov.u8 d64_1, #1;                                                           \
2374   /* Byte constants for packing; d64_4 is presumably half of d128_4.         */\
2375   vmov.u8 d128_4, #4;                                                          \
2376   vmov.u8 d64_128, #128;                                                       \
2377                                                                                \
2378   vmov.u8 d128_0x7, #0x7;                                                      \
2379   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
2380   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
2381   /* 0: per-span loop. msb_mask_low/high hold mask_msb in every u16 lane.    */\
2382  0:                                                                            \
2383   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
2384   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2385                                                                                \
2386   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
2387   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
2388   /* Spans with zero blocks are skipped.                                     */\
2389   cmp span_num_blocks, #0;                                                     \
2390   beq 1f;                                                                      \
2391   /* fb_ptr = vram_out_ptr + (y << 11) + (left_x << 1).                      */\
2392   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
2393   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2394                                                                                \
2395   ldr b, [ span_b_offset ];                                                    \
2396   vdup.u32 v_left_x, left_x;                                                   \
2397   and y, y, #0x3;                                                              \
2398   /* Dither row is picked by y & 3, then rotated by (left_x & 3) * 8 bits.   */\
2399   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
2400   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2401   /* b += b_dx * left_x (value at the left edge of the span).                */\
2402   mla b, b_dx, left_x, b;                                                      \
2403   and dither_shift, left_x, #0x03;                                             \
2404   /* rg_dx is rebuilt from rg_dx4 (>> 2), then rg += rg_dx * left_x.         */\
2405   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
2406   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2407                                                                                \
2408   mov dither_shift, dither_shift, lsl #3;                                      \
2409   vmla.u32 rg, rg_dx, v_left_x;                                                \
2410   /* The subs below sets the flags that the later beq 3f tests.              */\
2411   subs span_num_blocks, span_num_blocks, #1;                                   \
2412                                                                                \
2413   mov dither_row, dither_row, ror dither_shift;                                \
2414   mov b_dx4, b_dx, lsl #2;                                                     \
2415                                                                                \
2416   vdup.u32 dither_offsets, dither_row;                                         \
2417   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2418   /* dither_offsets = dither_row in every u32 lane, then + 4 per byte.       */\
2419   vdup.u32 b_block, b;                                                         \
2420   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2421                                                                                \
2422   mov b_dx8, b_dx, lsl #3;                                                     \
2423   vdup.u32 r_block, rg[0];                                                     \
2424   vdup.u32 g_block, rg[1];                                                     \
2425   /* Add three consecutive 16-byte block_span vectors to r, g and b.         */\
2426   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
2427                                                                                \
2428   vadd.u32 r_block, r_block, block_span;                                       \
2429   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
2430                                                                                \
2431   vadd.u32 g_block, g_block, block_span;                                       \
2432   vld1.u32 { block_span }, [ block_span_ptr, :128 ];                           \
2433                                                                                \
2434   vadd.u32 b_block, b_block, block_span;                                       \
2435   add block_ptr_b, block_ptr_a, #16;                                           \
2436   /* NOTE(review): block_ptr_a/_b are not referenced elsewhere in here.      */\
2437   vshrn.u32 r_whole_low, r_block, #16;                                         \
2438   vshrn.u32 g_whole_low, g_block, #16;                                         \
2439   vshrn.u32 b_whole_low, b_block, #16;                                         \
2440   vdup.u32 dx4, rg_dx4[0];                                                     \
2441   /* x_whole_low = x_block >> 16; x_whole_high = (x_block + dx4) >> 16.      */\
2442   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2443   vdup.u32 dx4, rg_dx4[1];                                                     \
2444                                                                                \
2445   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2446   vdup.u32 dx4, b_dx4;                                                         \
2447                                                                                \
2448   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2449   vdup.u32 dx8, rg_dx8[0];                                                     \
2450   /* Step r, g and b on by dx8 to the start of the next 8-pixel block.       */\
2451   vadd.u32 r_block, r_block, dx8;                                              \
2452   vdup.u32 dx8, rg_dx8[1];                                                     \
2453                                                                                \
2454   vadd.u32 g_block, g_block, dx8;                                              \
2455   vdup.u32 dx8, b_dx8;                                                         \
2456                                                                                \
2457   vadd.u32 b_block, b_block, dx8;                                              \
2458   /* Narrow each 16-bit channel value to its low byte.                       */\
2459   vmovn.u16 r_whole_8, r_whole;                                                \
2460   vmovn.u16 g_whole_8, g_whole;                                                \
2461   vmovn.u16 b_whole_8, b_whole;                                                \
2462   /* A one-block span skips the loop and goes straight to the tail at 3.     */\
2463   beq 3f;                                                                      \
2464   /* 2: full-block loop; stores 8 pixels and sets up the next block.         */\
2465  2:                                                                            \
2466   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2467   vshrn.u32 r_whole_low, r_block, #16;                                         \
2468                                                                                \
2469   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2470   vshrn.u32 g_whole_low, g_block, #16;                                         \
2471                                                                                \
2472   vshrn.u32 b_whole_low, b_block, #16;                                         \
2473   /* r >>= 3; gb_whole_8 (presumably g and b) has low 3 bits cleared.        */\
2474   vdup.u32 dx4, rg_dx4[0];                                                     \
2475   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2476   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2477                                                                                \
2478   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2479   vdup.u32 dx4, rg_dx4[1];                                                     \
2480                                                                                \
2481   vmov pixels, msb_mask;                                                       \
2482   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2483   vdup.u32 dx4, b_dx4;                                                         \
2484                                                                                \
2485   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2486   vdup.u32 dx8, rg_dx8[0];                                                     \
2487   /* pixels = msb_mask + r + g * 4 + b * 128 (widened to 16 bits).           */\
2488   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2489   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2490   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2491                                                                                \
2492   vadd.u32 r_block, r_block, dx8;                                              \
2493   vdup.u32 dx8, rg_dx8[1];                                                     \
2494                                                                                \
2495   vadd.u32 g_block, g_block, dx8;                                              \
2496   vdup.u32 dx8, b_dx8;                                                         \
2497                                                                                \
2498   vadd.u32 b_block, b_block, dx8;                                              \
2499                                                                                \
2500   vmovn.u16 r_whole_8, r_whole;                                                \
2501   vmovn.u16 g_whole_8, g_whole;                                                \
2502   vmovn.u16 b_whole_8, b_whole;                                                \
2503   /* Write the 8 packed pixels and post-increment fb_ptr past them.          */\
2504   vst1.u32 { pixels }, [ fb_ptr ]!;                                            \
2505   subs span_num_blocks, span_num_blocks, #1;                                   \
2506   bne 2b;                                                                      \
2507   /* 3: final block of the span; its store goes through the jump table.      */\
2508  3:                                                                            \
2509   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2510                                                                                \
2511   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
2512   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2513                                                                                \
2514   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2515   rbit right_mask, right_mask;                                                 \
2516   vmov pixels, msb_mask;                                                       \
2517   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2518   clz right_mask, right_mask;                                                  \
2519   /* right_mask is now its own trailing-zero count (rbit then clz).          */\
2520   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2521   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2522   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2523   /* Dispatch on right_mask; JT_OP, JT_OP_REL and JTE are defined elsewhere. */\
2524   JT_OP_REL(100f, right_mask, temp);                                           \
2525   JT_OP(ldr pc, [ pc, right_mask, lsl #2 ]);                                   \
2526   nop;                                                                         \
2527  100:                                                                          \
2528   nop;                                                                         \
2529   .word JTE(100b, 4f);                                                         \
2530   .word JTE(100b, 5f);                                                         \
2531   .word JTE(100b, 6f);                                                         \
2532   .word JTE(100b, 7f);                                                         \
2533   .word JTE(100b, 8f);                                                         \
2534   .word JTE(100b, 9f);                                                         \
2535   .word JTE(100b, 10f);                                                        \
2536   .word JTE(100b, 11f);                                                        \
2537   /* Labels 4..11 store 1..8 pixels respectively, then branch to 1.          */\
2538  4:                                                                            \
2539   vst1.u16 { pixels_low[0] }, [ fb_ptr ];                                      \
2540   bal 1f;                                                                      \
2541                                                                                \
2542  5:                                                                            \
2543   vst1.u32 { pixels_low[0] }, [ fb_ptr ];                                      \
2544   bal 1f;                                                                      \
2545                                                                                \
2546  6:                                                                            \
2547   vst1.u32 { pixels_low[0] }, [ fb_ptr ]!;                                     \
2548   vst1.u16 { pixels_low[2] }, [ fb_ptr ];                                      \
2549   bal 1f;                                                                      \
2550                                                                                \
2551  7:                                                                            \
2552   vst1.u32 { pixels_low }, [ fb_ptr ];                                         \
2553   bal 1f;                                                                      \
2554                                                                                \
2555  8:                                                                            \
2556   vst1.u32 { pixels_low }, [ fb_ptr ]!;                                        \
2557   vst1.u16 { pixels_high[0] }, [ fb_ptr ];                                     \
2558   bal 1f;                                                                      \
2559                                                                                \
2560  9:                                                                            \
2561   vst1.u32 { pixels_low }, [ fb_ptr ]!;                                        \
2562   vst1.u32 { pixels_high[0] }, [ fb_ptr ]!;                                    \
2563   bal 1f;                                                                      \
2564                                                                                \
2565  10:                                                                           \
2566   vst1.u32 { pixels_low }, [ fb_ptr ]!;                                        \
2567   vst1.u32 { pixels_high[0] }, [ fb_ptr ]!;                                    \
2568   vst1.u16 { pixels_high[2] }, [ fb_ptr ];                                     \
2569   bal 1f;                                                                      \
2570                                                                                \
2571  11:                                                                           \
2572   vst1.u32 { pixels }, [ fb_ptr ];                                             \
2573   bal 1f;                                                                      \
2574   /* 1: step the uvrg, b and edge-data pointers on to the next span.         */\
2575  1:                                                                            \
2576   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2577   add span_b_offset, span_b_offset, #4;                                        \
2578                                                                                \
2579   add span_edge_data, span_edge_data, #8;                                      \
2580   subs num_spans, num_spans, #1;                                               \
2581                                                                                \
2582   bne 0b;                                                                      \
2583   /* Pop the saved registers; loading pc from the saved r14 returns.         */\
2584   ldmia sp!, { r4 - r11, pc }                                                  \
2585
2586 setup_blocks_shaded_untextured_direct_builder(undithered)
2587 setup_blocks_shaded_untextured_direct_builder(dithered)
2588 /* The two lines above emit the undithered and dithered direct variants. */
2589 /* Undefine earlier aliases ahead of the register map used by texture_blocks_*. */
2590 #undef psx_gpu
2591 #undef num_blocks
2592 #undef triangle
2593 #undef c_64
2594 /* uv_1/3/5/7 share r3..r6 with the packed pairs uv_01/uv_23/uv_45/uv_67. */
2595 #define psx_gpu                                  r0
2596 #define block_ptr                                r1
2597 #define num_blocks                               r2
2598 #define uv_01                                    r3
2599 #define uv_23                                    r4
2600 #define uv_45                                    r5
2601 #define uv_67                                    r6
2602 #define uv_0                                     r7
2603 #define uv_1                                     r3
2604 #define uv_2                                     r8
2605 #define uv_3                                     r4
2606 #define uv_4                                     r9
2607 #define uv_5                                     r5
2608 #define uv_6                                     r10
2609 #define uv_7                                     r6
2610 #define texture_ptr                              r11
2611 /* Each pixel_N reuses the register of the matching uv_N. */
2612 #define pixel_0                                  r7
2613 #define pixel_1                                  r3
2614 #define pixel_2                                  r8
2615 #define pixel_3                                  r4
2616 #define pixel_4                                  r9
2617 #define pixel_5                                  r5
2618 #define pixel_6                                  r10
2619 #define pixel_7                                  r6
2620 /* pixels_a..d alias r7, r9, r8, r10 (same as pixel_0, pixel_4, pixel_2, pixel_6). */
2621 #define pixels_a                                 r7
2622 #define pixels_b                                 r9
2623 #define pixels_c                                 r8
2624 #define pixels_d                                 r10
2625 /* c_64 shares r0 with psx_gpu, so writing c_64 overwrites psx_gpu. */
2626 #define c_64                                     r0
2627 /* The two texture masks share r5/r6 with uv_45/uv_67. */
2628 #define clut_ptr                                 r12
2629 #define current_texture_mask                     r5
2630 #define dirty_textures_mask                      r6
2631
2632 #define texels                                   d0
2633 /* d2:d3 form q1 (clut_a) and d4:d5 form q2 (clut_b). */
2634 #define clut_low_a                               d2
2635 #define clut_low_b                               d3
2636 #define clut_high_a                              d4
2637 #define clut_high_b                              d5
2638
2639 #define clut_a                                   q1
2640 #define clut_b                                   q2
2641
2642 #define texels_low                               d6
2643 #define texels_high                              d7
2644
2645 .align 3
2646
/*
 * texture_blocks_untextured
 *
 * Does nothing and returns to the caller immediately.  Judging by the name
 * this is the texture stage used when there is no texture to fetch.
 */
2647 function(texture_blocks_untextured)
2648   bx lr
2649
2650
2651 .align 3
2652
/*
 * texture_blocks_4bpp(psx_gpu)
 *
 * In:  r0 = psx_gpu
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - read eight 16-bit texel offsets from the start of the block,
 *   - fetch one byte per texel from texture_page_ptr + offset,
 *   - use the eight bytes as indices into the 16-entry CLUT (vtbl), and
 *   - write the eight resulting 16-bit pixels back over the offsets.
 *
 * The CLUT (16 halfwords at clut_ptr, 16-byte aligned) is loaded once and
 * split by vuzp.u8 into a table of low bytes (clut_a) and a table of high
 * bytes (clut_b); vst2.u8 re-interleaves the two lookup results.
 *
 * If (dirty_textures_4bpp_mask & current_texture_mask) is non-zero,
 * update_texture_4bpp_cache(psx_gpu) is called first.
 *
 * r3 - r11 and lr are preserved.  r0 (reused as c_64), r1, r2, r12,
 * d0 - d7 and the flags are clobbered.  A zero num_blocks is not handled:
 * the count is decremented before it is tested.
 */
function(texture_blocks_4bpp)
  stmdb sp!, { r3 - r11, r14 }
  add block_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
  vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]

  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
  /* clut_a = low bytes of the 16 entries, clut_b = high bytes */
  vuzp.u8 clut_a, clut_b

  ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
  tst dirty_textures_mask, current_texture_mask

  /* r0 must still be psx_gpu on the slow path, so c_64 is set afterwards */
  bne 1f
  mov c_64, #64

0:
  ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }

  /* address math for texel N+1.. is interleaved with the byte loads */
  uxtah uv_0, texture_ptr, uv_01
  uxtah uv_1, texture_ptr, uv_01, ror #16

  uxtah uv_2, texture_ptr, uv_23
  uxtah uv_3, texture_ptr, uv_23, ror #16

  uxtah uv_4, texture_ptr, uv_45
  ldrb pixel_0, [ uv_0 ]

  uxtah uv_5, texture_ptr, uv_45, ror #16
  ldrb pixel_1, [ uv_1 ]

  uxtah uv_6, texture_ptr, uv_67
  ldrb pixel_2, [ uv_2 ]

  uxtah uv_7, texture_ptr, uv_67, ror #16
  ldrb pixel_3, [ uv_3 ]

  ldrb pixel_4, [ uv_4 ]
  /* flags set here are consumed by the bne at the bottom of the loop */
  subs num_blocks, num_blocks, #1

  ldrb pixel_5, [ uv_5 ]
  orr pixels_a, pixel_0, pixel_1, lsl #8

  ldrb pixel_6, [ uv_6 ]
  orr pixels_b, pixel_4, pixel_5, lsl #8

  ldrb pixel_7, [ uv_7 ]
  orr pixels_a, pixels_a, pixel_2, lsl #16

  orr pixels_b, pixels_b, pixel_6, lsl #16
  orr pixels_a, pixels_a, pixel_3, lsl #24

  orr pixels_b, pixels_b, pixel_7, lsl #24
  /* texels = the eight indices, one per byte lane */
  vmov texels, pixels_a, pixels_b

  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels

  /* interleave low/high bytes into eight halfwords, then step to next block */
  vst2.u8 { texels_low, texels_high }, [ block_ptr, :128 ], c_64
  bne 0b

  ldmia sp!, { r3 - r11, pc }

1:
  /*
   * Texture cache is stale: rebuild it before sampling.  Everything the
   * loop needs that the callee is allowed to clobber must be saved:
   * r1/r2 (block_ptr, num_blocks) and q1/q2 (d2 - d5), which hold the
   * de-interleaved CLUT and are caller-saved under the AAPCS.
   * 40 (prologue) + 8 + 32 bytes keeps sp 8-byte aligned at the call.
   */
  stmdb sp!, { r1 - r2 }
  vpush { clut_a, clut_b }

  bl update_texture_4bpp_cache

  vpop { clut_a, clut_b }
  mov c_64, #64
  ldmia sp!, { r1 - r2 }
  bal 0b
2726
2727
2728 .align 3
2729
/*
 * texture_blocks_8bpp(psx_gpu)
 *
 * In:  r0 = psx_gpu
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - read eight 16-bit texel offsets from the start of the block,
 *   - fetch one byte per texel from texture_page_ptr + offset,
 *   - double it and use it as a byte offset into the CLUT at clut_ptr to
 *     fetch the 16-bit color, and
 *   - write the eight colors back over the offsets.
 *
 * If (dirty_textures_8bpp_mask & current_texture_mask) is non-zero,
 * update_texture_8bpp_cache(psx_gpu) is called first.
 *
 * r3 - r11 and lr are preserved.  r1, r2, r12 and the flags are clobbered.
 * A zero num_blocks is not handled: the count is decremented before it is
 * tested.
 */
function(texture_blocks_8bpp)
  stmdb sp!, { r3 - r11, r14 }
  add block_ptr, psx_gpu, #psx_gpu_blocks_offset

  ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
  ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]

  ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]
  tst dirty_textures_mask, current_texture_mask

  bne 1f
  nop

0:
  ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }

  /* texel addresses; each uv_N later becomes pixel_N in the same register */
  uxtah uv_0, texture_ptr, uv_01
  uxtah uv_1, texture_ptr, uv_01, ror #16

  uxtah uv_2, texture_ptr, uv_23
  uxtah uv_3, texture_ptr, uv_23, ror #16

  uxtah uv_4, texture_ptr, uv_45
  ldrb pixel_0, [ uv_0 ]

  uxtah uv_5, texture_ptr, uv_45, ror #16
  ldrb pixel_1, [ uv_1 ]

  uxtah uv_6, texture_ptr, uv_67
  ldrb pixel_2, [ uv_2 ]

  uxtah uv_7, texture_ptr, uv_67, ror #16
  ldrb pixel_3, [ uv_3 ]

  /* index * 2 = byte offset of the halfword CLUT entry */
  ldrb pixel_4, [ uv_4 ]
  add pixel_0, pixel_0, pixel_0

  ldrb pixel_5, [ uv_5 ]
  add pixel_1, pixel_1, pixel_1

  ldrb pixel_6, [ uv_6 ]
  add pixel_2, pixel_2, pixel_2

  ldrb pixel_7, [ uv_7 ]
  add pixel_3, pixel_3, pixel_3

  ldrh pixel_0, [ clut_ptr, pixel_0 ]
  add pixel_4, pixel_4, pixel_4

  ldrh pixel_1, [ clut_ptr, pixel_1 ]
  add pixel_5, pixel_5, pixel_5

  ldrh pixel_2, [ clut_ptr, pixel_2 ]
  add pixel_6, pixel_6, pixel_6

  ldrh pixel_3, [ clut_ptr, pixel_3 ]
  add pixel_7, pixel_7, pixel_7

  ldrh pixel_4, [ clut_ptr, pixel_4 ]
  orr pixels_a, pixel_0, pixel_1, lsl #16

  ldrh pixel_5, [ clut_ptr, pixel_5 ]
  orr pixels_c, pixel_2, pixel_3, lsl #16

  ldrh pixel_6, [ clut_ptr, pixel_6 ]
  /* flags set here are consumed by the bne at the bottom of the loop */
  subs num_blocks, num_blocks, #1

  ldrh pixel_7, [ clut_ptr, pixel_7 ]
  orr pixels_b, pixel_4, pixel_5, lsl #16

  orr pixels_d, pixel_6, pixel_7, lsl #16
  /* register order r7, r8, r9, r10 = pixels 0-1, 2-3, 4-5, 6-7 */
  stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d }

  add block_ptr, block_ptr, #64
  bne 0b

  ldmia sp!, { r3 - r11, pc }

1:
  /*
   * Texture cache is stale: rebuild it before sampling.  r1, r2 and r12
   * (block_ptr, num_blocks, clut_ptr) are caller-saved and needed by the
   * loop.  r3 is pushed only as padding: the prologue already put 40 bytes
   * on the stack, and a fourth word here keeps sp 8-byte aligned at the
   * call as the AAPCS requires.  r3 is reloaded at the top of the loop.
   */
  stmdb sp!, { r1 - r3, r12 }

  bl update_texture_8bpp_cache

  ldmia sp!, { r1 - r3, r12 }
  bal 0b
2818
2819
/* Drop the 4bpp/8bpp aliases that are redefined below for the 16bpp loop. */
2820 #undef uv_0
2821 #undef uv_1
2822 #undef uv_2
2823 #undef uv_3
2824 #undef uv_4
2825 #undef uv_5
2826 #undef uv_6
2827 #undef uv_7
2828
2829 #undef pixel_0
2830 #undef pixel_1
2831 #undef pixel_2
2832 #undef pixel_3
2833 #undef pixel_4
2834 #undef pixel_5
2835 #undef pixel_6
2836 #undef pixel_7
2837
2838 #undef texture_ptr
2839
2840 #undef pixels_a
2841 #undef pixels_b
2842 #undef pixels_c
2843 #undef pixels_d
2844
/* Register aliases for texture_blocks_16bpp. */
2845 #define psx_gpu                                  r0
2846 #define block_ptr                                r1
2847 #define num_blocks                               r2
2848
/*
 * Texels are handled in pairs.  For each pair, uv_N and u_N share a
 * register, and v_N uses the register that the NEXT pair's uv will be
 * loaded into (v_0/v_1 = r5/r6 = uv_2/uv_3, and so on), so a pair's v must
 * be consumed before the next pair's uv is loaded.
 */
2849 #define uv_0                                     r3
2850 #define uv_1                                     r4
2851 #define u_0                                      r3
2852 #define u_1                                      r4
2853 #define v_0                                      r5
2854 #define v_1                                      r6
2855
2856 #define uv_2                                     r5
2857 #define uv_3                                     r6
2858 #define u_2                                      r5
2859 #define u_3                                      r6
2860 #define v_2                                      r7
2861 #define v_3                                      r8
2862
2863 #define uv_4                                     r7
2864 #define uv_5                                     r8
2865 #define u_4                                      r7
2866 #define u_5                                      r8
2867 #define v_4                                      r9
2868 #define v_5                                      r10
2869
/* v_7 reuses r0, which also carries psx_gpu on entry. */
2870 #define uv_6                                     r9
2871 #define uv_7                                     r10
2872 #define u_6                                      r9
2873 #define u_7                                      r10
2874 #define v_6                                      r11
2875 #define v_7                                      r0
2876
/* pixel_N: fetched texel N; it replaces uv_N in the same register. */
2877 #define pixel_0                                  r3
2878 #define pixel_1                                  r4
2879 #define pixel_2                                  r5
2880 #define pixel_3                                  r6
2881 #define pixel_4                                  r7
2882 #define pixel_5                                  r8
2883 #define pixel_6                                  r9
2884 #define pixel_7                                  r10
2885
/* pixels_a..d: two texels packed per word, in ascending register order. */
2886 #define pixels_a                                 r3
2887 #define pixels_b                                 r5
2888 #define pixels_c                                 r7
2889 #define pixels_d                                 r9
2890
2891 #define texture_ptr                              r12
2892
2893
2894 .align 3
2895
/*
 * texture_blocks_16bpp(psx_gpu)
 *
 * In:  r0 = psx_gpu
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) reads eight 16-bit uv values from the
 * start of the block, fetches the addressed 16-bit texels relative to
 * texture_page_ptr and writes them back over the uv values.
 *
 * Each uv is split into its low byte (u) and high byte (v); the byte offset
 * used for the fetch is 2 * (u + v * 1024).
 *
 * Address math and loads for neighbouring texel pairs are interleaved and
 * several aliases share a register (for example v_0 and uv_2 are both r5),
 * so the statement order is significant.
 *
 * r3 - r11 and lr are preserved.  r0 (reused as v_7), r1, r2, r12 and the
 * flags are clobbered.  A zero num_blocks is not handled: the count is
 * decremented before it is tested.
 */
2896 function(texture_blocks_16bpp)
2897   stmdb sp!, { r3 - r11, r14 }
2898   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2899
2900   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2901   ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2902
2903 0:
2904   ldrh uv_0, [ block_ptr ]
/* flags set here survive to the bne at the bottom: nothing else sets them */
2905   subs num_blocks, num_blocks, #1
2906
2907   ldrh uv_1, [ block_ptr, #2 ]
2908
/* texels 0/1: v = uv & 0xFF00 (v * 256), u = uv & 0xFF */
2909   and v_0, uv_0, #0xFF00
2910   and v_1, uv_1, #0xFF00
2911
2912   and u_0, uv_0, #0xFF
2913   and u_1, uv_1, #0xFF
2914
/* u + v * 1024; r5/r6 are free again and take the next pair's uv */
2915   add uv_0, u_0, v_0, lsl #2
2916   ldrh uv_2, [ block_ptr, #4 ]
2917
2918   add uv_1, u_1, v_1, lsl #2
2919   ldrh uv_3, [ block_ptr, #6 ]
2920
/* double the index to get a byte offset to a 16-bit texel */
2921   add uv_0, uv_0, uv_0
2922   add uv_1, uv_1, uv_1
2923
/* texels 2/3 */
2924   and v_2, uv_2, #0xFF00
2925   and v_3, uv_3, #0xFF00
2926
2927   and u_2, uv_2, #0xFF
2928   and u_3, uv_3, #0xFF
2929
2930   add uv_2, u_2, v_2, lsl #2
2931   ldrh uv_4, [ block_ptr, #8 ]
2932
2933   add uv_3, u_3, v_3, lsl #2
2934   ldrh uv_5, [ block_ptr, #10 ]
2935
2936   add uv_2, uv_2, uv_2
2937   add uv_3, uv_3, uv_3
2938
/* texels 4/5 */
2939   and v_4, uv_4, #0xFF00
2940   and v_5, uv_5, #0xFF00
2941
2942   and u_4, uv_4, #0xFF
2943   and u_5, uv_5, #0xFF
2944
2945   add uv_4, u_4, v_4, lsl #2
2946   ldrh uv_6, [ block_ptr, #12 ]
2947
2948   add uv_5, u_5, v_5, lsl #2
2949   ldrh uv_7, [ block_ptr, #14 ]
2950
/* texel fetches for 0-5 are interleaved with the address math for 6/7 */
2951   add uv_4, uv_4, uv_4
2952   ldrh pixel_0, [ texture_ptr, uv_0 ]
2953
2954   add uv_5, uv_5, uv_5
2955   ldrh pixel_1, [ texture_ptr, uv_1 ]
2956
2957   and v_6, uv_6, #0xFF00
2958   ldrh pixel_2, [ texture_ptr, uv_2 ]
2959
2960   and v_7, uv_7, #0xFF00
2961   ldrh pixel_3, [ texture_ptr, uv_3 ]
2962
2963   and u_6, uv_6, #0xFF
2964   ldrh pixel_4, [ texture_ptr, uv_4 ]
2965
2966   and u_7, uv_7, #0xFF
2967   ldrh pixel_5, [ texture_ptr, uv_5 ]
2968
2969   add uv_6, u_6, v_6, lsl #2
2970   add uv_7, u_7, v_7, lsl #2
2971
2972   add uv_6, uv_6, uv_6
2973   add uv_7, uv_7, uv_7
2974
/* pack two texels per word, lower-numbered texel in the low halfword */
2975   orr pixels_a, pixel_0, pixel_1, lsl #16
2976   orr pixels_b, pixel_2, pixel_3, lsl #16
2977
2978   ldrh pixel_6, [ texture_ptr, uv_6 ]
2979   orr pixels_c, pixel_4, pixel_5, lsl #16
2980
2981   ldrh pixel_7, [ texture_ptr, uv_7 ]
2982   orr pixels_d, pixel_6, pixel_7, lsl #16
2983
/* overwrite the eight uv halfwords with the eight texels, then next block */
2984   stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
2985   add block_ptr, block_ptr, #64
2986
2987   bne 0b
2988
2989   ldmia sp!, { r3 - r11, pc }
2990
2991
/* Drop aliases from earlier sections that are redefined below. */
2992 #undef num_blocks
2993
2994 #undef test_mask
2995 #undef texels
2996 #undef pixels_b
2997 #undef pixels
2998 #undef d64_1
2999 #undef d64_4
3000 #undef d64_128
3001 #undef draw_mask
3002 #undef msb_mask
3003 #undef msb_mask_low
3004 #undef msb_mask_high
3005 #undef fb_pixels
3006
3007 #undef c_32
3008 #undef fb_ptr
3009 #undef mask_msb_ptr
3010
/*
 * Register aliases for the shade_blocks_*_textured_modulated_* routines.
 * r2 and r3 each serve several purposes at different points in time
 * (r2: color_ptr / colors_scalar / mask_msb_ptr, then c_32;
 *  r3: colors_scalar_compare, then block_ptr_store).
 */
3011 #define psx_gpu                                  r0
3012 #define num_blocks                               r1
3013 #define color_ptr                                r2
3014 #define colors_scalar                            r2
3015 #define colors_scalar_compare                    r3
3016 #define mask_msb_ptr                             r2
3017
/* block_ptr_load_a takes over r0 once psx_gpu is no longer needed. */
3018 #define block_ptr_load_a                         r0
3019 #define block_ptr_store                          r3
3020 #define block_ptr_load_b                         r12
3021 #define c_32                                     r2
3022
/* r4, r5 and r14 are saved by the builder's prologue before they are used. */
3023 #define c_48                                     r4
3024 #define fb_ptr                                   r14
3025 #define draw_mask_bits_scalar                    r5
3026
/*
 * NEON aliases.  Some Q registers are shared by values with disjoint
 * lifetimes: zero_mask/texels (q4), fb_pixels/pixels_r (q8) and
 * pixels_gb_low/pixels_g (q9).
 */
3027 #define d128_0x07                                q0
3028 #define d128_0x1F                                q1
3029 #define d128_0x8000                              q2
3030 #define test_mask                                q3
3031 #define texels                                   q4
3032 #define colors_rg                                q5
3033 #define colors_b_dm_bits                         q6
3034 #define texels_rg                                q7
3035 #define pixels_r                                 q8
3036 #define pixels_g                                 q9
3037 #define pixels_b                                 q10
3038 #define pixels                                   q11
3039 #define zero_mask                                q4
3040 #define draw_mask                                q12
3041 #define msb_mask                                 q13
3042
3043 #define fb_pixels                                q8
3044
3045 #define pixels_gb_low                            q9
3046
/*
 * D-register views: d10/d11 are the halves of colors_rg (q5), d12/d13 of
 * colors_b_dm_bits (q6), d14/d15 of texels_rg (q7), d16 is the low half of
 * q8, d18/d19 are the halves of pixels_gb_low (q9) and d26/d27 of
 * msb_mask (q13).
 */
3047 #define colors_r                                 d10
3048 #define colors_g                                 d11
3049 #define colors_b                                 d12
3050 #define draw_mask_bits                           d13
3051 #define texels_r                                 d14
3052 #define texels_g                                 d15
3053 #define pixels_r_low                             d16
3054 #define pixels_g_low                             d18
3055 #define pixels_b_low                             d19
3056 #define msb_mask_low                             d26
3057 #define msb_mask_high                            d27
3058
/* Byte constants 1, 4 and 128 used as multipliers when packing channels. */
3059 #define d64_1                                    d28
3060 #define d64_4                                    d29
3061 #define d64_128                                  d30
3062 #define texels_b                                 d31
3063
/*
 * Prologue pieces for shade_blocks_textured_modulated_builder, selected by
 * token pasting on the target (indirect/direct) and shading
 * (shaded/unshaded) arguments.
 *
 * indirect: c_48 = 48 and block_ptr_store = address of the first block.
 */
3064 #define shade_blocks_textured_modulated_prologue_indirect()                    \
3065   mov c_48, #48;                                                               \
3066   add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset                         \
3067
/* direct: broadcast the 16-bit value at psx_gpu_mask_msb_offset to msb_mask. */
3068 #define shade_blocks_textured_modulated_prologue_direct()                      \
3069   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3070   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]          \
3071
3072
/* shaded: nothing to set up (expands to nothing). */
3073 #define shade_blocks_textured_modulated_prologue_shaded(dithering, target)     \
3074   
/*
 * undithered check: if the triangle color word equals 0x00808080, branch
 * to shade_blocks_textured_unmodulated_<target> instead of modulating.
 * The builder expands this before it pushes anything, so the branch acts
 * as a tail call.
 */
3075 #define shade_blocks_textured_false_modulation_check_undithered(target)        \
3076   ldr colors_scalar, [ psx_gpu, #psx_gpu_triangle_color_offset ];              \
3077   movw colors_scalar_compare, #0x8080;                                         \
3078                                                                                \
3079   movt colors_scalar_compare, #0x80;                                           \
3080   cmp colors_scalar, colors_scalar_compare;                                    \
3081   beq shade_blocks_textured_unmodulated_##target                               \
3082
/* dithered check: no shortcut is taken (expands to nothing). */
3083 #define shade_blocks_textured_false_modulation_check_dithered(target)          \
3084
/*
 * unshaded: run the check above, then load the triangle color word and
 * broadcast its bytes 0, 1 and 2 to colors_r, colors_g and colors_b.
 * colors_r is written last because it is also the source of the other two.
 */
3085 #define shade_blocks_textured_modulated_prologue_unshaded(dithering, target)   \
3086   shade_blocks_textured_false_modulation_check_##dithering(target);            \
3087   add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset;                      \
3088   vld1.u32 { colors_r[] }, [ color_ptr, :32 ];                                 \
3089   vdup.u8 colors_g, colors_r[1];                                               \
3090   vdup.u8 colors_b, colors_r[2];                                               \
3091   vdup.u8 colors_r, colors_r[0]                                                \
3092
3093
/*
 * Dithered: preload a channel accumulator with the 16 bytes at
 * block_ptr_load_b.  The same address is read for every channel; only the
 * "last" form advances the pointer (by c_32).
 */
3094 #define shade_blocks_textured_modulated_load_dithered(target)                  \
3095   vld1.u32 { target }, [ block_ptr_load_b, :128 ]                              \
3096
3097 #define shade_blocks_textured_modulated_load_last_dithered(target)             \
3098   vld1.u32 { target }, [ block_ptr_load_b, :128 ], c_32                        \
3099
/*
 * Undithered: nothing is preloaded; the "last" form only advances
 * block_ptr_load_b by the same 32 bytes so both variants stay in step.
 */
3100 #define shade_blocks_textured_modulated_load_undithered(target)                \
3101
3102 #define shade_blocks_textured_modulated_load_last_undithered(target)           \
3103   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3104
/*
 * channel is r, g or b.  Dithered accumulates texel * color onto the
 * preloaded value (vmlal); undithered writes the plain product (vmull).
 */
3105 #define shade_blocks_textured_modulate_dithered(channel)                       \
3106   vmlal.u8 pixels_##channel, texels_##channel, colors_##channel                \
3107
3108 #define shade_blocks_textured_modulate_undithered(channel)                     \
3109   vmull.u8 pixels_##channel, texels_##channel, colors_##channel                \
3110
3111
/*
 * indirect: write draw_mask to block_ptr_store and advance it by 16 bytes
 * (writeback).  The offset argument is unused.
 */
3112 #define shade_blocks_textured_modulated_store_draw_mask_indirect(offset)       \
3113   vst1.u32 { draw_mask }, [ block_ptr_store, :128 ]!                           \
3114
/*
 * direct: fetch the block's framebuffer pointer from
 * block_ptr_load_b + (offset - 64), load the eight pixels stored there and
 * copy them into the lanes of pixels whose draw_mask bits are set (vbit).
 * offset compensates for how far block_ptr_load_b has advanced at the
 * point of expansion.
 */
3115 #define shade_blocks_textured_modulated_store_draw_mask_direct(offset)         \
3116   ldr fb_ptr, [ block_ptr_load_b, #(offset - 64) ];                            \
3117   vld1.u32 { fb_pixels }, [ fb_ptr ];                                          \
3118   vbit.u16 pixels, fb_pixels, draw_mask                                        \
3119
/*
 * indirect: store pixels after the draw mask and advance by c_48, for a
 * total of 64 bytes per block together with the mask store above.
 */
3120 #define shade_blocks_textured_modulated_store_pixels_indirect()                \
3121   vst1.u32 { pixels }, [ block_ptr_store, :128 ], c_48                         \
3122
/* direct: write the merged pixels to the address loaded into fb_ptr. */
3123 #define shade_blocks_textured_modulated_store_pixels_direct()                  \
3124   vst1.u32 { pixels }, [ fb_ptr ]                                              \
3125
3126
/*
 * Per-block input loads.  The shaded forms read per-pixel colors from the
 * block; the unshaded forms only advance the pointers by the same 32 bytes
 * because the colors were broadcast once in the prologue.
 */
3127 #define shade_blocks_textured_modulated_load_rg_shaded()                       \
3128   vld1.u32 { colors_r, colors_g }, [ block_ptr_load_b, :128 ], c_32            \
3129
3130 #define shade_blocks_textured_modulated_load_rg_unshaded()                     \
3131   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3132
/* shaded: colors_b and the draw mask bits arrive in one 16-byte load. */
3133 #define shade_blocks_textured_modulated_load_bdm_shaded()                      \
3134   vld1.u32 { colors_b, draw_mask_bits }, [ block_ptr_load_a, :128 ], c_32      \
3135
/*
 * unshaded: only the draw mask bits are fetched, as a scalar from the
 * second half of the same 16 bytes (byte offset 8).
 */
3136 #define shade_blocks_textured_modulated_load_bdm_unshaded()                    \
3137   ldr draw_mask_bits_scalar, [ block_ptr_load_a, #8 ];                         \
3138   add block_ptr_load_a, block_ptr_load_a, #32                                  \
3139
/* Broadcast the low 16 draw mask bits to every lane of draw_mask. */
3140 #define shade_blocks_textured_modulated_expand_draw_mask_shaded()              \
3141   vdup.u16 draw_mask, draw_mask_bits[0]                                        \
3142
3143 #define shade_blocks_textured_modulated_expand_draw_mask_unshaded()            \
3144   vdup.u16 draw_mask, draw_mask_bits_scalar                                    \
3145
3146
/* indirect: pixels are left as they are (expands to nothing). */
3147 #define shade_blocks_textured_modulated_apply_msb_mask_indirect()              \
3148
/* direct: OR msb_mask into every pixel before it is written out. */
3149 #define shade_blocks_textured_modulated_apply_msb_mask_direct()                \
3150   vorr.u16 pixels, pixels, msb_mask                                            \
3151
3152
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Emits shade_blocks_<shading>_textured_modulated_<dithering>_<target>,
 * called with r0 = psx_gpu.  shading is shaded/unshaded, dithering is
 * dithered/undithered, target is direct/indirect; each argument selects
 * the matching helper macros by token pasting.
 *
 * Per block of eight 16-bit texels:
 *   - each texel is split into three 5-bit channels (bits 0-4, 5-9 and
 *     10-14), multiplied by the matching 8-bit color (on top of the
 *     preloaded offsets when dithered), narrowed with unsigned saturation
 *     (>> 4) and reduced back to 5 bits;
 *   - the channels are re-packed on top of the texel's bit 15 by
 *     multiply-accumulating with 1, 4 and 128;
 *   - draw_mask = lanes where (draw mask bits & test_mask) is non-zero,
 *     OR lanes whose texel is zero.  test_mask is loaded from the start of
 *     psx_gpu.
 *
 * The code is software pipelined: the straight-line part before label 0
 * starts the first block, every loop iteration finishes (packs and stores)
 * the previous block while starting the next one, and the code at label 1
 * finishes the last block.  The store_draw_mask offsets -4 and 28 differ
 * because block_ptr_load_b has advanced by a different amount at those two
 * points.
 *
 * The shading prologue is expanded before the register push, so the
 * unshaded/undithered shortcut can branch away with the stack untouched.
 * r4, r5 and lr are saved; lr is reused as fb_ptr by the direct target.
 * A zero num_blocks is not handled: the count is decremented before it is
 * tested.
 */
3153 #define shade_blocks_textured_modulated_builder(shading, dithering, target)    \
3154 .align 3;                                                                      \
3155                                                                                \
3156 function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
3157   shade_blocks_textured_modulated_prologue_##shading(dithering, target);       \
3158   stmdb sp!, { r4 - r5, lr };                                                  \
3159   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
3160                                                                                \
3161   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
3162                                                                                \
3163   shade_blocks_textured_modulated_prologue_##target();                         \
3164                                                                                \
3165   add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset;                       \
3166   mov c_32, #32;                                                               \
3167                                                                                \
3168   add block_ptr_load_b, block_ptr_load_a, #16;                                 \
3169   vmov.u8 d64_1, #1;                                                           \
3170   vmov.u8 d64_4, #4;                                                           \
3171   vmov.u8 d64_128, #128;                                                       \
3172                                                                                \
3173   vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32;                       \
3174   vmov.u8 d128_0x07, #0x07;                                                    \
3175                                                                                \
3176   shade_blocks_textured_modulated_load_rg_##shading();                         \
3177   vmov.u8 d128_0x1F, #0x1F;                                                    \
3178                                                                                \
3179   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3180   vmov.u16 d128_0x8000, #0x8000;                                               \
3181                                                                                \
3182   vmovn.u16 texels_r, texels;                                                  \
3183   vshrn.u16 texels_g, texels, #5;                                              \
3184                                                                                \
3185   vshrn.u16 texels_b, texels, #7;                                              \
3186   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3187                                                                                \
3188   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3189   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3190                                                                                \
3191   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3192   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3193                                                                                \
3194   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3195   vshr.u8 texels_b, texels_b, #3;                                              \
3196                                                                                \
3197   shade_blocks_textured_modulate_##dithering(r);                               \
3198   shade_blocks_textured_modulate_##dithering(g);                               \
3199   shade_blocks_textured_modulate_##dithering(b);                               \
3200                                                                                \
3201   vand.u16 pixels, texels, d128_0x8000;                                        \
3202   vceq.u16 zero_mask, texels, #0;                                              \
3203                                                                                \
3204   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3205   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3206   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3207                                                                                \
3208   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3209   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3210   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3211   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3212                                                                                \
3213   subs num_blocks, num_blocks, #1;                                             \
3214   beq 1f;                                                                      \
3215                                                                                \
3216  .align 3;                                                                     \
3217                                                                                \
3218  0:                                                                            \
3219   vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32;                       \
3220   shade_blocks_textured_modulated_load_rg_##shading();                         \
3221   vshrn.u16 texels_g, texels, #5;                                              \
3222                                                                                \
3223   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3224   vshrn.u16 texels_b, texels, #7;                                              \
3225                                                                                \
3226   pld [ block_ptr_load_a ];                                                    \
3227   vmovn.u16 texels_r, texels;                                                  \
3228   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3229                                                                                \
3230   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3231   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3232   shade_blocks_textured_modulated_store_draw_mask_##target(-4);                \
3233                                                                                \
3234   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3235   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3236                                                                                \
3237   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3238   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3239                                                                                \
3240   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3241   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3242                                                                                \
3243   shade_blocks_textured_modulated_store_pixels_##target();                     \
3244   vshr.u8 texels_b, texels_b, #3;                                              \
3245                                                                                \
3246   shade_blocks_textured_modulate_##dithering(r);                               \
3247   shade_blocks_textured_modulate_##dithering(g);                               \
3248   shade_blocks_textured_modulate_##dithering(b);                               \
3249                                                                                \
3250   vand.u16 pixels, texels, d128_0x8000;                                        \
3251   vceq.u16 zero_mask, texels, #0;                                              \
3252                                                                                \
3253   subs num_blocks, num_blocks, #1;                                             \
3254                                                                                \
3255   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3256   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3257   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3258                                                                                \
3259   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3260   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3261   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3262   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3263                                                                                \
3264   bne 0b;                                                                      \
3265                                                                                \
3266  1:                                                                            \
3267   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3268   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3269   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3270                                                                                \
3271   shade_blocks_textured_modulated_store_draw_mask_##target(28);                \
3272   shade_blocks_textured_modulated_store_pixels_##target();                     \
3273                                                                                \
3274   ldmia sp!, { r4 - r5, pc }                                                   \
3275
3276
/*
 * Instantiate all eight shading x dithering x target combinations:
 * first the four that use the "direct" helpers, then the four that use
 * the "indirect" helpers.
 */
3277 shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3278 shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3279 shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3280 shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3281
3282 shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3283 shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3284 shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3285 shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3286
3287
/* Drop aliases from earlier sections that are redefined below. */
3288 #undef c_64
3289 #undef fb_ptr
3290 #undef color_ptr
3291
3292 #undef color_r
3293 #undef color_g
3294 #undef color_b
3295
3296 #undef test_mask
3297 #undef pixels
3298 #undef draw_mask
3299 #undef zero_mask
3300 #undef fb_pixels
3301 #undef msb_mask
3302 #undef msb_mask_low
3303 #undef msb_mask_high
3304
/* Register aliases for the shade_blocks_textured_unmodulated_* routines. */
3305 #define psx_gpu                                  r0
3306 #define num_blocks                               r1
3307 #define mask_msb_ptr                             r2
3308 #define color_ptr                                r3
3309
/* block_ptr_load takes over r0 once psx_gpu is no longer needed. */
3310 #define block_ptr_load                           r0
3311 #define draw_mask_store_ptr                      r3
3312 #define draw_mask_bits_ptr                       r12
3313 #define draw_mask_ptr                            r12
/* r14 is the link register, so users must save the return address first. */
3314 #define pixel_store_ptr                          r14
3315
3316 #define fb_ptr_cmp                               r4
3317
3318 #define fb_ptr                                   r3
3319 #define fb_ptr_next                              r14
3320
/* c_64 shares r2 with mask_msb_ptr. */
3321 #define c_64                                     r2
3322
3323 #define test_mask                                q0
3324 #define pixels                                   q1
3325 #define draw_mask                                q2
3326 #define zero_mask                                q3
3327 #define draw_mask_combined                       q4
3328 #define fb_pixels                                q5
3329 #define fb_pixels_next                           q6
3330 #define msb_mask                                 q7
3331
/* D-register halves of draw_mask (q2) and msb_mask (q7). */
3332 #define draw_mask_low                            d4
3333 #define draw_mask_high                           d5
3334 #define msb_mask_low                             d14
3335 #define msb_mask_high                            d15
3336
3337 .align 3
/*
 * shade_blocks_textured_unmodulated_indirect(psx_gpu)
 *
 * In:  r0 = psx_gpu
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - load the eight 16-bit pixels at block + 0,
 *   - broadcast the 16-bit draw mask bits at block + 40 and test them
 *     against test_mask (loaded from the start of psx_gpu),
 *   - OR in the lanes whose pixel is zero,
 *   - store the pixels unchanged at block + 16 and the combined mask at
 *     block + 0.
 *
 * The loop is pipelined by one block: the mask for block i is stored after
 * the inputs of block i + 1 have been loaded.
 *
 * This routine is also reached by a branch from the unshaded/undithered
 * modulated shader before that shader has pushed anything, so it behaves
 * like a normal function entry.
 *
 * r4 and lr are preserved.  r0 - r3, r12, q0 - q4 and the flags are
 * clobbered.  A zero num_blocks is not handled: the count is decremented
 * before it is tested.
 */
function(shade_blocks_textured_unmodulated_indirect)
  /*
   * lr is reused as pixel_store_ptr, so the return address has to live on
   * the stack.  It is pushed with writeback: data parked below sp can be
   * overwritten by a signal or interrupt frame.  r4 is pushed alongside to
   * keep sp 8-byte aligned and to match the _direct variant.
   */
  stmdb sp!, { r4, r14 }
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
  add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)

  vld1.u32 { test_mask }, [ psx_gpu, :128 ]
  add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset

  mov c_64, #64
  /* r0 stops being psx_gpu from here on */
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  /* first block: load pixels and mask bits, start its mask computation */
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vld1.u16 { draw_mask_low[], draw_mask_high[] }, [ draw_mask_bits_ptr, :16 ], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64

  subs num_blocks, num_blocks, #1
  beq 1f

 0:
  /* finish the previous block's mask while loading the next block */
  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
  vorr.u16 draw_mask_combined, draw_mask, zero_mask

  vld1.u16 { draw_mask_low[], draw_mask_high[] }, [ draw_mask_bits_ptr, :16 ], c_64
  vceq.u16 zero_mask, pixels, #0

  vtst.u16 draw_mask, draw_mask, test_mask
  vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64

  vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
  subs num_blocks, num_blocks, #1

  bne 0b

 1:
  /* mask of the last block */
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64

  ldmia sp!, { r4, pc }
3383
3384
3385 .align 3
3386
/*
 * shade_blocks_textured_unmodulated_direct(psx_gpu)
 *
 * Writes num_blocks blocks of eight 16-bit texels straight into the
 * framebuffer.  Block entries are 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset; each supplies the texels at offset 0,
 * a 16-bit mask word at offset 40 and the destination pointer at offset 44.
 *
 * A lane keeps its framebuffer contents when its mask bit (the mask word
 * tested against test_mask, the 128-bit value at the start of psx_gpu) is
 * set or when its texel is zero.  Every other lane receives
 * texel | msb_mask, msb_mask being the halfword at psx_gpu_mask_msb_offset
 * replicated to all lanes.
 *
 * The loop is software pipelined: entry N+1 is fetched while entry N is
 * merged and stored.  Normally the next framebuffer row is read before the
 * current one is written.  When the two destinations lie within 14 bytes of
 * each other (label 4) the 16-byte accesses overlap, so the store has to
 * happen first for the following load to observe it.
 *
 * The last block is merged at label 1, outside the loop body, so msb_mask
 * has to be applied there as well - the loop only ORs it into the block it
 * is about to store.
 *
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS but are
 * clobbered here without being saved - confirm callers allow this.
 */
3387 function(shade_blocks_textured_unmodulated_direct)
3388   stmdb sp!, { r4, r14 }
3389   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3390
3391   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3392   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3393
3394   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
3395   mov c_64, #64
3396
3397   vld1.u32 { test_mask }, [ psx_gpu, :128 ]
3398   add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3399
/* Prime the pipeline with the first entry and its framebuffer row. */
3400   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3401    [ draw_mask_bits_ptr, :16 ], c_64
3402   ldr fb_ptr_next, [ block_ptr_load, #44 ]
3403
3404   vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3405   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3406   vceq.u16 zero_mask, pixels, #0
3407   vtst.u16 draw_mask, draw_mask, test_mask
3408
3409   subs num_blocks, num_blocks, #1
3410   beq 1f
3411
3412  0:
3413   mov fb_ptr, fb_ptr_next
3414   ldr fb_ptr_next, [ block_ptr_load, #44 ]
3415
3416   vorr.u16 pixels, pixels, msb_mask
3417
3418   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3419   vmov fb_pixels, fb_pixels_next
3420
3421   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3422    [ draw_mask_bits_ptr, :16 ], c_64
/* Insert the texels wherever the combined mask is clear. */
3423   vbif.u16 fb_pixels, pixels, draw_mask_combined
3424
3425   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3426   pld [ fb_ptr_next, #64 ]
3427
/* (next - current) + 14 <= 28 unsigned  <=>  -14 <= next - current <= 14 */
3428   add fb_ptr_cmp, fb_ptr_cmp, #14
3429   vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3430
3431   cmp fb_ptr_cmp, #28
3432   bls 4f
3433
3434   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3435   vceq.u16 zero_mask, pixels, #0
3436
3437   vst1.u16 { fb_pixels }, [ fb_ptr ]
3438   vtst.u16 draw_mask, draw_mask, test_mask
3439
3440  3:
3441   subs num_blocks, num_blocks, #1
3442   bne 0b
3443
/* Drain: the last block has not been through the top of the loop, so it
   still needs msb_mask before it is merged. */
3444  1:
3445   vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vorr.u16 pixels, pixels, msb_mask
3446   vbif.u16 fb_pixels_next, pixels, draw_mask_combined
3447
3448   vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3449
3450   ldmia sp!, { r4, pc }
3451
/* Overlapping destinations: store before loading the next row. */
3452  4:
3453   vst1.u16 { fb_pixels }, [ fb_ptr ]
3454   vceq.u16 zero_mask, pixels, #0
3455
3456   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3457   vtst.u16 draw_mask, draw_mask, test_mask
3458
3459   bal 3b
3460
3461
/* No-op entry point: returns to the caller immediately. */
3462 function(shade_blocks_unshaded_untextured_indirect)
3463   bx lr
3464
3465 .align 3
3466
/*
 * shade_blocks_unshaded_untextured_direct(psx_gpu)
 *
 * Fills num_blocks framebuffer blocks with one flat colour.  The colour
 * vector (eight 16-bit lanes) is read once, from offset 16 of the first
 * block entry, and ORed with msb_mask (the halfword at
 * psx_gpu_mask_msb_offset replicated to all lanes).  Each 64-byte block
 * entry supplies a 128-bit draw mask at offset 0 and a framebuffer pointer
 * at offset 44; bits set in the draw mask keep the existing framebuffer
 * contents, clear bits take the colour.
 *
 * The loop is software pipelined: the row for entry N+1 is read before the
 * row for entry N is written, unless the two destinations lie within
 * 14 bytes of each other (label 4), in which case the store goes first.
 *
 * r4 and r14 are saved on the stack; r14 is reused as fb_ptr_next.
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS but are
 * clobbered here without being saved - confirm callers allow this.
 */
3467 function(shade_blocks_unshaded_untextured_direct)
3468   stmdb sp!, { r4, r14 }
3469   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
3470
3471   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3472   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3473
3474   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
3475   add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3476
3477   add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
3478   vld1.u16 { pixels }, [ color_ptr, :128 ]
3479
3480   mov c_64, #64
3481   vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
3482
3483   vorr.u16 pixels, pixels, msb_mask
/* The flags set here are consumed by the beq below; the loads in between
   do not modify them. */
3484   subs num_blocks, num_blocks, #1
3485
3486   ldr fb_ptr_next, [ block_ptr_load ], #64
3487
3488   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3489   beq 1f
3490
3491  0:
3492   vmov fb_pixels, fb_pixels_next
3493   mov fb_ptr, fb_ptr_next
3494   ldr fb_ptr_next, [ block_ptr_load ], #64
3495
/* Insert the colour wherever the draw mask is clear, then fetch the mask
   of the next entry. */
3496   vbif.u16 fb_pixels, pixels, draw_mask
3497   vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
3498
/* (next - current) + 14 <= 28 unsigned  <=>  -14 <= next - current <= 14 */
3499   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3500   add fb_ptr_cmp, fb_ptr_cmp, #14
3501   cmp fb_ptr_cmp, #28
3502   bls 4f
3503
3504   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3505   vst1.u16 { fb_pixels }, [ fb_ptr ]
3506
3507  3:
3508   subs num_blocks, num_blocks, #1
3509   bne 0b
3510
/* Drain: merge and store the last row. */
3511  1:
3512   vbif.u16 fb_pixels_next, pixels, draw_mask
3513   vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3514
3515   ldmia sp!, { r4, pc }
3516
/* Overlapping destinations: store before loading the next row. */
3517  4:
3518   vst1.u16 { fb_pixels }, [ fb_ptr ]
3519   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3520   bal 3b
3521
3522
/* Drop the core-register names that are assigned differently below. */
3523 #undef draw_mask_ptr
3524 #undef c_64
3525 #undef fb_ptr
3526 #undef fb_ptr_next
3527 #undef fb_ptr_cmp
3528
/* Core-register names for the blend_blocks_* routines.  draw_mask_ptr shares
   r0 with psx_gpu and c_64 shares r2 with msb_mask_ptr, so the two names of
   each pair cannot be live at the same time.
   NOTE(review): the builders visible below address the mask through
   mask_msb_ptr, not msb_mask_ptr - confirm msb_mask_ptr is still needed. */
3529 #define psx_gpu                                  r0
3530 #define num_blocks                               r1
3531 #define msb_mask_ptr                             r2
3532 #define pixel_ptr                                r3
3533 #define draw_mask_ptr                            r0
3534 #define c_64                                     r2
3535 #define fb_ptr                                   r12
3536 #define fb_ptr_next                              r14
3537 #define fb_ptr_cmp                               r4
3538
/* Clear the NEON names that are about to be redefined (an #undef of a name
   that is not currently defined is harmless). */
3539 #undef msb_mask
3540 #undef draw_mask
3541 #undef pixels
3542 #undef fb_pixels
3543 #undef d128_0x8000
3544 #undef msb_mask_low
3545 #undef msb_mask_high
3546 #undef draw_mask_next
3547 #undef pixels_g
3548 #undef blend_pixels
3549 #undef fb_pixels_next
3550
/* NEON names, first set.  The d128_* registers are named after the 16-bit
   constant they are expected to hold in every lane. */
3551 #define msb_mask                                 q0
3552 #define draw_mask                                q1
3553 #define pixels                                   q2
3554 #define fb_pixels                                q3
3555 #define blend_pixels                             q4
3556 #define pixels_no_msb                            q5
3557 #define blend_mask                               q6
3558 #define fb_pixels_no_msb                         q7
3559 #define d128_0x8000                              q8
3560 #define d128_0x0421                              q9
3561 #define fb_pixels_next                           q10
3562 #define blend_pixels_next                        q11
3563 #define pixels_next                              q12
3564 #define draw_mask_next                           q13
3565 #define write_mask                               q14
3566
/* Further names overlaid on the same registers (for example pixels_rb and
   pixels_no_msb are both q5, fb_pixels_rb and fb_pixels_next are both q10);
   a routine may only combine names whose registers do not collide. */
3567 #define pixels_rb                                q5
3568 #define pixels_mg                                q7
3569 #define pixels_g                                 q7
3570 #define d128_0x7C1F                              q8
3571 #define d128_0x03E0                              q9
3572 #define fb_pixels_rb                             q10
3573 #define fb_pixels_g                              q11
3574 #define fb_pixels_masked                         q11
3575 #define d128_0x83E0                              q15
3576 #define pixels_fourth                            q7
3577 #define d128_0x1C07                              q12
3578 #define d128_0x00E0                              q13
3579 #define d128_0x80E0                              q13
3580
/* D-register halves of msb_mask (q0), for the all-lanes form of vld1. */
3581 #define msb_mask_low                             d0
3582 #define msb_mask_high                            d1
3583
/* Textured: blend_mask = all ones in lanes where source has bit 15 set
   (signed compare against zero). */
3584 #define blend_blocks_average_set_blend_mask_textured(source)                   \
3585   vclt.s16 blend_mask, source, #0                                              \
3586
/* Textured: force bit 15 on in every lane of the blended result. */
3587 #define blend_blocks_average_set_stp_bit_textured()                            \
3588   vorr.u16 blend_pixels, #0x8000                                               \
3589
/* Textured: where blend_mask is clear, replace the blended result with the
   unblended source. */
3590 #define blend_blocks_average_combine_textured(source)                          \
3591   vbif.u16 blend_pixels, source, blend_mask                                    \
3592   
/* Untextured: the three steps above expand to nothing, so every lane keeps
   the blended value. */
3593 #define blend_blocks_average_set_blend_mask_untextured(source)                 \
3594
3595 #define blend_blocks_average_set_stp_bit_untextured()                          \
3596
3597 #define blend_blocks_average_combine_untextured(source)                        \
3598
/* Mask evaluation on: write_mask = all ones in lanes where the framebuffer
   row in fb_pixels_next has bit 15 set. */
3599 #define blend_blocks_average_mask_set_on()                                     \
3600   vclt.s16 write_mask, fb_pixels_next, #0                                      \
3601
/* Mask evaluation on: the draw mask used for the block being stored is the
   block's own mask ORed with write_mask. */
3602 #define blend_blocks_average_mask_copy_on()                                    \
3603   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
3604
/* Mask evaluation on, final block: fold write_mask into draw_mask_next. */
3605 #define blend_blocks_average_mask_copy_b_on()                                  \
3606   vorr.u16 draw_mask_next, draw_mask_next, write_mask                          \
3607
/* Mask evaluation off: no write_mask is computed and the draw mask is used
   as loaded. */
3608 #define blend_blocks_average_mask_set_off()                                    \
3609
3610 #define blend_blocks_average_mask_copy_off()                                   \
3611   vmov draw_mask, draw_mask_next                                               \
3612
3613 #define blend_blocks_average_mask_copy_b_off()                                 \
3614
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_TEXTURING_average_MASK(psx_gpu), which averages
 * num_blocks blocks of eight 16-bit pixels with the framebuffer.  Block
 * entries are 64 bytes apart, starting at psx_gpu + psx_gpu_blocks_offset:
 * draw mask at offset 0, pixels at offset 16, framebuffer pointer at
 * offset 44 (pixel_ptr + 28).
 *
 * Per lane, with bit 15 cleared on both inputs, the blend is
 *   vhadd(fb, pixel - ((pixel ^ fb) & 0x0421))
 * 0x0421 selects the low bit of each of the three 5-bit fields; removing
 * the differing low bits makes every per-field sum even, so the halving add
 * yields each field's average without a bit leaking into its neighbour.
 *
 * texturing = textured:   only lanes whose pixel has bit 15 set are blended
 *                         (and get bit 15 set); the others take the pixel
 *                         unchanged.
 * texturing = untextured: every lane is blended.
 * mask_evaluate = on:     lanes whose framebuffer pixel has bit 15 set are
 *                         added to the draw mask and so are not written.
 * msb_mask is ORed into every result before it is merged; lanes set in the
 * draw mask keep the framebuffer contents.
 *
 * The loop is software pipelined (block N+1 is fetched and pre-processed
 * while block N is merged and stored).  When two consecutive destinations
 * lie within 14 bytes of each other the store must precede the next load
 * (label 2).  That path has to refresh write_mask from the freshly loaded
 * row exactly like the normal path does; otherwise the next block would be
 * masked with the previous row's bit-15 pattern.
 *
 * Saves r4 and r14 on the stack.
 */
3615 #define blend_blocks_average_builder(texturing, mask_evaluate)                 \
3616 .align 3;                                                                      \
3617                                                                                \
3618 function(blend_blocks_##texturing##_average_##mask_evaluate)                   \
3619   stmdb sp!, { r4, r14 };                                                      \
3620   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3621   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
3622                                                                                \
3623   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3624   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
3625                                                                                \
3626   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3627   mov c_64, #64;                                                               \
3628                                                                                \
3629   vmov.u16 d128_0x8000, #0x8000;                                               \
3630   vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64;                  \
3631   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3632                                                                                \
3633   vmov.u16 d128_0x0421, #0x0400;                                               \
3634   vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64;                         \
3635                                                                                \
3636   vorr.u16 d128_0x0421, #0x0021;                                               \
3637   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ];                                \
3638                                                                                \
3639   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3640   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3641   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3642   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3643   blend_blocks_average_mask_set_##mask_evaluate();                             \
3644   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3645                                                                                \
3646   subs num_blocks, num_blocks, #1;                                             \
3647   beq 1f;                                                                      \
3648                                                                                \
3649  0:                                                                            \
3650   mov fb_ptr, fb_ptr_next;                                                     \
3651   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3652                                                                                \
3653   vmov pixels, pixels_next;                                                    \
3654   vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64;                         \
3655                                                                                \
3656   vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
3657                                                                                \
3658   blend_blocks_average_mask_copy_##mask_evaluate();                            \
3659   vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64;                  \
3660                                                                                \
3661   blend_blocks_average_set_blend_mask_##texturing(pixels);                     \
3662   blend_blocks_average_set_stp_bit_##texturing();                              \
3663   vmov fb_pixels, fb_pixels_next;                                              \
3664   blend_blocks_average_combine_##texturing(pixels);                            \
3665                                                                                \
  /* (next - current) + 14 <= 28 unsigned: destinations overlap */             \
3666   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3667   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3668   cmp fb_ptr_cmp, #28;                                                         \
3669   bls 2f;                                                                      \
3670                                                                                \
3671   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ];                                \
3672   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3673                                                                                \
3674   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3675   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3676                                                                                \
3677   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3678   vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
3679                                                                                \
3680   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3681   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3682   blend_blocks_average_mask_set_##mask_evaluate();                             \
3683   vst1.u16 { fb_pixels }, [ fb_ptr ];                                          \
3684                                                                                \
3685  3:                                                                            \
3686   subs num_blocks, num_blocks, #1;                                             \
3687   bne 0b;                                                                      \
3688                                                                                \
  /* Drain: blend, merge and store the last block. */                          \
3689  1:                                                                            \
3690   blend_blocks_average_mask_copy_b_##mask_evaluate();                          \
3691   vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
3692                                                                                \
3693   blend_blocks_average_set_blend_mask_##texturing(pixels_next);                \
3694   blend_blocks_average_set_stp_bit_##texturing();                              \
3695   blend_blocks_average_combine_##texturing(pixels_next);                       \
3696                                                                                \
3697   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3698   vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next;                       \
3699   vst1.u16 { fb_pixels_next }, [ fb_ptr_next ];                                \
3700                                                                                \
3701   ldmia sp!, { r4, pc };                                                       \
3702                                                                                \
  /* Overlapping destinations: store first, then fetch the next row. */        \
3703  2:                                                                            \
3704   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3705   vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
3706   vst1.u16 { fb_pixels }, [ fb_ptr ];                                          \
3707                                                                                \
3708   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ];                                \
  /* write_mask has to describe the row that was just fetched. */              \
  blend_blocks_average_mask_set_##mask_evaluate();                             \
3709   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3710   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3711   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3712   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3713   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3714                                                                                \
3715   bal 3b                                                                       \
3716
/* Instantiate the four variants:
   blend_blocks_{textured,untextured}_average_{off,on}. */
3717 blend_blocks_average_builder(textured, off)
3718 blend_blocks_average_builder(untextured, off)
3719 blend_blocks_average_builder(textured, on)
3720 blend_blocks_average_builder(untextured, on)
3721
3722
/* Mask evaluation on: write_mask = all ones in lanes where the framebuffer
   row in fb_pixels has bit 15 set (signed compare against zero). */
3723 #define blend_blocks_add_mask_set_on()                                         \
3724   vclt.s16 write_mask, fb_pixels, #0                                           \
3725
/* Mask evaluation on: fold write_mask into the block's draw mask. */
3726 #define blend_blocks_add_mask_copy_on()                                        \
3727   vorr.u16 draw_mask, draw_mask, write_mask                                    \
3728
/* Mask evaluation off: both steps expand to nothing. */
3729 #define blend_blocks_add_mask_set_off()                                        \
3730
3731 #define blend_blocks_add_mask_copy_off()                                       \
3733
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_MASK(psx_gpu), which adds num_blocks
 * blocks of eight 16-bit texels to the framebuffer with per-field
 * saturation.  Block entries are 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset: draw mask at offset 0, texels at
 * offset 16, framebuffer pointer at offset 44 (pixel_ptr + 28).
 *
 * The fields selected by 0x7C1F and the field selected by 0x03E0 are summed
 * in separate registers.  The 0x7C1F sum is clamped per byte (vmin.u8
 * against 0x7C / 0x1F); the 0x03E0 sum carries bit 15 along and is clamped
 * against 0x83E0.  The framebuffer operand is first ANDed with blend_mask
 * (all ones where the texel has bit 15 set), so texels with bit 15 clear
 * are stored unchanged.  msb_mask is ORed into the texels before the
 * 0x83E0 part is extracted.  Lanes set in the draw mask keep the
 * framebuffer contents (vbit); with mask_evaluate = on that mask also
 * includes lanes whose framebuffer pixel has bit 15 set.
 *
 * The loop is software pipelined; when two consecutive destinations lie
 * within 14 bytes of each other the store precedes the next load (label 2).
 * The pointer difference for that test is computed once per iteration.
 *
 * Saves r4 and r14 on the stack.
 */
3734 #define blend_blocks_add_textured_builder(mask_evaluate)                       \
3735 .align 3;                                                                      \
3736                                                                                \
3737 function(blend_blocks_textured_add_##mask_evaluate)                            \
3738   stmdb sp!, { r4, r14 };                                                      \
3739   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3740   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
3741                                                                                \
3742   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3743   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
3744                                                                                \
3745   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3746   mov c_64, #64;                                                               \
3747                                                                                \
  /* Build the 16-bit constants in two immediate steps each. */                \
3748   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3749   vmov.u16 d128_0x03E0, #0x0300;                                               \
3750   vmov.u16 d128_0x83E0, #0x8000;                                               \
3751   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3752   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3753   vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0;                              \
3754                                                                                \
3755   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
3756   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3757   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
3758   vclt.s16 blend_mask, pixels, #0;                                             \
3759   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3760   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3761   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3762                                                                                \
3763   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3764   vorr.u16 pixels, pixels, msb_mask;                                           \
3765   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3766   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
3767   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3768   vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
3769   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3770   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
3771   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3772   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
3773                                                                                \
3774   subs num_blocks, num_blocks, #1;                                             \
3775   beq 1f;                                                                      \
3776                                                                                \
3777  0:                                                                            \
3778   mov fb_ptr, fb_ptr_next;                                                     \
3779                                                                                \
3780   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3781                                                                                \
3782   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
3783   vclt.s16 blend_mask, pixels, #0;                                             \
3784                                                                                \
3785   vorr.u16 pixels, pixels, msb_mask;                                           \
3786   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3787   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
3788                                                                                \
3789   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3790   pld [ fb_ptr_next, #64 ];                                                    \
3791                                                                                \
3793   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3794                                                                                \
  /* (next - current) + 14 <= 28 unsigned: destinations overlap */             \
3795   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3796   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
3797                                                                                \
3798   cmp fb_ptr_cmp, #28;                                                         \
3799   bls 2f;                                                                      \
3800                                                                                \
3801   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3802   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3803   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3804   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3805   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3806   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3807   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
3808                                                                                \
3809  3:                                                                            \
3810   vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
3811   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3812   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
3813   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3814   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
3815                                                                                \
3816   subs num_blocks, num_blocks, #1;                                             \
3817   bne 0b;                                                                      \
3818                                                                                \
  /* Drain: combine, merge and store the last block. */                        \
3819  1:                                                                            \
3820   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3821   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3822   vst1.u16 { blend_pixels }, [ fb_ptr_next ];                                  \
3823                                                                                \
3824   ldmia sp!, { r4, pc };                                                       \
3825                                                                                \
  /* Overlapping destinations: store first, then fetch the next row. */        \
3826  2:                                                                            \
3827   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
3828   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3829                                                                                \
3830   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3831   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3832   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3833   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3834   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3835   bal 3b                                                                       \
3836
3837
3838 #define blend_blocks_add_untextured_builder(mask_evaluate)                     \
3839 .align 3;                                                                      \
3840                                                                                \
3841 function(blend_blocks_untextured_add_##mask_evaluate)                          \
3842   stmdb sp!, { r4, r14 };                                                      \
3843   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3844   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
3845                                                                                \
3846   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3847   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
3848                                                                                \
3849   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3850   mov c_64, #64;                                                               \
3851                                                                                \
3852   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3853   vmov.u16 d128_0x03E0, #0x0300;                                               \
3854   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3855   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3856                                                                                \
3857   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
3858   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3859   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
3860   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3861   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3862   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3863                                                                                \
3864   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3865   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3866   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3867   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3868   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3869   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3870   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3871   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3872                                                                                \
3873   subs num_blocks, num_blocks, #1;                                             \
3874   beq 1f;                                                                      \
3875                                                                                \
3876  0:                                                                            \
3877   mov fb_ptr, fb_ptr_next;                                                     \
3878                                                                                \
3879   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3880                                                                                \
3881   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
3882                                                                                \
3883   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3884   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3885   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3886                                                                                \
3887   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3888   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
3889                                                                                \
3890   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3891   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3892   cmp fb_ptr_cmp, #28;                                                         \
3893   bls 2f;                                                                      \
3894                                                                                \
3895   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3896   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3897   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3898   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3899   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3900   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
3901                                                                                \
3902  3:                                                                            \
3903   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3904   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3905   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3906   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3907   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3908                                                                                \
3909   subs num_blocks, num_blocks, #1;                                             \
3910   bne 0b;                                                                      \
3911                                                                                \
3912  1:                                                                            \
3913   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3914   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3915   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3916   vst1.u16 { blend_pixels }, [ fb_ptr_next ];                                  \
3917                                                                                \
3918   ldmia sp!, { r4, pc };                                                       \
3919                                                                                \
3920  2:                                                                            \
3921   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
3922   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3923                                                                                \
3924   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3925   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3926   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3927   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3928   bal 3b                                                                       \
3929
3930
/* Instantiate the textured and untextured add blend builders, once with the
 * mask_evaluate argument off and once with it on. */
3931 blend_blocks_add_textured_builder(off)
3932 blend_blocks_add_textured_builder(on)
3933 blend_blocks_add_untextured_builder(off)
3934 blend_blocks_add_untextured_builder(on)
3935
/*
 * Fragments pasted into blend_blocks_subtract_builder through the
 * ##texturing and ##mask_evaluate suffixes.
 */

/* blend_mask lane = all ones where the 16-bit source pixel is negative as a
 * signed value, i.e. has bit 15 set. */
3936 #define blend_blocks_subtract_set_blend_mask_textured()                        \
3937   vclt.s16 blend_mask, pixels_next, #0                                         \
3938
/* Where blend_mask is clear, replace the blended result with pixels. */
3939 #define blend_blocks_subtract_combine_textured()                               \
3940   vbif.u16 blend_pixels, pixels, blend_mask                                    \
3941
/* Set bit 15 in every lane of the blended result. */
3942 #define blend_blocks_subtract_set_stb_textured()                               \
3943   vorr.u16 blend_pixels, #0x8000                                               \
3944
/* pixels = pixels_next with msb_mask ORed in; this is the value the combine
 * step above inserts into lanes whose blend_mask is clear. */
3945 #define blend_blocks_subtract_msb_mask_textured()                              \
3946   vorr.u16 pixels, pixels_next, msb_mask                                       \
3947
/* Untextured variants: no blend mask, no combine step and no separate copy of
 * the source pixels; only msb_mask is ORed into the blended result. */
3948 #define blend_blocks_subtract_set_blend_mask_untextured()                      \
3949
3950 #define blend_blocks_subtract_combine_untextured()                             \
3951
3952 #define blend_blocks_subtract_set_stb_untextured()                             \
3953   vorr.u16 blend_pixels, blend_pixels, msb_mask                                \
3954
3955 #define blend_blocks_subtract_msb_mask_untextured()                            \
3956
3957
/* mask_evaluate = on: write_mask lane = all ones where the framebuffer pixel
 * has bit 15 set. It is ORed into the draw mask of the block, so the vbit in
 * the builder keeps the framebuffer value in those lanes. */
3958 #define blend_blocks_subtract_mask_set_on()                                    \
3959   vclt.s16 write_mask, fb_pixels, #0                                           \
3960
3961 #define blend_blocks_subtract_mask_copy_on()                                   \
3962   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
3963
/* mask_evaluate = off: nothing is derived from the framebuffer; the draw mask
 * of the block is used unchanged. */
3964 #define blend_blocks_subtract_mask_set_off()                                   \
3965
3966 #define blend_blocks_subtract_mask_copy_off()                                  \
3967   vmov draw_mask, draw_mask_next                                               \
3968
3969
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_subtract_<mask_evaluate>, which for every
 * block subtracts the source pixels of the block from the framebuffer pixels
 * the block points at and writes the result back.
 *
 * In:    psx_gpu - base pointer. The function reads the halfword block count
 *        at psx_gpu_num_blocks_offset, the halfword at psx_gpu_mask_msb_offset
 *        (replicated into msb_mask) and the blocks at psx_gpu_blocks_offset.
 * Stack: pushes r4/lr on entry, returns by popping r4/pc.
 *
 * Blocks are walked with a 64 byte stride: draw mask at +0, source pixels at
 * +16, framebuffer pointer at +44 (read as [pixel_ptr, #28]).
 *
 * Pixels are split with the 0x7C1F and 0x03E0 masks. The 0x7C1F part is
 * subtracted with per-byte unsigned saturation, the 0x03E0 part with
 * per-halfword unsigned saturation, and both parts are ORed back together.
 * Framebuffer bits are kept wherever draw_mask is set (vbit).
 *
 * The loop is software pipelined: the subtraction for block N+1 is computed
 * in the same iteration that merges and stores block N.
 *
 * NOTE(review): the count is decremented before it is first tested, so
 * num_blocks == 0 is not handled here - presumably callers never invoke this
 * with an empty block list; confirm.
 */
3970 #define blend_blocks_subtract_builder(texturing, mask_evaluate)                \
3971 .align 3;                                                                      \
3972                                                                                \
3973 function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
3974   stmdb sp!, { r4, r14 };                                                      \
3975   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3976   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
3977                                                                                \
3978   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3979   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
3980                                                                                \
3981   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3982   mov c_64, #64;                                                               \
3983                                                                                \
       /* Build the 0x7C1F and 0x03E0 field masks in two steps each. */             \
3984   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3985   vmov.u16 d128_0x03E0, #0x0300;                                               \
3986   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3987   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3988                                                                                \
       /* Prime the pipeline: load the first block and subtract it. */              \
3989   vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64;                  \
3990   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3991   vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64;                         \
3992   blend_blocks_subtract_set_blend_mask_##texturing();                          \
3993   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3994   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
3995   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
3996                                                                                \
3997   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
3998   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3999   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4000   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4001   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4002                                                                                \
4003   subs num_blocks, num_blocks, #1;                                             \
4004   beq 1f;                                                                      \
4005                                                                                \
4006  0:                                                                            \
       /* Merge the previous block while the next one is being loaded. */           \
4007   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4008   mov fb_ptr, fb_ptr_next;                                                     \
4009   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4010                                                                                \
4011   vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64;                  \
4012   blend_blocks_subtract_msb_mask_##texturing();                                \
4013                                                                                \
4014   vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64;                         \
4015   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4016   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4017   blend_blocks_subtract_set_stb_##texturing();                                 \
4018   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4019   blend_blocks_subtract_combine_##texturing();                                 \
4020   blend_blocks_subtract_set_blend_mask_##texturing();                          \
4021   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4022                                                                                \
       /* Unsigned (next - current + 14) <= 28, i.e. the next framebuffer */        \
       /* pointer lies within 14 bytes of the current one: take path 2. */          \
4023   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4024   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4025   cmp fb_ptr_cmp, #28;                                                         \
4026   bls 2f;                                                                      \
4027                                                                                \
       /* Pointers far apart: the next load is issued ahead of the store. */        \
4028   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4029   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4030   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4031   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4032   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4033   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4034   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4035                                                                                \
4036  3:                                                                            \
4037   subs num_blocks, num_blocks, #1;                                             \
4038   bne 0b;                                                                      \
4039                                                                                \
4040  1:                                                                            \
       /* Merge and store the last block. */                                        \
4041   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4042                                                                                \
4043   blend_blocks_subtract_msb_mask_##texturing();                                \
4044   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4045   blend_blocks_subtract_set_stb_##texturing();                                 \
4046   blend_blocks_subtract_combine_##texturing();                                 \
4047   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4048   vst1.u16 { blend_pixels }, [ fb_ptr_next ];                                  \
4049                                                                                \
4050   ldmia sp!, { r4, pc };                                                       \
4051                                                                                \
4052  2:                                                                            \
       /* Pointers close together: store first, then reload the framebuffer. */     \
4053   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4054   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4055   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4056   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4057   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4058   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4059   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4060   bal 3b                                                                       \
4061
4062
/* Emit the four subtract blend functions: textured / untextured, each with
 * mask_evaluate off and on. */
4063 blend_blocks_subtract_builder(textured, off)
4064 blend_blocks_subtract_builder(textured, on)
4065 blend_blocks_subtract_builder(untextured, off)
4066 blend_blocks_subtract_builder(untextured, on)
4067
4068
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_fourth_<mask_evaluate>, which for every
 * block adds one quarter of each source colour field to the framebuffer
 * pixel, clamps the sum and writes it back.
 *
 * In:    psx_gpu - base pointer. The function reads the halfword block count
 *        at psx_gpu_num_blocks_offset, the halfword at psx_gpu_mask_msb_offset
 *        (replicated into msb_mask) and the blocks at psx_gpu_blocks_offset.
 * Stack: pushes r4/lr on entry, returns by popping r4/pc.
 *
 * Blocks are walked with a 64 byte stride: draw mask at +0, source pixels at
 * +16, framebuffer pointer at +44 (read as [pixel_ptr, #28]).
 *
 * The source is shifted right by two and masked with 0x1C07 / 0x00E0, which
 * leaves each 5-bit field divided by four. Sums are clamped with vmin against
 * 0x7C1F (per byte) and 0x03E0 (per halfword).
 *
 * Per-lane result, identical for the loop body and the final block:
 *   - source bit 15 set   : blended value with bit 15 set
 *   - source bit 15 clear : the source pixel itself, unblended
 *   - msb_mask is then ORed into every lane
 *   - lanes whose draw_mask is set keep the framebuffer value
 * This is the same set-bit-15 / combine / msb_mask treatment that
 * blend_blocks_subtract_builder applies for the textured case.
 *
 * The blend_blocks_add_mask_set_ / _copy_ hooks are defined earlier in the
 * file and are selected by mask_evaluate.
 *
 * NOTE(review): the count is decremented before it is first tested, so
 * num_blocks == 0 is not handled here - presumably callers never invoke this
 * with an empty block list; confirm.
 */
4069 #define blend_blocks_add_fourth_textured_builder(mask_evaluate)                \
4070 .align 3;                                                                      \
4071                                                                                \
4072 function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
4073   stmdb sp!, { r4, r14 };                                                      \
4074   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4075   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
4076                                                                                \
4077   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4078   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
4079                                                                                \
4080   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4081   mov c_64, #64;                                                               \
4082                                                                                \
4083   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4084   vmov.u16 d128_0x03E0, #0x0300;                                               \
4085   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4086   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4087   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4088   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4089   vorr.u16 d128_0x1C07, #0x0007;                                               \
4090                                                                                \
       /* Prime the pipeline: load the first block and compute its sum. */          \
4091   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
4092   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4093   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
4094   vclt.s16 blend_mask, pixels, #0;                                             \
4095   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4096   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4097   vshr.s16 pixels_fourth, pixels, #2;                                          \
4098   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4099                                                                                \
4100   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4101   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4102   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4103   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4104   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4105   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4106   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4107   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4108                                                                                \
4109   subs num_blocks, num_blocks, #1;                                             \
4110   beq 1f;                                                                      \
4111                                                                                \
4112  0:                                                                            \
4113   mov fb_ptr, fb_ptr_next;                                                     \
4114   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4115                                                                                \
       /* Blended lanes get bit 15 set before the unblended lanes are */            \
       /* replaced by the raw source pixels (whose bit 15 is clear). */             \
4116   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
       vorr.u16 blend_pixels, #0x8000;                                              \
4117   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4118                                                                                \
4119   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
4120   vclt.s16 blend_mask, pixels, #0;                                             \
4121   vshr.s16 pixels_fourth, pixels, #2;                                          \
4122   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4123   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4124                                                                                \
4125   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4126   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
4127                                                                                \
       /* Unsigned (next - current + 14) <= 28, i.e. the next framebuffer */        \
       /* pointer lies within 14 bytes of the current one: take path 2. */          \
4128   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4129   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4130   cmp fb_ptr_cmp, #28;                                                         \
4131   bls 2f;                                                                      \
4132                                                                                \
4133   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4134   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4135   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4136   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4137   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4138   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4139                                                                                \
4140  3:                                                                            \
4141   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4142   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4143   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4144   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4145   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4146                                                                                \
4147   subs num_blocks, num_blocks, #1;                                             \
4148   bne 0b;                                                                      \
4149                                                                                \
4150  1:                                                                            \
       /* Last block: same order as the loop body, so that msb_mask is */           \
       /* ORed in after the unblended lanes have been inserted. */                  \
4151   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
       vorr.u16 blend_pixels, #0x8000;                                              \
4152   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4153   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4154   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4155   vst1.u16 { blend_pixels }, [ fb_ptr_next ];                                  \
4156                                                                                \
4157   ldmia sp!, { r4, pc };                                                       \
4158                                                                                \
4159  2:                                                                            \
       /* Pointers close together: store first, then reload the framebuffer. */     \
4160   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4161   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4162                                                                                \
4163   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4164   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4165   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4166   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4167   bal 3b                                                                       \
4168
4169
4170
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_fourth_<mask_evaluate>, which for every
 * block adds one quarter of each source colour field to the framebuffer
 * pixel, clamps the sum, ORs in msb_mask and writes the result back.
 *
 * In:    psx_gpu - base pointer. The function reads the halfword block count
 *        at psx_gpu_num_blocks_offset, the halfword at psx_gpu_mask_msb_offset
 *        (replicated into msb_mask) and the blocks at psx_gpu_blocks_offset.
 * Stack: pushes r4/lr on entry, returns by popping r4/pc.
 *
 * Blocks are walked with a 64 byte stride: draw mask at +0, source pixels at
 * +16, framebuffer pointer at +44 (read as [pixel_ptr, #28]).
 *
 * The source is shifted right by two and masked with 0x1C07 / 0x00E0, which
 * leaves each 5-bit field divided by four. Sums are clamped with vmin against
 * 0x7C1F (per byte) and 0x03E0 (per halfword). Lanes whose draw_mask is set
 * keep the framebuffer value (vbit).
 *
 * The blend_blocks_add_mask_set_ / _copy_ hooks are defined earlier in the
 * file and are selected by mask_evaluate.
 *
 * NOTE(review): the count is decremented before it is first tested, so
 * num_blocks == 0 is not handled here - presumably callers never invoke this
 * with an empty block list; confirm.
 */
4171 #define blend_blocks_add_fourth_untextured_builder(mask_evaluate)              \
4172 .align 3;                                                                      \
4173                                                                                \
4174 function(blend_blocks_untextured_add_fourth_##mask_evaluate)                   \
4175   stmdb sp!, { r4, r14 };                                                      \
4176   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4177   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
4178                                                                                \
4179   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4180   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
4181                                                                                \
4182   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4183   mov c_64, #64;                                                               \
4184                                                                                \
4185   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4186   vmov.u16 d128_0x03E0, #0x0300;                                               \
4187   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4188   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4189   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4190   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4191   vorr.u16 d128_0x1C07, #0x0007;                                               \
4192                                                                                \
       /* Prime the pipeline: load the first block and compute its sum. */          \
4193   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
4194   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4195   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
4196   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4197   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4198   vshr.s16 pixels_fourth, pixels, #2;                                          \
4199   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4200                                                                                \
4201   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4202   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4203   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4204   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4205   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4206   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4207   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4208   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4209                                                                                \
4210   subs num_blocks, num_blocks, #1;                                             \
4211   beq 1f;                                                                      \
4212                                                                                \
4213  0:                                                                            \
       /* Merge the previous block while the next one is being loaded. */           \
4214   mov fb_ptr, fb_ptr_next;                                                     \
4215   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4216                                                                                \
4217   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
4218                                                                                \
4219   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4220   vshr.s16 pixels_fourth, pixels, #2;                                          \
4221   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4222   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4223                                                                                \
4224   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4225   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
4226                                                                                \
       /* Unsigned (next - current + 14) <= 28, i.e. the next framebuffer */        \
       /* pointer lies within 14 bytes of the current one: take path 2. */          \
4227   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4228   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4229   cmp fb_ptr_cmp, #28;                                                         \
4230   bls 2f;                                                                      \
4231                                                                                \
4232   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4233   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4234   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4235   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4236   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4237   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4238                                                                                \
4239  3:                                                                            \
4240   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4241   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4242   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4243   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4244   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4245                                                                                \
4246   subs num_blocks, num_blocks, #1;                                             \
4247   bne 0b;                                                                      \
4248                                                                                \
4249  1:                                                                            \
       /* Merge and store the last block. */                                        \
4250   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4251   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4252   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4253   vst1.u16 { blend_pixels }, [ fb_ptr_next ];                                  \
4254                                                                                \
4255   ldmia sp!, { r4, pc };                                                       \
4256                                                                                \
4257  2:                                                                            \
       /* Pointers close together: store first, then reload the framebuffer. */     \
4258   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4259   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4260                                                                                \
4261   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4262   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4263   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4264   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4265   bal 3b                                                                       \
4266
4267
/* Emit the four add-fourth blend functions: textured / untextured, each with
 * mask_evaluate off and on. */
4268 blend_blocks_add_fourth_textured_builder(off)
4269 blend_blocks_add_fourth_textured_builder(on)
4270 blend_blocks_add_fourth_untextured_builder(off)
4271 blend_blocks_add_fourth_untextured_builder(on)
4272
4273 // TODO: Optimize this more. Need a scene that actually uses it for
4274 // confirmation.
4275
.align 3

/*
 * blend_blocks_textured_unblended_on
 *
 * In:    psx_gpu - base pointer. The function reads the halfword block count
 *        at psx_gpu_num_blocks_offset, the halfword at psx_gpu_mask_msb_offset
 *        (replicated into msb_mask) and the blocks at psx_gpu_blocks_offset.
 * Stack: pushes r4/lr on entry, returns by popping r4/pc.
 *
 * For every block (64 byte stride; draw mask at +0, source pixels at +16,
 * framebuffer pointer read from [pixel_ptr, #28]) the source pixels, with
 * msb_mask ORed in, are written to the framebuffer. Lanes are left untouched
 * where the draw mask of the block is set or where the framebuffer pixel
 * already has bit 15 set.
 *
 * NOTE(review): the count is only tested after it has been decremented, so
 * num_blocks == 0 is not handled - presumably callers never invoke this with
 * an empty block list; confirm.
 */
function(blend_blocks_textured_unblended_on)
  push { r4, lr }
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]

  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]

  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
  mov c_64, #64

 2:
  // Fetch one block: framebuffer pixels, draw mask and source pixels.
  ldr fb_ptr, [ pixel_ptr, #28 ]
  vld1.u16 { fb_pixels }, [ fb_ptr ]
  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
  vclt.s16 write_mask, fb_pixels, #0
  vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64

  // Insert the source pixels wherever the combined mask is clear and store.
  vorr.u16 pixels, pixels, msb_mask
  vorr.u16 draw_mask, draw_mask, write_mask
  vbif.u16 fb_pixels, pixels, draw_mask
  vst1.u16 { fb_pixels }, [ fb_ptr ]

  subs num_blocks, num_blocks, #1
  bne 2b

  pop { r4, pc }
4320
4321
/* blend_blocks_textured_unblended_off: does nothing and returns immediately. */
4322 function(blend_blocks_textured_unblended_off)
4323   bx lr
4324
4325
/*
 * warmup
 *
 * In:  r0 = number of loads to perform, r1 = start address.
 * Performs r0 vector loads from r1, advancing r1 by 64 bytes after each one.
 * The loaded data is overwritten by the next load and never used.
 * Returns immediately when r0 is zero.
 * Clobbers r0, r1, r3, the u_whole_8 / v_whole_8 registers and the flags.
 * NOTE(review): presumably used to pull memory into the cache ahead of use -
 * confirm against the callers.
 */
function(warmup)
  cmp r0, #0
  mov r3, #64                // stride; mov leaves the cmp flags intact
  bxeq lr

 1:
  vld1.u32 { u_whole_8, v_whole_8 }, [ r1, :128 ], r3
  subs r0, r0, #1
  bne 1b

  bx lr
4338
/* Register names used by render_block_fill_body. Any earlier meaning of a
 * name is dropped before it is redefined. */
4339 #undef vram_ptr
4340 #undef color
4341 #undef width
4342 #undef height
4343 #undef pitch
4344
4345 #define vram_ptr                                          r0
4346 #define color                                             r1
4347 #define width                                             r2
4348 #define height                                            r3
4349
/* pitch shares r1 with color, so color has to be consumed before pitch is
 * written. */
4350 #define pitch                                             r1
4351
4352 #define num_width                                         r12
4353
4354 #undef colors_a
4355 #undef colors_b
4356
4357 #define colors_a                                          q0
4358 #define colors_b                                          q1
4359
4360 .align 3
4361
// render_block_fill_body
//
// Fills a rectangle with a single 16-bit value.
//
// In:  vram_ptr (r0) - destination of the first row; stores use a :256
//                      alignment hint
//      color    (r1) - fill value; its low 16 bits are replicated to every
//                      lane
//      width    (r2) - row width in 16-bit pixels
//      height   (r3) - number of rows
//
// Each store writes 32 bytes (16 pixels) and the inner loop only ends when
// the remaining width reaches exactly zero, so width has to be a non-zero
// multiple of 16. height is decremented before it is tested, so it has to be
// at least 1.
// Rows are 2048 bytes apart: pitch = 2048 - width * 2 is the distance from
// the end of one row to the start of the next.
// Both loops branch back to the same label 0; num_width is reloaded before
// the outer branch is taken.
// Clobbers r0, r1, r3, r12, q0, q1 and the flags.
4362 function(render_block_fill_body)
4363   vdup.u16 colors_a, color
// pitch shares r1 with color, which has been consumed by the vdup above.
4364   mov pitch, #2048
4365
4366   vmov colors_b, colors_a
4367   sub pitch, pitch, width, lsl #1
4368
4369   mov num_width, width
4370
4371  0:  
4372   vst1.u32 { colors_a, colors_b }, [ vram_ptr, :256 ]!
4373
4374   subs num_width, num_width, #16
4375   bne 0b
4376
4377   add vram_ptr, vram_ptr, pitch
4378   mov num_width, width
4379
4380   subs height, height, #1
4381   bne 0b
4382
4383   bx lr
4384  
4385
// Core register aliases for the setup_sprite macros below. Earlier meanings
// of the reused names are dropped first.
//
// Several aliases deliberately share one register, so at most one name per
// register is live at any point. Examples:
//   r1  x / texture_offset_base
//   r3  u / tile_height / texture_page_ptr
//   r4  v / num_blocks / left_block_mask / dirty_textures_mask
//   r9  offset_v / column_data
//   r14 texture_block_ptr / temp (this is also the link register)
4386 #undef x
4387 #undef y
4388 #undef width
4389 #undef height
4390 #undef fb_ptr
4391 #undef texture_mask
4392 #undef num_blocks
4393 #undef temp
4394 #undef dirty_textures_mask
4395 #undef clut_ptr
4396 #undef current_texture_mask
4397
4398 #define psx_gpu                                           r0
4399 #define x                                                 r1
4400 #define y                                                 r2
4401 #define u                                                 r3
4402 #define v                                                 r4
4403 #define width                                             r5
4404 #define height                                            r6
4405 #define offset_u                                          r8
4406 #define offset_v                                          r9
4407 #define offset_u_right                                    r10
4408 #define width_rounded                                     r11
4409 #define height_rounded                                    r12
4410
4411 #define texture_offset_base                               r1
4412 #define tile_width                                        r2
4413 #define tile_height                                       r3
4414 #define num_blocks                                        r4
4415 #define block                                             r5
4416 #define sub_tile_height                                   r6
4417 #define fb_ptr                                            r7
4418 #define texture_mask                                      r8
4419 #define column_data                                       r9
4420 #define texture_offset                                    r10
4421 #define tiles_remaining                                   r11
4422 #define fb_ptr_advance_column                             r12
4423 #define texture_block_ptr                                 r14
4424
4425 #define temp                                              r14
4426
4427 #define texture_page_ptr                                  r3
4428 #define left_block_mask                                   r4
4429 #define right_block_mask                                  r5
4430 #define texture_mask_rev                                  r10
4431 #define control_mask                                      r11
4432
4433 #define dirty_textures_mask                               r4
4434 #define clut_ptr                                          r5
4435 #define current_texture_mask                              r6
4436
4437
// NEON register aliases for the setup_sprite macros below. Earlier meanings
// of the reused names are dropped first.
//
// Overlaps to keep in mind:
//   q1 (draw_masks_fb_ptrs)  = d2/d3, also named draw_mask_fb_ptr_left/right
//                              and draw_mask_fb_ptr_left_a/_b
//   q5 (draw_masks_fb_ptrs2) = d10/d11 = draw_mask_fb_ptr_right_a/_b
//   q2 (clut_a)              = d4/d5   = clut_low_a/_b
//   q3 (clut_b)              = d6/d7   = clut_high_a/_b
//   q7 (texels_wide)         = d14/d15 = texels_wide_low/_high
4438 #undef texels
4439 #undef clut_low_a
4440 #undef clut_low_b
4441 #undef clut_high_a
4442 #undef clut_high_b
4443 #undef clut_a
4444 #undef clut_b
4445 #undef texels_low
4446 #undef texels_high
4447
4448 #define texels                                            d0
4449 #define draw_masks_fb_ptrs                                q1
4450
4451 #define draw_mask_fb_ptr_left                             d2
4452 #define draw_mask_fb_ptr_right                            d3
4453
4454 #define draw_mask_fb_ptr_left_a                           d2
4455 #define draw_mask_fb_ptr_left_b                           d3
4456 #define draw_mask_fb_ptr_right_a                          d10
4457 #define draw_mask_fb_ptr_right_b                          d11
4458 #define draw_masks_fb_ptrs2                               q5
4459
4460 #define clut_low_a                                        d4
4461 #define clut_low_b                                        d5
4462 #define clut_high_a                                       d6
4463 #define clut_high_b                                       d7
4464
4465 #define block_masks                                       d8
4466 #define block_masks_shifted                               d9
4467
4468 #define clut_a                                            q2
4469 #define clut_b                                            q3
4470
4471 #define texels_low                                        d12
4472 #define texels_high                                       d13
4473
4474 #define texels_wide_low                                   d14
4475 #define texels_wide_high                                  d15
4476 #define texels_wide                                       q7
4476 #define texels_wide                                       q7
4477
4478
// setup_sprite_flush_blocks
//
// Calls flush_render_block_buffer from inside the sprite setup code and then
// rewinds the block write pointer to the start of the block array
// (psx_gpu + psx_gpu_blocks_offset).
//
// Preserved across the call: d2-d11 (that is q1-q5: draw masks, CLUT and
// block masks), r0-r3, r12 and lr. On return block (r5) has been reset.
4479 setup_sprite_flush_blocks:
4480   vpush { d2 - d11 }
4481
4482   push { r0 - r3, r12, lr }
4483   bl flush_render_block_buffer
4484   pop { r0 - r3, r12, lr }
4485
4486   vpop { d2 - d11 }
4487
// psx_gpu (r0) has been restored by the pop above.
4488   add block, psx_gpu, #psx_gpu_blocks_offset
4489   bx lr
4490
4491
// setup_sprite_update_texture_4bpp_cache
//
// Thin wrapper around update_texture_4bpp_cache that keeps r0-r3 intact for
// the sprite setup code. Returns by popping the saved link register straight
// into pc.
4492 setup_sprite_update_texture_4bpp_cache:
4493   push { r0 - r3, lr }
4494   bl update_texture_4bpp_cache
4495   pop { r0 - r3, pc }
4496
4497
// setup_sprite_update_texture_8bpp_cache
//
// Thin wrapper around update_texture_8bpp_cache that keeps r0-r3 intact for
// the sprite setup code. Returns by popping the saved link register straight
// into pc.
4498 setup_sprite_update_texture_8bpp_cache:
4499   push { r0 - r3, lr }
4500   bl update_texture_8bpp_cache
4501   pop { r0 - r3, pc }
4502
4503
// setup_sprite_tiled_initialize_4bpp
//
// Prepares 4bpp (CLUT based) sprite setup:
//  - loads 32 bytes from the address stored at psx_gpu_clut_ptr_offset into
//    clut_a/clut_b (q2/q3) and de-interleaves them bytewise with vuzp.u8, so
//    clut_a (clut_low_a/_b) ends up with the even bytes and clut_b
//    (clut_high_a/_b) with the odd bytes;
//  - if current_texture_mask and the 4bpp dirty textures mask have any bit
//    in common, calls setup_sprite_update_texture_4bpp_cache.
// The NEON instructions placed between tst and blne do not modify the ARM
// condition flags.
// Clobbers r4-r6 (dirty_textures_mask, clut_ptr, current_texture_mask), the
// flags, q2/q3 and, when the call is taken, lr.
4504 #define setup_sprite_tiled_initialize_4bpp()                                   \
4505   ldr dirty_textures_mask,                                                     \
4506    [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ];                      \
4507   ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ];                         \
4508                                                                                \
4509   ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4510   vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ];                             \
4511                                                                                \
4512   tst current_texture_mask, dirty_textures_mask;                               \
4513   vuzp.u8 clut_a, clut_b;                                                      \
4514                                                                                \
4515   blne setup_sprite_update_texture_4bpp_cache                                  \
4516
// setup_sprite_tiled_initialize_8bpp
//
// Prepares 8bpp sprite setup: if current_texture_mask and the 8bpp dirty
// textures mask have any bit in common, calls
// setup_sprite_update_texture_8bpp_cache. No CLUT is loaded here.
// Clobbers r4 (dirty_textures_mask), r6 (current_texture_mask), the flags
// and, when the call is taken, lr.
4517 #define setup_sprite_tiled_initialize_8bpp()                                   \
4518   ldr dirty_textures_mask,                                                     \
4519    [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ];                      \
4520   ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4521                                                                                \
4522   tst current_texture_mask, dirty_textures_mask;                               \
4523   blne setup_sprite_update_texture_8bpp_cache                                  \
4524
4525
// setup_sprite_block_count_single / _double expand to the flexible second
// operand that gives the number of blocks one tile pass emits:
// sub_tile_height (one block per row) or sub_tile_height * 2 (two blocks per
// row).
//
// setup_sprite_tile_add_blocks(type) reserves that many blocks: it adds the
// count to num_blocks and, if the total is greater than MAX_BLOCKS (signed
// comparison), restarts num_blocks at just this count and calls
// setup_sprite_flush_blocks. The conditional call overwrites lr (r14), which
// is the same register as texture_block_ptr / temp.
4526 #define setup_sprite_block_count_single()                                      \
4527   sub_tile_height                                                              \
4528
4529 #define setup_sprite_block_count_double()                                      \
4530   sub_tile_height, lsl #1                                                      \
4531
4532 #define setup_sprite_tile_add_blocks(type)                                     \
4533   add num_blocks, num_blocks, setup_sprite_block_count_##type();               \
4534   cmp num_blocks, #MAX_BLOCKS;                                                 \
4535                                                                                \
4536   movgt num_blocks, setup_sprite_block_count_##type();                         \
4537   blgt setup_sprite_flush_blocks                                               \
4538
4539
// setup_sprite_tile_full_4bpp(edge)
//
// Emits the blocks for one full-width tile of a 4bpp sprite; the edge
// argument is not used by this variant.
//
// Reserves sub_tile_height * 2 blocks, then loops once per row (label 4) and
// emits two 64-byte blocks per row:
//   left  block: 8 bytes read at texture_page_ptr +
//                (texture_offset & texture_mask)
//   right block: 8 bytes read at texture_page_ptr +
//                ((texture_offset + 8) & texture_mask)
// Each source byte is used as a vtbl index into the low-byte and high-byte
// CLUT tables; vst2.u8 interleaves the two results and stores 16 bytes at
// block + 0. The 8-byte draw mask / fb_ptr register (fb_ptr in lane 1) is
// stored at block + 40.
// fb_ptr advances by 16 bytes between the left and right block and by
// 2048 - 16 bytes to reach the next row; texture_offset advances by 0x10 per
// row and by a further 0xF00 once the tile is done. num_blocks is written
// back to psx_gpu at the end.
// sub_tile_height is decremented before it is tested, so it has to be at
// least 1 on entry.
4540 #define setup_sprite_tile_full_4bpp(edge)                                      \
4541   setup_sprite_tile_add_blocks(double);                                        \
4542                                                                                \
4543  4:                                                                            \
4544   and texture_block_ptr, texture_offset, texture_mask;                         \
4545   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4546                                                                                \
4547   pld [ fb_ptr ];                                                              \
4548   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4549   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4550                                                                                \
4551   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4552   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4553                                                                                \
4554   vst2.u8 { texels_low, texels_high }, [ block, :128 ];                        \
4555   add texture_block_ptr, texture_offset, #8;                                   \
4556                                                                                \
4557   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4558   add block, block, #40;                                                       \
4559                                                                                \
4560   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4561   add fb_ptr, fb_ptr, #16;                                                     \
4562                                                                                \
4563   vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ];                          \
4564   add block, block, #24;                                                       \
4565                                                                                \
4566   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4567   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4568                                                                                \
4569   pld [ fb_ptr ];                                                              \
4570   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4571   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4572                                                                                \
4573   vst2.u8 { texels_low, texels_high }, [ block, :128 ];                        \
4574   add block, block, #40;                                                       \
4575                                                                                \
4576   add texture_offset, texture_offset, #0x10;                                   \
4577   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4578                                                                                \
4579   vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ];                         \
4580   add block, block, #24;                                                       \
4581                                                                                \
4582   subs sub_tile_height, sub_tile_height, #1;                                   \
4583   bne 4b;                                                                      \
4584                                                                                \
4585   add texture_offset, texture_offset, #0xF00;                                  \
4586   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
4587
4588   
// setup_sprite_tile_half_4bpp(edge)
//
// Emits the blocks for one half-width (8 texel) tile of a 4bpp sprite.
// edge selects which draw mask register is used: draw_mask_fb_ptr_left or
// draw_mask_fb_ptr_right.
//
// Reserves sub_tile_height blocks, then loops once per row (label 4) and
// emits one 64-byte block per row: 8 bytes are read at texture_page_ptr +
// (texture_offset & texture_mask), each byte is used as a vtbl index into
// the low-byte and high-byte CLUT tables, and vst2.u8 interleaves the two
// results into 16 bytes at block + 0. The 8-byte draw mask / fb_ptr register
// (fb_ptr in lane 1) is stored at block + 40.
// fb_ptr advances by 2048 bytes per row; texture_offset advances by 0x10 per
// row and by a further 0xF00 once the tile is done. num_blocks is written
// back to psx_gpu at the end.
// sub_tile_height is decremented before it is tested, so it has to be at
// least 1 on entry.
// texture_block_ptr is computed only once per row; nothing reads it after
// the texel load.
4589 #define setup_sprite_tile_half_4bpp(edge)                                      \
4590   setup_sprite_tile_add_blocks(single);                                        \
4591                                                                                \
4592  4:                                                                            \
4593   and texture_block_ptr, texture_offset, texture_mask;                         \
4594   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4595                                                                                \
4596   pld [ fb_ptr ];                                                              \
4597   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4598   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4599                                                                                \
4600   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4601   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4602                                                                                \
4603   vst2.u8 { texels_low, texels_high }, [ block, :128 ];                        \
4604   add block, block, #40;                                                       \
4605                                                                                \
4607   vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ];                        \
4608                                                                                \
4609   add block, block, #24;                                                       \
4610   add texture_offset, texture_offset, #0x10;                                   \
4611                                                                                \
4612   add fb_ptr, fb_ptr, #2048;                                                   \
4613   subs sub_tile_height, sub_tile_height, #1;                                   \
4614                                                                                \
4615   bne 4b;                                                                      \
4616                                                                                \
4617   add texture_offset, texture_offset, #0xF00;                                  \
4618   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
4619  
4620  
// setup_sprite_tile_full_8bpp(edge)
//
// Emits the blocks for one full-width tile of an 8bpp sprite; the edge
// argument is not used by this variant.
//
// Reserves sub_tile_height * 2 blocks. block is biased by +16 for the
// duration of the loop and restored afterwards, so relative to the start of
// each 64-byte block the stores land at:
//   +16  the 8 source bytes, copied unchanged (no CLUT lookup happens here)
//   +40  the 8-byte draw mask / fb_ptr register (fb_ptr in lane 1)
// Each row (label 4) emits a left block read at texture_page_ptr +
// (texture_offset & texture_mask) and a right block read 8 bytes further on
// (masked the same way).
// fb_ptr advances by 16 bytes between the two blocks and by 2048 - 16 bytes
// to reach the next row; texture_offset advances by 0x10 per row and by a
// further 0xF00 once the tile is done. num_blocks is written back to psx_gpu
// at the end.
// sub_tile_height is decremented before it is tested, so it has to be at
// least 1 on entry.
4621 #define setup_sprite_tile_full_8bpp(edge)                                      \
4622   setup_sprite_tile_add_blocks(double);                                        \
4623   add block, block, #16;                                                       \
4624                                                                                \
4625  4:                                                                            \
4626   and texture_block_ptr, texture_offset, texture_mask;                         \
4627   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4628                                                                                \
4629   pld [ fb_ptr ];                                                              \
4630   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4631   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4632                                                                                \
4633   add texture_block_ptr, texture_offset, #8;                                   \
4634   vst1.u32 { texels }, [ block, :64 ];                                         \
4635                                                                                \
4636   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4637   add block, block, #24;                                                       \
4638                                                                                \
4639   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4640                                                                                \
4641   add fb_ptr, fb_ptr, #16;                                                     \
4642   vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ];                          \
4643                                                                                \
4644   add block, block, #40;                                                       \
4645   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4646   pld [ fb_ptr ];                                                              \
4647                                                                                \
4648   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4649   vst1.u32 { texels }, [ block, :64 ];                                         \
4650   add block, block, #24;                                                       \
4651                                                                                \
4652   add texture_offset, texture_offset, #0x10;                                   \
4653   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4654                                                                                \
4655   vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ];                         \
4656   add block, block, #40;                                                       \
4657                                                                                \
4658   subs sub_tile_height, sub_tile_height, #1;                                   \
4659   bne 4b;                                                                      \
4660                                                                                \
4661   sub block, block, #16;                                                       \
4662   add texture_offset, texture_offset, #0xF00;                                  \
4663   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
4664
4665   
// setup_sprite_tile_half_8bpp(edge)
//
// Emits the blocks for one half-width (8 texel) tile of an 8bpp sprite.
// edge selects which draw mask register is used: draw_mask_fb_ptr_left or
// draw_mask_fb_ptr_right.
//
// Reserves sub_tile_height blocks. block is biased by +16 for the duration
// of the loop and restored afterwards, so relative to the start of each
// 64-byte block the stores land at:
//   +16  the 8 source bytes read at texture_page_ptr +
//        (texture_offset & texture_mask), copied unchanged
//   +40  the 8-byte draw mask / fb_ptr register (fb_ptr in lane 1)
// fb_ptr advances by 2048 bytes per row; texture_offset advances by 0x10 per
// row and by a further 0xF00 once the tile is done. num_blocks is written
// back to psx_gpu at the end.
// sub_tile_height is decremented before it is tested, so it has to be at
// least 1 on entry.
4666 #define setup_sprite_tile_half_8bpp(edge)                                      \
4667   setup_sprite_tile_add_blocks(single);                                        \
4668   add block, block, #16;                                                       \
4669                                                                                \
4670  4:                                                                            \
4671   and texture_block_ptr, texture_offset, texture_mask;                         \
4672   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4673   pld [ fb_ptr ];                                                              \
4674                                                                                \
4675   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4676   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4677                                                                                \
4678   vst1.u32 { texels }, [ block, :64 ];                                         \
4679   add block, block, #24;                                                       \
4680                                                                                \
4681   vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ];                        \
4682   add block, block, #40;                                                       \
4683                                                                                \
4684   add texture_offset, texture_offset, #0x10;                                   \
4685   add fb_ptr, fb_ptr, #2048;                                                   \
4686                                                                                \
4687   subs sub_tile_height, sub_tile_height, #1;                                   \
4688   bne 4b;                                                                      \
4689                                                                                \
4690   sub block, block, #16;                                                       \
4691   add texture_offset, texture_offset, #0xF00;                                  \
4692   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
4693
4694  
// Per-column adjustments applied before (pre) and after (post) a column of
// tiles is emitted. They are selected by token pasting on the edge mode
// (half or full) and, for half columns, on the edge (left or right).
//
//   pre,  half right: texture_offset = texture_offset_base + 8 and fb_ptr is
//                     moved forward by 16 bytes
//   pre,  half left : texture_offset = texture_offset_base
//   pre,  full      : texture_offset = texture_offset_base
//   post, half right: fb_ptr is moved back by the 16 bytes added in pre
//   post, half left and post, full: expand to nothing
4695 #define setup_sprite_tile_column_edge_pre_adjust_half_right()                  \
4696   add texture_offset, texture_offset_base, #8;                                 \
4697   add fb_ptr, fb_ptr, #16                                                      \
4698
4699 #define setup_sprite_tile_column_edge_pre_adjust_half_left()                   \
4700   mov texture_offset, texture_offset_base                                      \
4701
4702 #define setup_sprite_tile_column_edge_pre_adjust_half(edge)                    \
4703   setup_sprite_tile_column_edge_pre_adjust_half_##edge()                       \
4704
4705 #define setup_sprite_tile_column_edge_pre_adjust_full(edge)                    \
4706   mov texture_offset, texture_offset_base                                      \
4707
4708 #define setup_sprite_tile_column_edge_post_adjust_half_right()                 \
4709   sub fb_ptr, fb_ptr, #16                                                      \
4710
4711 #define setup_sprite_tile_column_edge_post_adjust_half_left()                  \
4712
4713 #define setup_sprite_tile_column_edge_post_adjust_half(edge)                   \
4714   setup_sprite_tile_column_edge_post_adjust_half_##edge()                      \
4715
4716 #define setup_sprite_tile_column_edge_post_adjust_full(edge)                   \
4717
4718
// setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,
//                                        x4mode)
//
// Emits one column that consists of a single tile. Here column_data holds the
// row count directly and is copied to sub_tile_height. The column is wrapped
// in the pre/post edge adjustment chosen by edge_mode, and the tile macro
// setup_sprite_tile_<edge_mode>_<texture_mode><x4mode> does the work.
4719 #define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,  \
4720  x4mode)                                                                       \
4721   mov sub_tile_height, column_data;                                            \
4722   setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
4723   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4724   setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
4725
// setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,
//                                       x4mode)
//
// Emits one column that spans several tiles vertically. column_data is read
// as a packed value:
//   bits  0-7   row count of the first (top) tile
//   bits  8-15  row count of the last (bottom) tile
//   bits 16+    copied to tiles_remaining; it counts the tiles after the
//               first one
// Sequence: first tile, then (tiles_remaining - 1) tiles of 16 rows each in
// loop 3, then the last tile at label 2. The pre adjustment runs once before
// the first tile and the post adjustment once after the last tile.
// tiles_remaining is decremented before it is tested, so the upper field has
// to be at least 1.
4726 #define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,   \
4727  x4mode)                                                                       \
4728   and sub_tile_height, column_data, #0xFF;                                     \
4729   mov tiles_remaining, column_data, lsr #16;                                   \
4730   setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
4731   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4732                                                                                \
4733   subs tiles_remaining, tiles_remaining, #1;                                   \
4734   beq 2f;                                                                      \
4735                                                                                \
4736  3:                                                                            \
4737   mov sub_tile_height, #16;                                                    \
4738   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4739   subs tiles_remaining, tiles_remaining, #1;                                   \
4740   bne 3b;                                                                      \
4741                                                                                \
4742  2:                                                                            \
4743   uxtb sub_tile_height, column_data, ror #8;                                   \
4744   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4745   setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
4746
4747
// setup_sprite_column_data_single / _multi
//
// Build column_data for the column height macros and load texture_page_ptr
// from psx_gpu.
//
// single: column_data = height.
// multi : column_data = (16 - offset_v)
//                     | (((height_rounded & 0xF) + 1) << 8)
//                     | ((tile_height - 1) << 16)
//
// Statement order matters because of register sharing: column_data is the
// same register as offset_v (r9), and texture_page_ptr is the same register
// as tile_height (r3), so tile_height is folded into column_data before the
// texture page pointer is loaded over it. height_rounded (r12) is modified in
// place.
4748 #define setup_sprite_column_data_single()                                      \
4749   mov column_data, height;                                                     \
4750   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]          \
4751
4752 #define setup_sprite_column_data_multi()                                       \
4753   and height_rounded, height_rounded, #0xF;                                    \
4754   rsb column_data, offset_v, #16;                                              \
4755                                                                                \
4756   add height_rounded, height_rounded, #1;                                      \
4757   sub tile_height, tile_height, #1;                                            \
4758                                                                                \
4759   orr column_data, column_data, tile_height, lsl #16;                          \
4760   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ];         \
4761                                                                                \
4762   orr column_data, column_data, height_rounded, lsl #8                         \
4763
// Draw mask setup for a column. Each macro fills both draw mask registers
// (d2 and d3) with a single byte of block_masks replicated to every byte
// lane. The tile macros later overwrite the upper 32-bit lane with fb_ptr.
//
//   left                : d2 = byte 0, d3 = byte 1
//   left, advance_column: same, and also computes
//                         fb_ptr_advance_column = 32 - height * 2048
//   right               : d2 = byte 4, d3 = byte 5
4764 #define setup_sprite_setup_left_draw_mask_fb_ptr()                             \
4765   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4766   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
4767
4768 #define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column()              \
4769   mov fb_ptr_advance_column, #32;                                              \
4770   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4771                                                                                \
4772   sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11;           \
4773   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
4774
4775 #define setup_sprite_setup_right_draw_mask_fb_ptr()                            \
4776   vdup.u8 draw_mask_fb_ptr_left, block_masks[4];                               \
4777   vdup.u8 draw_mask_fb_ptr_right, block_masks[5]                               \
4778
// setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, edge,
//                                       x4mode)
//
// Defines the code at label
//   setup_sprite_<tm>_single_<multi_height>_<edge_mode>_<edge><x4mode>
// for a sprite that is a single tile column wide.
//
// Because the one column is both the left and the right edge, the two 32-bit
// lanes of block_masks are OR-ed together (vext.32 with #1 swaps the lanes)
// before the draw mask registers are derived from it. The column is then
// emitted, and the code returns by popping r4-r11 and pc.
// NOTE(review): the matching push is not part of this macro; it is presumably
// done by the sprite setup entry code that jumps here.
4779 #define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode,     \
4780  edge, x4mode)                                                                 \
4781  setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode:     \
4782   setup_sprite_column_data_##multi_height();                                   \
4783   vext.32 block_masks_shifted, block_masks, block_masks, #1;                   \
4784   vorr.u32 block_masks, block_masks, block_masks_shifted;                      \
4785   setup_sprite_setup_left_draw_mask_fb_ptr##x4mode();                          \
4786                                                                                \
4787   setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
4788   ldmia sp!, { r4 - r11, pc }                                                  \
4789
// setup_sprite_tiled_advance_column
//
// Moves texture_offset_base to the next tile column: adds 0x100 and, if bits
// 8-11 have become zero as a result, subtracts 0x1000 (0x100 + 0xF00) so that
// the carry out of that field is undone. Modifies the flags.
4790 #define setup_sprite_tiled_advance_column()                                    \
4791   add texture_offset_base, texture_offset_base, #0x100;                        \
4792   tst texture_offset_base, #0xF00;                                             \
4793   subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00)             \
4794
// setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,
//                                      right_mode, x4mode)
//
// Defines the code at label
//   setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode><x4mode>
// for a sprite that spans two or more tile columns.
//
// Sequence:
//   1. Left column, emitted with left_mode and the left draw masks. When
//      left_mode is half, the edge argument is "right", i.e. the right half
//      of that tile is the part that gets drawn.
//   2. tile_width - 2 middle columns (loop 0), always full width, with both
//      draw mask register pairs (q1 and q5) cleared to zero.
//   3. Right column, emitted with right_mode and the right draw masks (edge
//      argument "left").
// After each column fb_ptr is advanced by fb_ptr_advance_column
// (32 - height * 2048, computed by the left draw mask setup), and the texture
// column is advanced before every column except the first.
// Returns by popping r4-r11 and pc.
// NOTE(review): the matching push is not part of this macro; it is presumably
// done by the sprite setup entry code that jumps here.
4795 #define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,      \
4796  right_mode, x4mode)                                                           \
4797  setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
4798   setup_sprite_column_data_##multi_height();                                   \
4799                                                                                \
4800   setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode();           \
4801                                                                                \
4802   setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
4803                                                                                \
4804   subs tile_width, tile_width, #2;                                             \
4805   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4806                                                                                \
4807   beq 1f;                                                                      \
4808                                                                                \
4809   vmov.u8 draw_masks_fb_ptrs, #0;                                              \
4810   vmov.u8 draw_masks_fb_ptrs2, #0;                                             \
4811                                                                                \
4812  0:                                                                            \
4813   setup_sprite_tiled_advance_column();                                         \
4814   setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode);      \
4815   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4816   subs tile_width, tile_width, #1;                                             \
4817   bne 0b;                                                                      \
4818                                                                                \
4819  1:                                                                            \
4820   setup_sprite_setup_right_draw_mask_fb_ptr##x4mode();                         \
4821                                                                                \
4822   setup_sprite_tiled_advance_column();                                         \
4823   setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
4824   ldmia sp!, { r4 - r11, pc }                                                  \
4825
4826
// Helpers for the regular (non-4x) sprite setup path.
//
//   offset_u_adjust         : expands to nothing
//   get_left_block_mask     : keeps only the low byte of left_block_mask
//   compare_left_block_mask : compares left_block_mask with 0xFF
//   get_right_block_mask    : right_block_mask = bits 8-15 of its old value,
//                             zero extended
//   compare_right_block_mask: compares right_block_mask with 0xFF
4827 #define setup_sprite_offset_u_adjust()                                         \
4828
4829 #define setup_sprite_get_left_block_mask()                                     \
4830   and left_block_mask, left_block_mask, #0xFF                                  \
4831
4832 #define setup_sprite_compare_left_block_mask()                                 \
4833   cmp left_block_mask, #0xFF                                                   \
4834
4835 #define setup_sprite_get_right_block_mask()                                    \
4836   uxtb right_block_mask, right_block_mask, ror #8                              \
4837
4838 #define setup_sprite_compare_right_block_mask()                                \
4839   cmp right_block_mask, #0xFF                                                  \
4840
4841
4842
4843 /* 4x stuff */
// Counterparts of the offset and block mask helpers for the _4x variants;
// they are selected by pasting the _4x suffix onto the macro names.
//
//   fb_ptr2                 : second name for the column_data register (r9)
//   offset_u_adjust_4x      : fb_ptr -= offset_u * 2 (using the old
//                             offset_u), then offset_u *= 2 and
//                             offset_u_right = offset_u_right * 2 + 1
//   get_left_block_mask_4x  : sign extends the low halfword of
//                             left_block_mask
//   get_right_block_mask_4x : sign extends the high halfword of
//                             right_block_mask
//   compare_*_block_mask_4x : compare with 0xFFFFFFFF, which after the sign
//                             extension matches a halfword of 0xFFFF
4844 #define fb_ptr2 column_data
4845
4846 #define setup_sprite_offset_u_adjust_4x()                                      \
4847   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
4848   lsl offset_u_right, #1;                                                      \
4849   lsl offset_u, #1;                                                            \
4850   add offset_u_right, #1                                                       \
4851
4852 #define setup_sprite_get_left_block_mask_4x()                                  \
4853   sxth left_block_mask, left_block_mask                                        \
4854
4855 #define setup_sprite_compare_left_block_mask_4x()                              \
4856   cmp left_block_mask, #0xFFFFFFFF                                             \
4857
4858 #define setup_sprite_get_right_block_mask_4x()                                 \
4859   sxth right_block_mask, right_block_mask, ror #16                             \
4860
4861 #define setup_sprite_compare_right_block_mask_4x()                             \
4862   cmp right_block_mask, #0xFFFFFFFF                                            \
4863
4864
/* Duplicate every 16-bit element of texels_: the source is copied into both
 * texels_wide_low and texels_wide_high, and zipping two identical registers
 * leaves each element twice in adjacent lanes across the low/high pair. */
4865 #define widen_texels_16bpp(texels_)                                            \
4866   vmov texels_wide_low, texels_;                                               \
4867   vmov texels_wide_high, texels_;                                              \
4868   vzip.16 texels_wide_low, texels_wide_high                                    \
4869
/* Same as widen_texels_16bpp, but duplicates every 8-bit element. */
4870 #define widen_texels_8bpp(texels_)                                             \
4871   vmov texels_wide_low, texels_;                                               \
4872   vmov texels_wide_high, texels_;                                              \
4873   vzip.8 texels_wide_low, texels_wide_high                                     \
4874
/* Fill one block record and step block_ to the next one.
 *   texels_            register stored at block_ + 0 (128-bit aligned store)
 *   block_             record pointer; advanced by 40 + 24 = 64 bytes in total
 *   draw_mask_fb_ptr_  64-bit register stored at block_ + 40; its 32-bit
 *                      lane 1 is overwritten with fb_ptr_ first
 *   fb_ptr_            framebuffer pointer recorded for this block
 * The 64-byte step matches the "num_blocks, lsl #6" indexing used by
 * setup_sprite_tiled_builder. */
4875 #define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)         \
4876   vst1.u32 { texels_ }, [ block_, :128 ];                                      \
4877   add block_, block_, #40;                                                     \
4878                                                                                \
4879   vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
4880   vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ];                             \
4881   add block_, block_, #24                                                      \
4882
/* 8bpp counterpart of write_block_16bpp. texels_ is stored with a 64-bit
 * aligned store at the incoming block_, which the caller has already advanced
 * by 16; the mask/fb_ptr pair is stored 24 bytes later and block_ is then
 * advanced by 40, again 64 bytes per record in total. */
4883 /* assumes 16-byte offset already added to block_ */
4884 #define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)          \
4885   vst1.u32 { texels_ }, [ block_, :64 ];                                       \
4886   add block_, block_, #24;                                                     \
4887                                                                                \
4888   vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
4889   vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ];                             \
4890   add block_, block_, #40                                                      \
4891
/* Emit four 16bpp block records for the texels held in texels_low/texels_high.
 *   fb_ptr_tmp           scratch register, clobbered
 *   draw_mask_fb_ptr_a_  mask register used for the two texels_low records
 *   draw_mask_fb_ptr_b_  mask register used for the two texels_high records
 * Recorded framebuffer pointers, in order:
 *   fb_ptr, fb_ptr + 1024*2, fb_ptr + 8*2, fb_ptr + 8*2 + 1024*2.
 * Each record stores texels_wide as produced by widen_texels_16bpp.
 * NOTE(review): +1024*2 is presumably the following output row - confirm. */
4892 #define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,             \
4893  draw_mask_fb_ptr_b_)                                                          \
4894   widen_texels_16bpp(texels_low);                                              \
4895   add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
4896                                                                                \
4897   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr);          \
4898                                                                                \
4899   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);      \
4900   widen_texels_16bpp(texels_high);                                             \
4901                                                                                \
4902   add fb_ptr_tmp, fb_ptr, #8*2;                                                \
4903   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);      \
4904                                                                                \
4905   add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
4906   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)       \
4907
/* 8bpp counterpart of do_texture_block_16bpp_4x. texels is widened once;
 * texels_wide_low feeds the two records written with draw_mask_fb_ptr_a_ and
 * texels_wide_high the two written with draw_mask_fb_ptr_b_. The recorded
 * framebuffer pointers are the same four as in the 16bpp version, and
 * fb_ptr_tmp is clobbered. */
4908 #define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,              \
4909  draw_mask_fb_ptr_b_)                                                          \
4910   widen_texels_8bpp(texels);                                                   \
4911   add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
4912                                                                                \
4913   write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr);       \
4914   write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);   \
4915                                                                                \
4916   add fb_ptr_tmp, fb_ptr, #8*2;                                                \
4917   write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);  \
4918                                                                                \
4919   add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
4920   write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)   \
4921
4922
/* 4bpp / 4x one-time setup: load clut_ptr from the psx_gpu structure, read
 * clut_a and clut_b from it (128-bit aligned load), then de-interleave the
 * bytes so clut_a holds the even-numbered bytes and clut_b the odd-numbered
 * ones.
 * NOTE(review): the tile macros look these up as clut_low_a/b and
 * clut_high_a/b, presumably aliases of halves of clut_a/clut_b - confirm. */
4923 #define setup_sprite_tiled_initialize_4bpp_4x()                                \
4924   ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ];                         \
4925   vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ];                             \
4926                                                                                \
4927   vuzp.u8 clut_a, clut_b                                                       \
4928
/* 8bpp / 4x one-time setup: expands to nothing. */
4929 #define setup_sprite_tiled_initialize_8bpp_4x()                                \
4931
/* Operand fragment, not an instruction: sub_tile_height * 4. This matches the
 * four block records written per row by one do_texture_block_*_4x call.
 * NOTE(review): presumably pasted into an instruction by
 * setup_sprite_tile_add_blocks(single_4x), which is defined elsewhere. */
4932 #define setup_sprite_block_count_single_4x()                                   \
4933   sub_tile_height, lsl #2                                                      \
4934
/* Operand fragment: sub_tile_height * 8, for full tiles that call
 * do_texture_block_*_4x twice per row. */
4935 #define setup_sprite_block_count_double_4x()                                   \
4936   sub_tile_height, lsl #(1+2)                                                  \
4937
/* Emit the block records for one full-width tile in 4bpp / 4x mode.
 * The edge argument is not referenced by this macro.
 *
 * After setup_sprite_tile_add_blocks(double_4x) (defined elsewhere), the loop
 * at label 4 runs sub_tile_height times. Per iteration:
 *   - 8 texel bytes are loaded from
 *     texture_page_ptr + (texture_offset & texture_mask), looked up through
 *     the clut_low / clut_high tables with vtbl and zipped together, then
 *     written by do_texture_block_16bpp_4x using the *_left_a/_left_b masks;
 *   - the same is done for offset texture_offset + 8 with fb_ptr advanced by
 *     16*2 and the *_right_a/_right_b masks;
 *   - texture_offset grows by 0x10 and fb_ptr ends 2048*2 bytes past its
 *     value at the start of the iteration (16*2 + (2048 - 16)*2).
 * column_data is pushed and popped around the loop because it serves as
 * fb_ptr2. On exit texture_offset is advanced by 0xF00 and num_blocks is
 * stored back to the psx_gpu structure. */
4938 #define setup_sprite_tile_full_4bpp_4x(edge)                                   \
4939   setup_sprite_tile_add_blocks(double_4x);                                     \
4940   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
4941                                                                                \
4942  4:                                                                            \
4943   and texture_block_ptr, texture_offset, texture_mask;                         \
4944   pld [ fb_ptr ];                                                              \
4945                                                                                \
4946   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4947   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4948                                                                                \
4949   add texture_block_ptr, texture_offset, #8;                                   \
4950   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4951                                                                                \
4952   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4953   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4954                                                                                \
4955   vzip.8 texels_low, texels_high;                                              \
4956   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                  \
4957    draw_mask_fb_ptr_left_b);                                                   \
4958                                                                                \
4959   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4960   pld [ fb_ptr, #2048 ];                                                       \
4961                                                                                \
4962   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4963   add fb_ptr, fb_ptr, #16*2;                                                   \
4964                                                                                \
4965   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4966   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4967                                                                                \
4968   vzip.8 texels_low, texels_high;                                              \
4969   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                 \
4970    draw_mask_fb_ptr_right_b);                                                  \
4971                                                                                \
4972   add texture_offset, texture_offset, #0x10;                                   \
4973   add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
4974                                                                                \
4975   subs sub_tile_height, sub_tile_height, #1;                                   \
4976   bne 4b;                                                                      \
4977                                                                                \
4978   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
4979   add texture_offset, texture_offset, #0xF00;                                  \
4980   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
4981
4982
/* Emit the block records for one half-width tile in 4bpp / 4x mode.
 *   edge  pasted into draw_mask_fb_ptr_<edge>_a / _b to select the mask
 *         registers passed to do_texture_block_16bpp_4x
 *
 * After setup_sprite_tile_add_blocks(single_4x) (defined elsewhere), the loop
 * at label 4 runs sub_tile_height times. Per iteration it loads 8 texel bytes
 * from texture_page_ptr + (texture_offset & texture_mask), looks them up
 * through the clut_low / clut_high tables, zips the results and writes them
 * with do_texture_block_16bpp_4x; texture_offset grows by 0x10 and fb_ptr by
 * 2048*2. column_data is pushed and popped around the loop because it serves
 * as fb_ptr2. On exit texture_offset is advanced by 0xF00 and num_blocks is
 * stored back to the psx_gpu structure.
 *
 * NOTE(review): the second "add texture_block_ptr, texture_page_ptr, ..."
 * after the vld1 appears to have no consumer; texture_block_ptr is recomputed
 * at the top of the loop. Confirm register aliasing before removing it. */
4983 #define setup_sprite_tile_half_4bpp_4x(edge)                                   \
4984   setup_sprite_tile_add_blocks(single_4x);                                     \
4985   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
4986                                                                                \
4987  4:                                                                            \
4988   and texture_block_ptr, texture_offset, texture_mask;                         \
4989   pld [ fb_ptr ];                                                              \
4990                                                                                \
4991   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4992   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4993                                                                                \
4994   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4995   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4996                                                                                \
4997   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4998   add texture_offset, texture_offset, #0x10;                                   \
4999                                                                                \
5000   vzip.8 texels_low, texels_high;                                              \
5001   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,              \
5002    draw_mask_fb_ptr_##edge##_b);                                               \
5003                                                                                \
5004   pld [ fb_ptr, #2048 ];                                                       \
5005   add fb_ptr, fb_ptr, #2048 * 2;                                               \
5006                                                                                \
5007   subs sub_tile_height, sub_tile_height, #1;                                   \
5008   bne 4b;                                                                      \
5009                                                                                \
5010   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5011   add texture_offset, texture_offset, #0xF00;                                  \
5012   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
5013
5014
/* Emit the block records for one full-width tile in 8bpp / 4x mode.
 * The edge argument is not referenced by this macro.
 *
 * block is advanced by 16 before the loop, as write_block_8bpp expects, and
 * moved back by 16 afterwards. After
 * setup_sprite_tile_add_blocks(double_4x) (defined elsewhere), the loop at
 * label 4 runs sub_tile_height times. Per iteration:
 *   - 8 texel bytes from texture_page_ptr + (texture_offset & texture_mask)
 *     are written by do_texture_block_8bpp_4x with the *_left_a/_left_b masks;
 *   - 8 more from offset texture_offset + 8 are written with fb_ptr advanced
 *     by 16*2 and the *_right_a/_right_b masks;
 *   - texture_offset grows by 0x10 and fb_ptr ends 2048*2 bytes past its
 *     value at the start of the iteration.
 * No CLUT lookup happens here; the raw texel bytes are stored. column_data is
 * pushed and popped around the loop because it serves as fb_ptr2. On exit
 * texture_offset is advanced by 0xF00 and num_blocks is stored back to the
 * psx_gpu structure. */
5015 #define setup_sprite_tile_full_8bpp_4x(edge)                                   \
5016   setup_sprite_tile_add_blocks(double_4x);                                     \
5017   add block, block, #16;                                                       \
5018   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5019                                                                                \
5020  4:                                                                            \
5021   and texture_block_ptr, texture_offset, texture_mask;                         \
5022   pld [ fb_ptr ];                                                              \
5023                                                                                \
5024   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5025   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
5026                                                                                \
5027   add texture_block_ptr, texture_offset, #8;                                   \
5028   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                   \
5029    draw_mask_fb_ptr_left_b);                                                   \
5030                                                                                \
5031   pld [ fb_ptr, #2048 ];                                                       \
5032   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
5033                                                                                \
5034   add fb_ptr, fb_ptr, #16*2;                                                   \
5035   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5036                                                                                \
5037   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
5038                                                                                \
5039   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                  \
5040    draw_mask_fb_ptr_right_b);                                                  \
5041                                                                                \
5042   add texture_offset, texture_offset, #0x10;                                   \
5043   add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
5044                                                                                \
5045   subs sub_tile_height, sub_tile_height, #1;                                   \
5046   bne 4b;                                                                      \
5047                                                                                \
5048   sub block, block, #16;                                                       \
5049   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5050   add texture_offset, texture_offset, #0xF00;                                  \
5051   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
5052
5053   
/* Emit the block records for one half-width tile in 8bpp / 4x mode.
 *   edge  pasted into draw_mask_fb_ptr_<edge>_a / _b to select the mask
 *         registers passed to do_texture_block_8bpp_4x
 *
 * block is advanced by 16 before the loop, as write_block_8bpp expects, and
 * moved back by 16 afterwards. After
 * setup_sprite_tile_add_blocks(single_4x) (defined elsewhere), the loop at
 * label 4 runs sub_tile_height times. Per iteration it loads 8 texel bytes
 * from texture_page_ptr + (texture_offset & texture_mask) and writes them
 * with do_texture_block_8bpp_4x; texture_offset grows by 0x10 and fb_ptr by
 * 2048*2. column_data is pushed and popped around the loop because it serves
 * as fb_ptr2. On exit texture_offset is advanced by 0xF00 and num_blocks is
 * stored back to the psx_gpu structure. */
5054 #define setup_sprite_tile_half_8bpp_4x(edge)                                   \
5055   setup_sprite_tile_add_blocks(single_4x);                                     \
5056   add block, block, #16;                                                       \
5057   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5058                                                                                \
5059  4:                                                                            \
5060   and texture_block_ptr, texture_offset, texture_mask;                         \
5061   pld [ fb_ptr ];                                                              \
5062                                                                                \
5063   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5064   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
5065                                                                                \
5066   pld [ fb_ptr, #2048 ];                                                       \
5067   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,               \
5068    draw_mask_fb_ptr_##edge##_b);                                               \
5069                                                                                \
5070   add texture_offset, texture_offset, #0x10;                                   \
5071   add fb_ptr, fb_ptr, #2048 * 2;                                               \
5072                                                                                \
5073   subs sub_tile_height, sub_tile_height, #1;                                   \
5074   bne 4b;                                                                      \
5075                                                                                \
5076   sub block, block, #16;                                                       \
5077   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5078   add texture_offset, texture_offset, #0xF00;                                  \
5079   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
5080
5081  
/* 4x pre-adjust for a right-half edge column: start 8 bytes into the texture
 * row (texture_offset = texture_offset_base + 8) and move fb_ptr forward by
 * 16*2 bytes. */
5082 #define setup_sprite_tile_column_edge_pre_adjust_half_right_4x()               \
5083   add texture_offset, texture_offset_base, #8;                                 \
5084   add fb_ptr, fb_ptr, #16 * 2                                                  \
5085
/* 4x pre-adjust for a left-half edge column: texture_offset starts at
 * texture_offset_base and fb_ptr is left unchanged. */
5086 #define setup_sprite_tile_column_edge_pre_adjust_half_left_4x()                \
5087   mov texture_offset, texture_offset_base                                      \
5088
/* Dispatch on edge (left or right) to the matching half pre-adjust macro. */
5089 #define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge)                 \
5090   setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x()                  \
5091
/* 4x pre-adjust for a full-width column: texture_offset starts at
 * texture_offset_base. The edge argument is not referenced. */
5092 #define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge)                 \
5093   mov texture_offset, texture_offset_base                                      \
5094
/* 4x post-adjust for a right-half edge column: undo the 16*2-byte fb_ptr
 * advance made by the matching pre-adjust. */
5095 #define setup_sprite_tile_column_edge_post_adjust_half_right_4x()              \
5096   sub fb_ptr, fb_ptr, #16 * 2                                                  \
5097
/* 4x post-adjust for a left-half edge column: expands to nothing. */
5098 #define setup_sprite_tile_column_edge_post_adjust_half_left_4x()               \
5099
/* Dispatch on edge (left or right) to the matching half post-adjust macro. */
5100 #define setup_sprite_tile_column_edge_post_adjust_half_4x(edge)                \
5101   setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x()                 \
5102
/* 4x post-adjust for a full-width column: expands to nothing. */
5103 #define setup_sprite_tile_column_edge_post_adjust_full_4x(edge)                \
5104
5105
/* Broadcast byte lanes 0-3 of block_masks into the four 4x draw-mask
 * registers (left_a, left_b, right_a, right_b). The builder fills block_masks
 * with "vmov block_masks, left_block_mask, right_block_mask", so lanes 0-3 are
 * the four bytes of left_block_mask. */
5106 #define setup_sprite_setup_left_draw_mask_fb_ptr_4x()                          \
5107   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
5108   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
5109   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
5110   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
5111
/* Same mask setup as setup_sprite_setup_left_draw_mask_fb_ptr_4x, interleaved
 * with computing fb_ptr_advance_column = 32*2 - (height << 12); the shift
 * amount "11 + 1" evaluates to 12. */
5112 #define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x()           \
5113   mov fb_ptr_advance_column, #32 * 2;                                          \
5114   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
5115   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
5116   sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1;       \
5117   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
5118   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
5119
/* Broadcast byte lanes 4-7 of block_masks (the four bytes of
 * right_block_mask) into the same four draw-mask registers, for the rightmost
 * column. */
5120 #define setup_sprite_setup_right_draw_mask_fb_ptr_4x()                         \
5121   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4];                             \
5122   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5];                             \
5123   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6];                            \
5124   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7]                             \
5125
5126
5127 // r0: psx_gpu
5128 // r1: x
5129 // r2: y
5130 // r3: u
5131 // [ sp ]: v
5132 // [ sp + 4 ]: width
5133 // [ sp + 8 ]: height
5134 // [ sp + 12 ]: color (unused)
5135
/* Generate one sprite setup entry point plus the column routines it
 * dispatches to.
 *   texture_mode  4bpp or 8bpp; pasted into the routine and function names
 *   x4mode        empty for the 1x build or _4x; pasted onto the helper macro
 *                 names and the generated names
 *
 * The setup_sprite_tile_column_width_multi/single invocations (macros defined
 * elsewhere) instantiate the fourteen routines named in the jump table.
 *
 * The generated function setup_sprite_<texture_mode><x4mode>:
 *   - pushes r4-r11 and r14 (36 bytes), so the stack arguments v, width and
 *     height are read from sp + 36, sp + 40 and sp + 44;
 *   - computes fb_ptr = vram_out_ptr + (y << 11) + (x << 1) - (offset_u << 1)
 *     with offset_u = u & 0xF and offset_v = v & 0xF;
 *   - computes tile_width = (offset_u + width + 15) >> 4, offset_u_right as
 *     the low 4 bits of that rounded width, and
 *     tile_height = (offset_v + height + 15) >> 4;
 *   - builds left_block_mask = ~(0xFFFFFFFF << offset_u) and
 *     right_block_mask = 0xFFFFFFFE << offset_u_right, after the x4mode
 *     specific offset adjustment, and copies both into block_masks;
 *   - packs texture_offset_base and texture_mask into the nibble layouts
 *     given by the inline comments;
 *   - points block at blocks + num_blocks * 64;
 *   - builds control_mask: 0x1 if tile_width == 1, 0x2 if tile_height == 1,
 *     0x4 if the left-mask compare is equal, 0x8 if the right-mask compare is
 *     equal; then jumps through the table at label 9 indexed by control_mask,
 *     using JT_OP_REL / JT_OP / JTE (defined elsewhere).
 *
 * NOTE(review): table index 13 is a zero word and there is no index 15, so
 * control_mask values 13 and 15 have no routine. Presumably those
 * combinations cannot occur - confirm. */
5136 #define setup_sprite_tiled_builder(texture_mode, x4mode)                       \
5137                                                                                \
5138 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, full,        \
5139   x4mode);                                                                     \
5140 setup_sprite_tile_column_width_single(texture_mode, multi,  full, none,        \
5141   x4mode);                                                                     \
5142 setup_sprite_tile_column_width_multi(texture_mode,  single, full, full,        \
5143   x4mode);                                                                     \
5144 setup_sprite_tile_column_width_single(texture_mode, single, full, none,        \
5145   x4mode);                                                                     \
5146 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, full,        \
5147   x4mode);                                                                     \
5148 setup_sprite_tile_column_width_single(texture_mode, multi,  half, right,       \
5149   x4mode);                                                                     \
5150 setup_sprite_tile_column_width_multi(texture_mode,  single, half, full,        \
5151   x4mode);                                                                     \
5152 setup_sprite_tile_column_width_single(texture_mode, single, half, right,       \
5153   x4mode);                                                                     \
5154 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, half,        \
5155   x4mode);                                                                     \
5156 setup_sprite_tile_column_width_single(texture_mode, multi,  half, left,        \
5157   x4mode);                                                                     \
5158 setup_sprite_tile_column_width_multi(texture_mode,  single, full, half,        \
5159   x4mode);                                                                     \
5160 setup_sprite_tile_column_width_single(texture_mode, single, half, left,        \
5161   x4mode);                                                                     \
5162 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, half,        \
5163   x4mode);                                                                     \
5164 setup_sprite_tile_column_width_multi(texture_mode,  single, half, half,        \
5165   x4mode);                                                                     \
5166                                                                                \
5167 .align 4;                                                                      \
5168                                                                                \
5169 function(setup_sprite_##texture_mode##x4mode)                                  \
5170   stmdb sp!, { r4 - r11, r14 };                                                \
5171   setup_sprite_tiled_initialize_##texture_mode##x4mode();                      \
5172                                                                                \
  /* nine registers were pushed, so the stack arguments start at sp + 36     */\
5173   ldr v, [ sp, #36 ];                                                          \
5174   and offset_u, u, #0xF;                                                       \
5175                                                                                \
5176   ldr width, [ sp, #40 ];                                                      \
5177   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
5178                                                                                \
5179   ldr height, [ sp, #44 ];                                                     \
5180   add fb_ptr, fb_ptr, y, lsl #11;                                              \
5181                                                                                \
5182   add fb_ptr, fb_ptr, x, lsl #1;                                               \
5183   and offset_v, v, #0xF;                                                       \
5184                                                                                \
5185   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
5186   add width_rounded, offset_u, width;                                          \
5187                                                                                \
5188   add height_rounded, offset_v, height;                                        \
5189   add width_rounded, width_rounded, #15;                                       \
5190                                                                                \
5191   add height_rounded, height_rounded, #15;                                     \
5192   mov tile_width, width_rounded, lsr #4;                                       \
5193                                                                                \
5194   /* texture_offset_base = VH-VL-00-00                                       */\
5195   mov texture_offset_base, v, lsl #8;                                          \
5196   and offset_u_right, width_rounded, #0xF;                                     \
5197                                                                                \
5198   /* texture_offset_base = VH-UH-UL-00                                       */\
5199   bfi texture_offset_base, u, #4, #8;                                          \
5200   mov right_block_mask, #0xFFFFFFFE;                                           \
5201                                                                                \
5202   setup_sprite_offset_u_adjust##x4mode();                                      \
5203                                                                                \
5204   /* texture_offset_base = VH-UH-VL-00                                       */\
5205   bfi texture_offset_base, v, #4, #4;                                          \
5206   mov left_block_mask, #0xFFFFFFFF;                                            \
5207                                                                                \
5208   mov tile_height, height_rounded, lsr #4;                                     \
5209   mvn left_block_mask, left_block_mask, lsl offset_u;                          \
5210                                                                                \
5211   /* texture_mask = HH-HL-WH-WL                                              */\
5212   ldrh texture_mask, [ psx_gpu, #psx_gpu_texture_mask_width_offset ];          \
5213   mov right_block_mask, right_block_mask, lsl offset_u_right;                  \
5214                                                                                \
5215   /* texture_mask_rev = WH-WL-HH-HL                                          */\
5216   rev16 texture_mask_rev, texture_mask;                                        \
5217   vmov block_masks, left_block_mask, right_block_mask;                         \
5218                                                                                \
5219   /* texture_mask = HH-HL-HL-WL                                              */\
5220   bfi texture_mask, texture_mask_rev, #4, #4;                                  \
5221   /* texture_mask_rev = 00-00-00-WH                                          */\
5222   mov texture_mask_rev, texture_mask_rev, lsr #12;                             \
5223                                                                                \
5224   /* texture_mask = HH-WH-HL-WL                                              */\
5225   bfi texture_mask, texture_mask_rev, #8, #4;                                  \
5226   setup_sprite_get_left_block_mask##x4mode();                                  \
5227                                                                                \
  /* each orreq below consumes the flags of the compare just before it; the  */\
  /* instructions placed in between do not update the flags                  */\
5228   mov control_mask, #0;                                                        \
5229   setup_sprite_compare_left_block_mask##x4mode();                              \
5230                                                                                \
5231   setup_sprite_get_right_block_mask##x4mode();                                 \
5232   orreq control_mask, control_mask, #0x4;                                      \
5233                                                                                \
5234   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
5235   setup_sprite_compare_right_block_mask##x4mode();                             \
5236                                                                                \
5237   orreq control_mask, control_mask, #0x8;                                      \
5238   cmp tile_width, #1;                                                          \
5239                                                                                \
5240   add block, psx_gpu, #psx_gpu_blocks_offset;                                  \
5241   orreq control_mask, control_mask, #0x1;                                      \
5242                                                                                \
5243   cmp tile_height, #1;                                                         \
5244   add block, block, num_blocks, lsl #6;                                        \
5245                                                                                \
5246   orreq control_mask, control_mask, #0x2;                                      \
5247   JT_OP_REL(9f, control_mask, temp);                                           \
5248   JT_OP(ldr pc, [ pc, control_mask, lsl #2 ]);                                 \
5249   nop;                                                                         \
5250                                                                                \
  /* jump table indexed by control_mask; entries 0 to 14, entry 13 is zero   */\
5251  9:                                                                            \
5252  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode);   \
5253  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode);  \
5254  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode);  \
5255  .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
5256  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode);   \
5257  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
5258  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode);  \
5259  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
5260  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode);   \
5261  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode);  \
5262  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode);  \
5263  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
5264  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode);   \
5265  .word 0x00000000;                                                             \
5266  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode);  \
5267
5268
/*
 * Instantiate the tiled sprite setup routine once per texture mode
 * (4bpp, 8bpp), first with an empty x4mode suffix.
 */
5269 setup_sprite_tiled_builder(4bpp,);
5270 setup_sprite_tiled_builder(8bpp,);
5271
/*
 * The left/right mask register aliases are dropped before the _4x variants
 * are built.
 * NOTE(review): presumably the _4x macro bodies use differently named mask
 * registers - confirm against the macro definitions earlier in the file.
 */
5272 #undef draw_mask_fb_ptr_left
5273 #undef draw_mask_fb_ptr_right
5274
/* Same builder again, this time with _4x appended to the generated names. */
5275 setup_sprite_tiled_builder(4bpp, _4x);
5276 setup_sprite_tiled_builder(8bpp, _4x);
5277
5278
5279 #undef block_ptr
5280 #undef num_blocks
5281 #undef clut_ptr
5282
/*
 * Register aliases for texture_sprite_blocks_8bpp.
 * psx_gpu and block_ptr share r0: the context pointer is turned into the
 * block pointer once the last context field has been loaded.
 * texels_01..texels_67 reuse the registers of the even numbered texels.
 */
5283 #define psx_gpu                                           r0
5284 #define block_ptr                                         r0
5285 #define num_blocks                                        r1
5286 #define clut_ptr                                          r2
5287 #define texel_shift_mask                                  r3
5288 #define block_pixels_a                                    r4
5289 #define block_pixels_b                                    r5
5290 #define texel_0                                           r6
5291 #define texel_2                                           r7
5292 #define texel_4                                           r8
5293 #define texel_6                                           r9
5294 #define texel_1                                           r10
5295 #define texel_3                                           r11
5296 #define texel_5                                           r12
5297 #define texel_7                                           r14
5298 #define texels_01                                         r6
5299 #define texels_23                                         r7
5300 #define texels_45                                         r8
5301 #define texels_67                                         r9
5302
/*
 * texture_sprite_blocks_8bpp
 *
 * In:  r0 = psx_gpu
 *
 * Walks the num_blocks entries (64 bytes apart) that start at
 * psx_gpu + psx_gpu_blocks_offset.  For every entry the eight 8-bit values
 * held in bytes 16..23 are used as indices into the halfword table at
 * clut_ptr, and the eight 16-bit results are written to bytes 0..15 of the
 * same entry.
 *
 * r4-r11 are saved and restored; r0-r3 and r12 are overwritten.
 *
 * The loop counter is tested at the bottom, so at least one block is always
 * processed (num_blocks == 0 on entry is not handled).  The first word of
 * the following block is loaded inside the loop, which means the final
 * iteration also reads 4 bytes at offset 16 of the entry after the last one.
 */
5303 function(texture_sprite_blocks_8bpp)
5304   stmdb sp!, { r4 - r11, r14 }
5305   movw texel_shift_mask, #(0xFF << 1)   // selects (index * 2), i.e. a halfword offset
5306
5307   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5308   ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
5309
5310   add block_ptr, psx_gpu, #psx_gpu_blocks_offset   // r0 stops being psx_gpu here
5311   ldr block_pixels_a, [ block_ptr, #16 ]           // indices 0..3 of the first block
5312
5313  0:
/* Each and/shift pair isolates one byte of the index word, pre-multiplied by 2. */
5314   and texel_0, texel_shift_mask, block_pixels_a, lsl #1
5315   ldr block_pixels_b, [ block_ptr, #20 ]           // indices 4..7 of this block
5316
5317   and texel_1, texel_shift_mask, block_pixels_a, lsr #7
5318   ldrh texel_0, [ clut_ptr, texel_0 ]
5319
5320   and texel_2, texel_shift_mask, block_pixels_a, lsr #15
5321   ldrh texel_1, [ clut_ptr, texel_1 ]
5322
5323   and texel_3, texel_shift_mask, block_pixels_a, lsr #23
5324   ldr block_pixels_a, [ block_ptr, #(64 + 16) ]    // look ahead: indices 0..3 of the next block
5325
5326   ldrh texel_2, [ clut_ptr, texel_2 ]
5327   and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5328
5329   ldrh texel_3, [ clut_ptr, texel_3 ]
5330   and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5331
5332   ldrh texel_4, [ clut_ptr, texel_4 ]
5333   and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5334
5335   ldrh texel_5, [ clut_ptr, texel_5 ]
5336   and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5337
/* Pack pairs of 16-bit results into words and write them over bytes 0..15. */
5338   ldrh texel_6, [ clut_ptr, texel_6 ]
5339   orr texels_01, texel_0, texel_1, lsl #16
5340
5341   ldrh texel_7, [ clut_ptr, texel_7 ]
5342   orr texels_23, texel_2, texel_3, lsl #16
5343
5344   orr texels_45, texel_4, texel_5, lsl #16
5345   str texels_01, [ block_ptr, #0 ]
5346
5347   orr texels_67, texel_6, texel_7, lsl #16
5348   str texels_23, [ block_ptr, #4 ]
5349
5350   subs num_blocks, num_blocks, #1                  // flags consumed by the bne below
5351   str texels_45, [ block_ptr, #8 ]
5352
5353   str texels_67, [ block_ptr, #12 ]
5354   add block_ptr, block_ptr, #64                    // advance to the next block entry
5355
5356   bne 0b
5357
5358   ldmia sp!, { r4 - r11, pc }
5359
5360
5361 #undef width_rounded
5362 #undef texture_mask
5363 #undef num_blocks
5364 #undef texture_offset
5365 #undef texels_low
5366 #undef texels_high
5367 #undef texels_wide_low
5368 #undef texels_wide_high
5369 #undef texels_wide
5370 #undef fb_ptr2
5371 #undef temp
5372
/*
 * Register aliases for the 16bpp sprite setup routines below.
 * Several names share a register; the first group (arguments and values
 * derived from them) is consumed before the second and third groups
 * overwrite the same registers.
 */
5373 #define psx_gpu                                           r0
5374 #define x                                                 r1
5375 #define y                                                 r2
5376 #define u                                                 r3
5377 #define v                                                 r4
5378 #define width                                             r5
5379 #define height                                            r6
5380 #define left_offset                                       r8
5381 #define width_rounded                                     r9
5382 #define right_width                                       r10
5383
5384 #define block_width                                       r11
5385
5386 #define texture_offset_base                               r1
5387 #define texture_mask                                      r2
5388 #define texture_page_ptr                                  r3
5389 #define num_blocks                                        r4
5390 #define block                                             r5
5391 #define fb_ptr                                            r7
5392 #define texture_offset                                    r8
5393 #define blocks_remaining                                  r9
5394 #define fb_ptr2                                           r10
5395 #define fb_ptr_pitch                                      r12
5396 #define texture_block_ptr                                 r14
5397
5398 #define texture_mask_width                                r2
5399 #define texture_mask_height                               r3
5400 #define left_mask_bits                                    r4
5401 #define right_mask_bits                                   r5
5402
5403
5404 #undef block_masks
5405 #undef block_masks_shifted
5406 #undef texels
5407
/* NEON aliases; draw_mask_fb_ptr and draw_mask_fb_ptr_a both name d2. */
5408 #define block_masks                                       d0
5409 #define block_masks_shifted                               d1
5410 #define draw_mask_fb_ptr                                  d2
5411 #define texels                                            q2
5412
5413 #define draw_mask_fb_ptr_a                                d2
5414 #define draw_mask_fb_ptr_b                                d3
5415 #define texels_low                                        d4
5416 #define texels_high                                       d5
5417 #define texels_wide_low                                   d6
5418 #define texels_wide_high                                  d7
5419 #define texels_wide                                       q3
5420
5421
/*
 * setup_sprites_16bpp_flush
 *
 * File-local helper, reached with a conditional bl from the sprite setup
 * routines when the block count would exceed MAX_BLOCKS.
 *
 * In:  r0  = psx_gpu
 *      r11 = block_width (number of blocks the caller is about to emit)
 * Out: block (r5)      = psx_gpu + psx_gpu_blocks_offset (start of the buffer)
 *      num_blocks (r4) = block_width
 *
 * Calls flush_render_block_buffer with r0-r3, r12, lr and d0-d3 preserved
 * around the call.  No other NEON register is saved here, and the condition
 * flags are not saved either, so callers must not rely on them afterwards.
 */
5422 setup_sprites_16bpp_flush:
5423   vpush { d0 - d3 }
5424
5425   stmdb sp!, { r0 - r3, r12, r14 }
5426   bl flush_render_block_buffer
5427   ldmia sp!, { r0 - r3, r12, r14 }   // lr restored, so bx lr returns to our caller
5428
5429   vpop { d0 - d3 }
5430
/* The buffer is empty again: restart at its first entry. */
5431   add block, psx_gpu, #psx_gpu_blocks_offset
5432   mov num_blocks, block_width
5433
5434   bx lr
5435
/*
 * setup_sprite_16bpp
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u
 *      stack (in order): v, width, height
 *
 * Emits one 64-byte block per 8 texels of every sprite row.  Each block
 * receives 16 bytes of texture data at offset 0 and, at offset 40, a 64-bit
 * value whose low word is a byte-replicated draw mask and whose high word
 * is the destination framebuffer pointer.  psx_gpu->num_blocks is updated
 * after every row; setup_sprites_16bpp_flush is called whenever the count
 * would exceed MAX_BLOCKS.
 *
 * Two code paths exist: rows that fit in a single block (label 1 directly
 * below the setup code) and rows spanning several blocks (label 0 further
 * down).  Both loops test height at the bottom, so at least one row is
 * always emitted.
 */
5436 function(setup_sprite_16bpp)
5437   stmdb sp!, { r4 - r11, r14 }
5438   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
5439
/* 9 registers were pushed, so the first stack argument now sits at sp + 36. */
5440   ldr v, [ sp, #36 ]
5441   add fb_ptr, fb_ptr, y, lsl #11           // 2048 bytes per framebuffer row
5442
5443   ldr width, [ sp, #40 ]
5444   add fb_ptr, fb_ptr, x, lsl #1            // 2 bytes per pixel
5445
5446   ldr height, [ sp, #44 ]
5447   and left_offset, u, #0x7                 // texels skipped in the first block
5448
5449   add texture_offset_base, u, u            // byte offset of u; r1 (x) is dead from here
5450   add width_rounded, width, #7
5451
5452   add texture_offset_base, texture_offset_base, v, lsl #11
5453   mov left_mask_bits, #0xFF
5454   
5455   ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
5456   add width_rounded, width_rounded, left_offset
5457
5458   ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
5459   sub fb_ptr, fb_ptr, left_offset, lsl #1  // back up to the 8-texel aligned start
5460
/* texture_mask = mask_width * 2 + mask_height * 2048, same layout as the offset. */
5461   add texture_mask, texture_mask_width, texture_mask_width
5462   mov right_mask_bits, #0xFE
5463
5464   and right_width, width_rounded, #0x7
5465   mvn left_mask_bits, left_mask_bits, lsl left_offset   // low left_offset bits set
5466
5467   add texture_mask, texture_mask, texture_mask_height, lsl #11
5468   mov block_width, width_rounded, lsr #3   // blocks per row
5469
5470   mov right_mask_bits, right_mask_bits, lsl right_width
5471   movw fb_ptr_pitch, #(2048 + 16)
5472
/* After a multi-block row fb_ptr has moved (block_width - 1) * 16 bytes. */
5473   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
5474   vmov block_masks, left_mask_bits, right_mask_bits     // d0 = { left, right }
5475
5476   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5477   add block, psx_gpu, #psx_gpu_blocks_offset
5478
5479   bic texture_offset_base, texture_offset_base, #0xF    // align to a 16-byte texel block
5480   cmp block_width, #1
5481
5482   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5483   add block, block, num_blocks, lsl #6     // first free 64-byte block entry
5484
5485   bne 0f
5486
/* Single block per row: combine the left and right masks into one. */
5487   vext.32 block_masks_shifted, block_masks, block_masks, #1
5488   vorr.u32 block_masks, block_masks, block_masks_shifted
5489   vdup.u8 draw_mask_fb_ptr, block_masks[0]
5490
5491  1:
5492   add num_blocks, num_blocks, #1
5493   cmp num_blocks, #MAX_BLOCKS
5494   blgt setup_sprites_16bpp_flush
5495
5496   and texture_block_ptr, texture_offset_base, texture_mask
5497   subs height, height, #1                  // flags consumed by bne 1b below
5498
5499   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5500   vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5501
5502   vst1.u32 { texels }, [ block, :128 ]
5503   add block, block, #40
5504
5505   vmov.u32 draw_mask_fb_ptr[1], fb_ptr     // high word = destination pointer
5506   pld [ fb_ptr ]
5507
5508   vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5509
5510   add block, block, #24                    // 40 + 24 = one 64-byte block
5511   add texture_offset_base, texture_offset_base, #2048
5512   add fb_ptr, fb_ptr, #2048
5513   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5514   bne 1b
5515
5516   ldmia sp!, { r4 - r11, pc }
5517
/* Multi-block rows: first block (left mask), middle blocks, last block (right mask). */
5518  0:
5519   add num_blocks, num_blocks, block_width
5520   mov texture_offset, texture_offset_base
5521
5522   cmp num_blocks, #MAX_BLOCKS
5523   blgt setup_sprites_16bpp_flush
5524
5525   add texture_offset_base, texture_offset_base, #2048
5526   and texture_block_ptr, texture_offset, texture_mask
5527
5528   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5529   vld1.u32 { texels }, [ texture_block_ptr, :128 ]  
5530
5531   vst1.u32 { texels }, [ block, :128 ]
5532   add block, block, #40
5533
5534   vdup.u8 draw_mask_fb_ptr, block_masks[0] // left mask byte
5535   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5536   pld [ fb_ptr ]
5537
5538   vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5539   subs blocks_remaining, block_width, #2   // number of middle blocks
5540
5541   add texture_offset, texture_offset, #16
5542   add fb_ptr, fb_ptr, #16
5543
5544   vmov.u8 draw_mask_fb_ptr, #0             // middle blocks use an all-zero mask
5545
5546   add block, block, #24
5547   beq 2f                                   // exactly two blocks: no middle blocks
5548
5549  1:
5550   and texture_block_ptr, texture_offset, texture_mask
5551   subs blocks_remaining, blocks_remaining, #1
5552
5553   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5554   vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5555
5556   vst1.u32 { texels }, [ block, :128 ]
5557   add block, block, #40
5558
5559   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5560   pld [ fb_ptr ]
5561
5562   vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5563   
5564   add texture_offset, texture_offset, #16
5565   add fb_ptr, fb_ptr, #16
5566
5567   add block, block, #24
5568   bne 1b
5569
5570  2:
5571   and texture_block_ptr, texture_offset, texture_mask
5572   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5573
5574   vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5575   vdup.u8 draw_mask_fb_ptr, block_masks[4] // right mask byte (low byte of the second word)
5576
5577   vst1.u32 { texels }, [ block, :128 ]
5578   add block, block, #40
5579
5580   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5581   vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5582   
5583   add block, block, #24
5584   subs height, height, #1
5585
5586   add fb_ptr, fb_ptr, fb_ptr_pitch         // start of the next framebuffer row
5587   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5588
5589   bne 0b
5590
5591   ldmia sp!, { r4 - r11, pc }
5592
5593
5594 // 4x version
5595 // FIXME: duplicate code with normal version :(
5596 #undef draw_mask_fb_ptr
5597
/*
 * setup_sprite_16bpp_4x
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u
 *      stack (in order): v, width, height
 *
 * Same structure as setup_sprite_16bpp, with these visible differences:
 *  - the left/right masks are 16 bits wide and are split over two d
 *    registers (draw_mask_fb_ptr_a / _b),
 *  - the left texel offset is doubled before it is applied to fb_ptr and
 *    to the mask shift,
 *  - fb_ptr advances 32 bytes per texel block and 2 * 2048 bytes per row,
 *  - num_blocks grows by 4 per texel block (block_width is multiplied by 4),
 *  - the block writes are done by the do_texture_block_16bpp_4x macro,
 *    which is defined elsewhere in the file.
 * NOTE(review): the loops rely on do_texture_block_16bpp_4x leaving the
 * condition flags untouched - confirm against the macro definition.
 */
5598 function(setup_sprite_16bpp_4x)
5599   stmdb sp!, { r4 - r11, r14 }
5600   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
5601
/* 9 registers were pushed, so the first stack argument now sits at sp + 36. */
5602   ldr v, [ sp, #36 ]
5603   add fb_ptr, fb_ptr, y, lsl #11
5604
5605   ldr width, [ sp, #40 ]
5606   add fb_ptr, fb_ptr, x, lsl #1
5607
5608   ldr height, [ sp, #44 ]
5609   and left_offset, u, #0x7
5610
5611   add texture_offset_base, u, u
5612   add width_rounded, width, #7
5613
5614   add texture_offset_base, texture_offset_base, v, lsl #11
5615   movw left_mask_bits, #0xFFFF
5616   
5617   ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
5618   add width_rounded, width_rounded, left_offset
5619
5620   lsl left_offset, #1                      // each texel covers two output pixels
5621
5622   ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
5623   sub fb_ptr, fb_ptr, left_offset, lsl #1
5624
5625   add texture_mask, texture_mask_width, texture_mask_width
5626   movw right_mask_bits, #0xFFFC
5627
5628   and right_width, width_rounded, #0x7
5629   mvn left_mask_bits, left_mask_bits, lsl left_offset
5630
5631   lsl right_width, #1
5632
5633   add texture_mask, texture_mask, texture_mask_height, lsl #11
5634   mov block_width, width_rounded, lsr #3   // texel blocks per row (not yet * 4)
5635
5636   mov right_mask_bits, right_mask_bits, lsl right_width
5637   movw fb_ptr_pitch, #(2048 + 16) * 2
5638
5639   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
5640   vmov block_masks, left_mask_bits, right_mask_bits
5641
5642   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5643   add block, psx_gpu, #psx_gpu_blocks_offset
5644
5645   bic texture_offset_base, texture_offset_base, #0xF
5646   cmp block_width, #1
5647
5648   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5649   add block, block, num_blocks, lsl #6
5650
/* lsl without the s suffix keeps the flags of the cmp above for the bne. */
5651   lsl block_width, #2
5652   bne 0f
5653
/* Single texel block per row: combine the left and right masks into one. */
5654   vext.32 block_masks_shifted, block_masks, block_masks, #1
5655   vorr.u32 block_masks, block_masks, block_masks_shifted
5656   vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
5657   vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5658
5659  1:
5660   add num_blocks, num_blocks, block_width
5661   cmp num_blocks, #MAX_BLOCKS
5662   blgt setup_sprites_16bpp_flush
5663
5664   and texture_block_ptr, texture_offset_base, texture_mask
5665   subs height, height, #1                  // flags consumed by bne 1b below
5666
5667   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5668   vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5669
5670   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5671
5672   add texture_offset_base, texture_offset_base, #2048
5673   add fb_ptr, fb_ptr, #2048*2
5674   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5675   bne 1b
5676
5677   ldmia sp!, { r4 - r11, pc }
5678
/* Multi-block rows: first block (left mask), middle blocks, last block (right mask). */
5679  0:
5680   add num_blocks, num_blocks, block_width
5681   mov texture_offset, texture_offset_base
5682
5683   vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
5684   vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5685
5686   cmp num_blocks, #MAX_BLOCKS
5687   blgt setup_sprites_16bpp_flush
5688
5689   add texture_offset_base, texture_offset_base, #2048
5690   and texture_block_ptr, texture_offset, texture_mask
5691
5692   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5693   vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5694
5695   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5696
/* block_width is already * 4 here, so the first and last blocks count as 2 * 4. */
5697   subs blocks_remaining, block_width, #2*4
5698   add texture_offset, texture_offset, #16
5699
5700   vmov.u8 draw_mask_fb_ptr_a, #0           // middle blocks use an all-zero mask
5701   vmov.u8 draw_mask_fb_ptr_b, #0
5702
5703   add fb_ptr, fb_ptr, #16*2
5704   beq 2f
5705
5706  1:
5707   and texture_block_ptr, texture_offset, texture_mask
5708   subs blocks_remaining, blocks_remaining, #4
5709
5710   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5711   vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5712
5713   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5714   add texture_offset, texture_offset, #16
5715
5716   add fb_ptr, fb_ptr, #16*2
5717   bgt 1b
5718
5719  2:
5720   vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
5721   vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
5722
5723   and texture_block_ptr, texture_offset, texture_mask
5724   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5725
5726   vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5727
5728   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5729   subs height, height, #1
5730
5731   add fb_ptr, fb_ptr, fb_ptr_pitch         // start of the next output row pair
5732   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5733
5734   bne 0b
5735
5736   ldmia sp!, { r4 - r11, pc }
5737
5738
5739 #undef width
5740 #undef right_width
5741 #undef right_mask_bits
5742 #undef color
5743 #undef height
5744 #undef blocks_remaining
5745 #undef colors
5746 #undef right_mask
5747 #undef test_mask
5748 #undef draw_mask
5749
/*
 * Register aliases for setup_sprite_untextured.  As before, several names
 * share a register and are live at different times (for example
 * right_width/block in r5 and right_mask_bits/blocks_remaining in r6).
 */
5750 #define psx_gpu                                           r0
5751 #define x                                                 r1
5752 #define y                                                 r2
5753 #define width                                             r3
5754 #define right_width                                       r5
5755 #define right_mask_bits                                   r6
5756 #define fb_ptr                                            r7
5757 #define color                                             r8
5758 #define height                                            r9
5759 #define fb_ptr_pitch                                      r12
5760
5761 // referenced by setup_sprites_16bpp_flush
5762 #define num_blocks                                        r4
5763 #define block                                             r5
5764 #define block_width                                       r11
5765
5766 #define color_r                                           r1
5767 #define color_g                                           r2
5768 #define color_b                                           r8
5769 #define blocks_remaining                                  r6
5770
/* test_mask and draw_mask both name q2; test_mask is consumed before the loop. */
5771 #define colors                                            q0
5772 #define right_mask                                        q1
5773 #define test_mask                                         q2
5774 #define draw_mask                                         q2
5775 #define draw_mask_bits_fb_ptr                             d6
5776
5777
5778 .align 3
5779
/*
 * setup_sprite_untextured
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y
 *      stack: width at offset 4, height at offset 8, color at offset 12
 *      (the word at offset 0 and r3 on entry are not read here)
 *
 * If none of the tested render_state bits is set and the interlace bit of
 * render_mode is clear, control is handed to setup_sprite_untextured_simple
 * with all arguments untouched.  Otherwise one 64-byte block is emitted per
 * 8 pixels of every row: a 16-byte lane mask at offset 0, the replicated
 * 15-bit color at offset 16, and d6 (low word, fb_ptr in the high word) at
 * offset 40.
 *
 * NOTE(review): the low word of d6 is only zeroed on rows that have more
 * than one block; for block_width == 1 it is stored without ever being
 * initialised in this function.  Confirm that no consumer reads that word
 * for blocks produced here.
 */
5780 function(setup_sprite_untextured)
5781   ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ]
5782   tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS         \
5783     | RENDER_FLAGS_BLEND)
5784   ldrbeq r12, [ psx_gpu, #psx_gpu_render_mode_offset ]
5785   tsteq r12, #RENDER_INTERLACE_ENABLED
5786   beq setup_sprite_untextured_simple       // tail branch, nothing pushed yet
5787
5788   stmdb sp!, { r4 - r11, r14 }
5789
/* 9 registers were pushed: the stack arguments start at sp + 36. */
5790   ldr width, [ sp, #40 ]
5791   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
5792
5793   ldr height, [ sp, #44 ]
5794   add fb_ptr, fb_ptr, y, lsl #11           // 2048 bytes per framebuffer row
5795
5796   add fb_ptr, fb_ptr, x, lsl #1            // 2 bytes per pixel
5797   sub right_width, width, #1
5798
5799   ldr color, [ sp, #48 ]
5800   and right_width, #7
5801
5802   add block_width, width, #7
5803   add right_width, #1                      // ((width - 1) & 7) + 1: pixels in the last block
5804
5805   lsr block_width, #3                      // blocks per row
5806   mov right_mask_bits, #0xff
5807
5808   sub fb_ptr_pitch, block_width, #1
5809   lsl right_mask_bits, right_width         // bits at and above right_width are set
5810
5811   lsl fb_ptr_pitch, #3+1                   // (block_width - 1) * 16 bytes
5812   ubfx color_r, color, #3, #5              // top 5 bits of color byte 0
5813
5814   rsb fb_ptr_pitch, #1024*2                // 2048 minus the bytes advanced within a row
5815   ubfx color_g, color, #11, #5             // top 5 bits of color byte 1
5816
/* The 16 bytes at the start of psx_gpu are used as the per-lane test mask. */
5817   vld1.u32 { test_mask }, [ psx_gpu, :128 ]
5818   ubfx color_b, color, #19, #5             // top 5 bits of color byte 2 (overwrites color)
5819
5820   vdup.u16 right_mask, right_mask_bits
5821   orr color, color_r, color_b, lsl #10
5822
5823   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5824   orr color, color, color_g, lsl #5        // color = r | g << 5 | b << 10
5825
/* Lanes become all-ones where right_mask_bits and the lane test value overlap. */
5826   vtst.u16 right_mask, right_mask, test_mask
5827   add block, psx_gpu, #psx_gpu_blocks_offset
5828
5829   vdup.u16 colors, color
5830   add block, block, num_blocks, lsl #6     // first free 64-byte block entry
5831
5832
5833 setup_sprite_untextured_height_loop:
5834   add num_blocks, block_width
5835   sub blocks_remaining, block_width, #1    // blocks before the last one in this row
5836
5837   cmp num_blocks, #MAX_BLOCKS
5838   blgt setup_sprites_16bpp_flush
5839
5840   cmp blocks_remaining, #0
5841   ble 1f
5842
/* q2 and d6 are not preserved by the flush helper, so they are reset per row. */
5843   vmov.u8 draw_mask, #0 /* zero_mask */
5844   vmov.u8 draw_mask_bits_fb_ptr, #0
5845
5846  0:
5847   vst1.u32 { draw_mask }, [ block, :128 ]!
5848   subs blocks_remaining, #1                // flags consumed by bgt 0b below
5849
5850   vst1.u32 { colors }, [ block, :128 ]
5851   add block, block, #24
5852
5853   vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5854   vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
5855   
5856   add block, block, #24                    // 16 + 24 + 24 = one 64-byte block
5857   add fb_ptr, #8*2
5858   bgt 0b
5859
/* Last block of the row uses the right edge mask. */
5860  1:
5861   vst1.u32 { right_mask }, [ block, :128 ]!
5862   subs height, #1                          // flags consumed by the bgt at the end
5863
5864   vst1.u32 { colors }, [ block, :128 ]
5865   add block, block, #24
5866
5867   vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5868   vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
5869   
5870   add block, block, #24
5871   add fb_ptr, fb_ptr_pitch                 // start of the next framebuffer row
5872
5873   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5874   bgt setup_sprite_untextured_height_loop
5875
5876   ldmia sp!, { r4 - r11, pc }
5877
5878
5879
5880 #undef texture_page_ptr
5881 #undef vram_ptr
5882 #undef dirty_textures_mask
5883 #undef current_texture_mask
5884
/* Register aliases for update_texture_4bpp_cache. */
5885 #define psx_gpu                                           r0
5886 #define current_texture_page                              r1
5887 #define texture_page_ptr                                  r2
5888 #define vram_ptr_a                                        r3
5889 #define current_texture_page_x                            r12
5890 #define current_texture_page_y                            r4
5891 #define dirty_textures_mask                               r5
5892 #define tile_y                                            r6
5893 #define tile_x                                            r7
5894 #define sub_y                                             r8
5895 #define current_texture_mask                              r9
5896 #define c_4096                                            r10
5897 #define vram_ptr_b                                        r11
5898
/*
 * NEON aliases.  The _ab / _cd results reuse q2 / q3.
 * texel_block_expanded_d lives in q4 (d8/d9), which is callee-saved under
 * the AAPCS VFP rules, so the function below saves and restores it.
 */
5899 #define texel_block_a                                     d0
5900 #define texel_block_b                                     d1
5901 #define texel_block_expanded_a                            q1
5902 #define texel_block_expanded_b                            q2
5903 #define texel_block_expanded_ab                           q2
5904 #define texel_block_expanded_c                            q3
5905 #define texel_block_expanded_d                            q4
5906 #define texel_block_expanded_cd                           q3
5907
/*
 * update_texture_4bpp_cache
 *
 * In:  r0 = psx_gpu
 *
 * Expands the current 4bpp texture page from VRAM into the buffer at
 * psx_gpu->texture_page_base, one byte per texel, and clears the bits of
 * current_texture_mask in dirty_textures_4bpp_mask.
 *
 * The page is processed as 16 x 16 tiles.  Each tile is 8 source bytes
 * (16 texels) wide and 16 rows tall and is written as 256 consecutive
 * output bytes; every inner iteration converts two rows (32 texels).
 *
 * r4-r11 and q0-q4 are saved and restored.  The original code only saved
 * q0-q3 although q4 is written as well; q4 is now included so that the
 * callee-saved d8/d9 registers of the caller survive the call.
 */
5908 function(update_texture_4bpp_cache)
5909   stmdb sp!, { r4 - r11, r14 }
5910   vpush { q0 - q4 }
5911
5912   ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
5913
5914   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
5915   ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
5916
5917   and current_texture_page_x, current_texture_page, #0xF
5918   ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
5919
5920   mov current_texture_page_y, current_texture_page, lsr #4
5921   ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
5922
/* Page origin in VRAM: page_y * 256 rows * 2048 bytes + page_x * 128 bytes. */
5923   add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5924   mov tile_y, #16
5925
5926   add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7  
5927   bic dirty_textures_mask, current_texture_mask   // this page is no longer dirty
5928   
5929   mov tile_x, #16
5930   str dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
5931
5932   mov sub_y, #8
5933   movw c_4096, #4096                       // post-increment of two VRAM rows
5934
5935   add vram_ptr_b, vram_ptr_a, #2048        // _b walks the odd rows
5936
5937  0:
5938   vld1.u32 { texel_block_a }, [ vram_ptr_a, :64 ], c_4096
5939   vld1.u32 { texel_block_b }, [ vram_ptr_b, :64 ], c_4096
5940
/*
 * Split every source byte into its two nibbles: the widened copy keeps the
 * low nibble in bits 0..3, the copy shifted left by 4 keeps the high nibble
 * in bits 8..11; bits 4..7 are cleared in both before they are merged.
 */
5941   vmovl.u8 texel_block_expanded_a, texel_block_a
5942   vshll.u8 texel_block_expanded_b, texel_block_a, #4
5943   vmovl.u8 texel_block_expanded_c, texel_block_b
5944   vshll.u8 texel_block_expanded_d, texel_block_b, #4
5945
5946   vbic.u16 texel_block_expanded_a, #0x00F0
5947   vbic.u16 texel_block_expanded_b, #0x00F0
5948   vbic.u16 texel_block_expanded_c, #0x00F0
5949   vbic.u16 texel_block_expanded_d, #0x00F0
5950
5951   vorr.u16 texel_block_expanded_ab, texel_block_expanded_a,                    \
5952    texel_block_expanded_b
5953   vorr.u16 texel_block_expanded_cd, texel_block_expanded_c,                    \
5954    texel_block_expanded_d
5955
5956   vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd },               \
5957    [ texture_page_ptr, :256 ]!
5958
5959   subs sub_y, sub_y, #1
5960   bne 0b
5961
/* Tile done: step 8 bytes right and back up the 16 rows just consumed. */
5962   mov sub_y, #8
5963   add vram_ptr_a, vram_ptr_a, #8
5964   add vram_ptr_b, vram_ptr_b, #8
5965
5966   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
5967   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
5968
5969   subs tile_x, tile_x, #1
5970   bne 0b
5971
/* Tile row done: go down 16 rows and back to the left edge of the page. */
5972   mov tile_x, #16
5973   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
5974   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
5975
5976   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
5977   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
5978
5979   subs tile_y, tile_y, #1
5980   bne 0b
5981
5982   vpop { q0 - q4 }
5983   ldmia sp!, { r4 - r11, pc }
5984
5985
5986 #undef current_texture_page
5987
/*
 * Register aliases for update_texture_8bpp_cache_slice.
 * texture_page (r1) is an argument; current_texture_page (r5) is loaded
 * from psx_gpu inside the function.
 */
5988 #define psx_gpu                                           r0
5989 #define texture_page                                      r1
5990 #define texture_page_ptr                                  r2
5991 #define vram_ptr_a                                        r3
5992 #define texture_page_x                                    r12
5993 #define texture_page_y                                    r4
5994 #define current_texture_page                              r5
5995 #define tile_y                                            r6
5996 #define tile_x                                            r7
5997 #define sub_y                                             r8
5998 #define c_4096                                            r10
5999 #define vram_ptr_b                                        r11
6000
6001
6002 #undef texels_a
6003 #undef texels_b
6004
6005 #define texels_a                                          q0
6006 #define texels_b                                          q1
6007 #define texels_c                                          q2
6008 #define texels_d                                          q3
6009
6010
/*
 * update_texture_8bpp_cache_slice
 *
 * In:  r0 = psx_gpu, r1 = texture_page
 *
 * Copies one slice of an 8bpp texture (the VRAM page selected by
 * texture_page) into the buffer at psx_gpu->texture_page_base.  The slice
 * is handled as 16 rows of 8 tiles; a tile is 16 bytes wide and 16 rows
 * tall and is written as 256 consecutive bytes.  After every row of 8
 * tiles the output pointer skips a further 8 * 256 bytes, so the two
 * slices of a page end up interleaved row of tiles by row of tiles.  When
 * the low bit of texture_page differs from the low bit of
 * psx_gpu->current_texture_page the output starts 8 * 256 bytes into the
 * buffer, i.e. in the other half of each tile row.
 *
 * r4-r11 and q0-q3 are saved and restored.
 */
6011 function(update_texture_8bpp_cache_slice)
6012   stmdb sp!, { r4 - r11, r14 }
6013   vpush { q0 - q3 }
6014
6015   ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
6016   ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
6017
6018   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
6019   mov tile_y, #16
6020
6021   and texture_page_x, texture_page, #0xF
6022   mov texture_page_y, texture_page, lsr #4
6023
/* Page origin in VRAM: page_x * 128 bytes + page_y * 256 rows * 2048 bytes. */
6024   add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7  
6025   mov tile_x, #8
6026
6027   add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
6028   eor current_texture_page, current_texture_page, texture_page
6029
6030   ands current_texture_page, current_texture_page, #0x1   // does the low page bit differ?
6031   mov sub_y, #4
6032
6033   addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16) // yes: use the second half
6034   movw c_4096, #4096                       // post-increment of two VRAM rows
6035
6036   add vram_ptr_b, vram_ptr_a, #2048        // _b walks the odd rows
6037
6038  0:
/* Four consecutive rows per iteration: a, b, a + 2 rows, b + 2 rows. */
6039   vld1.u32 { texels_a }, [ vram_ptr_a, :128 ], c_4096
6040   vld1.u32 { texels_b }, [ vram_ptr_b, :128 ], c_4096
6041   vld1.u32 { texels_c }, [ vram_ptr_a, :128 ], c_4096
6042   vld1.u32 { texels_d }, [ vram_ptr_b, :128 ], c_4096
6043
6044   vst1.u32 { texels_a, texels_b }, [ texture_page_ptr, :256 ]!
6045   vst1.u32 { texels_c, texels_d }, [ texture_page_ptr, :256 ]!
6046
6047   subs sub_y, sub_y, #1
6048   bne 0b
6049
/* Tile done: step 16 bytes right and back up the 16 rows just consumed. */
6050   mov sub_y, #4
6051
6052   add vram_ptr_a, vram_ptr_a, #16
6053   add vram_ptr_b, vram_ptr_b, #16
6054
6055   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6056   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6057
6058   subs tile_x, tile_x, #1
6059   bne 0b
6060
/* Tile row done: go down 16 rows and back to the left edge of the slice. */
6061   mov tile_x, #8
6062
6063   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6064   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6065
6066   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6067   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6068
6069   subs tile_y, tile_y, #1
6070   add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)  // skip the other slice; flags untouched
6071
6072   bne 0b
6073
6074   vpop { q0 - q3 }
6075   ldmia sp!, { r4 - r11, pc }
6076
6077
6078 /* void scale2x_tiles8(void *dst, const void *src, int w8, int h) */
/*
 * In:  r0 = dst, r1 = src, r2 = w8 (number of 8-pixel tiles per row),
 *      r3 = h (number of source rows)
 *
 * Doubles 16-bit pixels in both directions: every source pixel is written
 * twice horizontally, and each expanded row is stored to two destination
 * rows that are 2048 bytes apart.  Source rows are 2048 bytes apart and
 * destination row pairs are 4096 bytes apart.
 *
 * Two tiles are loaded per inner iteration.  When w8 is odd the second
 * load of the last iteration still happens (16 bytes past the last tile)
 * but its result is not stored.  Both loops test their counter at the
 * bottom, so w8 and h are expected to be at least 1.
 *
 * Uses r12, r14 and q0-q3 as scratch; r4 is saved and restored.
 */
6079 function(scale2x_tiles8)
6080   push { r4, r14 }
6081
6082   mov r4, r1                   // r4 = start of the current source row
6083   add r12, r0, #1024*2         // r12 = second destination row
6084   mov r14, r2                  // r14 = tiles left in this row
6085
6086 0:
6087   vld1.u16 { q0 }, [ r1, :128 ]!
6088   vld1.u16 { q2 }, [ r1, :128 ]!
/* Zipping a register with a copy of itself duplicates every pixel. */
6089   vmov q1, q0
6090   vmov q3, q2
6091   vzip.16 q0, q1
6092   vzip.16 q2, q3
6093   subs r14, #2
6094   vst1.u16 { q0, q1 }, [ r0, :128 ]!
6095   vst1.u16 { q0, q1 }, [ r12, :128 ]!
6096   blt 1f                       // only one tile was left: skip the second store
6097   vst1.u16 { q2, q3 }, [ r0, :128 ]!
6098   vst1.u16 { q2, q3 }, [ r12, :128 ]!
6099   bgt 0b
6100 1:
6101   subs r3, #1                  // flags consumed by the bgt below
6102   mov r14, r2
6103   add r0, #1024*2*2            // two destination rows down ...
6104   add r4, #1024*2              // next source row
6105   sub r0, r0, r2, lsl #4+1     // ... and back by the w8 * 32 bytes just written
6106   mov r1, r4
6107   add r12, r0, #1024*2
6108   bgt 0b
6109   nop
6110
6111   pop { r4, pc }
6112
6113 // vim:filetype=armasm