63252b0fa6867c1fbbc2cfd2cbae453526d80c5d
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
1 /*
2  * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
3  * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation; either version 2 of
8  * the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License for more details.
14  */
15
// Numeric limits and render flag/state bit values.
// NOTE(review): presumably mirrors of C-side definitions with the same
// names - confirm and keep in sync.
16 #define MAX_SPANS                                         512
17 #define MAX_BLOCKS                                        64
18 #define MAX_BLOCKS_PER_ROW                                128
19
20 #define RENDER_STATE_MASK_EVALUATE                        0x20
21 #define RENDER_FLAGS_MODULATE_TEXELS                      0x1
22 #define RENDER_FLAGS_BLEND                                0x2
23 #define RENDER_INTERLACE_ENABLED                          0x1
24
// Supplies the psx_gpu_*_offset macros used below to address members of the
// psx_gpu structure relative to its base pointer.
25 #include "psx_gpu_offsets.h"
26
// b_dx is addressed as the word 4 bytes past psx_gpu_b_block_span_offset
// instead of having its own entry in psx_gpu_offsets.h.
27 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
28
// NOTE(review): presumably byte offsets of the fields of one span edge-data
// entry (the 2-byte spacing suggests 16-bit fields) - confirm against the
// C structure definition.
29 #define edge_data_left_x_offset                           0
30 #define edge_data_num_blocks_offset                       2
31 #define edge_data_right_mask_offset                       4
32 #define edge_data_y_offset                                6
33
34 .syntax unified
35 .text
36
// Core-register aliases used by compute_all_gradients.
//
// Several names map to the same physical register, so an alias may only be
// used while every other alias of that register is dead:
//   r0  psx_gpu, store_a            r1  v_a, store_b
//   r2  v_b, g_bx0                  r3  v_c, g_bx (= ga_bx, gw_bx_h)
//   r4  x0, g_bx2                   r5  x1, x0_x1 (= dx), area_r_s, g_bx3,
//                                       store_inc
//   r6  x2, x1_x2, b_base           r7  y0, y0_y1 (= dy), gs_bx
//   r8  y1, y1_y2, g_by (= ga_by, gw_by_h)
//   r9  y2, b0                      r10 b1, b0_b1 (= db), gs_by
//   r11 b2, b1_b2, gw_bx_l (= gw_by_l)
37 #define psx_gpu                                           r0
38 #define v_a                                               r1
39 #define v_b                                               r2
40 #define v_c                                               r3
41
// Vertex coordinates and blue components; the *_* names hold two packed
// 16-bit values and reuse the register of one of their inputs.
42 #define x0                                                r4
43 #define x1                                                r5
44 #define x2                                                r6
45 #define x0_x1                                             r5
46 #define x1_x2                                             r6
47 #define y0                                                r7
48 #define y1                                                r8
49 #define y2                                                r9
50 #define y0_y1                                             r7
51 #define y1_y2                                             r8
52 #define b0                                                r9
53 #define b1                                                r10
54 #define b2                                                r11
55 #define b0_b1                                             r10
56 #define b1_b2                                             r11
57
58
// Low word of the reciprocal computed by the double-precision divide.
59 #define area_r_s                                          r5
60
// Blue gradient results, in the order they are stored by the final stmia.
61 #define g_bx0                                             r2
62 #define g_bx                                              r3
63 #define g_bx2                                             r4
64 #define g_bx3                                             r5
65 #define b_base                                            r6
66 #define g_by                                              r8
67
// Sign masks of the raw blue gradients.
68 #define gs_bx                                             r7
69 #define gs_by                                             r10
70
// Absolute values of the raw blue gradients (same registers as the results).
71 #define ga_bx                                             g_bx
72 #define ga_by                                             g_by
73
// High words of the 64-bit gradient * reciprocal products.
74 #define gw_bx_h                                           g_bx
75 #define gw_by_h                                           g_by
76
// Low words of those products; both share r11.
77 #define gw_bx_l                                           r11
78 #define gw_by_l                                           gw_bx_l
79
// Output pointers and post-increment used for the NEON stores.
80 #define store_a                                           r0
81 #define store_b                                           r1
82 #define store_inc                                         r5
83
84
// NEON-register aliases used by compute_all_gradients.
//
// Each qN is the pair d(2N):d(2N+1), so the q and d names in one group
// below refer to the same storage. Names defined further down reuse the
// registers of earlier names once those values are dead (for example
// ga_uvrg_x/g_uvrg_x/gw_uv_x/uvrg_dx1 are all q1).
//
// q4-q7 (d8-d15) are callee-saved under the AAPCS, so any routine using
// x1_ab, x2_ab, y0_ab, y1_ab, ga_uvrg_y, gs_uvrg_x or gs_uvrg_y overwrites
// registers its caller expects to be preserved.
85 #define v0                                                q0
86 #define uvrgb0                                            d0
87 #define x0_y0                                             d1
88
89 #define v1                                                q1
90 #define uvrgb1                                            d2
91 #define x1_y1                                             d3
92
93 #define v2                                                q2
94 #define uvrgb2                                            d4
95 #define x2_y2                                             d5
96
// "a" operand set: { uvrg, xxxx } per vertex.
97 #define x0_ab                                             q3
98 #define uvrg_xxxx0                                        q3
99 #define uvrg0                                             d6
100 #define xxxx0                                             d7
101
102 #define x1_ab                                             q4
103 #define uvrg_xxxx1                                        q4
104 #define uvrg1                                             d8
105 #define xxxx1                                             d9
106
107 #define x2_ab                                             q5
108 #define uvrg_xxxx2                                        q5
109 #define uvrg2                                             d10
110 #define xxxx2                                             d11
111
// "b" operand set: { yyyy, uvrg } per vertex.
112 #define y0_ab                                             q6
113 #define yyyy_uvrg0                                        q6
114 #define yyyy0                                             d12
115 #define uvrg0b                                            d13
116
117 #define y1_ab                                             q7
118 #define yyyy_uvrg1                                        q7
119 #define yyyy1                                             d14
120 #define uvrg1b                                            d15
121
122 #define y2_ab                                             q8
123 #define yyyy_uvrg2                                        q8
124 #define yyyy2                                             d16
125 #define uvrg2b                                            d17
126
// 16-bit differences d0..d3 of the gradient formula, halves _a and _b.
127 #define d0_ab                                             q9
128 #define d0_a                                              d18
129 #define d0_b                                              d19
130
131 #define d1_ab                                             q10
132 #define d1_a                                              d20
133 #define d1_b                                              d21
134
135 #define d2_ab                                             q11
136 #define d2_a                                              d22
137 #define d2_b                                              d23
138
139 #define d3_ab                                             q12
140 #define d3_a                                              d24
141 #define d3_b                                              d25
142
// 32-bit raw gradients (later made absolute in place).
143 #define ga_uvrg_x                                         q1
144 #define ga_uvrg_y                                         q4
145
// Packed scalar differences; aliases of the core-register pairs.
146 #define dx                                                x0_x1
147 #define dy                                                y0_y1
148 #define db                                                b0_b1
149
150 #define uvrg_base                                         q11
151
// Sign masks of the raw gradients.
152 #define gs_uvrg_x                                         q5
153 #define gs_uvrg_y                                         q6
154
155 #define g_uvrg_x                                          q1
156 #define ga_uv_x                                           d2
157 #define g_uv_x                                            d2
158 #define ga_rg_x                                           d3
159 #define g_rg_x                                            d3
160
161 #define g_uvrg_y                                          q4
162 #define ga_uv_y                                           d8
163 #define g_uv_y                                            d8
164 #define ga_rg_y                                           d9
165 #define g_rg_y                                            d9
166
// 64-bit wide products of gradient * reciprocal.
167 #define gw_uv_x                                           q1
168 #define gw_rg_x                                           q2
169 #define gw_uv_y                                           q4
170 #define gw_rg_y                                           q3
171
172 #define w_mask                                            q9
173 #define w_mask_l                                          d18
174
175 #define r_shift                                           q10
176
// Multiples 0..3 of the x gradient, stored interleaved by vst4.
177 #define uvrg_dx0                                          q0
178 #define uvrg_dx0l                                         d0
179 #define uvrg_dx0h                                         d1
180
181 #define uvrg_dx1                                          q1
182 #define uvrg_dx1l                                         d2
183 #define uvrg_dx1h                                         d3
184
185 #define uvrg_dx2                                          q2
186 #define uvrg_dx2l                                         d4
187 #define uvrg_dx2h                                         d5
188
189 #define uvrg_dx3                                          q3
190 #define uvrg_dx3l                                         d6
191 #define uvrg_dx3h                                         d7
192
193 #define uvrgb_phase                                       q13
194
195 .align 4
196
// Platform glue: symbol declaration and jump-table helpers. The first branch
// is used when __MACH__ is not defined, the second when it is.
197 #ifndef __MACH__
198
// function(name): export `name` as a global symbol of type %function and
// open its label.
199 #define function(name)                                                         \
200   .global name;                                                                \
201   .type name, %function;                                                       \
202   name:                                                                        \
203
// Jump-table helpers: JT_OP_REL expands to nothing, JT_OP passes its
// argument through unchanged, and a table entry (JTE) is the target itself.
204 #define JT_OP_REL(table_label, index_reg, temp)
205 #define JT_OP(x...) x
206 #define JTE(start, target) target
207
208 #else
209
// function(name): export the underscore-prefixed symbol `_name` and define
// both `name` and `_name` as labels at this address.
210 #define function(name)                                                         \
211   .globl _##name;                                                              \
212   name:                                                                        \
213   _##name:                                                                     \
214
// Jump-table helpers: a table entry (JTE) is the difference target - start.
// JT_OP_REL loads entry number index_reg (4 bytes each) from table_label
// into temp and adds it to pc; JT_OP discards its argument.
// NOTE(review): in ARM state `add pc, pc, temp` reads pc as the address of
// that instruction + 8, so `start` presumably has to be the label located
// there - confirm at the use sites.
215 #define JT_OP_REL(table_label, index_reg, temp)                                \
216   adr temp, table_label;                                                       \
217   ldr temp, [ temp, index_reg, lsl #2 ];                                       \
218   add pc, pc, temp                                                             \
219
220 #define JT_OP(x...)
221 #define JTE(start, target) (target - start)
222
// References to these names are rewritten to their underscore-prefixed
// forms.
223 #define flush_render_block_buffer _flush_render_block_buffer
224 #define setup_sprite_untextured_simple _setup_sprite_untextured_simple
225 #define update_texture_8bpp_cache _update_texture_8bpp_cache
226
227 #endif
228
// compute_all_gradients(psx_gpu, v_a, v_b, v_c)
//
// Computes the x and y gradients of the u, v, r, g and b interpolants of
// the triangle (v_a, v_b, v_c), plus the interpolant base values, and
// stores them in *psx_gpu.
//
// Reads:    psx_gpu->triangle_area, psx_gpu->uvrgb_phase and
//           psx_gpu->triangle_winding; 16 bytes from each vertex (loaded
//           with a 128-bit alignment hint, so the vertices must be 16-byte
//           aligned). Within a vertex: u, v, r, g are bytes 0-3, b is
//           byte 4, x is the 16-bit value at byte 8 and y the one at
//           byte 10.
// Writes:   136 bytes starting at psx_gpu + psx_gpu_uvrg_offset:
//             +0   uvrg_base              +16  uvrg_dx1 (x gradient)
//             +32  g_uvrg_y (y gradient)
//             +48  x gradient times 0, 1, 2, 3, interleaved by vst4 (64 b)
//             +112 0, g_bx, 2 * g_bx, 3 * g_bx, b_base, g_by
// Clobbers: r0-r3, r12, flags, q0-q3, q8-q13, d30, d31.
//           r4-r11 and d8-d15 are saved on entry and restored on exit.
229 @ r0: psx_gpu
230 @ r1: v_a
231 @ r2: v_b
232 @ r3: v_c
233
234 function(compute_all_gradients)
235   // First compute the triangle area reciprocal and shift. The division will
236   // happen concurrently with much of the work which follows.
237   @ r12 = psx_gpu->triangle_area
238   ldr r12, [ psx_gpu, #psx_gpu_triangle_area_offset ]
239   stmdb sp!, { r4 - r11, lr }
  // q4-q7 (d8-d15) are callee-saved under the AAPCS and every one of them is
  // overwritten below, so preserve them for the caller.
  vpush { d8 - d15 }
240
241   @ load exponent of 62 into upper half of double
242   movw r4, #0
243   clz r14, r12                       @ r14 = shift
244
245   movt r4, #((62 + 1023) << 4)
246   mov r12, r12, lsl r14              @ r12 = triangle_area_normalized
247
248   @ load area normalized into lower half of double
249   mov r5, r12, lsr #10
250   vmov.f64 d30, r5, r4               @ d30 = (1 << 62) + ta_n
251
  @ build ta_n itself as a double: biased exponent for 2^30 in the upper
  @ half, bumped by one when bit 31 of ta_n lands in the exponent field
252   movt r4, #((1022 + 31) << 4)
253   mov r5, r12, lsl #20
254
255   add r4, r4, r12, lsr #11
256   vmov.f64 d31, r5, r4
257
258   vdiv.f64 d30, d30, d31             @ d30 = ((1 << 62) + ta_n) / ta_n
259
260   // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
261   // ( d0       *  d1      ) - ( d2       *  d3      ) =
262   // ( m0                  ) - ( m1                  ) = gradient
263
264   // This is split to do 12 elements at a time over three sets: a, b, and c.
265   // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
266   // two of the slots are unused.
267
268   // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
269   // is g.
270
271   // First type is:  uvrg bxxx xxxx
272   // Second type is: yyyy ybyy uvrg
273   // Since x_a and y_c are the same the same variable is used for both.
274
275   vld1.u32 { v0 }, [ v_a, : 128 ]    @ v0 = { uvrg0, b0, x0, y0 }
276   ldrsh x0, [ v_a, #8 ]              @ load x0
277
278   vld1.u32 { v1 }, [ v_b, : 128 ]    @ v1 = { uvrg1, b1, x1, y1 }
279   ldrh x1, [ v_b, #8 ]               @ load x1
280
281   vld1.u32 { v2 }, [ v_c, : 128 ]    @ v2 = { uvrg2, b2, x2, y2 }
282   ldrh x2, [ v_c, #8 ]               @ load x2
283
284   vmovl.u8 uvrg_xxxx0, uvrgb0        @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
285   ldrh y0, [ v_a, #10 ]              @ load y0
286
287   vmovl.u8 uvrg_xxxx1, uvrgb1        @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
288   ldrh y1, [ v_b, #10 ]              @ load y1
289
290   vmovl.u8 uvrg_xxxx2, uvrgb2        @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
291   ldrh y2, [ v_c, #10 ]              @ load y2
292
293   vmov.u8 uvrg0b, uvrg0              @ uvrg0b = { uv0, rg0 }
294   vdup.u16 xxxx0, x0_y0[0]           @ xxxx0 = { xx0, xx0 }
295
296   orr x1_x2, x1, x2, lsl #16         @ x1_x2 = { x1, x2 }
297   pkhbt x0_x1, x0, x1, lsl #16       @ x0_x1 = { x0, x1 }
298
299   vmov.u8 uvrg1b, uvrg1              @ uvrg1b = { uv1, rg1 }
300   vdup.u16 xxxx1, x1_y1[0]           @ xxxx1 = { xx1, xx1 }
301
302   vmov.u8 uvrg2b, uvrg2              @ uvrg2b = { uv2, rg2 }
303   vdup.u16 xxxx2, x2_y2[0]           @ xxxx2 = { xx2, xx2 }
304
305   ldrb b2, [ v_c, #4 ]               @ load b2
306   orr y0_y1, y0, y1, lsl #16         @ y0_y1 = { y0, y1 }
307
308   ldrb b1, [ v_b, #4 ]               @ load b1
309   orr y1_y2, y1, y2, lsl #16         @ y1_y2 = { y1, y2 }
310
311   vdup.u16 yyyy0, x0_y0[1]           @ yyyy0 = { yy0, yy0 }
312   vsub.s16 d0_ab, x1_ab, x0_ab
313
314   ldrb b0, [ v_a, #4 ]               @ load b0
315   orr b1_b2, b1, b2, lsl #16         @ b1_b2 = { b1, b2 }
316
317   vdup.u16 yyyy1, x1_y1[1]           @ yyyy1 = { yy1, yy1 }
318   vsub.s16 d2_ab, x2_ab, x1_ab
319
320   vdup.u16 yyyy2, x2_y2[1]           @ yyyy2 = { yy2, yy2 }
321   vsub.s16 d1_ab, y2_ab, y1_ab
322
323   orr b0_b1, b0, b1, lsl #16         @ b0_b1 = { b0, b1 }
324   ssub16 dx, x1_x2, x0_x1            @ dx = { x1 - x0, x2 - x1 }
325
326   ssub16 dy, y1_y2, y0_y1            @ dy = { y1 - y0, y2 - y1 }
327   ssub16 db, b1_b2, b0_b1            @ db = { b1 - b0, b2 - b1 }
328
329   vsub.s16 d3_ab, y1_ab, y0_ab
330   smusdx ga_by, dx, db               @ ga_by = ((x1 - x0) * (b2 - b1)) -
331                                      @         ((x2 - x1) * (b1 - b0))
332   vmull.s16 ga_uvrg_x, d0_a, d1_a
333   smusdx ga_bx, db, dy               @ ga_bx = ((b1 - b0) * (y2 - y1)) -
334                                      @         ((b2 - b1) * (y1 - y0))
335   vmlsl.s16 ga_uvrg_x, d2_a, d3_a
336   movs gs_bx, ga_bx, asr #31         @ gs_bx = sign mask of ga_bx (0 or -1)
337
338   vmull.s16 ga_uvrg_y, d0_b, d1_b
339   rsbmi ga_bx, ga_bx, #0             @ ga_bx = abs(ga_bx)
340
341   @ r12 = psx_gpu->uvrgb_phase
342   ldr r12, [ psx_gpu, #psx_gpu_uvrgb_phase_offset ]
343
344   vmlsl.s16 ga_uvrg_y, d2_b, d3_b
345   movs gs_by, ga_by, asr #31         @ gs_by = sign mask of ga_by (0 or -1)
346
347   vshr.u64 d0, d30, #22              @ d0 = raw bits of the quotient >> 22
348   add b_base, r12, b0, lsl #16       @ b_base = phase + (b0 << 16)
349
350   vdup.u32 uvrgb_phase, r12
351
352   rsbmi ga_by, ga_by, #0             @ ga_by = abs(ga_by)
353   vclt.s32 gs_uvrg_x, ga_uvrg_x, #0  @ gs_uvrg_x = ga_uvrg_x < 0
354
355   @ r12 = psx_gpu->triangle_winding
356   ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ]
357   vclt.s32 gs_uvrg_y, ga_uvrg_y, #0  @ gs_uvrg_y = ga_uvrg_y < 0
358
359   rsb r12, r12, #0                   @ r12 = -(triangle->winding)
360
361   vdup.u32 w_mask, r12               @ w_mask = { -w, -w, -w, -w }
362   sub r14, r14, #(62 - 12)           @ r14 = shift - (62 - FIXED_BITS)
363
364   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
365   vdup.u32 r_shift, r14              @ r_shift = { shift, shift, shift, shift }
366
367   vadd.u32 uvrg_base, uvrgb_phase
368   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
369
370   vmov area_r_s, s0                  @ area_r_s = triangle_reciprocal
371   vabs.s32 ga_uvrg_y, ga_uvrg_y      @ ga_uvrg_y = abs(ga_uvrg_y)
372
  @ 64-bit products of the absolute gradients and the reciprocal
373   vmull.u32 gw_rg_x, ga_rg_x, d0[0]
374   vmull.u32 gw_uv_x, ga_uv_x, d0[0]
375   vmull.u32 gw_rg_y, ga_rg_y, d0[0]
376   vmull.u32 gw_uv_y, ga_uv_y, d0[0]
377
  @ scale the products by the per-lane shift count held in r_shift
378   vshl.u64 gw_rg_x, gw_rg_x, r_shift
379   vshl.u64 gw_uv_x, gw_uv_x, r_shift
380   vshl.u64 gw_rg_y, gw_rg_y, r_shift
381   vshl.u64 gw_uv_y, gw_uv_y, r_shift
382
  @ fold the winding into the sign masks, narrow the products to 32 bits,
  @ then apply the sign as (value ^ mask) - mask
383   veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
384   vmovn.u64 g_uv_x, gw_uv_x
385
386   veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
387   vmovn.u64 g_rg_x, gw_rg_x
388
389   veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
390   vmovn.u64 g_uv_y, gw_uv_y
391
392   vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
393   vmovn.u64 g_rg_y, gw_rg_y
394
395   veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
396   mov ga_bx, ga_bx, lsl #13
397
398   vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
399   mov ga_by, ga_by, lsl #13
400
401   vdup.u32 x0_y0, x0                 @ x0_y0 = { x0, x0 }
402   umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
403
404   vshl.u32 g_uvrg_x, g_uvrg_x, #4
405   vshl.u32 g_uvrg_y, g_uvrg_y, #4
406
407   umull gw_by_l, gw_by_h, ga_by, area_r_s
408   vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0] @ uvrg_base -= g_uvrg_x * x0
409
410   eor gs_bx, gs_bx, r12              @ fold winding into the b sign masks
411   vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1   @ uvrg_dx2 = 2 * uvrg_dx1
412
413   veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0   @ uvrg_dx0 = 0
414   eor gs_by, gs_by, r12
415
416   rsb r11, r14, #0                   @ r11 = negative shift for scalar lsr
417   add store_a, psx_gpu, #psx_gpu_uvrg_offset
418
419   sub r11, r11, #(32 - 13)
420
421   add store_b, store_a, #16
422   mov store_inc, #32
423
424   vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1   @ uvrg_dx3 = 3 * uvrg_dx1
425   vst1.u32 { uvrg_base }, [ store_a, : 128 ], store_inc
426
427   vst1.u32 { uvrg_dx1 }, [ store_b, : 128 ], store_inc
428   mov g_bx, gw_bx_h, lsr r11
429
430   vst1.u32 { g_uvrg_y }, [ store_a, : 128 ], store_inc
431   mov g_by, gw_by_h, lsr r11
432
433   vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l },                     \
434    [ store_b, : 128 ], store_inc
435   eor g_bx, g_bx, gs_bx              @ g_bx = (g_bx ^ gs_bx) - gs_bx
436
437   vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h },                     \
438    [ store_b, : 128 ], store_inc
439   sub g_bx, g_bx, gs_bx
440
441   lsl g_bx, g_bx, #4
442   eor g_by, g_by, gs_by              @ g_by = (g_by ^ gs_by) - gs_by
443
444   mls b_base, g_bx, x0, b_base       @ b_base -= g_bx * x0
445   sub g_by, g_by, gs_by
446
447   lsl g_by, g_by, #4
448   mov g_bx0, #0
449
450   add g_bx2, g_bx, g_bx              @ g_bx2 = 2 * g_bx
451   add g_bx3, g_bx, g_bx2             @ g_bx3 = 3 * g_bx
452
453   stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
454
  vpop { d8 - d15 }                  @ restore callee-saved NEON registers
455   ldmia sp!, { r4 - r11, pc }
456
457
// Register aliases for the setup_spans_* / compute_edge_delta_* macros
// below. psx_gpu and v_a..v_c are redefined with the same values as
// earlier in the file.
//
// Many names share a register and are only valid in disjoint phases:
//   r1-r3  v_a/v_b/v_c, then y_a/y_b/y_c
//   r4     x_a, edge_alt_low, edge_dx_dy_alt_low, span_edge_data
//   r5     x_b, edge_alt_high, edge_dx_dy_alt_high, span_uvrg_offset
//   r6     x_c, edge_dx_dy_alt, span_b_offset
//   r9     height_major, height
//   r10    reciprocal_table_ptr, edge_shift_alt
//   r14    temp, clip
458 #define psx_gpu                                  r0
459 #define v_a                                      r1
460 #define v_b                                      r2
461 #define v_c                                      r3
462
463 #define temp                                     r14
464
465 #define x_a                                      r4
466 #define x_b                                      r5
467 #define x_c                                      r6
468 #define y_a                                      r1
469 #define y_b                                      r2
470 #define y_c                                      r3
471
472 #define height_minor_a                           r7
473 #define height_minor_b                           r8
474 #define height_major                             r9
475 #define height                                   r9
476
477 #define reciprocal_table_ptr                     r10
478
479 #define edge_alt_low                             r4
480 #define edge_alt_high                            r5
481 #define edge_dx_dy_alt                           r6
482 #define edge_shift_alt                           r10
483
484 #define edge_dx_dy_alt_low                       r4
485 #define edge_dx_dy_alt_high                      r5
486
487 #define span_edge_data                           r4
488 #define span_uvrg_offset                         r5
489 #define span_b_offset                            r6
490
491 #define clip                                     r14
492
493 #define b                                        r11
494 #define b_dy                                     r12
495
496
// NEON aliases. Each qN is the pair d(2N):d(2N+1); notable overlaps:
//   q0  alternate_x, edges_xy          q1  alternate_dx_dy =
//                                          d2 edges_dx_dy : d3 edge_shifts
//   q2  alternate_x_32, edge_shifts_64 (d4 is also alternate_x_16)
//   q3  left_x, v_clip = d6 height_reciprocals/v_clip_low : d7 heights
//   q4  right_x = d8 widths : d9 c_0x01
//   q5  left_dx_dy = d10 x_starts : d11 x_ends
//   q6  right_dx_dy (d12 is also heights_b)
//   q10 right_x_32, edges_dx_dy_64     q11 left_right_x_16, left_x_32
//   q12 span_shifts_y = d24 span_shifts/alternate_select : d25 y_x4
//   q13 c_0x0001 = d26 c_0xFFFE : d27 c_0x0007
// q4-q7 (d8-d15) are callee-saved under the AAPCS.
497 #define alternate_x                              q0
498 #define alternate_dx_dy                          q1
499 #define alternate_x_32                           q2
500
501 #define alternate_x_low                          d0
502 #define alternate_x_high                         d1
503 #define alternate_dx_dy_low                      d2
504 #define alternate_dx_dy_high                     d3
505 #define alternate_x_32_low                       d4
506 #define alternate_x_32_high                      d5
507
508 #define left_x                                   q3
509 #define right_x                                  q4
510 #define left_dx_dy                               q5
511 #define right_dx_dy                              q6
512 #define left_edge                                q7
513 #define right_edge                               q8
514
515 #define left_x_low                               d6
516 #define left_x_high                              d7
517 #define right_x_low                              d8
518 #define right_x_high                             d9
519 #define left_dx_dy_low                           d10
520 #define left_dx_dy_high                          d11
521 #define right_dx_dy_low                          d12
522 #define right_dx_dy_high                         d13
523 #define left_edge_low                            d14
524 #define left_edge_high                           d15
525 #define right_edge_low                           d16
526 #define right_edge_high                          d17
527
528 #define y_mid_point                              d18
529 #define c_0x0004                                 d19
530
531 #define left_right_x_16                          q11
532 #define span_shifts_y                            q12
533 #define c_0x0001                                 q13
534
535 #define span_shifts                              d24
536 #define y_x4                                     d25
537 #define c_0xFFFE                                 d26
538 #define c_0x0007                                 d27
539
540 #define left_right_x_16_low                      d22
541 #define left_right_x_16_high                     d23
542
543 #define uvrg                                     q14
544 #define uvrg_dy                                  q15
545
546 #define alternate_x_16                           d4
547
548 #define v_clip                                   q3
549 #define v_clip_low                               d6
550
551 #define right_x_32                               q10
552 #define left_x_32                                q11
553 #define alternate_select                         d24
554
555 #define right_x_32_low                           d20
556 #define right_x_32_high                          d21
557 #define left_x_32_low                            d22
558 #define left_x_32_high                           d23
559
560 #define edges_xy                                 q0
561 #define edges_dx_dy                              d2
562 #define edge_shifts                              d3
563 #define edge_shifts_64                           q2
564
565 #define edges_xy_left                            d0
566 #define edges_xy_right                           d1
567
568 #define height_reciprocals                       d6
569 #define heights                                  d7
570
571 #define widths                                   d8
572 #define c_0x01                                   d9
573 #define x_starts                                 d10
574 #define x_ends                                   d11
575
576 #define heights_b                                d12
577 #define edges_dx_dy_64                           q10
578
579 #define edges_dx_dy_64_left                      d20
580 #define edges_dx_dy_64_right                     d21
581
582
// setup_spans_prologue(): common entry sequence for the span setup code.
//
// Pushes r4-r11 and lr, then loads the signed 16-bit x (byte 8) and y
// (byte 10) of each vertex. y_a/y_b/y_c share r1-r3 with v_a/v_b/v_c, so
// each y load overwrites its vertex pointer; all x loads must stay ahead
// of the y loads.
// Also loads uvrg from psx_gpu + psx_gpu_uvrg_offset, uvrg_dy from
// psx_gpu + psx_gpu_uvrg_dy_offset and reciprocal_table_ptr from psx_gpu,
// and sets every 32-bit lane of c_0x01 to 1. Clobbers temp (r14).
// NOTE(review): c_0x01 is d9, a callee-saved register under the AAPCS, and
// nothing is saved for it here - confirm how the matching epilogue, which
// is outside this view, handles d8-d15.
583 #define setup_spans_prologue()                                                 \
584   stmdb sp!, { r4 - r11, lr };                                                 \
585                                                                                \
586   ldrsh x_a, [ v_a, #8 ];                                                      \
587   ldrsh x_b, [ v_b, #8 ];                                                      \
588   ldrsh x_c, [ v_c, #8 ];                                                      \
589   ldrsh y_a, [ v_a, #10 ];                                                     \
590   ldrsh y_b, [ v_b, #10 ];                                                     \
591   ldrsh y_c, [ v_c, #10 ];                                                     \
592                                                                                \
593   add temp, psx_gpu, #psx_gpu_uvrg_offset;                                     \
594   vld1.32 { uvrg }, [ temp ];                                                  \
595   add temp, psx_gpu, #psx_gpu_uvrg_dy_offset;                                  \
596   vld1.32 { uvrg_dy }, [ temp ];                                               \
597   ldr reciprocal_table_ptr, [ psx_gpu, #psx_gpu_reciprocal_table_ptr_offset ]; \
598                                                                                \
599   vmov.u32 c_0x01, #0x01                                                       \
600
// setup_spans_load_b(): load b (r11) and b_dy (r12) from psx_gpu.
// r11 and r12 are also height_reciprocal_alt and height_b_alt, so these
// values do not survive compute_edge_delta_x3.
601 #define setup_spans_load_b()                                                   \
602   ldr b, [ psx_gpu, #psx_gpu_b_offset ];                                       \
603   ldr b_dy, [ psx_gpu, #psx_gpu_b_dy_offset ]                                  \
604
// setup_spans_prologue_b(): set up output pointers, clip edges and
// constants.
//
// Points span_uvrg_offset, span_edge_data and span_b_offset at their
// arrays inside psx_gpu (overwriting r4-r6, i.e. x_a/x_b/x_c and the
// edge_alt_* values). Broadcasts the 16-bit viewport_start_x to all lanes
// of left_edge, and viewport_end_x + 1 to all lanes of right_edge.
// Loads the 16-bit lane constants c_0x0004, c_0x0007 and c_0xFFFE.
// c_0x0001 (q13) is only valid up to the vadd: c_0x0007 and c_0xFFFE are
// the two halves of q13 and overwrite it. Clobbers temp (r14).
605 #define setup_spans_prologue_b()                                               \
606   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
607   add temp, psx_gpu, #psx_gpu_viewport_start_x_offset;                         \
608                                                                                \
609   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
610   vmov.u16 c_0x0004, #0x0004;                                                  \
611                                                                                \
612   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
613   vmov.u16 c_0x0001, #0x0001;                                                  \
614                                                                                \
615   vld1.u16 { left_edge_low[], left_edge_high[] }, [ temp ];                    \
616   add temp, psx_gpu, #psx_gpu_viewport_end_x_offset;                           \
617                                                                                \
618   vld1.u16 { right_edge_low[], right_edge_high[] }, [ temp ];                  \
619   vadd.u16 right_edge, right_edge, c_0x0001;                                   \
620                                                                                \
621   vmov.u16 c_0x0007, #0x0007;                                                  \
622   vmvn.u16 c_0xFFFE, #0x0001                                                   \
623
624
// compute_edge_delta_x2(): start values and per-line deltas for two edges
// that share one height.
//
// In:  height (r9), x_starts, x_ends, reciprocal_table_ptr, c_0x01.
// Out: edge_shifts       = reciprocal_table[height] in both lanes, with
//                          bits 5-7 of each 16-bit half cleared
//      edges_dx_dy       = (x_ends - x_starts) * (table entry >> 10),
//                          low 32 bits per lane
//      edges_xy (64-bit) = (x_starts * height + height - 1)
//                          * (table entry >> 10)
// Also writes heights, widths, height_reciprocals, heights_b and temp.
// NOTE(review): presumably a table entry packs a reciprocal in its upper
// 22 bits and a shift count in its low bits - confirm against the code
// that builds the table.
625 #define compute_edge_delta_x2()                                                \
626   ldr temp, [ reciprocal_table_ptr, height, lsl #2 ];                          \
627                                                                                \
628   vdup.u32 heights, height;                                                    \
629   vsub.u32 widths, x_ends, x_starts;                                           \
630                                                                                \
631   vdup.u32 edge_shifts, temp;                                                  \
632   vsub.u32 heights_b, heights, c_0x01;                                         \
633   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
634                                                                                \
635   vmla.s32 heights_b, x_starts, heights;                                       \
636   vbic.u16 edge_shifts, #0xE0;                                                 \
637   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
638   vmull.s32 edges_xy, heights_b, height_reciprocals                            \
639
// Scalar temporaries of compute_edge_delta_x3. width_alt shares r6 with
// x_c and edge_dx_dy_alt; height_reciprocal_alt and height_b_alt share
// r11/r12 with b and b_dy.
640 #define width_alt                 r6
641 #define height_reciprocal_alt     r11
642 #define height_b_alt              r12
643
// compute_edge_delta_x3(start_c, height_a, height_b): as
// compute_edge_delta_x2, but with a separate height per NEON lane
// (height_a in lane 0, height_b in lane 1) plus a third "alternate" edge
// computed in core registers alongside the NEON work.
//
// Alternate edge (height height_minor_b, running from start_c to x_c):
//   edge_shift_alt              = reciprocal_table[height_minor_b] & 0x1F
//   edge_dx_dy_alt              = (x_c - start_c) * (table entry >> 10)
//   edge_alt_high:edge_alt_low  = (height_minor_b * start_c
//                                  + height_minor_b - 1)
//                                 * (table entry >> 10), signed 64-bit
//
// Overwrites registers that other aliases depend on: r4/r5 (x_a, x_b),
// r6 (x_c), r10 (reciprocal_table_ptr), r11/r12 (b, b_dy) and temp. The
// ARM and NEON instructions are interleaved pairwise on purpose; keep the
// order.
644 #define compute_edge_delta_x3(start_c, height_a, height_b)                     \
645   vmov heights, height_a, height_b;                                            \
646   ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ];                        \
647   vmov.u32 edge_shifts[0], temp;                                               \
648   ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ];                        \
649   vmov.u32 edge_shifts[1], temp;                                               \
650   ldr edge_shift_alt, [ reciprocal_table_ptr, height_minor_b, lsl #2 ];        \
651                                                                                \
652   vsub.u32 widths, x_ends, x_starts;                                           \
653   sub width_alt, x_c, start_c;                                                 \
654                                                                                \
655   vsub.u32 heights_b, heights, c_0x01;                                         \
656   sub height_b_alt, height_minor_b, #1;                                        \
657                                                                                \
658   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
659   lsr height_reciprocal_alt, edge_shift_alt, #10;                              \
660                                                                                \
661   vmla.s32 heights_b, x_starts, heights;                                       \
662   mla height_b_alt, height_minor_b, start_c, height_b_alt;                     \
663                                                                                \
664   vbic.u16 edge_shifts, #0xE0;                                                 \
665   and edge_shift_alt, edge_shift_alt, #0x1F;                                   \
666                                                                                \
667   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
668   mul edge_dx_dy_alt, width_alt, height_reciprocal_alt;                        \
669                                                                                \
670   vmull.s32 edges_xy, heights_b, height_reciprocals;                           \
671   smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt       \
672
673
// setup_spans_adjust_y_up(): subtract c_0x0004 from y_x4.
// NOTE(review): setup_spans_prologue_b fills c_0x0004 with 16-bit lanes
// (vmov.u16), but the subtraction here uses 32-bit lanes, so a borrow out
// of a low halfword would reach the halfword above it - confirm that the
// values held in y_x4 cannot trigger this.
674 #define setup_spans_adjust_y_up()                                              \
675   vsub.u32 y_x4, y_x4, c_0x0004                                                \
676
// setup_spans_adjust_y_down(): add c_0x0004 to y_x4.
// NOTE(review): same 16-bit constant / 32-bit lane mismatch as
// setup_spans_adjust_y_up; here a carry could cross halfwords - confirm.
677 #define setup_spans_adjust_y_down()                                            \
678   vadd.u32 y_x4, y_x4, c_0x0004                                                \
679
// setup_spans_adjust_interpolants_up(): step the interpolants back by one
// y delta: uvrg -= uvrg_dy (four 32-bit lanes) and b -= b_dy.
680 #define setup_spans_adjust_interpolants_up()                                   \
681   vsub.u32 uvrg, uvrg, uvrg_dy;                                                \
682   sub b, b, b_dy                                                               \
683
// setup_spans_adjust_interpolants_down(): step the interpolants forward by
// one y delta: uvrg += uvrg_dy (four 32-bit lanes) and b += b_dy.
684 #define setup_spans_adjust_interpolants_down()                                 \
685   vadd.u32 uvrg, uvrg, uvrg_dy;                                                \
686   add b, b, b_dy                                                               \
687
688
/*
 * Apply "clip" rows worth of interpolant change in a single step:
 *   increment: b += b_dy * clip,  uvrg += uvrg_dy * v_clip
 *   decrement: b -= b_dy * clip,  uvrg -= uvrg_dy * v_clip
 * v_clip is expected to hold clip in every 32-bit lane; setup_spans_clip
 * sets it up with vdup before expanding these macros.
 */
689 #define setup_spans_clip_interpolants_increment()                              \
690   mla b, b_dy, clip, b;                                                        \
691   vmla.s32 uvrg, uvrg_dy, v_clip                                               \
692
693 #define setup_spans_clip_interpolants_decrement()                              \
694   mls b, b_dy, clip, b;                                                        \
695   vmls.s32 uvrg, uvrg_dy, v_clip                                               \
696
/*
 * setup_spans_clip(direction, alternate_active)
 *
 * Advance the edge and interpolant state by "clip" rows in one step.
 *   direction        - increment or decrement: selects
 *                      setup_spans_clip_interpolants_<direction>
 *   alternate_active - yes or no: when yes, the 64-bit alternate edge value
 *                      edge_alt_high:edge_alt_low also has
 *                      edge_dx_dy_alt * clip added (signed multiply
 *                      accumulate); when no, nothing extra is emitted
 * Whatever the direction, edges_xy has edges_dx_dy * clip added with a
 * signed widening multiply-accumulate. v_clip_low is defined elsewhere and
 * is presumably the low half of v_clip (TODO confirm).
 */
697 #define setup_spans_clip_alternate_yes()                                       \
698   smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip                      \
699
700 #define setup_spans_clip_alternate_no()                                        \
701
702 #define setup_spans_clip(direction, alternate_active)                          \
703   vdup.u32 v_clip, clip;                                                       \
704   setup_spans_clip_alternate_##alternate_active();                             \
705   setup_spans_clip_interpolants_##direction();                                 \
706   vmlal.s32 edges_xy, edges_dx_dy, v_clip_low                                  \
707
708
/*
 * setup_spans_adjust_edges_alternate_no(left_index, right_index)
 *
 * Convert the edge positions and per-row deltas into the form consumed by
 * the setup_spans_set_x4_* macros.
 *   left_index / right_index - left or right: token pasted onto edges_xy_
 *                              and edges_dx_dy_64_ to pick the source of
 *                              the left and of the right span edge
 * On exit left_x_low / right_x_low hold the shifted edge position for the
 * current row, left_x_high / right_x_high the position one row later, and
 * both halves of left_dx_dy / right_dx_dy hold twice the shifted per-row
 * delta, so that a single 64-bit vector add moves an edge on by two rows.
 */
709 #define setup_spans_adjust_edges_alternate_no(left_index, right_index)         \
      /* sign-extend the shift counts and the deltas to 64-bit lanes */          \
710   vmovl.s32 edge_shifts_64, edge_shifts;                                       \
711   vmovl.s32 edges_dx_dy_64, edges_dx_dy;                                       \
712                                                                                \
      /* shift positions and deltas by the per-edge shift counts */              \
713   vshl.s64 edges_xy, edges_xy, edge_shifts_64;                                 \
714   vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64;                     \
715                                                                                \
716   vmov left_x_low, edges_xy_##left_index;                                      \
717   vmov right_x_low, edges_xy_##right_index;                                    \
718                                                                                \
      /* the same delta goes into both halves of each delta vector */            \
719   vmov left_dx_dy_low, edges_dx_dy_64_##left_index;                            \
720   vmov left_dx_dy_high, edges_dx_dy_64_##left_index;                           \
721   vmov right_dx_dy_low, edges_dx_dy_64_##right_index;                          \
722   vmov right_dx_dy_high, edges_dx_dy_64_##right_index;                         \
723                                                                                \
      /* high half = position one row after the low half */                      \
724   vadd.u64 left_x_high, left_x_low, left_dx_dy_low;                            \
725   vadd.u64 right_x_high, right_x_low, right_dx_dy_low;                         \
726                                                                                \
      /* double the deltas: each vector add now advances two rows */             \
727   vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy;                                 \
728   vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy                               \
729
730
/*
 * setup_spans_adjust_edges_alternate_yes(left_index, right_index)
 *
 * Does everything setup_spans_adjust_edges_alternate_no does and then
 * prepares the alternate edge, which is kept in core registers up to here:
 *   - y_mid_point gets y_b in every 16-bit lane;
 *   - the 64-bit value edge_alt_high:edge_alt_low is shifted left by
 *     edge_shift_alt and moved to alternate_x_low;
 *   - edge_dx_dy_alt is widened to 64 bits, shifted left by the same amount
 *     and copied to both halves of alternate_dx_dy;
 *   - alternate_x_high becomes the position one row later and
 *     alternate_dx_dy is doubled, mirroring the left/right edge layout.
 * temp is clobbered (it holds 32 - edge_shift_alt).
 */
731 #define setup_spans_adjust_edges_alternate_yes(left_index, right_index)        \
732   setup_spans_adjust_edges_alternate_no(left_index, right_index);              \
733                                                                                \
734   vdup.u16 y_mid_point, y_b;                                                   \
735   rsb temp, edge_shift_alt, #32;                                               \
736                                                                                \
      /* 64-bit left shift of high:low built from 32-bit shifts */               \
737   lsl edge_alt_high, edge_alt_high, edge_shift_alt;                            \
738   orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp;                    \
739   lsl edge_alt_low, edge_alt_low, edge_shift_alt;                              \
740   vmov alternate_x_low, edge_alt_low, edge_alt_high;                           \
741                                                                                \
      /* upper word = sign-extended bits shifted out of the lower word */        \
742   asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp;                               \
743   lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt;                      \
744   vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high;           \
745   vmov alternate_dx_dy_high, alternate_dx_dy_low;                              \
746                                                                                \
747   vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low;             \
748   vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy                   \
749
750
/*
 * setup_spans_y_select_<direction> builds a per-row mask in
 * alternate_select from a signed 16-bit compare of the row numbers in y_x4
 * against y_mid_point:
 *   up:   mask set where y < y_mid_point
 *   down: mask set where y > y_mid_point
 */
751 #define setup_spans_y_select_up()                                              \
752   vclt.s16 alternate_select, y_x4, y_mid_point                                 \
753
754 #define setup_spans_y_select_down()                                            \
755   vcgt.s16 alternate_select, y_x4, y_mid_point                                 \
756
757
/*
 * setup_spans_alternate_select_<side> copies alternate_x_16 into the rows
 * selected by alternate_select (bitwise insert if true), replacing the left
 * edge x (low half of left_right_x_16) or the right edge x (high half).
 */
758 #define setup_spans_alternate_select_left()                                    \
759   vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select               \
760
761 #define setup_spans_alternate_select_right()                                   \
762   vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select              \
763
764
/*
 * setup_spans_set_x4_alternate_yes(alternate, direction)
 *
 * Emit four rows of span data for a shape that has an alternate edge.
 *   alternate - left or right: the side whose x is replaced by the
 *               alternate edge on the rows picked by
 *               setup_spans_y_select_<direction>
 *   direction - up or down: direction in which y and the interpolants are
 *               stepped
 * Per expansion:
 *   - the upper 32 bits of each 64-bit edge position are taken as x for two
 *     rows, the edges are stepped by their (doubled) deltas, and the same
 *     is done for the next two rows; the edges are then stepped once more
 *     ready for the next expansion;
 *   - x is narrowed to 16 bits, the alternate x is merged in and both edges
 *     are clamped to the range left_edge .. right_edge (signed);
 *   - uvrg and b are stored four times, post-incrementing span_uvrg_offset
 *     and span_b_offset, with the interpolants stepped after every store;
 *   - with width = right x - left x, the high half of left_right_x_16
 *     becomes (width + 7) >> 3 and span_shifts becomes
 *     c_0xFFFE << ((width + 7) & 7);
 *   - the results are written interleaved to span_edge_data with vst4.u16,
 *     presumably as left x, block count, right mask and y per row to match
 *     the edge_data_*_offset layout (TODO confirm against the definition of
 *     span_shifts_y, which is outside this section);
 *   - y_x4 moves on by four rows.
 * The NEON arithmetic is interleaved with the stores, presumably for
 * scheduling reasons; the instruction order is left untouched.
 */
765 #define setup_spans_set_x4_alternate_yes(alternate, direction)                 \
      /* rows 0 and 1: integer part of the 64-bit edge positions */              \
766   vshrn.s64 alternate_x_32_low, alternate_x, #32;                              \
767   vshrn.s64 left_x_32_low, left_x, #32;                                        \
768   vshrn.s64 right_x_32_low, right_x, #32;                                      \
769                                                                                \
770   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
771   vadd.u64 left_x, left_x, left_dx_dy;                                         \
772   vadd.u64 right_x, right_x, right_dx_dy;                                      \
773                                                                                \
      /* rows 2 and 3 */                                                         \
774   vshrn.s64 alternate_x_32_high, alternate_x, #32;                             \
775   vshrn.s64 left_x_32_high, left_x, #32;                                       \
776   vshrn.s64 right_x_32_high, right_x, #32;                                     \
777                                                                                \
778   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
779   vadd.u64 left_x, left_x, left_dx_dy;                                         \
780   vadd.u64 right_x, right_x, right_dx_dy;                                      \
781                                                                                \
782   vmovn.u32 alternate_x_16, alternate_x_32;                                    \
783   setup_spans_y_select_##direction();                                          \
784   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
785                                                                                \
786   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
787   setup_spans_alternate_select_##alternate();                                  \
788                                                                                \
789   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
790   str b, [ span_b_offset ], #4;                                                \
791   setup_spans_adjust_interpolants_##direction();                               \
792                                                                                \
      /* clamp both edges from below */                                          \
793   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
794                                                                                \
795   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
796   str b, [ span_b_offset ], #4;                                                \
797   setup_spans_adjust_interpolants_##direction();                               \
798                                                                                \
      /* clamp both edges from above */                                          \
799   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
800                                                                                \
801   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
802   str b, [ span_b_offset ], #4;                                                \
803   setup_spans_adjust_interpolants_##direction();                               \
804                                                                                \
      /* width + 7; its low three bits select the shift applied to 0xFFFE */     \
805   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
806   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
807   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
808                                                                                \
809   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
810   str b, [ span_b_offset ], #4;                                                \
811   setup_spans_adjust_interpolants_##direction();                               \
812                                                                                \
813   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
814   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
815                                                                                \
816   vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!;            \
817                                                                                \
818   setup_spans_adjust_y_##direction()                                           \
819
820
/*
 * setup_spans_set_x4_alternate_no(alternate, direction)
 *
 * Emit four rows of span data for a shape without an alternate edge.
 *   alternate - accepted for symmetry with the _yes variant; not used in
 *               the body
 *   direction - up or down: direction in which y and the interpolants are
 *               stepped
 * Per expansion:
 *   - the upper 32 bits of each 64-bit edge position are taken as x for two
 *     rows, the edges are stepped by their (doubled) deltas, and the same
 *     is done for the next two rows; the edges are then stepped once more;
 *   - x is narrowed to 16 bits and both edges are clamped to the range
 *     left_edge .. right_edge (signed);
 *   - uvrg and b are stored four times, post-incrementing span_uvrg_offset
 *     and span_b_offset, with the interpolants stepped after every store;
 *   - with width = right x - left x, the high half of left_right_x_16
 *     becomes (width + 7) >> 3 and span_shifts becomes
 *     c_0xFFFE << ((width + 7) & 7);
 *   - the results are written interleaved to span_edge_data with vst4.u16
 *     (register list contents depend on span_shifts_y, defined elsewhere);
 *   - y_x4 moves on by four rows.
 */
821 #define setup_spans_set_x4_alternate_no(alternate, direction)                  \
      /* rows 0 and 1: integer part of the 64-bit edge positions */              \
822   vshrn.s64 left_x_32_low, left_x, #32;                                        \
823   vshrn.s64 right_x_32_low, right_x, #32;                                      \
824                                                                                \
825   vadd.u64 left_x, left_x, left_dx_dy;                                         \
826   vadd.u64 right_x, right_x, right_dx_dy;                                      \
827                                                                                \
      /* rows 2 and 3 */                                                         \
828   vshrn.s64 left_x_32_high, left_x, #32;                                       \
829   vshrn.s64 right_x_32_high, right_x, #32;                                     \
830                                                                                \
831   vadd.u64 left_x, left_x, left_dx_dy;                                         \
832   vadd.u64 right_x, right_x, right_dx_dy;                                      \
833                                                                                \
834   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
835   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
836                                                                                \
837   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
838   str b, [ span_b_offset ], #4;                                                \
839   setup_spans_adjust_interpolants_##direction();                               \
840                                                                                \
      /* clamp both edges from below */                                          \
841   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
842                                                                                \
843   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
844   str b, [ span_b_offset ], #4;                                                \
845   setup_spans_adjust_interpolants_##direction();                               \
846                                                                                \
      /* clamp both edges from above */                                          \
847   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
848                                                                                \
849   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
850   str b, [ span_b_offset ], #4;                                                \
851   setup_spans_adjust_interpolants_##direction();                               \
852                                                                                \
      /* width + 7; its low three bits select the shift applied to 0xFFFE */     \
853   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
854   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
855   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
856                                                                                \
857   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
858   str b, [ span_b_offset ], #4;                                                \
859   setup_spans_adjust_interpolants_##direction();                               \
860                                                                                \
861   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
862   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
863                                                                                \
864   vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!;            \
865                                                                                \
866   setup_spans_adjust_y_##direction()                                           \
867
868
/* Scratch pair for the 64-bit product computed below. */
869 #define edge_adjust_low           r11
870 #define edge_adjust_high          r12
871
/*
 * setup_spans_alternate_adjust_yes subtracts the signed 64-bit product
 * edge_dx_dy_alt * height_minor_a from the 64-bit alternate edge value
 * edge_alt_high:edge_alt_low. It clobbers r11, r12 and the condition flags.
 * The _no variant expands to nothing.
 */
872 #define setup_spans_alternate_adjust_yes()                                     \
873   smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a;     \
874   subs edge_alt_low, edge_alt_low, edge_adjust_low;                            \
875   sbc edge_alt_high, edge_alt_high, edge_adjust_high                           \
876
877 #define setup_spans_alternate_adjust_no()                                      \
878
879
/*
 * setup_spans_down(left_index, right_index, alternate, alternate_active)
 *
 * Generate span data while walking from row y_a towards increasing y.
 *   left_index / right_index - left or right: passed on to
 *                              setup_spans_adjust_edges_alternate_* to pick
 *                              the left and the right span edge
 *   alternate                - left, right or none: side that takes its x
 *                              from the alternate edge (only used when
 *                              alternate_active is yes)
 *   alternate_active         - yes or no: selects the _yes or _no variant
 *                              of the helper macros
 * Outline:
 *   1. Rewind the alternate edge (alternate_active only) and expand
 *      setup_spans_load_b (defined elsewhere).
 *   2. If y_c > viewport_end_y, height loses (y_c - viewport_end_y) and
 *      gains 1. y_c is overwritten by this step.
 *   3. If y_a < viewport_start_y, the rows in between are skipped with
 *      setup_spans_clip and y_a is moved to viewport_start_y.
 *   4. Nothing is emitted when height <= 0.
 *   5. Rows y_a .. y_a + 3 are packed into y_x4 as 16-bit values, the edges
 *      are prepared, height is stored as the 16-bit span count and four
 *      rows are emitted per loop iteration, so up to three rows more than
 *      height may be written.
 * Local labels 0, 1 and 2 are defined by the expansion.
 */
880 #define setup_spans_down(left_index, right_index, alternate, alternate_active) \
881   setup_spans_alternate_adjust_##alternate_active();                           \
882   setup_spans_load_b();                                                        \
883                                                                                \
884   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ];                     \
885   subs y_c, y_c, temp;                                                         \
886   subgt height, height, y_c;                                                   \
887   addgt height, height, #1;                                                    \
888                                                                                \
889   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ];                   \
890   subs clip, temp, y_a;                                                        \
891   ble 0f;                                                                      \
892                                                                                \
893   sub height, height, clip;                                                    \
894   add y_a, y_a, clip;                                                          \
895   setup_spans_clip(increment, alternate_active);                               \
896                                                                                \
897  0:                                                                            \
898   cmp height, #0;                                                              \
899   ble 1f;                                                                      \
900                                                                                \
      /* temp = y | (y + 1) << 16, y_a = (y + 2) | (y + 3) << 16 */              \
901   orr temp, y_a, y_a, lsl #16;                                                 \
902   add temp, temp, #(1 << 16);                                                  \
903   add y_a, temp, #2;                                                           \
904   add y_a, y_a, #(2 << 16);                                                    \
905   vmov y_x4, temp, y_a;                                                        \
906                                                                                \
907   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
908    right_index);                                                               \
909   setup_spans_prologue_b();                                                    \
910                                                                                \
911   strh height, [ psx_gpu, #psx_gpu_num_spans_offset ];                         \
912                                                                                \
913  2:                                                                            \
914   setup_spans_set_x4_alternate_##alternate_active(alternate, down);            \
915   subs height, height, #4;                                                     \
      /* loop only while more than four rows were left before the subtract */    \
916   bhi 2b;                                                                      \
917                                                                                \
918  1:                                                                            \
919
920
/*
 * setup_spans_alternate_pre_increment_yes adds edge_dx_dy_alt, sign-extended
 * to 64 bits (the upper word is edge_dx_dy_alt asr 31), to the 64-bit
 * alternate edge value edge_alt_high:edge_alt_low. Flags are clobbered.
 * The _no variant expands to nothing.
 */
921 #define setup_spans_alternate_pre_increment_yes()                              \
922   adds edge_alt_low, edge_alt_low, edge_dx_dy_alt;                             \
923   adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31                    \
924
925 #define setup_spans_alternate_pre_increment_no()                               \
926
927
/*
 * setup_spans_up_decrement_yes subtracts 1 from height when the LE
 * condition holds. It does not set the flags itself: it relies on the
 * flag-setting instruction that precedes its expansion (in setup_spans_up
 * that is the subs of viewport_start_y - y_c). The _no variant expands to
 * nothing.
 */
928 #define setup_spans_up_decrement_yes()                                         \
929   suble height, height, #1                                                     \
930
931 #define setup_spans_up_decrement_no()                                          \
932
933
/*
 * setup_spans_up(left_index, right_index, alternate, alternate_active)
 *
 * Generate span data while walking from row y_a - 1 towards decreasing y.
 *   left_index / right_index - left or right: passed on to
 *                              setup_spans_adjust_edges_alternate_* to pick
 *                              the left and the right span edge
 *   alternate                - left, right or none: side that takes its x
 *                              from the alternate edge (only used when
 *                              alternate_active is yes)
 *   alternate_active         - yes or no: selects the _yes or _no variant
 *                              of the helper macros
 * Outline:
 *   1. Rewind the alternate edge (alternate_active only), expand
 *      setup_spans_load_b (defined elsewhere) and step y_a back by one.
 *   2. If viewport_start_y > y_c, height loses the difference; otherwise,
 *      with alternate_active, height loses 1.
 *   3. If y_a > viewport_end_y, the rows in between are skipped with
 *      setup_spans_clip and y_a is moved to viewport_end_y.
 *   4. Nothing is emitted when height <= 0.
 *   5. Rows y_a .. y_a - 3 are packed into y_x4 as 16-bit values, the
 *      edges and interpolants are stepped by one row, height is stored as
 *      the 16-bit span count and four rows are emitted per loop iteration,
 *      so up to three rows more than height may be written.
 * The viewport bounds are loaded sign-extended (ldrsh), as setup_spans_down
 * does for the same two fields, because they feed signed comparisons; a
 * zero-extending load would turn a negative bound into a large positive
 * value. Local labels 0, 1 and 2 are defined by the expansion.
 */
934 #define setup_spans_up(left_index, right_index, alternate, alternate_active)   \
935   setup_spans_alternate_adjust_##alternate_active();                           \
936   setup_spans_load_b();                                                        \
937   sub y_a, y_a, #1;                                                            \
938                                                                                \
939   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ];                   \
940   subs temp, temp, y_c;                                                        \
941   subgt height, height, temp;                                                  \
      /* the _yes variant is a suble that still uses the flags of the subs */    \
942   setup_spans_up_decrement_##alternate_active();                               \
943                                                                                \
944   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ];                     \
945   subs clip, y_a, temp;                                                        \
946   ble 0f;                                                                      \
947                                                                                \
948   sub height, height, clip;                                                    \
949   sub y_a, y_a, clip;                                                          \
950   setup_spans_clip(decrement, alternate_active);                               \
951                                                                                \
952  0:                                                                            \
953   cmp height, #0;                                                              \
954   ble 1f;                                                                      \
955                                                                                \
      /* temp = y | (y - 1) << 16, y_a = (y - 2) | (y - 3) << 16 */              \
956   orr temp, y_a, y_a, lsl #16;                                                 \
957   sub temp, temp, #(1 << 16);                                                  \
958   sub y_a, temp, #2;                                                           \
959   sub y_a, y_a, #(2 << 16);                                                    \
960   vmov y_x4, temp, y_a;                                                        \
961                                                                                \
      /* step both edges by one row delta before the first row is emitted */     \
962   vaddw.s32 edges_xy, edges_xy, edges_dx_dy;                                   \
963                                                                                \
964   setup_spans_alternate_pre_increment_##alternate_active();                    \
965   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
966    right_index);                                                               \
967   setup_spans_adjust_interpolants_up();                                        \
968   setup_spans_prologue_b();                                                    \
969                                                                                \
970   strh height, [ psx_gpu, #psx_gpu_num_spans_offset ];                         \
971                                                                                \
972  2:                                                                            \
973   setup_spans_set_x4_alternate_##alternate_active(alternate, up);              \
974   subs height, height, #4;                                                     \
      /* loop only while more than four rows were left before the subtract */    \
975   bhi 2b;                                                                      \
976                                                                                \
977  1:                                                                            \
978
979
/*
 * Common function exit: pops r4-r11 and loads pc from the stack, which
 * returns to the caller. This presumably pairs with setup_spans_prologue
 * pushing r4-r11 and lr (defined outside this section - TODO confirm).
 */
980 #define setup_spans_epilogue()                                                 \
981   ldmia sp!, { r4 - r11, pc }                                                  \
982
983
/*
 * setup_spans_up_up(minor, major)
 *
 * Complete function body for a shape walked upwards with an alternate
 * edge: y_a is the starting vertex, y_b the mid point and y_c the far end
 * (height_minor_a = y_a - y_b, height_minor_b = y_b - y_c,
 * height = y_a - y_c).
 *   minor / major - left or right. They are forwarded to setup_spans_up as
 *                   (left_index = major, right_index = minor,
 *                   alternate = minor), so the side named by minor is the
 *                   one that switches to the alternate edge.
 * Both edge start positions are x_a; the edge end positions are x_c and
 * x_b. compute_edge_delta_x3 is defined outside this section.
 */
984 #define setup_spans_up_up(minor, major)                                        \
985   setup_spans_prologue();                                                      \
986   sub height_minor_a, y_a, y_b;                                                \
987   sub height_minor_b, y_b, y_c;                                                \
988   sub height, y_a, y_c;                                                        \
989                                                                                \
990   vdup.u32 x_starts, x_a;                                                      \
991   vmov x_ends, x_c, x_b;                                                       \
992                                                                                \
993   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
994   setup_spans_up(major, minor, minor, yes);                                    \
995   setup_spans_epilogue()                                                       \
996
/* Upward walk with the alternate edge on the left side. */
997 function(setup_spans_up_left)
998   setup_spans_up_up(left, right)
999
/* Upward walk with the alternate edge on the right side. */
1000 function(setup_spans_up_right)
1001   setup_spans_up_up(right, left)
1002
/*
 * setup_spans_down_down(minor, major)
 *
 * Complete function body for a shape walked downwards with an alternate
 * edge: y_a is the starting vertex, y_b the mid point and y_c the far end
 * (height_minor_a = y_b - y_a, height_minor_b = y_c - y_b,
 * height = y_c - y_a).
 *   minor / major - left or right. They are forwarded to setup_spans_down
 *                   as (left_index = major, right_index = minor,
 *                   alternate = minor), so the side named by minor is the
 *                   one that switches to the alternate edge.
 * Both edge start positions are x_a; the edge end positions are x_c and
 * x_b. compute_edge_delta_x3 is defined outside this section.
 */
1003 #define setup_spans_down_down(minor, major)                                    \
1004   setup_spans_prologue();                                                      \
1005   sub height_minor_a, y_b, y_a;                                                \
1006   sub height_minor_b, y_c, y_b;                                                \
1007   sub height, y_c, y_a;                                                        \
1008                                                                                \
1009   vdup.u32 x_starts, x_a;                                                      \
1010   vmov x_ends, x_c, x_b;                                                       \
1011                                                                                \
1012   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1013   setup_spans_down(major, minor, minor, yes);                                  \
1014   setup_spans_epilogue()                                                       \
1015
/* Downward walk with the alternate edge on the left side. */
1016 function(setup_spans_down_left)
1017   setup_spans_down_down(left, right)
1018
/* Downward walk with the alternate edge on the right side. */
1019 function(setup_spans_down_right)
1020   setup_spans_down_down(right, left)
1021
1022
/*
 * setup_spans_up_flat()
 *
 * Shared tail of setup_spans_up_a and setup_spans_up_b: an upward walk over
 * height = y_a - y_c rows with two edges and no alternate edge. The caller
 * must already have expanded setup_spans_prologue and loaded x_starts and
 * x_ends. compute_edge_delta_x2 is defined outside this section.
 */
1023 #define setup_spans_up_flat()                                                  \
1024   sub height, y_a, y_c;                                                        \
1025                                                                                \
1026   compute_edge_delta_x2();                                                     \
1027   setup_spans_up(left, right, none, no);                                       \
1028   setup_spans_epilogue()                                                       \
1029
/* Upward walk: the two edges start at x_a and x_b and both end at x_c. */
1030 function(setup_spans_up_a)
1031   setup_spans_prologue()
1032
1033   vmov x_starts, x_a, x_b
1034   vdup.u32 x_ends, x_c
1035
1036   setup_spans_up_flat()
1037
/* Upward walk: both edges start at x_a and end at x_b and x_c. */
1038 function(setup_spans_up_b)
1039   setup_spans_prologue()
1040
1041   vdup.u32 x_starts, x_a
1042   vmov x_ends, x_b, x_c
1043
1044   setup_spans_up_flat()
1045
/*
 * setup_spans_down_flat()
 *
 * Shared tail of setup_spans_down_a and setup_spans_down_b: a downward walk
 * over height = y_c - y_a rows with two edges and no alternate edge. The
 * caller must already have expanded setup_spans_prologue and loaded
 * x_starts and x_ends. compute_edge_delta_x2 is defined outside this
 * section.
 */
1046 #define setup_spans_down_flat()                                                \
1047   sub height, y_c, y_a;                                                        \
1048                                                                                \
1049   compute_edge_delta_x2();                                                     \
1050   setup_spans_down(left, right, none, no);                                     \
1051   setup_spans_epilogue()                                                       \
1052
/* Downward walk: the two edges start at x_a and x_b and both end at x_c. */
1053 function(setup_spans_down_a)
1054   setup_spans_prologue()
1055
1056   vmov x_starts, x_a, x_b
1057   vdup.u32 x_ends, x_c
1058
1059   setup_spans_down_flat()
1060
/* Downward walk: both edges start at x_a and end at x_b and x_c. */
1061 function(setup_spans_down_b)
1062   setup_spans_prologue()
1063
1064   vdup.u32 x_starts, x_a
1065   vmov x_ends, x_b, x_c
1066
1067   setup_spans_down_flat()
1068
1069
/* Core register that keeps the original y_a across the first pass of
 * setup_spans_up_down. */
1070 #define middle_y                                          r9
1071
/*
 * NEON registers holding the second edge set ("set b") of
 * setup_spans_up_down. q11 is the pair d22:d23 and q13 is the pair d26:d27,
 * so the d-register names below address the halves of the q-register names.
 */
1072 #define edges_xy_b                                        q11
1073 #define edges_dx_dy_b                                     d26
1074 #define edge_shifts_b                                     d27
1075 #define edges_dx_dy_and_shifts_b                          q13
1076 #define height_increment                                  d20
1077
/* Presumably the q register overlaying edges_dx_dy and edge_shifts, which
 * are defined outside this section (TODO confirm). */
1078 #define edges_dx_dy_and_shifts                            q1
1079
1080 #define edges_xy_b_left                                   d22
1081 #define edges_xy_b_right                                  d23
1082
/* Copy edge set b into the working registers edges_xy and
 * edges_dx_dy_and_shifts. */
1083 #define setup_spans_up_down_load_edge_set_b()                                  \
1084   vmov edges_xy, edges_xy_b;                                                   \
1085   vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b                        \
1086
1087
/*
 * setup_spans_up_down
 *
 * Span setup for a shape that is walked in two passes from row y_a: first
 * upwards over height_minor_a = y_a - y_b rows, then downwards over
 * height_minor_b = y_c - y_a rows. Two edge sets are prepared up front; the
 * second one ("set b") is parked in edges_xy_b / edges_dx_dy_and_shifts_b
 * and loaded once the first pass has taken what it needs.
 *
 * The span count stored in the psx_gpu structure is the sum of the rows of
 * both passes. Each pass emits rows in groups of four, so the first pass
 * rewinds the three output pointers by its surplus rows before the second
 * pass starts.
 *
 * All viewport bounds are loaded sign-extended (ldrsh), as setup_spans_down
 * does for the same fields, because they feed signed comparisons; a
 * zero-extending load would turn a negative bound into a large positive
 * value.
 *
 * Local labels: 0 = after clipping, 2 = emit loop, 3 = first pass empty,
 * 4 = start of second pass, 5 = span count reached MAX_SPANS, 1 = exit.
 */
1088 function(setup_spans_up_down)
1089   setup_spans_prologue()
1090
1091   // s32 middle_y = y_a;
1092   sub height_minor_a, y_a, y_b
1093   sub height_minor_b, y_c, y_a
1094   sub height_major, y_c, y_b
1095
1096   vmov x_starts, x_a, x_c
1097   vdup.u32 x_ends, x_b
1098
1099   compute_edge_delta_x3(x_a, height_minor_a, height_major)
1100
  // advance only the second lane of edges_xy, by height_minor_b rows
1101   mov temp, #0
1102   vmov height_increment, temp, height_minor_b
1103   vmlal.s32 edges_xy, edges_dx_dy, height_increment
1104
  // build edge set b: the alternate edge on the left, the current right
  // edge on the right, with negated deltas except for the alternate lane
1105   vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1106   vmov edges_xy_b_right, edges_xy_right
1107
1108   vmov edge_shifts_b, edge_shifts
1109   vmov.u32 edge_shifts_b[0], edge_shift_alt
1110
1111   vneg.s32 edges_dx_dy_b, edges_dx_dy
1112   vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1113
1114   mov middle_y, y_a
1115   
  // ---- first pass: upwards from y_a - 1 ----
1116   setup_spans_load_b()
1117   sub y_a, y_a, #1
1118
1119   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1120   subs temp, temp, y_b
1121   subgt height_minor_a, height_minor_a, temp
1122
1123   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1124   subs clip, y_a, temp
1125   ble 0f
1126
1127   sub height_minor_a, height_minor_a, clip
1128   sub y_a, y_a, clip
1129   setup_spans_clip(decrement, no)
1130
1131  0:                                                                
1132   cmp height_minor_a, #0
1133   ble 3f
1134
  // temp = y | (y - 1) << 16, y_a = (y - 2) | (y - 3) << 16
1135   orr temp, y_a, y_a, lsl #16
1136   sub temp, temp, #(1 << 16)
1137   sub y_a, temp, #2
1138   sub y_a, y_a, #(2 << 16)
1139   vmov y_x4, temp, y_a
1140
1141   vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1142
1143   strh height_minor_a, [ psx_gpu, #psx_gpu_num_spans_offset ]
1144
  // left_x / right_x are derived here, so the working edge registers can be
  // overwritten with set b straight afterwards
1145   setup_spans_adjust_edges_alternate_no(left, right); 
1146   setup_spans_adjust_interpolants_up()
1147   setup_spans_up_down_load_edge_set_b()
1148
1149   setup_spans_prologue_b()
1150
1151
1152  2: 
1153   setup_spans_set_x4_alternate_no(none, up)
1154   subs height_minor_a, height_minor_a, #4
1155   bhi 2b
1156
  // height_minor_a is now zero or negative: step the output pointers back
  // over the surplus rows (8, 16 and 4 bytes per row respectively)
1157   add span_edge_data, span_edge_data, height_minor_a, lsl #3
1158   add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1159   add span_b_offset, span_b_offset, height_minor_a, lsl #2
1160
  // ---- second pass: downwards from the original y_a ----
1161  4:
1162   add temp, psx_gpu, #psx_gpu_uvrg_offset
1163   vld1.32 { uvrg }, [ temp ]
1164   mov y_a, middle_y
1165   
1166   setup_spans_load_b()
1167
1168   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1169   subs y_c, y_c, temp
1170   subgt height_minor_b, height_minor_b, y_c
1171   addgt height_minor_b, height_minor_b, #1
1172
1173   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1174   subs clip, temp, y_a
1175   ble 0f
1176
1177   sub height_minor_b, height_minor_b, clip
1178   add y_a, y_a, clip
1179   setup_spans_clip(increment, no)
1180
1181  0:
1182   cmp height_minor_b, #0
1183   ble 1f
1184
  // temp = y | (y + 1) << 16, y_a = (y + 2) | (y + 3) << 16
1185   orr temp, y_a, y_a, lsl #16
1186   add temp, temp, #(1 << 16) 
1187   add y_a, temp, #2
1188   add y_a, y_a, #(2 << 16)
1189   vmov y_x4, temp, y_a
1190
1191   setup_spans_adjust_edges_alternate_no(left, right)
1192
  // the span count is an unsigned 16-bit field, so ldrh is correct here
1193   ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1194   add temp, temp, height_minor_b
1195
1196   cmp temp, #MAX_SPANS
1197   beq 5f
1198
1199   strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1200
1201  2:                                                     
1202   setup_spans_set_x4_alternate_no(none, down)
1203   subs height_minor_b, height_minor_b, #4
1204   bhi 2b
1205
1206  1:
1207   setup_spans_epilogue()
1208
  // first pass had no rows: still load set b and run prologue_b before the
  // second pass
1209  3:
1210   setup_spans_up_down_load_edge_set_b()
1211   setup_spans_prologue_b()
1212   bal 4b
1213
1214  5:
1215   // FIXME: overflow corner case
  // The total is exactly MAX_SPANS: round height_minor_b down to a multiple
  // of four and store the reduced total, presumably so that the groups of
  // four do not run past the span buffers. The bne below still tests the
  // flags set by bics; the add and strh in between leave them alone.
1216   sub temp, temp, height_minor_b
1217   bics height_minor_b, #3
1218   add temp, temp, height_minor_b
1219   strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1220   bne 2b
1221   bal 1b
1222
/* Drop the aliases that are given different registers below. */
1223 #undef span_uvrg_offset
1224 #undef span_edge_data
1225 #undef span_b_offset
1226 #undef left_x
1227 #undef b
1228
/* Core register aliases for the code that follows. */
1229 #define psx_gpu                                  r0
1230 #define num_spans                                r1
1231 #define span_uvrg_offset                         r2
1232 #define span_edge_data                           r3
1233 #define span_b_offset                            r4
1234 #define b_dx                                     r5
1235 #define span_num_blocks                          r6
1236 #define y                                        r7
1237 #define left_x                                   r8
1238 #define b                                        r9
1239 #define dither_offset_ptr                        r10
1240 #define block_ptr_a                              r11
1241 #define fb_ptr                                   r12
1242 #define num_blocks                               r14
1243
/*
 * The groups below give further names to registers already named above
 * (r2, r3, r7, r8, r9 and r10 each carry several names), so names that
 * share a register cannot hold live values at the same time.
 */
1244 #define uvrg_dx_ptr                              r2
1245 #define texture_mask_ptr                         r3
1246 #define dither_shift                             r8
1247 #define dither_row                               r10
1248
1249 #define c_32                                     r7
1250 #define b_dx4                                    r8
1251 #define b_dx8                                    r9
1252 #define block_ptr_b                              r10
1253
1254 #define block_span_ptr                           r10
1255 #define right_mask                               r8
1256
1257 #define color                                    r2
1258 #define color_r                                  r3
1259 #define color_g                                  r4
1260 #define color_b                                  r5
1261
1262 #undef uvrg
1263
1264 #define u_block                                  q0
1265 #define v_block                                  q1
1266 #define r_block                                  q2
1267 #define g_block                                  q3
1268 #define b_block                                  q4
1269
1270 #define uv_dx4                                   d10
1271 #define rg_dx4                                   d11
1272 #define uv_dx8                                   d12
1273 #define rg_dx8                                   d13
1274 #define b_whole_8                                d14
1275 #define fb_mask_ptrs                             d15
1276
1277 #define uvrg_dx4                                 q5
1278 #define uvrg_dx8                                 q6
1279 #define uv_dx8                                   d12
1280 #define rg_dx8                                   d13
1281
1282 #define u_whole                                  q8
1283 #define v_whole                                  q9
1284 #define r_whole                                  q10
1285 #define g_whole                                  q11
1286 #define b_whole                                  q12
1287
1288 #define u_whole_low                              d16
1289 #define u_whole_high                             d17
1290 #define v_whole_low                              d18
1291 #define v_whole_high                             d19
1292 #define r_whole_low                              d20
1293 #define r_whole_high                             d21
1294 #define g_whole_low                              d22
1295 #define g_whole_high                             d23
1296 #define b_whole_low                              d24
1297 #define b_whole_high                             d25
1298
1299 #define dx4                                      q13
1300 #define dx8                                      q13
1301
1302 #define u_whole_8                                d26
1303 #define v_whole_8                                d27
1304 #define u_whole_8b                               d24
1305 #define r_whole_8                                d24
1306 #define g_whole_8                                d25
1307
1308 #define uv_whole_8                               q13
1309 #define uv_whole_8b                              q14
1310
1311 #define dither_offsets                           q14
1312 #define texture_mask                             q15
1313 #define texture_mask_u                           d30
1314 #define texture_mask_v                           d31
1315
1316 #define dither_offsets_short                     d28
1317
1318 #define v_left_x                                 q8
1319 #define uvrg                                     q9
1320 #define block_span                               q10
1321
1322 #define uv                                       d18
1323 #define rg                                       d19
1324
1325 #define draw_mask                                q1
1326 #define draw_mask_edge                           q13
1327 #define test_mask                                q0
1328
1329 #define uvrg_dx                                  q3
1330
1331 #define colors                                   q2
1332
/*
 * Texture coordinate fix-up for the "swizzled" builder variant.
 *
 * Per byte lane, with u and v being the values on entry:
 *   u_whole_8b = u & texture_mask_u          (copy kept in d24)
 *   u_whole_8  = (v << 4) | (u & 0x0f)       (vsli keeps u's low nibble)
 *   v_whole_8  = (v & 0xf0) | (u_whole_8b >> 4)
 * i.e. the high nibble of u and the low nibble of v trade places.
 *
 * Clobbers u_whole_8b, which is d24 - the same register as r_whole_8 and
 * b_whole_low - so it must run after b_whole has been narrowed and before
 * r_whole_8 is written.
 */
1333 #define setup_blocks_texture_swizzled()                                        \
1334   vand.u8 u_whole_8b, u_whole_8, texture_mask_u;                               \
1335   vsli.u8 u_whole_8, v_whole_8, #4;                                            \
1336   vsri.u8 v_whole_8, u_whole_8b, #4                                            \
1337
/*
 * "Unswizzled" counterpart of setup_blocks_texture_swizzled(): expands to
 * nothing, so u_whole_8 / v_whole_8 are left exactly as the builder
 * computed them.
 */
1338 #define setup_blocks_texture_unswizzled()                                      \
1339
1340
/*
 * setup_blocks_shaded_textured_builder(swizzling)
 *
 * Emits setup_blocks_shaded_textured_dithered_<swizzling>_indirect.
 *   In:  r0 = psx_gpu
 *   Out: nothing; returns at once when the num_spans field is 0.
 *
 * For each span the routine reads left_x, num_blocks, right_mask and y from
 * the span's edge data, starts u/v/r/g at the span's uvrg offset plus
 * uvrg_dx * left_x and b at the span's b offset plus b_dx * left_x, adds the
 * five per-lane block_span vectors stored from psx_gpu_u_block_span_offset
 * onward, and then writes one 64-byte record per block into the blocks
 * array:
 *   +0   u and v bytes, interleaved
 *   +16  r bytes, then g bytes
 *   +32  b bytes, then fb_mask_ptrs (lane 0: 0, or right_mask on the last
 *        block of the span; lane 1: fb_ptr of the block)
 *   +48  dither_offsets
 * Each *_block register holds four 16.16-style accumulators: the low half
 * of a *_whole register is the accumulator >> 16, the high half is
 * (accumulator + dx4) >> 16, and the accumulator then advances by dx8.
 * fb_ptr advances by 16 bytes per block.
 *
 * If adding a span would push num_blocks past MAX_BLOCKS, label 2 calls
 * flush_render_block_buffer and restarts at the head of the blocks array.
 *
 * ABI notes: q4 - q7 (d8 - d15) are callee-saved under the AAPCS and are
 * used here as b_block, uvrg_dx4, uvrg_dx8, b_whole_8 and fb_mask_ptrs, so
 * they are saved on entry and restored on exit. r3 is pushed only as
 * padding: ten core registers (40 bytes) + q4 - q7 (64) + the two quad
 * saves (32) and six core registers (24) at label 2 keep sp 8-byte aligned
 * at the bl, given an 8-byte aligned sp on entry.
 */
1341 #define setup_blocks_shaded_textured_builder(swizzling)                        \
1342 .align 3;                                                                      \
1343                                                                                \
1344 function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
1345   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
1346   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1347                                                                                \
1348   vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ];                                 \
1349   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1350                                                                                \
1351   cmp num_spans, #0;                                                           \
1352   bxeq lr;                                                                     \
1353                                                                                \
       /* Pushing r3 too pads the frame so sp stays 8-byte aligned at the bl.     */\
1354   stmdb sp!, { r3 - r11, r14 };                                                \
       /* q4 - q7 hold callee-saved d8 - d15 and are clobbered below: save them.  */\
       vpush { q4 - q7 };                                                           \
1355   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1356                                                                                \
1357   ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ];                                 \
1358   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1359                                                                                \
1360   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ];   \
1361   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1362                                                                                \
1363   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1364   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1365                                                                                \
1366   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
1367   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1368                                                                                \
1369   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1370                                                                                \
       /* Per span: fetch edge data; spans with no blocks skip to 1.              */\
1371  0:                                                                            \
1372   vmov.u8 fb_mask_ptrs, #0;                                                    \
1373                                                                                \
1374   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
1375   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1376                                                                                \
1377   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
1378   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
1379                                                                                \
1380   cmp span_num_blocks, #0;                                                     \
1381   beq 1f;                                                                      \
1382                                                                                \
1383   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
1384   add num_blocks, span_num_blocks, num_blocks;                                 \
1385                                                                                \
1386   cmp num_blocks, #MAX_BLOCKS;                                                 \
1387   bgt 2f;                                                                      \
1388                                                                                \
       /* Start values: b += b_dx * left_x, uvrg += uvrg_dx * left_x.             */\
1389  3:                                                                            \
1390   ldr b, [ span_b_offset ];                                                    \
1391   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1392                                                                                \
1393   vdup.u32 v_left_x, left_x;                                                   \
1394   and y, y, #0x3;                                                              \
1395                                                                                \
1396   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
1397   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1398                                                                                \
1399   mla b, b_dx, left_x, b;                                                      \
1400   and dither_shift, left_x, #0x03;                                             \
1401                                                                                \
1402   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
1403   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1404                                                                                \
1405   mov dither_shift, dither_shift, lsl #3;                                      \
1406   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1407                                                                                \
1408   mov c_32, #32;                                                               \
1409   subs span_num_blocks, span_num_blocks, #1;                                   \
1410                                                                                \
1411   mov dither_row, dither_row, ror dither_shift;                                \
1412   mov b_dx4, b_dx, lsl #2;                                                     \
1413                                                                                \
1414   vdup.u32 dither_offsets_short, dither_row;                                   \
1415   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1416                                                                                \
1417   vdup.u32 b_block, b;                                                         \
1418   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1419                                                                                \
1420   vdup.u32 u_block, uv[0];                                                     \
1421   mov b_dx8, b_dx, lsl #3;                                                     \
1422                                                                                \
1423   vdup.u32 v_block, uv[1];                                                     \
1424   vdup.u32 r_block, rg[0];                                                     \
1425   vdup.u32 g_block, rg[1];                                                     \
1426                                                                                \
1427   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1428                                                                                \
1429   vadd.u32 u_block, u_block, block_span;                                       \
1430   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1431                                                                                \
1432   vadd.u32 v_block, v_block, block_span;                                       \
1433   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1434                                                                                \
1435   vadd.u32 r_block, r_block, block_span;                                       \
1436   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1437                                                                                \
1438   vadd.u32 g_block, g_block, block_span;                                       \
1439   vld1.u32 { block_span }, [ block_span_ptr, :128 ];                           \
1440                                                                                \
1441   vadd.u32 b_block, b_block, block_span;                                       \
1442   add block_ptr_b, block_ptr_a, #16;                                           \
1443                                                                                \
1444   vshrn.u32 u_whole_low, u_block, #16;                                         \
1445   vshrn.u32 v_whole_low, v_block, #16;                                         \
1446   vshrn.u32 r_whole_low, r_block, #16;                                         \
1447   vshrn.u32 g_whole_low, g_block, #16;                                         \
1448                                                                                \
1449   vdup.u32 dx4, uv_dx4[0];                                                     \
1450   vshrn.u32 b_whole_low, b_block, #16;                                         \
1451                                                                                \
1452   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1453   vdup.u32 dx4, uv_dx4[1];                                                     \
1454                                                                                \
1455   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1456   vdup.u32 dx4, rg_dx4[0];                                                     \
1457                                                                                \
1458   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1459   vdup.u32 dx4, rg_dx4[1];                                                     \
1460                                                                                \
1461   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1462   vdup.u32 dx4, b_dx4;                                                         \
1463                                                                                \
1464   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1465   vdup.u32 dx8, uv_dx8[0];                                                     \
1466                                                                                \
1467   vadd.u32 u_block, u_block, dx8;                                              \
1468   vdup.u32 dx8, uv_dx8[1];                                                     \
1469                                                                                \
1470   vadd.u32 v_block, v_block, dx8;                                              \
1471   vdup.u32 dx8, rg_dx8[0];                                                     \
1472                                                                                \
1473   vadd.u32 r_block, r_block, dx8;                                              \
1474   vdup.u32 dx8, rg_dx8[1];                                                     \
1475                                                                                \
1476   vadd.u32 g_block, g_block, dx8;                                              \
1477   vdup.u32 dx8, b_dx8;                                                         \
1478                                                                                \
1479   vadd.u32 b_block, b_block, dx8;                                              \
1480   vmovn.u16 u_whole_8, u_whole;                                                \
1481                                                                                \
1482   vmovn.u16 v_whole_8, v_whole;                                                \
1483                                                                                \
1484   vmovn.u16 b_whole_8, b_whole;                                                \
1485   pld [ fb_ptr ];                                                              \
1486   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1487                                                                                \
1488   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1489   setup_blocks_texture_##swizzling();                                          \
1490                                                                                \
1491   vmovn.u16 r_whole_8, r_whole;                                                \
1492   beq 5f;                                                                      \
1493                                                                                \
       /* Store the finished block while computing the next one.                  */\
1494  4:                                                                            \
1495   vmovn.u16 g_whole_8, g_whole;                                                \
1496   vshrn.u32 u_whole_low, u_block, #16;                                         \
1497                                                                                \
1498   vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32;               \
1499   vshrn.u32 v_whole_low, v_block, #16;                                         \
1500                                                                                \
1501   vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32;              \
1502   vshrn.u32 r_whole_low, r_block, #16;                                         \
1503                                                                                \
1504   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1505   vshrn.u32 g_whole_low, g_block, #16;                                         \
1506                                                                                \
1507   vdup.u32 dx4, uv_dx4[0];                                                     \
1508   vshrn.u32 b_whole_low, b_block, #16;                                         \
1509                                                                                \
1510   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1511   vdup.u32 dx4, uv_dx4[1];                                                     \
1512                                                                                \
1513   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1514   vdup.u32 dx4, rg_dx4[0];                                                     \
1515                                                                                \
1516   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1517   vdup.u32 dx4, rg_dx4[1];                                                     \
1518                                                                                \
1519   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1520   vdup.u32 dx4, b_dx4;                                                         \
1521                                                                                \
1522   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1523   vdup.u32 dx8, uv_dx8[0];                                                     \
1524                                                                                \
1525   vadd.u32 u_block, u_block, dx8;                                              \
1526   vdup.u32 dx8, uv_dx8[1];                                                     \
1527                                                                                \
1528   vadd.u32 v_block, v_block, dx8;                                              \
1529   vdup.u32 dx8, rg_dx8[0];                                                     \
1530                                                                                \
1531   vadd.u32 r_block, r_block, dx8;                                              \
1532   vdup.u32 dx8, rg_dx8[1];                                                     \
1533                                                                                \
1534   vadd.u32 g_block, g_block, dx8;                                              \
1535   vdup.u32 dx8, b_dx8;                                                         \
1536                                                                                \
1537   vadd.u32 b_block, b_block, dx8;                                              \
1538   vmovn.u16 u_whole_8, u_whole;                                                \
1539                                                                                \
1540   add fb_ptr, fb_ptr, #16;                                                     \
1541   vmovn.u16 v_whole_8, v_whole;                                                \
1542                                                                                \
1543   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1544   vmovn.u16 b_whole_8, b_whole;                                                \
1545                                                                                \
1546   pld [ fb_ptr ];                                                              \
1547                                                                                \
1548   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1549   subs span_num_blocks, span_num_blocks, #1;                                   \
1550                                                                                \
1551   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1552   setup_blocks_texture_##swizzling();                                          \
1553                                                                                \
1554   vmovn.u16 r_whole_8, r_whole;                                                \
1555   bne 4b;                                                                      \
1556                                                                                \
       /* Last block of the span: clear u/v where right_mask & test_mask != 0.    */\
1557  5:                                                                            \
1558   vmovn.u16 g_whole_8, g_whole;                                                \
1559   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
1560                                                                                \
1561   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
1562   vdup.u8 draw_mask, right_mask;                                               \
1563                                                                                \
1564   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1565   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1566   vzip.u8 u_whole_8, v_whole_8;                                                \
1567                                                                                \
1568   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1569   vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32;              \
1570   vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32;                        \
1571   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1572   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1573                                                                                \
       /* Advance to the next span's uvrg, b and edge data entries.               */\
1574  1:                                                                            \
1575   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1576   add span_b_offset, span_b_offset, #4;                                        \
1577                                                                                \
1578   add span_edge_data, span_edge_data, #8;                                      \
1579   subs num_spans, num_spans, #1;                                               \
1580                                                                                \
1581   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1582   bne 0b;                                                                      \
1583                                                                                \
       vpop { q4 - q7 };                                                            \
1584   ldmia sp!, { r3 - r11, pc };                                                 \
1585                                                                                \
       /* Block buffer would overflow: flush it, then restart at its head.        */\
1586  2:                                                                            \
1587   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1588   vpush { texture_mask };                                                      \
1589   vpush { uvrg_dx4 };                                                          \
1590                                                                                \
1591   stmdb sp!, { r0 - r3, r12, r14 };                                            \
1592   bl flush_render_block_buffer;                                                \
1593   ldmia sp!, { r0 - r3, r12, r14 };                                            \
1594                                                                                \
1595   vpop { uvrg_dx4 };                                                           \
1596   vpop { texture_mask };                                                       \
1597                                                                                \
1598   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1599   vmov.u8 fb_mask_ptrs, #0;                                                    \
1600                                                                                \
1601   mov num_blocks, span_num_blocks;                                             \
1602   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1603   bal 3b                                                                       \
1604
1605
/*
 * Instantiate the shaded/textured/dithered setup routine twice. The argument
 * is pasted into the function name and selects which
 * setup_blocks_texture_<swizzling>() macro is expanded inside the body.
 */
1606 setup_blocks_shaded_textured_builder(swizzled)
1607 setup_blocks_shaded_textured_builder(unswizzled)
1608
1609
1610 #define setup_blocks_unshaded_textured_builder(swizzling)                      \
1611 .align 3;                                                                      \
1612                                                                                \
1613 function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
1614   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
1615   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1616                                                                                \
1617   vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ];                                 \
1618   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1619                                                                                \
1620   cmp num_spans, #0;                                                           \
1621   bxeq lr;                                                                     \
1622                                                                                \
1623   stmdb sp!, { r4 - r11, r14 };                                                \
1624   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1625                                                                                \
1626   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1627                                                                                \
1628   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ];   \
1629   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1630                                                                                \
1631   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1632   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1633                                                                                \
1634   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1635                                                                                \
1636   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1637                                                                                \
1638  0:                                                                            \
1639   vmov.u8 fb_mask_ptrs, #0;                                                    \
1640                                                                                \
1641   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
1642   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1643                                                                                \
1644   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
1645   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
1646                                                                                \
1647   cmp span_num_blocks, #0;                                                     \
1648   beq 1f;                                                                      \
1649                                                                                \
1650   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
1651   add num_blocks, span_num_blocks, num_blocks;                                 \
1652                                                                                \
1653   cmp num_blocks, #MAX_BLOCKS;                                                 \
1654   bgt 2f;                                                                      \
1655                                                                                \
1656  3:                                                                            \
1657   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1658                                                                                \
1659   vdup.u32 v_left_x, left_x;                                                   \
1660   and y, y, #0x3;                                                              \
1661                                                                                \
1662   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
1663   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1664                                                                                \
1665   and dither_shift, left_x, #0x03;                                             \
1666                                                                                \
1667   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
1668   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1669                                                                                \
1670   mov dither_shift, dither_shift, lsl #3;                                      \
1671   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1672                                                                                \
1673   mov c_32, #32;                                                               \
1674   subs span_num_blocks, span_num_blocks, #1;                                   \
1675                                                                                \
1676   mov dither_row, dither_row, ror dither_shift;                                \
1677                                                                                \
1678   vdup.u32 dither_offsets_short, dither_row;                                   \
1679   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1680                                                                                \
1681   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1682                                                                                \
1683   vdup.u32 u_block, uv[0];                                                     \
1684                                                                                \
1685   vdup.u32 v_block, uv[1];                                                     \
1686   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1687                                                                                \
1688   vadd.u32 u_block, u_block, block_span;                                       \
1689   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1690                                                                                \
1691   vadd.u32 v_block, v_block, block_span;                                       \
1692   add block_ptr_b, block_ptr_a, #16;                                           \
1693                                                                                \
1694   vshrn.u32 u_whole_low, u_block, #16;                                         \
1695   vshrn.u32 v_whole_low, v_block, #16;                                         \
1696                                                                                \
1697   vdup.u32 dx4, uv_dx4[0];                                                     \
1698                                                                                \
1699   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1700   vdup.u32 dx4, uv_dx4[1];                                                     \
1701                                                                                \
1702   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1703   vdup.u32 dx8, uv_dx8[0];                                                     \
1704                                                                                \
1705   vadd.u32 u_block, u_block, dx8;                                              \
1706   vdup.u32 dx8, uv_dx8[1];                                                     \
1707                                                                                \
1708   vadd.u32 v_block, v_block, dx8;                                              \
1709   vmovn.u16 u_whole_8, u_whole;                                                \
1710                                                                                \
1711   vmovn.u16 v_whole_8, v_whole;                                                \
1712                                                                                \
1713   pld [ fb_ptr ];                                                              \
1714   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1715                                                                                \
1716   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1717   setup_blocks_texture_##swizzling();                                          \
1718                                                                                \
1719   beq 5f;                                                                      \
1720                                                                                \
1721  4:                                                                            \
1722   vshrn.u32 u_whole_low, u_block, #16;                                         \
1723                                                                                \
1724   vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32;               \
1725   vshrn.u32 v_whole_low, v_block, #16;                                         \
1726                                                                                \
1727   add block_ptr_b, block_ptr_b, #32;                                           \
1728   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1729                                                                                \
1730   vdup.u32 dx4, uv_dx4[0];                                                     \
1731   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1732   vdup.u32 dx4, uv_dx4[1];                                                     \
1733                                                                                \
1734   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1735   vdup.u32 dx8, uv_dx8[0];                                                     \
1736                                                                                \
1737   vadd.u32 u_block, u_block, dx8;                                              \
1738   vdup.u32 dx8, uv_dx8[1];                                                     \
1739                                                                                \
1740   vadd.u32 v_block, v_block, dx8;                                              \
1741   vmovn.u16 u_whole_8, u_whole;                                                \
1742                                                                                \
1743   add fb_ptr, fb_ptr, #16;                                                     \
1744   vmovn.u16 v_whole_8, v_whole;                                                \
1745                                                                                \
1746   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1747   pld [ fb_ptr ];                                                              \
1748                                                                                \
1749   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1750   subs span_num_blocks, span_num_blocks, #1;                                   \
1751                                                                                \
1752   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1753   setup_blocks_texture_##swizzling();                                          \
1754                                                                                \
1755   bne 4b;                                                                      \
1756                                                                                \
1757  5:                                                                            \
1758   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
1759                                                                                \
1760   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
1761   vdup.u8 draw_mask, right_mask;                                               \
1762                                                                                \
1763   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1764   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1765   vzip.u8 u_whole_8, v_whole_8;                                                \
1766                                                                                \
1767   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1768   add block_ptr_b, block_ptr_b, #32;                                           \
1769   vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32;                        \
1770   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1771   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1772                                                                                \
1773  1:                                                                            \
1774   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1775   add span_edge_data, span_edge_data, #8;                                      \
1776   subs num_spans, num_spans, #1;                                               \
1777                                                                                \
1778   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1779   bne 0b;                                                                      \
1780                                                                                \
1781   ldmia sp!, { r4 - r11, pc };                                                 \
1782                                                                                \
1783  2:                                                                            \
1784   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1785   vpush { texture_mask };                                                      \
1786   vpush { uvrg_dx4 };                                                          \
1787                                                                                \
1788   stmdb sp!, { r0 - r3, r12, r14 };                                            \
1789   bl flush_render_block_buffer;                                                \
1790   ldmia sp!, { r0 - r3, r12, r14 };                                            \
1791                                                                                \
1792   vpop { uvrg_dx4 };                                                           \
1793   vpop { texture_mask };                                                       \
1794                                                                                \
1795   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1796   vmov.u8 fb_mask_ptrs, #0;                                                    \
1797                                                                                \
1798   mov num_blocks, span_num_blocks;                                             \
1799   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1800   bal 3b                                                                       \
1801
1802
/*
 * Expand the unshaded/textured block-setup builder once per swizzling
 * variant.  The argument is token-pasted into
 * setup_blocks_texture_##swizzling() inside the macro body.
 */
1803 setup_blocks_unshaded_textured_builder(swizzled)
1804 setup_blocks_unshaded_textured_builder(unswizzled)
1805
1806
1807 .align 3
1808
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * In:  psx_gpu (r0) = pointer to the GPU state structure.
 *
 * Walks the num_spans entries of the span edge data (8 bytes per entry) and,
 * for every span with a non-zero block count, appends that many 64-byte
 * entries to the block buffer at psx_gpu_blocks_offset.  Each entry is
 * written with three :128-aligned stores:
 *   +0   draw mask (all zero except for the last block of a span)
 *   +16  colors (the flat color replicated into every 16-bit lane)
 *   +32  { b_whole_8, fb_mask_ptrs }, with fb_ptr in lane 1 of fb_mask_ptrs
 *
 * The running block count is kept in num_blocks and written back to
 * psx_gpu_num_blocks_offset after every span.  If adding a span would push
 * the count above MAX_BLOCKS, flush_render_block_buffer is called first and
 * the buffer is refilled from its start.
 *
 * Returns immediately (bxeq lr) when num_spans is zero; otherwise r4-r11 and
 * lr are pushed and the function returns by popping into pc.
 *
 * NOTE(review): b_whole_8 and lane 0 of fb_mask_ptrs are not written under
 * these names anywhere in this function, so the values stored for them are
 * whatever the registers held on entry - presumably ignored by the consumer
 * of these blocks; confirm.
 */
1809 function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1810   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
       /* draw_mask = 0: mask stored for every block except a span's last. */
1811   veor.u32 draw_mask, draw_mask, draw_mask
1812
1813   cmp num_spans, #0
1814   bxeq lr
1815
1816   stmdb sp!, { r4 - r11, r14 }
       /* test_mask is read from the very start of the psx_gpu structure. */
1817   vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1818
1819   ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1820
       /*
        * Keep bits 3-7, 11-15 and 19-23 of the loaded color (the top five
        * bits of each of its three low bytes) and repack them as
        * r | (g << 5) | (b << 10).
        */
1821   ubfx color_r, color, #3, #5
1822   ubfx color_g, color, #11, #5
1823   ubfx color_b, color, #19, #5
1824
1825   orr color, color_r, color_b, lsl #10
1826   orr color, color, color_g, lsl #5
1827
1828   vdup.u16 colors, color
1829
1830   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1831   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1832
       /* block_ptr_a = first free entry: blocks + num_blocks * 64. */
1833   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1834   add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1835
      /* 0: per-span loop head. */
1836  0:
1837   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1838   ldrh y, [ span_edge_data, #edge_data_y_offset ]
1839
1840   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
1841
       /* Spans with no blocks are skipped entirely. */
1842   cmp span_num_blocks, #0
1843   beq 1f
1844
1845   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1846   add num_blocks, span_num_blocks, num_blocks
1847
       /* Not enough room for this span: flush first (2:), then resume at 3:. */
1848   cmp num_blocks, #MAX_BLOCKS
1849   bgt 2f
1850
      /* 3: emit the blocks of the current span. */
1851  3:
       /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2. */
1852   add fb_ptr, fb_ptr, y, lsl #11
       /*
        * NOTE(review): the masked y is not read again in this function (y is
        * reloaded at 0:), so this instruction appears to have no effect here.
        */
1853   and y, y, #0x3
1854
1855   add fb_ptr, fb_ptr, left_x, lsl #1
       /* c_32 is the post-increment used by every block store below. */
1856   mov c_32, #32
1857
       /*
        * Count down to the last block; the flags set here are consumed by the
        * "beq 5f" below (nothing in between modifies them).
        */
1858   subs span_num_blocks, span_num_blocks, #1
1859
1860   add block_ptr_b, block_ptr_a, #16
1861   pld [ fb_ptr ]
1862
1863   vmov.u32 fb_mask_ptrs[1], fb_ptr
1864   beq 5f
1865
      /* 4: every block of the span except the last; zero draw mask. */
1866  4:
       /*
        * block_ptr_a writes offsets +0 and +32 of the entry and ends up at the
        * next entry; block_ptr_b writes +16 and is bumped by a further 32
        * below so that it also lands on +16 of the next entry.
        */
1867   vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_32
1868   vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1869   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1870
       /* Advance the framebuffer pointer by 16 bytes per block. */
1871   add fb_ptr, fb_ptr, #16
1872   add block_ptr_b, block_ptr_b, #32
1873
1874   pld [ fb_ptr ]
1875
1876   vmov.u32 fb_mask_ptrs[1], fb_ptr
1877   subs span_num_blocks, span_num_blocks, #1
1878
1879   bne 4b
1880
      /* 5: last block of the span; draw mask derived from right_mask. */
1881  5:
1882   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1883
       /*
        * Replicate the low byte of right_mask into every byte, then set each
        * 16-bit lane to all ones where it shares a set bit with test_mask
        * (presumably one distinct bit per lane - contents of test_mask are
        * not visible here; confirm).
        */
1884   vdup.u8 draw_mask_edge, right_mask
1885   vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1886
1887   vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1888   vst1.u32 { draw_mask_edge }, [ block_ptr_a, :128 ], c_32
1889   add block_ptr_b, block_ptr_b, #32
1890   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1891
      /* 1: advance to the next span and publish the block count. */
1892  1:
1893   add span_edge_data, span_edge_data, #8
1894   subs num_spans, num_spans, #1
1895
       /* strh does not alter the flags, so bne still tests num_spans. */
1896   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1897   bne 0b
1898
1899   ldmia sp!, { r4 - r11, pc }
1900                                                                            
      /*
       * 2: block buffer would overflow.  Preserve colors and r0-r3/r12/lr
       * around the call, then rebuild test_mask and draw_mask, which are not
       * preserved across it.  The buffer restarts at its first entry with
       * only the current span counted.
       */
1901  2:
1902   vpush { colors }
1903
1904   stmdb sp!, { r0 - r3, r12, r14 }
1905   bl flush_render_block_buffer
1906   ldmia sp!, { r0 - r3, r12, r14 }
1907
1908   vpop { colors }
1909
1910   vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1911   veor.u32 draw_mask, draw_mask, draw_mask
1912
1913   mov num_blocks, span_num_blocks
1914   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1915   bal 3b
1916
1917
/*
 * Register aliases introduced ahead of the _direct variant below.
 * mask_msb_scalar is r14 (lr); the _direct function pushes r14 in its
 * prologue and returns by popping into pc, so it can use r14 as scratch.
 * msb_mask_low / msb_mask_high (d30 / d31) are the two halves of msb_mask
 * (q15), and pixels_low is d16.
 */
1918 #define mask_msb_scalar                                   r14
1919
1920 #define msb_mask                                          q15
1921
1922 #define pixels_low                                        d16
1923
1924 #define msb_mask_low                                      d30
1925 #define msb_mask_high                                     d31
1926
1927
1928 .align 3
1929
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * In:  psx_gpu (r0) = pointer to the GPU state structure.
 *
 * Flat-color fill that writes straight to the framebuffer instead of
 * queueing entries in the block buffer: it neither touches num_blocks nor
 * calls flush_render_block_buffer.  For every span with a non-zero block
 * count it stores the replicated color once per block, handling the last
 * block of the span according to the low byte of the span's right mask.
 *
 * The color is packed as r | (g << 5) | (b << 10) from the top five bits of
 * each of the three low bytes of the triangle color, then ORed with the
 * halfword at psx_gpu_mask_msb_offset.
 *
 * Returns immediately (bxeq lr) when num_spans is zero; otherwise r4-r11 and
 * lr are pushed and the function returns by popping into pc.
 */
1930 function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1931   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1932
1933   cmp num_spans, #0
1934   bxeq lr
1935
1936   stmdb sp!, { r4 - r11, r14 }
1937
1938   ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1939
1940   ubfx color_r, color, #3, #5
1941   ubfx color_g, color, #11, #5
1942
       /* r14 was saved above, so it is free to hold the mask MSB value. */
1943   ldrh mask_msb_scalar, [ psx_gpu, #psx_gpu_mask_msb_offset ]
1944   ubfx color_b, color, #19, #5
1945
1946   orr color, color_r, color_b, lsl #10
1947   orr color, color, color_g, lsl #5
1948   orr color, color, mask_msb_scalar
1949
       /* colors = the 16-bit color in every lane, for whole-block stores. */
1950   vdup.u16 colors, color
1951
1952   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
       /*
        * Duplicate the 16-bit color into both halves of the scalar so that the
        * 32-bit str in the tail code writes two copies at once.
        */
1953   orr color, color, color, lsl #16
1954
1955
      /* 0: per-span loop head. */
1956  0:
1957   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1958   ldrh y, [ span_edge_data, #edge_data_y_offset ]
1959
1960   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
1961
1962   cmp span_num_blocks, #0
1963   beq 1f
1964
1965   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1966
       /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2. */
1967   add fb_ptr, fb_ptr, y, lsl #11
       /* Flags from this subs reach "beq 3f": the add below keeps them. */
1968   subs span_num_blocks, span_num_blocks, #1
1969
1970   add fb_ptr, fb_ptr, left_x, lsl #1
1971   beq 3f
1972
      /* 2: all blocks but the last - store colors and advance fb_ptr. */
1973  2:
1974   vst1.u32 { colors }, [ fb_ptr ]!
1975   subs span_num_blocks, span_num_blocks, #1
1976
1977   bne 2b
1978
      /* 3: last block of the span, governed by the right mask. */
1979  3:
       /* Only the low byte of the right mask is read here. */
1980   ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1981
       /* A zero mask means the whole block is written (5:). */
1982   cmp right_mask, #0x0
1983   beq 5f
1984
       /*
        * Otherwise write 4, then 2, then 1 halfword(s) for as long as the
        * corresponding low mask bits are clear, shifting the consumed bits
        * out as we go.  The conditional str/mov instructions do not alter the
        * flags, so each "eq" group depends on the preceding tst only.
        * NOTE(review): this covers masks whose clear bits form one contiguous
        * run starting at bit 0 - presumably the only shape the span setup
        * produces; confirm.
        */
1985   tst right_mask, #0xF
1986   streq color, [ fb_ptr ], #4
1987   moveq right_mask, right_mask, lsr #4
1988   streq color, [ fb_ptr ], #4
1989
1990   tst right_mask, #0x3
1991   streq color, [ fb_ptr ], #4
1992   moveq right_mask, right_mask, lsr #2
1993
1994   tst right_mask, #0x1
1995   strheq color, [ fb_ptr ]
1996
      /* 1: advance to the next span. */
1997  1:
1998   add span_edge_data, span_edge_data, #8
1999   subs num_spans, num_spans, #1
2000   bne 0b
2001
2002   ldmia sp!, { r4 - r11, pc }
2003                                                                            
      /* 5: unmasked last block - one full-width store, no writeback needed. */
2004  5:
2005   vst1.u32 { colors }, [ fb_ptr ]
2006   bal 1b
2007
2008
/*
 * Rebind the register aliases used by the shaded, untextured block setup
 * code that follows.
 *
 * Each NEON q<n> register overlays d<2n> (low half) and d<2n+1> (high half),
 * so several of the aliases below name the same storage and cannot hold
 * live values at the same time:
 *   - dx4, dx8 and test_mask are all q10
 *   - uvrg and g_whole are both q4; rg (d9) is the high half of that register
 *     and is the same register as g_whole_high
 *   - block_span and b_whole are both q5
 *   - v_left_x and r_whole_low are both d6
 *   - rg_dx (d0) is the low half of r_block (q0)
 *   - d64_4 (d24) is the low half of d128_4 (q12)
 *   - g_whole_8 / b_whole_8 (d12 / d13) are the halves of gb_whole_8 (q6)
 *   - dither_offsets_low (d28) is the low half of dither_offsets (q14)
 */
2009 #undef c_64
2010
/* r2 is also the register named v_b at the top of the file. */
2011 #define c_64                                              r7
2012 #define rg_dx_ptr                                         r2
2013
2014
/* Drop the previous bindings of every alias that is redefined below. */
2015 #undef r_block
2016 #undef g_block
2017 #undef b_block
2018 #undef r_whole
2019 #undef g_whole
2020 #undef b_whole
2021 #undef r_whole_low
2022 #undef r_whole_high
2023 #undef g_whole_low
2024 #undef g_whole_high
2025 #undef b_whole_low
2026 #undef b_whole_high
2027 #undef r_whole_8
2028 #undef g_whole_8
2029 #undef b_whole_8
2030 #undef dither_offsets
2031 #undef rg_dx4
2032 #undef rg_dx8
2033 #undef dx4
2034 #undef dx8
2035 #undef v_left_x
2036 #undef uvrg
2037 #undef block_span
2038 #undef rg
2039 #undef draw_mask
2040 #undef test_mask
2041
/* Per-channel 32-bit accumulators, four lanes each. */
2042 #define r_block                                           q0
2043 #define g_block                                           q1
2044 #define b_block                                           q2
2045
/* Narrowed per-channel values, assembled from a low and a high d half. */
2046 #define r_whole                                           q3
2047 #define g_whole                                           q4
2048 #define b_whole                                           q5
2049
2050 #define r_whole_low                                       d6
2051 #define r_whole_high                                      d7
2052 #define g_whole_low                                       d8
2053 #define g_whole_high                                      d9
2054 #define b_whole_low                                       d10
2055 #define b_whole_high                                      d11
2056
/* 8-bit channel values; g and b are adjacent so they can be handled as q6. */
2057 #define gb_whole_8                                        q6
2058
2059 #define g_whole_8                                         d12
2060 #define b_whole_8                                         d13
2061
2062 #define r_whole_8                                         d14
2063
2064 #define pixels                                            q8
2065
/* rg_dx4 / rg_dx8 occupy the two halves of q9. */
2066 #define rg_dx4                                            d18
2067 #define rg_dx8                                            d19
2068
/* dx4 and dx8 intentionally or not share q10; see the overlap list above. */
2069 #define dx4                                               q10
2070 #define dx8                                               q10
2071
2072 #define v_left_x                                          d6
2073 #define uvrg                                              q4
2074 #define block_span                                        q5
2075
2076 #define rg                                                d9
2077
/* Constant registers; the names give the register width and the byte value. */
2078 #define d64_1                                             d22
2079 #define d64_128                                           d23
2080
2081 #define d128_4                                            q12
2082 #define d128_0x7                                          q13
2083
2084 #define d64_4                                             d24
2085
2086 #define dither_offsets                                    q14
2087 #define draw_mask                                         q15
2088
2089 #define dither_offsets_low                                d28
2090
2091 #define rg_dx                                             d0
2092 #define test_mask                                         q10
2093
2094
/*
 * Dither helpers for the shaded, untextured builder, selected by token
 * pasting (setup_blocks_shaded_untextured_dither_a_##dithering() and
 * ..._dither_b_##dithering()).
 *
 * "a", dithered: unsigned saturating add of dither_offsets to the 8-bit
 * r, g and b values.  The builder adds 4 to every byte of dither_offsets
 * (vadd.u8 with d128_4) before this is used.
 */
2095 #define setup_blocks_shaded_untextured_dither_a_dithered()                     \
2096   vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low;                           \
2097   vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets;                             \
2098
/*
 * "b", dithered: unsigned saturating subtract of d64_4 / d128_4 (every byte
 * set to 4 by the builder), removing the bias carried in dither_offsets.
 */
2099 #define setup_blocks_shaded_untextured_dither_b_dithered()                     \
2100   vqsub.u8 r_whole_8, r_whole_8, d64_4;                                        \
2101   vqsub.u8 gb_whole_8, gb_whole_8, d128_4                                      \
2102
/* "a", undithered: expands to nothing. */
2103 #define setup_blocks_shaded_untextured_dither_a_undithered()                   \
2104
/* "b", undithered: expands to nothing. */
2105 #define setup_blocks_shaded_untextured_dither_b_undithered()                   \
2106
2107
2108 #define setup_blocks_shaded_untextured_indirect_builder(dithering)             \
2109 .align 3;                                                                      \
2110                                                                                \
2111 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
2112   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
2113   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2114                                                                                \
2115   vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ];                                      \
2116                                                                                \
2117   cmp num_spans, #0;                                                           \
2118   bxeq lr;                                                                     \
2119                                                                                \
2120   stmdb sp!, { r4 - r11, r14 };                                                \
2121   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2122                                                                                \
2123   ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ];                                 \
2124   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2125                                                                                \
2126   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2127                                                                                \
2128   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
2129   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2130                                                                                \
2131   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2132   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2133                                                                                \
2134   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
2135   vmov.u8 d64_1, #1;                                                           \
2136                                                                                \
2137   vmov.u8 d128_4, #4;                                                          \
2138   vmov.u8 d64_128, #128;                                                       \
2139                                                                                \
2140   vmov.u8 d128_0x7, #0x7;                                                      \
2141                                                                                \
2142  0:                                                                            \
2143   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
2144   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2145                                                                                \
2146   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
2147   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
2148                                                                                \
2149   cmp span_num_blocks, #0;                                                     \
2150   beq 1f;                                                                      \
2151                                                                                \
2152   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
2153   add num_blocks, span_num_blocks, num_blocks;                                 \
2154                                                                                \
2155   cmp num_blocks, #MAX_BLOCKS;                                                 \
2156   bgt 2f;                                                                      \
2157                                                                                \
2158  3:                                                                            \
2159   ldr b, [ span_b_offset ];                                                    \
2160   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2161                                                                                \
2162   vdup.u32 v_left_x, left_x;                                                   \
2163   and y, y, #0x3;                                                              \
2164                                                                                \
2165   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
2166   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2167                                                                                \
2168   mla b, b_dx, left_x, b;                                                      \
2169   and dither_shift, left_x, #0x03;                                             \
2170                                                                                \
2171   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
2172   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2173                                                                                \
2174   mov dither_shift, dither_shift, lsl #3;                                      \
2175   vmla.u32 rg, rg_dx, v_left_x;                                                \
2176                                                                                \
2177   mov c_64, #64;                                                               \
2178   subs span_num_blocks, span_num_blocks, #1;                                   \
2179                                                                                \
2180   mov dither_row, dither_row, ror dither_shift;                                \
2181   mov b_dx4, b_dx, lsl #2;                                                     \
2182                                                                                \
2183   vdup.u32 dither_offsets, dither_row;                                         \
2184   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2185                                                                                \
2186   vdup.u32 b_block, b;                                                         \
2187   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2188                                                                                \
2189   mov b_dx8, b_dx, lsl #3;                                                     \
2190   vdup.u32 r_block, rg[0];                                                     \
2191   vdup.u32 g_block, rg[1];                                                     \
2192                                                                                \
2193   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
2194                                                                                \
2195   vadd.u32 r_block, r_block, block_span;                                       \
2196   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
2197                                                                                \
2198   vadd.u32 g_block, g_block, block_span;                                       \
2199   vld1.u32 { block_span }, [ block_span_ptr, :128 ];                           \
2200                                                                                \
2201   vadd.u32 b_block, b_block, block_span;                                       \
2202   add block_ptr_b, block_ptr_a, #16;                                           \
2203                                                                                \
2204   vshrn.u32 r_whole_low, r_block, #16;                                         \
2205   vshrn.u32 g_whole_low, g_block, #16;                                         \
2206   vshrn.u32 b_whole_low, b_block, #16;                                         \
2207   vdup.u32 dx4, rg_dx4[0];                                                     \
2208                                                                                \
2209   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2210   vdup.u32 dx4, rg_dx4[1];                                                     \
2211                                                                                \
2212   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2213   vdup.u32 dx4, b_dx4;                                                         \
2214                                                                                \
2215   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2216   vdup.u32 dx8, rg_dx8[0];                                                     \
2217                                                                                \
2218   vadd.u32 r_block, r_block, dx8;                                              \
2219   vdup.u32 dx8, rg_dx8[1];                                                     \
2220                                                                                \
2221   vadd.u32 g_block, g_block, dx8;                                              \
2222   vdup.u32 dx8, b_dx8;                                                         \
2223                                                                                \
2224   vadd.u32 b_block, b_block, dx8;                                              \
2225                                                                                \
2226   vmovn.u16 r_whole_8, r_whole;                                                \
2227   vmovn.u16 g_whole_8, g_whole;                                                \
2228   vmovn.u16 b_whole_8, b_whole;                                                \
2229                                                                                \
2230   beq 5f;                                                                      \
2231   veor.u32 draw_mask, draw_mask, draw_mask;                                    \
2232                                                                                \
2233  4:                                                                            \
2234   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2235   vshrn.u32 r_whole_low, r_block, #16;                                         \
2236                                                                                \
2237   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2238   vshrn.u32 g_whole_low, g_block, #16;                                         \
2239                                                                                \
2240   vshrn.u32 b_whole_low, b_block, #16;                                         \
2241   str fb_ptr, [ block_ptr_a, #44 ];                                            \
2242                                                                                \
2243   vdup.u32 dx4, rg_dx4[0];                                                     \
2244   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2245   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2246                                                                                \
2247   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2248   vdup.u32 dx4, rg_dx4[1];                                                     \
2249                                                                                \
2250   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2251   vdup.u32 dx4, b_dx4;                                                         \
2252                                                                                \
2253   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2254   vdup.u32 dx8, rg_dx8[0];                                                     \
2255                                                                                \
2256   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2257   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2258   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2259                                                                                \
2260   vadd.u32 r_block, r_block, dx8;                                              \
2261   vdup.u32 dx8, rg_dx8[1];                                                     \
2262                                                                                \
2263   vadd.u32 g_block, g_block, dx8;                                              \
2264   vdup.u32 dx8, b_dx8;                                                         \
2265                                                                                \
2266   vadd.u32 b_block, b_block, dx8;                                              \
2267   add fb_ptr, fb_ptr, #16;                                                     \
2268                                                                                \
2269   vmovn.u16 r_whole_8, r_whole;                                                \
2270   vmovn.u16 g_whole_8, g_whole;                                                \
2271   vmovn.u16 b_whole_8, b_whole;                                                \
2272                                                                                \
2273   vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64;                         \
2274   vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64;                            \
2275                                                                                \
2276   pld [ fb_ptr ];                                                              \
2277                                                                                \
2278   subs span_num_blocks, span_num_blocks, #1;                                   \
2279   bne 4b;                                                                      \
2280                                                                                \
2281  5:                                                                            \
2282   str fb_ptr, [ block_ptr_a, #44 ];                                            \
2283   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2284                                                                                \
2285   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
2286   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2287                                                                                \
2288   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2289   vdup.u8 draw_mask, right_mask;                                               \
2290                                                                                \
2291   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2292   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
2293                                                                                \
2294   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
2295                                                                                \
2296   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2297   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2298   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2299                                                                                \
2300   vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64;                         \
2301   vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64;                            \
2302                                                                                \
2303  1:                                                                            \
2304   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2305   add span_b_offset, span_b_offset, #4;                                        \
2306                                                                                \
2307   add span_edge_data, span_edge_data, #8;                                      \
2308   subs num_spans, num_spans, #1;                                               \
2309                                                                                \
2310   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
2311   bne 0b;                                                                      \
2312                                                                                \
2313   ldmia sp!, { r4 - r11, pc };                                                 \
2314                                                                                \
2315  2:                                                                            \
2316   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
2317   vpush { rg_dx4 };                                                            \
2318                                                                                \
2319   stmdb sp!, { r0 - r3, r12, r14 };                                            \
2320   bl flush_render_block_buffer;                                                \
2321   ldmia sp!, { r0 - r3, r12, r14 };                                            \
2322                                                                                \
2323   vpop { rg_dx4 };                                                             \
2324                                                                                \
2325   vmov.u8 d64_1, #1;                                                           \
2326   vmov.u8 d128_4, #4;                                                          \
2327   vmov.u8 d64_128, #128;                                                       \
2328   vmov.u8 d128_0x7, #0x7;                                                      \
2329                                                                                \
2330   vadd.u32 rg_dx8, rg_dx4, rg_dx4;                                             \
2331                                                                                \
2332   mov num_blocks, span_num_blocks;                                             \
2333   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2334   bal 3b                                                                       \
2335
2336
/*
 * Instantiate the indirect shaded-untextured block setup code once per
 * dithering mode. The argument is token-pasted into the names of the
 * setup_blocks_shaded_untextured_dither_a_ / _dither_b_ helper macros that
 * the builder invokes.
 */
2337 setup_blocks_shaded_untextured_indirect_builder(undithered)
2338 setup_blocks_shaded_untextured_indirect_builder(dithered)
2339
2340
/*
 * Drop the current draw_mask alias so it can be rebound to q0 below.
 * NOTE(review): presumably the rebinding is for the direct builder that
 * follows; confirm against its full body.
 */
2341 #undef draw_mask
2342
/*
 * r14 is the link register. The direct builder pushes r14 before it computes
 * psx_gpu + psx_gpu_mask_msb_offset into this alias, so reusing it as a
 * pointer there does not lose the return address.
 */
2343 #define mask_msb_ptr                                      r14
2344
/*
 * NEON register aliases. d16 and d17 are the low and high halves of q8.
 * NOTE(review): the uses of pixels_low / pixels_high are not visible in this
 * part of the file; verify before changing the register assignment.
 */
2345 #define draw_mask                                         q0
2346 #define pixels_low                                        d16
2347 #define pixels_high                                       d17
2348
2349
2350
2351 #define setup_blocks_shaded_untextured_direct_builder(dithering)               \
2352 .align 3;                                                                      \
2353                                                                                \
2354 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
2355   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
2356   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2357                                                                                \
2358   vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ];                                      \
2359                                                                                \
2360   cmp num_spans, #0;                                                           \
2361   bxeq lr;                                                                     \
2362                                                                                \
2363   stmdb sp!, { r4 - r11, r14 };                                                \
2364   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2365                                                                                \
2366   ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ];                                 \
2367   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2368                                                                                \
2369   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2370   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2371                                                                                \
2372   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2373   vmov.u8 d64_1, #1;                                                           \
2374                                                                                \
2375   vmov.u8 d128_4, #4;                                                          \
2376   vmov.u8 d64_128, #128;                                                       \
2377                                                                                \
2378   vmov.u8 d128_0x7, #0x7;                                                      \
2379   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
2380   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
2381                                                                                \
2382  0:                                                                            \
2383   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
2384   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2385                                                                                \
2386   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
2387   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
2388                                                                                \
2389   cmp span_num_blocks, #0;                                                     \
2390   beq 1f;                                                                      \
2391                                                                                \
2392   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
2393   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2394                                                                                \
2395   ldr b, [ span_b_offset ];                                                    \
2396   vdup.u32 v_left_x, left_x;                                                   \
2397   and y, y, #0x3;                                                              \
2398                                                                                \
2399   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
2400   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2401                                                                                \
2402   mla b, b_dx, left_x, b;                                                      \
2403   and dither_shift, left_x, #0x03;                                             \
2404                                                                                \
2405   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
2406   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2407                                                                                \
2408   mov dither_shift, dither_shift, lsl #3;                                      \
2409   vmla.u32 rg, rg_dx, v_left_x;                                                \
2410                                                                                \
2411   subs span_num_blocks, span_num_blocks, #1;                                   \
2412                                                                                \
2413   mov dither_row, dither_row, ror dither_shift;                                \
2414   mov b_dx4, b_dx, lsl #2;                                                     \
2415                                                                                \
2416   vdup.u32 dither_offsets, dither_row;                                         \
2417   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2418                                                                                \
2419   vdup.u32 b_block, b;                                                         \
2420   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2421                                                                                \
2422   mov b_dx8, b_dx, lsl #3;                                                     \
2423   vdup.u32 r_block, rg[0];      &