d187fce921e49656fbace7994c9ad89ecceebff1
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
1 /*
2  * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
3  * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation; either version 2 of
8  * the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License for more details.
14  */
15
16 #define RENDER_INTERLACE_ENABLED                          0x1
17
18 #include "psx_gpu.h"
19 #include "psx_gpu_offsets.h"
20
21 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
22
23 #define edge_data_left_x_offset                           0
24 #define edge_data_num_blocks_offset                       2
25 #define edge_data_right_mask_offset                       4
26 #define edge_data_y_offset                                6
27
28 .syntax unified
29 .text
30
31 #if 0
32 #define save_abi_regs() \
33   vpush {q4-q7}
34 #define restore_abi_regs() \
35   vpop  {q4-q7}
36 #else
37 #define save_abi_regs()
38 #define restore_abi_regs()
39 #endif
40
41 #define psx_gpu                                           r0
42 #define v_a                                               r1
43 #define v_b                                               r2
44 #define v_c                                               r3
45
46 #define x0                                                r4
47 #define x1                                                r5
48 #define x2                                                r6
49 #define x0_x1                                             r5
50 #define x1_x2                                             r6
51 #define y0                                                r7
52 #define y1                                                r8
53 #define y2                                                r9
54 #define y0_y1                                             r7
55 #define y1_y2                                             r8
56 #define b0                                                r9
57 #define b1                                                r10
58 #define b2                                                r11
59 #define b0_b1                                             r10
60 #define b1_b2                                             r11
61
62
63 #define area_r_s                                          r5
64
65 #define g_bx0                                             r2
66 #define g_bx                                              r3
67 #define g_bx2                                             r4
68 #define g_bx3                                             r5
69 #define b_base                                            r6
70 #define g_by                                              r8
71
72 #define gs_bx                                             r7
73 #define gs_by                                             r10
74
75 #define ga_bx                                             g_bx
76 #define ga_by                                             g_by
77
78 #define gw_bx_h                                           g_bx
79 #define gw_by_h                                           g_by
80
81 #define gw_bx_l                                           r11
82 #define gw_by_l                                           gw_bx_l
83
84 #define store_a                                           r0
85 #define store_b                                           r1
86 #define store_inc                                         r5
87
88
89 #define v0                                                q0
90 #define uvrgb0                                            d0
91 #define x0_y0                                             d1
92
93 #define v1                                                q1
94 #define uvrgb1                                            d2
95 #define x1_y1                                             d3
96
97 #define v2                                                q2
98 #define uvrgb2                                            d4
99 #define x2_y2                                             d5
100
101 #define x0_ab                                             q3
102 #define uvrg_xxxx0                                        q3
103 #define uvrg0                                             d6
104 #define xxxx0                                             d7
105
106 #define x1_ab                                             q4
107 #define uvrg_xxxx1                                        q4
108 #define uvrg1                                             d8
109 #define xxxx1                                             d9
110
111 #define x2_ab                                             q5
112 #define uvrg_xxxx2                                        q5
113 #define uvrg2                                             d10
114 #define xxxx2                                             d11
115
116 #define y0_ab                                             q6
117 #define yyyy_uvrg0                                        q6
118 #define yyyy0                                             d12
119 #define uvrg0b                                            d13
120
121 #define y1_ab                                             q7
122 #define yyyy_uvrg1                                        q7
123 #define yyyy1                                             d14
124 #define uvrg1b                                            d15
125
126 #define y2_ab                                             q8
127 #define yyyy_uvrg2                                        q8
128 #define yyyy2                                             d16
129 #define uvrg2b                                            d17
130
131 #define d0_ab                                             q9
132 #define d0_a                                              d18
133 #define d0_b                                              d19
134
135 #define d1_ab                                             q10
136 #define d1_a                                              d20
137 #define d1_b                                              d21
138
139 #define d2_ab                                             q11
140 #define d2_a                                              d22
141 #define d2_b                                              d23
142
143 #define d3_ab                                             q12
144 #define d3_a                                              d24
145 #define d3_b                                              d25
146
147 #define ga_uvrg_x                                         q1
148 #define ga_uvrg_y                                         q4
149
150 #define dx                                                x0_x1
151 #define dy                                                y0_y1
152 #define db                                                b0_b1
153
154 #define uvrg_base                                         q11
155
156 #define gs_uvrg_x                                         q5
157 #define gs_uvrg_y                                         q6
158
159 #define g_uvrg_x                                          q1
160 #define ga_uv_x                                           d2
161 #define g_uv_x                                            d2
162 #define ga_rg_x                                           d3
163 #define g_rg_x                                            d3
164
165 #define g_uvrg_y                                          q4
166 #define ga_uv_y                                           d8
167 #define g_uv_y                                            d8
168 #define ga_rg_y                                           d9
169 #define g_rg_y                                            d9
170
171 #define gw_uv_x                                           q1
172 #define gw_rg_x                                           q2
173 #define gw_uv_y                                           q4
174 #define gw_rg_y                                           q3
175
176 #define w_mask                                            q9
177 #define w_mask_l                                          d18
178
179 #define r_shift                                           q10
180
181 #define uvrg_dx0                                          q0
182 #define uvrg_dx0l                                         d0
183 #define uvrg_dx0h                                         d1
184
185 #define uvrg_dx1                                          q1
186 #define uvrg_dx1l                                         d2
187 #define uvrg_dx1h                                         d3
188
189 #define uvrg_dx2                                          q2
190 #define uvrg_dx2l                                         d4
191 #define uvrg_dx2h                                         d5
192
193 #define uvrg_dx3                                          q3
194 #define uvrg_dx3l                                         d6
195 #define uvrg_dx3h                                         d7
196
197 #define uvrgb_phase                                       q13
198
199 .align 4
200
201 #include "arm_features.h"
202
203 #define function(name) FUNCTION(name):
204
205 #ifndef TEXRELS_FORBIDDEN
206
207 #define JT_OP_REL(table_label, index_reg, temp)
208 #define JT_OP(x...) x
209 #define JTE(start, target) target
210
211 #else
212
213 #define JT_OP_REL(table_label, index_reg, temp)                                \
214   adr temp, table_label;                                                       \
215   ldr temp, [temp, index_reg, lsl #2];                                         \
216   add pc, pc, temp                                                             \
217
218 #define JT_OP(x...)
219 #define JTE(start, target) (target - start)
220
221 #endif
222
223 #ifdef __MACH__
224 #define flush_render_block_buffer _flush_render_block_buffer
225 #define update_texture_8bpp_cache _update_texture_8bpp_cache
226 #define setup_blocks_uv_adj_hack _setup_blocks_uv_adj_hack
227 #endif
228
229 @ r0: psx_gpu
230 @ r1: v_a
231 @ r2: v_b
232 @ r3: v_c
233
234 function(compute_all_gradients)
235   // First compute the triangle area reciprocal and shift. The division will
236   // happen concurrently with much of the work which follows.
237   @ r12 = psx_gpu->triangle_area
238   ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
239   stmdb sp!, { r4 - r11, lr }
240   save_abi_regs()
241
242   @ load exponent of 62 into upper half of double
243   movw r4, #0
244   clz r14, r12                       @ r14 = shift
245
246   movt r4, #((62 + 1023) << 4)
247   mov r12, r12, lsl r14              @ r12 = triangle_area_normalized
248
249   @ load area normalized into lower half of double
250   mov r5, r12, lsr #10
251   vmov.f64 d30, r5, r4               @ d30 = (1 << 62) + ta_n
252
253   movt r4, #((1022 + 31) << 4)
254   mov r5, r12, lsl #20
255
256   add r4, r4, r12, lsr #11
257   vmov.f64 d31, r5, r4
258
259   vdiv.f64 d30, d30, d31             @ d30 = ((1 << 62) + ta_n) / ta_n
260
261   // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
262   // ( d0       *  d1      ) - ( d2       *  d3      ) =
263   // ( m0                  ) - ( m1                  ) = gradient
264
265   // This is split to do 12 elements at a time over three sets: a, b, and c.
266   // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
267   // two of the slots are unused.
268
269   // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
270   // is g.
271
272   // First type is:  uvrg bxxx xxxx 
273   // Second type is: yyyy ybyy uvrg 
274   // Since x_a and y_c are the same the same variable is used for both. 
275
276   vld1.u32 { v0 }, [v_a, :128]       @ v0 = { uvrg0, b0, x0, y0 }
277   ldrsh x0, [v_a, #8]                @ load x0
278
279   vld1.u32 { v1 }, [v_b, :128]       @ v1 = { uvrg1, b1, x1, y1}
280   ldrh x1, [v_b, #8]                 @ load x1
281
282   vld1.u32 { v2 }, [v_c, :128]       @ v2 = { uvrg2, b2, x2, y2 }
283   ldrh x2, [v_c, #8]                 @ load x2
284
285   vmovl.u8 uvrg_xxxx0, uvrgb0        @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
286   ldrh y0, [v_a, #10]                @ load y0
287
288   vmovl.u8 uvrg_xxxx1, uvrgb1        @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
289   ldrh y1, [v_b, #10]                @ load y1
290
291   vmovl.u8 uvrg_xxxx2, uvrgb2        @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
292   ldrh y2, [v_c, #10]                @ load y2
293
294   vmov.u8 uvrg0b, uvrg0              @ uvrg0b = { uv0, rg0 }
295   vdup.u16 xxxx0, x0_y0[0]           @ xxxx0 = { xx0, xx0 }
296
297   orr x1_x2, x1, x2, lsl #16         @ x1_x2 = { x1, x2 }
298   pkhbt x0_x1, x0, x1, lsl #16       @ x0_x1 = { x0, x1 }
299
300   vmov.u8 uvrg1b, uvrg1              @ uvrg1b = { uv1, rg1 }
301   vdup.u16 xxxx1, x1_y1[0]           @ xxxx1 = { xx1, xx1 }
302
303   vmov.u8 uvrg2b, uvrg2              @ uvrg2b = { uv2, rg2 }
304   vdup.u16 xxxx2, x2_y2[0]           @ xxxx2 = { xx2, xx2 }
305
306   ldrb b2, [v_c, #4]                 @ load b2
307   orr y0_y1, y0, y1, lsl #16         @ y0_y1 = { y0, y1 }
308
309   ldrb b1, [v_b, #4]                 @ load b1
310   orr y1_y2, y1, y2, lsl #16         @ y1_y2 = { y1, y2 }
311
312   vdup.u16 yyyy0, x0_y0[1]           @ yyyy0 = { yy0, yy0 }
313   vsub.s16 d0_ab, x1_ab, x0_ab
314
315   ldrb b0, [v_a, #4]                 @ load b0
316   orr b1_b2, b1, b2, lsl #16         @ b1_b2 = { b1, b2 }
317
318   vdup.u16 yyyy1, x1_y1[1]           @ yyyy1 = { yy1, yy1 }
319   vsub.s16 d2_ab, x2_ab, x1_ab
320
321   vdup.u16 yyyy2, x2_y2[1]           @ yyyy2 = { yy2, yy2 }
322   vsub.s16 d1_ab, y2_ab, y1_ab
323
324   orr b0_b1, b0, b1, lsl #16         @ b1_b2 = { b1, b2 }
325   ssub16 dx, x1_x2, x0_x1            @ dx = { x1 - x0, x2 - x1 }
326
327   ssub16 dy, y1_y2, y0_y1            @ dy = { y1 - y0, y2 - y1 }
328   ssub16 db, b1_b2, b0_b1            @ db = { b1 - b0, b2 - b1 }
329
330   vsub.s16 d3_ab, y1_ab, y0_ab
331   smusdx ga_by, dx, db               @ ga_by = ((x1 - x0) * (b2 - b1)) -
332                                      @         ((x2 - X1) * (b1 - b0)) 
333   vmull.s16 ga_uvrg_x, d0_a, d1_a
334   smusdx ga_bx, db, dy               @ ga_bx = ((b1 - b0) * (y2 - y1)) -
335                                      @         ((b2 - b1) * (y1 - y0))
336   vmlsl.s16 ga_uvrg_x, d2_a, d3_a
337   movs gs_bx, ga_bx, asr #31
338
339   vmull.s16 ga_uvrg_y, d0_b, d1_b
340   rsbmi ga_bx, ga_bx, #0
341
342   @ r12 = psx_gpu->uvrgb_phase
343   ldr r12, [psx_gpu, #psx_gpu_uvrgb_phase_offset]
344
345   vmlsl.s16 ga_uvrg_y, d2_b, d3_b
346   movs gs_by, ga_by, asr #31
347
348   vshr.u64 d0, d30, #22
349   add b_base, r12, b0, lsl #16
350
351   vdup.u32 uvrgb_phase, r12
352
353   rsbmi ga_by, ga_by, #0
354   vclt.s32 gs_uvrg_x, ga_uvrg_x, #0  @ gs_uvrg_x = ga_uvrg_x < 0
355
356   @ r12 = psx_gpu->triangle_winding_offset
357   ldrb r12, [psx_gpu, #psx_gpu_triangle_winding_offset]
358   vclt.s32 gs_uvrg_y, ga_uvrg_y, #0  @ gs_uvrg_y = ga_uvrg_y < 0
359
360   rsb r12, r12, #0                   @ r12 = -(triangle->winding)
361
362   vdup.u32 w_mask, r12               @ w_mask = { -w, -w, -w, -w }
363   sub r14, r14, #(62 - 12)           @ r14 = shift - (62 - FIXED_BITS)
364
365   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
366   vdup.u32 r_shift, r14              @ r_shift = { shift, shift*, shift, shift* }
367                                      @ * - vshl.u64: ignored by hw
368   vadd.u32 uvrg_base, uvrgb_phase
369   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
370
371   vmov area_r_s, s0                  @ area_r_s = triangle_reciprocal
372   vabs.s32 ga_uvrg_y, ga_uvrg_y      @ ga_uvrg_y = abs(ga_uvrg_y)
373
374   vmull.u32 gw_rg_x, ga_rg_x, d0[0]
375   vmull.u32 gw_uv_x, ga_uv_x, d0[0]
376   vmull.u32 gw_rg_y, ga_rg_y, d0[0]
377   vmull.u32 gw_uv_y, ga_uv_y, d0[0]
378
379   vshl.u64 gw_rg_x, gw_rg_x, r_shift
380   vshl.u64 gw_uv_x, gw_uv_x, r_shift
381   vshl.u64 gw_rg_y, gw_rg_y, r_shift
382   vshl.u64 gw_uv_y, gw_uv_y, r_shift
383
384   veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
385   vmovn.u64 g_uv_x, gw_uv_x
386
387   veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
388   vmovn.u64 g_rg_x, gw_rg_x
389
390   veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
391   vmovn.u64 g_uv_y, gw_uv_y
392
393   vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
394   vmovn.u64 g_rg_y, gw_rg_y
395
396   veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
397   mov ga_bx, ga_bx, lsl #13
398
399   vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
400   mov ga_by, ga_by, lsl #13
401
402   vdup.u32 x0_y0, x0
403   umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
404
405   vshl.u32 g_uvrg_x, g_uvrg_x, #4
406   vshl.u32 g_uvrg_y, g_uvrg_y, #4
407
408   umull gw_by_l, gw_by_h, ga_by, area_r_s
409   vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
410
411   eor gs_bx, gs_bx, r12
412   vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
413
414   veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
415   eor gs_by, gs_by, r12
416
417   rsb r11, r14, #0                   @ r11 = negative shift for scalar lsr
418   add store_a, psx_gpu, #psx_gpu_uvrg_offset
419
420   sub r11, r11, #(32 - 13)
421
422   add store_b, store_a, #16
423   mov store_inc, #32
424
425   vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
426   vst1.u32 { uvrg_base }, [store_a, :128], store_inc
427
428   vst1.u32 { uvrg_dx1 }, [store_b, :128], store_inc
429   mov g_bx, gw_bx_h, lsr r11
430
431   vst1.u32 { g_uvrg_y }, [store_a, :128], store_inc
432   mov g_by, gw_by_h, lsr r11
433
434   vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l },                     \
435    [store_b, :128], store_inc
436   eor g_bx, g_bx, gs_bx
437
438   vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h },                     \
439    [store_b, :128], store_inc
440   sub g_bx, g_bx, gs_bx
441
442   lsl g_bx, g_bx, #4  
443   eor g_by, g_by, gs_by
444
445   mls b_base, g_bx, x0, b_base
446   sub g_by, g_by, gs_by
447
448   lsl g_by, g_by, #4
449   mov g_bx0, #0
450
451   add g_bx2, g_bx, g_bx
452   add g_bx3, g_bx, g_bx2
453
454   stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
455
456   restore_abi_regs()
457   ldmia sp!, { r4 - r11, pc }
458
459
460 #define psx_gpu                                  r0
461 #define v_a                                      r1
462 #define v_b                                      r2
463 #define v_c                                      r3
464
465 #define temp                                     r14
466
467 #define x_a                                      r4
468 #define x_b                                      r5
469 #define x_c                                      r6
470 #define y_a                                      r1
471 #define y_b                                      r2
472 #define y_c                                      r3
473
474 #define height_minor_a                           r7
475 #define height_minor_b                           r8
476 #define height_major                             r9
477 #define height                                   r9
478
479 #define reciprocal_table_ptr                     r10
480
481 #define edge_alt_low                             r4
482 #define edge_alt_high                            r5
483 #define edge_dx_dy_alt                           r6
484 #define edge_shift_alt                           r10
485
486 #define edge_dx_dy_alt_low                       r4
487 #define edge_dx_dy_alt_high                      r5
488
489 #define span_edge_data                           r4
490 #define span_uvrg_offset                         r5
491 #define span_b_offset                            r6
492
493 #define clip                                     r14
494
495 #define b                                        r11
496 #define b_dy                                     r12
497
498
499 #define alternate_x                              q0
500 #define alternate_dx_dy                          q1
501 #define alternate_x_32                           q2
502
503 #define alternate_x_low                          d0
504 #define alternate_x_high                         d1
505 #define alternate_dx_dy_low                      d2
506 #define alternate_dx_dy_high                     d3
507 #define alternate_x_32_low                       d4
508 #define alternate_x_32_high                      d5
509
510 #define left_x                                   q3
511 #define right_x                                  q4
512 #define left_dx_dy                               q5
513 #define right_dx_dy                              q6
514 #define left_edge                                q7
515 #define right_edge                               q8
516
517 #define left_x_low                               d6
518 #define left_x_high                              d7
519 #define right_x_low                              d8
520 #define right_x_high                             d9
521 #define left_dx_dy_low                           d10
522 #define left_dx_dy_high                          d11
523 #define right_dx_dy_low                          d12
524 #define right_dx_dy_high                         d13
525 #define left_edge_low                            d14
526 #define left_edge_high                           d15
527 #define right_edge_low                           d16
528 #define right_edge_high                          d17
529
530 #define y_mid_point                              d18
531 #define c_0x0004                                 d19
532
533 #define left_right_x_16                          q11
534 #define span_shifts_y                            q12
535 #define c_0x0001                                 q13
536
537 #define span_shifts                              d24
538 #define y_x4                                     d25
539 #define c_0xFFFE                                 d26
540 #define c_0x0007                                 d27
541
542 #define left_right_x_16_low                      d22
543 #define left_right_x_16_high                     d23
544
545 #define uvrg                                     q14
546 #define uvrg_dy                                  q15
547 #define uv                                       d28
548
549 #define alternate_x_16                           d4
550
551 #define v_clip                                   q3
552 #define v_clip_low                               d6
553
554 #define right_x_32                               q10
555 #define left_x_32                                q11
556 #define alternate_select                         d24
557
558 #define right_x_32_low                           d20
559 #define right_x_32_high                          d21
560 #define left_x_32_low                            d22
561 #define left_x_32_high                           d23
562
563 #define tmp_max_blocks                           d20
564
565 #define edges_xy                                 q0
566 #define edges_dx_dy                              d2
567 #define edge_shifts                              d3
568 #define edge_shifts_64                           q2
569
570 #define edges_xy_left                            d0
571 #define edges_xy_right                           d1
572
573 #define height_reciprocals                       d6
574 #define heights                                  d7
575
576 #define widths                                   d8
577 #define c_0x01                                   d9
578 #define x_starts                                 d10
579 #define x_ends                                   d11
580
581 #define heights_b                                d12
582 #define edges_dx_dy_64                           q10
583
584 #define edges_dx_dy_64_left                      d20
585 #define edges_dx_dy_64_right                     d21
586
587
588 #define setup_spans_prologue()                                                 \
589   stmdb sp!, { r4 - r11, lr };                                                 \
590   save_abi_regs();                                                             \
591                                                                                \
592   ldrsh x_a, [v_a, #8];                                                        \
593   ldrsh x_b, [v_b, #8];                                                        \
594   ldrsh x_c, [v_c, #8];                                                        \
595   ldrsh y_a, [v_a, #10];                                                       \
596   ldrsh y_b, [v_b, #10];                                                       \
597   ldrsh y_c, [v_c, #10];                                                       \
598                                                                                \
599   add temp, psx_gpu, #psx_gpu_uvrg_offset;                                     \
600   vld1.32 { uvrg }, [temp];                                                    \
601   add temp, psx_gpu, #psx_gpu_uvrg_dy_offset;                                  \
602   vld1.32 { uvrg_dy }, [temp];                                                 \
603   ldr reciprocal_table_ptr, [psx_gpu, #psx_gpu_reciprocal_table_ptr_offset];   \
604                                                                                \
605   vmov.u32 c_0x01, #0x01                                                       \
606
607 #define setup_spans_load_b()                                                   \
608   ldr b, [psx_gpu, #psx_gpu_b_offset];                                         \
609   ldr b_dy, [psx_gpu, #psx_gpu_b_dy_offset]                                    \
610
611 #define setup_spans_prologue_b()                                               \
612   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
613   add temp, psx_gpu, #psx_gpu_viewport_start_x_offset;                         \
614                                                                                \
615   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
616   vmov.u16 c_0x0004, #0x0004;                                                  \
617                                                                                \
618   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
619   vmov.u16 c_0x0001, #0x0001;                                                  \
620                                                                                \
621   vld1.u16 { left_edge_low[], left_edge_high[] }, [temp];                      \
622   add temp, psx_gpu, #psx_gpu_viewport_end_x_offset;                           \
623                                                                                \
624   vld1.u16 { right_edge_low[], right_edge_high[] }, [temp];                    \
625   vadd.u16 right_edge, right_edge, c_0x0001;                                   \
626                                                                                \
627   vmov.u16 c_0x0007, #0x0007;                                                  \
628   vmvn.u16 c_0xFFFE, #0x0001                                                   \
629
630
631 #define compute_edge_delta_x2()                                                \
632   ldr temp, [reciprocal_table_ptr, height, lsl #2];                            \
633                                                                                \
634   vdup.u32 heights, height;                                                    \
635   vsub.u32 widths, x_ends, x_starts;                                           \
636                                                                                \
637   vdup.u32 edge_shifts, temp;                                                  \
638   vsub.u32 heights_b, heights, c_0x01;                                         \
639   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
640                                                                                \
641   vmla.s32 heights_b, x_starts, heights;                                       \
642   vbic.u16 edge_shifts, #0xE0;                                                 \
643   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
644   vmull.s32 edges_xy, heights_b, height_reciprocals                            \
645
646 #define width_alt                 r6
647 #define height_reciprocal_alt     r11
648 #define height_b_alt              r12
649
650 #define compute_edge_delta_x3(start_c, height_a, height_b)                     \
651   vmov heights, height_a, height_b;                                            \
652   ldr temp, [reciprocal_table_ptr, height_a, lsl #2];                          \
653   vmov.u32 edge_shifts[0], temp;                                               \
654   ldr temp, [reciprocal_table_ptr, height_b, lsl #2];                          \
655   vmov.u32 edge_shifts[1], temp;                                               \
656   ldr edge_shift_alt, [reciprocal_table_ptr, height_minor_b, lsl #2];          \
657                                                                                \
658   vsub.u32 widths, x_ends, x_starts;                                           \
659   sub width_alt, x_c, start_c;                                                 \
660                                                                                \
661   vsub.u32 heights_b, heights, c_0x01;                                         \
662   sub height_b_alt, height_minor_b, #1;                                        \
663                                                                                \
664   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
665   lsr height_reciprocal_alt, edge_shift_alt, #10;                              \
666                                                                                \
667   vmla.s32 heights_b, x_starts, heights;                                       \
668   mla height_b_alt, height_minor_b, start_c, height_b_alt;                     \
669                                                                                \
670   vbic.u16 edge_shifts, #0xE0;                                                 \
671   and edge_shift_alt, edge_shift_alt, #0x1F;                                   \
672                                                                                \
673   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
674   mul edge_dx_dy_alt, width_alt, height_reciprocal_alt;                        \
675                                                                                \
676   vmull.s32 edges_xy, heights_b, height_reciprocals;                           \
677   smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt       \
678
679
680 #define setup_spans_adjust_y_up()                                              \
681   vsub.u32 y_x4, y_x4, c_0x0004                                                \
682
683 #define setup_spans_adjust_y_down()                                            \
684   vadd.u32 y_x4, y_x4, c_0x0004                                                \
685
686 #define setup_spans_adjust_interpolants_up()                                   \
687   vsub.u32 uvrg, uvrg, uvrg_dy;                                                \
688   sub b, b, b_dy                                                               \
689
690 #define setup_spans_adjust_interpolants_down()                                 \
691   vadd.u32 uvrg, uvrg, uvrg_dy;                                                \
692   add b, b, b_dy                                                               \
693
694
/*
 * Skip `clip` rows of interpolants in one go.
 * increment: b += b_dy * clip, uvrg += uvrg_dy * v_clip
 * decrement: b -= b_dy * clip, uvrg -= uvrg_dy * v_clip
 * v_clip is the vector copy of clip that setup_spans_clip() creates.
 */
695 #define setup_spans_clip_interpolants_increment()                              \
696   mla b, b_dy, clip, b;                                                        \
697   vmla.s32 uvrg, uvrg_dy, v_clip                                               \
698
699 #define setup_spans_clip_interpolants_decrement()                              \
700   mls b, b_dy, clip, b;                                                        \
701   vmls.s32 uvrg, uvrg_dy, v_clip                                               \
702
/*
 * setup_spans_clip(direction, alternate_active)
 *
 * Fast-forward all edge walkers and interpolants over `clip` rows that lie
 * outside the viewport.
 *   direction        - increment | decrement, selects the interpolant helper
 *   alternate_active - yes | no; with "yes" the scalar alternate edge
 *                      (edge_alt_high:edge_alt_low) is also advanced by
 *                      edge_dx_dy_alt * clip via a 64-bit multiply-accumulate
 * The vector edges are advanced with edges_xy += edges_dx_dy * v_clip_low
 * (widening 32x32 -> 64 multiply-accumulate).
 */
703 #define setup_spans_clip_alternate_yes()                                       \
704   smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip                      \
705
706 #define setup_spans_clip_alternate_no()                                        \
707
708 #define setup_spans_clip(direction, alternate_active)                          \
709   vdup.u32 v_clip, clip;                                                       \
710   setup_spans_clip_alternate_##alternate_active();                             \
711   setup_spans_clip_interpolants_##direction();                                 \
712   vmlal.s32 edges_xy, edges_dx_dy, v_clip_low                                  \
713
714
/*
 * setup_spans_adjust_edges_alternate_no(left_index, right_index)
 *
 * Turn the two edges held in edges_xy / edges_dx_dy into the left and right
 * 64-bit walkers used by setup_spans_set_x4_*.
 *   left_index / right_index - token (left | right) pasted onto edges_xy_
 *                              and edges_dx_dy_64_ to pick which edge
 *                              feeds the left and which the right walker
 * After this macro each walker holds the position for two consecutive rows
 * (x and x + dx_dy) and each *_dx_dy holds twice the per-row step.
 * NOTE(review): this reading assumes *_low / *_high are the two halves of
 * the matching q register; those aliases are defined outside this chunk.
 */
715 #define setup_spans_adjust_edges_alternate_no(left_index, right_index)         \
  /* widen shifts and per-row steps to 64-bit lanes */                         \
716   vmovl.s32 edge_shifts_64, edge_shifts;                                       \
717   vmovl.s32 edges_dx_dy_64, edges_dx_dy;                                       \
718                                                                                \
  /* apply the per-edge shift to positions and steps */                        \
719   vshl.s64 edges_xy, edges_xy, edge_shifts_64;                                 \
720   vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64;                     \
721                                                                                \
722   vmov left_x_low, edges_xy_##left_index;                                      \
723   vmov right_x_low, edges_xy_##right_index;                                    \
724                                                                                \
  /* replicate each step into both halves of its step register */              \
725   vmov left_dx_dy_low, edges_dx_dy_64_##left_index;                            \
726   vmov left_dx_dy_high, edges_dx_dy_64_##left_index;                           \
727   vmov right_dx_dy_low, edges_dx_dy_64_##right_index;                          \
728   vmov right_dx_dy_high, edges_dx_dy_64_##right_index;                         \
729                                                                                \
  /* second half = position one row further along */                           \
730   vadd.u64 left_x_high, left_x_low, left_dx_dy_low;                            \
731   vadd.u64 right_x_high, right_x_low, right_dx_dy_low;                         \
732                                                                                \
  /* walkers now cover two rows each, so double the step */                    \
733   vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy;                                 \
734   vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy                               \
735
736
/*
 * setup_spans_adjust_edges_alternate_yes(left_index, right_index)
 *
 * Same as the _no variant, plus preparation of the third ("alternate")
 * edge that is kept in ARM registers up to this point:
 *   - y_mid_point gets y_b in every 16-bit lane,
 *   - the 64-bit value edge_alt_high:edge_alt_low is shifted left by
 *     edge_shift_alt and moved to alternate_x_low,
 *   - edge_dx_dy_alt is sign-extended to 64 bits, shifted left by the same
 *     amount and replicated into both halves of alternate_dx_dy,
 *   - alternate_x_high = alternate_x_low + step, then the step is doubled.
 * temp is clobbered (holds 32 - edge_shift_alt).
 */
737 #define setup_spans_adjust_edges_alternate_yes(left_index, right_index)        \
738   setup_spans_adjust_edges_alternate_no(left_index, right_index);              \
739                                                                                \
740   vdup.u16 y_mid_point, y_b;                                                   \
741   rsb temp, edge_shift_alt, #32;                                               \
742                                                                                \
  /* 64-bit left shift done on a register pair */                              \
743   lsl edge_alt_high, edge_alt_high, edge_shift_alt;                            \
744   orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp;                    \
745   lsl edge_alt_low, edge_alt_low, edge_shift_alt;                              \
746   vmov alternate_x_low, edge_alt_low, edge_alt_high;                           \
747                                                                                \
  /* high word = bits shifted out of the low word, sign-filled */              \
748   asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp;                               \
749   lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt;                      \
750   vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high;           \
751   vmov alternate_dx_dy_high, alternate_dx_dy_low;                              \
752                                                                                \
753   vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low;             \
754   vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy                   \
755
756
/*
 * Build the per-row mask that says where the alternate edge applies:
 *   up:   rows with y <  y_mid_point (signed 16-bit compare)
 *   down: rows with y >  y_mid_point
 */
757 #define setup_spans_y_select_up()                                              \
758   vclt.s16 alternate_select, y_x4, y_mid_point                                 \
759
760 #define setup_spans_y_select_down()                                            \
761   vcgt.s16 alternate_select, y_x4, y_mid_point                                 \
762
763
/*
 * Where alternate_select is set, overwrite the left (low half) or right
 * (high half) x values of left_right_x_16 with the alternate edge x values.
 */
764 #define setup_spans_alternate_select_left()                                    \
765   vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select               \
766
767 #define setup_spans_alternate_select_right()                                   \
768   vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select              \
769
770
/*
 * setup_spans_set_x4_alternate_yes(alternate, direction)
 *
 * Emit four consecutive spans for a triangle with an active alternate edge.
 *   alternate - left | right: which side the alternate edge replaces on the
 *               rows selected by setup_spans_y_select_<direction>
 *   direction - up | down: row stepping direction for y and interpolants
 *
 * Per call:
 *   - four x values are extracted from each of the three 64-bit walkers
 *     (upper 32 bits, then narrowed to 16 bits) and the walkers advance by
 *     four rows,
 *   - uvrg and b are stored four times (one per row) through
 *     span_uvrg_offset / span_b_offset, stepping one row in between,
 *   - left/right x are clamped to [left_edge, right_edge], converted to
 *     width, number of 8-pixel blocks (capped at MAX_BLOCKS_PER_ROW) and a
 *     right-edge mask, then stored interleaved with y through
 *     span_edge_data,
 *   - y_x4 is stepped to the next group of four rows.
 * The NEON and ARM instructions are deliberately interleaved; keep the
 * statement order when editing.
 */
771 #define setup_spans_set_x4_alternate_yes(alternate, direction)                 \
  /* rows 0-1: keep the upper 32 bits of every 64-bit position */              \
772   vshrn.s64 alternate_x_32_low, alternate_x, #32;                              \
773   vshrn.s64 left_x_32_low, left_x, #32;                                        \
774   vshrn.s64 right_x_32_low, right_x, #32;                                      \
775                                                                                \
776   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
777   vadd.u64 left_x, left_x, left_dx_dy;                                         \
778   vadd.u64 right_x, right_x, right_dx_dy;                                      \
779                                                                                \
  /* rows 2-3 */                                                               \
780   vshrn.s64 alternate_x_32_high, alternate_x, #32;                             \
781   vshrn.s64 left_x_32_high, left_x, #32;                                       \
782   vshrn.s64 right_x_32_high, right_x, #32;                                     \
783                                                                                \
784   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
785   vadd.u64 left_x, left_x, left_dx_dy;                                         \
786   vadd.u64 right_x, right_x, right_dx_dy;                                      \
787                                                                                \
  /* narrow to 16 bits and splice in the alternate edge where selected */      \
788   vmovn.u32 alternate_x_16, alternate_x_32;                                    \
789   setup_spans_y_select_##direction();                                          \
790   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
791                                                                                \
792   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
793   setup_spans_alternate_select_##alternate();                                  \
794                                                                                \
795   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
796   str b, [span_b_offset], #4;                                                  \
797   setup_spans_adjust_interpolants_##direction();                               \
798                                                                                \
  /* clamp both x values from below to left_edge */                            \
799   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
800                                                                                \
801   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
802   str b, [span_b_offset], #4;                                                  \
803   setup_spans_adjust_interpolants_##direction();                               \
804                                                                                \
  /* clamp both x values from above to right_edge */                           \
805   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
806                                                                                \
807   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
808   str b, [span_b_offset], #4;                                                  \
809   setup_spans_adjust_interpolants_##direction();                               \
810                                                                                \
  /* high half becomes (right - left) + 7; low 3 bits select the mask shift */ \
811   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
812   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
813   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
814                                                                                \
815   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
816   str b, [span_b_offset], #4;                                                  \
817   setup_spans_adjust_interpolants_##direction();                               \
818                                                                                \
  /* block count = (width + 7) >> 3, capped; mask = c_0xFFFE << shift */       \
819   vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW;                                \
820   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
821   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
822   vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks;         \
823                                                                                \
  /* interleaved store of four 16-bit fields for each of the four rows */      \
824   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
825                                                                                \
826   setup_spans_adjust_y_##direction()                                           \
827
828
/*
 * setup_spans_set_x4_alternate_no(alternate, direction)
 *
 * Emit four consecutive spans for a triangle part bounded by just a left
 * and a right edge (no alternate edge).
 *   alternate - unused in this variant (callers pass "none")
 *   direction - up | down: row stepping direction for y and interpolants
 *
 * Per call: four left and four right x values are taken from the upper
 * 32 bits of the 64-bit walkers and narrowed to 16 bits; uvrg and b are
 * stored once per row with a one-row step in between; x is clamped to
 * [left_edge, right_edge]; width is turned into a block count (capped at
 * MAX_BLOCKS_PER_ROW) and a right-edge mask; the result is stored
 * interleaved with y through span_edge_data; y_x4 steps four rows.
 * The NEON and ARM instructions are deliberately interleaved; keep the
 * statement order when editing.
 */
829 #define setup_spans_set_x4_alternate_no(alternate, direction)                  \
  /* rows 0-1 */                                                               \
830   vshrn.s64 left_x_32_low, left_x, #32;                                        \
831   vshrn.s64 right_x_32_low, right_x, #32;                                      \
832                                                                                \
833   vadd.u64 left_x, left_x, left_dx_dy;                                         \
834   vadd.u64 right_x, right_x, right_dx_dy;                                      \
835                                                                                \
  /* rows 2-3 */                                                               \
836   vshrn.s64 left_x_32_high, left_x, #32;                                       \
837   vshrn.s64 right_x_32_high, right_x, #32;                                     \
838                                                                                \
839   vadd.u64 left_x, left_x, left_dx_dy;                                         \
840   vadd.u64 right_x, right_x, right_dx_dy;                                      \
841                                                                                \
842   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
843   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
844                                                                                \
845   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
846   str b, [span_b_offset], #4;                                                  \
847   setup_spans_adjust_interpolants_##direction();                               \
848                                                                                \
  /* clamp both x values from below to left_edge */                            \
849   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
850                                                                                \
851   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
852   str b, [span_b_offset], #4;                                                  \
853   setup_spans_adjust_interpolants_##direction();                               \
854                                                                                \
  /* clamp both x values from above to right_edge */                           \
855   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
856                                                                                \
857   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
858   str b, [span_b_offset], #4;                                                  \
859   setup_spans_adjust_interpolants_##direction();                               \
860                                                                                \
  /* high half becomes (right - left) + 7; low 3 bits select the mask shift */ \
861   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
862   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
863   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
864                                                                                \
865   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
866   str b, [span_b_offset], #4;                                                  \
867   setup_spans_adjust_interpolants_##direction();                               \
868                                                                                \
  /* block count = (width + 7) >> 3, capped; mask = c_0xFFFE << shift */       \
869   vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW;                                \
870   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
871   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
872   vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks;         \
873                                                                                \
874   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
875                                                                                \
876   setup_spans_adjust_y_##direction()                                           \
877
878
/* Scratch register pair for the 64-bit product in the macro below. */
879 #define edge_adjust_low           r11
880 #define edge_adjust_high          r12
881
/*
 * Move the alternate edge back by height_minor_a rows:
 *   edge_alt -= (s64)edge_dx_dy_alt * height_minor_a
 * done as a 64-bit subtract on the edge_alt_high:edge_alt_low pair.
 * The "no" variant expands to nothing.
 */
882 #define setup_spans_alternate_adjust_yes()                                     \
883   smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a;     \
884   subs edge_alt_low, edge_alt_low, edge_adjust_low;                            \
885   sbc edge_alt_high, edge_alt_high, edge_adjust_high                           \
886
887 #define setup_spans_alternate_adjust_no()                                      \
888
889
/*
 * setup_spans_down(left_index, right_index, alternate, alternate_active)
 *
 * Generate spans walking downwards (increasing y) from y_a.
 *   left_index / right_index - left | right, forwarded to
 *                              setup_spans_adjust_edges_alternate_*
 *   alternate                - left | right | none, side replaced by the
 *                              alternate edge
 *   alternate_active         - yes | no, selects the *_alternate_yes/no
 *                              helper variants
 *
 * Steps: clip the bottom against viewport_end_y and the top against
 * viewport_start_y (fast-forwarding edges and interpolants over clipped
 * rows), bail out to label 1 if no rows remain, cap the row count at 512,
 * record it in psx_gpu->num_spans, then emit spans four rows at a time.
 * Local labels 0/1/2 are used; height ends as the non-positive remainder
 * (0 .. -3) of the four-row loop.
 */
890 #define setup_spans_down(left_index, right_index, alternate, alternate_active) \
891   setup_spans_alternate_adjust_##alternate_active();                           \
892   setup_spans_load_b();                                                        \
893                                                                                \
  /* bottom clip: if y_c > viewport_end_y, drop the rows below it */           \
894   ldrsh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset];                       \
895   subs y_c, y_c, temp;                                                         \
896   subgt height, height, y_c;                                                   \
897   addgt height, height, #1;                                                    \
898                                                                                \
  /* top clip: clip = viewport_start_y - y_a rows to skip when positive */     \
899   ldrsh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset];                     \
900   subs clip, temp, y_a;                                                        \
901   ble 0f;                                                                      \
902                                                                                \
903   sub height, height, clip;                                                    \
904   add y_a, y_a, clip;                                                          \
905   setup_spans_clip(increment, alternate_active);                               \
906                                                                                \
907  0:                                                                            \
908   cmp height, #0;                                                              \
909   ble 1f;                                                                      \
910                                                                                \
  /* build y_x4 = { y, y+1, y+2, y+3 } as packed 16-bit values; the movgt */   \
  /* uses the flags of cmp height, #512 (the add in between keeps them) */     \
911   orr temp, y_a, y_a, lsl #16;                                                 \
912   cmp height, #512;                                                            \
913   add temp, temp, #(1 << 16);                                                  \
914   movgt height, #512;                                                          \
915   add y_a, temp, #2;                                                           \
916   add y_a, y_a, #(2 << 16);                                                    \
917   vmov y_x4, temp, y_a;                                                        \
918                                                                                \
919   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
920    right_index);                                                               \
921   setup_spans_prologue_b();                                                    \
922                                                                                \
923   strh height, [psx_gpu, #psx_gpu_num_spans_offset];                           \
924                                                                                \
925  2:                                                                            \
926   setup_spans_set_x4_alternate_##alternate_active(alternate, down);            \
927   subs height, height, #4;                                                     \
928   bhi 2b;                                                                      \
929                                                                                \
  /* texture hack: when either AHACK_TEXTURE_ADJ_U or _V is active, copy */    \
  /* `uv` of the second-to-last requested span over the last one; temp */      \
  /* points one uvrg entry past the last requested span because height */      \
  /* is the non-positive loop remainder here. */                               \
  /* NOTE(review): `uv` is defined outside this chunk - presumably the */      \
  /* low 64 bits of uvrg; confirm. */                                          \
930   nop;                                                                         \
931   ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset];                           \
932   tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V);                      \
933   beq 1f;                                                                      \
934   add temp, span_uvrg_offset, height, lsl #4;                                  \
935   vldr uv, [temp, #(-16*2)];                                                   \
936   vstr uv, [temp, #(-16)];                                                     \
937                                                                                \
938  1:                                                                            \
939
940
/*
 * Advance the scalar alternate edge by one row: 64-bit add of the
 * sign-extended edge_dx_dy_alt to edge_alt_high:edge_alt_low.
 * The "no" variant expands to nothing.
 */
941 #define setup_spans_alternate_pre_increment_yes()                              \
942   adds edge_alt_low, edge_alt_low, edge_dx_dy_alt;                             \
943   adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31                    \
944
945 #define setup_spans_alternate_pre_increment_no()                               \
946
947
/*
 * Conditionally drop one row from height. The "le" condition consumes the
 * flags left by the preceding `subs temp, temp, y_c` in setup_spans_up,
 * i.e. it fires when the top was NOT clipped against the viewport.
 * The "no" variant expands to nothing.
 */
948 #define setup_spans_up_decrement_yes()                                         \
949   suble height, height, #1                                                     \
950
951 #define setup_spans_up_decrement_no()                                          \
952
953
/*
 * setup_spans_up(left_index, right_index, alternate, alternate_active)
 *
 * Generate spans walking upwards (decreasing y), starting one row above
 * y_a. Parameters have the same meaning as in setup_spans_down.
 *
 * Steps: clip the top against viewport_start_y and the start row against
 * viewport_end_y, bail out to label 1 if no rows remain, cap the row count
 * at 512, pre-step edges and interpolants by one row, record the count in
 * psx_gpu->num_spans, then emit spans four rows at a time.
 *
 * NOTE(review): the viewport limits are loaded with ldrh (zero-extended)
 * here, whereas setup_spans_down uses ldrsh for the same fields - confirm
 * which signedness is intended.
 * NOTE(review): the tail below tests only AHACK_TEXTURE_ADJ_V, while
 * setup_spans_down tests ADJ_U | ADJ_V - confirm this is intentional.
 */
954 #define setup_spans_up(left_index, right_index, alternate, alternate_active)   \
955   setup_spans_alternate_adjust_##alternate_active();                           \
956   setup_spans_load_b();                                                        \
957   sub y_a, y_a, #1;                                                            \
958                                                                                \
  /* top clip: if viewport_start_y > y_c, drop the rows above it; the */       \
  /* flags from this subs also drive setup_spans_up_decrement_yes */           \
959   ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset];                      \
960   subs temp, temp, y_c;                                                        \
961   subgt height, height, temp;                                                  \
962   setup_spans_up_decrement_##alternate_active();                               \
963                                                                                \
  /* bottom clip: clip = y_a - viewport_end_y rows to skip when positive */    \
964   ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset];                        \
965   subs clip, y_a, temp;                                                        \
966   ble 0f;                                                                      \
967                                                                                \
968   sub height, height, clip;                                                    \
969   sub y_a, y_a, clip;                                                          \
970   setup_spans_clip(decrement, alternate_active);                               \
971                                                                                \
972  0:                                                                            \
973   cmp height, #0;                                                              \
974   ble 1f;                                                                      \
975                                                                                \
  /* build y_x4 = { y, y-1, y-2, y-3 } as packed 16-bit values; the movgt */   \
  /* uses the flags of cmp height, #512 (the sub in between keeps them) */     \
976   orr temp, y_a, y_a, lsl #16;                                                 \
977   cmp height, #512;                                                            \
978   sub temp, temp, #(1 << 16);                                                  \
979   movgt height, #512;                                                          \
980   sub y_a, temp, #2;                                                           \
981   sub y_a, y_a, #(2 << 16);                                                    \
982   vmov y_x4, temp, y_a;                                                        \
983                                                                                \
  /* step both vector edges by one row before walking */                       \
984   vaddw.s32 edges_xy, edges_xy, edges_dx_dy;                                   \
985                                                                                \
986   setup_spans_alternate_pre_increment_##alternate_active();                    \
987   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
988    right_index);                                                               \
989   setup_spans_adjust_interpolants_up();                                        \
990   setup_spans_prologue_b();                                                    \
991                                                                                \
992   strh height, [psx_gpu, #psx_gpu_num_spans_offset];                           \
993                                                                                \
994  2:                                                                            \
995   setup_spans_set_x4_alternate_##alternate_active(alternate, up);              \
996   subs height, height, #4;                                                     \
997   bhi 2b;                                                                      \
998                                                                                \
  /* texture hack: when AHACK_TEXTURE_ADJ_V is active, copy `uv` from the */   \
  /* entry at byte offset 16 to the entry at byte offset 0 of the area at */   \
  /* psx_gpu + psx_gpu_span_uvrg_offset_offset (second entry -> first) */      \
999   nop;                                                                         \
1000   ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset];                           \
1001   tst temp, #AHACK_TEXTURE_ADJ_V;                                              \
1002   beq 1f;                                                                      \
1003   add temp, psx_gpu, #psx_gpu_span_uvrg_offset_offset;                         \
1004   vldr uv, [temp, #16];                                                        \
1005   vstr uv, [temp, #0];                                                         \
1006                                                                                \
1007  1:                                                                            \
1008
1009
/*
 * Common exit for every setup_spans_* function: restore_abi_regs() (empty
 * in the currently selected #else configuration), then pop r4-r11 and
 * return by loading pc from the stack.
 * NOTE(review): presumably mirrors the push done by setup_spans_prologue(),
 * which is defined outside this chunk.
 */
1010 #define setup_spans_epilogue()                                                 \
1011   restore_abi_regs();                                                          \
1012   ldmia sp!, { r4 - r11, pc }                                                  \
1013
1014
/*
 * setup_spans_up_up(minor, major)
 *
 * Body shared by setup_spans_up_left / setup_spans_up_right: a triangle
 * walked upwards over its whole height with an alternate edge active.
 *   minor, major - left | right; forwarded to setup_spans_up as
 *                  (left_index = major, right_index = minor,
 *                   alternate = minor)
 * Row counts: height_minor_a = y_a - y_b, height_minor_b = y_b - y_c,
 * height = y_a - y_c. Both edges start at x_a; they end at x_c and x_b.
 * NOTE(review): height_major is passed to compute_edge_delta_x3 without
 * being assigned here, so it presumably aliases `height` - confirm against
 * the register definitions above this chunk.
 */
1015 #define setup_spans_up_up(minor, major)                                        \
1016   setup_spans_prologue();                                                      \
1017   sub height_minor_a, y_a, y_b;                                                \
1018   sub height_minor_b, y_b, y_c;                                                \
1019   sub height, y_a, y_c;                                                        \
1020                                                                                \
1021   vdup.u32 x_starts, x_a;                                                      \
1022   vmov x_ends, x_c, x_b;                                                       \
1023                                                                                \
1024   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1025   setup_spans_up(major, minor, minor, yes);                                    \
1026   setup_spans_epilogue()                                                       \
1027
/* Entry point: upward walk, alternate edge on the left side. */
1028 function(setup_spans_up_left)
1029   setup_spans_up_up(left, right)
1030
/* Entry point: upward walk, alternate edge on the right side. */
1031 function(setup_spans_up_right)
1032   setup_spans_up_up(right, left)
1033
/*
 * setup_spans_down_down(minor, major)
 *
 * Body shared by setup_spans_down_left / setup_spans_down_right: a
 * triangle walked downwards over its whole height with an alternate edge
 * active.
 *   minor, major - left | right; forwarded to setup_spans_down as
 *                  (left_index = major, right_index = minor,
 *                   alternate = minor)
 * Row counts: height_minor_a = y_b - y_a, height_minor_b = y_c - y_b,
 * height = y_c - y_a. Both edges start at x_a; they end at x_c and x_b.
 * NOTE(review): height_major is passed to compute_edge_delta_x3 without
 * being assigned here, so it presumably aliases `height` - confirm against
 * the register definitions above this chunk.
 */
1034 #define setup_spans_down_down(minor, major)                                    \
1035   setup_spans_prologue();                                                      \
1036   sub height_minor_a, y_b, y_a;                                                \
1037   sub height_minor_b, y_c, y_b;                                                \
1038   sub height, y_c, y_a;                                                        \
1039                                                                                \
1040   vdup.u32 x_starts, x_a;                                                      \
1041   vmov x_ends, x_c, x_b;                                                       \
1042                                                                                \
1043   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1044   setup_spans_down(major, minor, minor, yes);                                  \
1045   setup_spans_epilogue()                                                       \
1046
/* Entry point: downward walk, alternate edge on the left side. */
1047 function(setup_spans_down_left)
1048   setup_spans_down_down(left, right)
1049
/* Entry point: downward walk, alternate edge on the right side. */
1050 function(setup_spans_down_right)
1051   setup_spans_down_down(right, left)
1052
1053
/*
 * setup_spans_up_flat()
 *
 * Shared tail of setup_spans_up_a / setup_spans_up_b: two-edge triangle
 * walked upwards with no alternate edge. height = y_a - y_c; the callers
 * must have loaded x_starts / x_ends before invoking this macro.
 */
1054 #define setup_spans_up_flat()                                                  \
1055   sub height, y_a, y_c;                                                        \
1056                                                                                \
1057   compute_edge_delta_x2();                                                     \
1058   setup_spans_up(left, right, none, no);                                       \
1059   setup_spans_epilogue()                                                       \
1060
/* Entry point: the edges start at x_a and x_b and both end at x_c. */
1061 function(setup_spans_up_a)
1062   setup_spans_prologue()
1063
1064   vmov x_starts, x_a, x_b
1065   vdup.u32 x_ends, x_c
1066
1067   setup_spans_up_flat()
1068
/* Entry point: both edges start at x_a and end at x_b and x_c. */
1069 function(setup_spans_up_b)
1070   setup_spans_prologue()
1071
1072   vdup.u32 x_starts, x_a
1073   vmov x_ends, x_b, x_c
1074
1075   setup_spans_up_flat()
1076
/*
 * setup_spans_down_flat()
 *
 * Shared tail of setup_spans_down_a / setup_spans_down_b: two-edge
 * triangle walked downwards with no alternate edge. height = y_c - y_a;
 * the callers must have loaded x_starts / x_ends before invoking this
 * macro.
 */
1077 #define setup_spans_down_flat()                                                \
1078   sub height, y_c, y_a;                                                        \
1079                                                                                \
1080   compute_edge_delta_x2();                                                     \
1081   setup_spans_down(left, right, none, no);                                     \
1082   setup_spans_epilogue()                                                       \
1083
/* Entry point: the edges start at x_a and x_b and both end at x_c. */
1084 function(setup_spans_down_a)
1085   setup_spans_prologue()
1086
1087   vmov x_starts, x_a, x_b
1088   vdup.u32 x_ends, x_c
1089
1090   setup_spans_down_flat()
1091
/* Entry point: both edges start at x_a and end at x_b and x_c. */
1092 function(setup_spans_down_b)
1093   setup_spans_prologue()
1094
1095   vdup.u32 x_starts, x_a
1096   vmov x_ends, x_b, x_c
1097
1098   setup_spans_down_flat()
1099
1100
/*
 * Register aliases used only by setup_spans_up_down.
 * The *_b registers hold the second edge set (used for the downward half)
 * while the first set is being walked. q13 overlays d26/d27 and q11
 * overlays d22/d23, so the q aliases move both halves at once.
 */
1101 #define middle_y                                          r9
1102
1103 #define edges_xy_b                                        q11
1104 #define edges_dx_dy_b                                     d26
1105 #define edge_shifts_b                                     d27
1106 #define edges_dx_dy_and_shifts_b                          q13
1107 #define height_increment                                  d20
1108
1109 #define edges_dx_dy_and_shifts                            q1
1110
1111 #define edges_xy_b_left                                   d22
1112 #define edges_xy_b_right                                  d23
1113
/*
 * Make edge set b the active one: copy its positions into edges_xy and
 * its steps + shifts into q1.
 * NOTE(review): assumes edges_dx_dy and edge_shifts are the two halves of
 * q1; those aliases are defined outside this chunk - confirm.
 */
1114 #define setup_spans_up_down_load_edge_set_b()                                  \
1115   vmov edges_xy, edges_xy_b;                                                   \
1116   vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b                        \
1117
1118
// setup_spans_up_down: span setup for a triangle whose start vertex (y_a)
// lies between the other two, so spans are generated first upwards from
// y_a towards y_b and then downwards from y_a towards y_c.
//
// Row counts: height_minor_a = y_a - y_b (upper part),
//             height_minor_b = y_c - y_a (lower part),
//             height_major   = y_c - y_b.
// Two edge sets are prepared up front; set b (for the lower part) is parked
// in the *_b registers and activated via
// setup_spans_up_down_load_edge_set_b().
//
// Local labels: 0 = after clipping, 1 = exit, 2 = four-row emit loop,
//               3 = upper part empty, 4 = start of lower part,
//               5 = num_spans == MAX_SPANS corner case.
//
// Fix versus the previous revision: the texture-hack tail of the lower part
// indexed with `height`, which is never assigned in this function; it now
// uses height_minor_b, the loop remainder, matching the equivalent tail in
// setup_spans_down.
//
// NOTE(review): viewport limits are loaded with ldrh here while
// setup_spans_down uses ldrsh for the same fields - confirm signedness.
1119 function(setup_spans_up_down)
1120   setup_spans_prologue()
1121
1122   // s32 middle_y = y_a;
1123   sub height_minor_a, y_a, y_b
1124   sub height_minor_b, y_c, y_a
1125   sub height_major, y_c, y_b
1126
1127   vmov x_starts, x_a, x_c
1128   vdup.u32 x_ends, x_b
1129
1130   compute_edge_delta_x3(x_a, height_minor_a, height_major)
1131
  // advance only lane 1 of edges_xy by height_minor_b rows (lane 0 gets 0)
1132   mov temp, #0
1133   vmov height_increment, temp, height_minor_b
1134   vmlal.s32 edges_xy, edges_dx_dy, height_increment
1135
  // edge set b: left = alternate edge, right = current right edge;
  // shifts/steps = current ones (steps negated) with lane 0 replaced by the
  // alternate edge values
1136   vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1137   vmov edges_xy_b_right, edges_xy_right
1138
1139   vmov edge_shifts_b, edge_shifts
1140   vmov.u32 edge_shifts_b[0], edge_shift_alt
1141
1142   vneg.s32 edges_dx_dy_b, edges_dx_dy
1143   vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1144
1145   mov middle_y, y_a
1146   
  // ---- upper part: walk up from y_a - 1 ----
1147   setup_spans_load_b()
1148   sub y_a, y_a, #1
1149
  // top clip against viewport_start_y
1150   ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
1151   subs temp, temp, y_b
1152   subgt height_minor_a, height_minor_a, temp
1153
  // start-row clip against viewport_end_y
1154   ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
1155   subs clip, y_a, temp
1156   ble 0f
1157
1158   sub height_minor_a, height_minor_a, clip
1159   sub y_a, y_a, clip
1160   setup_spans_clip(decrement, no)
1161
1162  0:                                                                
1163   cmp height_minor_a, #0
1164   ble 3f
1165
  // y_x4 = { y, y-1, y-2, y-3 } as packed 16-bit values
1166   orr temp, y_a, y_a, lsl #16
1167   sub temp, temp, #(1 << 16)
1168   sub y_a, temp, #2
1169   sub y_a, y_a, #(2 << 16)
1170   vmov y_x4, temp, y_a
1171
1172   vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1173
1174   strh height_minor_a, [psx_gpu, #psx_gpu_num_spans_offset]
1175
1176   setup_spans_adjust_edges_alternate_no(left, right); 
1177   setup_spans_adjust_interpolants_up()
1178   setup_spans_up_down_load_edge_set_b()
1179
1180   setup_spans_prologue_b()
1181
1182
1183  2: 
1184   setup_spans_set_x4_alternate_no(none, up)
1185   subs height_minor_a, height_minor_a, #4
1186   bhi 2b
1187
  // height_minor_a is now the non-positive loop remainder: rewind the three
  // output pointers over the rows emitted beyond the requested count
  // (8 bytes per edge entry, 16 per uvrg entry, 4 per b entry)
1188   add span_edge_data, span_edge_data, height_minor_a, lsl #3
1189   add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1190   add span_b_offset, span_b_offset, height_minor_a, lsl #2
1191
  // ---- lower part: walk down from middle_y ----
1192  4:
  // reload the starting interpolants and the original start row
1193   add temp, psx_gpu, #psx_gpu_uvrg_offset
1194   vld1.32 { uvrg }, [temp]
1195   mov y_a, middle_y
1196   
1197   setup_spans_load_b()
1198
  // bottom clip against viewport_end_y
1199   ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
1200   subs y_c, y_c, temp
1201   subgt height_minor_b, height_minor_b, y_c
1202   addgt height_minor_b, height_minor_b, #1
1203
  // top clip against viewport_start_y
1204   ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
1205   subs clip, temp, y_a
1206   ble 0f
1207
1208   sub height_minor_b, height_minor_b, clip
1209   add y_a, y_a, clip
1210   setup_spans_clip(increment, no)
1211
1212  0:
1213   cmp height_minor_b, #0
1214   ble 1f
1215
  // y_x4 = { y, y+1, y+2, y+3 } as packed 16-bit values
1216   orr temp, y_a, y_a, lsl #16
1217   add temp, temp, #(1 << 16) 
1218   add y_a, temp, #2
1219   add y_a, y_a, #(2 << 16)
1220   vmov y_x4, temp, y_a
1221
1222   setup_spans_adjust_edges_alternate_no(left, right)
1223
  // num_spans += height_minor_b, with a special path when the sum equals
  // MAX_SPANS exactly
1224   ldrh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1225   add temp, temp, height_minor_b
1226
1227   cmp temp, #MAX_SPANS
1228   beq 5f
1229
1230   strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1231
1232  2:                                                     
1233   setup_spans_set_x4_alternate_no(none, down)
1234   subs height_minor_b, height_minor_b, #4
1235   bhi 2b
1236
  // texture hack: when either AHACK_TEXTURE_ADJ_U or _V is active, copy
  // `uv` of the second-to-last requested span over the last one.
  // height_minor_b is the non-positive loop remainder here, so temp points
  // one uvrg entry past the last requested span.
1237   nop
1238   ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]
1239   tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)
1240   beq 1f
1241   add temp, span_uvrg_offset, height_minor_b, lsl #4
1242   vldr uv, [temp, #(-16*2)]
1243   vstr uv, [temp, #(-16)]
1244
1245  1:
1246   setup_spans_epilogue()
1247
  // upper part produced no rows: activate edge set b and go straight to the
  // lower part
1248  3:
1249   setup_spans_up_down_load_edge_set_b()
1250   setup_spans_prologue_b()
1251   bal 4b
1252
  // num_spans would be exactly MAX_SPANS: round height_minor_b down to a
  // multiple of 4 and store the reduced total; the bne below uses the flags
  // from bics (add and strh leave them untouched)
1253  5:
1254   // FIXME: overflow corner case
1255   sub temp, temp, height_minor_b
1256   bics height_minor_b, #3
1257   add temp, temp, height_minor_b
1258   strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1259   bne 2b
1260   bal 1b
1261
/*
 * Drop the aliases from the preceding code that are given new register
 * assignments for the setup_blocks routines below.
 */
1262 #undef span_uvrg_offset
1263 #undef span_edge_data
1264 #undef span_b_offset
1265 #undef left_x
1266 #undef b
1267
/*
 * Core-register aliases for the setup_blocks_* routines.
 * Several names share one register, so two names that map to the same
 * register must never be live at the same time.
 */
1268 #define psx_gpu                                  r0
1269 #define num_spans                                r1
1270 #define span_uvrg_offset                         r2
1271 #define span_edge_data                           r3
1272 #define span_b_offset                            r4
1273 #define b_dx                                     r5
1274 #define span_num_blocks                          r6
1275 #define y                                        r7
1276 #define left_x                                   r8
1277 #define b                                        r9
1278 #define dither_offset_ptr                        r10
1279 #define block_ptr_a                              r11
1280 #define fb_ptr                                   r12
1281 #define num_blocks                               r14
1282
/* Second names for r2, r3, r6, r8 and r10 from the list above. */
1283 #define uvrg_dx_ptr                              r2
1284 #define texture_mask_ptr                         r3
1285 #define hacks_active                             r6
1286 #define dither_shift                             r8
1287 #define dither_row                               r10
1288
/* Further reuse of r7 (y), r8 (left_x), r9 (b) and r10. */
1289 #define c_32                                     r7
1290 #define b_dx4                                    r8
1291 #define b_dx8                                    r9
1292 #define block_ptr_b                              r10
1293
1294 #define block_span_ptr                           r10
1295 #define right_mask                               r8
1296
/*
 * Not referenced by the shaded-textured builder below; presumably used by
 * the untextured setup_blocks variants - TODO confirm.
 */
1297 #define color                                    r2
1298 #define color_r                                  r3
1299 #define color_g                                  r4
1300 #define color_b                                  r5
1301
/*
 * NEON register aliases for the setup_blocks_* routines.
 * uvrg and uv are redefined further down, hence the two #undefs.
 * On ARM NEON qN overlays d(2N) and d(2N+1), so many of the names below
 * refer to the same storage: e.g. uv_dx4/rg_dx4 are the two halves of
 * uvrg_dx4, and dx4, dx8, uv_whole_8 and draw_mask_edge are all q13.
 */
1302 #undef uvrg
1303 #undef uv
1304
/* Per-channel accumulators, q0-q4. */
1305 #define u_block                                  q0
1306 #define v_block                                  q1
1307 #define r_block                                  q2
1308 #define g_block                                  q3
1309 #define b_block                                  q4
1310
/* d10/d11 = q5, d12/d13 = q6, d14/d15 = q7. */
1311 #define uv_dx4                                   d10
1312 #define rg_dx4                                   d11
1313 #define uv_dx8                                   d12
1314 #define rg_dx8                                   d13
1315 #define b_whole_8                                d14
1316 #define fb_mask_ptrs                             d15
1317
/* uv_dx8 and rg_dx8 below repeat the identical definitions given above. */
1318 #define uvrg_dx4                                 q5
1319 #define uvrg_dx8                                 q6
1320 #define uv_dx8                                   d12
1321 #define rg_dx8                                   d13
1322
/* q8-q12; the _low/_high names further down are their d-register halves. */
1323 #define u_whole                                  q8
1324 #define v_whole                                  q9
1325 #define r_whole                                  q10
1326 #define g_whole                                  q11
1327 #define b_whole                                  q12
1328
1329 #define u_whole_low                              d16
1330 #define u_whole_high                             d17
1331 #define v_whole_low                              d18
1332 #define v_whole_high                             d19
1333 #define r_whole_low                              d20
1334 #define r_whole_high                             d21
1335 #define g_whole_low                              d22
1336 #define g_whole_high                             d23
1337 #define b_whole_low                              d24
1338 #define b_whole_high                             d25
1339
/* Both names map to q13 (d26/d27), which u_whole_8/v_whole_8 also occupy. */
1340 #define dx4                                      q13
1341 #define dx8                                      q13
1342
/* d24/d25 are the halves of b_whole (q12); u_whole_8b and r_whole_8 share d24. */
1343 #define u_whole_8                                d26
1344 #define v_whole_8                                d27
1345 #define u_whole_8b                               d24
1346 #define r_whole_8                                d24
1347 #define g_whole_8                                d25
1348
/* uv_whole_8b shares q14 with dither_offsets. */
1349 #define uv_whole_8                               q13
1350 #define uv_whole_8b                              q14
1351
1352 #define dither_offsets                           q14
1353 #define texture_mask                             q15
1354 #define texture_mask_u                           d30
1355 #define texture_mask_v                           d31
1356
1357 #define dither_offsets_short                     d28
1358
/* Same registers as u_whole (q8), v_whole (q9) and r_whole (q10). */
1359 #define v_left_x                                 q8
1360 #define uvrg                                     q9
1361 #define block_span                               q10
1362
/* The two halves of uvrg (q9). */
1363 #define uv                                       d18
1364 #define rg                                       d19
1365
/* draw_mask shares q1 with v_block, test_mask shares q0 with u_block. */
1366 #define draw_mask                                q1
1367 #define draw_mask_edge                           q13
1368 #define test_mask                                q0
1369
/* Same register as g_block (q3). */
1370 #define uvrg_dx                                  q3
1371
/* Same register as r_block (q2). */
1372 #define colors                                   q2
1373
/*
 * setup_blocks_texture_swizzled()
 *
 * Per byte lane, moves the low nibble of v_whole_8 into the high nibble of
 * u_whole_8 and the high nibble of u into the low nibble of v_whole_8:
 *   u_whole_8 = (v << 4) | (u & 0x0f)
 *   v_whole_8 = (v & 0xf0) | ((u & texture_mask_u) >> 4)
 * u_whole_8b (d24) is overwritten with the masked copy of u.
 * The instruction order matters: the copy of u must be taken before vsli
 * rewrites u_whole_8, and vsli must read v_whole_8 before vsri rewrites it.
 */
1374 #define setup_blocks_texture_swizzled()                                        \
1375   vand.u8 u_whole_8b, u_whole_8, texture_mask_u;                               \
1376   vsli.u8 u_whole_8, v_whole_8, #4;                                            \
1377   vsri.u8 v_whole_8, u_whole_8b, #4                                            \
1378
/*
 * setup_blocks_texture_unswizzled()
 *
 * Expands to nothing: in the unswizzled variant u_whole_8 and v_whole_8 are
 * left exactly as the builder computed them.
 */
1379 #define setup_blocks_texture_unswizzled()                                      \
1380
/*
 * setup_blocks_uv_adj_hack_textured(hacks_active)
 *
 * If AHACK_TEXTURE_ADJ_U or AHACK_TEXTURE_ADJ_V is set in hacks_active,
 * calls setup_blocks_uv_adj_hack with
 *   r0 = psx_gpu (unchanged), r1 = block_ptr_a - 64,
 *   r2 = span_edge_data,      r3 = span_uvrg_offset;
 * otherwise it branches straight to local label 91.
 * In the shaded-textured builder block_ptr_a has advanced 64 bytes per
 * stored block, so r1 addresses the block that was stored last.
 * texture_mask is kept in the psx_gpu saved_tmp area across the call and
 * uvrg_dx8 is recomputed from uvrg_dx4 afterwards (presumably because the
 * callee may clobber those NEON registers - confirm against
 * setup_blocks_uv_adj_hack). The tst changes the condition flags.
 */
1381 #define setup_blocks_uv_adj_hack_textured(hacks_active)                        \
1382   tst hacks_active, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V);              \
1383   beq 91f;                                                                     \
1384                                                                                \
1385   /* pushing odd num of regs here realigns our unaligned stack */              \
1386   vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
1387   vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
1388   push { r0 - r4, EXTRA_UNSAVED_REGS r12, r14 };                               \
       /* r2 is overwritten below, so span_uvrg_offset travels via r12 to r3. */    \
1389   mov r12, span_uvrg_offset;                                                   \
1390   sub r1, block_ptr_a, #64;                                                    \
1391   mov r2, span_edge_data;                                                      \
1392   mov r3, r12;                                                                 \
1393   bl setup_blocks_uv_adj_hack; /* psx_gpu=r0 */                                \
1394   pop  { r0 - r4, EXTRA_UNSAVED_REGS r12, r14 };                               \
1395   vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
1396   vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
1397                                                                                \
1398   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1399 91:                                                                            \
1400
1401
/*
 * setup_blocks_shaded_textured_builder(swizzling)
 *
 * Emits setup_blocks_shaded_textured_dithered_<swizzling>_indirect.
 * In: r0 = psx_gpu. Returns at once when the num_spans field is zero.
 *
 * For every span the routine walks the blocks of the span, 8 pixels per
 * block, and appends one 64-byte entry per block to the psx_gpu blocks
 * array:
 *   +0   u/v bytes, interleaved per pixel (masked, optionally swizzled)
 *   +16  8 r bytes followed by 8 g bytes
 *   +32  8 b bytes followed by fb_mask_ptrs (word 0: zero, or right_mask
 *        on the last block of a span; word 1: fb_ptr of the block)
 *   +48  dither_offsets
 * When adding a span would raise num_blocks above MAX_BLOCKS, the queued
 * blocks are first handed to flush_render_block_buffer (label 2).
 *
 * Local labels: 0 = next span, 3 = span setup, 4 = per-block loop,
 * 5 = last block of the span, 1 = advance to the following span,
 * 2 = flush path.
 */
1402 #define setup_blocks_shaded_textured_builder(swizzling)                        \
1403 .align 3;                                                                      \
1404                                                                                \
1405 function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
       /* Read the span count and uvrg_dx; return early when there are no spans. */ \
1406   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
1407   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1408                                                                                \
1409   vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128];                                   \
1410   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1411                                                                                \
1412   cmp num_spans, #0;                                                           \
1413   bxeq lr;                                                                     \
1414                                                                                \
1415   stmdb sp!, { r4 - r11, r14 };                                                \
1416   save_abi_regs();                                                             \
       /* uvrg_dx4 = uvrg_dx * 4, uvrg_dx8 = uvrg_dx * 8 (per channel lane). */     \
1417   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1418                                                                                \
1419   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
1420   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1421                                                                                \
       /* Broadcast two consecutive mask bytes: first to _u, second to _v. */       \
1422   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16];     \
1423   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1424                                                                                \
1425   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1426   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1427                                                                                \
1428   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
       /* block_ptr_a = blocks array + num_blocks * 64 bytes. */                    \
1429   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1430                                                                                \
1431   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1432                                                                                \
1433  0:                                                                            \
       /* Per span. Word 0 of fb_mask_ptrs stays 0 except on the last block. */     \
1434   vmov.u8 fb_mask_ptrs, #0;                                                    \
1435                                                                                \
1436   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
1437   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1438                                                                                \
1439   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
1440   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
1441                                                                                \
       /* A span with no blocks is skipped. */                                      \
1442   cmp span_num_blocks, #0;                                                     \
1443   beq 1f;                                                                      \
1444                                                                                \
1445   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
1446   add num_blocks, span_num_blocks, num_blocks;                                 \
1447                                                                                \
       /* Over MAX_BLOCKS with this span added: flush at 2, then resume at 3. */    \
1448   cmp num_blocks, #MAX_BLOCKS;                                                 \
1449   bgt 2f;                                                                      \
1450                                                                                \
1451  3:                                                                            \
       /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2; b += b_dx * left_x. */     \
       /* dither_row = table word for y & 3, rotated by (left_x & 3) * 8 bits. */   \
1452   ldr b, [span_b_offset];                                                      \
1453   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1454                                                                                \
1455   vdup.u32 v_left_x, left_x;                                                   \
1456   and y, y, #0x3;                                                              \
1457                                                                                \
1458   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
1459   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1460                                                                                \
1461   mla b, b_dx, left_x, b;                                                      \
1462   and dither_shift, left_x, #0x03;                                             \
1463                                                                                \
       /* q3 doubles as g_block, so uvrg_dx is rebuilt from uvrg_dx4 here. */       \
1464   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
1465   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1466                                                                                \
1467   mov dither_shift, dither_shift, lsl #3;                                      \
1468   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1469                                                                                \
1470   mov c_32, #32;                                                               \
       /* The flags set by this subs are consumed by the beq 5f further down. */    \
1471   subs span_num_blocks, span_num_blocks, #1;                                   \
1472                                                                                \
1473   mov dither_row, dither_row, ror dither_shift;                                \
1474   mov b_dx4, b_dx, lsl #2;                                                     \
1475                                                                                \
1476   vdup.u32 dither_offsets_short, dither_row;                                   \
1477   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1478                                                                                \
1479   vdup.u32 b_block, b;                                                         \
       /* Widen the dither bytes to signed 16-bit lanes, shifted left by 4. */      \
1480   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1481                                                                                \
1482   vdup.u32 u_block, uv[0];                                                     \
1483   mov b_dx8, b_dx, lsl #3;                                                     \
1484                                                                                \
1485   vdup.u32 v_block, uv[1];                                                     \
1486   vdup.u32 r_block, rg[0];                                                     \
1487   vdup.u32 g_block, rg[1];                                                     \
1488                                                                                \
       /* Add the five block_span vectors (u, v, r, g, b order) lane by lane. */    \
1489   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1490                                                                                \
1491   vadd.u32 u_block, u_block, block_span;                                       \
1492   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1493                                                                                \
1494   vadd.u32 v_block, v_block, block_span;                                       \
1495   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1496                                                                                \
1497   vadd.u32 r_block, r_block, block_span;                                       \
1498   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1499                                                                                \
1500   vadd.u32 g_block, g_block, block_span;                                       \
1501   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
1502                                                                                \
1503   vadd.u32 b_block, b_block, block_span;                                       \
1504   add block_ptr_b, block_ptr_a, #16;                                           \
1505                                                                                \
       /* low = bits 31:16 of the lanes; high = bits 31:16 of (lanes + dx4). */     \
1506   vshrn.u32 u_whole_low, u_block, #16;                                         \
1507   vshrn.u32 v_whole_low, v_block, #16;                                         \
1508   vshrn.u32 r_whole_low, r_block, #16;                                         \
1509   vshrn.u32 g_whole_low, g_block, #16;                                         \
1510                                                                                \
1511   vdup.u32 dx4, uv_dx4[0];                                                     \
1512   vshrn.u32 b_whole_low, b_block, #16;                                         \
1513                                                                                \
1514   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1515   vdup.u32 dx4, uv_dx4[1];                                                     \
1516                                                                                \
1517   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1518   vdup.u32 dx4, rg_dx4[0];                                                     \
1519                                                                                \
1520   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1521   vdup.u32 dx4, rg_dx4[1];                                                     \
1522                                                                                \
1523   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1524   vdup.u32 dx4, b_dx4;                                                         \
1525                                                                                \
1526   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
       /* Step every accumulator by its dx8 for the following block. */             \
1527   vdup.u32 dx8, uv_dx8[0];                                                     \
1528                                                                                \
1529   vadd.u32 u_block, u_block, dx8;                                              \
1530   vdup.u32 dx8, uv_dx8[1];                                                     \
1531                                                                                \
1532   vadd.u32 v_block, v_block, dx8;                                              \
1533   vdup.u32 dx8, rg_dx8[0];                                                     \
1534                                                                                \
1535   vadd.u32 r_block, r_block, dx8;                                              \
1536   vdup.u32 dx8, rg_dx8[1];                                                     \
1537                                                                                \
1538   vadd.u32 g_block, g_block, dx8;                                              \
1539   vdup.u32 dx8, b_dx8;                                                         \
1540                                                                                \
1541   vadd.u32 b_block, b_block, dx8;                                              \
       /* Narrow to bytes; b is parked in d14 so q12 can be reused for r and g. */  \
1542   vmovn.u16 u_whole_8, u_whole;                                                \
1543                                                                                \
1544   vmovn.u16 v_whole_8, v_whole;                                                \
1545                                                                                \
1546   vmovn.u16 b_whole_8, b_whole;                                                \
1547   pld [fb_ptr];                                                                \
1548   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1549                                                                                \
1550   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1551   setup_blocks_texture_##swizzling();                                          \
1552                                                                                \
1553   vmovn.u16 r_whole_8, r_whole;                                                \
       /* A one-block span goes straight to the last-block code. */                 \
1554   beq 5f;                                                                      \
1555                                                                                \
1556  4:                                                                            \
       /* Store the block computed last pass while computing the next one. */       \
1557   vmovn.u16 g_whole_8, g_whole;                                                \
1558   vshrn.u32 u_whole_low, u_block, #16;                                         \
1559                                                                                \
1560   vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32;                 \
1561   vshrn.u32 v_whole_low, v_block, #16;                                         \
1562                                                                                \
1563   vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32;                \
1564   vshrn.u32 r_whole_low, r_block, #16;                                         \
1565                                                                                \
1566   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1567   vshrn.u32 g_whole_low, g_block, #16;                                         \
1568                                                                                \
1569   vdup.u32 dx4, uv_dx4[0];                                                     \
1570   vshrn.u32 b_whole_low, b_block, #16;                                         \
1571                                                                                \
1572   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1573   vdup.u32 dx4, uv_dx4[1];                                                     \
1574                                                                                \
1575   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1576   vdup.u32 dx4, rg_dx4[0];                                                     \
1577                                                                                \
1578   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1579   vdup.u32 dx4, rg_dx4[1];                                                     \
1580                                                                                \
1581   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1582   vdup.u32 dx4, b_dx4;                                                         \
1583                                                                                \
1584   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1585   vdup.u32 dx8, uv_dx8[0];                                                     \
1586                                                                                \
1587   vadd.u32 u_block, u_block, dx8;                                              \
1588   vdup.u32 dx8, uv_dx8[1];                                                     \
1589                                                                                \
1590   vadd.u32 v_block, v_block, dx8;                                              \
1591   vdup.u32 dx8, rg_dx8[0];                                                     \
1592                                                                                \
1593   vadd.u32 r_block, r_block, dx8;                                              \
1594   vdup.u32 dx8, rg_dx8[1];                                                     \
1595                                                                                \
1596   vadd.u32 g_block, g_block, dx8;                                              \
1597   vdup.u32 dx8, b_dx8;                                                         \
1598                                                                                \
1599   vadd.u32 b_block, b_block, dx8;                                              \
1600   vmovn.u16 u_whole_8, u_whole;                                                \
1601                                                                                \
       /* fb_ptr advances 16 bytes per block. */                                    \
1602   add fb_ptr, fb_ptr, #16;                                                     \
1603   vmovn.u16 v_whole_8, v_whole;                                                \
1604                                                                                \
1605   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1606   vmovn.u16 b_whole_8, b_whole;                                                \
1607                                                                                \
1608   pld [fb_ptr];                                                                \
1609                                                                                \
1610   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1611   subs span_num_blocks, span_num_blocks, #1;                                   \
1612                                                                                \
1613   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1614   setup_blocks_texture_##swizzling();                                          \
1615                                                                                \
1616   vmovn.u16 r_whole_8, r_whole;                                                \
1617   bne 4b;                                                                      \
1618                                                                                \
1619  5:                                                                            \
       /* Last block: lanes where right_mask & test_mask != 0 get u/v cleared. */   \
       /* test_mask is the vector stored at offset 0 of psx_gpu. */                 \
1620   vmovn.u16 g_whole_8, g_whole;                                                \
1621   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
1622                                                                                \
1623   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
1624   vdup.u8 draw_mask, right_mask;                                               \
1625                                                                                \
1626   ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset];                   \
1627   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1628   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1629   vzip.u8 u_whole_8, v_whole_8;                                                \
1630                                                                                \
1631   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1632   vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32;                \
1633   vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32;                          \
1634   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1635   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1636                                                                                \
1637   setup_blocks_uv_adj_hack_textured(hacks_active);                             \
1638                                                                                \
1639  1:                                                                            \
       /* Advance the per-span pointers, write num_blocks back, next span. */       \
1640   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1641   add span_b_offset, span_b_offset, #4;                                        \
1642                                                                                \
1643   add span_edge_data, span_edge_data, #8;                                      \
1644   subs num_spans, num_spans, #1;                                               \
1645                                                                                \
1646   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1647   bne 0b;                                                                      \
1648                                                                                \
1649   restore_abi_regs();                                                          \
1650   ldmia sp!, { r4 - r11, pc };                                                 \
1651                                                                                \
1652  2:                                                                            \
       /* Flush path: texture_mask is parked in saved_tmp across the call. */       \
1653   vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
1654   vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
1655   /* pushing odd num of regs here realigns our unaligned stack */              \
1656   push  { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */              \
1657   bl flush_render_block_buffer;                                                \
1658   pop   { r0 - r3, EXTRA_UNSAVED_REGS r12 };                                   \
1659   vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
1660   vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
1661                                                                                \
       /* Rebuild uvrg_dx8 and fb_mask_ptrs, then restart at the first block. */    \
1662   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1663   vmov.u8 fb_mask_ptrs, #0;                                                    \
1664                                                                                \
1665   mov num_blocks, span_num_blocks;                                             \
1666   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1667   bal 3b                                                                       \
1668
1669
/*
 * Instantiate the builder once per texture layout. Each line expands to one
 * function(...) definition:
 *   setup_blocks_shaded_textured_dithered_swizzled_indirect
 *   setup_blocks_shaded_textured_dithered_unswizzled_indirect
 */
1670 setup_blocks_shaded_textured_builder(swizzled)
1671 setup_blocks_shaded_textured_builder(unswizzled)
1672
1673
/*
 * setup_blocks_unshaded_textured_builder(swizzling)
 *
 * Generates setup_blocks_unshaded_textured_dithered_<swizzling>_indirect.
 *
 * In:  r0 = psx_gpu.
 * Out: block records appended behind psx_gpu_blocks_offset; the running
 *      block count is stored to psx_gpu_num_blocks_offset after every span.
 *
 * Each span edge entry is 8 bytes (left_x, num_blocks, right_mask, y). For
 * every block of a span one 64-byte record is written:
 *   +0   16 bytes  interleaved u/v bytes, ANDed with texture_mask
 *   +16  16 bytes  dither_offsets
 *   +32  16 bytes  b_whole_8 and fb_mask_ptrs (lane 1 = framebuffer address
 *                  of the block; lane 0 = right_mask on the last block of a
 *                  span, zero otherwise)
 * The last block of a span additionally has its u/v lanes zeroed wherever
 * the right mask tests non-zero against test_mask.
 *
 * If adding a span would push the block count above MAX_BLOCKS, label 2
 * calls flush_render_block_buffer and restarts the queue at block 0.
 *
 * The conditional branches to 5f / 4b consume the flags of the preceding
 * `subs span_num_blocks`; the instructions in between must leave the flags
 * alone (setup_blocks_texture_<swizzling>() is defined elsewhere and is
 * presumably flag-neutral - confirm before touching it).
 *
 * NOTE(review): entry pushes nine words (r4-r11, lr). The comment on the
 * push in the sibling shaded builder says that an odd register count around
 * the flush call realigns the stack; EXTRA_UNSAVED_REGS is defined elsewhere,
 * so confirm the count stays odd here as well.
 */
1674 #define setup_blocks_unshaded_textured_builder(swizzling)                      \
1675 .align 3;                                                                      \
1676                                                                                \
1677 function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
1678   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
1679   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1680                                                                                \
1681   vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128];                                   \
1682   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1683                                                                                \
1684   cmp num_spans, #0;                                                           \
1685   bxeq lr;  /* no spans: return before anything is pushed */                   \
1686                                                                                \
1687   stmdb sp!, { r4 - r11, r14 };                                                \
1688   save_abi_regs();                                                             \
1689   vshl.u32 uvrg_dx4, uvrg_dx, #2;  /* 4 * dx */                                \
1690                                                                                \
1691   vshl.u32 uvrg_dx8, uvrg_dx, #3;  /* 8 * dx */                                \
1692                                                                                \
1693   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16];     \
1694   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1695                                                                                \
1696   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1697   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1698                                                                                \
1699   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1700                                                                                \
1701   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;  /* 64 bytes per block */  \
1702                                                                                \
1703  0:  /* per-span loop */                                                       \
1704   vmov.u8 fb_mask_ptrs, #0;                                                    \
1705                                                                                \
1706   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
1707   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1708                                                                                \
1709   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
1710   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
1711                                                                                \
1712   cmp span_num_blocks, #0;                                                     \
1713   beq 1f;  /* empty span */                                                    \
1714                                                                                \
1715   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
1716   add num_blocks, span_num_blocks, num_blocks;                                 \
1717                                                                                \
1718   cmp num_blocks, #MAX_BLOCKS;                                                 \
1719   bgt 2f;  /* queue would overflow: flush first */                             \
1720                                                                                \
1721  3:  /* span setup; also re-entered from the flush path */                     \
1722   add fb_ptr, fb_ptr, y, lsl #11;  /* + y * 2048 bytes */                      \
1723                                                                                \
1724   vdup.u32 v_left_x, left_x;                                                   \
1725   and y, y, #0x3;  /* dither table row index */                                \
1726                                                                                \
1727   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
1728   add fb_ptr, fb_ptr, left_x, lsl #1;  /* + left_x * 2 bytes */                \
1729                                                                                \
1730   and dither_shift, left_x, #0x03;                                             \
1731                                                                                \
1732   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
1733   vshr.u32 uvrg_dx, uvrg_dx4, #2;  /* dx rebuilt from 4 * dx */                \
1734                                                                                \
1735   mov dither_shift, dither_shift, lsl #3;  /* rotate amount in bits */         \
1736   vmla.u32 uvrg, uvrg_dx, v_left_x;  /* uvrg += dx * left_x */                 \
1737                                                                                \
1738   mov c_32, #32;                                                               \
1739   subs span_num_blocks, span_num_blocks, #1;  /* Z consumed by beq 5f */       \
1740                                                                                \
1741   mov dither_row, dither_row, ror dither_shift;                                \
1742                                                                                \
1743   vdup.u32 dither_offsets_short, dither_row;                                   \
1744   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1745                                                                                \
1746   vshll.s8 dither_offsets, dither_offsets_short, #4;  /* s8 -> s16, << 4 */    \
1747                                                                                \
1748   vdup.u32 u_block, uv[0];                                                     \
1749                                                                                \
1750   vdup.u32 v_block, uv[1];                                                     \
1751   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1752                                                                                \
1753   vadd.u32 u_block, u_block, block_span;                                       \
1754   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1755                                                                                \
1756   vadd.u32 v_block, v_block, block_span;                                       \
1757   add block_ptr_b, block_ptr_a, #16;  /* second store stream at +16 */         \
1758                                                                                \
1759   vshrn.u32 u_whole_low, u_block, #16;  /* bits 31..16 of each lane */         \
1760   vshrn.u32 v_whole_low, v_block, #16;                                         \
1761                                                                                \
1762   vdup.u32 dx4, uv_dx4[0];                                                     \
1763                                                                                \
1764   vaddhn.u32 u_whole_high, u_block, dx4;  /* bits 31..16 of (u + 4 * du) */    \
1765   vdup.u32 dx4, uv_dx4[1];                                                     \
1766                                                                                \
1767   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1768   vdup.u32 dx8, uv_dx8[0];                                                     \
1769                                                                                \
1770   vadd.u32 u_block, u_block, dx8;  /* step to the next block */                \
1771   vdup.u32 dx8, uv_dx8[1];                                                     \
1772                                                                                \
1773   vadd.u32 v_block, v_block, dx8;                                              \
1774   vmovn.u16 u_whole_8, u_whole;  /* keep low byte of each 16-bit lane */       \
1775                                                                                \
1776   vmovn.u16 v_whole_8, v_whole;                                                \
1777                                                                                \
1778   pld [fb_ptr];                                                                \
1779   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1780                                                                                \
1781   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1782   setup_blocks_texture_##swizzling();                                          \
1783                                                                                \
1784   beq 5f;  /* span has a single block */                                       \
1785                                                                                \
1786  4:  /* store block i while computing block i + 1 */                           \
1787   vshrn.u32 u_whole_low, u_block, #16;                                         \
1788                                                                                \
1789   vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32;                 \
1790   vshrn.u32 v_whole_low, v_block, #16;                                         \
1791                                                                                \
1792   add block_ptr_b, block_ptr_b, #32;                                           \
1793   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1794                                                                                \
1795   vdup.u32 dx4, uv_dx4[0];                                                     \
1796   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1797   vdup.u32 dx4, uv_dx4[1];                                                     \
1798                                                                                \
1799   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1800   vdup.u32 dx8, uv_dx8[0];                                                     \
1801                                                                                \
1802   vadd.u32 u_block, u_block, dx8;                                              \
1803   vdup.u32 dx8, uv_dx8[1];                                                     \
1804                                                                                \
1805   vadd.u32 v_block, v_block, dx8;                                              \
1806   vmovn.u16 u_whole_8, u_whole;                                                \
1807                                                                                \
1808   add fb_ptr, fb_ptr, #16;  /* next block: 16 bytes further */                 \
1809   vmovn.u16 v_whole_8, v_whole;                                                \
1810                                                                                \
1811   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1812   pld [fb_ptr];                                                                \
1813                                                                                \
1814   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1815   subs span_num_blocks, span_num_blocks, #1;  /* Z consumed by bne 4b */       \
1816                                                                                \
1817   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1818   setup_blocks_texture_##swizzling();                                          \
1819                                                                                \
1820   bne 4b;                                                                      \
1821                                                                                \
1822  5:  /* last block of the span: apply the right edge mask */                   \
1823   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
1824                                                                                \
1825   vld1.u32 { test_mask }, [psx_gpu, :128];  /* first 16 bytes of psx_gpu */    \
1826   vdup.u8 draw_mask, right_mask;                                               \
1827                                                                                \
1828   ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset];                   \
1829   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1830   vtst.u16 draw_mask, draw_mask, test_mask;  /* 0xffff where bits overlap */   \
1831   vzip.u8 u_whole_8, v_whole_8;  /* same byte order as the vst2 above */       \
1832                                                                                \
1833   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;  /* zero masked lanes */         \
1834   add block_ptr_b, block_ptr_b, #32;                                           \
1835   vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32;                          \
1836   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1837   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1838                                                                                \
1839   setup_blocks_uv_adj_hack_textured(hacks_active);                             \
1840                                                                                \
1841  1:  /* advance to the next span */                                            \
1842   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1843   add span_edge_data, span_edge_data, #8;                                      \
1844   subs num_spans, num_spans, #1;                                               \
1845                                                                                \
1846   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1847   bne 0b;                                                                      \
1848                                                                                \
1849   restore_abi_regs();                                                          \
1850   ldmia sp!, { r4 - r11, pc };                                                 \
1851                                                                                \
1852  2:  /* queue full: flush it, then restart at block 0 */                       \
1853   vstr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
1854   vstr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
1855   push  { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */              \
1856   bl flush_render_block_buffer;  /* bl overwrites r14 */                       \
1857   pop   { r0 - r3, EXTRA_UNSAVED_REGS r12 };                                   \
1858   vldr texture_mask_u, [r0, #psx_gpu_saved_tmp_offset];                        \
1859   vldr texture_mask_v, [r0, #psx_gpu_saved_tmp_offset + 8];                    \
1860                                                                                \
1861   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;  /* 8 * dx rebuilt from 4 * dx */     \
1862   vmov.u8 fb_mask_ptrs, #0;                                                    \
1863                                                                                \
1864   mov num_blocks, span_num_blocks;  /* only this span is queued now */         \
1865   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1866   bal 3b                                                                       \
1867
1868
1869 setup_blocks_unshaded_textured_builder(swizzled)
1870 setup_blocks_unshaded_textured_builder(unswizzled)
1871
1872
1873 .align 3
1874
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * In:  r0 = psx_gpu.
 * Out: block records appended behind psx_gpu_blocks_offset; the running
 *      block count is stored to psx_gpu_num_blocks_offset after every span.
 *
 * Flat-colour variant of the block setup. The triangle colour is reduced to
 * 5 bits per channel and replicated across `colors`; for every block of a
 * span one 64-byte record is written:
 *   +0   16 bytes  draw mask (all zero, except on the last block of a span
 *                  where it is derived from right_mask and test_mask)
 *   +16  16 bytes  colors
 *   +32  16 bytes  b_whole_8 and fb_mask_ptrs (lane 1 = framebuffer address)
 *
 * If adding a span would push the block count above MAX_BLOCKS, label 2
 * calls flush_render_block_buffer and restarts the queue at block 0.
 */
1875 function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1876   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
1877   veor.u32 draw_mask, draw_mask, draw_mask
1878
  /* No spans: return before anything is pushed. */
1879   cmp num_spans, #0
1880   bxeq lr
1881
1882   stmdb sp!, { r4 - r11, r14 }
1883   save_abi_regs()
1884   vld1.u32 { test_mask }, [psx_gpu, :128]
1885
1886   ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
1887
  /* Take bits 7..3, 15..11 and 23..19 of the colour word and pack them as
   * r | g << 5 | b << 10. */
1888   ubfx color_r, color, #3, #5
1889   ubfx color_g, color, #11, #5
1890   ubfx color_b, color, #19, #5
1891
1892   orr color, color_r, color_b, lsl #10
1893   orr color, color, color_g, lsl #5
1894
1895   vdup.u16 colors, color
1896
1897   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
1898   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1899
  /* Append behind the blocks already queued (64 bytes each). */
1900   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1901   add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1902
  /* Per-span loop. */
1903  0:
1904   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1905   ldrh y, [span_edge_data, #edge_data_y_offset]
1906
1907   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
1908
1909   cmp span_num_blocks, #0
1910   beq 1f
1911
1912   ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
1913   add num_blocks, span_num_blocks, num_blocks
1914
  /* Flush first if this span would overflow the block queue. */
1915   cmp num_blocks, #MAX_BLOCKS
1916   bgt 2f
1917
  /* Span setup; also re-entered from the flush path. */
1918  3:
1919   add fb_ptr, fb_ptr, y, lsl #11
  /* NOTE(review): no later instruction in this function reads y before it is
   * reloaded at label 0, so this `and` looks like a leftover from the
   * dithered variants. */
1920   and y, y, #0x3
1921
1922   add fb_ptr, fb_ptr, left_x, lsl #1
1923   mov c_32, #32
1924
  /* Z from this subs is consumed by the beq 5f below (single-block span). */
1925   subs span_num_blocks, span_num_blocks, #1
1926
1927   add block_ptr_b, block_ptr_a, #16
1928   pld [fb_ptr]
1929
1930   vmov.u32 fb_mask_ptrs[1], fb_ptr
1931   beq 5f
1932
  /* All blocks but the last: zero draw mask, colour, framebuffer pointer. */
1933  4:
1934   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_32
1935   vst1.u32 { colors }, [block_ptr_b, :128], c_32
1936   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
1937
1938   add fb_ptr, fb_ptr, #16
1939   add block_ptr_b, block_ptr_b, #32
1940
1941   pld [fb_ptr]
1942
1943   vmov.u32 fb_mask_ptrs[1], fb_ptr
1944   subs span_num_blocks, span_num_blocks, #1
1945
1946   bne 4b
1947
  /* Last block of the span: each 16-bit lane of the mask becomes 0xffff where
   * the replicated right_mask byte overlaps test_mask. */
1948  5:
1949   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]
1950
1951   vdup.u8 draw_mask_edge, right_mask
1952   vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1953
1954   vst1.u32 { colors }, [block_ptr_b, :128], c_32
1955   vst1.u32 { draw_mask_edge }, [block_ptr_a, :128], c_32
1956   add block_ptr_b, block_ptr_b, #32
1957   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
1958
  /* Advance to the next span and publish the block count. */
1959  1:
1960   add span_edge_data, span_edge_data, #8
1961   subs num_spans, num_spans, #1
1962
1963   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
1964   bne 0b
1965
1966   restore_abi_regs()
1967   ldmia sp!, { r4 - r11, pc }
1968                                                                            
  /* Queue full: keep d4/d5 in the psx_gpu scratch slot across the call, flush,
   * then rebuild test_mask and the zero draw mask and restart at block 0 with
   * only the current span counted. */
1969  2:
1970   vstr d4, [r0, #psx_gpu_saved_tmp_offset]       /* colors */
1971   vstr d5, [r0, #psx_gpu_saved_tmp_offset + 8]
1972   push { r0 - r3, EXTRA_UNSAVED_REGS r12 }
1973   bl flush_render_block_buffer
1974   pop  { r0 - r3, EXTRA_UNSAVED_REGS r12 }
1975   vldr d4, [r0, #psx_gpu_saved_tmp_offset]
1976   vldr d5, [r0, #psx_gpu_saved_tmp_offset + 8]
1977
1978   vld1.u32 { test_mask }, [psx_gpu, :128]
1979   veor.u32 draw_mask, draw_mask, draw_mask
1980
1981   mov num_blocks, span_num_blocks
1982   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1983   bal 3b
1984
1985
/*
 * Additional register aliases.
 * mask_msb_scalar lives in r14 (lr), so it is only usable in functions that
 * have already saved lr; the _direct routine that follows pushes it at entry.
 * msb_mask_low / msb_mask_high (d30 / d31) are the two halves of msb_mask
 * (q15). pixels_low is d16, the low half of q8.
 */
1986 #define mask_msb_scalar                                   r14
1987
1988 #define msb_mask                                          q15
1989
1990 #define pixels_low                                        d16
1991
1992 #define msb_mask_low                                      d30
1993 #define msb_mask_high                                     d31
1994
1995
1996 .align 3
1997
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * In:  r0 = psx_gpu.
 *
 * Flat-colour span fill that stores through fb_ptr (vram_out_ptr plus
 * y * 2048 + left_x * 2 bytes) instead of queueing block records. The
 * colour is the triangle colour reduced to 5 bits per channel, ORed with
 * the halfword at psx_gpu_mask_msb_offset.
 *
 * Every block but the last of a span is written with one 16-byte store.
 * The last block is trimmed according to the low byte of right_mask
 * (loaded with ldrb).
 *
 * NOTE(review): the tail decode appears to assume right_mask is 0 or a run
 * of high bits (0x80, 0xc0, ... 0xfe), giving 8, 7, ... 1 halfwords written;
 * confirm against the code that produces the span edge data.
 */
1998 function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1999   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
2000
  /* No spans: return before anything is pushed. */
2001   cmp num_spans, #0
2002   bxeq lr
2003
2004   stmdb sp!, { r4 - r11, r14 }
2005
2006   ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
2007
  /* Take bits 7..3, 15..11 and 23..19 of the colour word and pack them as
   * r | g << 5 | b << 10, then OR in the mask_msb halfword. */
2008   ubfx color_r, color, #3, #5
2009   ubfx color_g, color, #11, #5
2010
2011   ldrh mask_msb_scalar, [psx_gpu, #psx_gpu_mask_msb_offset]
2012   ubfx color_b, color, #19, #5
2013
2014   orr color, color_r, color_b, lsl #10
2015   orr color, color, color_g, lsl #5
2016   orr color, color, mask_msb_scalar
2017
2018   vdup.u16 colors, color
2019
2020   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
  /* Duplicate the 16-bit value into both halves for the 32-bit tail stores. */
2021   orr color, color, color, lsl #16
2022
2023
  /* Per-span loop. */
2024  0:
2025   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
2026   ldrh y, [span_edge_data, #edge_data_y_offset]
2027
2028   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
2029
2030   cmp span_num_blocks, #0
2031   beq 1f
2032
2033   ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
2034
2035   add fb_ptr, fb_ptr, y, lsl #11
  /* Z from this subs is consumed by the beq 3f below; the add in between
   * does not update the flags. */
2036   subs span_num_blocks, span_num_blocks, #1
2037
2038   add fb_ptr, fb_ptr, left_x, lsl #1
2039   beq 3f
2040
  /* Full blocks: 16 bytes each, pointer advanced by the writeback. */
2041  2:
2042   vst1.u32 { colors }, [fb_ptr]!
2043   subs span_num_blocks, span_num_blocks, #1
2044
2045   bne 2b
2046
  /* Last block of the span. */
2047  3:
2048   ldrb right_mask, [span_edge_data, #edge_data_right_mask_offset]
2049
  /* Mask of zero: the last block is complete as well (label 5). */
2050   cmp right_mask, #0x0
2051   beq 5f
2052
  /* Low nibble clear: write two words and drop the nibble. moveq leaves the
   * flags untouched, so the second streq sees the same tst result. */
2053   tst right_mask, #0xF
2054   streq color, [fb_ptr], #4
2055   moveq right_mask, right_mask, lsr #4
2056   streq color, [fb_ptr], #4
2057
  /* Next two bits clear: write one more word. */
2058   tst right_mask, #0x3
2059   streq color, [fb_ptr], #4
2060   moveq right_mask, right_mask, lsr #2
2061
  /* Next bit clear: write one final halfword. */
2062   tst right_mask, #0x1
2063   strheq color, [fb_ptr]
2064
  /* Advance to the next span. */
2065  1:
2066   add span_edge_data, span_edge_data, #8
2067   subs num_spans, num_spans, #1
2068   bne 0b
2069
2070   ldmia sp!, { r4 - r11, pc }
2071                                                                            
  /* Unmasked last block: one 16-byte store, no writeback needed. */
2072  5:
2073   vst1.u32 { colors }, [fb_ptr]
2074   bal 1b
2075
2076
/*
 * Register re-aliasing for the shaded, untextured builders that follow.
 * Names inherited from the earlier routines are #undef'd and mapped again.
 *
 * Several aliases deliberately share a register:
 *   dx4, dx8 and test_mask        -> q10
 *   uvrg and g_whole              -> q4 (rg = d9 is the high half of uvrg)
 *   block_span and b_whole        -> q5
 *   v_left_x and r_whole_low      -> d6
 *   rg_dx (d0)                    -> low half of r_block (q0)
 *   g_whole_8 / b_whole_8         -> d12 / d13, the halves of gb_whole_8 (q6)
 *   d64_4 (d24)                   -> low half of d128_4 (q12)
 *   dither_offsets_low (d28)      -> low half of dither_offsets (q14)
 * so the users of these names must not keep both members of a pair live at
 * the same time.
 *
 * NOTE(review): q4-q6 and d14 fall in d8-d15, which AAPCS treats as
 * callee-saved, while save_abi_regs()/restore_abi_regs() are currently
 * defined empty at the top of this file. Confirm that the callers tolerate
 * these registers being clobbered.
 */
2077 #undef c_64
2078
2079 #define c_64                                              r7
2080 #define rg_dx_ptr                                         r2
2081
2082
2083 #undef r_block
2084 #undef g_block
2085 #undef b_block
2086 #undef r_whole
2087 #undef g_whole
2088 #undef b_whole
2089 #undef r_whole_low
2090 #undef r_whole_high
2091 #undef g_whole_low
2092 #undef g_whole_high
2093 #undef b_whole_low
2094 #undef b_whole_high
2095 #undef r_whole_8
2096 #undef g_whole_8
2097 #undef b_whole_8
2098 #undef dither_offsets
2099 #undef rg_dx4
2100 #undef rg_dx8
2101 #undef dx4
2102 #undef dx8
2103 #undef v_left_x
2104 #undef uvrg
2105 #undef block_span
2106 #undef rg
2107 #undef draw_mask
2108 #undef test_mask
2109
2110 #define r_block                                           q0
2111 #define g_block                                           q1
2112 #define b_block                                           q2
2113
2114 #define r_whole                                           q3
2115 #define g_whole                                           q4
2116 #define b_whole                                           q5
2117
2118 #define r_whole_low                                       d6
2119 #define r_whole_high                                      d7
2120 #define g_whole_low                                       d8
2121 #define g_whole_high                                      d9
2122 #define b_whole_low                                       d10
2123 #define b_whole_high                                      d11
2124
2125 #define gb_whole_8                                        q6
2126
2127 #define g_whole_8                                         d12
2128 #define b_whole_8                                         d13
2129
2130 #define r_whole_8                                         d14
2131
2132 #define pixels                                            q8
2133
2134 #define rg_dx4                                            d18
2135 #define rg_dx8                                            d19
2136
2137 #define dx4                                               q10
2138 #define dx8                                               q10
2139
2140 #define v_left_x                                          d6
2141 #define uvrg                                              q4
2142 #define block_span                                        q5
2143
2144 #define rg                                                d9
2145
2146 #define d64_1                                             d22
2147 #define d64_128                                           d23
2148
2149 #define d128_4                                            q12
2150 #define d128_0x7                                          q13
2151
2152 #define d64_4                                             d24
2153
2154 #define dither_offsets                                    q14
2155 #define draw_mask                                         q15
2156
2157 #define dither_offsets_low                                d28
2158
2159 #define rg_dx                                             d0
2160 #define test_mask                                         q10
2161
2162
/*
 * Dither steps for the shaded, untextured builders; the builder's
 * `dithering` argument selects the _dithered or _undithered pair.
 *
 * Step a adds the dither offsets to the 8-bit r and g/b lanes with unsigned
 * saturation; step b subtracts the constant held in d64_4 / d128_4, again
 * with unsigned saturation. The indirect builder below loads d128_4 with 4
 * and adds it to dither_offsets beforehand, presumably so that signed dither
 * values can be applied using unsigned saturating arithmetic.
 *
 * The _undithered variants expand to nothing.
 */
2163 #define setup_blocks_shaded_untextured_dither_a_dithered()                     \
2164   vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low;                           \
2165   vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets;                             \
2166
2167 #define setup_blocks_shaded_untextured_dither_b_dithered()                     \
2168   vqsub.u8 r_whole_8, r_whole_8, d64_4;                                        \
2169   vqsub.u8 gb_whole_8, gb_whole_8, d128_4                                      \
2170
2171 #define setup_blocks_shaded_untextured_dither_a_undithered()                   \
2172
2173 #define setup_blocks_shaded_untextured_dither_b_undithered()                   \
2174
2174
2175
2176 #define setup_blocks_shaded_untextured_indirect_builder(dithering)             \
2177 .align 3;                                                                      \
2178                                                                                \
2179 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
2180   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
2181   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2182                                                                                \
2183   vld1.u32 { rg_dx }, [rg_dx_ptr, :64];                                        \
2184                                                                                \
2185   cmp num_spans, #0;                                                           \
2186   bxeq lr;                                                                     \
2187                                                                                \
2188   stmdb sp!, { r4 - r11, r14 };                                                \
2189   save_abi_regs();                                                             \
2190   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2191                                                                                \
2192   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
2193   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2194                                                                                \
2195   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2196                                                                                \
2197   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
2198   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2199                                                                                \
2200   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2201   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2202                                                                                \
2203   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
2204   vmov.u8 d64_1, #1;                                                           \
2205                                                                                \
2206   vmov.u8 d128_4, #4;                                                          \
2207   vmov.u8 d64_128, #128;                                                       \
2208                                                                                \
2209   vmov.u8 d128_0x7, #0x7;                                                      \
2210                                                                                \
2211  0:                                                                            \
2212   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
2213   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2214                                                                                \
2215   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
2216   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
2217                                                                                \
2218   cmp span_num_blocks, #0;                                                     \
2219   beq 1f;                                                                      \
2220                                                                                \
2221   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
2222   add num_blocks, span_num_blocks, num_blocks;                                 \
2223                                                                                \
2224   cmp num_blocks, #MAX_BLOCKS;                                                 \
2225   bgt 2f;                                                                      \
2226                                                                                \
2227  3:                                                                            \
2228   ldr b, [span_b_offset];                                                      \
2229   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2230                                                                                \
2231   vdup.u32 v_left_x, left_x;                                                   \
2232   and y, y, #0x3;                                                              \
2233                                                                                \
2234   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
2235   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2236                                                                                \
2237   mla b, b_dx, left_x, b;                                                      \
2238   and dither_shift, left_x, #0x03;                                             \
2239                                                                                \
2240   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
2241   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2242                                                                                \
2243   mov dither_shift, dither_shift, lsl #3;                                      \
2244   vmla.u32 rg, rg_dx, v_left_x;                                                \
2245                                                                                \
2246   mov c_64, #64;                                                               \
2247   subs span_num_blocks, span_num_blocks, #1;                                   \
2248                                                                                \
2249   mov dither_row, dither_row, ror dither_shift;                                \
2250   mov b_dx4, b_dx, lsl #2;                                                     \
2251                                                                                \
2252   vdup.u32 dither_offsets, dither_row;                                         \
2253   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2254                                                                                \
2255   vdup.u32 b_block, b;                                                         \
2256   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2257                                                                                \
2258   mov b_dx8, b_dx, lsl #3;                                                     \
2259   vdup.u32 r_block, rg[0];                                                     \
2260   vdup.u32 g_block, rg[1];                                                     \
2261                                                                                \
2262   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2263                                                                                \
2264   vadd.u32 r_block, r_block, block_span;                                       \
2265   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2266                                                                                \
2267   vadd.u32 g_block, g_block, block_span;                                       \
2268   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
2269                                                                                \
2270   vadd.u32 b_block, b_block, block_span;                                       \
2271   add block_ptr_b, block_ptr_a, #16;                                           \
2272                                                                                \
2273   vshrn.u32 r_whole_low, r_block, #16;                                         \
2274   vshrn.u32 g_whole_low, g_block, #16;                                         \
2275   vshrn.u32 b_whole_low, b_block, #16;                                         \
2276   vdup.u32 dx4, rg_dx4[0];                                                     \
2277                                                                                \
2278   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2279   vdup.u32 dx4, rg_dx4[1];                                                     \
2280                                                                                \
2281   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2282   vdup.u32 dx4, b_dx4;                                                         \
2283                                                                                \
2284   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2285   vdup.u32 dx8, rg_dx8[0];                                                     \
2286                                                                                \
2287   vadd.u32 r_block, r_block, dx8;                                              \
2288   vdup.u32 dx8, rg_dx8[1];                                                     \
2289                                                                                \
2290   vadd.u32 g_block, g_block, dx8;                                              \
2291   vdup.u32 dx8, b_dx8;                                                         \
2292                                                                                \
2293   vadd.u32 b_block, b_block, dx8;                                              \
2294                                                                                \
2295   vmovn.u16 r_whole_8, r_whole;                                                \
2296   vmovn.u16 g_whole_8, g_whole;                                                \
2297   vmovn.u16 b_whole_8, b_whole;                                                \
2298                                                                                \
2299   beq 5f;                                                                      \
2300   veor.u32 draw_mask, draw_mask, draw_mask;                                    \
2301                                                                                \
2302  4:                                                                            \
2303   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2304   vshrn.u32 r_whole_low, r_block, #16;                                         \
2305                                                                                \
2306   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2307   vshrn.u32 g_whole_low, g_block, #16;                                         \
2308                                                                                \
2309   vshrn.u32 b_whole_low, b_block, #16;                                         \
2310   str fb_ptr, [block_ptr_a, #44];                                              \
2311                                                                                \
2312   vdup.u32 dx4, rg_dx4[0];                                                     \
2313   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2314   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2315                                                                                \
2316   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2317   vdup.u32 dx4, rg_dx4[1];                                                     \
2318                                                                                \
2319   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2320   vdup.u32 dx4, b_dx4;                                                         \
2321                                                                                \
2322   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2323   vdup.u32 dx8, rg_dx8[0];                                                     \
2324                                                                                \
2325   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2326   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2327   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2328                                                                                \
2329   vadd.u32 r_block, r_block, dx8;                                              \
2330   vdup.u32 dx8, rg_dx8[1];                                                     \
2331                                                                                \
2332   vadd.u32 g_block, g_block, dx8;                                              \
2333   vdup.u32 dx8, b_dx8;                                                         \
2334                                                                                \
2335   vadd.u32 b_block, b_block, dx8;                                              \
2336   add fb_ptr, fb_ptr, #16;                                                     \
2337                                                                                \
2338   vmovn.u16 r_whole_8, r_whole;                                                \
2339   vmovn.u16 g_whole_8, g_whole;                                                \
2340   vmovn.u16 b_whole_8, b_whole;                                                \
2341                                                                                \
2342   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64;                           \
2343   vst1.u32 { pixels }, [block_ptr_b, :128], c_64;                              \
2344                                                                                \
2345   pld [fb_ptr];                                                                \
2346                                                                                \
2347   subs span_num_blocks, span_num_blocks, #1;                                   \
2348   bne 4b;                                                                      \
2349                                                                                \
2350  5:                                                                            \
2351   str fb_ptr, [block_ptr_a, #44];                                              \
2352   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2353                                                                                \
2354   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
2355   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2356                                                                                \
2357   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2358   vdup.u8 draw_mask, right_mask;                                               \
2359                                                                                \
2360   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2361   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
2362                                                                                \
2363   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
2364                                                                                \
2365   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2366   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2367   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2368                                                                                \
2369   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64;                           \
2370   vst1.u32 { pixels }, [block_ptr_b, :128], c_64;                              \
2371                                                                                \
2372  1:                                                                            \
2373   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2374   add span_b_offset, span_b_offset, #4;                                        \
2375                                                                                \
2376   add span_edge_data, span_edge_data, #8;                                      \
2377   subs num_spans, num_spans, #1;                                               \
2378                                                                                \
2379   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
2380   bne 0b;                                                                      \
2381                                                                                \
2382   restore_abi_regs();                                                          \
2383   pop   { r4 - r11, pc };                                                      \
2384                                                                                \
2385  2:                                                                            \
2386   vstr rg_dx4, [r0, #psx_gpu_saved_tmp_offset];                                \
2387   push  { r0 - r3, EXTRA_UNSAVED_REGS r12 };                                   \
2388   bl flush_render_block_buffer;                                                \
2389   pop   { r0 - r3, EXTRA_UNSAVED_REGS r12 };                                   \
2390   vldr rg_dx4, [r0, #psx_gpu_saved_tmp_offset];                                \
2391                                                                                \
2392   vmov.u8 d64_1, #1;                                                           \
2393   vmov.u8 d128_4, #4;                                                          \
2394   vmov.u8 d64_128, #128;                                                       \
2395   vmov.u8 d128_0x7, #0x7;                                                      \
2396                                                                                \
2397   vadd.u32 rg_dx8, rg_dx4, rg_dx4;                                             \
2398                                                                                \
2399   mov num_blocks, span_num_blocks;                                             \
2400   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2401   bal 3b                                                                       \
2402
2403
2404 setup_blocks_shaded_untextured_indirect_builder(undithered)  /* expands the builder with dithering = undithered */
2405 setup_blocks_shaded_untextured_indirect_builder(dithered)  /* expands the builder with dithering = dithered */
2406
2407
2408 #undef draw_mask  /* drop the previous alias; draw_mask is redefined a few lines down */
2409
2410 #define mask_msb_ptr                                      r14  /* r14 is lr; the direct builder below pushes it on entry */
2411
2412 #define draw_mask                                         q0
2413 #define pixels_low                                        d16  /* d16 = low half of q8; `pixels` is presumably q8 - confirm */
2414 #define pixels_high                                       d17  /* d17 = high half of q8 */
2415
2416
2417
2418 #define setup_blocks_shaded_untextured_direct_builder(dithering)               \
2419 .align 3;  /* Builder: shaded, untextured spans stored directly via fb_ptr. */ \
2420   /* dithering selects which dither_a/dither_b helper macros are used. */      \
2421 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
2422   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
2423   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2424   /* rg_dx = the 64 bits stored 8 bytes into uvrg_dx. */                       \
2425   vld1.u32 { rg_dx }, [rg_dx_ptr, :64];                                        \
2426   /* No spans: return before anything has been pushed. */                      \
2427   cmp num_spans, #0;                                                           \
2428   bxeq lr;                                                                     \
2429   /* r14 is saved too; it doubles as mask_msb_ptr in this routine. */          \
2430   stmdb sp!, { r4 - r11, r14 };                                                \
2431   save_abi_regs();                                                             \
2432   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2433   /* rg_dx4 and rg_dx8 are rg_dx scaled by 4 and by 8. */                      \
2434   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
2435   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2436   /* Cursors into the per-span arrays held inside psx_gpu. */                  \
2437   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2438   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2439                                                                                \
2440   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2441   vmov.u8 d64_1, #1;                                                           \
2442   /* Byte-splat constants: 1, 4, 128 and 0x7 in every byte lane. */            \
2443   vmov.u8 d128_4, #4;                                                          \
2444   vmov.u8 d64_128, #128;                                                       \
2445   /* Splat the 16-bit mask_msb into every lane of msb_mask_low/high. */        \
2446   vmov.u8 d128_0x7, #0x7;                                                      \
2447   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
2448   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
2449   /* Span loop: one pass per span_edge_data entry (8 bytes each). */           \
2450  0:                                                                            \
2451   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
2452   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2453                                                                                \
2454   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
2455   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
2456   /* A span with zero blocks is skipped outright. */                           \
2457   cmp span_num_blocks, #0;                                                     \
2458   beq 1f;                                                                      \
2459   /* fb_ptr += y << 11, i.e. rows are 2048 bytes apart. */                     \
2460   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
2461   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2462   /* y & 3 picks one of four 32-bit rows from the dither table. */             \
2463   ldr b, [span_b_offset];                                                      \
2464   vdup.u32 v_left_x, left_x;                                                   \
2465   and y, y, #0x3;                                                              \
2466   /* fb_ptr += left_x * 2 (two bytes per pixel). */                            \
2467   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
2468   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2469   /* b += b_dx * left_x; dither_shift starts as left_x & 3. */                 \
2470   mla b, b_dx, left_x, b;                                                      \
2471   and dither_shift, left_x, #0x03;                                             \
2472   /* rg_dx is rebuilt from rg_dx4 on every span. */                            \
2473   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
2474   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2475   /* dither_shift *= 8 (bytes -> bits); rg += rg_dx * left_x. */               \
2476   mov dither_shift, dither_shift, lsl #3;                                      \
2477   vmla.u32 rg, rg_dx, v_left_x;                                                \
2478   /* Flags from this subs are what the beq 3f further down tests. */           \
2479   subs span_num_blocks, span_num_blocks, #1;                                   \
2480   /* Rotate right so byte (left_x & 3) of the row lands in byte 0. */          \
2481   mov dither_row, dither_row, ror dither_shift;                                \
2482   mov b_dx4, b_dx, lsl #2;                                                     \
2483                                                                                \
2484   vdup.u32 dither_offsets, dither_row;                                         \
2485   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2486                                                                                \
2487   vdup.u32 b_block, b;                                                         \
2488   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2489                                                                                \
2490   mov b_dx8, b_dx, lsl #3;                                                     \
2491   vdup.u32 r_block, rg[0];                                                     \
2492   vdup.u32 g_block, rg[1];                                                     \
2493   /* Add the r, g and b block_span vectors, read back to back. */              \
2494   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2495                                                                                \
2496   vadd.u32 r_block, r_block, block_span;                                       \
2497   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2498                                                                                \
2499   vadd.u32 g_block, g_block, block_span;                                       \
2500   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
2501   /* NOTE(review): block_ptr_a/b are not otherwise used in this macro. */      \
2502   vadd.u32 b_block, b_block, block_span;                                       \
2503   add block_ptr_b, block_ptr_a, #16;                                           \
2504   /* r/g/b_whole_low = top 16 bits of each 32-bit block lane. */               \
2505   vshrn.u32 r_whole_low, r_block, #16;                                         \
2506   vshrn.u32 g_whole_low, g_block, #16;                                         \
2507   vshrn.u32 b_whole_low, b_block, #16;                                         \
2508   vdup.u32 dx4, rg_dx4[0];                                                     \
2509   /* r/g/b_whole_high = top 16 bits of (block + dx4). */                       \
2510   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2511   vdup.u32 dx4, rg_dx4[1];                                                     \
2512                                                                                \
2513   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2514   vdup.u32 dx4, b_dx4;                                                         \
2515                                                                                \
2516   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2517   vdup.u32 dx8, rg_dx8[0];                                                     \
2518   /* Then step each block accumulator on by dx8. */                            \
2519   vadd.u32 r_block, r_block, dx8;                                              \
2520   vdup.u32 dx8, rg_dx8[1];                                                     \
2521                                                                                \
2522   vadd.u32 g_block, g_block, dx8;                                              \
2523   vdup.u32 dx8, b_dx8;                                                         \
2524                                                                                \
2525   vadd.u32 b_block, b_block, dx8;                                              \
2526   /* Narrow the 16-bit values to bytes (low byte of each lane kept). */        \
2527   vmovn.u16 r_whole_8, r_whole;                                                \
2528   vmovn.u16 g_whole_8, g_whole;                                                \
2529   vmovn.u16 b_whole_8, b_whole;                                                \
2530   /* Only one block in this span: go straight to the tail code. */             \
2531   beq 3f;                                                                      \
2532   /* Full blocks: finish 8 pixels while preparing the next 8. */               \
2533  2:                                                                            \
2534   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2535   vshrn.u32 r_whole_low, r_block, #16;                                         \
2536                                                                                \
2537   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2538   vshrn.u32 g_whole_low, g_block, #16;                                         \
2539                                                                                \
2540   vshrn.u32 b_whole_low, b_block, #16;                                         \
2541   /* r_whole_8 >>= 3; vbic clears bits 0-2 of every gb_whole_8 byte. */        \
2542   vdup.u32 dx4, rg_dx4[0];                                                     \
2543   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2544   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2545                                                                                \
2546   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2547   vdup.u32 dx4, rg_dx4[1];                                                     \
2548   /* pixels starts as msb_mask; the vmlal.u8 ops below add to it. */           \
2549   vmov pixels, msb_mask;                                                       \
2550   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2551   vdup.u32 dx4, b_dx4;                                                         \
2552                                                                                \
2553   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2554   vdup.u32 dx8, rg_dx8[0];                                                     \
2555   /* pixels += r*1 + g*d64_4 + b*128 (d64_4: presumably half of d128_4). */    \
2556   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2557   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2558   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2559                                                                                \
2560   vadd.u32 r_block, r_block, dx8;                                              \
2561   vdup.u32 dx8, rg_dx8[1];                                                     \
2562                                                                                \
2563   vadd.u32 g_block, g_block, dx8;                                              \
2564   vdup.u32 dx8, b_dx8;                                                         \
2565                                                                                \
2566   vadd.u32 b_block, b_block, dx8;                                              \
2567                                                                                \
2568   vmovn.u16 r_whole_8, r_whole;                                                \
2569   vmovn.u16 g_whole_8, g_whole;                                                \
2570   vmovn.u16 b_whole_8, b_whole;                                                \
2571   /* Store all 8 pixels (16 bytes) and advance fb_ptr. */                      \
2572   vst1.u32 { pixels }, [fb_ptr]!;                                              \
2573   subs span_num_blocks, span_num_blocks, #1;                                   \
2574   bne 2b;                                                                      \
2575   /* Last block of the span: finish it, then store 1 to 8 pixels. */           \
2576  3:                                                                            \
2577   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2578                                                                                \
2579   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
2580   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2581   /* rbit + clz leaves the number of trailing zero bits in right_mask. */      \
2582   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2583   rbit right_mask, right_mask;                                                 \
2584   vmov pixels, msb_mask;                                                       \
2585   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2586   clz right_mask, right_mask;                                                  \
2587   /* NOTE(review): right_mask == 0 would give 32, past the table. */           \
2588   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2589   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2590   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2591   /* Dispatch on that count; JT_OP_REL/JT_OP/JTE are defined elsewhere. */     \
2592   JT_OP_REL(100f, right_mask, temp);                                           \
2593   JT_OP(ldr pc, [pc, right_mask, lsl #2]);                                     \
2594   nop;                                                                         \
2595  100:                                                                          \
2596   nop;                                                                         \
2597   .word JTE(100b, 4f);                                                         \
2598   .word JTE(100b, 5f);                                                         \
2599   .word JTE(100b, 6f);                                                         \
2600   .word JTE(100b, 7f);                                                         \
2601   .word JTE(100b, 8f);                                                         \
2602   .word JTE(100b, 9f);                                                         \
2603   .word JTE(100b, 10f);                                                        \
2604   .word JTE(100b, 11f);                                                        \
2605   /* Handlers 4..11 store 1..8 pixels respectively. */                         \
2606  4:  /* 1 pixel */                                                             \
2607   vst1.u16 { pixels_low[0] }, [fb_ptr];                                        \
2608   bal 1f;                                                                      \
2609                                                                                \
2610  5:  /* 2 pixels */                                                            \
2611   vst1.u32 { pixels_low[0] }, [fb_ptr];                                        \
2612   bal 1f;                                                                      \
2613                                                                                \
2614  6:  /* 3 pixels */                                                            \
2615   vst1.u32 { pixels_low[0] }, [fb_ptr]!;                                       \
2616   vst1.u16 { pixels_low[2] }, [fb_ptr];                                        \
2617   bal 1f;                                                                      \
2618                                                                                \
2619  7:  /* 4 pixels */                                                            \
2620   vst1.u32 { pixels_low }, [fb_ptr];                                           \
2621   bal 1f;                                                                      \
2622                                                                                \
2623  8:  /* 5 pixels */                                                            \
2624   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2625   vst1.u16 { pixels_high[0] }, [fb_ptr];                                       \
2626   bal 1f;                                                                      \
2627                                                                                \
2628  9:  /* 6 pixels */                                                            \
2629   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2630   vst1.u32 { pixels_high[0] }, [fb_ptr]!;                                      \
2631   bal 1f;                                                                      \
2632                                                                                \
2633  10:  /* 7 pixels */                                                           \
2634   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2635   vst1.u32 { pixels_high[0] }, [fb_ptr]!;                                      \
2636   vst1.u16 { pixels_high[2] }, [fb_ptr];                                       \
2637   bal 1f;                                                                      \
2638                                                                                \
2639  11:  /* 8 pixels (whole block) */                                             \
2640   vst1.u32 { pixels }, [fb_ptr];                                               \
2641   bal 1f;                                                                      \
2642   /* NOTE(review): the bal above lands on the very next instruction. */        \
2643  1:  /* next span: advance the three cursors, count down */                    \
2644   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2645   add span_b_offset, span_b_offset, #4;                                        \
2646                                                                                \
2647   add span_edge_data, span_edge_data, #8;                                      \
2648   subs num_spans, num_spans, #1;                                               \
2649                                                                                \
2650   bne 0b;                                                                      \
2651   /* Restore r4-r11 and return by loading the saved r14 into pc. */            \
2652   restore_abi_regs();                                                          \
2653   ldmia sp!, { r4 - r11, pc }                                                  \
2654
2655 setup_blocks_shaded_untextured_direct_builder(undithered)
2656 setup_blocks_shaded_untextured_direct_builder(dithered)
2657
2658
/*
 * Register aliases used by texture_blocks_4bpp and texture_blocks_8bpp.
 *
 * Several names intentionally map to the same physical register:
 *  - uv_01/uv_23/uv_45/uv_67 (r3-r6) receive the four words loaded from a
 *    block; uv_1/uv_3/uv_5/uv_7 reuse exactly those registers.
 *  - uv_N and pixel_N always name the same register.
 *  - pixels_a..pixels_d are r7, r9, r8, r10 (note: NOT in a,b,c,d register
 *    order), i.e. the registers of pixel_0, pixel_4, pixel_2, pixel_6.
 *  - c_64 shares r0 with psx_gpu.
 *  - current_texture_mask / dirty_textures_mask share r5 / r6 with
 *    uv_45 / uv_67.
 *  - clut_a (q1) is d2:d3 = clut_low_a:clut_low_b and clut_b (q2) is
 *    d4:d5 = clut_high_a:clut_high_b.
 */
2659 #undef psx_gpu
2660 #undef num_blocks
2661 #undef triangle
2662 #undef c_64
2663
2664 #define psx_gpu                                  r0
2665 #define block_ptr                                r1
2666 #define num_blocks                               r2
2667 #define uv_01                                    r3
2668 #define uv_23                                    r4
2669 #define uv_45                                    r5
2670 #define uv_67                                    r6
2671 #define uv_0                                     r7
2672 #define uv_1                                     r3
2673 #define uv_2                                     r8
2674 #define uv_3                                     r4
2675 #define uv_4                                     r9
2676 #define uv_5                                     r5
2677 #define uv_6                                     r10
2678 #define uv_7                                     r6
2679 #define texture_ptr                              r11
2680
2681 #define pixel_0                                  r7
2682 #define pixel_1                                  r3
2683 #define pixel_2                                  r8
2684 #define pixel_3                                  r4
2685 #define pixel_4                                  r9
2686 #define pixel_5                                  r5
2687 #define pixel_6                                  r10
2688 #define pixel_7                                  r6
2689
2690 #define pixels_a                                 r7
2691 #define pixels_b                                 r9
2692 #define pixels_c                                 r8
2693 #define pixels_d                                 r10
2694
2695 #define c_64                                     r0
2696
2697 #define clut_ptr                                 r12
2698 #define current_texture_mask                     r5
2699 #define dirty_textures_mask                      r6
2700
2701 #define texels                                   d0
2702
2703 #define clut_low_a                               d2
2704 #define clut_low_b                               d3
2705 #define clut_high_a                              d4
2706 #define clut_high_b                              d5
2707
2708 #define clut_a                                   q1
2709 #define clut_b                                   q2
2710
2711 #define texels_low                               d6
2712 #define texels_high                              d7
2712 #define texels_high                              d7
2713
2714 .align 3
2715
/*
 * texture_blocks_untextured
 *
 * Empty texture stage: returns to the caller immediately without reading
 * or writing any register or memory.
 */
2716 function(texture_blocks_untextured)
2717   bx lr
2718
2719
2720 .align 3
2721
/*
 * texture_blocks_4bpp
 *
 * In:   r0 = psx_gpu
 * Out:  nothing returned; the first 16 bytes of every block are rewritten.
 *
 * Reads from psx_gpu: texture_page_ptr, num_blocks (halfword), clut_ptr,
 * current_texture_mask and dirty_textures_4bpp_mask.
 *
 * If (dirty_textures_4bpp_mask & current_texture_mask) is non-zero,
 * update_texture_4bpp_cache is called first (r0 still holds psx_gpu).
 *
 * For each block, starting at psx_gpu + psx_gpu_blocks_offset and stepping
 * 64 bytes:
 *   - four words (eight 16-bit offsets) are loaded from the block start,
 *   - each offset is added to texture_ptr and one byte is fetched,
 *   - the eight bytes are used as VTBL indices into a 16-entry table built
 *     from the 32 bytes at clut_ptr (even bytes in q1, odd bytes in q2),
 *   - the eight low/high byte pairs are interleaved and stored over the
 *     first 16 bytes of the block.
 *
 * r3-r11 and lr are saved and restored. r0, r1, r2, r12, d0, q1, q2, d6 and
 * d7 are modified and not restored.
 *
 * The loop is do-while shaped (decrement, then test), so it assumes
 * num_blocks >= 1 on entry -- TODO confirm against callers.
 */
2722 function(texture_blocks_4bpp)
2723   stmdb sp!, { r3 - r11, r14 }   /* ten registers pushed */
2724   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2725
2726   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2727   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2728
2729   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2730   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]   /* 32 bytes -> q1:q2 */
2731
2732   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
2733   vuzp.u8 clut_a, clut_b   /* clut_a = even-indexed bytes, clut_b = odd-indexed bytes */
2734
2735   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
2736   tst dirty_textures_mask, current_texture_mask
2737
2738   bne 1f   /* a dirty bit overlaps the current mask: take the update path */
2739   mov c_64, #64   /* r0 stops being psx_gpu here; it becomes the block stride */
2740
2741 0:
2742   ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }   /* eight packed 16-bit offsets */
2743
2744   uxtah uv_0, texture_ptr, uv_01            /* texture_ptr + low halfword */
2745   uxtah uv_1, texture_ptr, uv_01, ror #16   /* texture_ptr + high halfword */
2746
2747   uxtah uv_2, texture_ptr, uv_23
2748   uxtah uv_3, texture_ptr, uv_23, ror #16
2749
2750   uxtah uv_4, texture_ptr, uv_45
2751   ldrb pixel_0, [uv_0]
2752
2753   uxtah uv_5, texture_ptr, uv_45, ror #16
2754   ldrb pixel_1, [uv_1]
2755
2756   uxtah uv_6, texture_ptr, uv_67
2757   ldrb pixel_2, [uv_2]
2758
2759   uxtah uv_7, texture_ptr, uv_67, ror #16
2760   ldrb pixel_3, [uv_3]
2761
2762   ldrb pixel_4, [uv_4]
2763   subs num_blocks, num_blocks, #1   /* flags consumed by the bne at the loop end; nothing in between sets flags */
2764
2765   ldrb pixel_5, [uv_5]
2766   orr pixels_a, pixel_0, pixel_1, lsl #8   /* pixels_a collects bytes 0..3 */
2767
2768   ldrb pixel_6, [uv_6]
2769   orr pixels_b, pixel_4, pixel_5, lsl #8   /* pixels_b collects bytes 4..7 */
2770
2771   ldrb pixel_7, [uv_7]
2772   orr pixels_a, pixels_a, pixel_2, lsl #16
2773
2774   orr pixels_b, pixels_b, pixel_6, lsl #16
2775   orr pixels_a, pixels_a, pixel_3, lsl #24
2776
2777   orr pixels_b, pixels_b, pixel_7, lsl #24
2778   vmov texels, pixels_a, pixels_b   /* d0 = eight index bytes */
2779
2780   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels     /* look up in the even-byte table */
2781   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels  /* look up in the odd-byte table */
2782
2783   vst2.u8 { texels_low, texels_high }, [block_ptr, :128], c_64   /* re-interleave, store 16 bytes, block_ptr += 64 */
2784   bne 0b
2785
2786   ldmia sp!, { r3 - r11, pc }
2787
2788 1:
/*
 * Update path. Only block_ptr (r1) and num_blocks (r2) are saved around
 * the call; r0 is reloaded with 64 afterwards.
 * NOTE(review): the loop still needs q1/q2 (the table) and r11
 * (texture_ptr) after the call, so this relies on
 * update_texture_4bpp_cache leaving them intact -- confirm against that
 * routine.
 */
2789   stmdb sp!, { r1 - r2 }  
2790   bl update_texture_4bpp_cache
2791
2792   mov c_64, #64
2793   ldmia sp!, { r1 - r2 }
2794   bal 0b
2795
2796
2797 .align 3
2798
/*
 * texture_blocks_8bpp
 *
 * In:   r0 = psx_gpu
 * Out:  nothing returned; the first 16 bytes of every block are rewritten.
 *
 * Reads from psx_gpu: texture_page_ptr, num_blocks (halfword), clut_ptr,
 * current_texture_mask and dirty_textures_8bpp_mask.
 *
 * If (dirty_textures_8bpp_mask & current_texture_mask) is non-zero,
 * update_texture_8bpp_cache is called first (r0 still holds psx_gpu).
 *
 * For each block, starting at psx_gpu + psx_gpu_blocks_offset and stepping
 * 64 bytes:
 *   - four words (eight 16-bit offsets) are loaded from the block start,
 *   - each offset is added to texture_ptr and one byte is fetched,
 *   - each byte is doubled and used as a byte offset for a halfword load
 *     from clut_ptr,
 *   - the eight halfwords are packed pairwise into four words and stored
 *     over the first 16 bytes of the block.
 *
 * r4-r11 and lr are saved and restored; r1, r2, r3 and r12 are modified.
 * No NEON registers are used.
 *
 * The loop is do-while shaped, so it assumes num_blocks >= 1 on entry --
 * TODO confirm against callers.
 */
2799 function(texture_blocks_8bpp)
2800   push { r4 - r11, lr }   /* nine registers: an odd count, see the note at label 1 */
2801   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2802
2803   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2804   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2805
2806   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2807   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
2808
2809   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset]
2810   tst dirty_textures_mask, current_texture_mask
2811
2812   bne 1f   /* a dirty bit overlaps the current mask: take the update path */
2813   nop
2814
2815 0:
2816   ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }   /* eight packed 16-bit offsets */
2817
2818   uxtah uv_0, texture_ptr, uv_01            /* texture_ptr + low halfword */
2819   uxtah uv_1, texture_ptr, uv_01, ror #16   /* texture_ptr + high halfword */
2820
2821   uxtah uv_2, texture_ptr, uv_23
2822   uxtah uv_3, texture_ptr, uv_23, ror #16
2823
2824   uxtah uv_4, texture_ptr, uv_45
2825   ldrb pixel_0, [uv_0]
2826
2827   uxtah uv_5, texture_ptr, uv_45, ror #16
2828   ldrb pixel_1, [uv_1]
2829
2830   uxtah uv_6, texture_ptr, uv_67
2831   ldrb pixel_2, [uv_2]
2832
2833   uxtah uv_7, texture_ptr, uv_67, ror #16
2834   ldrb pixel_3, [uv_3]
2835
2836   ldrb pixel_4, [uv_4]
2837   add pixel_0, pixel_0, pixel_0   /* index * 2 = byte offset of a halfword entry */
2838
2839   ldrb pixel_5, [uv_5]
2840   add pixel_1, pixel_1, pixel_1
2841
2842   ldrb pixel_6, [uv_6]
2843   add pixel_2, pixel_2, pixel_2
2844
2845   ldrb pixel_7, [uv_7]
2846   add pixel_3, pixel_3, pixel_3
2847
2848   ldrh pixel_0, [clut_ptr, pixel_0]   /* replace the index with the halfword from the table */
2849   add pixel_4, pixel_4, pixel_4
2850
2851   ldrh pixel_1, [clut_ptr, pixel_1]
2852   add pixel_5, pixel_5, pixel_5
2853
2854   ldrh pixel_2, [clut_ptr, pixel_2]
2855   add pixel_6, pixel_6, pixel_6
2856
2857   ldrh pixel_3, [clut_ptr, pixel_3]
2858   add pixel_7, pixel_7, pixel_7
2859
2860   ldrh pixel_4, [clut_ptr, pixel_4]
2861   orr pixels_a, pixel_0, pixel_1, lsl #16   /* r7  = pixels 0,1 */
2862
2863   ldrh pixel_5, [clut_ptr, pixel_5]
2864   orr pixels_c, pixel_2, pixel_3, lsl #16   /* r8  = pixels 2,3 */
2865
2866   ldrh pixel_6, [clut_ptr, pixel_6]
2867   subs num_blocks, num_blocks, #1   /* flags consumed by the bne below; the orr/stm/add in between do not set flags */
2868
2869   ldrh pixel_7, [clut_ptr, pixel_7]
2870   orr pixels_b, pixel_4, pixel_5, lsl #16   /* r9  = pixels 4,5 */
2871
2872   orr pixels_d, pixel_6, pixel_7, lsl #16   /* r10 = pixels 6,7 */
/* stm writes in ascending register order: r7, r8, r9, r10 = pixels 0..7 */
2873   stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d } 
2874
2875   add block_ptr, block_ptr, #64
2876   bne 0b
2877
2878   pop { r4 - r11, pc }
2879
2880 1:
/*
 * Update path: block_ptr (r1), num_blocks (r2) and clut_ptr (r12) are
 * preserved around the call, plus whatever EXTRA_UNSAVED_REGS expands to
 * (defined outside this section).
 */
2881   /* pushing odd num of regs here realigns our unaligned stack */
2882   push { r1 - r2, EXTRA_UNSAVED_REGS r12 }
2883   bl   update_texture_8bpp_cache
2884   pop  { r1 - r2, EXTRA_UNSAVED_REGS r12 }
2885   bal  0b
2886
2887
/*
 * Drop the 4bpp/8bpp aliases and install the set used by
 * texture_blocks_16bpp.
 *
 * Sharing in this set:
 *  - uv_N, u_N and pixel_N all name the same register (r3..r10 for N=0..7).
 *  - v_N shares the register of uv_(N+2): v_0/v_1 = r5/r6, v_2/v_3 = r7/r8,
 *    v_4/v_5 = r9/r10. v_6 is r11 and v_7 is r0, which is also psx_gpu.
 *  - pixels_a..pixels_d are r3, r5, r7, r9 (ascending order).
 *  - texture_ptr moves to r12.
 */
2888 #undef uv_0
2889 #undef uv_1
2890 #undef uv_2
2891 #undef uv_3
2892 #undef uv_4
2893 #undef uv_5
2894 #undef uv_6
2895 #undef uv_7
2896
2897 #undef pixel_0
2898 #undef pixel_1
2899 #undef pixel_2
2900 #undef pixel_3
2901 #undef pixel_4
2902 #undef pixel_5
2903 #undef pixel_6
2904 #undef pixel_7
2905
2906 #undef texture_ptr
2907
2908 #undef pixels_a
2909 #undef pixels_b
2910 #undef pixels_c
2911 #undef pixels_d
2912
2913 #define psx_gpu                                  r0
2914 #define block_ptr                                r1
2915 #define num_blocks                               r2
2916
2917 #define uv_0                                     r3
2918 #define uv_1                                     r4
2919 #define u_0                                      r3
2920 #define u_1                                      r4
2921 #define v_0                                      r5
2922 #define v_1                                      r6
2923
2924 #define uv_2                                     r5
2925 #define uv_3                                     r6
2926 #define u_2                                      r5
2927 #define u_3                                      r6
2928 #define v_2                                      r7
2929 #define v_3                                      r8
2930
2931 #define uv_4                                     r7
2932 #define uv_5                                     r8
2933 #define u_4                                      r7
2934 #define u_5                                      r8
2935 #define v_4                                      r9
2936 #define v_5                                      r10
2937
2938 #define uv_6                                     r9
2939 #define uv_7                                     r10
2940 #define u_6                                      r9
2941 #define u_7                                      r10
2942 #define v_6                                      r11
2943 #define v_7                                      r0
2944
2945 #define pixel_0                                  r3
2946 #define pixel_1                                  r4
2947 #define pixel_2                                  r5
2948 #define pixel_3                                  r6
2949 #define pixel_4                                  r7
2950 #define pixel_5                                  r8
2951 #define pixel_6                                  r9
2952 #define pixel_7                                  r10
2953
2954 #define pixels_a                                 r3
2955 #define pixels_b                                 r5
2956 #define pixels_c                                 r7
2957 #define pixels_d                                 r9
2958
2959 #define texture_ptr                              r12
2960
2961
2962 .align 3
2963
/*
 * texture_blocks_16bpp
 *
 * In:   r0 = psx_gpu
 * Out:  nothing returned; the first 16 bytes of every block are rewritten.
 *
 * Reads num_blocks (halfword) and texture_page_ptr from psx_gpu. There is
 * no dirty-mask check and no call out of this routine.
 *
 * For each block, starting at psx_gpu + psx_gpu_blocks_offset and stepping
 * 64 bytes, eight halfwords uv are read from the block start. For each:
 *     u      = uv & 0xFF
 *     v      = uv & 0xFF00            (the v byte, still shifted left by 8)
 *     offset = (u + (v << 2)) * 2     (= u*2 + (v byte)*2048 bytes)
 *     pixel  = halfword at texture_ptr + offset
 * The eight pixels are packed pairwise into four words and stored over the
 * first 16 bytes of the block.
 *
 * r3-r11 and lr are saved and restored. r0 is reused as v_7 inside the
 * loop, so psx_gpu is lost after the two loads below; r1, r2 and r12 are
 * also modified.
 *
 * The loop is do-while shaped, so it assumes num_blocks >= 1 on entry --
 * TODO confirm against callers.
 */
2964 function(texture_blocks_16bpp)
2965   stmdb sp!, { r3 - r11, r14 }
2966   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2967
2968   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2969   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]   /* last read through psx_gpu */
2970
2971 0:
2972   ldrh uv_0, [block_ptr]
2973   subs num_blocks, num_blocks, #1   /* flags consumed by the bne at the loop end; nothing in between sets flags */
2974
2975   ldrh uv_1, [block_ptr, #2]
2976
2977   and v_0, uv_0, #0xFF00   /* split v first: u_N overwrites uv_N's register */
2978   and v_1, uv_1, #0xFF00
2979
2980   and u_0, uv_0, #0xFF
2981   and u_1, uv_1, #0xFF
2982
2983   add uv_0, u_0, v_0, lsl #2   /* halfword index = u + (v << 2) */
2984   ldrh uv_2, [block_ptr, #4]   /* overwrites v_0's register, already consumed */
2985
2986   add uv_1, u_1, v_1, lsl #2
2987   ldrh uv_3, [block_ptr, #6]
2988
2989   add uv_0, uv_0, uv_0   /* halfword index -> byte offset */
2990   add uv_1, uv_1, uv_1
2991
2992   and v_2, uv_2, #0xFF00
2993   and v_3, uv_3, #0xFF00
2994
2995   and u_2, uv_2, #0xFF
2996   and u_3, uv_3, #0xFF
2997
2998   add uv_2, u_2, v_2, lsl #2
2999   ldrh uv_4, [block_ptr, #8]
3000
3001   add uv_3, u_3, v_3, lsl #2
3002   ldrh uv_5, [block_ptr, #10]
3003
3004   add uv_2, uv_2, uv_2
3005   add uv_3, uv_3, uv_3
3006
3007   and v_4, uv_4, #0xFF00
3008   and v_5, uv_5, #0xFF00
3009
3010   and u_4, uv_4, #0xFF
3011   and u_5, uv_5, #0xFF
3012
3013   add uv_4, u_4, v_4, lsl #2
3014   ldrh uv_6, [block_ptr, #12]
3015
3016   add uv_5, u_5, v_5, lsl #2
3017   ldrh uv_7, [block_ptr, #14]
3018
3019   add uv_4, uv_4, uv_4
3020   ldrh pixel_0, [texture_ptr, uv_0]   /* pixel_N replaces uv_N in the same register */
3021
3022   add uv_5, uv_5, uv_5
3023   ldrh pixel_1, [texture_ptr, uv_1]
3024
3025   and v_6, uv_6, #0xFF00
3026   ldrh pixel_2, [texture_ptr, uv_2]
3027
3028   and v_7, uv_7, #0xFF00   /* v_7 is r0: psx_gpu is overwritten here */
3029   ldrh pixel_3, [texture_ptr, uv_3]
3030
3031   and u_6, uv_6, #0xFF
3032   ldrh pixel_4, [texture_ptr, uv_4]
3033
3034   and u_7, uv_7, #0xFF
3035   ldrh pixel_5, [texture_ptr, uv_5]
3036
3037   add uv_6, u_6, v_6, lsl #2
3038   add uv_7, u_7, v_7, lsl #2
3039
3040   add uv_6, uv_6, uv_6
3041   add uv_7, uv_7, uv_7
3042
3043   orr pixels_a, pixel_0, pixel_1, lsl #16   /* r3 = pixels 0,1 */
3044   orr pixels_b, pixel_2, pixel_3, lsl #16   /* r5 = pixels 2,3 */
3045
3046   ldrh pixel_6, [texture_ptr, uv_6]
3047   orr pixels_c, pixel_4, pixel_5, lsl #16   /* r7 = pixels 4,5 */
3048
3049   ldrh pixel_7, [texture_ptr, uv_7]
3050   orr pixels_d, pixel_6, pixel_7, lsl #16   /* r9 = pixels 6,7 */
3051
3052   stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }   /* r3, r5, r7, r9 -> 16 bytes */
3053   add block_ptr, block_ptr, #64
3054
3055   bne 0b
3056
3057   ldmia sp!, { r3 - r11, pc }
3058
3059
/*
 * Alias reset, then the register set used by the
 * shade_blocks_textured_modulated_* macros that follow.
 *
 * ARM core registers that carry more than one name:
 *  - r0: psx_gpu, then block_ptr_load_a
 *  - r2: color_ptr, colors_scalar, mask_msb_ptr, c_32
 *  - r3: colors_scalar_compare, block_ptr_store
 *  - r14 (lr): fb_ptr
 *
 * NEON registers that overlap (qN is d(2N):d(2N+1)):
 *  - q4:  texels and zero_mask
 *  - q5:  colors_rg         = colors_r (d10) : colors_g (d11)
 *  - q6:  colors_b_dm_bits  = colors_b (d12) : draw_mask_bits (d13)
 *  - q7:  texels_rg         = texels_r (d14) : texels_g (d15)
 *  - q8:  pixels_r and fb_pixels; pixels_r_low (d16) is its low half
 *  - q9:  pixels_g and pixels_gb_low = pixels_g_low (d18) : pixels_b_low (d19)
 *  - q13: msb_mask          = msb_mask_low (d26) : msb_mask_high (d27)
 */
3060 #undef num_blocks
3061
3062 #undef test_mask
3063 #undef texels
3064 #undef pixels_b
3065 #undef pixels
3066 #undef d64_1
3067 #undef d64_4
3068 #undef d64_128
3069 #undef draw_mask
3070 #undef msb_mask
3071 #undef msb_mask_low
3072 #undef msb_mask_high
3073 #undef fb_pixels
3074
3075 #undef c_32
3076 #undef fb_ptr
3077 #undef mask_msb_ptr
3078
3079 #define psx_gpu                                  r0
3080 #define num_blocks                               r1
3081 #define color_ptr                                r2
3082 #define colors_scalar                            r2
3083 #define colors_scalar_compare                    r3
3084 #define mask_msb_ptr                             r2
3085
3086 #define block_ptr_load_a                         r0
3087 #define block_ptr_store                          r3
3088 #define block_ptr_load_b                         r12
3089 #define c_32                                     r2
3090
3091 #define c_48                                     r4
3092 #define fb_ptr                                   r14
3093 #define draw_mask_bits_scalar                    r5
3094
3095 #define d128_0x07                                q0
3096 #define d128_0x1F                                q1
3097 #define d128_0x8000                              q2
3098 #define test_mask                                q3
3099 #define texels                                   q4
3100 #define colors_rg                                q5
3101 #define colors_b_dm_bits                         q6
3102 #define texels_rg                                q7
3103 #define pixels_r                                 q8
3104 #define pixels_g                                 q9
3105 #define pixels_b                                 q10
3106 #define pixels                                   q11
3107 #define zero_mask                                q4
3108 #define draw_mask                                q12
3109 #define msb_mask                                 q13
3110
3111 #define fb_pixels                                q8
3112
3113 #define pixels_gb_low                            q9
3114
3115 #define colors_r                                 d10
3116 #define colors_g                                 d11
3117 #define colors_b                                 d12
3118 #define draw_mask_bits                           d13
3119 #define texels_r                                 d14
3120 #define texels_g                                 d15
3121 #define pixels_r_low                             d16
3122 #define pixels_g_low                             d18
3123 #define pixels_b_low                             d19
3124 #define msb_mask_low                             d26
3125 #define msb_mask_high                            d27
3126
3127 #define d64_1                                    d28
3128 #define d64_4                                    d29
3129 #define d64_128                                  d30
3130 #define texels_b                                 d31
3131
/*
 * Target prologue, indirect: c_48 = 48 and block_ptr_store = address of the
 * first block (psx_gpu + psx_gpu_blocks_offset). Writes r4, so it must run
 * after r4 has been saved.
 */
3132 #define shade_blocks_textured_modulated_prologue_indirect()                    \
3133   mov c_48, #48;                                                               \
3134   add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset                         \
3135
/*
 * Target prologue, direct: loads the 16-bit value at
 * psx_gpu + psx_gpu_mask_msb_offset and replicates it into all eight
 * 16-bit lanes of msb_mask.
 */
3136 #define shade_blocks_textured_modulated_prologue_direct()                      \
3137   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3138   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]            \
3139
3140
/* Shading prologue, shaded: expands to nothing. */
3141 #define shade_blocks_textured_modulated_prologue_shaded(dithering, target)     \
3142   
/*
 * Undithered only: loads the 32-bit word at psx_gpu_triangle_color_offset
 * and, if it equals 0x00808080, branches to
 * shade_blocks_textured_unmodulated_<target> instead of continuing.
 * Uses r2 and r3.
 */
3143 #define shade_blocks_textured_false_modulation_check_undithered(target)        \
3144   ldr colors_scalar, [psx_gpu, #psx_gpu_triangle_color_offset];                \
3145   movw colors_scalar_compare, #0x8080;                                         \
3146                                                                                \
3147   movt colors_scalar_compare, #0x80;                                           \
3148   cmp colors_scalar, colors_scalar_compare;                                    \
3149   beq shade_blocks_textured_unmodulated_##target                               \
3150
/* Dithered: no such check is made; expands to nothing. */
3151 #define shade_blocks_textured_false_modulation_check_dithered(target)          \
3152
/*
 * Shading prologue, unshaded: runs the check selected by `dithering`, then
 * loads the 32-bit triangle colour and broadcasts its byte 0, byte 1 and
 * byte 2 across all eight lanes of colors_r, colors_g and colors_b
 * respectively. colors_r is broadcast last because it is the source of the
 * other two.
 */
3153 #define shade_blocks_textured_modulated_prologue_unshaded(dithering, target)   \
3154   shade_blocks_textured_false_modulation_check_##dithering(target);            \
3155   add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset;                      \
3156   vld1.u32 { colors_r[] }, [color_ptr, :32];                                   \
3157   vdup.u8 colors_g, colors_r[1];                                               \
3158   vdup.u8 colors_b, colors_r[2];                                               \
3159   vdup.u8 colors_r, colors_r[0]                                                \
3160
3161
/*
 * Dithered: preload a 16-bit-lane accumulator (`target`) with the 16 bytes
 * at block_ptr_load_b. The pointer is not advanced, so successive uses load
 * the same 16 bytes.
 */
3162 #define shade_blocks_textured_modulated_load_dithered(target)                  \
3163   vld1.u32 { target }, [block_ptr_load_b, :128]                                \
3164
/* Dithered, final preload: same load, then block_ptr_load_b += c_32 (32). */
3165 #define shade_blocks_textured_modulated_load_last_dithered(target)             \
3166   vld1.u32 { target }, [block_ptr_load_b, :128], c_32                          \
3167
/* Undithered: nothing to preload; expands to nothing. */
3168 #define shade_blocks_textured_modulated_load_undithered(target)                \
3169
/* Undithered, final step: only advance block_ptr_load_b by 32 to stay in step. */
3170 #define shade_blocks_textured_modulated_load_last_undithered(target)           \
3171   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3172
/*
 * Dithered modulate: pixels_<channel> += texels_<channel> * colors_<channel>
 * (8-bit lanes widened to 16-bit), accumulating onto the preloaded value.
 */
3173 #define shade_blocks_textured_modulate_dithered(channel)                       \
3174   vmlal.u8 pixels_##channel, texels_##channel, colors_##channel                \
3175
/*
 * Undithered modulate: pixels_<channel> = texels_<channel> * colors_<channel>
 * (8-bit lanes widened to 16-bit); any previous contents are overwritten.
 */
3176 #define shade_blocks_textured_modulate_undithered(channel)                     \
3177   vmull.u8 pixels_##channel, texels_##channel, colors_##channel                \
3178
3179
/*
 * Indirect: store the 16-byte draw_mask at block_ptr_store and advance the
 * pointer by 16. The `offset` argument is unused.
 */
3180 #define shade_blocks_textured_modulated_store_draw_mask_indirect(offset)       \
3181   vst1.u32 { draw_mask }, [block_ptr_store, :128]!                             \
3182
/*
 * Direct: fetch fb_ptr from [block_ptr_load_b + offset - 64], load 16 bytes
 * from it and copy those bits into `pixels` wherever draw_mask bits are
 * set. `offset` compensates for how far block_ptr_load_b has already moved
 * at the call site. Overwrites q8 (fb_pixels shares it with pixels_r) and
 * r14.
 */
3183 #define shade_blocks_textured_modulated_store_draw_mask_direct(offset)         \
3184   ldr fb_ptr, [block_ptr_load_b, #(offset - 64)];                              \
3185   vld1.u32 { fb_pixels }, [fb_ptr];                                            \
3186   vbit.u16 pixels, fb_pixels, draw_mask                                        \
3187
/* Indirect: store 16 bytes of pixels at block_ptr_store, then advance by c_48 (48). */
3188 #define shade_blocks_textured_modulated_store_pixels_indirect()                \
3189   vst1.u32 { pixels }, [block_ptr_store, :128], c_48                           \
3190
/* Direct: store 16 bytes of pixels at the fb_ptr fetched by the direct draw-mask step. */
3191 #define shade_blocks_textured_modulated_store_pixels_direct()                  \
3192   vst1.u32 { pixels }, [fb_ptr]                                                \
3193
3194
/* Shaded: load colors_r and colors_g (16 bytes) from block_ptr_load_b, then advance by 32. */
3195 #define shade_blocks_textured_modulated_load_rg_shaded()                       \
3196   vld1.u32 { colors_r, colors_g }, [block_ptr_load_b, :128], c_32              \
3197
/* Unshaded: colours are fixed, so only advance block_ptr_load_b by 32. */
3198 #define shade_blocks_textured_modulated_load_rg_unshaded()                     \
3199   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3200
/* Shaded: load colors_b and draw_mask_bits (16 bytes) from block_ptr_load_a, then advance by 32. */
3201 #define shade_blocks_textured_modulated_load_bdm_shaded()                      \
3202   vld1.u32 { colors_b, draw_mask_bits }, [block_ptr_load_a, :128], c_32        \
3203
/*
 * Unshaded: load only the 32-bit word at block_ptr_load_a + 8 (the address
 * where the shaded variant's draw_mask_bits begins) into r5, then advance
 * block_ptr_load_a by 32.
 */
3204 #define shade_blocks_textured_modulated_load_bdm_unshaded()                    \
3205   ldr draw_mask_bits_scalar, [block_ptr_load_a, #8];                           \
3206   add block_ptr_load_a, block_ptr_load_a, #32                                  \
3207
/* Shaded: replicate 16-bit lane 0 of draw_mask_bits into all eight lanes of draw_mask. */
3208 #define shade_blocks_textured_modulated_expand_draw_mask_shaded()              \
3209   vdup.u16 draw_mask, draw_mask_bits[0]                                        \
3210
/* Unshaded: replicate the low 16 bits of r5 into all eight lanes of draw_mask. */
3211 #define shade_blocks_textured_modulated_expand_draw_mask_unshaded()            \
3212   vdup.u16 draw_mask, draw_mask_bits_scalar                                    \
3213
3214
/* Indirect: no msb mask is applied; expands to nothing. */
3215 #define shade_blocks_textured_modulated_apply_msb_mask_indirect()              \
3216
/* Direct: OR msb_mask (set up by the direct prologue) into every pixel lane. */
3217 #define shade_blocks_textured_modulated_apply_msb_mask_direct()                \
3218   vorr.u16 pixels, pixels, msb_mask                                            \
3219
3220
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Emits the function
 *   shade_blocks_<shading>_textured_modulated_<dithering>_<target>
 *
 *   shading   : shaded     - r/g and b colours are loaded from every block
 *               unshaded   - one colour from psx_gpu's triangle colour is
 *                            broadcast once in the prologue
 *   dithering : dithered   - channel accumulators are preloaded from the
 *                            block and the products are added (vmlal)
 *               undithered - plain widening multiply (vmull)
 *   target    : direct     - result is merged with, and stored to, the
 *                            address held in the block (fb_ptr)
 *               indirect   - draw mask and pixels are written back into
 *                            the block itself
 *
 * In: r0 = psx_gpu. Reads num_blocks (halfword) and the 16 bytes at
 * psx_gpu + 0 (test_mask). Blocks start at psx_gpu + psx_gpu_blocks_offset
 * with a 64-byte stride.
 *
 * Block bytes as read by this code:
 *    +0  texels, eight 16-bit values
 *   +16  colors_r, colors_g (8 bytes each)          [shaded only]
 *   +32  colors_b (8 bytes)                         [shaded only]
 *   +40  draw mask bits (low 16 bits are used)
 *   +44  fb_ptr                                     [direct only]
 *   +48  16 bytes preloaded into r, g and b alike   [dithered only]
 * Block bytes written (indirect only): +0 draw_mask, +16 pixels.
 *
 * Per-pixel arithmetic:
 *   r = texel & 0x1F, g = (texel >> 5) & 0x1F, b = (texel >> 10) & 0x1F
 *   each channel: acc = [preload +] channel * colour; saturating >> 4 to
 *   8 bits, then keep the top 5 bits
 *   pixel = (texel & 0x8000) [| msb_mask, direct] | r | g << 5 | b << 10
 *   (the vmlal by 1, 4 and 128 performs the final packing)
 *   draw_mask lane = all ones if (mask bits & test_mask) != 0 or texel == 0
 *
 * The code is software pipelined: block 0 is computed before label 0, each
 * loop iteration packs and stores block n while computing block n+1, and
 * label 1 packs and stores the last block. Note zero_mask overwrites
 * texels (both are q4) once the texel msb has been extracted.
 *
 * r4, r5 and lr are saved and restored; lr is reused as fb_ptr. r0-r3 and
 * r12 are modified.
 *
 * NOTE(review): q4-q7 are written here, and save_abi_regs() /
 * restore_abi_regs() currently expand to nothing (their vpush/vpop form is
 * under #if 0), so those registers are not preserved -- confirm callers
 * tolerate that.
 * NOTE(review): for unshaded + undithered the prologue can branch away to
 * shade_blocks_textured_unmodulated_<target> after save_abi_regs() but
 * before the stmdb. That is harmless while save_abi_regs() is empty, but
 * looks like it would leave the stack unbalanced if it were enabled --
 * confirm before turning it on.
 * Assumes num_blocks >= 1 (the count is decremented before it is tested)
 * -- TODO confirm against callers.
 */
3221 #define shade_blocks_textured_modulated_builder(shading, dithering, target)    \
3222 .align 3;                                                                      \
3223                                                                                \
3224 function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
3225   save_abi_regs();                                                             \
3226   shade_blocks_textured_modulated_prologue_##shading(dithering, target);       \
3227   stmdb sp!, { r4 - r5, lr };                                                  \
3228   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3229                                                                                \
3230   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
3231                                                                                \
3232   shade_blocks_textured_modulated_prologue_##target();                         \
3233                                                                                \
3234   add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset;                       \
3235   mov c_32, #32;                                                               \
3236                                                                                \
3237   add block_ptr_load_b, block_ptr_load_a, #16;                                 \
3238   vmov.u8 d64_1, #1;                                                           \
3239   vmov.u8 d64_4, #4;                                                           \
3240   vmov.u8 d64_128, #128;                                                       \
3241                                                                                \
3242   vld1.u32 { texels }, [block_ptr_load_a, :128], c_32;                         \
3243   vmov.u8 d128_0x07, #0x07;                                                    \
3244                                                                                \
3245   shade_blocks_textured_modulated_load_rg_##shading();                         \
3246   vmov.u8 d128_0x1F, #0x1F;                                                    \
3247                                                                                \
3248   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3249   vmov.u16 d128_0x8000, #0x8000;                                               \
3250                                                                                \
3251   vmovn.u16 texels_r, texels;                                                  \
3252   vshrn.u16 texels_g, texels, #5;                                              \
3253                                                                                \
3254   vshrn.u16 texels_b, texels, #7;                                              \
3255   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3256                                                                                \
3257   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3258   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3259                                                                                \
3260   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3261   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3262                                                                                \
3263   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3264   vshr.u8 texels_b, texels_b, #3;                                              \
3265                                                                                \
3266   shade_blocks_textured_modulate_##dithering(r);                               \
3267   shade_blocks_textured_modulate_##dithering(g);                               \
3268   shade_blocks_textured_modulate_##dithering(b);                               \
3269                                                                                \
3270   vand.u16 pixels, texels, d128_0x8000;                                        \
3271   vceq.u16 zero_mask, texels, #0;                                              \
3272                                                                                \
3273   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3274   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3275   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3276                                                                                \
3277   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3278   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3279   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3280   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3281                                                                                \
3282   subs num_blocks, num_blocks, #1;                                             \
3283   beq 1f;                                                                      \
3284                                                                                \
3285  .align 3;                                                                     \
3286                                                                                \
3287  0:                                                                            \
3288   vld1.u32 { texels }, [block_ptr_load_a, :128], c_32;                         \
3289   shade_blocks_textured_modulated_load_rg_##shading();                         \
3290   vshrn.u16 texels_g, texels, #5;                                              \
3291                                                                                \
3292   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3293   vshrn.u16 texels_b, texels, #7;                                              \
3294                                                                                \
3295   pld [block_ptr_load_a];                                                      \
3296   vmovn.u16 texels_r, texels;                                                  \
3297   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3298                                                                                \
3299   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3300   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3301   shade_blocks_textured_modulated_store_draw_mask_##target(-4);                \
3302                                                                                \
3303   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3304   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3305                                                                                \
3306   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3307   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3308                                                                                \
3309   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3310   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3311                                                                                \
3312   shade_blocks_textured_modulated_store_pixels_##target();                     \
3313   vshr.u8 texels_b, texels_b, #3;                                              \
3314                                                                                \
3315   shade_blocks_textured_modulate_##dithering(r);                               \
3316   shade_blocks_textured_modulate_##dithering(g);                               \
3317   shade_blocks_textured_modulate_##dithering(b);                               \
3318                                                                                \
3319   vand.u16 pixels, texels, d128_0x8000;                                        \
3320   vceq.u16 zero_mask, texels, #0;                                              \
3321                                                                                \
3322   subs num_blocks, num_blocks, #1;                                             \
3323                                                                                \
3324   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3325   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3326   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3327                                                                                \
3328   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3329   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3330   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3331   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3332                                                                                \
3333   bne 0b;                                                                      \
3334                                                                                \
3335  1:                                                                            \
3336   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3337   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3338   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3339                                                                                \
3340   shade_blocks_textured_modulated_store_draw_mask_##target(28);                \
3341   shade_blocks_textured_modulated_store_pixels_##target();                     \
3342                                                                                \
3343   ldmia sp!, { r4 - r5, lr };                                                  \
3344   restore_abi_regs();                                                          \
3345   bx lr                                                                        \
3346
3347
/*
 * Instantiate shade_blocks_textured_modulated_builder once for every
 * combination of shading (shaded / unshaded), dithering (dithered /
 * undithered) and target (direct / indirect).
 */
3348 shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3349 shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3350 shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3351 shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3352
3353 shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3354 shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3355 shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3356 shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3357
3358
/*
 * Register aliases for the textured-unmodulated and unshaded-untextured
 * shade_blocks_* routines that follow. Names that are rebound below are
 * undefined first.
 */
3359 #undef c_64
3360 #undef fb_ptr
3361 #undef color_ptr
3362
3363 #undef color_r
3364 #undef color_g
3365 #undef color_b
3366
3367 #undef test_mask
3368 #undef pixels
3369 #undef draw_mask
3370 #undef zero_mask
3371 #undef fb_pixels
3372 #undef msb_mask
3373 #undef msb_mask_low
3374 #undef msb_mask_high
3375
/*
 * Core registers. Several aliases share one physical register:
 *   r0  psx_gpu / block_ptr_load
 *   r2  mask_msb_ptr / c_64
 *   r3  color_ptr / draw_mask_store_ptr / fb_ptr
 *   r12 draw_mask_bits_ptr / draw_mask_ptr
 *   r14 pixel_store_ptr / fb_ptr_next
 * so a routine must finish using one alias before it writes another
 * alias of the same register.
 */
3376 #define psx_gpu                                  r0
3377 #define num_blocks                               r1
3378 #define mask_msb_ptr                             r2
3379 #define color_ptr                                r3
3380
3381 #define block_ptr_load                           r0
3382 #define draw_mask_store_ptr                      r3
3383 #define draw_mask_bits_ptr                       r12
3384 #define draw_mask_ptr                            r12
3385 #define pixel_store_ptr                          r14
3386
3387 #define fb_ptr_cmp                               r4
3388
3389 #define fb_ptr                                   r3
3390 #define fb_ptr_next                              r14
3391
3392 #define c_64                                     r2
3393
/* NEON quad registers (8 x 16-bit lanes each). */
3394 #define test_mask                                q0
3395 #define pixels                                   q1
3396 #define draw_mask                                q2
3397 #define zero_mask                                q3
3398 #define draw_mask_combined                       q4
3399 #define fb_pixels                                q5
3400 #define fb_pixels_next                           q6
3401 #define msb_mask                                 q7
3402
/* Double-register halves of draw_mask (q2) and msb_mask (q7). */
3403 #define draw_mask_low                            d4
3404 #define draw_mask_high                           d5
3405 #define msb_mask_low                             d14
3406 #define msb_mask_high                            d15
3407
3408 .align 3
/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * In:  r0 = psx_gpu
 *
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset. For every block:
 *   - the 8 halfwords at block + 0 are loaded and copied unchanged to
 *     block + 16;
 *   - the halfword at block + 40 is broadcast to all lanes and tested
 *     against test_mask (the 16 bytes at the start of psx_gpu);
 *   - that result is ORed with a per-lane "halfword == 0" mask and the
 *     combined mask is written to block + 0.
 *
 * The loop is software pipelined: the combined mask of block n is stored
 * while block n + 1 is already being loaded, so the mask of the last
 * block is stored after the loop at label 1.
 *
 * The first block is processed before the count is tested, so the code
 * expects num_blocks >= 1.
 *
 * Writes r0-r3, r12, r14 and q0-q4. r4 is saved and restored;
 * save_abi_regs()/restore_abi_regs() decide whether q4-q7 are preserved.
 */
3409 function(shade_blocks_textured_unmodulated_indirect)
3410   stmdb sp!, { r4, r14 }
3411   save_abi_regs()
3412   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3413
3414   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3415   add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3416
3417   vld1.u32 { test_mask }, [psx_gpu, :128]
3418   add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset
3419
3420   mov c_64, #64
       /* block_ptr_load is r0: psx_gpu is no longer usable after this add. */
3421   add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3422
       /* Prologue: load block 0, build its masks, store its pixels. */
3423   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3424   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3425    [draw_mask_bits_ptr, :16], c_64
3426   vceq.u16 zero_mask, pixels, #0
3427
3428   vtst.u16 draw_mask, draw_mask, test_mask
3429   vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64
3430
3431   subs num_blocks, num_blocks, #1
3432   beq 1f
3433
3434  0:
       /* Load block n + 1 while finishing the mask of block n. */
3435   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3436   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3437
3438   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3439    [draw_mask_bits_ptr, :16], c_64
3440   vceq.u16 zero_mask, pixels, #0
3441
3442   vtst.u16 draw_mask, draw_mask, test_mask
3443   vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64
3444
3445   vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
3446   subs num_blocks, num_blocks, #1
3447
3448   bne 0b
3449
3450  1:
       /* Epilogue: the mask of the last block is still pending. */
3451   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3452   vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
3453
3454   restore_abi_regs()
3455   ldmia sp!, { r4, pc }
3456
3457
3458 .align 3
3459
/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In:  r0 = psx_gpu
 *
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset, and writes each one straight to the
 * framebuffer. For every block:
 *   - pixels    = the 8 halfwords at block + 0, ORed with msb_mask (the
 *                 halfword at psx_gpu + psx_gpu_mask_msb_offset, broadcast);
 *   - draw mask = (broadcast halfword at block + 40, tested against
 *                 test_mask) OR (pixel == 0, tested before msb_mask is
 *                 applied);
 *   - the 8 halfwords at the pointer stored at block + 44 are loaded,
 *     lanes whose draw mask is clear are replaced by pixels, and the
 *     result is stored back.
 *
 * The loop is software pipelined: the framebuffer row of block n + 1 is
 * loaded before the row of block n is stored. When the two rows are at
 * most 14 bytes apart (they overlap) that order would read stale data,
 * so label 4 performs the store first and the load afterwards.
 *
 * The last block is finished at label 1. It gets the same msb_mask OR as
 * the blocks handled inside the loop; previously the epilogue skipped it,
 * so the last (or only) block was written without the forced mask bit.
 *
 * The first block is processed before the count is tested, so the code
 * expects num_blocks >= 1.
 *
 * Writes r0-r3, r12, r14 and q0-q7. r4 is saved and restored;
 * save_abi_regs()/restore_abi_regs() decide whether q4-q7 are preserved.
 */
3460 function(shade_blocks_textured_unmodulated_direct)
3461   stmdb sp!, { r4, r14 }
3462   save_abi_regs()
3463   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3464
3465   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3466   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3467
       /* mask_msb_ptr and c_64 are both r2: load through it before the mov. */
3468   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
3469   mov c_64, #64
3470
3471   vld1.u32 { test_mask }, [psx_gpu, :128]
       /* block_ptr_load is r0: psx_gpu is no longer usable after this add. */
3472   add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3473
       /* Prologue: fetch block 0 and its framebuffer row. */
3474   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3475    [draw_mask_bits_ptr, :16], c_64
3476   ldr fb_ptr_next, [block_ptr_load, #44]
3477
3478   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3479   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3480   vceq.u16 zero_mask, pixels, #0
3481   vtst.u16 draw_mask, draw_mask, test_mask
3482
3483   subs num_blocks, num_blocks, #1
3484   beq 1f
3485
3486  0:
3487   mov fb_ptr, fb_ptr_next
3488   ldr fb_ptr_next, [block_ptr_load, #44]
3489
3490   vorr.u16 pixels, pixels, msb_mask
3491
3492   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3493   vmov fb_pixels, fb_pixels_next
3494
3495   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3496    [draw_mask_bits_ptr, :16], c_64
       /* Keep framebuffer lanes where the mask is set, take pixels elsewhere. */
3497   vbif.u16 fb_pixels, pixels, draw_mask_combined
3498
3499   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3500   pld [fb_ptr_next, #64]
3501
       /* Unsigned (delta + 14) <= 28  <=>  -14 <= delta <= 14: rows overlap. */
3502   add fb_ptr_cmp, fb_ptr_cmp, #14
3503   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3504
3505   cmp fb_ptr_cmp, #28
3506   bls 4f
3507
3508   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3509   vceq.u16 zero_mask, pixels, #0
3510
3511   vst1.u16 { fb_pixels }, [fb_ptr]
3512   vtst.u16 draw_mask, draw_mask, test_mask
3513
3514  3:
3515   subs num_blocks, num_blocks, #1
3516   bne 0b
3517
3518  1:
       /* Epilogue: blend and store the last block. */
3519   vorr.u16 draw_mask_combined, draw_mask, zero_mask
       /* zero_mask is already computed, so forcing the mask bit is safe here. */
       vorr.u16 pixels, pixels, msb_mask
3520   vbif.u16 fb_pixels_next, pixels, draw_mask_combined
3521
3522   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
3523
3524   restore_abi_regs()
3525   ldmia sp!, { r4, pc }
3526
3527  4:
       /* Overlapping rows: store block n first, then load the row of n + 1. */
3528   vst1.u16 { fb_pixels }, [fb_ptr]
3529   vceq.u16 zero_mask, pixels, #0
3530
3531   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3532   vtst.u16 draw_mask, draw_mask, test_mask
3533
3534   bal 3b
3535
3536
/*
 * shade_blocks_unshaded_untextured_indirect
 *
 * No-op: returns to the caller immediately without reading or writing
 * any block data.
 */
3537 function(shade_blocks_unshaded_untextured_indirect)
3538   bx lr
3539
3540 .align 3
3541
/*
 * shade_blocks_unshaded_untextured_direct
 *
 * In:  r0 = psx_gpu
 *
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset, and writes a flat colour straight to
 * the framebuffer.
 *   - pixels is loaded once, from block 0 + 16, ORed with msb_mask (the
 *     broadcast halfword at psx_gpu + psx_gpu_mask_msb_offset) and then
 *     reused for every block;
 *   - per block, the 16-byte draw mask at block + 0 and the framebuffer
 *     pointer at block + 44 are read; framebuffer lanes whose mask is
 *     clear are replaced by pixels and the row is stored back.
 *
 * The loop is software pipelined: the framebuffer row of block n + 1 is
 * loaded before the row of block n is stored. When the two rows are at
 * most 14 bytes apart (they overlap), label 4 stores first and loads
 * afterwards instead.
 *
 * The first block is fetched before the count is tested, so the code
 * expects num_blocks >= 1.
 *
 * Writes r0-r3, r12, r14 and q1, q2, q5-q7. r4 is saved and restored;
 * save_abi_regs()/restore_abi_regs() decide whether q4-q7 are preserved.
 */
3542 function(shade_blocks_unshaded_untextured_direct)
3543   stmdb sp!, { r4, r14 }
3544   save_abi_regs()
3545   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
3546
3547   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3548   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3549
3550   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
3551   add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3552
       /* block_ptr_load is r0: psx_gpu is no longer usable after this add. */
3553   add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
       /* color_ptr shares r3 with fb_ptr; this is its only use. */
3554   vld1.u16 { pixels }, [color_ptr, :128]
3555
       /* c_64 shares r2 with mask_msb_ptr, which has been consumed above. */
3556   mov c_64, #64
3557   vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64
3558
3559   vorr.u16 pixels, pixels, msb_mask
3560   subs num_blocks, num_blocks, #1
3561
3562   ldr fb_ptr_next, [block_ptr_load], #64
3563
       /* vld1 does not alter the flags, so beq still tests the subs above. */
3564   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3565   beq 1f
3566
3567  0:
3568   vmov fb_pixels, fb_pixels_next
3569   mov fb_ptr, fb_ptr_next
3570   ldr fb_ptr_next, [block_ptr_load], #64
3571
       /* Keep framebuffer lanes where the mask is set, take pixels elsewhere. */
3572   vbif.u16 fb_pixels, pixels, draw_mask
3573   vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64
3574
       /* Unsigned (delta + 14) <= 28  <=>  -14 <= delta <= 14: rows overlap. */
3575   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3576   add fb_ptr_cmp, fb_ptr_cmp, #14
3577   cmp fb_ptr_cmp, #28
3578   bls 4f
3579
3580   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3581   vst1.u16 { fb_pixels }, [fb_ptr]
3582
3583  3:
3584   subs num_blocks, num_blocks, #1
3585   bne 0b
3586
3587  1:
       /* Epilogue: blend and store the last block. */
3588   vbif.u16 fb_pixels_next, pixels, draw_mask
3589   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
3590
3591   restore_abi_regs()
3592   ldmia sp!, { r4, pc }
3593
3594  4:
       /* Overlapping rows: store block n first, then load the row of n + 1. */
3595   vst1.u16 { fb_pixels }, [fb_ptr]
3596   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3597   bal 3b
3598
3599
/*
 * Register aliases for the blend_blocks_* builders that follow. Names
 * that are rebound below are undefined first.
 */
3600 #undef draw_mask_ptr
3601 #undef c_64
3602 #undef fb_ptr
3603 #undef fb_ptr_next
3604 #undef fb_ptr_cmp
3605
/*
 * Core registers. draw_mask_ptr shares r0 with psx_gpu, and c_64 shares
 * r2 with msb_mask_ptr, so the builders read everything they need through
 * the first alias before writing the second.
 * NOTE(review): the builders visible below address the mask MSB through
 * mask_msb_ptr (an earlier r2 alias that is not undefined here), not
 * through msb_mask_ptr.
 */
3606 #define psx_gpu                                  r0
3607 #define num_blocks                               r1
3608 #define msb_mask_ptr                             r2
3609 #define pixel_ptr                                r3
3610 #define draw_mask_ptr                            r0
3611 #define c_64                                     r2
3612 #define fb_ptr                                   r12
3613 #define fb_ptr_next                              r14
3614 #define fb_ptr_cmp                               r4
3615
3616 #undef msb_mask
3617 #undef draw_mask
3618 #undef pixels
3619 #undef fb_pixels
3620 #undef d128_0x8000
3621 #undef msb_mask_low
3622 #undef msb_mask_high
3623 #undef draw_mask_next
3624 #undef pixels_g
3625 #undef blend_pixels
3626 #undef fb_pixels_next
3627
/* NEON quad registers: base set, as used by the average builders. */
3628 #define msb_mask                                 q0
3629 #define draw_mask                                q1
3630 #define pixels                                   q2
3631 #define fb_pixels                                q3
3632 #define blend_pixels                             q4
3633 #define pixels_no_msb                            q5
3634 #define blend_mask                               q6
3635 #define fb_pixels_no_msb                         q7
3636 #define d128_0x8000                              q8
3637 #define d128_0x0421                              q9
3638 #define fb_pixels_next                           q10
3639 #define blend_pixels_next                        q11
3640 #define pixels_next                              q12
3641 #define draw_mask_next                           q13
3642 #define write_mask                               q14
3643
/*
 * Alternative names that overlay q5-q13 (plus q15). The add builders below
 * use pixels_rb, pixels_mg, d128_0x7C1F, d128_0x03E0, d128_0x83E0,
 * fb_pixels_rb, fb_pixels_g and fb_pixels_masked; the remaining names are
 * presumably for blend modes defined later in the file. A routine must
 * not mix overlapping names from the two sets while both are live.
 */
3644 #define pixels_rb                                q5
3645 #define pixels_mg                                q7
3646 #define pixels_g                                 q7
3647 #define d128_0x7C1F                              q8
3648 #define d128_0x03E0                              q9
3649 #define fb_pixels_rb                             q10
3650 #define fb_pixels_g                              q11
3651 #define fb_pixels_masked                         q11
3652 #define d128_0x83E0                              q15
3653 #define pixels_fourth                            q7
3654 #define d128_0x1C07                              q12
3655 #define d128_0x00E0                              q13
3656 #define d128_0x80E0                              q13
3657
/* Double-register halves of msb_mask (q0). */
3658 #define msb_mask_low                             d0
3659 #define msb_mask_high                            d1
3660
/*
 * Hooks pasted into blend_blocks_average_builder, selected by its
 * texturing (textured / untextured) and mask_evaluate (on / off)
 * arguments. An empty definition means that variant needs no code.
 */

/* blend_mask lane = all ones where source has bit 15 set (signed < 0). */
3661 #define blend_blocks_average_set_blend_mask_textured(source)                   \
3662   vclt.s16 blend_mask, source, #0                                              \
3663
/* Set bit 15 in every lane of the blended result. */
3664 #define blend_blocks_average_set_stp_bit_textured()                            \
3665   vorr.u16 blend_pixels, #0x8000                                               \
3666
/* Lanes whose blend_mask is clear take source instead of the blend. */
3667 #define blend_blocks_average_combine_textured(source)                          \
3668   vbif.u16 blend_pixels, source, blend_mask                                    \
3669   
/* Untextured: every lane is blended, so these three hooks are empty. */
3670 #define blend_blocks_average_set_blend_mask_untextured(source)                 \
3671
3672 #define blend_blocks_average_set_stp_bit_untextured()                          \
3673
3674 #define blend_blocks_average_combine_untextured(source)                        \
3675
/* write_mask lane = all ones where the framebuffer pixel has bit 15 set. */
3676 #define blend_blocks_average_mask_set_on()                                     \
3677   vclt.s16 write_mask, fb_pixels_next, #0                                      \
3678
/* Loop: current draw mask = pending draw mask OR write_mask. */
3679 #define blend_blocks_average_mask_copy_on()                                    \
3680   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
3681
/* Epilogue: fold write_mask into the pending draw mask in place. */
3682 #define blend_blocks_average_mask_copy_b_on()                                  \
3683   vorr.u16 draw_mask_next, draw_mask_next, write_mask                          \
3684
/* Mask evaluation off: no write_mask, the draw mask is used as loaded. */
3685 #define blend_blocks_average_mask_set_off()                                    \
3686
3687 #define blend_blocks_average_mask_copy_off()                                   \
3688   vmov draw_mask, draw_mask_next                                               \
3689
3690 #define blend_blocks_average_mask_copy_b_off()                                 \
3691
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>.
 *
 * In:  r0 = psx_gpu
 *
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset. Per block it reads the 16-byte draw
 * mask at block + 0, the 8 source halfwords at block + 16 and the
 * framebuffer pointer at block + 44 (pixel_ptr + 28), then stores
 *
 *   blend = (fb_no_msb + (src_no_msb - ((src ^ fb) & 0x0421))) >> 1
 *
 * ORed with msb_mask, into the framebuffer lanes whose draw mask is clear.
 * 0x0421 has bits 0, 5 and 10 set; removing those differing low bits
 * before the halving add keeps one field from borrowing into its
 * neighbour. The texturing hooks make lanes without bit 15 in the source
 * take the source pixel unblended; the mask_evaluate hooks additionally
 * protect framebuffer lanes that already have bit 15 set.
 *
 * The loop is software pipelined: data for block n + 1 is prepared while
 * block n is stored. When the two framebuffer rows are at most 14 bytes
 * apart (they overlap), label 2 stores block n first and only then
 * reloads and re-derives everything for block n + 1. That path now also
 * refreshes write_mask; it used to leave the mask computed from the
 * previous row, so the "on" variants tested stale mask bits for the next
 * block.
 *
 * The first block is fetched before the count is tested, so the code
 * expects num_blocks >= 1.
 *
 * Writes r0-r3, r12, r14 and q0-q14. r4 is saved and restored;
 * save_abi_regs()/restore_abi_regs() decide whether q4-q7 are preserved.
 */
3692 #define blend_blocks_average_builder(texturing, mask_evaluate)                 \
3693 .align 3;                                                                      \
3694                                                                                \
3695 function(blend_blocks_##texturing##_average_##mask_evaluate)                   \
3696   stmdb sp!, { r4, r14 };                                                      \
3697   save_abi_regs();                                                             \
3698   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3699   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3700                                                                                \
3701   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3702   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
3703                                                                                \
       /* draw_mask_ptr is r0 and c_64 is r2: psx_gpu/mask_msb_ptr end here. */ \
3704   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3705   mov c_64, #64;                                                               \
3706                                                                                \
3707   vmov.u16 d128_0x8000, #0x8000;                                               \
3708   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
3709   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3710                                                                                \
3711   vmov.u16 d128_0x0421, #0x0400;                                               \
3712   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
3713                                                                                \
3714   vorr.u16 d128_0x0421, #0x0021;                                               \
3715   vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
3716                                                                                \
3717   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3718   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3719   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3720   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3721   blend_blocks_average_mask_set_##mask_evaluate();                             \
3722   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3723                                                                                \
3724   subs num_blocks, num_blocks, #1;                                             \
3725   beq 1f;                                                                      \
3726                                                                                \
3727  0:                                                                            \
3728   mov fb_ptr, fb_ptr_next;                                                     \
3729   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3730                                                                                \
3731   vmov pixels, pixels_next;                                                    \
3732   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
3733                                                                                \
3734   vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
3735                                                                                \
3736   blend_blocks_average_mask_copy_##mask_evaluate();                            \
3737   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
3738                                                                                \
3739   blend_blocks_average_set_blend_mask_##texturing(pixels);                     \
3740   blend_blocks_average_set_stp_bit_##texturing();                              \
3741   vmov fb_pixels, fb_pixels_next;                                              \
3742   blend_blocks_average_combine_##texturing(pixels);                            \
3743                                                                                \
       /* Unsigned (delta + 14) <= 28 <=> -14 <= delta <= 14: rows overlap. */  \
3744   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3745   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3746   cmp fb_ptr_cmp, #28;                                                         \
3747   bls 2f;                                                                      \
3748                                                                                \
3749   vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
3750   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3751                                                                                \
3752   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3753   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3754                                                                                \
3755   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3756   vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
3757                                                                                \
3758   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3759   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3760   blend_blocks_average_mask_set_##mask_evaluate();                             \
3761   vst1.u16 { fb_pixels }, [fb_ptr];                                            \
3762                                                                                \
3763  3:                                                                            \
3764   subs num_blocks, num_blocks, #1;                                             \
3765   bne 0b;                                                                      \
3766                                                                                \
3767  1:                                                                            \
3768   blend_blocks_average_mask_copy_b_##mask_evaluate();                          \
3769   vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
3770                                                                                \
3771   blend_blocks_average_set_blend_mask_##texturing(pixels_next);                \
3772   blend_blocks_average_set_stp_bit_##texturing();                              \
3773   blend_blocks_average_combine_##texturing(pixels_next);                       \
3774                                                                                \
3775   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3776   vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next;                       \
3777   vst1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
3778                                                                                \
3779   restore_abi_regs();                                                          \
3780   ldmia sp!, { r4, pc };                                                       \
3781                                                                                \
3782  2:                                                                            \
       /* Overlapping rows: store block n before touching the row of n + 1. */  \
3783   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3784   vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
3785   vst1.u16 { fb_pixels }, [fb_ptr];                                            \
3786                                                                                \
3787   vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
       /* Refresh write_mask from the reloaded framebuffer pixels. */           \
       blend_blocks_average_mask_set_##mask_evaluate();                         \
3788   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3789   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3790   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3791   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3792   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3793                                                                                \
3794   bal 3b                                                                       \
3795
/*
 * Emit the four average-blend routines:
 *   blend_blocks_textured_average_off, blend_blocks_untextured_average_off,
 *   blend_blocks_textured_average_on,  blend_blocks_untextured_average_on.
 */
3796 blend_blocks_average_builder(textured, off)
3797 blend_blocks_average_builder(untextured, off)
3798 blend_blocks_average_builder(textured, on)
3799 blend_blocks_average_builder(untextured, on)
3800
3801
/*
 * Hooks pasted into the blend_blocks_add_* builders, selected by their
 * mask_evaluate (on / off) argument. The "off" variants are empty.
 */

/* write_mask lane = all ones where the framebuffer pixel has bit 15 set. */
3802 #define blend_blocks_add_mask_set_on()                                         \
3803   vclt.s16 write_mask, fb_pixels, #0                                           \
3804
/* Fold write_mask into the draw mask of the block just loaded. */
3805 #define blend_blocks_add_mask_copy_on()                                        \
3806   vorr.u16 draw_mask, draw_mask, write_mask                                    \
3807
3808 #define blend_blocks_add_mask_set_off()                                        \
3809
3810 #define blend_blocks_add_mask_copy_off()                                       \
3811
3812
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>.
 *
 * In:  r0 = psx_gpu
 *
 * Walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset. Per block it reads the 16-byte draw
 * mask at block + 0, the 8 source halfwords at block + 16 and the
 * framebuffer pointer at block + 44 (pixel_ptr + 28).
 *
 * The framebuffer pixel only takes part in lanes whose source pixel has
 * bit 15 set (blend_mask); elsewhere it is zeroed, so the sum is just the
 * source. The sum is formed in two halves and clamped:
 *   - bits selected by 0x7C1F, clamped per byte with vmin.u8;
 *   - bits selected by 0x03E0 plus source bit 15 (after msb_mask has been
 *     ORed in), clamped to 0x83E0 with vmin.u16.
 * Lanes whose draw mask is set keep the original framebuffer pixel.
 *
 * The loop is software pipelined: the sum for block n + 1 is prepared
 * while block n is stored. When the two framebuffer rows are at most 14
 * bytes apart (they overlap), label 2 stores block n first and reloads
 * the row of block n + 1 afterwards.
 *
 * The pointer delta used for that overlap test is computed once per
 * iteration; a second, identical sub that recomputed it has been dropped.
 *
 * The first block is fetched before the count is tested, so the code
 * expects num_blocks >= 1.
 *
 * Writes r0-r3, r12, r14 and q0-q11, q14, q15. r4 is saved and restored;
 * save_abi_regs()/restore_abi_regs() decide whether q4-q7 are preserved.
 */
3813 #define blend_blocks_add_textured_builder(mask_evaluate)                       \
3814 .align 3;                                                                      \
3815                                                                                \
3816 function(blend_blocks_textured_add_##mask_evaluate)                            \
3817   stmdb sp!, { r4, r14 };                                                      \
3818   save_abi_regs();                                                             \
3819   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3820   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3821                                                                                \
3822   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3823   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
3824                                                                                \
       /* draw_mask_ptr is r0 and c_64 is r2: psx_gpu/mask_msb_ptr end here. */ \
3825   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3826   mov c_64, #64;                                                               \
3827                                                                                \
       /* Build 0x7C1F, 0x03E0 and 0x83E0 from encodable immediates. */         \
3828   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3829   vmov.u16 d128_0x03E0, #0x0300;                                               \
3830   vmov.u16 d128_0x83E0, #0x8000;                                               \
3831   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3832   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3833   vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0;                              \
3834                                                                                \
3835   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3836   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3837   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3838   vclt.s16 blend_mask, pixels, #0;                                             \
3839   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3840   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3841   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3842                                                                                \
3843   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3844   vorr.u16 pixels, pixels, msb_mask;                                           \
3845   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3846   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
3847   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3848   vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
3849   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3850   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
3851   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3852   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
3853                                                                                \
3854   subs num_blocks, num_blocks, #1;                                             \
3855   beq 1f;                                                                      \
3856                                                                                \
3857  0:                                                                            \
3858   mov fb_ptr, fb_ptr_next;                                                     \
3859                                                                                \
3860   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3861                                                                                \
3862   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3863   vclt.s16 blend_mask, pixels, #0;                                             \
3864                                                                                \
3865   vorr.u16 pixels, pixels, msb_mask;                                           \
3866   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3867   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
3868                                                                                \
3869   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3870   pld [fb_ptr_next, #64];                                                      \
3871                                                                                \
       /* Lanes whose draw mask is set keep the old framebuffer pixel. */       \
3873   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3874                                                                                \
       /* Unsigned (delta + 14) <= 28 <=> -14 <= delta <= 14: rows overlap. */  \
3875   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3876   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3877                                                                                \
3878   cmp fb_ptr_cmp, #28;                                                         \
3879   bls 2f;                                                                      \
3880                                                                                \
3881   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3882   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3883   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3884   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3885   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3886   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3887   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3888                                                                                \
3889  3:                                                                            \
3890   vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
3891   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3892   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
3893   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3894   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
3895                                                                                \
3896   subs num_blocks, num_blocks, #1;                                             \
3897   bne 0b;                                                                      \
3898                                                                                \
3899  1:                                                                            \
3900   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3901   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3902   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
3903                                                                                \
3904   restore_abi_regs();                                                          \
3905   ldmia sp!, { r4, pc };                                                       \
3906                                                                                \
3907  2:                                                                            \
       /* Overlapping rows: store block n before loading the row of n + 1. */   \
3908   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3909   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3910                                                                                \
3911   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3912   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3913   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3914   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3915   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3916   bal 3b                                                                       \
3917
3918
/*
 * blend_blocks_add_untextured_builder(mask_evaluate)
 *
 * Emits blend_blocks_untextured_add_<mask_evaluate>.  Entry: r0 = psx_gpu.
 *
 * The function walks num_blocks block records of 64 bytes each, starting at
 * psx_gpu + psx_gpu_blocks_offset.  As read here, a record holds a 16-byte
 * draw mask at +0, eight 16-bit pixels at +16 and a framebuffer pointer at
 * +44.  For every block the 0x7C1F and 0x03E0 bit fields of the source
 * pixels are added to the same fields of the framebuffer pixels and clamped
 * (vmin.u8 clamps the two bytes of the 0x7C1F fields independently).  The
 * result is ORed with msb_mask, framebuffer bits are put back wherever
 * draw_mask is set, and the block is written to the framebuffer.
 *
 * The loop is software pipelined: label 0 finishes block N while block N+1
 * is being loaded, label 2 is taken when the two destinations are within
 * 14 bytes of each other so that the store happens before the reload, and
 * label 1 flushes the last block and returns.
 *
 * blend_blocks_add_mask_set_<mask_evaluate> and
 * blend_blocks_add_mask_copy_<mask_evaluate> are hooks defined earlier in
 * this file.  Block 0 is processed before the count is tested, so a count
 * of zero is not handled (the subs would wrap and the loop would continue).
 */
3919 #define blend_blocks_add_untextured_builder(mask_evaluate)                     \
3920 .align 3;                                                                      \
3921   /* Entry: r0 = psx_gpu.  r4 and lr are saved and popped again at label 1. */ \
3922 function(blend_blocks_untextured_add_##mask_evaluate)                          \
3923   stmdb sp!, { r4, r14 };                                                      \
3924   save_abi_regs();                                                             \
3925   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3926   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3927   /* pixel_ptr -> block 0 pixels (+16); msb_mask = mask_msb in every lane. */  \
3928   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3929   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
3930   /* draw_mask_ptr -> block 0 draw mask (+0); c_64 = record stride. */         \
3931   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3932   mov c_64, #64;                                                               \
3933   /* Build the 0x7C1F and 0x03E0 field masks in every 16-bit lane. */          \
3934   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3935   vmov.u16 d128_0x03E0, #0x0300;                                               \
3936   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3937   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3938   /* Prologue: load block 0 (mask, fb pointer at +44, pixels, fb pixels). */   \
3939   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3940   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3941   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3942   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3943   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3944   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3945   /* Add the two field groups separately, then clamp each with vmin. */        \
3946   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3947   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3948   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3949   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3950   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3951   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3952   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3953   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3954   /* A single block skips the loop and goes straight to the final store. */    \
3955   subs num_blocks, num_blocks, #1;                                             \
3956   beq 1f;                                                                      \
3957                                                                                \
3958  0: /* Loop: finish block N while starting block N+1. */                       \
3959   mov fb_ptr, fb_ptr_next;                                                     \
3960   /* fb_ptr = destination of block N, fb_ptr_next = that of block N+1. */      \
3961   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3962                                                                                \
3963   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3964   /* Recombine the fields of block N and OR in msb_mask. */                    \
3965   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3966   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3967   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3968   /* Where draw_mask bits are set, keep the old framebuffer bits. */           \
3969   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3970   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3971   /* Destinations within +/-14 bytes (presumably overlapping): go to 2. */     \
3972   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3973   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3974   cmp fb_ptr_cmp, #28;                                                         \
3975   bls 2f;                                                                      \
3976   /* Otherwise load the next fb pixels before storing block N. */              \
3977   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3978   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3979   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3980   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3981   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3982   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3983                                                                                \
3984  3: /* Shared tail: add and clamp the fields of block N+1. */                  \
3985   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3986   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3987   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3988   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3989   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3990                                                                                \
3991   subs num_blocks, num_blocks, #1;                                             \
3992   bne 0b;                                                                      \
3993                                                                                \
3994  1: /* Epilogue: combine, mask and store the last block. */                    \
3995   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3996   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3997   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3998   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
3999                                                                                \
4000   restore_abi_regs();                                                          \
4001   ldmia sp!, { r4, pc };                                                       \
4002                                                                                \
4003  2: /* Overlap path: store block N before reading the next fb pixels. */       \
4004   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4005   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
4006                                                                                \
4007   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4008   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4009   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4010   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4011   bal 3b                                                                       \
4012
4013
/* Expand the textured and untextured add builders once for each
 * mask_evaluate setting (off, on). */
4014 blend_blocks_add_textured_builder(off)
4015 blend_blocks_add_textured_builder(on)
4016 blend_blocks_add_untextured_builder(off)
4017 blend_blocks_add_untextured_builder(on)
4018
/* Textured hooks pasted into blend_blocks_subtract_builder(textured, ...). */
/* blend_mask = all ones in every lane where bit 15 of pixels_next is set. */
4019 #define blend_blocks_subtract_set_blend_mask_textured()                        \
4020   vclt.s16 blend_mask, pixels_next, #0                                         \
4021
/* Where blend_mask bits are clear, replace the blended result with pixels. */
4022 #define blend_blocks_subtract_combine_textured()                               \
4023   vbif.u16 blend_pixels, pixels, blend_mask                                    \
4024
/* Set bit 15 in every lane of the blended result. */
4025 #define blend_blocks_subtract_set_stp_textured()                               \
4026   vorr.u16 blend_pixels, #0x8000                                               \
4027
/* pixels = pixels_next with msb_mask ORed in. */
4028 #define blend_blocks_subtract_msb_mask_textured()                              \
4029   vorr.u16 pixels, pixels_next, msb_mask                                       \
4030
/* Untextured hooks pasted into blend_blocks_subtract_builder(untextured, ...).
 * set_blend_mask, combine and msb_mask expand to nothing; set_stp ORs
 * msb_mask into the blended result. */
4031 #define blend_blocks_subtract_set_blend_mask_untextured()                      \
4032
4033 #define blend_blocks_subtract_combine_untextured()                             \
4034
4035 #define blend_blocks_subtract_set_stp_untextured()                             \
4036   vorr.u16 blend_pixels, blend_pixels, msb_mask                                \
4037
4038 #define blend_blocks_subtract_msb_mask_untextured()                            \
4039
4040
/* mask_evaluate hooks pasted into blend_blocks_subtract_builder. */
/* on: write_mask = all ones in lanes where the fb pixel has bit 15 set. */
4041 #define blend_blocks_subtract_mask_set_on()                                    \
4042   vclt.s16 write_mask, fb_pixels, #0                                           \
4043
/* on: draw_mask = draw_mask_next ORed with write_mask. */
4044 #define blend_blocks_subtract_mask_copy_on()                                   \
4045   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
4046
/* off: no write mask is computed. */
4047 #define blend_blocks_subtract_mask_set_off()                                   \
4048
/* off: draw_mask is a plain copy of draw_mask_next. */
4049 #define blend_blocks_subtract_mask_copy_off()                                  \
4050   vmov draw_mask, draw_mask_next                                               \
4051
4052
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_subtract_<mask_evaluate>.  Entry:
 * r0 = psx_gpu.
 *
 * The function walks num_blocks block records of 64 bytes each, starting at
 * psx_gpu + psx_gpu_blocks_offset.  As read here, a record holds a 16-byte
 * draw mask at +0, eight 16-bit pixels at +16 and a framebuffer pointer at
 * +44.  For every block the 0x7C1F and 0x03E0 bit fields of the source
 * pixels are subtracted from the same fields of the framebuffer pixels with
 * unsigned saturation (vqsub.u8 treats the two bytes of the 0x7C1F fields
 * independently).  The texturing-specific hooks
 * (blend_blocks_subtract_*_<texturing>) then finish the result, framebuffer
 * bits are put back wherever draw_mask is set, and the block is written to
 * the framebuffer.
 *
 * The loop is software pipelined: values loaded for block N+1 live in
 * pixels_next / draw_mask_next while block N is completed at label 0.
 * Label 2 is taken when the two destinations are within 14 bytes of each
 * other so that the store happens before the reload; label 1 flushes the
 * last block and returns.
 *
 * Block 0 is processed before the count is tested, so a count of zero is
 * not handled (the subs would wrap and the loop would continue).
 */
4053 #define blend_blocks_subtract_builder(texturing, mask_evaluate)                \
4054 .align 3;                                                                      \
4055   /* Entry: r0 = psx_gpu.  r4 and lr are saved and popped again at label 1. */ \
4056 function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
4057   stmdb sp!, { r4, r14 };                                                      \
4058   save_abi_regs();                                                             \
4059   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4060   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4061   /* pixel_ptr -> block 0 pixels (+16); msb_mask = mask_msb in every lane. */  \
4062   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4063   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4064   /* draw_mask_ptr -> block 0 draw mask (+0); c_64 = record stride. */         \
4065   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4066   mov c_64, #64;                                                               \
4067   /* Build the 0x7C1F and 0x03E0 field masks in every 16-bit lane. */          \
4068   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4069   vmov.u16 d128_0x03E0, #0x0300;                                               \
4070   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4071   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4072   /* Prologue: load block 0 and subtract its fields from the fb pixels. */     \
4073   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
4074   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4075   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
4076   blend_blocks_subtract_set_blend_mask_##texturing();                          \
4077   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4078   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4079   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4080   /* vqsub saturates at zero: per byte for 0x7C1F, per lane for 0x03E0. */     \
4081   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4082   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4083   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4084   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4085   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4086   /* A single block skips the loop and goes straight to the final store. */    \
4087   subs num_blocks, num_blocks, #1;                                             \
4088   beq 1f;                                                                      \
4089                                                                                \
4090  0: /* Loop: finish block N while starting block N+1. */                       \
4091   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4092   mov fb_ptr, fb_ptr_next;                                                     \
4093   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4094   /* Block N takes over the "next" values; then block N+1 is fetched. */       \
4095   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
4096   blend_blocks_subtract_msb_mask_##texturing();                                \
4097   /* Build the result for block N; extract the fields of block N+1. */         \
4098   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
4099   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4100   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4101   blend_blocks_subtract_set_stp_##texturing();                                 \
4102   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4103   blend_blocks_subtract_combine_##texturing();                                 \
4104   blend_blocks_subtract_set_blend_mask_##texturing();                          \
4105   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4106   /* Destinations within +/-14 bytes (presumably overlapping): go to 2. */     \
4107   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4108   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4109   cmp fb_ptr_cmp, #28;                                                         \
4110   bls 2f;                                                                      \
4111   /* Otherwise load the next fb pixels before storing block N. */              \
4112   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4113   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4114   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4115   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4116   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4117   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4118   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4119                                                                                \
4120  3: /* Shared tail: count down and loop while blocks remain. */                \
4121   subs num_blocks, num_blocks, #1;                                             \
4122   bne 0b;                                                                      \
4123                                                                                \
4124  1: /* Epilogue: combine, mask and store the last block. */                    \
4125   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4126                                                                                \
4127   blend_blocks_subtract_msb_mask_##texturing();                                \
4128   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4129   blend_blocks_subtract_set_stp_##texturing();                                 \
4130   blend_blocks_subtract_combine_##texturing();                                 \
4131   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4132   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4133                                                                                \
4134   restore_abi_regs();                                                          \
4135   ldmia sp!, { r4, pc };                                                       \
4136                                                                                \
4137  2: /* Overlap path: store block N before reading the next fb pixels. */       \
4138   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4139   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4140   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4141   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4142   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4143   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4144   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4145   bal 3b                                                                       \
4146
4147
/* Expand the subtract builder for every combination of texturing
 * (textured, untextured) and mask_evaluate (off, on). */
4148 blend_blocks_subtract_builder(textured, off)
4149 blend_blocks_subtract_builder(textured, on)
4150 blend_blocks_subtract_builder(untextured, off)
4151 blend_blocks_subtract_builder(untextured, on)
4152
4153
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_fourth_<mask_evaluate>.  Entry:
 * r0 = psx_gpu.
 *
 * The function walks num_blocks block records of 64 bytes each, starting at
 * psx_gpu + psx_gpu_blocks_offset.  As read here, a record holds a 16-byte
 * draw mask at +0, eight 16-bit pixels at +16 and a framebuffer pointer at
 * +44.  Each source pixel is shifted right by two and masked with 0x1C07 /
 * 0x00E0, which leaves a quarter of every bit field; those values are added
 * to the 0x7C1F / 0x03E0 fields of the framebuffer pixels and clamped
 * (vmin.u8 clamps the two bytes of the 0x7C1F fields independently).
 *
 * Bit 15 is then set in the blended result, and lanes whose source pixel has
 * bit 15 clear (blend_mask false) take the source pixel instead.  msb_mask
 * is ORed in, framebuffer bits are put back wherever draw_mask is set, and
 * the block is written to the framebuffer.
 *
 * The loop is software pipelined: label 0 finishes block N while block N+1
 * is being loaded, label 2 is taken when the two destinations are within
 * 14 bytes of each other so that the store happens before the reload, and
 * label 1 flushes the last block and returns.
 *
 * blend_blocks_add_mask_set_<mask_evaluate> and
 * blend_blocks_add_mask_copy_<mask_evaluate> are hooks defined earlier in
 * this file.  Block 0 is processed before the count is tested, so a count
 * of zero is not handled (the subs would wrap and the loop would continue).
 */
4154 #define blend_blocks_add_fourth_textured_builder(mask_evaluate)                \
4155 .align 3;                                                                      \
4156   /* Entry: r0 = psx_gpu.  r4 and lr are saved and popped again at label 1. */ \
4157 function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
4158   stmdb sp!, { r4, r14 };                                                      \
4159   save_abi_regs();                                                             \
4160   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4161   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4162   /* pixel_ptr -> block 0 pixels (+16); msb_mask = mask_msb in every lane. */  \
4163   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4164   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4165   /* draw_mask_ptr -> block 0 draw mask (+0); c_64 = record stride. */         \
4166   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4167   mov c_64, #64;                                                               \
4168   /* 0x7C1F/0x03E0 select fb fields; 0x1C07/0x00E0 the source fields >> 2. */  \
4169   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4170   vmov.u16 d128_0x03E0, #0x0300;                                               \
4171   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4172   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4173   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4174   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4175   vorr.u16 d128_0x1C07, #0x0007;                                               \
4176   /* Prologue: load block 0; blend_mask = lanes with source bit 15 set. */     \
4177   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4178   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4179   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4180   vclt.s16 blend_mask, pixels, #0;                                             \
4181   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4182   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4183   vshr.s16 pixels_fourth, pixels, #2;                                          \
4184   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4185   /* Add a quarter of each source field to the fb fields, then clamp. */       \
4186   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4187   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4188   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4189   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4190   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4191   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4192   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4193   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4194   /* A single block skips the loop and goes straight to the final store. */    \
4195   subs num_blocks, num_blocks, #1;                                             \
4196   beq 1f;                                                                      \
4197                                                                                \
4198  0: /* Loop: finish block N while starting block N+1. */                       \
4199   mov fb_ptr, fb_ptr_next;                                                     \
4200   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4201   /* Finish block N: set bit 15, unblended lanes take the source pixel. */     \
4202   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4203   vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
4204   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4205   /* Fetch block N+1 pixels and derive its blend mask and quarter values. */   \
4206   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4207   vclt.s16 blend_mask, pixels, #0;                                             \
4208   vshr.s16 pixels_fourth, pixels, #2;                                          \
4209   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4210   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4211   /* Where draw_mask bits are set, keep the old framebuffer bits. */           \
4212   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4213   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4214   /* Destinations within +/-14 bytes (presumably overlapping): go to 2. */     \
4215   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4216   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4217   cmp fb_ptr_cmp, #28;                                                         \
4218   bls 2f;                                                                      \
4219   /* Otherwise load the next fb pixels before storing block N. */              \
4220   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4221   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4222   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4223   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4224   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4225   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4226                                                                                \
4227  3: /* Shared tail: add and clamp the fields of block N+1. */                  \
4228   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4229   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4230   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4231   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4232   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4233                                                                                \
4234   subs num_blocks, num_blocks, #1;                                             \
4235   bne 0b;                                                                      \
4236                                                                                \
4237  1: /* Epilogue: combine, mask and store the last block. */                    \
4238   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4239   vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
4240   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4241   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4242   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4243   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4244                                                                                \
4245   restore_abi_regs();                                                          \
4246   ldmia sp!, { r4, pc };                                                       \
4247                                                                                \
4248  2: /* Overlap path: store block N before reading the next fb pixels. */       \
4249   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4250   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4251                                                                                \
4252   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4253   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4254   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4255   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4256   bal 3b                                                                       \
4257
4258
4259
4260 #define blend_blocks_add_fourth_untextured_builder(mask_evaluate)              \
4261 .align 3;                                                                      \
4262                                                                                \
4263 function(blend_blocks_untextured_add_fourth_##mask_evaluate)                   \
4264   stmdb sp!, { r4, r14 };                                                      \
4265   save_abi_regs();                                                             \
4266   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4267   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4268                                                                                \
4269   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4270   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4271                                                                                \
4272   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4273   mov c_64, #64;                                                               \
4274                                                                                \
4275   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4276   vmov.u16 d128_0x03E0, #0x0300;                                               \
4277   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4278   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4279   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4280   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4281   vorr.u16 d128_0x1C07, #0x0007;                                               \
4282                                                                                \
4283   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4284   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4285   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4286   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4287   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4288   vshr.s16 pixels_fourth, pixels, #2;                                          \
4289   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4290                                                                                \
4291   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4292   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4293   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4294   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4295   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4296   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4297   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4298   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4299                                                                                \
4300   subs num_blocks, num_blocks, #1;                                             \
4301   beq 1f;                                                                      \
4302                                                                                \
4303  0:                                                                            \
4304   mov fb_ptr, fb_ptr_next;                                                     \
4305   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4306                                                                                \
4307   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4308                                                                                \
4309   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4310   vshr.s16 pixels_fourth, pixels, #2;                                          \
4311   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4312   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4313                                                                                \
4314   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4315   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4316                                                                                \
4317   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4318   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4319   cmp fb_ptr_cmp, #28;                                                         \
4320   bls 2f;                                                                      \
4321                                                                                \
4322   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4323   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4324   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4325   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4326   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4327   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4328                                                                                \
4329  3:                                                                            \
4330   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4331   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4332   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4333   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4334   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4335                                                                                \
4336   subs num_blocks, num_blocks, #1;                                             \
4337   bne 0b;                                                                      \
4338                                                                                \
4339  1:                                                                            \
4340   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4341   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4342   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4343   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4344                                                                                \
4345   restore_abi_regs();                                                          \
4346   ldmia sp!, { r4, pc };                                                       \
4347                                                                                \
4348  2:                                                                            \
4349   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4350   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4351                                                                                \
4352   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4353   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4354   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4355   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4356   bal 3b                                                                       \
4357
4358
// Emit the add-fourth blend routines: each builder macro is expanded once
// with the argument "off" and once with "on".
4359 blend_blocks_add_fourth_textured_builder(off)
4360 blend_blocks_add_fourth_textured_builder(on)
4361 blend_blocks_add_fourth_untextured_builder(off)
4362 blend_blocks_add_fourth_untextured_builder(on)
4363
4364 // TODO: Optimize this more. A scene that actually uses this path is needed
4365 // to confirm any change.
4366
4367 .align 3
4368
// blend_blocks_textured_unblended_on(psx_gpu in r0)
//
// Writes every queued render block to the framebuffer without blending.
// Blocks are read from psx_gpu + psx_gpu_blocks_offset at a 64-byte stride;
// per block the code reads a draw mask at offset 0, the pixels at offset 16
// and the framebuffer pointer at offset 44 (pixel_ptr + 28).
// A framebuffer lane is replaced by (pixel | msb_mask) only where its
// draw_mask bits are clear and the existing framebuffer halfword does not
// already have bit 15 set (vclt.s16 ... #0 builds that write mask).
//
// The loop is peeled: the data for the next block is loaded at the end of
// the iteration that stored the previous one, and the last block is
// finished at label 1. There is no check for num_blocks == 0; the first
// block is loaded unconditionally and the counter would wrap.
// Saves/restores r4 and returns by popping the saved lr into pc.
4369 function(blend_blocks_textured_unblended_on)         
4370   stmdb sp!, { r4, r14 }
4371   save_abi_regs()
4372   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
4373   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
4374
4375   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
  // Broadcast the 16-bit mask_msb value to every lane of msb_mask.
4376   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
4377
4378   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
4379   mov c_64, #64
4380
  // Prime the pipeline with the first block; both pointers post-increment
  // by c_64 (one block).
4381   ldr fb_ptr, [pixel_ptr, #28]
4382   vld1.u16 { fb_pixels }, [fb_ptr]
4383   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
4384   vclt.s16 write_mask, fb_pixels, #0
4385   vld1.u32 { pixels }, [pixel_ptr, :128], c_64
4386
4387   subs num_blocks, num_blocks, #1
4388   beq 1f
4389
4390  0:
  // Merge and store the current block: vbif keeps fb_pixels bits where the
  // combined mask is set and takes the pixels bits where it is clear.
4391   vorr.u16 pixels, pixels, msb_mask
4392   vorr.u16 draw_mask, draw_mask, write_mask
4393   vbif.u16 fb_pixels, pixels, draw_mask
4394   vst1.u16 { fb_pixels }, [fb_ptr]
4395
  // Load the next block.
4396   ldr fb_ptr, [pixel_ptr, #28]
4397   vld1.u16 { fb_pixels }, [fb_ptr]
4398   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
4399   vclt.s16 write_mask, fb_pixels, #0
4400   vld1.u32 { pixels }, [pixel_ptr, :128], c_64
4401
4402   subs num_blocks, num_blocks, #1
4403   bne 0b
4404  
4405  1:
  // Final block: same merge and store, no further loads.
4406   vorr.u16 pixels, pixels, msb_mask
4407   vorr.u16 draw_mask, draw_mask, write_mask
4408   vbif.u16 fb_pixels, pixels, draw_mask
4409   vst1.u16 { fb_pixels }, [fb_ptr]
4410
4411   restore_abi_regs()
4412   ldmia sp!, { r4, pc }
4413
4414
// blend_blocks_textured_unblended_off: does nothing and returns immediately.
4415 function(blend_blocks_textured_unblended_off)
4416   bx lr
4417
4418
// warmup(r0 = iteration count, r1 = pointer)
//
// Performs r0 vector loads from r1 (with a :128 alignment hint), advancing
// r1 by 64 bytes after each one; the loaded data is not used. Returns at
// once when r0 is 0.
// Clobbers r0, r1, r3, flags and the u_whole_8 / v_whole_8 NEON registers.
// NOTE(review): presumably this exists to touch memory ahead of use;
// confirm against the callers.
4419 function(warmup)
4420   mov r3, #64
4421   cmp r0, #0
4422   bxeq lr
4423
4424  0:
4425   vld1.u32 { u_whole_8, v_whole_8 }, [r1, :128], r3
4426
4427   subs r0, r0, #1
4428   bne 0b
4429
4430   bx lr
4431
// Release any earlier bindings of these alias names before the aliases for
// the sprite setup code are defined below.
4432 #undef vram_ptr
4433 #undef color
4434 #undef x
4435 #undef y
4436 #undef width
4437 #undef height
4438 #undef fb_ptr
4439 #undef texture_mask
4440 #undef num_blocks
4441 #undef temp
4442 #undef dirty_textures_mask
4443 #undef clut_ptr
4444 #undef current_texture_mask
4445
// Core-register aliases for the sprite setup code. The groups below bind
// several names to the same physical register (for example r4 is v,
// num_blocks, left_block_mask and dirty_textures_mask), so at any point in
// the code only one of the names sharing a register holds a live value.
4446 #define psx_gpu                                           r0
4447 #define x                                                 r1
4448 #define y                                                 r2
4449 #define u                                                 r3
4450 #define v                                                 r4
4451 #define width                                             r5
4452 #define height                                            r6
4453 #define offset_u                                          r8
4454 #define offset_v                                          r9
4455 #define offset_u_right                                    r10
4456 #define width_rounded                                     r11
4457 #define height_rounded                                    r12
4458
// Names used by the tile / column emitting macros.
4459 #define texture_offset_base                               r1
4460 #define tile_width                                        r2
4461 #define tile_height                                       r3
4462 #define num_blocks                                        r4
4463 #define block                                             r5
4464 #define sub_tile_height                                   r6
4465 #define fb_ptr                                            r7
4466 #define texture_mask                                      r8
4467 #define column_data                                       r9
4468 #define texture_offset                                    r10
4469 #define tiles_remaining                                   r11
4470 #define fb_ptr_advance_column                             r12
4471 #define texture_block_ptr                                 r14
4472
// temp shares r14 with texture_block_ptr (and with the link register).
4473 #define temp                                              r14
4474
// Note: texture_page_ptr shares r3 with tile_height and u.
4475 #define texture_page_ptr                                  r3
4476 #define left_block_mask                                   r4
4477 #define right_block_mask                                  r5
4478 #define texture_mask_rev                                  r10
4479 #define control_mask                                      r11
4480
// Names used by the setup_sprite_tiled_initialize_* macros.
4481 #define dirty_textures_mask                               r4
4482 #define clut_ptr                                          r5
4483 #define current_texture_mask                              r6
4484
4485
// Release earlier bindings of the NEON alias names that are redefined below.
4486 #undef texels
4487 #undef clut_low_a
4488 #undef clut_low_b
4489 #undef clut_high_a
4490 #undef clut_high_b
4491 #undef clut_a
4492 #undef clut_b
4493 #undef texels_low
4494 #undef texels_high
4495
// NEON register aliases for the sprite setup code. A q register overlays
// two d registers (qN = d2N : d2N+1), so draw_masks_fb_ptrs (q1) is the pair
// draw_mask_fb_ptr_left / _right (d2 / d3).
4496 #define texels                                            d0
4497 #define draw_masks_fb_ptrs                                q1
4498
4499 #define draw_mask_fb_ptr_left                             d2
4500 #define draw_mask_fb_ptr_right                            d3
4501
// Alternative naming: here d2 / d3 are the two left registers and d10 / d11
// (q5, draw_masks_fb_ptrs2) the two right ones. Not used by the macros
// visible in this part of the file.
4502 #define draw_mask_fb_ptr_left_a                           d2
4503 #define draw_mask_fb_ptr_left_b                           d3
4504 #define draw_mask_fb_ptr_right_a                          d10
4505 #define draw_mask_fb_ptr_right_b                          d11
4506 #define draw_masks_fb_ptrs2                               q5
4507
// d views of clut_a (q2 = d4 : d5) and clut_b (q3 = d6 : d7); used as the
// two 16-entry vtbl tables.
4508 #define clut_low_a                                        d4
4509 #define clut_low_b                                        d5
4510 #define clut_high_a                                       d6
4511 #define clut_high_b                                       d7
4512
4513 #define block_masks                                       d8
4514 #define block_masks_shifted                               d9
4515
4516 #define clut_a                                            q2
4517 #define clut_b                                            q3
4518
4519 #define texels_low                                        d12
4520 #define texels_high                                       d13
4521
// texels_wide (q7) overlays texels_wide_low / _high (d14 / d15).
4522 #define texels_wide_low                                   d14
4523 #define texels_wide_high                                  d15
4524 #define texels_wide                                       q7
4525
4526 .align 3
4527
// setup_sprite_flush_blocks: flush the queued render blocks in the middle of
// sprite setup and rewind block to the start of the block buffer.
//
// r0 - r3, r12 (plus EXTRA_UNSAVED_REGS) and lr go on the stack around the
// call. q1 - q3 are parked in psx_gpu's saved_tmp area; block (r5) serves as
// the pointer to that area both before and after the call, so it relies on
// flush_render_block_buffer preserving r5.
// On return block = psx_gpu + psx_gpu_blocks_offset.
4528 setup_sprite_flush_blocks:
4529   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
4530   add block, psx_gpu, #psx_gpu_saved_tmp_offset
4531   vstm block, { q1 - q3 }
4532   bl flush_render_block_buffer
4533   vldm block, { q1 - q3 }
4534   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
4535
4536   add block, psx_gpu, #psx_gpu_blocks_offset
4537   bx r14
4538
4539
// Trampoline around update_texture_4bpp_cache: keeps r0 - r4 intact for the
// caller and returns by loading the saved lr straight into pc.
4540 setup_sprite_update_texture_4bpp_cache:
4541   stmdb sp!, { r0 - r4, r14 }
4542   bl update_texture_4bpp_cache
4543   ldmia sp!, { r0 - r4, pc }
4544
4545
// Trampoline around update_texture_8bpp_cache: keeps r0 - r4 and the
// EXTRA_UNSAVED_REGS intact for the caller and returns by loading the saved
// lr straight into pc.
4546 setup_sprite_update_texture_8bpp_cache:
4547   stmdb sp!, { r0 - r4, EXTRA_UNSAVED_REGS r14 }
4548   bl update_texture_8bpp_cache
4549   ldmia sp!, { r0 - r4, EXTRA_UNSAVED_REGS pc }
4550
4551
/*
 * setup_sprite_tiled_initialize_4bpp(): prepare CLUT tables and texture
 * cache for 4bpp sprite setup.
 *
 * Loads 32 bytes from clut_ptr into clut_a / clut_b and de-interleaves them
 * with vuzp.u8, so clut_a ends up with the even-indexed bytes and clut_b
 * with the odd-indexed bytes. If current_texture_mask and the 4bpp dirty
 * mask share a set bit, the texture cache update trampoline is called.
 * The flags set by tst are consumed by blne after the vuzp; the NEON
 * instruction in between does not touch them.
 * Clobbers r4 - r6 (the three alias registers), flags, and lr when the call
 * is taken.
 */
4552 #define setup_sprite_tiled_initialize_4bpp()                                   \
4553   ldr dirty_textures_mask,                                                     \
4554    [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset];                        \
4555   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset];                           \
4556                                                                                \
4557   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset];   \
4558   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128];                               \
4559                                                                                \
4560   tst current_texture_mask, dirty_textures_mask;                               \
4561   vuzp.u8 clut_a, clut_b;                                                      \
4562                                                                                \
4563   blne setup_sprite_update_texture_4bpp_cache                                  \
4564
/*
 * setup_sprite_tiled_initialize_8bpp(): call the 8bpp texture cache update
 * trampoline when current_texture_mask and the 8bpp dirty mask share a set
 * bit. No CLUT is loaded in this variant.
 * Clobbers r4, r6, flags, and lr when the call is taken.
 */
4565 #define setup_sprite_tiled_initialize_8bpp()                                   \
4566   ldr dirty_textures_mask,                                                     \
4567    [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset];                        \
4568   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset];   \
4569                                                                                \
4570   tst current_texture_mask, dirty_textures_mask;                               \
4571   blne setup_sprite_update_texture_8bpp_cache                                  \
4572
4573
4572
4573
/*
 * Operand fragments for setup_sprite_tile_add_blocks: the number of blocks
 * a tile adds. single = sub_tile_height (one block per row).
 */
4574 #define setup_sprite_block_count_single()                                      \
4575   sub_tile_height                                                              \
4576
/* double = sub_tile_height * 2 (two blocks per row), via a shifted operand. */
4577 #define setup_sprite_block_count_double()                                      \
4578   sub_tile_height, lsl #1                                                      \
4579
/*
 * setup_sprite_tile_add_blocks(type): reserve the blocks for the next tile.
 * type is single or double and selects the block count operand.
 *
 * Adds the count to num_blocks. If the sum is greater than MAX_BLOCKS
 * (signed compare), num_blocks is reset to just this tile's count and
 * setup_sprite_flush_blocks is called, which also rewinds block.
 * Clobbers flags, and lr when the flush is taken.
 */
4580 #define setup_sprite_tile_add_blocks(type)                                     \
4581   add num_blocks, num_blocks, setup_sprite_block_count_##type();               \
4582   cmp num_blocks, #MAX_BLOCKS;                                                 \
4583                                                                                \
4584   movgt num_blocks, setup_sprite_block_count_##type();                         \
4585   blgt setup_sprite_flush_blocks                                               \
4586
4587
/*
 * setup_sprite_tile_full_4bpp(edge): emit render blocks for one tile that is
 * two 8-texel blocks wide. The edge argument is not used by this variant.
 *
 * Reserves 2 * sub_tile_height blocks, then loops sub_tile_height times at
 * label 4. Each iteration produces a left and a right block:
 *  - 8 texel bytes are loaded from
 *    texture_page_ptr + (texture_offset & texture_mask), or from
 *    texture_offset + 8 for the right block;
 *  - vtbl looks every byte up in the clut_low and clut_high tables and
 *    vst2.u8 interleaves the two results into 16 bytes at block + 0;
 *  - draw_mask_fb_ptr_left / _right, whose 32-bit lane 1 was just set to
 *    fb_ptr, is stored at block + 40;
 *  - block advances by 64 in total (40 + 24).
 * Per row texture_offset advances by 0x10 and fb_ptr by 2048 (16 for the
 * right block, then 2048 - 16). After the loop texture_offset is advanced
 * by a further 0xF00 and num_blocks is stored back into psx_gpu.
 * Clobbers texture_block_ptr (r14), texels, texels_low / _high and flags;
 * sub_tile_height is 0 on exit.
 */
4588 #define setup_sprite_tile_full_4bpp(edge)                                      \
4589   setup_sprite_tile_add_blocks(double);                                        \
4590                                                                                \
4591  4:                                                                            \
4592   and texture_block_ptr, texture_offset, texture_mask;                         \
4593   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4594                                                                                \
4595   pld [fb_ptr];                                                                \
4596   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4597   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4598                                                                                \
4599   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4600   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4601                                                                                \
4602   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4603   add texture_block_ptr, texture_offset, #8;                                   \
4604                                                                                \
4605   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4606   add block, block, #40;                                                       \
4607                                                                                \
4608   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4609   add fb_ptr, fb_ptr, #16;                                                     \
4610                                                                                \
4611   vst1.u32 { draw_mask_fb_ptr_left }, [block, :64];                            \
4612   add block, block, #24;                                                       \
4613                                                                                \
4614   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4615   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4616                                                                                \
4617   pld [fb_ptr];                                                                \
4618   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4619   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4620                                                                                \
4621   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4622   add block, block, #40;                                                       \
4623                                                                                \
4624   add texture_offset, texture_offset, #0x10;                                   \
4625   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4626                                                                                \
4627   vst1.u32 { draw_mask_fb_ptr_right }, [block, :64];                           \
4628   add block, block, #24;                                                       \
4629                                                                                \
4630   subs sub_tile_height, sub_tile_height, #1;                                   \
4631   bne 4b;                                                                      \
4632                                                                                \
4633   add texture_offset, texture_offset, #0xF00;                                  \
4634   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4635
4636   
/*
 * setup_sprite_tile_half_4bpp(edge): emit render blocks for one tile that is
 * a single 8-texel block wide. edge is left or right and selects which
 * draw_mask_fb_ptr_<edge> register supplies the draw mask / fb pointer pair.
 *
 * Reserves sub_tile_height blocks, then loops sub_tile_height times at
 * label 4. Each iteration:
 *  - loads 8 texel bytes from
 *    texture_page_ptr + (texture_offset & texture_mask);
 *  - looks every byte up in the clut_low and clut_high tables with vtbl and
 *    interleaves the two results into 16 bytes at block + 0 with vst2.u8;
 *  - stores draw_mask_fb_ptr_<edge>, whose 32-bit lane 1 was just set to
 *    fb_ptr, at block + 40;
 *  - advances block by 64 (40 + 24), texture_offset by 0x10 and fb_ptr by
 *    2048.
 * After the loop texture_offset is advanced by a further 0xF00 and
 * num_blocks is stored back into psx_gpu.
 * texture_block_ptr (r14) is only a scratch address: it is rebuilt from
 * texture_offset at the top of every iteration.
 * Clobbers texture_block_ptr, texels, texels_low / _high and flags;
 * sub_tile_height is 0 on exit.
 */
4637 #define setup_sprite_tile_half_4bpp(edge)                                      \
4638   setup_sprite_tile_add_blocks(single);                                        \
4639                                                                                \
4640  4:                                                                            \
4641   and texture_block_ptr, texture_offset, texture_mask;                         \
4642   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4643                                                                                \
4644   pld [fb_ptr];                                                                \
4645   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4646   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4647                                                                                \
4648   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4649   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4650                                                                                \
4651   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4652   add block, block, #40;                                                       \
4653                                                                                \
4655   vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64];                          \
4656                                                                                \
4657   add block, block, #24;                                                       \
4658   add texture_offset, texture_offset, #0x10;                                   \
4659                                                                                \
4660   add fb_ptr, fb_ptr, #2048;                                                   \
4661   subs sub_tile_height, sub_tile_height, #1;                                   \
4662                                                                                \
4663   bne 4b;                                                                      \
4664                                                                                \
4665   add texture_offset, texture_offset, #0xF00;                                  \
4666   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4667  
4668  
/*
 * setup_sprite_tile_full_8bpp(edge): emit render blocks for one tile that is
 * two 8-texel blocks wide, storing the raw texel bytes (no CLUT lookup
 * here). The edge argument is not used by this variant.
 *
 * block is biased by +16 for the duration of the loop, so the 8 texel bytes
 * land at offset 16 of each 64-byte block and the draw mask / fb pointer
 * pair at offset 40 (16 + 24); a further +40 reaches offset 16 of the next
 * block. The bias is removed again after the loop.
 * Per row (label 4) a left and a right block are written, texture_offset
 * advances by 0x10 and fb_ptr by 2048 (16, then 2048 - 16). After the loop
 * texture_offset is advanced by a further 0xF00 and num_blocks is stored
 * back into psx_gpu.
 * Clobbers texture_block_ptr (r14), texels and flags; sub_tile_height is 0
 * on exit.
 */
4669 #define setup_sprite_tile_full_8bpp(edge)                                      \
4670   setup_sprite_tile_add_blocks(double);                                        \
4671   add block, block, #16;                                                       \
4672                                                                                \
4673  4:                                                                            \
4674   and texture_block_ptr, texture_offset, texture_mask;                         \
4675   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4676                                                                                \
4677   pld [fb_ptr];                                                                \
4678   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4679   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4680                                                                                \
4681   add texture_block_ptr, texture_offset, #8;                                   \
4682   vst1.u32 { texels }, [block, :64];                                           \
4683                                                                                \
4684   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4685   add block, block, #24;                                                       \
4686                                                                                \
4687   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4688                                                                                \
4689   add fb_ptr, fb_ptr, #16;                                                     \
4690   vst1.u32 { draw_mask_fb_ptr_left }, [block, :64];                            \
4691                                                                                \
4692   add block, block, #40;                                                       \
4693   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4694   pld [fb_ptr];                                                                \
4695                                                                                \
4696   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4697   vst1.u32 { texels }, [block, :64];                                           \
4698   add block, block, #24;                                                       \
4699                                                                                \
4700   add texture_offset, texture_offset, #0x10;                                   \
4701   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4702                                                                                \
4703   vst1.u32 { draw_mask_fb_ptr_right }, [block, :64];                           \
4704   add block, block, #40;                                                       \
4705                                                                                \
4706   subs sub_tile_height, sub_tile_height, #1;                                   \
4707   bne 4b;                                                                      \
4708                                                                                \
4709   sub block, block, #16;                                                       \
4710   add texture_offset, texture_offset, #0xF00;                                  \
4711   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4712
4713   
/*
 * setup_sprite_tile_half_8bpp(edge): emit render blocks for one tile that is
 * a single 8-texel block wide, storing the raw texel bytes. edge is left or
 * right and selects the draw_mask_fb_ptr_<edge> register.
 *
 * block is biased by +16 for the duration of the loop: the texel bytes go
 * to offset 16 of each 64-byte block and the draw mask / fb pointer pair to
 * offset 40. Per row (label 4) texture_offset advances by 0x10 and fb_ptr
 * by 2048. After the loop the bias is removed, texture_offset is advanced
 * by a further 0xF00 and num_blocks is stored back into psx_gpu.
 * Clobbers texture_block_ptr (r14), texels and flags; sub_tile_height is 0
 * on exit.
 */
4714 #define setup_sprite_tile_half_8bpp(edge)                                      \
4715   setup_sprite_tile_add_blocks(single);                                        \
4716   add block, block, #16;                                                       \
4717                                                                                \
4718  4:                                                                            \
4719   and texture_block_ptr, texture_offset, texture_mask;                         \
4720   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4721   pld [fb_ptr];                                                                \
4722                                                                                \
4723   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4724   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4725                                                                                \
4726   vst1.u32 { texels }, [block, :64];                                           \
4727   add block, block, #24;                                                       \
4728                                                                                \
4729   vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64];                          \
4730   add block, block, #40;                                                       \
4731                                                                                \
4732   add texture_offset, texture_offset, #0x10;                                   \
4733   add fb_ptr, fb_ptr, #2048;                                                   \
4734                                                                                \
4735   subs sub_tile_height, sub_tile_height, #1;                                   \
4736   bne 4b;                                                                      \
4737                                                                                \
4738   sub block, block, #16;                                                       \
4739   add texture_offset, texture_offset, #0xF00;                                  \
4740   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4741
4742  
/*
 * Column edge adjustment helpers, selected by token pasting on the edge
 * mode (half / full) and, for half columns, on the edge (left / right).
 *
 * pre_adjust:  sets texture_offset from texture_offset_base before a column
 *              is emitted. A right half column starts 8 texels further in
 *              (texture_offset_base + 8) and 16 bytes further along in the
 *              framebuffer.
 * post_adjust: undoes the fb_ptr bias of a right half column; empty for
 *              every other case.
 */
4743 #define setup_sprite_tile_column_edge_pre_adjust_half_right()                  \
4744   add texture_offset, texture_offset_base, #8;                                 \
4745   add fb_ptr, fb_ptr, #16                                                      \
4746
4747 #define setup_sprite_tile_column_edge_pre_adjust_half_left()                   \
4748   mov texture_offset, texture_offset_base                                      \
4749
4750 #define setup_sprite_tile_column_edge_pre_adjust_half(edge)                    \
4751   setup_sprite_tile_column_edge_pre_adjust_half_##edge()                       \
4752
4753 #define setup_sprite_tile_column_edge_pre_adjust_full(edge)                    \
4754   mov texture_offset, texture_offset_base                                      \
4755
4756 #define setup_sprite_tile_column_edge_post_adjust_half_right()                 \
4757   sub fb_ptr, fb_ptr, #16                                                      \
4758
4759 #define setup_sprite_tile_column_edge_post_adjust_half_left()                  \
4760
4761 #define setup_sprite_tile_column_edge_post_adjust_half(edge)                   \
4762   setup_sprite_tile_column_edge_post_adjust_half_##edge()                      \
4763
4764 #define setup_sprite_tile_column_edge_post_adjust_full(edge)                   \
4766
/*
 * setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,
 * x4mode): emit a column that consists of one tile.
 *
 * column_data is used directly as the row count of that tile. The pre /
 * post edge adjustment and the tile macro are chosen by pasting edge_mode,
 * texture_mode and the x4mode suffix into the macro names.
 */
4767 #define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,  \
4768  x4mode)                                                                       \
4769   mov sub_tile_height, column_data;                                            \
4770   setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
4771   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4772   setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
4773
/*
 * setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,
 * x4mode): emit a column that consists of several tiles.
 *
 * column_data is unpacked as:
 *   bits  0 - 7   rows of the first tile
 *   bits  8 - 15  rows of the last tile
 *   bits 16 up    initial value of tiles_remaining
 * The first tile is emitted, then tiles_remaining is decremented; while it
 * is non-zero a 16-row tile is emitted per decrement (label 3). The last
 * tile is emitted at label 2. tiles_remaining + 1 tiles are emitted in
 * total.
 * Uses local labels 2 and 3; the tile macros use label 4.
 */
4774 #define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,   \
4775  x4mode)                                                                       \
4776   and sub_tile_height, column_data, #0xFF;                                     \
4777   mov tiles_remaining, column_data, lsr #16;                                   \
4778   setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
4779   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4780                                                                                \
4781   subs tiles_remaining, tiles_remaining, #1;                                   \
4782   beq 2f;                                                                      \
4783                                                                                \
4784  3:                                                                            \
4785   mov sub_tile_height, #16;                                                    \
4786   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4787   subs tiles_remaining, tiles_remaining, #1;                                   \
4788   bne 3b;                                                                      \
4789                                                                                \
4790  2:                                                                            \
4791   uxtb sub_tile_height, column_data, ror #8;                                   \
4792   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4793   setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
4794
4795
/*
 * setup_sprite_column_data_single(): column_data = height (the row count of
 * the only tile), then load texture_page_ptr from psx_gpu.
 */
4796 #define setup_sprite_column_data_single()                                      \
4797   mov column_data, height;                                                     \
4798   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]            \
4799
/*
 * setup_sprite_column_data_multi(): pack the column description that
 * setup_sprite_tile_column_height_multi unpacks:
 *   bits  0 - 7   16 - offset_v
 *   bits  8 - 15  (height_rounded & 0xF) + 1
 *   bits 16 up    tile_height - 1
 * and load texture_page_ptr from psx_gpu.
 * Statement order matters because aliases share registers: column_data and
 * offset_v are both r9, and texture_page_ptr and tile_height are both r3,
 * so tile_height has to be merged into column_data before the ldr
 * overwrites r3. height_rounded and tile_height are modified in place.
 */
4800 #define setup_sprite_column_data_multi()                                       \
4801   and height_rounded, height_rounded, #0xF;                                    \
4802   rsb column_data, offset_v, #16;                                              \
4803                                                                                \
4804   add height_rounded, height_rounded, #1;                                      \
4805   sub tile_height, tile_height, #1;                                            \
4806                                                                                \
4807   orr column_data, column_data, tile_height, lsl #16;                          \
4808   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset];           \
4809                                                                                \
4810   orr column_data, column_data, height_rounded, lsl #8                         \
4811
/*
 * Draw mask setup. Each macro fills every byte of draw_mask_fb_ptr_left and
 * draw_mask_fb_ptr_right with one byte taken from block_masks; the tile
 * macros later overwrite 32-bit lane 1 of those registers with fb_ptr.
 *
 * Left variant: bytes 0 and 1 of block_masks.
 */
4812 #define setup_sprite_setup_left_draw_mask_fb_ptr()                             \
4813   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4814   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
4815
/*
 * Left variant that additionally computes
 * fb_ptr_advance_column = 32 - (height << 11).
 */
4816 #define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column()              \
4817   mov fb_ptr_advance_column, #32;                                              \
4818   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4819                                                                                \
4820   sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11;           \
4821   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
4822
/* Right variant: bytes 4 and 5 of block_masks. */
4823 #define setup_sprite_setup_right_draw_mask_fb_ptr()                            \
4824   vdup.u8 draw_mask_fb_ptr_left, block_masks[4];                               \
4825   vdup.u8 draw_mask_fb_ptr_right, block_masks[5]                               \
4826
/*
 * setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, edge,
 * x4mode): define the code reached through the label
 * setup_sprite_<tm>_single_<multi_height>_<edge_mode>_<edge><x4mode> for a
 * sprite that is one tile column wide.
 *
 * vext.32 #1 on a d register swaps its two 32-bit words, so after the vorr
 * both words of block_masks hold the OR of the original low and high word;
 * the left draw mask setup then reads the combined bytes 0 and 1.
 * After the column has been emitted the macro returns from the enclosing
 * routine by popping q4 - q7 and r3 - r11, pc.
 * NOTE(review): the matching pushes are outside this part of the file;
 * confirm the entry code saves exactly these registers.
 */
4827 #define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode,     \
4828  edge, x4mode)                                                                 \
4829  setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode:     \
4830   setup_sprite_column_data_##multi_height();                                   \
4831   vext.32 block_masks_shifted, block_masks, block_masks, #1;                   \
4832   vorr.u32 block_masks, block_masks, block_masks_shifted;                      \
4833   setup_sprite_setup_left_draw_mask_fb_ptr##x4mode();                          \
4834                                                                                \
4835   setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
4836   vpop { q4 - q7 };                                                            \
4837   pop  { r3 - r11, pc }                                                        \
4838
/*
 * Step texture_offset_base to the next tile column: add 0x100, and if bits
 * 8..11 wrapped to zero subtract 0x1000 again, so the carry out of that nibble
 * is discarded and bits 12 and up are left unchanged.
 * Clobbers the condition flags (tst).
 */
4839 #define setup_sprite_tiled_advance_column()                                    \
4840   add texture_offset_base, texture_offset_base, #0x100;                        \
4841   tst texture_offset_base, #0xF00;                                             \
4842   subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00)             \
4843
/*
 * Emits the code for a sprite spanning two or more tile columns, under the
 * label setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode><x4mode>.
 *
 *   tm           - texture mode token, pasted into the label and forwarded
 *   multi_height - selects the column_data / column_height helper variants
 *   left_mode    - mode passed for the first (leftmost) column
 *   right_mode   - mode passed for the last (rightmost) column
 *   x4mode       - empty or _4x; suffix pasted onto helper names and the label
 *
 * Flow:
 *   - first column: left draw masks, drawn with edge token "right";
 *   - tile_width -= 2 (first and last columns are handled separately); the
 *     following add does not touch the flags, so beq still tests that result
 *     and skips the middle loop when no middle columns remain;
 *   - middle columns (local label 0): draw masks zeroed once, then one
 *     full/none column per iteration until tile_width reaches zero;
 *   - last column (local label 1): right draw masks, drawn with edge token
 *     "left".
 * fb_ptr is advanced by fb_ptr_advance_column after the first and after every
 * middle column; texture_offset_base is advanced before every column except
 * the first.
 *
 * The trailing vpop/pop is the function epilogue matching the push/vpush in
 * the function emitted from setup_sprite_tiled_builder.
 */
4844 #define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,      \
4845  right_mode, x4mode)                                                           \
4846  setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
4847   setup_sprite_column_data_##multi_height();                                   \
4848                                                                                \
4849   setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode();           \
4850                                                                                \
4851   setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
4852                                                                                \
4853   subs tile_width, tile_width, #2;                                             \
4854   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4855                                                                                \
4856   beq 1f;                                                                      \
4857                                                                                \
4858   vmov.u8 draw_masks_fb_ptrs, #0;                                              \
4859   vmov.u8 draw_masks_fb_ptrs2, #0;                                             \
4860                                                                                \
4861  0:                                                                            \
4862   setup_sprite_tiled_advance_column();                                         \
4863   setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode);      \
4864   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4865   subs tile_width, tile_width, #1;                                             \
4866   bne 0b;                                                                      \
4867                                                                                \
4868  1:                                                                            \
4869   setup_sprite_setup_right_draw_mask_fb_ptr##x4mode();                         \
4870                                                                                \
4871   setup_sprite_tiled_advance_column();                                         \
4872   setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
4873   vpop { q4 - q7 };                                                            \
4874   pop  { r3 - r11, pc }                                                        \
4875
4876
/*
 * 1x variants of the hooks that setup_sprite_tiled_builder selects by pasting
 * its x4mode suffix (empty here).
 */

/* 1x: no extra offset_u / fb_ptr adjustment; expands to nothing. */
4877 #define setup_sprite_offset_u_adjust()                                         \
4878
/* 1x: keep only bits 0..7 of left_block_mask. */
4879 #define setup_sprite_get_left_block_mask()                                     \
4880   and left_block_mask, left_block_mask, #0xFF                                  \
4881
/* 1x: set the flags from comparing left_block_mask with 0xFF (all 8 bits). */
4882 #define setup_sprite_compare_left_block_mask()                                 \
4883   cmp left_block_mask, #0xFF                                                   \
4884
/* 1x: replace right_block_mask with its bits 8..15, zero-extended. */
4885 #define setup_sprite_get_right_block_mask()                                    \
4886   uxtb right_block_mask, right_block_mask, ror #8                              \
4887
/* 1x: set the flags from comparing right_block_mask with 0xFF (all 8 bits). */
4888 #define setup_sprite_compare_right_block_mask()                                \
4889   cmp right_block_mask, #0xFF                                                  \
4890
4891
4892
4893 /* 4x stuff */
/*
 * Scratch framebuffer pointer for the 4x tile loops. It shares a register with
 * column_data, so the 4x tile macros spill column_data to the stack before
 * their loop and reload it afterwards.
 */
4894 #define fb_ptr2 column_data
4895
/*
 * 4x variant of the offset_u hook:
 *   fb_ptr         -= offset_u * 2   (uses offset_u before it is doubled)
 *   offset_u        = offset_u * 2
 *   offset_u_right  = offset_u_right * 2 + 1
 * No instruction here updates the condition flags.
 */
4896 #define setup_sprite_offset_u_adjust_4x()                                      \
4897   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
4898   mov offset_u, offset_u, lsl #1;                                              \
4899   mov offset_u_right, offset_u_right, lsl #1;                                  \
4900   add offset_u_right, offset_u_right, #1                                       \
4901
/*
 * 4x variants of the block-mask hooks: they work on 16 mask bits where the 1x
 * variants work on 8.
 */

/* 4x: sign-extend bits 0..15 of left_block_mask to 32 bits. */
4902 #define setup_sprite_get_left_block_mask_4x()                                  \
4903   sxth left_block_mask, left_block_mask                                        \
4904
/* 4x: set the flags from comparing left_block_mask with all-ones (-1). */
4905 #define setup_sprite_compare_left_block_mask_4x()                              \
4906   cmp left_block_mask, #0xFFFFFFFF                                             \
4907
/* 4x: replace right_block_mask with its bits 16..31, sign-extended. */
4908 #define setup_sprite_get_right_block_mask_4x()                                 \
4909   sxth right_block_mask, right_block_mask, ror #16                             \
4910
/* 4x: set the flags from comparing right_block_mask with all-ones (-1). */
4911 #define setup_sprite_compare_right_block_mask_4x()                             \
4912   cmp right_block_mask, #0xFFFFFFFF                                            \
4913
4914
/*
 * Horizontal 2x pixel doubling for 16-bit pixels: copy texels_ into both
 * texels_wide_low and texels_wide_high, then interleave the two copies with
 * vzip.16 so every 16-bit element of texels_ appears twice in a row across
 * texels_wide_low:texels_wide_high.
 */
4915 #define widen_texels_16bpp(texels_)                                            \
4916   vmov texels_wide_low, texels_;                                               \
4917   vmov texels_wide_high, texels_;                                              \
4918   vzip.16 texels_wide_low, texels_wide_high                                    \
4919
/*
 * Horizontal 2x doubling for 8-bit texels: same scheme as above with vzip.8,
 * so every byte of texels_ appears twice in a row across
 * texels_wide_low:texels_wide_high.
 */
4920 #define widen_texels_8bpp(texels_)                                             \
4921   vmov texels_wide_low, texels_;                                               \
4922   vmov texels_wide_high, texels_;                                              \
4923   vzip.8 texels_wide_low, texels_wide_high                                     \
4924
/*
 * Emit one block record for already-expanded 16bpp texels.
 *
 *   texels_           - register stored at block_ + 0 (128-bit aligned store)
 *   block_            - record pointer; advanced by 40 + 24 = 64 bytes
 *   draw_mask_fb_ptr_ - register stored at block_ + 40 after its 32-bit
 *                       lane 1 has been overwritten with fb_ptr_
 *   fb_ptr_           - framebuffer address recorded for this block
 */
4925 #define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)         \
4926   vst1.u32 { texels_ }, [block_, :128];                                        \
4927   add block_, block_, #40;                                                     \
4928                                                                                \
4929   vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
4930   vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64];                               \
4931   add block_, block_, #24                                                      \
4932
/*
 * Emit one block record for 8bpp texels.
 * Assumes a 16-byte offset has already been added to block_: texels_ is then
 * stored at record offset 16 and, after the +24 step, draw_mask_fb_ptr_ (with
 * fb_ptr_ inserted into its 32-bit lane 1) at record offset 40. The final +40
 * leaves block_ at offset 16 of the next 64-byte record.
 */
4934 #define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)          \
4935   vst1.u32 { texels_ }, [block_, :64];                                         \
4936   add block_, block_, #24;                                                     \
4937                                                                                \
4938   vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
4939   vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64];                               \
4940   add block_, block_, #40                                                      \
4941
/*
 * Expand the pixels held in texels_low / texels_high into four block records
 * (2x horizontally, 2x vertically).
 *
 *   fb_ptr_tmp          - scratch register, clobbered
 *   draw_mask_fb_ptr_a_ - draw mask register for the texels_low half
 *   draw_mask_fb_ptr_b_ - draw mask register for the texels_high half
 *
 * Records written, in order, with their framebuffer addresses:
 *   widened texels_low   -> fb_ptr               and fb_ptr + 2048
 *   widened texels_high  -> fb_ptr + 16          and fb_ptr + 16 + 2048
 * block advances by 4 * 64 bytes; fb_ptr itself is not modified.
 * NOTE(review): texels_wide is presumably the register pair formed by
 * texels_wide_low / texels_wide_high that widen_texels_16bpp fills - confirm
 * against the register alias definitions.
 */
4942 #define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,             \
4943  draw_mask_fb_ptr_b_)                                                          \
4944   widen_texels_16bpp(texels_low);                                              \
4945   add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
4946                                                                                \
4947   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr);          \
4948                                                                                \
4949   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);      \
4950   widen_texels_16bpp(texels_high);                                             \
4951                                                                                \
4952   add fb_ptr_tmp, fb_ptr, #8*2;                                                \
4953   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);      \
4954                                                                                \
4955   add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
4956   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)       \
4957
/*
 * 8bpp counterpart of do_texture_block_16bpp_4x: byte-double texels once, then
 * write four block records (block must already carry the +16 offset that
 * write_block_8bpp expects).
 *
 *   fb_ptr_tmp          - scratch register, clobbered
 *   draw_mask_fb_ptr_a_ - draw mask register for the texels_wide_low half
 *   draw_mask_fb_ptr_b_ - draw mask register for the texels_wide_high half
 *
 * Records written, in order, with their framebuffer addresses:
 *   texels_wide_low   -> fb_ptr               and fb_ptr + 2048
 *   texels_wide_high  -> fb_ptr + 16          and fb_ptr + 16 + 2048
 * fb_ptr itself is not modified.
 */
4958 #define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,              \
4959  draw_mask_fb_ptr_b_)                                                          \
4960   widen_texels_8bpp(texels);                                                   \
4961   add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
4962                                                                                \
4963   write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr);       \
4964   write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);   \
4965                                                                                \
4966   add fb_ptr_tmp, fb_ptr, #8*2;                                                \
4967   write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);  \
4968                                                                                \
4969   add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
4970   write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)   \
4971
4972
/*
 * 4bpp/4x function preamble: load the CLUT pointer from psx_gpu, load the CLUT
 * entries into clut_a / clut_b, and de-interleave them byte-wise (vuzp.u8) so
 * clut_a holds the even-numbered bytes and clut_b the odd-numbered bytes.
 * NOTE(review): presumably the clut_low_* / clut_high_* table registers used
 * by the 4bpp tile macros alias the halves of clut_a / clut_b - confirm
 * against the register alias definitions.
 */
4973 #define setup_sprite_tiled_initialize_4bpp_4x()                                \
4974   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset];                           \
4975   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128];                               \
4976                                                                                \
4977   vuzp.u8 clut_a, clut_b                                                       \
4978
/* 8bpp/4x function preamble: nothing to set up; expands to nothing. */
4979 #define setup_sprite_tiled_initialize_8bpp_4x()                                \
4980
4981
/*
 * Operand fragments (not complete instructions) giving the number of block
 * records a 4x tile column produces. They are selected by the single_4x /
 * double_4x tokens that the tile macros pass to setup_sprite_tile_add_blocks.
 * NOTE(review): setup_sprite_tile_add_blocks is outside this view; presumably
 * it pastes the token after setup_sprite_block_count_ and uses the result as a
 * shifted-register operand - confirm.
 */

/* Half-width tile: sub_tile_height * 4 records. */
4982 #define setup_sprite_block_count_single_4x()                                   \
4983   sub_tile_height, lsl #2                                                      \
4984
/* Full-width tile: sub_tile_height * 8 records. */
4985 #define setup_sprite_block_count_double_4x()                                   \
4986   sub_tile_height, lsl #(1+2)                                                  \
4987
/*
 * 4x / 4bpp tile body, full-width variant. The edge argument is not
 * referenced in the body.
 *
 * column_data shares a register with fb_ptr2, so it is spilled before the loop
 * and reloaded after it; the 8-byte pre-decrement/post-increment keeps sp
 * 8-byte aligned.
 *
 * Loop (local label 4), once per sub_tile_height:
 *   - load 8 texel bytes at (texture_offset & texture_mask), look each byte up
 *     in the clut_low_* and clut_high_* tables, and interleave the results
 *     (vzip.8) into texels_low / texels_high;
 *   - do_texture_block_16bpp_4x with the left_a / left_b draw masks;
 *   - repeat for the 8 texel bytes at ((texture_offset + 8) & texture_mask)
 *     with fb_ptr moved 32 bytes right and the right_a / right_b draw masks;
 *   - texture_offset += 0x10; fb_ptr ends up 4096 bytes past its value at the
 *     start of the iteration (32 + (2048 - 16) * 2).
 *
 * After the loop texture_offset is advanced by a further 0xF00 and num_blocks
 * is stored back to psx_gpu.
 * NOTE(review): setup_sprite_tile_add_blocks is outside this view; presumably
 * it reserves the block records and updates num_blocks / block - confirm.
 */
4988 #define setup_sprite_tile_full_4bpp_4x(edge)                                   \
4989   setup_sprite_tile_add_blocks(double_4x);                                     \
4990   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
4991                                                                                \
4992  4:                                                                            \
4993   and texture_block_ptr, texture_offset, texture_mask;                         \
4994   pld [fb_ptr];                                                                \
4995                                                                                \
4996   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4997   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4998                                                                                \
4999   add texture_block_ptr, texture_offset, #8;                                   \
5000   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
5001                                                                                \
5002   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
5003   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5004                                                                                \
5005   vzip.8 texels_low, texels_high;                                              \
5006   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                  \
5007    draw_mask_fb_ptr_left_b);                                                   \
5008                                                                                \
5009   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5010   pld [fb_ptr, #2048];                                                         \
5011                                                                                \
5012   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5013   add fb_ptr, fb_ptr, #16*2;                                                   \
5014                                                                                \
5015   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
5016   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5017                                                                                \
5018   vzip.8 texels_low, texels_high;                                              \
5019   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                 \
5020    draw_mask_fb_ptr_right_b);                                                  \
5021                                                                                \
5022   add texture_offset, texture_offset, #0x10;                                   \
5023   add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
5024                                                                                \
5025   subs sub_tile_height, sub_tile_height, #1;                                   \
5026   bne 4b;                                                                      \
5027                                                                                \
5028   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5029   add texture_offset, texture_offset, #0xF00;                                  \
5030   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5031
5032
/*
 * 4x / 4bpp tile body, half-width variant.
 *
 *   edge - left or right; pasted into draw_mask_fb_ptr_<edge>_a / _b to pick
 *          the draw masks used for every block of this column.
 *
 * column_data shares a register with fb_ptr2, so it is spilled before the loop
 * and reloaded after it; the 8-byte pre-decrement/post-increment keeps sp
 * 8-byte aligned.
 *
 * Loop (local label 4), once per sub_tile_height:
 *   - load 8 texel bytes at (texture_offset & texture_mask), look each byte up
 *     in the clut_low_* and clut_high_* tables, and interleave the results
 *     (vzip.8) into texels_low / texels_high;
 *   - do_texture_block_16bpp_4x with the <edge> draw masks;
 *   - texture_offset += 0x10; fb_ptr += 4096.
 *
 * texture_block_ptr is only needed for the single load per iteration. The
 * second "add texture_block_ptr, texture_page_ptr, texture_block_ptr" that
 * used to follow the load was dead: its result was never read and the next
 * iteration recomputes the pointer from texture_offset. It has been dropped.
 *
 * After the loop texture_offset is advanced by a further 0xF00 and num_blocks
 * is stored back to psx_gpu.
 * NOTE(review): setup_sprite_tile_add_blocks is outside this view; presumably
 * it reserves the block records and updates num_blocks / block - confirm.
 */
5033 #define setup_sprite_tile_half_4bpp_4x(edge)                                   \
5034   setup_sprite_tile_add_blocks(single_4x);                                     \
5035   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5036                                                                                \
5037  4:                                                                            \
5038   and texture_block_ptr, texture_offset, texture_mask;                         \
5039   pld [fb_ptr];                                                                \
5040                                                                                \
5041   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5042   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5043                                                                                \
5045   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
5046                                                                                \
5047   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5048   add texture_offset, texture_offset, #0x10;                                   \
5049                                                                                \
5050   vzip.8 texels_low, texels_high;                                              \
5051   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,              \
5052    draw_mask_fb_ptr_##edge##_b);                                               \
5053                                                                                \
5054   pld [fb_ptr, #2048];                                                         \
5055   add fb_ptr, fb_ptr, #2048 * 2;                                               \
5056                                                                                \
5057   subs sub_tile_height, sub_tile_height, #1;                                   \
5058   bne 4b;                                                                      \
5059                                                                                \
5060   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5061   add texture_offset, texture_offset, #0xF00;                                  \
5062   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5063
5064
/*
 * 4x / 8bpp tile body, full-width variant. The edge argument is not
 * referenced in the body.
 *
 * block is biased by +16 for the duration of the loop because
 * write_block_8bpp assumes that offset; the bias is removed afterwards.
 * column_data shares a register with fb_ptr2, so it is spilled before the loop
 * and reloaded after it (8-byte stack slot, sp stays 8-byte aligned).
 *
 * Loop (local label 4), once per sub_tile_height:
 *   - load 8 texel bytes at (texture_offset & texture_mask) and expand them
 *     with do_texture_block_8bpp_4x using the left_a / left_b draw masks;
 *   - load the 8 texel bytes at ((texture_offset + 8) & texture_mask), move
 *     fb_ptr 32 bytes right and expand with the right_a / right_b draw masks;
 *   - texture_offset += 0x10; fb_ptr ends up 4096 bytes past its value at the
 *     start of the iteration (32 + (2048 - 16) * 2).
 *
 * After the loop texture_offset is advanced by a further 0xF00 and num_blocks
 * is stored back to psx_gpu.
 * NOTE(review): setup_sprite_tile_add_blocks is outside this view; presumably
 * it reserves the block records and updates num_blocks / block - confirm.
 */
5065 #define setup_sprite_tile_full_8bpp_4x(edge)                                   \
5066   setup_sprite_tile_add_blocks(double_4x);                                     \
5067   add block, block, #16;                                                       \
5068   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5069                                                                                \
5070  4:                                                                            \
5071   and texture_block_ptr, texture_offset, texture_mask;                         \
5072   pld [fb_ptr];                                                                \
5073                                                                                \
5074   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5075   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5076                                                                                \
5077   add texture_block_ptr, texture_offset, #8;                                   \
5078   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                   \
5079    draw_mask_fb_ptr_left_b);                                                   \
5080                                                                                \
5081   pld [fb_ptr, #2048];                                                         \
5082   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
5083                                                                                \
5084   add fb_ptr, fb_ptr, #16*2;                                                   \
5085   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5086                                                                                \
5087   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5088                                                                                \
5089   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                  \
5090    draw_mask_fb_ptr_right_b);                                                  \
5091                                                                                \
5092   add texture_offset, texture_offset, #0x10;                                   \
5093   add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
5094                                                                                \
5095   subs sub_tile_height, sub_tile_height, #1;                                   \
5096   bne 4b;                                                                      \
5097                                                                                \
5098   sub block, block, #16;                                                       \
5099   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5100   add texture_offset, texture_offset, #0xF00;                                  \
5101   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5102
5103   
/*
 * 4x / 8bpp tile body, half-width variant.
 *
 *   edge - left or right; pasted into draw_mask_fb_ptr_<edge>_a / _b to pick
 *          the draw masks used for every block of this column.
 *
 * block is biased by +16 for the duration of the loop because
 * write_block_8bpp assumes that offset; the bias is removed afterwards.
 * column_data shares a register with fb_ptr2, so it is spilled before the loop
 * and reloaded after it (8-byte stack slot, sp stays 8-byte aligned).
 *
 * Loop (local label 4), once per sub_tile_height: load 8 texel bytes at
 * (texture_offset & texture_mask), expand them with do_texture_block_8bpp_4x,
 * then texture_offset += 0x10 and fb_ptr += 4096.
 *
 * After the loop texture_offset is advanced by a further 0xF00 and num_blocks
 * is stored back to psx_gpu.
 * NOTE(review): setup_sprite_tile_add_blocks is outside this view; presumably
 * it reserves the block records and updates num_blocks / block - confirm.
 */
5104 #define setup_sprite_tile_half_8bpp_4x(edge)                                   \
5105   setup_sprite_tile_add_blocks(single_4x);                                     \
5106   add block, block, #16;                                                       \
5107   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5108                                                                                \
5109  4:                                                                            \
5110   and texture_block_ptr, texture_offset, texture_mask;                         \
5111   pld [fb_ptr];                                                                \
5112                                                                                \
5113   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5114   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5115                                                                                \
5116   pld [fb_ptr, #2048];                                                         \
5117   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,               \
5118    draw_mask_fb_ptr_##edge##_b);                                               \
5119                                                                                \
5120   add texture_offset, texture_offset, #0x10;                                   \
5121   add fb_ptr, fb_ptr, #2048 * 2;                                               \
5122                                                                                \
5123   subs sub_tile_height, sub_tile_height, #1;                                   \
5124   bne 4b;                                                                      \
5125                                                                                \
5126   sub block, block, #16;                                                       \
5127   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5128   add texture_offset, texture_offset, #0xF00;                                  \
5129   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5130
5131  
/*
 * 4x column-edge pre-adjust hooks: they set texture_offset (and, for a
 * right-half column, fb_ptr) before a column of tiles is emitted.
 */

/* Right half: texels start 8 into the tile and fb_ptr moves 32 bytes right. */
5132 #define setup_sprite_tile_column_edge_pre_adjust_half_right_4x()               \
5133   add texture_offset, texture_offset_base, #8;                                 \
5134   add fb_ptr, fb_ptr, #16 * 2                                                  \
5135
/* Left half: start at the tile's base offset; fb_ptr is left alone. */
5136 #define setup_sprite_tile_column_edge_pre_adjust_half_left_4x()                \
5137   mov texture_offset, texture_offset_base                                      \
5138
/* Dispatch on edge (left or right) to one of the two macros above. */
5139 #define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge)                 \
5140   setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x()                  \
5141
/* Full-width column: start at the tile's base offset; edge is ignored. */
5142 #define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge)                 \
5143   mov texture_offset, texture_offset_base                                      \
5144
/*
 * 4x column-edge post-adjust hooks: they undo the fb_ptr change made by the
 * matching pre-adjust hook once the column has been emitted.
 */

/* Right half: take back the 32 bytes added by the right-half pre-adjust. */
5145 #define setup_sprite_tile_column_edge_post_adjust_half_right_4x()              \
5146   sub fb_ptr, fb_ptr, #16 * 2                                                  \
5147
/* Left half: nothing to undo; expands to nothing. */
5148 #define setup_sprite_tile_column_edge_post_adjust_half_left_4x()               \
5149
/* Dispatch on edge (left or right) to one of the two macros above. */
5150 #define setup_sprite_tile_column_edge_post_adjust_half_4x(edge)                \
5151   setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x()                 \
5152
/* Full-width column: nothing to undo; expands to nothing. */
5153 #define setup_sprite_tile_column_edge_post_adjust_full_4x(edge)                \
5154
5155
/*
 * 4x path: broadcast byte lanes 0..3 of block_masks (its low 32-bit word)
 * across the four draw-mask registers left_a, left_b, right_a, right_b, in
 * that order.
 */
5156 #define setup_sprite_setup_left_draw_mask_fb_ptr_4x()                          \
5157   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
5158   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
5159   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
5160   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
5161
/*
 * 4x path: same byte-lane broadcast as
 * setup_sprite_setup_left_draw_mask_fb_ptr_4x(), interleaved with computing
 *   fb_ptr_advance_column = 64 - (height << 12)
 * ("lsl #11 + 1" is an assembler constant expression, i.e. a shift by 12).
 * Both terms are twice their 1x counterparts (32 and height << 11).
 */
5162 #define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x()           \
5163   mov fb_ptr_advance_column, #32 * 2;                                          \
5164   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
5165   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
5166   sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1;       \
5167   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
5168   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
5169
/*
 * 4x path: broadcast byte lanes 4..7 of block_masks (its high 32-bit word,
 * which the builder's vmov fills from right_block_mask) across the four
 * draw-mask registers left_a, left_b, right_a, right_b, in that order.
 */
5170 #define setup_sprite_setup_right_draw_mask_fb_ptr_4x()                         \
5171   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4];                             \
5172   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5];                             \
5173   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6];                            \
5174   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7]                             \
5175
5176
5177 // r0: psx_gpu
5178 // r1: x
5179 // r2: y
5180 // r3: u
5181 // [sp]: v
5182 // [sp + 4]: width
5183 // [sp + 8]: height
5184 // [sp + 12]: color (unused)
5185
5186 #define setup_sprite_tiled_builder(texture_mode, x4mode)                       \
5187                                                                                \
5188 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, full,        \
5189   x4mode);                                                                     \
5190 setup_sprite_tile_column_width_single(texture_mode, multi,  full, none,        \
5191   x4mode);                                                                     \
5192 setup_sprite_tile_column_width_multi(texture_mode,  single, full, full,        \
5193   x4mode);                                                                     \
5194 setup_sprite_tile_column_width_single(texture_mode, single, full, none,        \
5195   x4mode);                                                                     \
5196 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, full,        \
5197   x4mode);                                                                     \
5198 setup_sprite_tile_column_width_single(texture_mode, multi,  half, right,       \
5199   x4mode);                                                                     \
5200 setup_sprite_tile_column_width_multi(texture_mode,  single, half, full,        \
5201   x4mode);                                                                     \
5202 setup_sprite_tile_column_width_single(texture_mode, single, half, right,       \
5203   x4mode);                                                                     \
5204 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, half,        \
5205   x4mode);                                                                     \
5206 setup_sprite_tile_column_width_single(texture_mode, multi,  half, left,        \
5207   x4mode);                                                                     \
5208 setup_sprite_tile_column_width_multi(texture_mode,  single, full, half,        \
5209   x4mode);                                                                     \
5210 setup_sprite_tile_column_width_single(texture_mode, single, half, left,        \
5211   x4mode);                                                                     \
5212 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, half,        \
5213   x4mode);                                                                     \
5214 setup_sprite_tile_column_width_multi(texture_mode,  single, half, half,        \
5215   x4mode);                                                                     \
5216                                                                                \
5217 .align 4;                                                                      \
5218                                                                                \
5219 function(setup_sprite_##texture_mode##x4mode)                                  \
5220   push { r3 - r11, lr };                                                       \
5221   setup_sprite_tiled_initialize_##texture_mode##x4mode();                      \
5222                                                                                \
5223   ldr v, [sp, #4*(10+0)];                                                      \
5224   and offset_u, u, #0xF;                                                       \
5225                                                                                \
5226   ldr width, [sp, #4*(10+1)];                                                  \
5227   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
5228                                                                                \
5229   ldr height, [sp, #4*(10+2)];                                                 \
5230   add fb_ptr, fb_ptr, y, lsl #11;                                              \
5231                                                                                \
5232   vpush { q4 - q7 };                                                           \
5233                                                                                \
5234   add fb_ptr, fb_ptr, x, lsl #1;                                               \
5235   and offset_v, v, #0xF;                                                       \
5236                                                                                \
5237   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
5238   add width_rounded, offset_u, width;                                          \
5239                                                                                \
5240   add height_rounded, offset_v, height;                                        \
5241   add width_rounded, width_rounded, #15;                                       \
5242                                                                                \
5243   add height_rounded, height_rounded, #15;                                     \
5244   mov tile_width, width_rounded, lsr #4;                                       \
5245                                                                                \
5246   /* texture_offset_base = VH-VL-00-00                                       */\
5247   mov texture_offset_base, v, lsl #8;                                          \
5248   and offset_u_right, width_rounded, #0xF;                                     \
5249                                                                                \
5250   /* texture_offset_base = VH-UH-UL-00                                       */\
5251   bfi texture_offset_base, u, #4, #8;                                          \
5252   mov right_block_mask, #0xFFFFFFFE;                                           \
5253                                                                                \
5254   setup_sprite_offset_u_adjust##x4mode();                                      \
5255                                                                                \
5256   /* texture_offset_base = VH-UH-VL-00                                       */\
5257   bfi texture_offset_base, v, #4, #4;                                          \
5258   mov left_block_mask, #0xFFFFFFFF;                                            \
5259                                                                                \
5260   mov tile_height, height_rounded, lsr #4;                                     \
5261   mvn left_block_mask, left_block_mask, lsl offset_u;                          \
5262                                                                                \
5263   /* texture_mask = HH-HL-WH-WL                                              */\
5264   ldrh texture_mask, [psx_gpu, #psx_gpu_texture_mask_width_offset];            \
5265   mov right_block_mask, right_block_mask, lsl offset_u_right;                  \
5266                                                                                \
5267   /* texture_mask_rev = WH-WL-HH-HL                                          */\
5268   rev16 texture_mask_rev, texture_mask;                                        \
5269   vmov block_masks, left_block_mask, right_block_mask;                         \
5270                                                                                \
5271   /* texture_mask = HH-HL-HL-WL                                              */\
5272   bfi texture_mask, texture_mask_rev, #4, #4;                                  \
5273   /* texture_mask_rev = 00-00-00-WH                                          */\
5274   mov texture_mask_rev, texture_mask_rev, lsr #12;                             \
5275                                                                                \
5276   /* texture_mask = HH-WH-HL-WL                                              */\
5277   bfi texture_mask, texture_mask_rev, #8, #4;                                  \
5278   setup_sprite_get_left_block_mask##x4mode();                                  \
5279                                                                                \
5280   mov control_mask, #0;                                                        \
5281   setup_sprite_compare_left_block_mask##x4mode();                              \
5282                                                                                \
5283   setup_sprite_get_right_block_mask##x4mode();                                 \
5284   orreq control_mask, control_mask, #0x4;                                      \
5285                                                                                \
5286   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
5287   setup_sprite_compare_right_block_mask##x4mode();                             \
5288                                                                                \
5289   orreq control_mask, control_mask, #0x8;                                      \
5290   cmp tile_width, #1;                                                          \
5291                                                                                \
5292   add block, psx_gpu, #psx_gpu_blocks_offset;                                  \
5293   orreq control_mask, control_mask, #0x1;                                      \
5294                                                                                \
5295   cmp tile_height, #1;                                                         \
5296   add block, block, num_blocks, lsl #6;                                        \
5297                                                                                \
5298   orreq control_mask, control_mask, #0x2;                                      \
5299   JT_OP_REL(9f, control_mask, temp);                                           \
5300   JT_OP(ldr pc, [pc, control_mask, lsl #2]);                                   \
5301   nop;                                                                         \
5302                                                                                \
5303  9:                                                                            \
5304  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode);   \
5305  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode);  \
5306  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode);  \
5307  .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
5308  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode);   \
5309  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
5310  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode);  \
5311  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
5312  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode);   \
5313  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode);  \
5314  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode);  \
5315  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
5316  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode);   \
5317  .word 0x00000000;                                                             \
5318  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode);  \
5319
5320
/* Instantiate the tiled sprite setup code for 4bpp and 8bpp textures. The
 * second macro argument (x4mode) is pasted as a suffix onto the generated
 * setup_sprite_* names; it is empty for these two. */
5321 setup_sprite_tiled_builder(4bpp,);
5322 setup_sprite_tiled_builder(8bpp,);
5323
5324 #undef draw_mask_fb_ptr_left
5325 #undef draw_mask_fb_ptr_right
5326
/* Second pair of instantiations, using the "_4x" suffix. */
5327 setup_sprite_tiled_builder(4bpp, _4x);
5328 setup_sprite_tiled_builder(8bpp, _4x);
5329
5330
/* Register aliases for texture_sprite_blocks_8bpp. psx_gpu and block_ptr both
 * name r0, and each texels_NM pair reuses the register of its even texel. */
5331 #undef block_ptr
5332 #undef num_blocks
5333 #undef clut_ptr
5334
5335 #define psx_gpu                                           r0
5336 #define block_ptr                                         r0
5337 #define num_blocks                                        r1
5338 #define clut_ptr                                          r2
5339 #define texel_shift_mask                                  r3
5340 #define block_pixels_a                                    r4
5341 #define block_pixels_b                                    r5
5342 #define texel_0                                           r6
5343 #define texel_2                                           r7
5344 #define texel_4                                           r8
5345 #define texel_6                                           r9
5346 #define texel_1                                           r10
5347 #define texel_3                                           r11
5348 #define texel_5                                           r12
5349 #define texel_7                                           r14
5350 #define texels_01                                         r6
5351 #define texels_23                                         r7
5352 #define texels_45                                         r8
5353 #define texels_67                                         r9
5354
/*
 * texture_sprite_blocks_8bpp(psx_gpu)
 *   r0: psx_gpu
 *
 * Walks the queued blocks (64 bytes apart, starting at psx_gpu_blocks_offset)
 * and, for each one, replaces eight 8-bit palette indices stored at bytes
 * 16..23 of the block with eight 16-bit entries looked up through clut_ptr,
 * written to bytes 0..15 of the same block.
 *
 * The loop tests the count only after processing a block, so it presumably
 * expects num_blocks to be non-zero on entry - TODO confirm against callers.
 * Loads and ALU operations are interleaved by hand; keep the order.
 */
5355 function(texture_sprite_blocks_8bpp)
5356   push { r4 - r11, r14 }
5357   movw texel_shift_mask, #(0xFF << 1)            // selects an 8-bit index already scaled by 2
5358
5359   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5360   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
5361
5362   add block_ptr, psx_gpu, #psx_gpu_blocks_offset  // r0 stops being psx_gpu from here on
5363   ldr block_pixels_a, [block_ptr, #16]           // indices 0-3 of the first block
5364
5365  0:
5366   and texel_0, texel_shift_mask, block_pixels_a, lsl #1    // index 0 * 2 (byte offset into CLUT)
5367   ldr block_pixels_b, [block_ptr, #20]           // indices 4-7 of this block
5368
5369   and texel_1, texel_shift_mask, block_pixels_a, lsr #7    // index 1 * 2
5370   ldrh texel_0, [clut_ptr, texel_0]
5371
5372   and texel_2, texel_shift_mask, block_pixels_a, lsr #15   // index 2 * 2
5373   ldrh texel_1, [clut_ptr, texel_1]
5374
5375   and texel_3, texel_shift_mask, block_pixels_a, lsr #23   // index 3 * 2
5376   ldr block_pixels_a, [block_ptr, #(64 + 16)]    // preload indices 0-3 of the following block
5377
5378   ldrh texel_2, [clut_ptr, texel_2]
5379   and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5380
5381   ldrh texel_3, [clut_ptr, texel_3]
5382   and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5383
5384   ldrh texel_4, [clut_ptr, texel_4]
5385   and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5386
5387   ldrh texel_5, [clut_ptr, texel_5]
5388   and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5389
5390   ldrh texel_6, [clut_ptr, texel_6]
5391   orr texels_01, texel_0, texel_1, lsl #16       // pack two 16-bit colors per word
5392
5393   ldrh texel_7, [clut_ptr, texel_7]
5394   orr texels_23, texel_2, texel_3, lsl #16
5395
5396   orr texels_45, texel_4, texel_5, lsl #16
5397   str texels_01, [block_ptr, #0]
5398
5399   orr texels_67, texel_6, texel_7, lsl #16
5400   str texels_23, [block_ptr, #4]
5401
5402   subs num_blocks, num_blocks, #1                // sets the flags consumed by the bne below
5403   str texels_45, [block_ptr, #8]
5404
5405   str texels_67, [block_ptr, #12]
5406   add block_ptr, block_ptr, #64                  // advance to the next block
5407
5408   bne 0b
5409   nop
5410
5411   pop { r4 - r11, pc }
5412
5413
/* Register aliases for the 16bpp sprite setup routines that follow. Several
 * names deliberately share a register (for example x / texture_offset_base
 * on r1, v / num_blocks / left_mask_bits on r4). */
5414 #undef width_rounded
5415 #undef texture_mask
5416 #undef num_blocks
5417 #undef texture_offset
5418 #undef texels_low
5419 #undef texels_high
5420 #undef texels_wide_low
5421 #undef texels_wide_high
5422 #undef texels_wide
5423 #undef fb_ptr2
5424 #undef temp
5425
5426 #define psx_gpu                                           r0
5427 #define x                                                 r1
5428 #define y                                                 r2
5429 #define u                                                 r3
5430 #define v                                                 r4
5431 #define width                                             r5
5432 #define height                                            r6
5433 #define left_offset                                       r8
5434 #define width_rounded                                     r9
5435 #define right_width                                       r10
5436
5437 #define block_width                                       r11
5438
5439 #define texture_offset_base                               r1
5440 #define texture_mask                                      r2
5441 #define texture_page_ptr                                  r3
5442 #define num_blocks                                        r4
5443 #define block                                             r5
5444 #define fb_ptr                                            r7
5445 #define texture_offset                                    r8
5446 #define blocks_remaining                                  r9
5447 #define fb_ptr2                                           r10
5448 #define fb_ptr_pitch                                      r12
5449 #define texture_block_ptr                                 r14
5450
5451 #define texture_mask_width                                r2
5452 #define texture_mask_height                               r3
5453 #define left_mask_bits                                    r4
5454 #define right_mask_bits                                   r5
5455
5456
5457 #undef block_masks
5458 #undef block_masks_shifted
5459 #undef texels
5460
5461 #define block_masks                                       d0
5462 #define block_masks_shifted                               d1
5463 #define draw_mask_fb_ptr                                  d2
5464 #define texels                                            q2
5465
5466 #define draw_mask_fb_ptr_a                                d2
5467 #define draw_mask_fb_ptr_b                                d3
5468 #define texels_low                                        d4
5469 #define texels_high                                       d5
5470 #define texels_wide_low                                   d6
5471 #define texels_wide_high                                  d7
5472 #define texels_wide                                       q3
5473
5474
/*
 * Local helper, reached with "bl" from the sprite setup loops when adding a
 * row would push num_blocks past MAX_BLOCKS.
 *
 * In:  r0 = psx_gpu, block_width (r11) = number of blocks about to be added.
 * Out: block (r5) = start of the block array, num_blocks (r4) = block_width.
 *
 * r0-r3, r12 (plus whatever EXTRA_UNSAVED_REGS names) and d0-d3 are preserved
 * around the call to flush_render_block_buffer; d0-d3 are parked in the
 * psx_gpu saved_tmp area. Other NEON registers and the flags are not saved
 * here; the visible callers re-establish the flags before testing them.
 */
5475 setup_sprites_16bpp_flush:
5476   push   { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
5477   add    r1, r0, #psx_gpu_saved_tmp_offset      // r1 is scratch here; it was pushed above
5478   vstmia r1, { d0 - d3 }
5479   bl     flush_render_block_buffer
5480   pop    { r0 - r3, EXTRA_UNSAVED_REGS r12 }    // return address stays on the stack for now
5481   add    lr, r0, #psx_gpu_saved_tmp_offset      // lr is free as a temporary until the final pop
5482   vldmia lr, { d0 - d3 }
5483
5484   add block, psx_gpu, #psx_gpu_blocks_offset    // restart at the first block slot
5485   mov num_blocks, block_width
5486
5487   pop { pc }
5488
/*
 * setup_sprite_16bpp(psx_gpu, x, y, u, v, width, height)
 *   r0-r3: psx_gpu, x, y, u; v, width and height are read from the stack.
 *
 * Appends one 64-byte block per 8-texel (16-byte) span of every sprite row:
 *   +0  : 16 bytes of texels copied from the texture page
 *   +40 : 32-bit draw mask word followed by the 32-bit framebuffer pointer
 * Texture addresses are texture_page_ptr + (offset & texture_mask) with
 * 2 bytes per texel and 2048 bytes per row; framebuffer rows are also 2048
 * bytes apart. Spans are aligned down to an 8-texel boundary of u, and the
 * framebuffer pointer is moved back by the same amount (left_offset).
 *
 * Mask bits: left_mask_bits has its low left_offset bits set (the pixels that
 * precede the sprite in the first span); right_mask_bits has the bits above
 * right_width set (the pixels past the sprite in the last span).
 *
 * num_blocks is written back to psx_gpu after every row. When a row would
 * exceed MAX_BLOCKS, setup_sprites_16bpp_flush is called first.
 * Presumably width and height are non-zero on entry - TODO confirm.
 */
5489 function(setup_sprite_16bpp)
5490   push { r3 - r11, lr }
5491   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5492
5493   ldr v, [sp, #4*(10+0)]                         // 10 words were pushed; stack args start at sp + 40
5494   add fb_ptr, fb_ptr, y, lsl #11                 // + y * 2048 bytes
5495
5496   ldr width, [sp, #4*(10+1)]
5497   add fb_ptr, fb_ptr, x, lsl #1                  // + x * 2 bytes
5498
5499   ldr height, [sp, #4*(10+2)]
5500   and left_offset, u, #0x7
5501
5502   add texture_offset_base, u, u                  // u * 2 (overwrites x, which is dead now)
5503   add width_rounded, width, #7
5504
5505   add texture_offset_base, texture_offset_base, v, lsl #11
5506   mov left_mask_bits, #0xFF                      // overwrites v, already consumed above
5507   
5508   ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
5509   add width_rounded, width_rounded, left_offset
5510
5511   ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
5512   sub fb_ptr, fb_ptr, left_offset, lsl #1        // back up to the start of the aligned span
5513
5514   add texture_mask, texture_mask_width, texture_mask_width
5515   mov right_mask_bits, #0xFE                     // overwrites width, already folded into width_rounded
5516
5517   and right_width, width_rounded, #0x7
5518   mvn left_mask_bits, left_mask_bits, lsl left_offset   // ~(0xFF << left_offset)
5519
5520   add texture_mask, texture_mask, texture_mask_height, lsl #11   // mask_width * 2 + mask_height * 2048
5521   mov block_width, width_rounded, lsr #3         // blocks per row
5522
5523   mov right_mask_bits, right_mask_bits, lsl right_width
5524   movw fb_ptr_pitch, #(2048 + 16)
5525
5526   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4   // step from the last block of a row to the next row
5527   vmov block_masks, left_mask_bits, right_mask_bits     // d0 = { left, right }
5528
5529   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5530   add block, psx_gpu, #psx_gpu_blocks_offset
5531
5532   bic texture_offset_base, texture_offset_base, #0xF    // align to a 16-byte texel span
5533   cmp block_width, #1
5534
5535   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
5536   add block, block, num_blocks, lsl #6           // first free 64-byte block slot
5537
5538   bne 0f                                         // more than one block per row: general path
5539
     // Single block per row: it needs both masks, so OR left and right together.
5540   vext.32 block_masks_shifted, block_masks, block_masks, #1
5541   vorr.u32 block_masks, block_masks, block_masks_shifted
5542   vdup.u8 draw_mask_fb_ptr, block_masks[0]
5543
5544  1:
5545   add num_blocks, num_blocks, #1
5546   cmp num_blocks, #MAX_BLOCKS
5547   blgt setup_sprites_16bpp_flush
5548
5549   and texture_block_ptr, texture_offset_base, texture_mask
5550   subs height, height, #1                        // flags are consumed by the bne at the end of the loop
5551
5552   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5553   vld1.u32 { texels }, [texture_block_ptr, :128]
5554
5555   vst1.u32 { texels }, [block, :128]
5556   add block, block, #40
5557
5558   vmov.u32 draw_mask_fb_ptr[1], fb_ptr           // upper word of d2 carries the framebuffer pointer
5559   pld [fb_ptr]
5560
5561   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5562
5563   add block, block, #24
5564   add texture_offset_base, texture_offset_base, #2048   // next texture row
5565   add fb_ptr, fb_ptr, #2048                      // next framebuffer row
5566   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5567   bne 1b
5568
5569   pop { r3 - r11, pc }
5570
     // General path, one iteration per row: left block, optional fully drawn
     // middle blocks, then the right block.
5571  0:
5572   add num_blocks, num_blocks, block_width
5573   mov texture_offset, texture_offset_base        // overwrites left_offset, no longer needed
5574
5575   cmp num_blocks, #MAX_BLOCKS
5576   blgt setup_sprites_16bpp_flush
5577
5578   add texture_offset_base, texture_offset_base, #2048
5579   and texture_block_ptr, texture_offset, texture_mask
5580
5581   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5582   vld1.u32 { texels }, [texture_block_ptr, :128]  
5583
5584   vst1.u32 { texels }, [block, :128]
5585   add block, block, #40
5586
5587   vdup.u8 draw_mask_fb_ptr, block_masks[0]       // low byte of left_mask_bits
5588   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5589   pld [fb_ptr]
5590
5591   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5592   subs blocks_remaining, block_width, #2         // number of middle blocks; zero skips the loop below
5593
5594   add texture_offset, texture_offset, #16
5595   add fb_ptr, fb_ptr, #16
5596
5597   vmov.u8 draw_mask_fb_ptr, #0                   // middle blocks use an all-zero mask
5598
5599   add block, block, #24
5600   beq 2f
5601
5602  1:
5603   and texture_block_ptr, texture_offset, texture_mask
5604   subs blocks_remaining, blocks_remaining, #1
5605
5606   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5607   vld1.u32 { texels }, [texture_block_ptr, :128]
5608
5609   vst1.u32 { texels }, [block, :128]
5610   add block, block, #40
5611
5612   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5613   pld [fb_ptr]
5614
5615   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5616   
5617   add texture_offset, texture_offset, #16
5618   add fb_ptr, fb_ptr, #16
5619
5620   add block, block, #24
5621   bne 1b
5622
5623  2:
5624   and texture_block_ptr, texture_offset, texture_mask
5625   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5626
5627   vld1.u32 { texels }, [texture_block_ptr, :128]
5628   vdup.u8 draw_mask_fb_ptr, block_masks[4]       // low byte of right_mask_bits
5629
5630   vst1.u32 { texels }, [block, :128]
5631   add block, block, #40
5632
5633   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5634   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5635   
5636   add block, block, #24
5637   subs height, height, #1
5638
5639   add fb_ptr, fb_ptr, fb_ptr_pitch               // net effect over the row: + 2048 bytes
5640   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5641
5642   bne 0b
5643   nop
5644
5645   pop { r3 - r11, pc }
5646
5647
5648 // 4x version
5649 // FIXME: duplicate code with normal version :(
// draw_mask_fb_ptr is dropped here; this variant uses draw_mask_fb_ptr_a (d2)
// and draw_mask_fb_ptr_b (d3) instead.
5650 #undef draw_mask_fb_ptr
5651
/*
 * setup_sprite_16bpp_4x(psx_gpu, x, y, u, v, width, height)
 *   Same argument layout and overall structure as setup_sprite_16bpp.
 *
 * Differences visible in this routine:
 *   - the masks are 16 bits wide and shifted by twice the pixel offset
 *     (two mask bits per source texel);
 *   - left_offset is doubled before the framebuffer pointer is backed up;
 *   - block_width is multiplied by four after the per-row framebuffer step
 *     has been derived, so num_blocks grows by four per 8-texel span;
 *   - the framebuffer advances 32 bytes per span and 2 * 2048 bytes per
 *     source row;
 *   - block emission is delegated to the do_texture_block_16bpp_4x macro,
 *     which is defined outside this excerpt. Presumably it writes four blocks
 *     from `texels`, uses fb_ptr2 as scratch and leaves the flags alone (the
 *     loops below rely on flags set before it) - TODO confirm.
 */
5652 function(setup_sprite_16bpp_4x)
5653   push { r3 - r11, lr }
5654   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5655
5656   ldr v, [sp, #4*(10+0)]                         // 10 words were pushed; stack args start at sp + 40
5657   add fb_ptr, fb_ptr, y, lsl #11
5658
5659   ldr width, [sp, #4*(10+1)]
5660   add fb_ptr, fb_ptr, x, lsl #1
5661
5662   ldr height, [sp, #4*(10+2)]
5663   and left_offset, u, #0x7
5664
5665   add texture_offset_base, u, u
5666   add width_rounded, width, #7
5667
5668   add texture_offset_base, texture_offset_base, v, lsl #11
5669   movw left_mask_bits, #0xFFFF
5670   
5671   ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
5672   add width_rounded, width_rounded, left_offset   // must use left_offset before it is doubled below
5673
5674   lsl left_offset, #1
5675
5676   ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
5677   sub fb_ptr, fb_ptr, left_offset, lsl #1
5678
5679   add texture_mask, texture_mask_width, texture_mask_width
5680   movw right_mask_bits, #0xFFFC
5681
5682   and right_width, width_rounded, #0x7
5683   mvn left_mask_bits, left_mask_bits, lsl left_offset   // ~(0xFFFF << (2 * (u & 7)))
5684
5685   lsl right_width, #1
5686
5687   add texture_mask, texture_mask, texture_mask_height, lsl #11
5688   mov block_width, width_rounded, lsr #3         // still counted in 8-texel spans at this point
5689
5690   mov right_mask_bits, right_mask_bits, lsl right_width
5691   movw fb_ptr_pitch, #(2048 + 16) * 2
5692
5693   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1   // 32 framebuffer bytes per span
5694   vmov block_masks, left_mask_bits, right_mask_bits
5695
5696   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5697   add block, psx_gpu, #psx_gpu_blocks_offset
5698
5699   bic texture_offset_base, texture_offset_base, #0xF
5700   cmp block_width, #1
5701
5702   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
5703   add block, block, num_blocks, lsl #6
5704
5705   lsl block_width, #2                            // non-flag-setting, so the cmp result survives for the bne
5706   bne 0f
5707
     // Single span per row: OR the left and right masks together.
5708   vext.32 block_masks_shifted, block_masks, block_masks, #1
5709   vorr.u32 block_masks, block_masks, block_masks_shifted
5710   vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
5711   vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5712
5713  1:
5714   add num_blocks, num_blocks, block_width
5715   cmp num_blocks, #MAX_BLOCKS
5716   blgt setup_sprites_16bpp_flush
5717
5718   and texture_block_ptr, texture_offset_base, texture_mask
5719   subs height, height, #1
5720
5721   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5722   vld1.u32 { texels }, [texture_block_ptr, :128]
5723
5724   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5725
5726   add texture_offset_base, texture_offset_base, #2048
5727   add fb_ptr, fb_ptr, #2048*2
5728   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5729   bne 1b
5730
5731   pop { r3 - r11, pc }
5732
     // General path, one iteration per source row.
5733  0:
5734   add num_blocks, num_blocks, block_width
5735   mov texture_offset, texture_offset_base
5736
5737   vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
5738   vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5739
5740   cmp num_blocks, #MAX_BLOCKS
5741   blgt setup_sprites_16bpp_flush
5742
5743   add texture_offset_base, texture_offset_base, #2048
5744   and texture_block_ptr, texture_offset, texture_mask
5745
5746   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5747   vld1.u32 { texels }, [texture_block_ptr, :128]
5748
5749   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5750
5751   subs blocks_remaining, block_width, #2*4       // block_width is already x4; zero means no middle spans
5752   add texture_offset, texture_offset, #16
5753
5754   vmov.u8 draw_mask_fb_ptr_a, #0                 // middle spans use all-zero masks
5755   vmov.u8 draw_mask_fb_ptr_b, #0
5756
5757   add fb_ptr, fb_ptr, #16*2
5758   beq 2f
5759
5760  1:
5761   and texture_block_ptr, texture_offset, texture_mask
5762   subs blocks_remaining, blocks_remaining, #4
5763
5764   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5765   vld1.u32 { texels }, [texture_block_ptr, :128]
5766
5767   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5768   add texture_offset, texture_offset, #16
5769
5770   add fb_ptr, fb_ptr, #16*2
5771   bgt 1b
5772
5773  2:
5774   vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
5775   vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
5776
5777   and texture_block_ptr, texture_offset, texture_mask
5778   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5779
5780   vld1.u32 { texels }, [texture_block_ptr, :128]
5781
5782   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5783   subs height, height, #1
5784
5785   add fb_ptr, fb_ptr, fb_ptr_pitch
5786   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5787
5788   bne 0b
5789   nop
5790
5791   pop { r3 - r11, pc }
5792
5793
/* Register aliases for the untextured sprite setup routine. As elsewhere in
 * this file several names share a register: x / color_r (r1), y / color_g
 * (r2), right_width / block (r5), right_mask_bits / blocks_remaining (r6),
 * color / color_b (r8), and test_mask / draw_mask (q2). */
5794 #undef width
5795 #undef right_width
5796 #undef right_mask_bits
5797 #undef color
5798 #undef height
5799 #undef blocks_remaining
5800 #undef colors
5801 #undef right_mask
5802 #undef test_mask
5803 #undef draw_mask
5804
5805 #define psx_gpu                                           r0
5806 #define x                                                 r1
5807 #define y                                                 r2
5808 #define width                                             r3
5809 #define right_width                                       r5
5810 #define right_mask_bits                                   r6
5811 #define fb_ptr                                            r7
5812 #define color                                             r8
5813 #define height                                            r9
5814 #define fb_ptr_pitch                                      r12
5815
5816 // referenced by setup_sprites_16bpp_flush
// (that helper writes r4 and r5 and reads r11, so these must keep the same
// assignments it was assembled with)
5817 #define num_blocks                                        r4
5818 #define block                                             r5
5819 #define block_width                                       r11
5820
5821 #define color_r                                           r1
5822 #define color_g                                           r2
5823 #define color_b                                           r8
5824 #define blocks_remaining                                  r6
5825
5826 #define colors                                            q0
5827 #define right_mask                                        q1
5828 #define test_mask                                         q2
5829 #define draw_mask                                         q2
5830 #define draw_mask_bits_fb_ptr                             d6
5831
5832
5833 .align 3
5834
/*
 * setup_sprite_untextured_512(psx_gpu, x, y, u, v, width, height, color)
 *   r0-r2: psx_gpu, x, y. u (r3) and v (first stack word) are never read;
 *   width, height and color are loaded from the stack.
 *
 * Appends one 64-byte block per 8-pixel span of every row of a flat-colored
 * rectangle:
 *   +0  : 16-byte draw mask (all zero, except for the last span of a row,
 *         which gets right_mask)
 *   +16 : 16 bytes holding the 15-bit color replicated eight times
 *   +40 : draw_mask_bits word followed by the 32-bit framebuffer pointer
 * color is converted from 8 bits per channel to 5:5:5 by taking the top five
 * bits of each byte. Framebuffer rows are 2048 bytes apart.
 *
 * right_mask is built by testing (0xff << right_width) against the eight
 * halfwords at the very start of psx_gpu - presumably a table of per-pixel
 * bit values; confirm against the psx_gpu layout.
 *
 * Stack alignment: ten registers are pushed, r3 only as padding, so that sp
 * stays 8-byte aligned. setup_sprites_16bpp_flush pushes six more words
 * (assuming EXTRA_UNSAVED_REGS expands to nothing) before it calls
 * flush_render_block_buffer, and the procedure call standard requires 8-byte
 * alignment at that call. setup_sprite_16bpp uses the same ten-register push.
 *
 * NOTE(review): when block_width is 1 the low word of draw_mask_bits_fb_ptr
 * is stored without having been initialised in this routine; presumably that
 * field is ignored for untextured blocks - confirm.
 */
5835 function(setup_sprite_untextured_512)
5836   push { r3 - r11, r14 }
5837
5838   ldr width, [sp, #4*(10+1)]                     // 10 words pushed; stack args start at sp + 40
5839   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5840
5841   ldr height, [sp, #4*(10+2)]
5842   add fb_ptr, fb_ptr, y, lsl #11                 // + y * 2048 bytes
5843
5844   add fb_ptr, fb_ptr, x, lsl #1                  // + x * 2 bytes
5845   sub right_width, width, #1
5846
5847   ldr color, [sp, #4*(10+3)]
5848   and right_width, #7
5849
5850   add block_width, width, #7
5851   add right_width, #1                            // pixels used in the last span: ((width - 1) & 7) + 1
5852
5853   lsr block_width, #3                            // spans (blocks) per row
5854   mov right_mask_bits, #0xff
5855
5856   sub fb_ptr_pitch, block_width, #1
5857   lsl right_mask_bits, right_width
5858
5859   lsl fb_ptr_pitch, #3+1                         // (block_width - 1) * 16 bytes
5860   ubfx color_r, color, #3, #5                    // x and y are dead; r1 / r2 now hold color channels
5861
5862   rsb fb_ptr_pitch, #1024*2                      // step from the last span of a row to the next row
5863   ubfx color_g, color, #11, #5
5864
5865   vld1.u32 { test_mask }, [psx_gpu, :128]
5866   ubfx color_b, color, #19, #5                   // overwrites color (same register) with the blue field
5867
5868   vdup.u16 right_mask, right_mask_bits
5869   orr color, color_r, color_b, lsl #10
5870
5871   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5872   orr color, color, color_g, lsl #5              // color = r | g << 5 | b << 10
5873
5874   vtst.u16 right_mask, right_mask, test_mask     // lane = 0xffff where the bit test is non-zero
5875   add block, psx_gpu, #psx_gpu_blocks_offset     // right_width (same register) is dead by now
5876
5877   vdup.u16 colors, color
5878   add block, block, num_blocks, lsl #6           // first free 64-byte block slot
5879
5880
5881 setup_sprite_untextured_height_loop:
5882   add num_blocks, block_width
5883   sub blocks_remaining, block_width, #1          // spans before the last one
5884
5885   cmp num_blocks, #MAX_BLOCKS
5886   blgt setup_sprites_16bpp_flush                 // preserves d0-d3 (colors, right_mask), not q2 / d6
5887
5888   cmp blocks_remaining, #0
5889   ble 1f
5890
5891   vmov.u8 draw_mask, #0 /* zero_mask */
5892   vmov.u8 draw_mask_bits_fb_ptr, #0
5893
5894  0:
5895   vst1.u32 { draw_mask }, [block, :128]!
5896   subs blocks_remaining, #1                      // flags are consumed by the bgt at the end of the loop
5897
5898   vst1.u32 { colors }, [block, :128]
5899   add block, block, #24
5900
5901   vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5902   vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
5903   
5904   add block, block, #24
5905   add fb_ptr, #8*2                               // 8 pixels, 2 bytes each
5906   bgt 0b
5907
5908  1:
5909   vst1.u32 { right_mask }, [block, :128]!
5910   subs height, #1                                // flags are consumed by the bgt at the end of the row
5911
5912   vst1.u32 { colors }, [block, :128]
5913   add block, block, #24
5914
5915   vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5916   vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
5917   
5918   add block, block, #24
5919   add fb_ptr, fb_ptr_pitch                       // net effect over the row: + 2048 bytes
5920
5921   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5922   bgt setup_sprite_untextured_height_loop
5923
5924   pop { r3 - r11, pc }
5925
5926
5927
/* Register aliases for update_texture_4bpp_cache. texel_block_expanded_ab
 * shares q2 with _b, _cd shares q3 with _c, and _d is q0, which overlaps the
 * two source registers d0 / d1. */
5928 #undef texture_page_ptr
5929 #undef vram_ptr
5930 #undef dirty_textures_mask
5931 #undef current_texture_mask
5932
5933 #define psx_gpu                                           r0
5934 #define current_texture_page                              r1
5935 #define texture_page_ptr                                  r2
5936 #define vram_ptr_a                                        r3
5937 #define current_texture_page_x                            r12
5938 #define current_texture_page_y                            r4
5939 #define dirty_textures_mask                               r5
5940 #define tile_y                                            r6
5941 #define tile_x                                            r7
5942 #define sub_y                                             r8
5943 #define current_texture_mask                              r9
5944 #define c_4096                                            r10
5945 #define vram_ptr_b                                        r11
5946
5947 #define texel_block_a                                     d0
5948 #define texel_block_b                                     d1
5949 #define texel_block_expanded_a                            q1
5950 #define texel_block_expanded_b                            q2
5951 #define texel_block_expanded_ab                           q2
5952 #define texel_block_expanded_c                            q3
5953 #define texel_block_expanded_d                            q0
5954 #define texel_block_expanded_cd                           q3
5955
/*
 * update_texture_4bpp_cache(psx_gpu)
 *   r0: psx_gpu
 *
 * Expands the current 4bpp texture page from VRAM into the buffer at
 * texture_page_base, one byte per texel, and clears current_texture_mask out
 * of dirty_textures_4bpp_mask.
 *
 * The page is addressed from current_texture_page: the low nibble selects a
 * 128-byte column and the high nibble is shifted left by 19 (256 rows of
 * 2048 bytes). It is processed as 16 x 16 tiles; each tile is 16 rows of
 * 8 source bytes (16 texels) and is written out contiguously as 256 bytes.
 * For each source byte the low nibble is emitted first, then the high nibble.
 *
 * r3-r11 and q0-q3 are saved and restored, so callers may keep values in
 * them across the call.
 */
5956 function(update_texture_4bpp_cache)
5957   push  { r3 - r11, r14 }
5958   vpush { q0 - q3 }
5959
5960   ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
5961
5962   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
5963   ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
5964
5965   and current_texture_page_x, current_texture_page, #0xF
5966   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
5967
5968   mov current_texture_page_y, current_texture_page, lsr #4
5969   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
5970
5971   add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5972   mov tile_y, #16
5973
5974   add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7  
5975   bic dirty_textures_mask, current_texture_mask   // this page is about to be refreshed
5976   
5977   mov tile_x, #16
5978   str dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
5979
5980   mov sub_y, #8                                  // 8 iterations x 2 rows = 16 rows per tile
5981   movw c_4096, #4096                             // each pointer advances two VRAM rows per load
5982
5983   add vram_ptr_b, vram_ptr_a, #2048              // vram_ptr_b tracks the row after vram_ptr_a
5984
5985  0:
5986   vld1.u32 { texel_block_a }, [vram_ptr_a, :64], c_4096
5987   vld1.u32 { texel_block_b }, [vram_ptr_b, :64], c_4096
5988
     // Per source byte: halfword lane a = byte, lane b = byte << 4. After
     // clearing bits 4-7 of each and ORing, the low byte holds the low nibble
     // and the high byte holds the high nibble.
5989   vmovl.u8 texel_block_expanded_a, texel_block_a
5990   vshll.u8 texel_block_expanded_b, texel_block_a, #4
5991   vmovl.u8 texel_block_expanded_c, texel_block_b
5992   vshll.u8 texel_block_expanded_d, texel_block_b, #4    // destination q0 overlaps the source d1
5993
5994   vbic.u16 texel_block_expanded_a, #0x00F0
5995   vbic.u16 texel_block_expanded_b, #0x00F0
5996   vbic.u16 texel_block_expanded_c, #0x00F0
5997   vbic.u16 texel_block_expanded_d, #0x00F0
5998
5999   vorr.u16 texel_block_expanded_ab, texel_block_expanded_a,                    \
6000    texel_block_expanded_b
6001   vorr.u16 texel_block_expanded_cd, texel_block_expanded_c,                    \
6002    texel_block_expanded_d
6003
6004   vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd },               \
6005    [texture_page_ptr, :256]!
6006
6007   subs sub_y, sub_y, #1
6008   bne 0b
6009
     // Tile finished: move 8 bytes right and back up the 16 rows just read.
6010   mov sub_y, #8
6011   add vram_ptr_a, vram_ptr_a, #8
6012   add vram_ptr_b, vram_ptr_b, #8
6013
6014   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6015   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6016
6017   subs tile_x, tile_x, #1
6018   bne 0b
6019
     // Row of 16 tiles finished: go down 16 rows and back to the left edge.
6020   mov tile_x, #16
6021   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6022   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6023
6024   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6025   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6026
6027   subs tile_y, tile_y, #1
6028   bne 0b
6029
6030   vpop { q0 - q3 }
6031   pop  { r3 - r11, pc }
6032
6033
/* Register aliases for update_texture_8bpp_cache_slice. Note that
 * current_texture_page moves to r5 here; r1 is the texture_page argument. */
6034 #undef current_texture_page
6035
6036 #define psx_gpu                                           r0
6037 #define texture_page                                      r1
6038 #define texture_page_ptr                                  r2
6039 #define vram_ptr_a                                        r3
6040 #define texture_page_x                                    r12
6041 #define texture_page_y                                    r4
6042 #define current_texture_page                              r5
6043 #define tile_y                                            r6
6044 #define tile_x                                            r7
6045 #define sub_y                                             r8
6046 #define c_4096                                            r10
6047 #define vram_ptr_b                                        r11
6048
6049
6050 #undef texels_a
6051 #undef texels_b
6052
6053 #define texels_a                                          q0
6054 #define texels_b                                          q1
6055 #define texels_c                                          q2
6056 #define texels_d                                          q3
6057
6058
/*
 * update_texture_8bpp_cache_slice(psx_gpu, texture_page)
 *   r0: psx_gpu, r1: texture_page
 *
 * Copies one VRAM texture page (128 bytes wide, 256 rows) into the buffer at
 * texture_page_base, rearranged into 16 x 16 byte tiles of 256 contiguous
 * bytes each. The source is located the same way as in the 4bpp routine: low
 * nibble of texture_page selects a 128-byte column, high nibble is shifted
 * left by 19.
 *
 * Output is written as 16 runs of 8 tiles (2048 bytes), each followed by a
 * 2048-byte gap. If bit 0 of texture_page differs from bit 0 of
 * psx_gpu->current_texture_page, writing starts 2048 bytes into the buffer,
 * i.e. in the gaps - presumably the other half of a two-page-wide cache;
 * confirm against the caller.
 *
 * q0-q3 are used without being saved.
 */
6059 function(update_texture_8bpp_cache_slice)
6060   stmdb sp!, { r4 - r11, r14 }
6061
6062   ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
6063   ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
6064
6065   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
6066   mov tile_y, #16
6067
6068   and texture_page_x, texture_page, #0xF
6069   mov texture_page_y, texture_page, lsr #4
6070
6071   add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7  
6072   mov tile_x, #8                                 // 8 tiles of 16 bytes span the 128-byte page width
6073
6074   add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
6075   eor current_texture_page, current_texture_page, texture_page
6076
6077   ands current_texture_page, current_texture_page, #0x1   // Z clear when the pages differ in bit 0
6078   mov sub_y, #4                                  // 4 iterations x 4 rows = 16 rows per tile
6079
6080   addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)   // start in the second 2048-byte slice
6081   movw c_4096, #4096                             // each pointer advances two VRAM rows per load
6082
6083   add vram_ptr_b, vram_ptr_a, #2048              // vram_ptr_b tracks the row after vram_ptr_a
6084
6085  0:
6086   vld1.u32 { texels_a }, [vram_ptr_a, :128], c_4096   // rows n, n+1, n+2, n+3 in turn
6087   vld1.u32 { texels_b }, [vram_ptr_b, :128], c_4096
6088   vld1.u32 { texels_c }, [vram_ptr_a, :128], c_4096
6089   vld1.u32 { texels_d }, [vram_ptr_b, :128], c_4096
6090
6091   vst1.u32 { texels_a, texels_b }, [texture_page_ptr, :256]!
6092   vst1.u32 { texels_c, texels_d }, [texture_page_ptr, :256]!
6093
6094   subs sub_y, sub_y, #1
6095   bne 0b
6096
     // Tile finished: move 16 bytes right and back up the 16 rows just read.
6097   mov sub_y, #4
6098
6099   add vram_ptr_a, vram_ptr_a, #16
6100   add vram_ptr_b, vram_ptr_b, #16
6101
6102   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6103   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6104
6105   subs tile_x, tile_x, #1
6106   bne 0b
6107
     // Row of 8 tiles finished: go down 16 rows and back to the left edge.
6108   mov tile_x, #8
6109
6110   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6111   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6112
6113   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6114   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6115
6116   subs tile_y, tile_y, #1
6117   add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)   // skip the interleaved slice; flags untouched
6118
6119   bne 0b
6120
6121   ldmia sp!, { r4 - r11, pc }
6122
6123
6124 /* void scale2x_tiles8(void *dst, const void *src, int w8, int h) */
/*
 *   r0: dst, r1: src, r2: w8 (row width in units of 8 pixels), r3: h (rows)
 *
 * Doubles a 16-bit-per-pixel image in both directions: every source pixel is
 * written twice horizontally, and every output row is written twice (to r0
 * and to r12 = r0 + 2048). Source rows are 2048 bytes apart, destination row
 * pairs 4096 bytes apart.
 *
 * The inner loop handles two 8-pixel tiles per pass. Both tiles are always
 * loaded; when w8 is odd the final pass stores only the first one, so up to
 * 16 bytes past the end of the row are read but not used.
 * All loads and stores carry a :128 alignment hint, so src and dst are
 * presumably 16-byte aligned - confirm against callers.
 */
6125 function(scale2x_tiles8)
6126   push { r4, r14 }
6127
6128   mov r4, r1                                     // r4 = start of the current source row
6129   add r12, r0, #1024*2                           // r12 = second destination row of the pair
6130   mov r14, r2                                    // r14 = tiles left in this row
6131
6132 0:
6133   pld [r1, #1024*2]                              // prefetch the next source row
6134   vld1.u16 { q0 }, [r1, :128]!
6135   vld1.u16 { q2 }, [r1, :128]!
6136   vmov q1, q0
6137   vmov q3, q2
6138   vzip.16 q0, q1                                 // interleave a tile with its own copy: p0 p0 p1 p1 ...
6139   vzip.16 q2, q3
6140   subs r14, #2                                   // flags drive both the blt and the bgt below
6141   vst1.u16 { q0, q1 }, [r0, :128]!
6142   vst1.u16 { q0, q1 }, [r12, :128]!
6143   blt 1f                                         // odd w8: the second tile is past the end of the row
6144   vst1.u16 { q2, q3 }, [r0, :128]!
6145   vst1.u16 { q2, q3 }, [r12, :128]!
6146   bgt 0b
6147 1:
6148   subs r3, #1                                    // flags survive until the bgt at the end of the row
6149   mov r14, r2
6150   add r0, #1024*2*2                              // down two destination rows ...
6151   add r4, #1024*2                                // ... and one source row
6152   sub r0, r0, r2, lsl #4+1                       // undo the w8 * 32 bytes written along the row
6153   mov r1, r4
6154   add r12, r0, #1024*2
6155   bgt 0b
6156   nop
6157
6158   pop { r4, pc }
6159
6160 // vim:filetype=armasm