da47756efcaea45cd3704fd5338e88c3895daa75
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
1 /*
2  * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
3  * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation; either version 2 of
8  * the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License for more details.
14  */
15
16 #define MAX_SPANS                                         512
17 #define MAX_BLOCKS                                        64
18 #define MAX_BLOCKS_PER_ROW                                128
19
20 #define RENDER_STATE_MASK_EVALUATE                        0x20
21 #define RENDER_FLAGS_MODULATE_TEXELS                      0x1
22 #define RENDER_FLAGS_BLEND                                0x2
23 #define RENDER_INTERLACE_ENABLED                          0x1
24
25 #include "psx_gpu_offsets.h"
26
27 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
28
29 #define edge_data_left_x_offset                           0
30 #define edge_data_num_blocks_offset                       2
31 #define edge_data_right_mask_offset                       4
32 #define edge_data_y_offset                                6
33
34 .syntax unified
35 .text
36
37 #if 0
38 #define save_abi_regs() \
39   vpush {q4-q7}
40 #define restore_abi_regs() \
41   vpop  {q4-q7}
42 #else
43 #define save_abi_regs()
44 #define restore_abi_regs()
45 #endif
46
47 #define psx_gpu                                           r0
48 #define v_a                                               r1
49 #define v_b                                               r2
50 #define v_c                                               r3
51
52 #define x0                                                r4
53 #define x1                                                r5
54 #define x2                                                r6
55 #define x0_x1                                             r5
56 #define x1_x2                                             r6
57 #define y0                                                r7
58 #define y1                                                r8
59 #define y2                                                r9
60 #define y0_y1                                             r7
61 #define y1_y2                                             r8
62 #define b0                                                r9
63 #define b1                                                r10
64 #define b2                                                r11
65 #define b0_b1                                             r10
66 #define b1_b2                                             r11
67
68
69 #define area_r_s                                          r5
70
71 #define g_bx0                                             r2
72 #define g_bx                                              r3
73 #define g_bx2                                             r4
74 #define g_bx3                                             r5
75 #define b_base                                            r6
76 #define g_by                                              r8
77
78 #define gs_bx                                             r7
79 #define gs_by                                             r10
80
81 #define ga_bx                                             g_bx
82 #define ga_by                                             g_by
83
84 #define gw_bx_h                                           g_bx
85 #define gw_by_h                                           g_by
86
87 #define gw_bx_l                                           r11
88 #define gw_by_l                                           gw_bx_l
89
90 #define store_a                                           r0
91 #define store_b                                           r1
92 #define store_inc                                         r5
93
94
95 #define v0                                                q0
96 #define uvrgb0                                            d0
97 #define x0_y0                                             d1
98
99 #define v1                                                q1
100 #define uvrgb1                                            d2
101 #define x1_y1                                             d3
102
103 #define v2                                                q2
104 #define uvrgb2                                            d4
105 #define x2_y2                                             d5
106
107 #define x0_ab                                             q3
108 #define uvrg_xxxx0                                        q3
109 #define uvrg0                                             d6
110 #define xxxx0                                             d7
111
112 #define x1_ab                                             q4
113 #define uvrg_xxxx1                                        q4
114 #define uvrg1                                             d8
115 #define xxxx1                                             d9
116
117 #define x2_ab                                             q5
118 #define uvrg_xxxx2                                        q5
119 #define uvrg2                                             d10
120 #define xxxx2                                             d11
121
122 #define y0_ab                                             q6
123 #define yyyy_uvrg0                                        q6
124 #define yyyy0                                             d12
125 #define uvrg0b                                            d13
126
127 #define y1_ab                                             q7
128 #define yyyy_uvrg1                                        q7
129 #define yyyy1                                             d14
130 #define uvrg1b                                            d15
131
132 #define y2_ab                                             q8
133 #define yyyy_uvrg2                                        q8
134 #define yyyy2                                             d16
135 #define uvrg2b                                            d17
136
137 #define d0_ab                                             q9
138 #define d0_a                                              d18
139 #define d0_b                                              d19
140
141 #define d1_ab                                             q10
142 #define d1_a                                              d20
143 #define d1_b                                              d21
144
145 #define d2_ab                                             q11
146 #define d2_a                                              d22
147 #define d2_b                                              d23
148
149 #define d3_ab                                             q12
150 #define d3_a                                              d24
151 #define d3_b                                              d25
152
153 #define ga_uvrg_x                                         q1
154 #define ga_uvrg_y                                         q4
155
156 #define dx                                                x0_x1
157 #define dy                                                y0_y1
158 #define db                                                b0_b1
159
160 #define uvrg_base                                         q11
161
162 #define gs_uvrg_x                                         q5
163 #define gs_uvrg_y                                         q6
164
165 #define g_uvrg_x                                          q1
166 #define ga_uv_x                                           d2
167 #define g_uv_x                                            d2
168 #define ga_rg_x                                           d3
169 #define g_rg_x                                            d3
170
171 #define g_uvrg_y                                          q4
172 #define ga_uv_y                                           d8
173 #define g_uv_y                                            d8
174 #define ga_rg_y                                           d9
175 #define g_rg_y                                            d9
176
177 #define gw_uv_x                                           q1
178 #define gw_rg_x                                           q2
179 #define gw_uv_y                                           q4
180 #define gw_rg_y                                           q3
181
182 #define w_mask                                            q9
183 #define w_mask_l                                          d18
184
185 #define r_shift                                           q10
186
187 #define uvrg_dx0                                          q0
188 #define uvrg_dx0l                                         d0
189 #define uvrg_dx0h                                         d1
190
191 #define uvrg_dx1                                          q1
192 #define uvrg_dx1l                                         d2
193 #define uvrg_dx1h                                         d3
194
195 #define uvrg_dx2                                          q2
196 #define uvrg_dx2l                                         d4
197 #define uvrg_dx2h                                         d5
198
199 #define uvrg_dx3                                          q3
200 #define uvrg_dx3l                                         d6
201 #define uvrg_dx3h                                         d7
202
203 #define uvrgb_phase                                       q13
204
205 .align 4
206
207 #include "arm_features.h"
208
209 #define function(name) FUNCTION(name):
210
211 #ifndef TEXRELS_FORBIDDEN
212
213 #define JT_OP_REL(table_label, index_reg, temp)
214 #define JT_OP(x...) x
215 #define JTE(start, target) target
216
217 #else
218
219 #define JT_OP_REL(table_label, index_reg, temp)                                \
220   adr temp, table_label;                                                       \
221   ldr temp, [temp, index_reg, lsl #2];                                         \
222   add pc, pc, temp                                                             \
223
224 #define JT_OP(x...)
225 #define JTE(start, target) (target - start)
226
227 #endif
228
229 #ifdef __MACH__
230 #define flush_render_block_buffer _flush_render_block_buffer
231 #define setup_sprite_untextured_simple _setup_sprite_untextured_simple
232 #define update_texture_8bpp_cache _update_texture_8bpp_cache
233 #endif
234
235 @ r0: psx_gpu
236 @ r1: v_a
237 @ r2: v_b
238 @ r3: v_c
239
240 function(compute_all_gradients)
241   // First compute the triangle area reciprocal and shift. The division will
242   // happen concurrently with much of the work which follows.
243   @ r12 = psx_gpu->triangle_area
244   ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
245   stmdb sp!, { r4 - r11, lr }
246   save_abi_regs()
247
248   @ load exponent of 62 into upper half of double
249   movw r4, #0
250   clz r14, r12                       @ r14 = shift
251
252   movt r4, #((62 + 1023) << 4)
253   mov r12, r12, lsl r14              @ r12 = triangle_area_normalized
254
255   @ load area normalized into lower half of double
256   mov r5, r12, lsr #10
257   vmov.f64 d30, r5, r4               @ d30 = (1 << 62) + ta_n
258
259   movt r4, #((1022 + 31) << 4)
260   mov r5, r12, lsl #20
261
262   add r4, r4, r12, lsr #11
263   vmov.f64 d31, r5, r4
264
265   vdiv.f64 d30, d30, d31             @ d30 = ((1 << 62) + ta_n) / ta_n
266
267   // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
268   // ( d0       *  d1      ) - ( d2       *  d3      ) =
269   // ( m0                  ) - ( m1                  ) = gradient
270
271   // This is split to do 12 elements at a time over three sets: a, b, and c.
272   // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
273   // two of the slots are unused.
274
275   // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
276   // is g.
277
278   // First type is:  uvrg bxxx xxxx 
279   // Second type is: yyyy ybyy uvrg 
280   // Since x_a and y_c are the same the same variable is used for both. 
281
282   vld1.u32 { v0 }, [v_a, :128]       @ v0 = { uvrg0, b0, x0, y0 }
283   ldrsh x0, [v_a, #8]                @ load x0
284
285   vld1.u32 { v1 }, [v_b, :128]       @ v1 = { uvrg1, b1, x1, y1}
286   ldrh x1, [v_b, #8]                 @ load x1
287
288   vld1.u32 { v2 }, [v_c, :128]       @ v2 = { uvrg2, b2, x2, y2 }
289   ldrh x2, [v_c, #8]                 @ load x2
290
291   vmovl.u8 uvrg_xxxx0, uvrgb0        @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
292   ldrh y0, [v_a, #10]                @ load y0
293
294   vmovl.u8 uvrg_xxxx1, uvrgb1        @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
295   ldrh y1, [v_b, #10]                @ load y1
296
297   vmovl.u8 uvrg_xxxx2, uvrgb2        @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
298   ldrh y2, [v_c, #10]                @ load y2
299
300   vmov.u8 uvrg0b, uvrg0              @ uvrg0b = { uv0, rg0 }
301   vdup.u16 xxxx0, x0_y0[0]           @ xxxx0 = { xx0, xx0 }
302
303   orr x1_x2, x1, x2, lsl #16         @ x1_x2 = { x1, x2 }
304   pkhbt x0_x1, x0, x1, lsl #16       @ x0_x1 = { x0, x1 }
305
306   vmov.u8 uvrg1b, uvrg1              @ uvrg1b = { uv1, rg1 }
307   vdup.u16 xxxx1, x1_y1[0]           @ xxxx1 = { xx1, xx1 }
308
309   vmov.u8 uvrg2b, uvrg2              @ uvrg2b = { uv2, rg2 }
310   vdup.u16 xxxx2, x2_y2[0]           @ xxxx2 = { xx2, xx2 }
311
312   ldrb b2, [v_c, #4]                 @ load b2
313   orr y0_y1, y0, y1, lsl #16         @ y0_y1 = { y0, y1 }
314
315   ldrb b1, [v_b, #4]                 @ load b1
316   orr y1_y2, y1, y2, lsl #16         @ y1_y2 = { y1, y2 }
317
318   vdup.u16 yyyy0, x0_y0[1]           @ yyyy0 = { yy0, yy0 }
319   vsub.s16 d0_ab, x1_ab, x0_ab
320
321   ldrb b0, [v_a, #4]                 @ load b0
322   orr b1_b2, b1, b2, lsl #16         @ b1_b2 = { b1, b2 }
323
324   vdup.u16 yyyy1, x1_y1[1]           @ yyyy1 = { yy1, yy1 }
325   vsub.s16 d2_ab, x2_ab, x1_ab
326
327   vdup.u16 yyyy2, x2_y2[1]           @ yyyy2 = { yy2, yy2 }
328   vsub.s16 d1_ab, y2_ab, y1_ab
329
330   orr b0_b1, b0, b1, lsl #16         @ b1_b2 = { b1, b2 }
331   ssub16 dx, x1_x2, x0_x1            @ dx = { x1 - x0, x2 - x1 }
332
333   ssub16 dy, y1_y2, y0_y1            @ dy = { y1 - y0, y2 - y1 }
334   ssub16 db, b1_b2, b0_b1            @ db = { b1 - b0, b2 - b1 }
335
336   vsub.s16 d3_ab, y1_ab, y0_ab
337   smusdx ga_by, dx, db               @ ga_by = ((x1 - x0) * (b2 - b1)) -
338                                      @         ((x2 - X1) * (b1 - b0)) 
339   vmull.s16 ga_uvrg_x, d0_a, d1_a
340   smusdx ga_bx, db, dy               @ ga_bx = ((b1 - b0) * (y2 - y1)) -
341                                      @         ((b2 - b1) * (y1 - y0))
342   vmlsl.s16 ga_uvrg_x, d2_a, d3_a
343   movs gs_bx, ga_bx, asr #31
344
345   vmull.s16 ga_uvrg_y, d0_b, d1_b
346   rsbmi ga_bx, ga_bx, #0
347
348   @ r12 = psx_gpu->uvrgb_phase
349   ldr r12, [psx_gpu, #psx_gpu_uvrgb_phase_offset]
350
351   vmlsl.s16 ga_uvrg_y, d2_b, d3_b
352   movs gs_by, ga_by, asr #31
353
354   vshr.u64 d0, d30, #22
355   add b_base, r12, b0, lsl #16
356
357   vdup.u32 uvrgb_phase, r12
358
359   rsbmi ga_by, ga_by, #0
360   vclt.s32 gs_uvrg_x, ga_uvrg_x, #0  @ gs_uvrg_x = ga_uvrg_x < 0
361
362   @ r12 = psx_gpu->triangle_winding_offset
363   ldrb r12, [psx_gpu, #psx_gpu_triangle_winding_offset]
364   vclt.s32 gs_uvrg_y, ga_uvrg_y, #0  @ gs_uvrg_y = ga_uvrg_y < 0
365
366   rsb r12, r12, #0                   @ r12 = -(triangle->winding)
367
368   vdup.u32 w_mask, r12               @ w_mask = { -w, -w, -w, -w }
369   sub r14, r14, #(62 - 12)           @ r14 = shift - (62 - FIXED_BITS)
370
371   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
372   vdup.u32 r_shift, r14              @ r_shift = { shift, shift, shift, shift }
373
374   vadd.u32 uvrg_base, uvrgb_phase
375   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
376
377   vmov area_r_s, s0                  @ area_r_s = triangle_reciprocal
378   vabs.s32 ga_uvrg_y, ga_uvrg_y      @ ga_uvrg_y = abs(ga_uvrg_y)
379
380   vmull.u32 gw_rg_x, ga_rg_x, d0[0]
381   vmull.u32 gw_uv_x, ga_uv_x, d0[0]
382   vmull.u32 gw_rg_y, ga_rg_y, d0[0]
383   vmull.u32 gw_uv_y, ga_uv_y, d0[0]
384
385   vshl.u64 gw_rg_x, gw_rg_x, r_shift
386   vshl.u64 gw_uv_x, gw_uv_x, r_shift
387   vshl.u64 gw_rg_y, gw_rg_y, r_shift
388   vshl.u64 gw_uv_y, gw_uv_y, r_shift
389
390   veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
391   vmovn.u64 g_uv_x, gw_uv_x
392
393   veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
394   vmovn.u64 g_rg_x, gw_rg_x
395
396   veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
397   vmovn.u64 g_uv_y, gw_uv_y
398
399   vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
400   vmovn.u64 g_rg_y, gw_rg_y
401
402   veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
403   mov ga_bx, ga_bx, lsl #13
404
405   vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
406   mov ga_by, ga_by, lsl #13
407
408   vdup.u32 x0_y0, x0
409   umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
410
411   vshl.u32 g_uvrg_x, g_uvrg_x, #4
412   vshl.u32 g_uvrg_y, g_uvrg_y, #4
413
414   umull gw_by_l, gw_by_h, ga_by, area_r_s
415   vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
416
417   eor gs_bx, gs_bx, r12
418   vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
419
420   veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
421   eor gs_by, gs_by, r12
422
423   rsb r11, r14, #0                   @ r11 = negative shift for scalar lsr
424   add store_a, psx_gpu, #psx_gpu_uvrg_offset
425
426   sub r11, r11, #(32 - 13)
427
428   add store_b, store_a, #16
429   mov store_inc, #32
430
431   vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
432   vst1.u32 { uvrg_base }, [store_a, :128], store_inc
433
434   vst1.u32 { uvrg_dx1 }, [store_b, :128], store_inc
435   mov g_bx, gw_bx_h, lsr r11
436
437   vst1.u32 { g_uvrg_y }, [store_a, :128], store_inc
438   mov g_by, gw_by_h, lsr r11
439
440   vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l },                     \
441    [store_b, :128], store_inc
442   eor g_bx, g_bx, gs_bx
443
444   vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h },                     \
445    [store_b, :128], store_inc
446   sub g_bx, g_bx, gs_bx
447
448   lsl g_bx, g_bx, #4  
449   eor g_by, g_by, gs_by
450
451   mls b_base, g_bx, x0, b_base
452   sub g_by, g_by, gs_by
453
454   lsl g_by, g_by, #4
455   mov g_bx0, #0
456
457   add g_bx2, g_bx, g_bx
458   add g_bx3, g_bx, g_bx2
459
460   stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
461
462   restore_abi_regs()
463   ldmia sp!, { r4 - r11, pc }
464
465
466 #define psx_gpu                                  r0
467 #define v_a                                      r1
468 #define v_b                                      r2
469 #define v_c                                      r3
470
471 #define temp                                     r14
472
473 #define x_a                                      r4
474 #define x_b                                      r5
475 #define x_c                                      r6
476 #define y_a                                      r1
477 #define y_b                                      r2
478 #define y_c                                      r3
479
480 #define height_minor_a                           r7
481 #define height_minor_b                           r8
482 #define height_major                             r9
483 #define height                                   r9
484
485 #define reciprocal_table_ptr                     r10
486
487 #define edge_alt_low                             r4
488 #define edge_alt_high                            r5
489 #define edge_dx_dy_alt                           r6
490 #define edge_shift_alt                           r10
491
492 #define edge_dx_dy_alt_low                       r4
493 #define edge_dx_dy_alt_high                      r5
494
495 #define span_edge_data                           r4
496 #define span_uvrg_offset                         r5
497 #define span_b_offset                            r6
498
499 #define clip                                     r14
500
501 #define b                                        r11
502 #define b_dy                                     r12
503
504
505 #define alternate_x                              q0
506 #define alternate_dx_dy                          q1
507 #define alternate_x_32                           q2
508
509 #define alternate_x_low                          d0
510 #define alternate_x_high                         d1
511 #define alternate_dx_dy_low                      d2
512 #define alternate_dx_dy_high                     d3
513 #define alternate_x_32_low                       d4
514 #define alternate_x_32_high                      d5
515
516 #define left_x                                   q3
517 #define right_x                                  q4
518 #define left_dx_dy                               q5
519 #define right_dx_dy                              q6
520 #define left_edge                                q7
521 #define right_edge                               q8
522
523 #define left_x_low                               d6
524 #define left_x_high                              d7
525 #define right_x_low                              d8
526 #define right_x_high                             d9
527 #define left_dx_dy_low                           d10
528 #define left_dx_dy_high                          d11
529 #define right_dx_dy_low                          d12
530 #define right_dx_dy_high                         d13
531 #define left_edge_low                            d14
532 #define left_edge_high                           d15
533 #define right_edge_low                           d16
534 #define right_edge_high                          d17
535
536 #define y_mid_point                              d18
537 #define c_0x0004                                 d19
538
539 #define left_right_x_16                          q11
540 #define span_shifts_y                            q12
541 #define c_0x0001                                 q13
542
543 #define span_shifts                              d24
544 #define y_x4                                     d25
545 #define c_0xFFFE                                 d26
546 #define c_0x0007                                 d27
547
548 #define left_right_x_16_low                      d22
549 #define left_right_x_16_high                     d23
550
551 #define uvrg                                     q14
552 #define uvrg_dy                                  q15
553
554 #define alternate_x_16                           d4
555
556 #define v_clip                                   q3
557 #define v_clip_low                               d6
558
559 #define right_x_32                               q10
560 #define left_x_32                                q11
561 #define alternate_select                         d24
562
563 #define right_x_32_low                           d20
564 #define right_x_32_high                          d21
565 #define left_x_32_low                            d22
566 #define left_x_32_high                           d23
567
568 #define edges_xy                                 q0
569 #define edges_dx_dy                              d2
570 #define edge_shifts                              d3
571 #define edge_shifts_64                           q2
572
573 #define edges_xy_left                            d0
574 #define edges_xy_right                           d1
575
576 #define height_reciprocals                       d6
577 #define heights                                  d7
578
579 #define widths                                   d8
580 #define c_0x01                                   d9
581 #define x_starts                                 d10
582 #define x_ends                                   d11
583
584 #define heights_b                                d12
585 #define edges_dx_dy_64                           q10
586
587 #define edges_dx_dy_64_left                      d20
588 #define edges_dx_dy_64_right                     d21
589
590
591 #define setup_spans_prologue()                                                 \
592   stmdb sp!, { r4 - r11, lr };                                                 \
593   save_abi_regs();                                                             \
594                                                                                \
595   ldrsh x_a, [v_a, #8];                                                        \
596   ldrsh x_b, [v_b, #8];                                                        \
597   ldrsh x_c, [v_c, #8];                                                        \
598   ldrsh y_a, [v_a, #10];                                                       \
599   ldrsh y_b, [v_b, #10];                                                       \
600   ldrsh y_c, [v_c, #10];                                                       \
601                                                                                \
602   add temp, psx_gpu, #psx_gpu_uvrg_offset;                                     \
603   vld1.32 { uvrg }, [temp];                                                    \
604   add temp, psx_gpu, #psx_gpu_uvrg_dy_offset;                                  \
605   vld1.32 { uvrg_dy }, [temp];                                                 \
606   ldr reciprocal_table_ptr, [psx_gpu, #psx_gpu_reciprocal_table_ptr_offset];   \
607                                                                                \
608   vmov.u32 c_0x01, #0x01                                                       \
609
610 #define setup_spans_load_b()                                                   \
611   ldr b, [psx_gpu, #psx_gpu_b_offset];                                         \
612   ldr b_dy, [psx_gpu, #psx_gpu_b_dy_offset]                                    \
613
614 #define setup_spans_prologue_b()                                               \
615   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
616   add temp, psx_gpu, #psx_gpu_viewport_start_x_offset;                         \
617                                                                                \
618   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
619   vmov.u16 c_0x0004, #0x0004;                                                  \
620                                                                                \
621   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
622   vmov.u16 c_0x0001, #0x0001;                                                  \
623                                                                                \
624   vld1.u16 { left_edge_low[], left_edge_high[] }, [temp];                      \
625   add temp, psx_gpu, #psx_gpu_viewport_end_x_offset;                           \
626                                                                                \
627   vld1.u16 { right_edge_low[], right_edge_high[] }, [temp];                    \
628   vadd.u16 right_edge, right_edge, c_0x0001;                                   \
629                                                                                \
630   vmov.u16 c_0x0007, #0x0007;                                                  \
631   vmvn.u16 c_0xFFFE, #0x0001                                                   \
632
633
634 #define compute_edge_delta_x2()                                                \
635   ldr temp, [reciprocal_table_ptr, height, lsl #2];                            \
636                                                                                \
637   vdup.u32 heights, height;                                                    \
638   vsub.u32 widths, x_ends, x_starts;                                           \
639                                                                                \
640   vdup.u32 edge_shifts, temp;                                                  \
641   vsub.u32 heights_b, heights, c_0x01;                                         \
642   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
643                                                                                \
644   vmla.s32 heights_b, x_starts, heights;                                       \
645   vbic.u16 edge_shifts, #0xE0;                                                 \
646   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
647   vmull.s32 edges_xy, heights_b, height_reciprocals                            \
648
649 #define width_alt                 r6
650 #define height_reciprocal_alt     r11
651 #define height_b_alt              r12
652
653 #define compute_edge_delta_x3(start_c, height_a, height_b)                     \
654   vmov heights, height_a, height_b;                                            \
655   ldr temp, [reciprocal_table_ptr, height_a, lsl #2];                          \
656   vmov.u32 edge_shifts[0], temp;                                               \
657   ldr temp, [reciprocal_table_ptr, height_b, lsl #2];                          \
658   vmov.u32 edge_shifts[1], temp;                                               \
659   ldr edge_shift_alt, [reciprocal_table_ptr, height_minor_b, lsl #2];          \
660                                                                                \
661   vsub.u32 widths, x_ends, x_starts;                                           \
662   sub width_alt, x_c, start_c;                                                 \
663                                                                                \
664   vsub.u32 heights_b, heights, c_0x01;                                         \
665   sub height_b_alt, height_minor_b, #1;                                        \
666                                                                                \
667   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
668   lsr height_reciprocal_alt, edge_shift_alt, #10;                              \
669                                                                                \
670   vmla.s32 heights_b, x_starts, heights;                                       \
671   mla height_b_alt, height_minor_b, start_c, height_b_alt;                     \
672                                                                                \
673   vbic.u16 edge_shifts, #0xE0;                                                 \
674   and edge_shift_alt, edge_shift_alt, #0x1F;                                   \
675                                                                                \
676   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
677   mul edge_dx_dy_alt, width_alt, height_reciprocal_alt;                        \
678                                                                                \
679   vmull.s32 edges_xy, heights_b, height_reciprocals;                           \
680   smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt       \
681
682
683 #define setup_spans_adjust_y_up()                                              \
684   vsub.u32 y_x4, y_x4, c_0x0004                                                \
685
686 #define setup_spans_adjust_y_down()                                            \
687   vadd.u32 y_x4, y_x4, c_0x0004                                                \
688
689 #define setup_spans_adjust_interpolants_up()                                   \
690   vsub.u32 uvrg, uvrg, uvrg_dy;                                                \
691   sub b, b, b_dy                                                               \
692
693 #define setup_spans_adjust_interpolants_down()                                 \
694   vadd.u32 uvrg, uvrg, uvrg_dy;                                                \
695   add b, b, b_dy                                                               \
696
697
/*
 * Advances the interpolants by clip rows in a single step:
 *   b    += b_dy * clip        (ARM mla)
 *   uvrg += uvrg_dy * v_clip   (NEON vmla.s32)
 * v_clip is only read here; setup_spans_clip fills it from clip beforehand.
 */
698 #define setup_spans_clip_interpolants_increment()                              \
699   mla b, b_dy, clip, b;                                                        \
700   vmla.s32 uvrg, uvrg_dy, v_clip                                               \
701
/*
 * Moves the interpolants back by clip rows in a single step:
 *   b    -= b_dy * clip        (ARM mls)
 *   uvrg -= uvrg_dy * v_clip   (NEON vmls.s32)
 * v_clip is only read here; setup_spans_clip fills it from clip beforehand.
 */
702 #define setup_spans_clip_interpolants_decrement()                              \
703   mls b, b_dy, clip, b;                                                        \
704   vmls.s32 uvrg, uvrg_dy, v_clip                                               \
705
/*
 * Advances the alternate edge by clip rows: the signed product
 * edge_dx_dy_alt * clip is accumulated into the 64-bit value held in
 * edge_alt_high:edge_alt_low (smlal).
 */
706 #define setup_spans_clip_alternate_yes()                                       \
707   smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip                      \
708
/*
 * Variant selected by setup_spans_clip when alternate_active is "no":
 * expands to nothing.
 */
709 #define setup_spans_clip_alternate_no()                                        \
710
/*
 * Skips clip rows at the start of the span list.
 *
 * Parameters (token-pasted into helper macro names):
 *   direction        - increment or decrement: which
 *                      setup_spans_clip_interpolants variant to expand
 *   alternate_active - yes or no: whether the alternate edge held in ARM
 *                      registers is advanced as well
 *
 * Steps: broadcast clip into v_clip, optionally advance the alternate edge,
 * move the interpolants by clip rows, then accumulate the widened products
 * edges_dx_dy * v_clip_low into the 64-bit edge positions in edges_xy.
 */
711 #define setup_spans_clip(direction, alternate_active)                          \
712   vdup.u32 v_clip, clip;                                                       \
713   setup_spans_clip_alternate_##alternate_active();                             \
714   setup_spans_clip_interpolants_##direction();                                 \
715   vmlal.s32 edges_xy, edges_dx_dy, v_clip_low                                  \
716
717
/*
 * Converts the two edges computed in edges_xy / edges_dx_dy into the
 * left_x / right_x stepping state consumed by
 * setup_spans_set_x4_alternate_yes/no.
 *
 * Parameters:
 *   left_index, right_index - suffixes pasted onto edges_xy_ and
 *                             edges_dx_dy_64_ to pick which computed edge
 *                             feeds the left side and which the right side
 *
 * Steps:
 *   - widen edge_shifts and edges_dx_dy to 64-bit lanes;
 *   - shift the edge positions and the widened slopes left by the per-lane
 *     shift counts;
 *   - copy the selected position into the low half of left_x / right_x and
 *     the selected slope into both halves of left_dx_dy / right_dx_dy;
 *   - set the high half of each position to low + slope, then double the
 *     slopes.
 * NOTE(review): this assumes the _low/_high names alias the two 64-bit
 * halves of left_x, right_x, left_dx_dy and right_dx_dy, so each position
 * register ends up holding two consecutive rows and each add of the doubled
 * slope moves both by two rows - confirm against the alias definitions.
 */
718 #define setup_spans_adjust_edges_alternate_no(left_index, right_index)         \
719   vmovl.s32 edge_shifts_64, edge_shifts;                                       \
720   vmovl.s32 edges_dx_dy_64, edges_dx_dy;                                       \
721                                                                                \
722   vshl.s64 edges_xy, edges_xy, edge_shifts_64;                                 \
723   vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64;                     \
724                                                                                \
725   vmov left_x_low, edges_xy_##left_index;                                      \
726   vmov right_x_low, edges_xy_##right_index;                                    \
727                                                                                \
728   vmov left_dx_dy_low, edges_dx_dy_64_##left_index;                            \
729   vmov left_dx_dy_high, edges_dx_dy_64_##left_index;                           \
730   vmov right_dx_dy_low, edges_dx_dy_64_##right_index;                          \
731   vmov right_dx_dy_high, edges_dx_dy_64_##right_index;                         \
732                                                                                \
733   vadd.u64 left_x_high, left_x_low, left_dx_dy_low;                            \
734   vadd.u64 right_x_high, right_x_low, right_dx_dy_low;                         \
735                                                                                \
736   vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy;                                 \
737   vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy                               \
738
739
/*
 * Same as setup_spans_adjust_edges_alternate_no, plus preparation of the
 * alternate edge that is kept in ARM core registers.
 *
 * Parameters:
 *   left_index, right_index - forwarded unchanged to the _no variant
 *
 * Additional steps:
 *   - broadcast y_b into the 16-bit lanes of y_mid_point;
 *   - temp = 32 - edge_shift_alt, used to carry bits across the word
 *     boundary;
 *   - shift the 64-bit pair edge_alt_high:edge_alt_low left by
 *     edge_shift_alt. The orr reads the unshifted low word, which is why
 *     edge_alt_low itself is shifted last;
 *   - build a 64-bit shifted slope from edge_dx_dy_alt: the high word is an
 *     arithmetic right shift by temp (keeps the sign), the low word a left
 *     shift by edge_shift_alt;
 *   - move both into NEON, set alternate_x_high = low + slope and double the
 *     slope, mirroring what the _no variant does for the left/right edges.
 */
740 #define setup_spans_adjust_edges_alternate_yes(left_index, right_index)        \
741   setup_spans_adjust_edges_alternate_no(left_index, right_index);              \
742                                                                                \
743   vdup.u16 y_mid_point, y_b;                                                   \
744   rsb temp, edge_shift_alt, #32;                                               \
745                                                                                \
746   lsl edge_alt_high, edge_alt_high, edge_shift_alt;                            \
747   orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp;                    \
748   lsl edge_alt_low, edge_alt_low, edge_shift_alt;                              \
749   vmov alternate_x_low, edge_alt_low, edge_alt_high;                           \
750                                                                                \
751   asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp;                               \
752   lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt;                      \
753   vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high;           \
754   vmov alternate_dx_dy_high, alternate_dx_dy_low;                              \
755                                                                                \
756   vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low;             \
757   vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy                   \
758
759
/*
 * Builds the per-row mask alternate_select for the "up" direction: a 16-bit
 * lane is all ones where the signed row value in y_x4 is less than
 * y_mid_point.
 */
760 #define setup_spans_y_select_up()                                              \
761   vclt.s16 alternate_select, y_x4, y_mid_point                                 \
762
/*
 * Builds the per-row mask alternate_select for the "down" direction: a
 * 16-bit lane is all ones where the signed row value in y_x4 is greater than
 * y_mid_point.
 */
763 #define setup_spans_y_select_down()                                            \
764   vcgt.s16 alternate_select, y_x4, y_mid_point                                 \
765
766
/*
 * Replaces the left x values with the alternate edge x values for the rows
 * whose bits are set in alternate_select (vbit inserts only where the mask
 * is one).
 */
767 #define setup_spans_alternate_select_left()                                    \
768   vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select               \
769
/*
 * Replaces the right x values with the alternate edge x values for the rows
 * whose bits are set in alternate_select (vbit inserts only where the mask
 * is one).
 */
770 #define setup_spans_alternate_select_right()                                   \
771   vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select              \
772
773
/*
 * Emits one group of four spans while an alternate edge is active.
 *
 * Parameters (token-pasted into helper macro names):
 *   alternate - left or right: the side whose x is replaced by the alternate
 *               edge on the selected rows
 *   direction - up or down: selects the y_select, interpolant-step and
 *               y-step variants
 *
 * Sequence:
 *   - vshrn #32 keeps the upper 32 bits of every 64-bit edge position; the
 *     positions are stepped by their dx_dy between the two narrowing passes
 *     (and once more afterwards for the next group), then vmovn narrows the
 *     results to 16 bits.
 *   - alternate_select is computed from y_x4 against y_mid_point and the
 *     alternate x is merged into the chosen side.
 *   - uvrg and b are stored four times, once per span, with the interpolants
 *     stepped after every store. The stores are interleaved with the edge
 *     arithmetic.
 *   - x values are clamped with a signed max against left_edge and a signed
 *     min against right_edge.
 *   - the high half becomes (right - left + 7); its low three bits select
 *     the shift applied to c_0xFFFE to form span_shifts, and the value
 *     shifted right by 3 remains in the high half.
 *   - vst4.u16 writes the interleaved 16-bit edge records and y_x4 is moved
 *     to the next group of rows.
 * NOTE(review): the stored record presumably matches the edge_data
 * left_x / num_blocks / right_mask / y offsets (0/2/4/6) defined near the
 * top of the file, with span_shifts_y covering span_shifts and y_x4 -
 * confirm against the register alias definitions.
 */
774 #define setup_spans_set_x4_alternate_yes(alternate, direction)                 \
775   vshrn.s64 alternate_x_32_low, alternate_x, #32;                              \
776   vshrn.s64 left_x_32_low, left_x, #32;                                        \
777   vshrn.s64 right_x_32_low, right_x, #32;                                      \
778                                                                                \
779   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
780   vadd.u64 left_x, left_x, left_dx_dy;                                         \
781   vadd.u64 right_x, right_x, right_dx_dy;                                      \
782                                                                                \
783   vshrn.s64 alternate_x_32_high, alternate_x, #32;                             \
784   vshrn.s64 left_x_32_high, left_x, #32;                                       \
785   vshrn.s64 right_x_32_high, right_x, #32;                                     \
786                                                                                \
787   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
788   vadd.u64 left_x, left_x, left_dx_dy;                                         \
789   vadd.u64 right_x, right_x, right_dx_dy;                                      \
790                                                                                \
791   vmovn.u32 alternate_x_16, alternate_x_32;                                    \
792   setup_spans_y_select_##direction();                                          \
793   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
794                                                                                \
795   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
796   setup_spans_alternate_select_##alternate();                                  \
797                                                                                \
798   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
799   str b, [span_b_offset], #4;                                                  \
800   setup_spans_adjust_interpolants_##direction();                               \
801                                                                                \
802   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
803                                                                                \
804   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
805   str b, [span_b_offset], #4;                                                  \
806   setup_spans_adjust_interpolants_##direction();                               \
807                                                                                \
808   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
809                                                                                \
810   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
811   str b, [span_b_offset], #4;                                                  \
812   setup_spans_adjust_interpolants_##direction();                               \
813                                                                                \
814   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
815   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
816   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
817                                                                                \
818   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
819   str b, [span_b_offset], #4;                                                  \
820   setup_spans_adjust_interpolants_##direction();                               \
821                                                                                \
822   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
823   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
824                                                                                \
825   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
826                                                                                \
827   setup_spans_adjust_y_##direction()                                           \
828
829
/*
 * Emits one group of four spans when no alternate edge is active.
 *
 * Parameters:
 *   alternate - accepted for signature parity with the _yes variant; it is
 *               not referenced in this body
 *   direction - up or down: selects the interpolant-step and y-step variants
 *
 * Sequence:
 *   - vshrn #32 keeps the upper 32 bits of the 64-bit left/right positions;
 *     the positions are stepped by their dx_dy between the two narrowing
 *     passes (and once more afterwards for the next group), then vmovn
 *     narrows the results to 16 bits.
 *   - uvrg and b are stored four times, once per span, with the interpolants
 *     stepped after every store. The stores are interleaved with the edge
 *     arithmetic.
 *   - x values are clamped with a signed max against left_edge and a signed
 *     min against right_edge.
 *   - the high half becomes (right - left + 7); its low three bits select
 *     the shift applied to c_0xFFFE to form span_shifts, and the value
 *     shifted right by 3 remains in the high half.
 *   - vst4.u16 writes the interleaved 16-bit edge records and y_x4 is moved
 *     to the next group of rows.
 * NOTE(review): the stored record presumably matches the edge_data
 * left_x / num_blocks / right_mask / y offsets (0/2/4/6) defined near the
 * top of the file - confirm against the register alias definitions.
 */
830 #define setup_spans_set_x4_alternate_no(alternate, direction)                  \
831   vshrn.s64 left_x_32_low, left_x, #32;                                        \
832   vshrn.s64 right_x_32_low, right_x, #32;                                      \
833                                                                                \
834   vadd.u64 left_x, left_x, left_dx_dy;                                         \
835   vadd.u64 right_x, right_x, right_dx_dy;                                      \
836                                                                                \
837   vshrn.s64 left_x_32_high, left_x, #32;                                       \
838   vshrn.s64 right_x_32_high, right_x, #32;                                     \
839                                                                                \
840   vadd.u64 left_x, left_x, left_dx_dy;                                         \
841   vadd.u64 right_x, right_x, right_dx_dy;                                      \
842                                                                                \
843   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
844   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
845                                                                                \
846   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
847   str b, [span_b_offset], #4;                                                  \
848   setup_spans_adjust_interpolants_##direction();                               \
849                                                                                \
850   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
851                                                                                \
852   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
853   str b, [span_b_offset], #4;                                                  \
854   setup_spans_adjust_interpolants_##direction();                               \
855                                                                                \
856   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
857                                                                                \
858   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
859   str b, [span_b_offset], #4;                                                  \
860   setup_spans_adjust_interpolants_##direction();                               \
861                                                                                \
862   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
863   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
864   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
865                                                                                \
866   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
867   str b, [span_b_offset], #4;                                                  \
868   setup_spans_adjust_interpolants_##direction();                               \
869                                                                                \
870   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
871   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
872                                                                                \
873   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
874                                                                                \
875   setup_spans_adjust_y_##direction()                                           \
876
877
/*
 * Scratch register pair (r11/r12) holding the 64-bit product used by
 * setup_spans_alternate_adjust_yes.
 */
878 #define edge_adjust_low           r11
879 #define edge_adjust_high          r12
880
/*
 * Moves the alternate edge position back by height_minor_a rows:
 *   edge_adjust (64-bit, signed) = edge_dx_dy_alt * height_minor_a
 *   edge_alt_high:edge_alt_low  -= edge_adjust
 * The subs/sbc pair propagates the borrow from the low word to the high
 * word, so no flag-setting instruction may be placed between them.
 */
881 #define setup_spans_alternate_adjust_yes()                                     \
882   smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a;     \
883   subs edge_alt_low, edge_alt_low, edge_adjust_low;                            \
884   sbc edge_alt_high, edge_alt_high, edge_adjust_high                           \
885
/*
 * Variant selected when alternate_active is "no": expands to nothing.
 */
886 #define setup_spans_alternate_adjust_no()                                      \
887
888
/*
 * Generates the span list for an edge pair walked downward from y_a.
 *
 * Parameters:
 *   left_index, right_index - forwarded to
 *                             setup_spans_adjust_edges_alternate_yes/no
 *   alternate               - left, right or none: forwarded to
 *                             setup_spans_set_x4_alternate_yes/no
 *   alternate_active        - yes or no: selects the variants that also
 *                             maintain the alternate edge
 *
 * Flow:
 *   - Bottom clip: y_c is replaced by y_c - viewport_end_y; when that is
 *     positive, height becomes height - (y_c - viewport_end_y) + 1.
 *   - Top clip: clip = viewport_start_y - y_a; when positive, height and
 *     y_a are adjusted and setup_spans_clip(increment, ...) skips the
 *     clipped rows.
 *   - Label 0: nothing is emitted when height <= 0 (branch to label 1).
 *   - y_x4 is built from two ARM words as the packed 16-bit values
 *     y_a, y_a + 1, y_a + 2, y_a + 3 (assumes y_a fits in 16 bits).
 *   - height is stored to psx_gpu num_spans.
 *   - Label 2: each iteration emits four spans; "subs #4 / bhi" repeats
 *     while more than four rows remained before the subtraction, so the
 *     final group may write up to three spans beyond height.
 * Both viewport bounds are loaded sign-extended (ldrsh).
 */
889 #define setup_spans_down(left_index, right_index, alternate, alternate_active) \
890   setup_spans_alternate_adjust_##alternate_active();                           \
891   setup_spans_load_b();                                                        \
892                                                                                \
893   ldrsh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset];                       \
894   subs y_c, y_c, temp;                                                         \
895   subgt height, height, y_c;                                                   \
896   addgt height, height, #1;                                                    \
897                                                                                \
898   ldrsh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset];                     \
899   subs clip, temp, y_a;                                                        \
900   ble 0f;                                                                      \
901                                                                                \
902   sub height, height, clip;                                                    \
903   add y_a, y_a, clip;                                                          \
904   setup_spans_clip(increment, alternate_active);                               \
905                                                                                \
906  0:                                                                            \
907   cmp height, #0;                                                              \
908   ble 1f;                                                                      \
909                                                                                \
910   orr temp, y_a, y_a, lsl #16;                                                 \
911   add temp, temp, #(1 << 16);                                                  \
912   add y_a, temp, #2;                                                           \
913   add y_a, y_a, #(2 << 16);                                                    \
914   vmov y_x4, temp, y_a;                                                        \
915                                                                                \
916   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
917    right_index);                                                               \
918   setup_spans_prologue_b();                                                    \
919                                                                                \
920   strh height, [psx_gpu, #psx_gpu_num_spans_offset];                           \
921                                                                                \
922  2:                                                                            \
923   setup_spans_set_x4_alternate_##alternate_active(alternate, down);            \
924   subs height, height, #4;                                                     \
925   bhi 2b;                                                                      \
926                                                                                \
927  1:                                                                            \
928
929
/*
 * Advances the alternate edge by one row: adds edge_dx_dy_alt, sign-extended
 * to 64 bits, to edge_alt_high:edge_alt_low. The "asr #31" operand supplies
 * the sign word and adc picks up the carry from the low-word add.
 */
930 #define setup_spans_alternate_pre_increment_yes()                              \
931   adds edge_alt_low, edge_alt_low, edge_dx_dy_alt;                             \
932   adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31                    \
933
/*
 * Variant selected when alternate_active is "no": expands to nothing.
 */
934 #define setup_spans_alternate_pre_increment_no()                               \
935
936
/*
 * Conditionally drops one row from height. The instruction executes only
 * under the LE condition and sets no flags itself: in setup_spans_up the
 * flags come from "subs temp, temp, y_c" (viewport_start_y - y_c), because
 * the intervening subgt does not update them.
 */
937 #define setup_spans_up_decrement_yes()                                         \
938   suble height, height, #1                                                     \
939
/*
 * Variant selected when alternate_active is "no": expands to nothing.
 */
940 #define setup_spans_up_decrement_no()                                          \
941
942
/*
 * Generates the span list for an edge pair walked upward from y_a.
 *
 * Parameters:
 *   left_index, right_index - forwarded to
 *                             setup_spans_adjust_edges_alternate_yes/no
 *   alternate               - left, right or none: forwarded to
 *                             setup_spans_set_x4_alternate_yes/no
 *   alternate_active        - yes or no: selects the variants that also
 *                             maintain the alternate edge
 *
 * Flow:
 *   - y_a is decremented by one before any clipping.
 *   - Top clip: temp = viewport_start_y - y_c; when positive it is
 *     subtracted from height. With alternate_active = yes, height is
 *     instead reduced by one when temp <= 0 (same flags).
 *   - Bottom clip: clip = y_a - viewport_end_y; when positive, height and
 *     y_a are adjusted and setup_spans_clip(decrement, ...) skips the
 *     clipped rows.
 *   - Label 0: nothing is emitted when height <= 0 (branch to label 1).
 *   - y_x4 is built as the packed 16-bit values y_a, y_a - 1, y_a - 2,
 *     y_a - 3 (assumes y_a fits in 16 bits).
 *   - The edges (vaddw, plus the alternate pre-increment) are advanced by
 *     one dx_dy step, and the interpolants are stepped up once, before the
 *     loop starts.
 *   - height is stored to psx_gpu num_spans.
 *   - Label 2: each iteration emits four spans; "subs #4 / bhi" repeats
 *     while more than four rows remained before the subtraction, so the
 *     final group may write up to three spans beyond height.
 * NOTE(review): the viewport bounds are loaded zero-extended (ldrh) here,
 * whereas setup_spans_down uses ldrsh for the same fields - confirm whether
 * negative viewport bounds can occur.
 */
943 #define setup_spans_up(left_index, right_index, alternate, alternate_active)   \
944   setup_spans_alternate_adjust_##alternate_active();                           \
945   setup_spans_load_b();                                                        \
946   sub y_a, y_a, #1;                                                            \
947                                                                                \
948   ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset];                      \
949   subs temp, temp, y_c;                                                        \
950   subgt height, height, temp;                                                  \
951   setup_spans_up_decrement_##alternate_active();                               \
952                                                                                \
953   ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset];                        \
954   subs clip, y_a, temp;                                                        \
955   ble 0f;                                                                      \
956                                                                                \
957   sub height, height, clip;                                                    \
958   sub y_a, y_a, clip;                                                          \
959   setup_spans_clip(decrement, alternate_active);                               \
960                                                                                \
961  0:                                                                            \
962   cmp height, #0;                                                              \
963   ble 1f;                                                                      \
964                                                                                \
965   orr temp, y_a, y_a, lsl #16;                                                 \
966   sub temp, temp, #(1 << 16);                                                  \
967   sub y_a, temp, #2;                                                           \
968   sub y_a, y_a, #(2 << 16);                                                    \
969   vmov y_x4, temp, y_a;                                                        \
970                                                                                \
971   vaddw.s32 edges_xy, edges_xy, edges_dx_dy;                                   \
972                                                                                \
973   setup_spans_alternate_pre_increment_##alternate_active();                    \
974   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
975    right_index);                                                               \
976   setup_spans_adjust_interpolants_up();                                        \
977   setup_spans_prologue_b();                                                    \
978                                                                                \
979   strh height, [psx_gpu, #psx_gpu_num_spans_offset];                           \
980                                                                                \
981  2:                                                                            \
982   setup_spans_set_x4_alternate_##alternate_active(alternate, up);              \
983   subs height, height, #4;                                                     \
984   bhi 2b;                                                                      \
985                                                                                \
986  1:                                                                            \
987
988
/*
 * Common function exit: expands restore_abi_regs() (currently defined empty
 * near the top of the file, since the vpush/vpop variant sits under "#if 0")
 * and then pops r4-r11 together with the return address straight into pc.
 * The matching push is presumably done by setup_spans_prologue - confirm
 * there.
 */
989 #define setup_spans_epilogue()                                                 \
990   restore_abi_regs();                                                          \
991   ldmia sp!, { r4 - r11, pc }                                                  \
992
993
/*
 * Complete function body for the case where both edge segments are walked
 * upward (y_a -> y_b -> y_c).
 *
 * Parameters:
 *   minor, major - left or right; passed on as
 *                  setup_spans_up(major, minor, minor, yes)
 *
 * Row counts: height_minor_a = y_a - y_b, height_minor_b = y_b - y_c,
 * height = y_a - y_c. x_starts holds x_a in both lanes and x_ends holds
 * (x_c, x_b).
 * NOTE(review): compute_edge_delta_x3 is given height_major although only
 * height is written here; presumably both names alias the same register -
 * confirm against the alias definitions.
 */
994 #define setup_spans_up_up(minor, major)                                        \
995   setup_spans_prologue();                                                      \
996   sub height_minor_a, y_a, y_b;                                                \
997   sub height_minor_b, y_b, y_c;                                                \
998   sub height, y_a, y_c;                                                        \
999                                                                                \
1000   vdup.u32 x_starts, x_a;                                                      \
1001   vmov x_ends, x_c, x_b;                                                       \
1002                                                                                \
1003   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1004   setup_spans_up(major, minor, minor, yes);                                    \
1005   setup_spans_epilogue()                                                       \
1006
/*
 * setup_spans_up_left: the setup_spans_up_up body instantiated with
 * minor = left and major = right.
 */
1007 function(setup_spans_up_left)
1008   setup_spans_up_up(left, right)
1009
/*
 * setup_spans_up_right: the setup_spans_up_up body instantiated with
 * minor = right and major = left.
 */
1010 function(setup_spans_up_right)
1011   setup_spans_up_up(right, left)
1012
/*
 * Complete function body for the case where both edge segments are walked
 * downward (y_a -> y_b -> y_c).
 *
 * Parameters:
 *   minor, major - left or right; passed on as
 *                  setup_spans_down(major, minor, minor, yes)
 *
 * Row counts: height_minor_a = y_b - y_a, height_minor_b = y_c - y_b,
 * height = y_c - y_a. x_starts holds x_a in both lanes and x_ends holds
 * (x_c, x_b).
 * NOTE(review): compute_edge_delta_x3 is given height_major although only
 * height is written here; presumably both names alias the same register -
 * confirm against the alias definitions.
 */
1013 #define setup_spans_down_down(minor, major)                                    \
1014   setup_spans_prologue();                                                      \
1015   sub height_minor_a, y_b, y_a;                                                \
1016   sub height_minor_b, y_c, y_b;                                                \
1017   sub height, y_c, y_a;                                                        \
1018                                                                                \
1019   vdup.u32 x_starts, x_a;                                                      \
1020   vmov x_ends, x_c, x_b;                                                       \
1021                                                                                \
1022   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1023   setup_spans_down(major, minor, minor, yes);                                  \
1024   setup_spans_epilogue()                                                       \
1025
/*
 * setup_spans_down_left: the setup_spans_down_down body instantiated with
 * minor = left and major = right.
 */
1026 function(setup_spans_down_left)
1027   setup_spans_down_down(left, right)
1028
/*
 * setup_spans_down_right: the setup_spans_down_down body instantiated with
 * minor = right and major = left.
 */
1029 function(setup_spans_down_right)
1030   setup_spans_down_down(right, left)
1031
1032
/*
 * Shared tail of setup_spans_up_a / setup_spans_up_b: height = y_a - y_c,
 * two-edge delta computation, an upward span walk with no alternate edge
 * (alternate = none, alternate_active = no), then the common epilogue.
 * The caller must have expanded setup_spans_prologue() and loaded x_starts
 * and x_ends before this point.
 */
1033 #define setup_spans_up_flat()                                                  \
1034   sub height, y_a, y_c;                                                        \
1035                                                                                \
1036   compute_edge_delta_x2();                                                     \
1037   setup_spans_up(left, right, none, no);                                       \
1038   setup_spans_epilogue()                                                       \
1039
/*
 * setup_spans_up_a: upward two-edge case with x_starts = (x_a, x_b) and
 * x_ends = x_c in both lanes, followed by the shared setup_spans_up_flat
 * body.
 */
1040 function(setup_spans_up_a)
1041   setup_spans_prologue()
1042
1043   vmov x_starts, x_a, x_b
1044   vdup.u32 x_ends, x_c
1045
1046   setup_spans_up_flat()
1047
/*
 * setup_spans_up_b: upward two-edge case with x_starts = x_a in both lanes
 * and x_ends = (x_b, x_c), followed by the shared setup_spans_up_flat body.
 */
1048 function(setup_spans_up_b)
1049   setup_spans_prologue()
1050
1051   vdup.u32 x_starts, x_a
1052   vmov x_ends, x_b, x_c
1053
1054   setup_spans_up_flat()
1055
/*
 * Shared tail of setup_spans_down_a / setup_spans_down_b: height = y_c - y_a,
 * two-edge delta computation, a downward span walk with no alternate edge
 * (alternate = none, alternate_active = no), then the common epilogue.
 * The caller must have expanded setup_spans_prologue() and loaded x_starts
 * and x_ends before this point.
 */
1056 #define setup_spans_down_flat()                                                \
1057   sub height, y_c, y_a;                                                        \
1058                                                                                \
1059   compute_edge_delta_x2();                                                     \
1060   setup_spans_down(left, right, none, no);                                     \
1061   setup_spans_epilogue()                                                       \
1062
/*
 * setup_spans_down_a: downward two-edge case with x_starts = (x_a, x_b) and
 * x_ends = x_c in both lanes, followed by the shared setup_spans_down_flat
 * body.
 */
1063 function(setup_spans_down_a)
1064   setup_spans_prologue()
1065
1066   vmov x_starts, x_a, x_b
1067   vdup.u32 x_ends, x_c
1068
1069   setup_spans_down_flat()
1070
/*
 * setup_spans_down_b: downward two-edge case with x_starts = x_a in both
 * lanes and x_ends = (x_b, x_c), followed by the shared
 * setup_spans_down_flat body.
 */
1071 function(setup_spans_down_b)
1072   setup_spans_prologue()
1073
1074   vdup.u32 x_starts, x_a
1075   vmov x_ends, x_b, x_c
1076
1077   setup_spans_down_flat()
1078
1079
/*
 * Register aliases used by setup_spans_up_down.
 *
 * middle_y (r9) keeps the original y_a across the first pass.
 * The "_b" names describe a second edge set that is prepared up front and
 * copied over the working set by setup_spans_up_down_load_edge_set_b:
 *   edges_xy_b (q11) is the pair edges_xy_b_left (d22) / edges_xy_b_right
 *   (d23); edges_dx_dy_and_shifts_b (q13) is the pair edges_dx_dy_b (d26) /
 *   edge_shifts_b (d27).
 * edges_dx_dy_and_shifts (q1) names the working slope/shift pair as one
 * quad register so that it can be overwritten with a single vmov.
 * height_increment (d20) is a per-lane multiplier for the vmlal in
 * setup_spans_up_down.
 */
1080 #define middle_y                                          r9
1081
1082 #define edges_xy_b                                        q11
1083 #define edges_dx_dy_b                                     d26
1084 #define edge_shifts_b                                     d27
1085 #define edges_dx_dy_and_shifts_b                          q13
1086 #define height_increment                                  d20
1087
1088 #define edges_dx_dy_and_shifts                            q1
1089
1090 #define edges_xy_b_left                                   d22
1091 #define edges_xy_b_right                                  d23
1092
/*
 * Replaces the working edge state with the saved second edge set: copies
 * edges_xy_b into edges_xy and edges_dx_dy_and_shifts_b into
 * edges_dx_dy_and_shifts (two quad-register moves).
 */
1093 #define setup_spans_up_down_load_edge_set_b()                                  \
1094   vmov edges_xy, edges_xy_b;                                                   \
1095   vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b                        \
1096
1097
/*
 * setup_spans_up_down: span setup for the case where y_a lies between the
 * other two vertices. Spans are generated in two passes that both start at
 * y_a: first upward toward y_b (height_minor_a rows), then downward toward
 * y_c (height_minor_b rows), using a second edge set ("b") prepared before
 * the first pass.
 *
 * Local labels:
 *   0 - end of a clip block (used once per pass)
 *   2 - four-span emit loop (used once per pass)
 *   3 - taken when the up pass has no rows; still loads edge set b
 *   4 - start of the down pass
 *   5 - special case when the total span count equals MAX_SPANS
 *   1 - common exit
 */
1098 function(setup_spans_up_down)
1099   setup_spans_prologue()
1100
1101   // s32 middle_y = y_a;
       // Row counts: minor_a = rows from y_a up to y_b, minor_b = rows from
       // y_a down to y_c, major = full extent from y_b to y_c.
1102   sub height_minor_a, y_a, y_b
1103   sub height_minor_b, y_c, y_a
1104   sub height_major, y_c, y_b
1105
1106   vmov x_starts, x_a, x_c
1107   vdup.u32 x_ends, x_b
1108
1109   compute_edge_delta_x3(x_a, height_minor_a, height_major)
1110
       // height_increment = (0, height_minor_b): only the second lane of
       // edges_xy is advanced, by height_minor_b steps of its dx_dy.
1111   mov temp, #0
1112   vmov height_increment, temp, height_minor_b
1113   vmlal.s32 edges_xy, edges_dx_dy, height_increment
1114
       // Build edge set b for the down pass: the left position comes from
       // the alternate edge held in ARM registers, the right position is the
       // current right edge.
1115   vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1116   vmov edges_xy_b_right, edges_xy_right
1117
       // Shifts and slopes for set b: copy / negate the working values, then
       // overwrite lane 0 with the alternate edge values.
1118   vmov edge_shifts_b, edge_shifts
1119   vmov.u32 edge_shifts_b[0], edge_shift_alt
1120
1121   vneg.s32 edges_dx_dy_b, edges_dx_dy
1122   vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1123
       // y_a is modified by the up pass; keep the original for the down pass.
1124   mov middle_y, y_a
1125   
1126   setup_spans_load_b()
1127   sub y_a, y_a, #1
1128
       // Up pass, top clip: drop the rows above viewport_start_y.
       // NOTE(review): ldrh zero-extends here while setup_spans_down uses
       // ldrsh for the same fields - confirm negative bounds cannot occur.
1129   ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
1130   subs temp, temp, y_b
1131   subgt height_minor_a, height_minor_a, temp
1132
       // Up pass, bottom clip: skip the rows below viewport_end_y.
1133   ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
1134   subs clip, y_a, temp
1135   ble 0f
1136
1137   sub height_minor_a, height_minor_a, clip
1138   sub y_a, y_a, clip
1139   setup_spans_clip(decrement, no)
1140
1141  0:                                                                
       // No rows left in the up pass: go load edge set b and continue with
       // the down pass.
1142   cmp height_minor_a, #0
1143   ble 3f
1144
       // y_x4 = packed 16-bit rows y_a, y_a - 1, y_a - 2, y_a - 3.
1145   orr temp, y_a, y_a, lsl #16
1146   sub temp, temp, #(1 << 16)
1147   sub y_a, temp, #2
1148   sub y_a, y_a, #(2 << 16)
1149   vmov y_x4, temp, y_a
1150
       // Advance both edges by one dx_dy step before the walk starts.
1151   vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1152
1153   strh height_minor_a, [psx_gpu, #psx_gpu_num_spans_offset]
1154
       // Extract the left/right stepping state from the working edge set;
       // only after that may the working set be replaced with set b.
1155   setup_spans_adjust_edges_alternate_no(left, right); 
1156   setup_spans_adjust_interpolants_up()
1157   setup_spans_up_down_load_edge_set_b()
1158
1159   setup_spans_prologue_b()
1160
1161
1162  2: 
1163   setup_spans_set_x4_alternate_no(none, up)
1164   subs height_minor_a, height_minor_a, #4
1165   bhi 2b
1166
       // height_minor_a is now zero or negative (the loop emits groups of
       // four). Adding it scaled by the per-span record sizes (8, 16 and 4
       // bytes) moves the output pointers back over the surplus spans.
1167   add span_edge_data, span_edge_data, height_minor_a, lsl #3
1168   add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1169   add span_b_offset, span_b_offset, height_minor_a, lsl #2
1170
1171  4:
       // Down pass: reload the starting interpolants from psx_gpu and
       // restart from the original y_a.
1172   add temp, psx_gpu, #psx_gpu_uvrg_offset
1173   vld1.32 { uvrg }, [temp]
1174   mov y_a, middle_y
1175   
1176   setup_spans_load_b()
1177
       // Down pass, bottom clip: when y_c exceeds viewport_end_y the row
       // count becomes height_minor_b - (y_c - viewport_end_y) + 1.
1178   ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
1179   subs y_c, y_c, temp
1180   subgt height_minor_b, height_minor_b, y_c
1181   addgt height_minor_b, height_minor_b, #1
1182
       // Down pass, top clip: skip the rows above viewport_start_y.
1183   ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
1184   subs clip, temp, y_a
1185   ble 0f
1186
1187   sub height_minor_b, height_minor_b, clip
1188   add y_a, y_a, clip
1189   setup_spans_clip(increment, no)
1190
1191  0:
1192   cmp height_minor_b, #0
1193   ble 1f
1194
       // y_x4 = packed 16-bit rows y_a, y_a + 1, y_a + 2, y_a + 3.
1195   orr temp, y_a, y_a, lsl #16
1196   add temp, temp, #(1 << 16) 
1197   add y_a, temp, #2
1198   add y_a, y_a, #(2 << 16)
1199   vmov y_x4, temp, y_a
1200
1201   setup_spans_adjust_edges_alternate_no(left, right)
1202
       // Total span count = spans from the up pass + rows of the down pass.
       // NOTE(review): when the up pass was skipped via label 3, num_spans
       // has not been written by this function before this load; presumably
       // the caller or the prologue zeroes it - confirm.
1203   ldrh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1204   add temp, temp, height_minor_b
1205
1206   cmp temp, #MAX_SPANS
1207   beq 5f
1208
1209   strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1210
1211  2:                                                     
1212   setup_spans_set_x4_alternate_no(none, down)
1213   subs height_minor_b, height_minor_b, #4
1214   bhi 2b
1215
1216  1:
1217   setup_spans_epilogue()
1218
1219  3:
       // Up pass skipped: still switch to edge set b and run the second
       // prologue before entering the down pass.
1220   setup_spans_up_down_load_edge_set_b()
1221   setup_spans_prologue_b()
1222   bal 4b
1223
1224  5:
1225   // FIXME: overflow corner case
       // Total equals MAX_SPANS: round height_minor_b down to a multiple of
       // four and store the correspondingly reduced total. bics sets Z when
       // nothing remains, in which case the emit loop is skipped.
       // NOTE(review): presumably this keeps the group-of-four loop from
       // writing past the span buffers - confirm against their sizes.
1226   sub temp, temp, height_minor_b
1227   bics height_minor_b, #3
1228   add temp, temp, height_minor_b
1229   strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1230   bne 2b
1231   bal 1b
1232
/*
 * Release the span-setup register aliases; the same names are bound to
 * different registers by the definitions that follow.
 */
1233 #undef span_uvrg_offset
1234 #undef span_edge_data
1235 #undef span_b_offset
1236 #undef left_x
1237 #undef b
1238
/*
 * ARM core register aliases for the code that follows the span setup
 * functions. Several names deliberately map to the same register (for
 * example r2 is span_uvrg_offset, uvrg_dx_ptr and color; r8 is left_x,
 * dither_shift, b_dx4 and right_mask; r10 is dither_offset_ptr, dither_row,
 * block_ptr_b and block_span_ptr), so names sharing a register cannot hold
 * live values at the same time.
 */
1239 #define psx_gpu                                  r0
1240 #define num_spans                                r1
1241 #define span_uvrg_offset                         r2
1242 #define span_edge_data                           r3
1243 #define span_b_offset                            r4
1244 #define b_dx                                     r5
1245 #define span_num_blocks                          r6
1246 #define y                                        r7
1247 #define left_x                                   r8
1248 #define b                                        r9
1249 #define dither_offset_ptr                        r10
1250 #define block_ptr_a                              r11
1251 #define fb_ptr                                   r12
1252 #define num_blocks                               r14
1253
1254 #define uvrg_dx_ptr                              r2
1255 #define texture_mask_ptr                         r3
1256 #define dither_shift                             r8
1257 #define dither_row                               r10
1258
1259 #define c_32                                     r7
1260 #define b_dx4                                    r8
1261 #define b_dx8                                    r9
1262 #define block_ptr_b                              r10
1263
1264 #define block_span_ptr                           r10
1265 #define right_mask                               r8
1266
1267 #define color                                    r2
1268 #define color_r                                  r3
1269 #define color_g                                  r4
1270 #define color_b                                  r5
1270 #define color_b                                  r5
1271
/*
 * NEON register aliases for the setup_blocks_* builders.  A q register and
 * its two d halves (qN = d(2N):d(2N+1)) often carry different names here, so
 * several of the aliases below refer to overlapping storage.
 */
1272 #undef uvrg
1273 /* One q register per interpolated component. */
1274 #define u_block                                  q0
1275 #define v_block                                  q1
1276 #define r_block                                  q2
1277 #define g_block                                  q3
1278 #define b_block                                  q4
1279 /* d10-d13 are the halves of uvrg_dx4 (q5) and uvrg_dx8 (q6) defined below. */
1280 #define uv_dx4                                   d10
1281 #define rg_dx4                                   d11
1282 #define uv_dx8                                   d12
1283 #define rg_dx8                                   d13
1284 #define b_whole_8                                d14
1285 #define fb_mask_ptrs                             d15
1286 /* NOTE(review): uv_dx8 / rg_dx8 below repeat the identical definitions above. */
1287 #define uvrg_dx4                                 q5
1288 #define uvrg_dx8                                 q6
1289 #define uv_dx8                                   d12
1290 #define rg_dx8                                   d13
1291 /* q8-q12; their d halves are named in the next group. */
1292 #define u_whole                                  q8
1293 #define v_whole                                  q9
1294 #define r_whole                                  q10
1295 #define g_whole                                  q11
1296 #define b_whole                                  q12
1297 /* Low/high d halves of q8-q12. */
1298 #define u_whole_low                              d16
1299 #define u_whole_high                             d17
1300 #define v_whole_low                              d18
1301 #define v_whole_high                             d19
1302 #define r_whole_low                              d20
1303 #define r_whole_high                             d21
1304 #define g_whole_low                              d22
1305 #define g_whole_high                             d23
1306 #define b_whole_low                              d24
1307 #define b_whole_high                             d25
1308 /* dx4 and dx8 are two names for the same register, q13. */
1309 #define dx4                                      q13
1310 #define dx8                                      q13
1311 /* d26/d27 are the halves of q13; d24/d25 are the halves of b_whole (q12). */
1312 #define u_whole_8                                d26
1313 #define v_whole_8                                d27
1314 #define u_whole_8b                               d24
1315 #define r_whole_8                                d24
1316 #define g_whole_8                                d25
1317 /* uv_whole_8 = d26:d27 (q13); uv_whole_8b shares q14 with dither_offsets. */
1318 #define uv_whole_8                               q13
1319 #define uv_whole_8b                              q14
1320 /* texture_mask_u / texture_mask_v are the halves of texture_mask (q15). */
1321 #define dither_offsets                           q14
1322 #define texture_mask                             q15
1323 #define texture_mask_u                           d30
1324 #define texture_mask_v                           d31
1325 /* d28 is the low half of dither_offsets (q14). */
1326 #define dither_offsets_short                     d28
1327 /* q8-q10 are also named u_whole / v_whole / r_whole above. */
1328 #define v_left_x                                 q8
1329 #define uvrg                                     q9
1330 #define block_span                               q10
1331 /* uv and rg are the low and high halves of uvrg (q9). */
1332 #define uv                                       d18
1333 #define rg                                       d19
1334 /* These overlap u_block (q0), v_block (q1) and dx4/dx8 (q13). */
1335 #define draw_mask                                q1
1336 #define draw_mask_edge                           q13
1337 #define test_mask                                q0
1338 /* uvrg_dx shares q3 with g_block. */
1339 #define uvrg_dx                                  q3
1340 /* colors shares q2 with r_block. */
1341 #define colors                                   q2
1342
/*
 * Nibble swizzle applied to the u/v byte vectors.  Per byte lane:
 *   u_whole_8 <- (v << 4) | (u & 0x0f)
 *   v_whole_8 <- (v & 0xf0) | ((u & texture_mask_u) >> 4)
 * The masked copy of u is parked in u_whole_8b (d24) because the vsli
 * overwrites u_whole_8 before the vsri needs the old value; anything else
 * aliased to d24 is clobbered at the expansion site.
 */
1343 #define setup_blocks_texture_swizzled()                                        \
1344   vand.u8 u_whole_8b, u_whole_8, texture_mask_u;  /* keep masked u in d24    */\
1345   vsli.u8 u_whole_8, v_whole_8, #4;  /* u = (v << 4) | (u & 0x0f)            */\
1346   vsri.u8 v_whole_8, u_whole_8b, #4  /* v = (v & 0xf0) | (d24 >> 4)          */\
1347
/*
 * Unswizzled counterpart of setup_blocks_texture_swizzled(): expands to
 * nothing, so u_whole_8 / v_whole_8 are left exactly as they are.
 */
1348 #define setup_blocks_texture_unswizzled()                                      \
1349
1350
/*
 * setup_blocks_shaded_textured_builder(swizzling)
 *
 * Emits setup_blocks_shaded_textured_dithered_<swizzling>_indirect.  The
 * routine walks the num_spans spans recorded in psx_gpu and, for each span,
 * appends span_num_blocks 64-byte blocks to the block array located at
 * psx_gpu_blocks_offset, flushing that array through
 * flush_render_block_buffer when num_blocks would exceed MAX_BLOCKS.
 *
 * In:   r0 = psx_gpu.  Returns at once when num_spans is zero.
 * Out:  num_blocks is written back to psx_gpu after every span.
 *
 * Block layout as written here: interleaved u/v bytes (16 bytes), r and g
 * bytes (16), b bytes followed by fb_mask_ptrs (16), dither offsets (16).
 *
 * Register aliases overlap heavily (r7, r8, r9, r10, q3, q13 and d24 each
 * carry several names), so the instruction order matters throughout.
 *
 * NOTE(review): b_block, uvrg_dx4, uvrg_dx8, b_whole_8 and fb_mask_ptrs
 * live in q4-q7, while save_abi_regs() is compiled out at the top of this
 * file; presumably callers do not need q4-q7 preserved -- confirm.
 * NOTE(review): after a flush, num_blocks is set to span_num_blocks with no
 * further MAX_BLOCKS check; presumably one span never needs more than
 * MAX_BLOCKS blocks -- confirm.
 */
1351 #define setup_blocks_shaded_textured_builder(swizzling)                        \
1352 .align 3;                                                                      \
1353                                                                                \
1354 function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
1355   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
1356   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1357                                                                                \
1358   vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128];                                   \
1359   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1360   /* Nothing to do for zero spans: return before touching the stack.         */\
1361   cmp num_spans, #0;                                                           \
1362   bxeq lr;                                                                     \
1363   /* Save core registers; derive the 4x and 8x steps from uvrg_dx.           */\
1364   stmdb sp!, { r4 - r11, r14 };                                                \
1365   save_abi_regs();                                                             \
1366   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1367                                                                                \
1368   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
1369   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1370   /* Load two mask bytes and replicate them across texture_mask_u/_v.        */\
1371   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16];     \
1372   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1373   /* r2 and r3 change roles here: span_uvrg_offset / span_edge_data.         */\
1374   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1375   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1376                                                                                \
1377   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
1378   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1379   /* Index the block array by num_blocks (64 bytes per block).               */\
1380   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1381   /* Per-span loop.                                                          */\
1382  0:                                                                            \
1383   vmov.u8 fb_mask_ptrs, #0;                                                    \
1384                                                                                \
1385   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
1386   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1387                                                                                \
1388   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
1389   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
1390   /* A span without blocks only advances the per-span pointers.              */\
1391   cmp span_num_blocks, #0;                                                     \
1392   beq 1f;                                                                      \
1393                                                                                \
1394   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
1395   add num_blocks, span_num_blocks, num_blocks;                                 \
1396   /* Too many blocks for the buffer: flush it first (label 2).               */\
1397   cmp num_blocks, #MAX_BLOCKS;                                                 \
1398   bgt 2f;                                                                      \
1399   /* Set up the first block of this span.                                    */\
1400  3:                                                                            \
1401   ldr b, [span_b_offset];                                                      \
1402   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1403                                                                                \
1404   vdup.u32 v_left_x, left_x;                                                   \
1405   and y, y, #0x3;                                                              \
1406   /* Select dither table entry y & 3; r10 changes role here.                 */\
1407   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
1408   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1409   /* b += b_dx * left_x; afterwards r8 is reused as dither_shift.            */\
1410   mla b, b_dx, left_x, b;                                                      \
1411   and dither_shift, left_x, #0x03;                                             \
1412   /* uvrg_dx is rebuilt per span: q3 is also used as g_block.                */\
1413   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
1414   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1415   /* dither_shift = (left_x & 3) * 8, the rotate amount used below.          */\
1416   mov dither_shift, dither_shift, lsl #3;                                      \
1417   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1418   /* Z from this subs is consumed by the beq 5f (single-block span).         */\
1419   mov c_32, #32;                                                               \
1420   subs span_num_blocks, span_num_blocks, #1;                                   \
1421                                                                                \
1422   mov dither_row, dither_row, ror dither_shift;                                \
1423   mov b_dx4, b_dx, lsl #2;                                                     \
1424                                                                                \
1425   vdup.u32 dither_offsets_short, dither_row;                                   \
1426   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1427   /* dither_offsets = sign-extended dither bytes << 4 (8 halfwords).         */\
1428   vdup.u32 b_block, b;                                                         \
1429   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1430                                                                                \
1431   vdup.u32 u_block, uv[0];                                                     \
1432   mov b_dx8, b_dx, lsl #3;                                                     \
1433                                                                                \
1434   vdup.u32 v_block, uv[1];                                                     \
1435   vdup.u32 r_block, rg[0];                                                     \
1436   vdup.u32 g_block, rg[1];                                                     \
1437   /* Add the five span offset vectors, stored in u, v, r, g, b order.        */\
1438   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1439                                                                                \
1440   vadd.u32 u_block, u_block, block_span;                                       \
1441   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1442                                                                                \
1443   vadd.u32 v_block, v_block, block_span;                                       \
1444   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1445                                                                                \
1446   vadd.u32 r_block, r_block, block_span;                                       \
1447   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1448                                                                                \
1449   vadd.u32 g_block, g_block, block_span;                                       \
1450   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
1451   /* block_ptr_b = block_ptr_a + 16 (r10 changes role once more).            */\
1452   vadd.u32 b_block, b_block, block_span;                                       \
1453   add block_ptr_b, block_ptr_a, #16;                                           \
1454   /* Low halves: the upper 16 bits of every 32-bit lane.                     */\
1455   vshrn.u32 u_whole_low, u_block, #16;                                         \
1456   vshrn.u32 v_whole_low, v_block, #16;                                         \
1457   vshrn.u32 r_whole_low, r_block, #16;                                         \
1458   vshrn.u32 g_whole_low, g_block, #16;                                         \
1459   /* High halves: the upper 16 bits of (lane + dx4).                         */\
1460   vdup.u32 dx4, uv_dx4[0];                                                     \
1461   vshrn.u32 b_whole_low, b_block, #16;                                         \
1462                                                                                \
1463   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1464   vdup.u32 dx4, uv_dx4[1];                                                     \
1465                                                                                \
1466   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1467   vdup.u32 dx4, rg_dx4[0];                                                     \
1468                                                                                \
1469   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1470   vdup.u32 dx4, rg_dx4[1];                                                     \
1471                                                                                \
1472   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1473   vdup.u32 dx4, b_dx4;                                                         \
1474                                                                                \
1475   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1476   vdup.u32 dx8, uv_dx8[0];                                                     \
1477   /* Step every interpolant by dx8 ready for the next block.                 */\
1478   vadd.u32 u_block, u_block, dx8;                                              \
1479   vdup.u32 dx8, uv_dx8[1];                                                     \
1480                                                                                \
1481   vadd.u32 v_block, v_block, dx8;                                              \
1482   vdup.u32 dx8, rg_dx8[0];                                                     \
1483                                                                                \
1484   vadd.u32 r_block, r_block, dx8;                                              \
1485   vdup.u32 dx8, rg_dx8[1];                                                     \
1486                                                                                \
1487   vadd.u32 g_block, g_block, dx8;                                              \
1488   vdup.u32 dx8, b_dx8;                                                         \
1489   /* Narrow to bytes; u/v reuse q13 once the last dx8 is consumed.           */\
1490   vadd.u32 b_block, b_block, dx8;                                              \
1491   vmovn.u16 u_whole_8, u_whole;                                                \
1492                                                                                \
1493   vmovn.u16 v_whole_8, v_whole;                                                \
1494                                                                                \
1495   vmovn.u16 b_whole_8, b_whole;                                                \
1496   pld [fb_ptr];                                                                \
1497   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1498   /* AND u/v with the texture mask, then apply the swizzle variant.          */\
1499   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1500   setup_blocks_texture_##swizzling();                                          \
1501   /* r_whole_8 is d24 (= b_whole_low); b was already narrowed to d14.        */\
1502   vmovn.u16 r_whole_8, r_whole;                                                \
1503   beq 5f;                                                                      \
1504   /* Main loop: store the current block while computing the next one.        */\
1505  4:                                                                            \
1506   vmovn.u16 g_whole_8, g_whole;                                                \
1507   vshrn.u32 u_whole_low, u_block, #16;                                         \
1508   /* Block bytes 0-15: u and v, interleaved by vst2.                         */\
1509   vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32;                 \
1510   vshrn.u32 v_whole_low, v_block, #16;                                         \
1511   /* Block bytes 16-31: r bytes followed by g bytes.                         */\
1512   vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32;                \
1513   vshrn.u32 r_whole_low, r_block, #16;                                         \
1514   /* Block bytes 32-47: b bytes, then fb_mask_ptrs (zero, fb_ptr).           */\
1515   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1516   vshrn.u32 g_whole_low, g_block, #16;                                         \
1517                                                                                \
1518   vdup.u32 dx4, uv_dx4[0];                                                     \
1519   vshrn.u32 b_whole_low, b_block, #16;                                         \
1520                                                                                \
1521   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1522   vdup.u32 dx4, uv_dx4[1];                                                     \
1523                                                                                \
1524   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1525   vdup.u32 dx4, rg_dx4[0];                                                     \
1526                                                                                \
1527   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1528   vdup.u32 dx4, rg_dx4[1];                                                     \
1529                                                                                \
1530   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1531   vdup.u32 dx4, b_dx4;                                                         \
1532                                                                                \
1533   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1534   vdup.u32 dx8, uv_dx8[0];                                                     \
1535                                                                                \
1536   vadd.u32 u_block, u_block, dx8;                                              \
1537   vdup.u32 dx8, uv_dx8[1];                                                     \
1538                                                                                \
1539   vadd.u32 v_block, v_block, dx8;                                              \
1540   vdup.u32 dx8, rg_dx8[0];                                                     \
1541                                                                                \
1542   vadd.u32 r_block, r_block, dx8;                                              \
1543   vdup.u32 dx8, rg_dx8[1];                                                     \
1544                                                                                \
1545   vadd.u32 g_block, g_block, dx8;                                              \
1546   vdup.u32 dx8, b_dx8;                                                         \
1547                                                                                \
1548   vadd.u32 b_block, b_block, dx8;                                              \
1549   vmovn.u16 u_whole_8, u_whole;                                                \
1550   /* fb_ptr moves on by 16 bytes for the next block.                         */\
1551   add fb_ptr, fb_ptr, #16;                                                     \
1552   vmovn.u16 v_whole_8, v_whole;                                                \
1553   /* Block bytes 48-63: dither offsets of the block being stored.            */\
1554   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1555   vmovn.u16 b_whole_8, b_whole;                                                \
1556                                                                                \
1557   pld [fb_ptr];                                                                \
1558                                                                                \
1559   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1560   subs span_num_blocks, span_num_blocks, #1;                                   \
1561                                                                                \
1562   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1563   setup_blocks_texture_##swizzling();                                          \
1564                                                                                \
1565   vmovn.u16 r_whole_8, r_whole;                                                \
1566   bne 4b;                                                                      \
1567   /* Last block of the span: apply right_mask to u/v, then store.            */\
1568  5:                                                                            \
1569   vmovn.u16 g_whole_8, g_whole;                                                \
1570   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
1571   /* test_mask is loaded from offset 0 of the psx_gpu structure.             */\
1572   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
1573   vdup.u8 draw_mask, right_mask;                                               \
1574   /* Lanes where the replicated mask byte hits test_mask lose u/v.           */\
1575   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1576   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1577   vzip.u8 u_whole_8, v_whole_8;                                                \
1578   /* The vzip above interleaves u/v like vst2 does in the main loop.         */\
1579   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1580   vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32;                \
1581   vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32;                          \
1582   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1583   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1584   /* Advance the uvrg, b and edge data pointers to the next span.            */\
1585  1:                                                                            \
1586   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1587   add span_b_offset, span_b_offset, #4;                                        \
1588                                                                                \
1589   add span_edge_data, span_edge_data, #8;                                      \
1590   subs num_spans, num_spans, #1;                                               \
1591                                                                                \
1592   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1593   bne 0b;                                                                      \
1594   /* All spans done: restore registers and return by popping pc.             */\
1595   restore_abi_regs();                                                          \
1596   ldmia sp!, { r4 - r11, pc };                                                 \
1597   /* Buffer full: flush it, then restart this span at block 0.               */\
1598  2:                                                                            \
1599   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1600   vpush { texture_mask };                                                      \
1601   vpush { uvrg_dx4 };                                                          \
1602                                                                                \
1603   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */         \
1604   bl flush_render_block_buffer;                                                \
1605   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 };                              \
1606                                                                                \
1607   vpop { uvrg_dx4 };                                                           \
1608   vpop { texture_mask };                                                       \
1609   /* uvrg_dx8 and fb_mask_ptrs are recomputed rather than saved.             */\
1610   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1611   vmov.u8 fb_mask_ptrs, #0;                                                    \
1612   /* Only the blocks of this span are counted from here on.                  */\
1613   mov num_blocks, span_num_blocks;                                             \
1614   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1615   bal 3b                                                                       \
1616
1617
/*
 * Instantiate the shaded/textured/dithered builder once per texture variant,
 * emitting setup_blocks_shaded_textured_dithered_swizzled_indirect and
 * setup_blocks_shaded_textured_dithered_unswizzled_indirect.
 */
1618 setup_blocks_shaded_textured_builder(swizzled)
1619 setup_blocks_shaded_textured_builder(unswizzled)
1620
1621
1622 #define setup_blocks_unshaded_textured_builder(swizzling)                      \
1623 .align 3;                                                                      \
1624                                                                                \
1625 function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
1626   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
1627   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1628                                                                                \
1629   vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128];                                   \
1630   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1631                                                                                \
1632   cmp num_spans, #0;                                                           \
1633   bxeq lr;                                                                     \
1634                                                                                \
1635   stmdb sp!, { r4 - r11, r14 };                                                \
1636   save_abi_regs();                                                             \
1637   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1638                                                                                \
1639   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1640                                                                                \
1641   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16];     \
1642   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1643                                                                                \
1644   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1645   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1646                                                                                \
1647   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1648                                                                                \
1649   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1650                                                                                \
1651  0:                                                                            \
1652   vmov.u8 fb_mask_ptrs, #0;                                                    \
1653                                                                                \
1654   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
1655   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1656                                                                                \
1657   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
1658   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
1659                                                                                \
1660   cmp span_num_blocks, #0;                                                     \
1661   beq 1f;                                                                      \
1662                                                                                \
1663   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
1664   add num_blocks, span_num_blocks, num_blocks;                                 \
1665                                                                                \
1666   cmp num_blocks, #MAX_BLOCKS;                                                 \
1667   bgt 2f;                                                                      \
1668                                                                                \
1669  3:                                                                            \
1670   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1671                                                                                \
1672   vdup.u32 v_left_x, left_x;                                                   \
1673   and y, y, #0x3;                                                              \
1674                                                                                \
1675   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
1676   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1677                                                                                \
1678   and dither_shift, left_x, #0x03;                                             \
1679                                                                                \
1680   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
1681   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1682                                                                                \
1683   mov dither_shift, dither_shift, lsl #3;                                      \
1684   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1685                                                                                \
1686   mov c_32, #32;                                                               \
1687   subs span_num_blocks, span_num_blocks, #1;                                   \
1688                                                                                \
1689   mov dither_row, dither_row, ror dither_shift;                                \
1690                                                                                \
1691   vdup.u32 dither_offsets_short, dither_row;                                   \
1692   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1693                                                                                \
1694   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1695                                                                                \
1696   vdup.u32 u_block, uv[0];                                                     \
1697                                                                                \
1698   vdup.u32 v_block, uv[1];                                                     \
1699   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1700                                                                                \
1701   vadd.u32 u_block, u_block, block_span;                                       \
1702   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1703                                                                                \
1704   vadd.u32 v_block, v_block, block_span;                                       \
1705   add block_ptr_b, block_ptr_a, #16;                                           \
1706                                                                                \
1707   vshrn.u32 u_whole_low, u_block, #16;                                         \
1708   vshrn.u32 v_whole_low, v_block, #16;                                         \
1709                                                                                \
1710   vdup.u32 dx4, uv_dx4[0];                                                     \
1711                                                                                \
1712   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1713   vdup.u32 dx4, uv_dx4[1];                                                     \
1714                                                                                \
1715   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1716   vdup.u32 dx8, uv_dx8[0];                                                     \
1717                                                                                \
1718   vadd.u32 u_block, u_block, dx8;                                              \
1719   vdup.u32 dx8, uv_dx8[1];                                                     \
1720                                                                                \
1721   vadd.u32 v_block, v_block, dx8;                                              \
1722   vmovn.u16 u_whole_8, u_whole;                                                \
1723                                                                                \
1724   vmovn.u16 v_whole_8, v_whole;                                                \
1725                                                                                \
1726   pld [fb_ptr];                                                                \
1727   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1728                                                                                \
1729   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1730   setup_blocks_texture_##swizzling();                                          \
1731                                                                                \
1732   beq 5f;                                                                      \
1733                                                                                \
1734  4:                                                                            \
1735   vshrn.u32 u_whole_low, u_block, #16;                                         \
1736                                                                                \
1737   vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32;                 \
1738   vshrn.u32 v_whole_low, v_block, #16;                                         \
1739                                                                                \
1740   add block_ptr_b, block_ptr_b, #32;                                           \
1741   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1742                                                                                \
1743   vdup.u32 dx4, uv_dx4[0];                                                     \
1744   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1745   vdup.u32 dx4, uv_dx4[1];                                                     \
1746                                                                                \
1747   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1748   vdup.u32 dx8, uv_dx8[0];                                                     \
1749                                                                                \
1750   vadd.u32 u_block, u_block, dx8;                                              \
1751   vdup.u32 dx8, uv_dx8[1];                                                     \
1752                                                                                \
1753   vadd.u32 v_block, v_block, dx8;                                              \
1754   vmovn.u16 u_whole_8, u_whole;                                                \
1755                                                                                \
1756   add fb_ptr, fb_ptr, #16;                                                     \
1757   vmovn.u16 v_whole_8, v_whole;                                                \
1758                                                                                \
1759   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1760   pld [fb_ptr];                                                                \
1761                                                                                \
1762   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1763   subs span_num_blocks, span_num_blocks, #1;                                   \
1764                                                                                \
1765   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1766   setup_blocks_texture_##swizzling();                                          \
1767                                                                                \
1768   bne 4b;                                                                      \
1769                                                                                \
1770  5:                                                                            \
1771   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
1772                                                                                \
1773   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
1774   vdup.u8 draw_mask, right_mask;                                               \
1775                                                                                \
1776   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1777   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1778   vzip.u8 u_whole_8, v_whole_8;                                                \
1779                                                                                \
1780   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1781   add block_ptr_b, block_ptr_b, #32;                                           \
1782   vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32;                          \
1783   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1784   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1785                                                                                \
1786  1:                                                                            \
1787   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1788   add span_edge_data, span_edge_data, #8;                                      \
1789   subs num_spans, num_spans, #1;                                               \
1790                                                                                \
1791   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1792   bne 0b;                                                                      \
1793                                                                                \
1794   restore_abi_regs();                                                          \
1795   ldmia sp!, { r4 - r11, pc };                                                 \
1796                                                                                \
1797  2:                                                                            \
1798   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1799   vpush { texture_mask };                                                      \
1800   vpush { uvrg_dx4 };                                                          \
1801                                                                                \
1802   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */         \
1803   bl flush_render_block_buffer;                                                \
1804   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 };                              \
1805                                                                                \
1806   vpop { uvrg_dx4 };                                                           \
1807   vpop { texture_mask };                                                       \
1808                                                                                \
1809   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1810   vmov.u8 fb_mask_ptrs, #0;                                                    \
1811                                                                                \
1812   mov num_blocks, span_num_blocks;                                             \
1813   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1814   bal 3b                                                                       \
1815
1816
/*
 * Expand the unshaded textured block setup builder once per texture
 * layout. The argument is token-pasted inside the builder to select
 * setup_blocks_texture_swizzled() or setup_blocks_texture_unswizzled().
 */
1817 setup_blocks_unshaded_textured_builder(swizzled)
1818 setup_blocks_unshaded_textured_builder(unswizzled)
1819
1820
1821 .align 3
1822
/*
 * Queue flat-colored blocks for every span into the block buffer that
 * starts at psx_gpu + psx_gpu_blocks_offset.
 *
 * In:  psx_gpu = base of the GPU state structure.
 *
 * Reads num_spans, num_blocks, triangle_color, vram_out_ptr and the
 * span_edge_data array from that structure, plus the 128-bit test mask
 * stored at its very start. The running num_blocks is written back after
 * every span.
 *
 * Layout of one queued block, as produced by the stores below (both store
 * pointers advance by 64 bytes per block):
 *   +0   draw mask
 *   +16  colors
 *   +32  b_whole_8 / fb_mask_ptrs pair, lane 1 of fb_mask_ptrs holding the
 *        framebuffer address of the block
 *
 * If adding a span would take num_blocks past MAX_BLOCKS, the buffer is
 * first handed to flush_render_block_buffer (label 2) and filling restarts
 * at the beginning of the block buffer.
 *
 * Numeric labels: 0 = per-span loop, 3 = span setup, 4 = all blocks but the
 * last, 5 = last block of the span, 1 = advance to next span, 2 = flush.
 */
1823 function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1824   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
  /* draw_mask = 0, used as the mask of every block except the last one */
1825   veor.u32 draw_mask, draw_mask, draw_mask
1826
  /* Nothing to do without spans. Return before touching the stack. */
1827   cmp num_spans, #0
1828   bxeq lr
1829
1830   stmdb sp!, { r4 - r11, r14 }
1831   save_abi_regs()
  /* test_mask is the 128-bit vector stored at offset 0 of psx_gpu */
1832   vld1.u32 { test_mask }, [psx_gpu, :128]
1833
1834   ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
1835
  /*
   * Take the upper five bits of each of the three low bytes of the color
   * word (bits 3-7, 11-15 and 19-23) and pack them as
   * r | g << 5 | b << 10.
   */
1836   ubfx color_r, color, #3, #5
1837   ubfx color_g, color, #11, #5
1838   ubfx color_b, color, #19, #5
1839
1840   orr color, color_r, color_b, lsl #10
1841   orr color, color, color_g, lsl #5
1842
  /* Replicate the packed 16-bit color into every 16-bit lane */
1843   vdup.u16 colors, color
1844
1845   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
1846   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1847
  /* block_ptr_a = first free block: blocks base + num_blocks * 64 */
1848   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1849   add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1850
  /* Per-span loop */
1851  0:
1852   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1853   ldrh y, [span_edge_data, #edge_data_y_offset]
1854
1855   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
1856
  /* Spans with zero blocks are skipped entirely */
1857   cmp span_num_blocks, #0
1858   beq 1f
1859
1860   ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
1861   add num_blocks, span_num_blocks, num_blocks
1862
  /* Flush first if this span does not fit (signed greater-than test) */
1863   cmp num_blocks, #MAX_BLOCKS
1864   bgt 2f
1865
  /* Span setup. Also the re-entry point after a flush. */
1866  3:
  /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2 (byte offsets) */
1867   add fb_ptr, fb_ptr, y, lsl #11
  /*
   * NOTE(review): y is not read again in this routine before it is
   * reloaded at label 0, so this masking looks like a leftover from the
   * dithered variants. Confirm before removing.
   */
1868   and y, y, #0x3
1869
1870   add fb_ptr, fb_ptr, left_x, lsl #1
  /* c_32 is the post-increment stride for the block stores */
1871   mov c_32, #32
1872
  /*
   * Count down to the last block. The flags set here are consumed by the
   * beq below. None of the instructions in between update flags.
   */
1873   subs span_num_blocks, span_num_blocks, #1
1874
1875   add block_ptr_b, block_ptr_a, #16
1876   pld [fb_ptr]
1877
1878   vmov.u32 fb_mask_ptrs[1], fb_ptr
  /* A single-block span goes straight to the right-edge handling */
1879   beq 5f
1880
  /* Every block except the last one: zero draw mask */
1881  4:
1882   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_32
1883   vst1.u32 { colors }, [block_ptr_b, :128], c_32
  /*
   * NOTE(review): b_whole_8 is never written in this routine. It is
   * presumably only there to fill the other half of the 128-bit store.
   * Confirm against the block consumer.
   */
1884   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
1885
  /* Next block starts 16 bytes further into the framebuffer row */
1886   add fb_ptr, fb_ptr, #16
  /* Together with the post-increment this moves block_ptr_b by 64 */
1887   add block_ptr_b, block_ptr_b, #32
1888
1889   pld [fb_ptr]
1890
1891   vmov.u32 fb_mask_ptrs[1], fb_ptr
1892   subs span_num_blocks, span_num_blocks, #1
1893
1894   bne 4b
1895
  /* Last block of the span: build its draw mask from right_mask */
1896  5:
1897   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]
1898
  /*
   * Broadcast the low byte of right_mask to every byte lane, then set each
   * 16-bit lane to all ones where it shares a set bit with test_mask and
   * to zero elsewhere.
   */
1899   vdup.u8 draw_mask_edge, right_mask
1900   vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1901
1902   vst1.u32 { colors }, [block_ptr_b, :128], c_32
1903   vst1.u32 { draw_mask_edge }, [block_ptr_a, :128], c_32
1904   add block_ptr_b, block_ptr_b, #32
1905   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
1906
  /* Advance to the next span. Each span_edge_data entry is 8 bytes. */
1907  1:
1908   add span_edge_data, span_edge_data, #8
1909   subs num_spans, num_spans, #1
1910
  /* strh leaves the flags from the subs above intact for the bne */
1911   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
1912   bne 0b
1913
1914   restore_abi_regs()
1915   ldmia sp!, { r4 - r11, pc }
1916                                                                            
  /*
   * Flush path: the block buffer cannot take this span. Preserve colors
   * and the listed core registers around the call, then rebuild the
   * vector state this routine needs and restart at the buffer base.
   */
1917  2:
1918   vpush { colors }
1919
1920   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
1921   bl flush_render_block_buffer
1922   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
1923
1924   vpop { colors }
1925
  /*
   * test_mask and draw_mask are reloaded and re-zeroed here rather than
   * saved, presumably because the callee may overwrite them. Confirm.
   */
1926   vld1.u32 { test_mask }, [psx_gpu, :128]
1927   veor.u32 draw_mask, draw_mask, draw_mask
1928
  /* After the flush only the blocks of the current span are pending */
1929   mov num_blocks, span_num_blocks
1930   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1931   bal 3b
1932
1933
/*
 * Additional register aliases.
 * mask_msb_scalar lives in r14 (lr), so it may only be used after the
 * prologue has pushed r14.
 * msb_mask_low and msb_mask_high are the two 64-bit halves (d30, d31) of
 * msb_mask (q15). pixels_low (d16) is the low half of q8.
 */
1934 #define mask_msb_scalar                                   r14
1935
1936 #define msb_mask                                          q15
1937
1938 #define pixels_low                                        d16
1939
1940 #define msb_mask_low                                      d30
1941 #define msb_mask_high                                     d31
1942
1943
1944 .align 3
1945
/*
 * Fill every span with a flat color by storing straight to the framebuffer
 * at vram_out_ptr, without going through the block buffer.
 *
 * In:  psx_gpu = base of the GPU state structure.
 *
 * Reads num_spans, triangle_color, mask_msb, vram_out_ptr and the
 * span_edge_data array from that structure. num_blocks is neither read nor
 * written here.
 *
 * For each span with a non-zero block count, every block but the last is
 * written as one full colors vector. The last block is written in full
 * when right_mask is zero. Otherwise a run of pixels is written whose
 * length is decided by testing the low bits of right_mask.
 *
 * Numeric labels: 0 = per-span loop, 2 = full blocks, 3 = last block,
 * 5 = last block with an empty right mask, 1 = advance to next span.
 */
1946 function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1947   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
1948
  /* Nothing to do without spans. Return before touching the stack. */
1949   cmp num_spans, #0
1950   bxeq lr
1951
  /* r14 is saved here, which frees it for use as mask_msb_scalar */
1952   stmdb sp!, { r4 - r11, r14 }
1953
1954   ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
1955
  /*
   * Take the upper five bits of each of the three low bytes of the color
   * word (bits 3-7, 11-15 and 19-23) and pack them as
   * r | g << 5 | b << 10, then OR in the mask_msb value from psx_gpu.
   */
1956   ubfx color_r, color, #3, #5
1957   ubfx color_g, color, #11, #5
1958
1959   ldrh mask_msb_scalar, [psx_gpu, #psx_gpu_mask_msb_offset]
1960   ubfx color_b, color, #19, #5
1961
1962   orr color, color_r, color_b, lsl #10
1963   orr color, color, color_g, lsl #5
1964   orr color, color, mask_msb_scalar
1965
  /* Vector copy of the pixel for the full-block stores */
1966   vdup.u16 colors, color
1967
1968   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
  /*
   * Scalar copy: the same 16-bit pixel in both halves of the word, so the
   * word stores in the tail write two pixels at a time.
   */
1969   orr color, color, color, lsl #16
1970
1971
  /* Per-span loop */
1972  0:
1973   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1974   ldrh y, [span_edge_data, #edge_data_y_offset]
1975
1976   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
1977
  /* Spans with zero blocks are skipped entirely */
1978   cmp span_num_blocks, #0
1979   beq 1f
1980
1981   ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
1982
  /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2 (byte offsets) */
1983   add fb_ptr, fb_ptr, y, lsl #11
  /* Flags from this subs feed the beq below. The add keeps them. */
1984   subs span_num_blocks, span_num_blocks, #1
1985
1986   add fb_ptr, fb_ptr, left_x, lsl #1
  /* A single-block span has no full blocks before its last one */
1987   beq 3f
1988
  /* Every block except the last: store colors, fb_ptr written back */
1989  2:
1990   vst1.u32 { colors }, [fb_ptr]!
1991   subs span_num_blocks, span_num_blocks, #1
1992
1993   bne 2b
1994
  /* Last block of the span. Only the low byte of right_mask is read. */
1995  3:
1996   ldrb right_mask, [span_edge_data, #edge_data_right_mask_offset]
1997
  /* Empty mask: the whole last block is drawn */
1998   cmp right_mask, #0x0
1999   beq 5f
2000
  /*
   * Low four mask bits all clear: write four pixels as two words and
   * discard those four bits. All three conditional instructions use the
   * flags from the same tst because the plain mov does not update them.
   */
2001   tst right_mask, #0xF
2002   streq color, [fb_ptr], #4
2003   moveq right_mask, right_mask, lsr #4
2004   streq color, [fb_ptr], #4
2005
  /* Next two mask bits clear: write two pixels as one word */
2006   tst right_mask, #0x3
2007   streq color, [fb_ptr], #4
2008   moveq right_mask, right_mask, lsr #2
2009
  /* Next mask bit clear: write one more pixel as a halfword */
2010   tst right_mask, #0x1
2011   strheq color, [fb_ptr]
2012
  /* Advance to the next span. Each span_edge_data entry is 8 bytes. */
2013  1:
2014   add span_edge_data, span_edge_data, #8
2015   subs num_spans, num_spans, #1
2016   bne 0b
2017
2018   ldmia sp!, { r4 - r11, pc }
2019                                                                            
  /* Unmasked last block: one full vector store, no writeback needed */
2020  5:
2021   vst1.u32 { colors }, [fb_ptr]
2022   bal 1b
2023
2024
/*
 * Register aliases for the shaded untextured block setup builders that
 * follow. Earlier meanings of these names are dropped and rebound.
 *
 * Several aliases deliberately share registers, so their live ranges must
 * not overlap in the code that uses them:
 *   dx4, dx8 and test_mask are all q10
 *   v_left_x (d6) is the low half of r_whole (q3)
 *   uvrg (q4) is g_whole, and rg (d9) is the high half of that register
 *   block_span (q5) is b_whole
 *   rg_dx (d0) is the low half of r_block (q0)
 */
2025 #undef c_64
2026
/* Core register aliases */
2027 #define c_64                                              r7
2028 #define rg_dx_ptr                                         r2
2029
2030
2031 #undef r_block
2032 #undef g_block
2033 #undef b_block
2034 #undef r_whole
2035 #undef g_whole
2036 #undef b_whole
2037 #undef r_whole_low
2038 #undef r_whole_high
2039 #undef g_whole_low
2040 #undef g_whole_high
2041 #undef b_whole_low
2042 #undef b_whole_high
2043 #undef r_whole_8
2044 #undef g_whole_8
2045 #undef b_whole_8
2046 #undef dither_offsets
2047 #undef rg_dx4
2048 #undef rg_dx8
2049 #undef dx4
2050 #undef dx8
2051 #undef v_left_x
2052 #undef uvrg
2053 #undef block_span
2054 #undef rg
2055 #undef draw_mask
2056 #undef test_mask
2057
/* Per-channel 32-bit accumulators, one quad register each */
2058 #define r_block                                           q0
2059 #define g_block                                           q1
2060 #define b_block                                           q2
2061
/* Narrowed per-channel values, filled as low and high halves */
2062 #define r_whole                                           q3
2063 #define g_whole                                           q4
2064 #define b_whole                                           q5
2065
2066 #define r_whole_low                                       d6
2067 #define r_whole_high                                      d7
2068 #define g_whole_low                                       d8
2069 #define g_whole_high                                      d9
2070 #define b_whole_low                                       d10
2071 #define b_whole_high                                      d11
2072
/* gb_whole_8 (q6) is the g_whole_8 (d12) and b_whole_8 (d13) pair */
2073 #define gb_whole_8                                        q6
2074
2075 #define g_whole_8                                         d12
2076 #define b_whole_8                                         d13
2077
2078 #define r_whole_8                                         d14
2079
2080 #define pixels                                            q8
2081
/* rg_dx4 and rg_dx8 are the two halves of q9 */
2082 #define rg_dx4                                            d18
2083 #define rg_dx8                                            d19
2084
/* dx4 and dx8 share q10, so only one of them is live at a time */
2085 #define dx4                                               q10
2086 #define dx8                                               q10
2087
2088 #define v_left_x                                          d6
2089 #define uvrg                                              q4
2090 #define block_span                                        q5
2091
2092 #define rg                                                d9
2093
/* Byte constants. d64_1 and d64_128 are the two halves of q11. */
2094 #define d64_1                                             d22
2095 #define d64_128                                           d23
2096
2097 #define d128_4                                            q12
2098 #define d128_0x7                                          q13
2099
/* d64_4 is the low half of d128_4 */
2100 #define d64_4                                             d24
2101
2102 #define dither_offsets                                    q14
2103 #define draw_mask                                         q15
2104
/* dither_offsets_low is the low half of dither_offsets */
2105 #define dither_offsets_low                                d28
2106
2107 #define rg_dx                                             d0
2108 #define test_mask                                         q10
2109
2110
/*
 * Dither stage a, dithered variant: add the dither offsets to the 8-bit
 * r, g and b values with unsigned saturation.
 */
2111 #define setup_blocks_shaded_untextured_dither_a_dithered()                     \
2112   vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low;                           \
2113   vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets;                             \
2114
/*
 * Dither stage b, dithered variant: subtract the d64_4 and d128_4 constants
 * from the 8-bit r, g and b values with unsigned saturation. The builder
 * adds d128_4 to dither_offsets before stage a, presumably so that signed
 * table offsets can be applied with unsigned saturating arithmetic.
 * TODO confirm against the dither table contents.
 * The last statement has no trailing separator. The use site supplies it.
 */
2115 #define setup_blocks_shaded_untextured_dither_b_dithered()                     \
2116   vqsub.u8 r_whole_8, r_whole_8, d64_4;                                        \
2117   vqsub.u8 gb_whole_8, gb_whole_8, d128_4                                      \
2118
/* Undithered variants: both stages expand to nothing */
2119 #define setup_blocks_shaded_untextured_dither_a_undithered()                   \
2120
2121 #define setup_blocks_shaded_untextured_dither_b_undithered()                   \
2122
2123
2124 #define setup_blocks_shaded_untextured_indirect_builder(dithering)             \
2125 .align 3;                                                                      \
2126                                                                                \
2127 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
2128   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
2129   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2130                                                                                \
2131   vld1.u32 { rg_dx }, [rg_dx_ptr, :64];                                        \
2132                                                                                \
2133   cmp num_spans, #0;                                                           \
2134   bxeq lr;                                                                     \
2135                                                                                \
2136   stmdb sp!, { r4 - r11, r14 };                                                \
2137   save_abi_regs();                                                             \
2138   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2139                                                                                \
2140   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
2141   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2142                                                                                \
2143   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2144                                                                                \
2145   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
2146   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2147                                                                                \
2148   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2149   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2150                                                                                \
2151   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
2152   vmov.u8 d64_1, #1;                                                           \
2153                                                                                \
2154   vmov.u8 d128_4, #4;                                                          \
2155   vmov.u8 d64_128, #128;                                                       \
2156                                                                                \
2157   vmov.u8 d128_0x7, #0x7;                                                      \
2158                                                                                \
2159  0:                                                                            \
2160   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
2161   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2162                                                                                \
2163   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
2164   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
2165                                                                                \
2166   cmp span_num_blocks, #0;                                                     \
2167   beq 1f;                                                                      \
2168                                                                                \
2169   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
2170   add num_blocks, span_num_blocks, num_blocks;                                 \
2171                                                                                \
2172   cmp num_blocks, #MAX_BLOCKS;                                                 \
2173   bgt 2f;                                                                      \
2174                                                                                \
2175  3:                                                                            \
2176   ldr b, [span_b_offset];                                                      \
2177   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2178                                                                                \
2179   vdup.u32 v_left_x, left_x;                                                   \
2180   and y, y, #0x3;                                                              \
2181                                                                                \
2182   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
2183   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2184                                                                                \
2185   mla b, b_dx, left_x, b;                                                      \
2186   and dither_shift, left_x, #0x03;                                             \
2187                                                                                \
2188   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
2189   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2190                                                                                \
2191   mov dither_shift, dither_shift, lsl #3;                                      \
2192   vmla.u32 rg, rg_dx, v_left_x;                                                \
2193                                                                                \
2194   mov c_64, #64;                                                               \
2195   subs span_num_blocks, span_num_blocks, #1;                                   \
2196                                                                                \
2197   mov dither_row, dither_row, ror dither_shift;                                \
2198   mov b_dx4, b_dx, lsl #2;                                                     \
2199                                                                                \
2200   vdup.u32 dither_offsets, dither_row;                                         \
2201   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2202                                                                                \
2203   vdup.u32 b_block, b;                                                         \
2204   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2205                                                                                \
2206   mov b_dx8, b_dx, lsl #3;                                                     \
2207   vdup.u32 r_block, rg[0];                                                     \
2208   vdup.u32 g_block, rg[1];                                                     \
2209                                                                                \
2210   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2211                                                                                \
2212   vadd.u32 r_block, r_block, block_span;                                       \
2213   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2214                                                                                \
2215   vadd.u32 g_block, g_block, block_span;                                       \
2216   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
2217                                                                                \
2218   vadd.u32 b_block, b_block, block_span;                                       \
2219   add block_ptr_b, block_ptr_a, #16;                                           \
2220                                                                                \
2221   vshrn.u32 r_whole_low, r_block, #16;                                         \
2222   vshrn.u32 g_whole_low, g_block, #16;                                         \
2223   vshrn.u32 b_whole_low, b_block, #16;                                         \
2224   vdup.u32 dx4, rg_dx4[0];                                                     \
2225                                                                                \
2226   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2227   vdup.u32 dx4, rg_dx4[1];                                                     \
2228                                                                                \
2229   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2230   vdup.u32 dx4, b_dx4;                                                         \
2231                                                                                \
2232   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2233   vdup.u32 dx8, rg_dx8[0];                                                     \
2234                                                                                \
2235   vadd.u32 r_block, r_block, dx8;                                              \
2236   vdup.u32 dx8, rg_dx8[1];                                                     \
2237                                                                                \
2238   vadd.u32 g_block, g_block, dx8;                                              \
2239   vdup.u32 dx8, b_dx8;                                                         \
2240                                                                                \
2241   vadd.u32 b_block, b_block, dx8;                                              \
2242                                                                                \
2243   vmovn.u16 r_whole_8, r_whole;                                                \
2244   vmovn.u16 g_whole_8, g_whole;                                                \
2245   vmovn.u16 b_whole_8, b_whole;                                                \
2246                                                                                \
2247   beq 5f;                                                                      \
2248   veor.u32 draw_mask, draw_mask, draw_mask;                                    \
2249                                                                                \
2250  4:                                                                            \
2251   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2252   vshrn.u32 r_whole_low, r_block, #16;                                         \
2253                                                                                \
2254   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2255   vshrn.u32 g_whole_low, g_block, #16;                                         \
2256                                                                                \
2257   vshrn.u32 b_whole_low, b_block, #16;                                         \
2258   str fb_ptr, [block_ptr_a, #44];                                              \
2259                                                                                \
2260   vdup.u32 dx4, rg_dx4[0];                                                     \
2261   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2262   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2263                                                                                \
2264   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2265   vdup.u32 dx4, rg_dx4[1];                                                     \
2266                                                                                \
2267   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2268   vdup.u32 dx4, b_dx4;                                                         \
2269                                                                                \
2270   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2271   vdup.u32 dx8, rg_dx8[0];                                                     \
2272                                                                                \
2273   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2274   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2275   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2276                                                                                \
2277   vadd.u32 r_block, r_block, dx8;                                              \
2278   vdup.u32 dx8, rg_dx8[1];                                                     \
2279                                                                                \
2280   vadd.u32 g_block, g_block, dx8;                                              \
2281   vdup.u32 dx8, b_dx8;                                                         \
2282                                                                                \
2283   vadd.u32 b_block, b_block, dx8;                                              \
2284   add fb_ptr, fb_ptr, #16;                                                     \
2285                                                                                \
2286   vmovn.u16 r_whole_8, r_whole;                                                \
2287   vmovn.u16 g_whole_8, g_whole;                                                \
2288   vmovn.u16 b_whole_8, b_whole;                                                \
2289                                                                                \
2290   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64;                           \
2291   vst1.u32 { pixels }, [block_ptr_b, :128], c_64;                              \
2292                                                                                \
2293   pld [fb_ptr];                                                                \
2294                                                                                \
2295   subs span_num_blocks, span_num_blocks, #1;                                   \
2296   bne 4b;                                                                      \
2297                                                                                \
2298  5:                                                                            \
2299   str fb_ptr, [block_ptr_a, #44];                                              \
2300   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2301                                                                                \
2302   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
2303   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2304                                                                                \
2305   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2306   vdup.u8 draw_mask, right_mask;                                               \
2307                                                                                \
2308   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2309   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
2310                                                                                \
2311   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
2312                                                                                \
2313   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2314   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2315   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2316                                                                                \
2317   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64;                           \
2318   vst1.u32 { pixels }, [block_ptr_b, :128], c_64;                              \
2319                                                                                \
2320  1:                                                                            \
2321   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2322   add span_b_offset, span_b_offset, #4;                                        \
2323                                                                                \
2324   add span_edge_data, span_edge_data, #8;                                      \
2325   subs num_spans, num_spans, #1;                                               \
2326                                                                                \
2327   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
2328   bne 0b;                                                                      \
2329                                                                                \
2330   restore_abi_regs();                                                          \
2331   ldmia sp!, { r4 - r11, pc };                                                 \
2332                                                                                \
2333  2:                                                                            \
2334   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
2335   vpush { rg_dx4 };                                                            \
2336                                                                                \
2337   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
2338   bl flush_render_block_buffer;                                                \
2339   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
2340                                                                                \
2341   vpop { rg_dx4 };                                                             \
2342                                                                                \
2343   vmov.u8 d64_1, #1;                                                           \
2344   vmov.u8 d128_4, #4;                                                          \
2345   vmov.u8 d64_128, #128;                                                       \
2346   vmov.u8 d128_0x7, #0x7;                                                      \
2347                                                                                \
2348   vadd.u32 rg_dx8, rg_dx4, rg_dx4;                                             \
2349                                                                                \
2350   mov num_blocks, span_num_blocks;                                             \
2351   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2352   bal 3b                                                                       \
2353
2354
/*
 * Instantiate the indirect builder once per dithering variant. The argument
 * is token-pasted into the setup_blocks_shaded_untextured_dither_a_ and
 * setup_blocks_shaded_untextured_dither_b_ hook names used inside the macro.
 */
2355 setup_blocks_shaded_untextured_indirect_builder(undithered)
2356 setup_blocks_shaded_untextured_indirect_builder(dithered)
2357
2358
/*
 * Register aliases for setup_blocks_shaded_untextured_direct_builder below.
 * draw_mask is dropped and defined again so that it names q0 from here on.
 */
2359 #undef draw_mask
2360
/*
 * r14 is the link register; the direct builder pushes it on entry, which
 * frees it to hold the address of the mask_msb field.
 */
2361 #define mask_msb_ptr                                      r14
2362
2363 #define draw_mask                                         q0
/*
 * d16 and d17 are the two 64-bit halves of q8. The direct builder uses them
 * for the partial stores at the end of a span (presumably they are the halves
 * of the "pixels" alias, which is defined outside this chunk - TODO confirm).
 */
2364 #define pixels_low                                        d16
2365 #define pixels_high                                       d17
2366
2367
2368
/*
 * setup_blocks_shaded_untextured_direct_builder(dithering)
 *
 * Expands to the function
 *   setup_blocks_shaded_untextured_<dithering>_unswizzled_direct
 * which walks the spans recorded in the psx_gpu structure and stores the
 * resulting pixels straight through fb_ptr (loaded from
 * psx_gpu_vram_out_ptr_offset). The indirect builder above writes
 * draw_mask/pixels records through block_ptr_a/block_ptr_b instead.
 *
 * dithering  Token pasted into setup_blocks_shaded_untextured_dither_a_ and
 *            setup_blocks_shaded_untextured_dither_b_ to select the dither
 *            hooks (both defined outside this macro). This file instantiates
 *            the macro with "undithered" and "dithered".
 *
 * In:   psx_gpu = pointer to the GPU state structure.
 * Out:  no value is produced in a register; the only output is the memory
 *       written through fb_ptr.
 * Regs: r4-r11 and r14 are pushed on entry and popped on exit (r14 into pc).
 *       save_abi_regs()/restore_abi_regs() are defined empty at the top of
 *       this file, so no NEON register is preserved here.
 *
 * Per span: the start colours are advanced by left_x pixels, the r, g and b
 * block_span vectors are added, and the span is emitted in blocks. For each
 * block the upper 16 bits of every 32-bit accumulator lane are taken and
 * narrowed to bytes, then combined as
 *   pixels = msb_mask + (r >> 3) * d64_1 + g * d64_4 + b * d64_128
 * after the low three bits of gb_whole_8 have been cleared. Every block but
 * the last is stored whole; the last one goes through a jump table selected
 * by the number of trailing zero bits in the right_mask of the span, which
 * stores between one and eight 16-bit values.
 *
 * NOTE(review): ARM and NEON instructions are interleaved and the flags set
 * by one "subs" stay live across many instructions - reorder with care.
 */
2369 #define setup_blocks_shaded_untextured_direct_builder(dithering)               \
2370 .align 3;                                                                      \
2371                                                                                \
2372 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
       /* Load the span count and the 64-bit rg_dx pair at uvrg_dx + 8.           */\
2373   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
2374   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2375                                                                                \
2376   vld1.u32 { rg_dx }, [rg_dx_ptr, :64];                                        \
2377                                                                                \
       /* Nothing to draw: return before touching the stack.                      */\
2378   cmp num_spans, #0;                                                           \
2379   bxeq lr;                                                                     \
2380                                                                                \
       /* r14 is saved too: it doubles as mask_msb_ptr further down.              */\
2381   stmdb sp!, { r4 - r11, r14 };                                                \
2382   save_abi_regs();                                                             \
       /* Pre-scaled deltas: dx4 spans 4 pixels, dx8 spans 8 pixels.              */\
2383   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2384                                                                                \
2385   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
2386   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2387                                                                                \
       /* Cursors into the per-span arrays, plus byte constants for packing.      */\
2388   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2389   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2390                                                                                \
2391   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2392   vmov.u8 d64_1, #1;                                                           \
2393                                                                                \
2394   vmov.u8 d128_4, #4;                                                          \
2395   vmov.u8 d64_128, #128;                                                       \
2396                                                                                \
2397   vmov.u8 d128_0x7, #0x7;                                                      \
       /* Broadcast the 16-bit mask_msb value to all lanes of msb_mask_low/high.  */\
2398   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
2399   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
2400                                                                                \
2401  0:                                                                            \
       /* Per-span loop: fetch the block count, row and output base.              */\
2402   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
2403   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2404                                                                                \
2405   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
2406   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
2407                                                                                \
       /* Empty span: skip ahead to the per-span bookkeeping at 1.                */\
2408   cmp span_num_blocks, #0;                                                     \
2409   beq 1f;                                                                      \
2410                                                                                \
       /* First pixel address: fb_ptr += (y << 11) + (left_x << 1).               */\
2411   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
2412   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2413                                                                                \
2414   ldr b, [span_b_offset];                                                      \
2415   vdup.u32 v_left_x, left_x;                                                   \
2416   and y, y, #0x3;                                                              \
2417                                                                                \
       /* Pick the 32-bit dither table word for row (y & 3).                      */\
2418   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
2419   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2420                                                                                \
       /* Advance the start colours by left_x pixels (b here, rg below).          */\
2421   mla b, b_dx, left_x, b;                                                      \
2422   and dither_shift, left_x, #0x03;                                             \
2423                                                                                \
2424   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
       /* rg_dx is rebuilt from rg_dx4 on every span. NOTE(review):               */\
       /* presumably its register is reused as scratch - confirm.                 */\
2425   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2426                                                                                \
       /* Rotate the dither word right by 8 * (left_x & 3) bits.                  */\
2427   mov dither_shift, dither_shift, lsl #3;                                      \
2428   vmla.u32 rg, rg_dx, v_left_x;                                                \
2429                                                                                \
       /* The flags set by this subs are only tested by beq 3f below,             */\
       /* so no instruction in between may change them.                           */\
2430   subs span_num_blocks, span_num_blocks, #1;                                   \
2431                                                                                \
2432   mov dither_row, dither_row, ror dither_shift;                                \
2433   mov b_dx4, b_dx, lsl #2;                                                     \
2434                                                                                \
2435   vdup.u32 dither_offsets, dither_row;                                         \
2436   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2437                                                                                \
2438   vdup.u32 b_block, b;                                                         \
2439   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2440                                                                                \
2441   mov b_dx8, b_dx, lsl #3;                                                     \
2442   vdup.u32 r_block, rg[0];                                                     \
2443   vdup.u32 g_block, rg[1];                                                     \
2444                                                                                \
       /* Add three consecutive 128-bit block_span vectors to r, g and b.         */\
2445   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2446                                                                                \
2447   vadd.u32 r_block, r_block, block_span;                                       \
2448   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2449                                                                                \
2450   vadd.u32 g_block, g_block, block_span;                                       \
2451   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
2452                                                                                \
2453   vadd.u32 b_block, b_block, block_span;                                       \
       /* NOTE(review): block_ptr_b is not read by any instruction                */\
       /* visible in this macro, and block_ptr_a is never set here.               */\
2454   add block_ptr_b, block_ptr_a, #16;                                           \
2455                                                                                \
       /* vshrn #16 keeps the top halves of the four lanes (low half);            */\
       /* vaddhn does the same for block + dx4 (high half).                       */\
2456   vshrn.u32 r_whole_low, r_block, #16;                                         \
2457   vshrn.u32 g_whole_low, g_block, #16;                                         \
2458   vshrn.u32 b_whole_low, b_block, #16;                                         \
2459   vdup.u32 dx4, rg_dx4[0];                                                     \
2460                                                                                \
2461   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2462   vdup.u32 dx4, rg_dx4[1];                                                     \
2463                                                                                \
2464   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2465   vdup.u32 dx4, b_dx4;                                                         \
2466                                                                                \
2467   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2468   vdup.u32 dx8, rg_dx8[0];                                                     \
2469                                                                                \
       /* Step the accumulators on to the next block (+ dx8).                     */\
2470   vadd.u32 r_block, r_block, dx8;                                              \
2471   vdup.u32 dx8, rg_dx8[1];                                                     \
2472                                                                                \
2473   vadd.u32 g_block, g_block, dx8;                                              \
2474   vdup.u32 dx8, b_dx8;                                                         \
2475                                                                                \
2476   vadd.u32 b_block, b_block, dx8;                                              \
2477                                                                                \
       /* Narrow the 16-bit channel values to bytes.                              */\
2478   vmovn.u16 r_whole_8, r_whole;                                                \
2479   vmovn.u16 g_whole_8, g_whole;                                                \
2480   vmovn.u16 b_whole_8, b_whole;                                                \
2481                                                                                \
       /* Single-block span: skip the full-block loop.                            */\
2482   beq 3f;                                                                      \
2483                                                                                \
2484  2:                                                                            \
       /* Full-block loop: run the dither hooks and pack this block while         */\
       /* the colour values of the following block are prepared.                  */\
2485   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2486   vshrn.u32 r_whole_low, r_block, #16;                                         \
2487                                                                                \
2488   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2489   vshrn.u32 g_whole_low, g_block, #16;                                         \
2490                                                                                \
2491   vshrn.u32 b_whole_low, b_block, #16;                                         \
2492                                                                                \
       /* Drop the low 3 bits of every channel (r >> 3, gb & ~7).                 */\
2493   vdup.u32 dx4, rg_dx4[0];                                                     \
2494   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2495   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2496                                                                                \
2497   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2498   vdup.u32 dx4, rg_dx4[1];                                                     \
2499                                                                                \
       /* pixels = msb_mask + r * d64_1 + g * d64_4 + b * d64_128.                */\
2500   vmov pixels, msb_mask;                                                       \
2501   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2502   vdup.u32 dx4, b_dx4;                                                         \
2503                                                                                \
2504   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2505   vdup.u32 dx8, rg_dx8[0];                                                     \
2506                                                                                \
2507   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2508   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2509   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2510                                                                                \
2511   vadd.u32 r_block, r_block, dx8;                                              \
2512   vdup.u32 dx8, rg_dx8[1];                                                     \
2513                                                                                \
2514   vadd.u32 g_block, g_block, dx8;                                              \
2515   vdup.u32 dx8, b_dx8;                                                         \
2516                                                                                \
2517   vadd.u32 b_block, b_block, dx8;                                              \
2518                                                                                \
2519   vmovn.u16 r_whole_8, r_whole;                                                \
2520   vmovn.u16 g_whole_8, g_whole;                                                \
2521   vmovn.u16 b_whole_8, b_whole;                                                \
2522                                                                                \
       /* Store the finished block; the writeback moves fb_ptr past it.           */\
2523   vst1.u32 { pixels }, [fb_ptr]!;                                              \
2524   subs span_num_blocks, span_num_blocks, #1;                                   \
2525   bne 2b;                                                                      \
2526                                                                                \
2527  3:                                                                            \
       /* Last block of the span: same packing, then a store whose size           */\
       /* is selected from right_mask.                                            */\
2528   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2529                                                                                \
2530   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
2531   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2532                                                                                \
       /* rbit followed by clz counts the trailing zero bits of right_mask.       */\
2533   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2534   rbit right_mask, right_mask;                                                 \
2535   vmov pixels, msb_mask;                                                       \
2536   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2537   clz right_mask, right_mask;                                                  \
2538                                                                                \
2539   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2540   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2541   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2542                                                                                \
       /* Dispatch on that count through the table below. JT_OP_REL, JT_OP        */\
       /* and JTE are defined elsewhere. NOTE(review): looks like the             */\
       /* count must be 1..8 for the eight entries to line up - confirm.          */\
2543   JT_OP_REL(100f, right_mask, temp);                                           \
2544   JT_OP(ldr pc, [pc, right_mask, lsl #2]);                                     \
2545   nop;                                                                         \
2546  100:                                                                          \
2547   nop;                                                                         \
2548   .word JTE(100b, 4f);                                                         \
2549   .word JTE(100b, 5f);                                                         \
2550   .word JTE(100b, 6f);                                                         \
2551   .word JTE(100b, 7f);                                                         \
2552   .word JTE(100b, 8f);                                                         \
2553   .word JTE(100b, 9f);                                                         \
2554   .word JTE(100b, 10f);                                                        \
2555   .word JTE(100b, 11f);                                                        \
2556                                                                                \
2557  4:                                                                            \
       /* Store 1 pixel.                                                          */\
2558   vst1.u16 { pixels_low[0] }, [fb_ptr];                                        \
2559   bal 1f;                                                                      \
2560                                                                                \
2561  5:                                                                            \
       /* Store 2 pixels.                                                         */\
2562   vst1.u32 { pixels_low[0] }, [fb_ptr];                                        \
2563   bal 1f;                                                                      \
2564                                                                                \
2565  6:                                                                            \
       /* Store 3 pixels (2 + 1).                                                 */\
2566   vst1.u32 { pixels_low[0] }, [fb_ptr]!;                                       \
2567   vst1.u16 { pixels_low[2] }, [fb_ptr];                                        \
2568   bal 1f;                                                                      \
2569                                                                                \
2570  7:                                                                            \
       /* Store 4 pixels.                                                         */\
2571   vst1.u32 { pixels_low }, [fb_ptr];                                           \
2572   bal 1f;                                                                      \
2573                                                                                \
2574  8:                                                                            \
       /* Store 5 pixels (4 + 1).                                                 */\
2575   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2576   vst1.u16 { pixels_high[0] }, [fb_ptr];                                       \
2577   bal 1f;                                                                      \
2578                                                                                \
2579  9:                                                                            \
       /* Store 6 pixels (4 + 2).                                                 */\
2580   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2581   vst1.u32 { pixels_high[0] }, [fb_ptr]!;                                      \
2582   bal 1f;                                                                      \
2583                                                                                \
2584  10:                                                                           \
       /* Store 7 pixels (4 + 2 + 1).                                             */\
2585   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2586   vst1.u32 { pixels_high[0] }, [fb_ptr]!;                                      \
2587   vst1.u16 { pixels_high[2] }, [fb_ptr];                                       \
2588   bal 1f;                                                                      \
2589                                                                                \
2590  11:                                                                           \
       /* Store all 8 pixels.                                                     */\
2591   vst1.u32 { pixels }, [fb_ptr];                                               \
       /* NOTE(review): this branch targets the label right after it.             */\
2592   bal 1f;                                                                      \
2593                                                                                \
2594  1:                                                                            \
       /* Move on to the next span: uvrg +16, b +4, edge data +8 bytes.           */\
2595   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2596   add span_b_offset, span_b_offset, #4;                                        \
2597                                                                                \
2598   add span_edge_data, span_edge_data, #8;                                      \
2599   subs num_spans, num_spans, #1;                                               \
2600                                                                                \
2601   bne 0b;                                                                      \
2602                                                                                \
       /* Done: pop the saved registers, returning through pc.                    */\
2603   restore_abi_regs();                                                          \
2604   ldmia sp!, { r4 - r11, pc }                                                  \
2605
/*
 * Instantiate the direct builder for both dithering variants, producing
 * setup_blocks_shaded_untextured_undithered_unswizzled_direct and
 * setup_blocks_shaded_untextured_dithered_unswizzled_direct.
 */
2606 setup_blocks_shaded_untextured_direct_builder(undithered)
2607 setup_blocks_shaded_untextured_direct_builder(dithered)
2608
2609
/*
 * Register aliases for the texture_blocks_* routines that follow. The names
 * used by the preceding routines are dropped first. Several of the new names
 * share a register; the comments below point out each overlap.
 */
2610 #undef psx_gpu
2611 #undef num_blocks
2612 #undef triangle
2613 #undef c_64
2614
2615 #define psx_gpu                                  r0
2616 #define block_ptr                                r1
2617 #define num_blocks                               r2
/*
 * uv_01 .. uv_67 (r3 - r6) hold the four words fetched from block_ptr with a
 * single ldm. Each word is split into two addresses: the even-numbered one
 * gets a register of its own, the odd-numbered one overwrites the word it was
 * extracted from (uv_1 = uv_01 = r3, uv_3 = uv_23 = r4, and so on).
 */
2618 #define uv_01                                    r3
2619 #define uv_23                                    r4
2620 #define uv_45                                    r5
2621 #define uv_67                                    r6
2622 #define uv_0                                     r7
2623 #define uv_1                                     r3
2624 #define uv_2                                     r8
2625 #define uv_3                                     r4
2626 #define uv_4                                     r9
2627 #define uv_5                                     r5
2628 #define uv_6                                     r10
2629 #define uv_7                                     r6
2630 #define texture_ptr                              r11
2631
/* pixel_N lives in the same register as uv_N: the loaded byte replaces its address. */
2632 #define pixel_0                                  r7
2633 #define pixel_1                                  r3
2634 #define pixel_2                                  r8
2635 #define pixel_3                                  r4
2636 #define pixel_4                                  r9
2637 #define pixel_5                                  r5
2638 #define pixel_6                                  r10
2639 #define pixel_7                                  r6
2640
/* pixels_a .. pixels_d reuse the registers of pixel_0, pixel_4, pixel_2 and pixel_6. */
2641 #define pixels_a                                 r7
2642 #define pixels_b                                 r9
2643 #define pixels_c                                 r8
2644 #define pixels_d                                 r10
2645
/* c_64 is r0, the same register as psx_gpu: writing c_64 discards the psx_gpu pointer. */
2646 #define c_64                                     r0
2647
/* The two texture masks share r5 and r6 with uv_45/uv_5 and uv_67/uv_7. */
2648 #define clut_ptr                                 r12
2649 #define current_texture_mask                     r5
2650 #define dirty_textures_mask                      r6
2651
2652 #define texels                                   d0
2653
/*
 * clut_a (q1) is the pair d2:d3 and clut_b (q2) is the pair d4:d5, so the
 * clut_low_x and clut_high_x names below are the halves of clut_a and clut_b.
 */
2654 #define clut_low_a                               d2
2655 #define clut_low_b                               d3
2656 #define clut_high_a                              d4
2657 #define clut_high_b                              d5
2658
2659 #define clut_a                                   q1
2660 #define clut_b                                   q2
2661
2662 #define texels_low                               d6
2663 #define texels_high                              d7
2664
/*
 * texture_blocks_untextured
 *
 * Empty routine: returns to the caller immediately without touching any
 * register or memory.
 */
2665 .align 3
2666
2667 function(texture_blocks_untextured)
2668   bx lr
2669
2670
/*
 * texture_blocks_4bpp
 *
 * In:  r0 = psx_gpu
 *
 * For each of the num_blocks 64-byte blocks starting at
 * psx_gpu + psx_gpu_blocks_offset:
 *   - read eight 16-bit offsets from the first 16 bytes of the block,
 *   - fetch one byte at texture_page_ptr + offset for each of them,
 *   - use that byte as an index into the 16-entry, 16-bit CLUT at clut_ptr
 *     (two VTBL lookups, one per byte plane; VTBL yields 0 for an index
 *     outside the 16-byte table),
 *   - write the eight 16-bit results back over the first 16 bytes.
 *
 * If dirty_textures_4bpp_mask & current_texture_mask is non-zero,
 * update_texture_4bpp_cache is called first (r0 is still psx_gpu then).
 *
 * The CLUT is loaded with a :128 qualifier, so clut_ptr has to be 16-byte
 * aligned.  r3-r11 are restored on exit; r0-r2, r12, d0, q1-q3 and the
 * flags are overwritten.  The loop tests its counter at the bottom, so it
 * always runs at least once - presumably callers never get here with
 * num_blocks == 0 (TODO confirm).
 */
2671 .align 3
2672
2673 function(texture_blocks_4bpp)
2674   stmdb sp!, { r3 - r11, r14 }
2675   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2676
2677   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2678   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2679
2680   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2681   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]
2682
2683   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
/* de-interleave the 16 CLUT entries: clut_a = low bytes, clut_b = high bytes */
2684   vuzp.u8 clut_a, clut_b
2685
2686   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
2687   tst dirty_textures_mask, current_texture_mask
2688
2689   bne 1f
/* c_64 is r0: psx_gpu is gone from here on */
2690   mov c_64, #64
2691
2692 0:
2693   ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2694
/* uxtah adds the selected 16-bit half of the pair to texture_ptr */
2695   uxtah uv_0, texture_ptr, uv_01
2696   uxtah uv_1, texture_ptr, uv_01, ror #16
2697
2698   uxtah uv_2, texture_ptr, uv_23
2699   uxtah uv_3, texture_ptr, uv_23, ror #16
2700
2701   uxtah uv_4, texture_ptr, uv_45
2702   ldrb pixel_0, [uv_0]
2703
2704   uxtah uv_5, texture_ptr, uv_45, ror #16
2705   ldrb pixel_1, [uv_1]
2706
2707   uxtah uv_6, texture_ptr, uv_67
2708   ldrb pixel_2, [uv_2]
2709
2710   uxtah uv_7, texture_ptr, uv_67, ror #16
2711   ldrb pixel_3, [uv_3]
2712
2713   ldrb pixel_4, [uv_4]
/* flags set here are consumed by the bne at the bottom of the loop */
2714   subs num_blocks, num_blocks, #1
2715
2716   ldrb pixel_5, [uv_5]
2717   orr pixels_a, pixel_0, pixel_1, lsl #8
2718
2719   ldrb pixel_6, [uv_6]
2720   orr pixels_b, pixel_4, pixel_5, lsl #8
2721
2722   ldrb pixel_7, [uv_7]
2723   orr pixels_a, pixels_a, pixel_2, lsl #16
2724
2725   orr pixels_b, pixels_b, pixel_6, lsl #16
2726   orr pixels_a, pixels_a, pixel_3, lsl #24
2727
2728   orr pixels_b, pixels_b, pixel_7, lsl #24
/* eight index bytes, texel 0 in the lowest byte */
2729   vmov texels, pixels_a, pixels_b
2730
2731   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
2732   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
2733
/* vst2 re-interleaves low/high bytes into eight 16-bit pixels; then +64 */
2734   vst2.u8 { texels_low, texels_high }, [block_ptr, :128], c_64
2735   bne 0b
2736
2737   ldmia sp!, { r3 - r11, pc }
2738
2739 1:
/*
 * Slow path: refresh the texture cache before sampling.  r1/r2 and the
 * CLUT byte planes in q1/q2 are live across the call.  AAPCS makes r0-r3,
 * r12 and q0-q3 caller-saved, so the callee is free to overwrite them;
 * save them here rather than relying on how the callee happens to be
 * implemented.  (8 + 32 bytes pushed: the stack stays 8-byte aligned.)
 */
2740   stmdb sp!, { r1 - r2 }  
  vpush { clut_a, clut_b }
2741   bl update_texture_4bpp_cache
2742
  vpop { clut_a, clut_b }
2743   mov c_64, #64
2744   ldmia sp!, { r1 - r2 }
2745   bal 0b
2746
2747
/*
 * texture_blocks_8bpp
 *
 * In:  r0 = psx_gpu
 *
 * For each of the num_blocks 64-byte blocks starting at
 * psx_gpu + psx_gpu_blocks_offset:
 *   - read eight 16-bit offsets from the first 16 bytes of the block,
 *   - fetch one byte at texture_page_ptr + offset for each of them,
 *   - double that byte and use it as a byte offset into the table of
 *     16-bit entries at clut_ptr,
 *   - write the eight 16-bit results back over the first 16 bytes.
 *
 * If dirty_textures_8bpp_mask & current_texture_mask is non-zero,
 * update_texture_8bpp_cache is called first.
 *
 * r3-r11 are restored on exit.  The loop tests its counter at the bottom,
 * so it always runs at least once - presumably callers never get here
 * with num_blocks == 0 (TODO confirm).
 */
2748 .align 3
2749
2750 function(texture_blocks_8bpp)
2751   stmdb sp!, { r3 - r11, r14 }
2752   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2753
2754   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2755   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2756
2757   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2758   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
2759
2760   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset]
2761   tst dirty_textures_mask, current_texture_mask
2762
2763   bne 1f
/* NOTE(review): presumably padding so the loop entry is aligned - confirm */
2764   nop
2765
2766 0:
2767   ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2768
/* uxtah adds the selected 16-bit half of the pair to texture_ptr */
2769   uxtah uv_0, texture_ptr, uv_01
2770   uxtah uv_1, texture_ptr, uv_01, ror #16
2771
2772   uxtah uv_2, texture_ptr, uv_23
2773   uxtah uv_3, texture_ptr, uv_23, ror #16
2774
2775   uxtah uv_4, texture_ptr, uv_45
2776   ldrb pixel_0, [uv_0]
2777
2778   uxtah uv_5, texture_ptr, uv_45, ror #16
2779   ldrb pixel_1, [uv_1]
2780
2781   uxtah uv_6, texture_ptr, uv_67
2782   ldrb pixel_2, [uv_2]
2783
2784   uxtah uv_7, texture_ptr, uv_67, ror #16
2785   ldrb pixel_3, [uv_3]
2786
2787   ldrb pixel_4, [uv_4]
/* index * 2 = byte offset of the 16-bit CLUT entry */
2788   add pixel_0, pixel_0, pixel_0
2789
2790   ldrb pixel_5, [uv_5]
2791   add pixel_1, pixel_1, pixel_1
2792
2793   ldrb pixel_6, [uv_6]
2794   add pixel_2, pixel_2, pixel_2
2795
2796   ldrb pixel_7, [uv_7]
2797   add pixel_3, pixel_3, pixel_3
2798
2799   ldrh pixel_0, [clut_ptr, pixel_0]
2800   add pixel_4, pixel_4, pixel_4
2801
2802   ldrh pixel_1, [clut_ptr, pixel_1]
2803   add pixel_5, pixel_5, pixel_5
2804
2805   ldrh pixel_2, [clut_ptr, pixel_2]
2806   add pixel_6, pixel_6, pixel_6
2807
2808   ldrh pixel_3, [clut_ptr, pixel_3]
2809   add pixel_7, pixel_7, pixel_7
2810
2811   ldrh pixel_4, [clut_ptr, pixel_4]
2812   orr pixels_a, pixel_0, pixel_1, lsl #16
2813
2814   ldrh pixel_5, [clut_ptr, pixel_5]
2815   orr pixels_c, pixel_2, pixel_3, lsl #16
2816
2817   ldrh pixel_6, [clut_ptr, pixel_6]
/* flags set here are consumed by the bne at the bottom of the loop */
2818   subs num_blocks, num_blocks, #1
2819
2820   ldrh pixel_7, [clut_ptr, pixel_7]
2821   orr pixels_b, pixel_4, pixel_5, lsl #16
2822
2823   orr pixels_d, pixel_6, pixel_7, lsl #16
/*
 * stm writes in ascending register order (r7, r8, r9, r10), i.e. pixel
 * pairs 0/1, 2/3, 4/5, 6/7 regardless of the order the names are listed in.
 */
2824   stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d } 
2825
2826   add block_ptr, block_ptr, #64
2827   bne 0b
2828
2829   ldmia sp!, { r3 - r11, pc }
2830
2831 1:
/*
 * Slow path: refresh the texture cache, then enter the loop.  r0 has not
 * been written since entry, so it is still psx_gpu for the call.  r1, r2
 * and r12 are live across the call and saved here, together with whatever
 * EXTRA_UNSAVED_REGS expands to (defined outside this section).
 */
2832   stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
2833
2834   bl update_texture_8bpp_cache
2835
2836   ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
2837   bal 0b
2838
2839
/*
 * Drop the 4bpp/8bpp aliases and set up the ones for texture_blocks_16bpp.
 *
 * The registers are allocated as a sliding window: each uv_N shares its
 * register with u_N and pixel_N, and v_N sits two registers further up,
 * which is where uv_(N+2) will later be loaded.  v_7 is r0, the same
 * register as psx_gpu.  texture_ptr moves to r12.
 */
2840 #undef uv_0
2841 #undef uv_1
2842 #undef uv_2
2843 #undef uv_3
2844 #undef uv_4
2845 #undef uv_5
2846 #undef uv_6
2847 #undef uv_7
2848
2849 #undef pixel_0
2850 #undef pixel_1
2851 #undef pixel_2
2852 #undef pixel_3
2853 #undef pixel_4
2854 #undef pixel_5
2855 #undef pixel_6
2856 #undef pixel_7
2857
2858 #undef texture_ptr
2859
2860 #undef pixels_a
2861 #undef pixels_b
2862 #undef pixels_c
2863 #undef pixels_d
2864
/* identical to the earlier definitions, repeated for readability */
2865 #define psx_gpu                                  r0
2866 #define block_ptr                                r1
2867 #define num_blocks                               r2
2868
2869 #define uv_0                                     r3
2870 #define uv_1                                     r4
2871 #define u_0                                      r3
2872 #define u_1                                      r4
2873 #define v_0                                      r5
2874 #define v_1                                      r6
2875
2876 #define uv_2                                     r5
2877 #define uv_3                                     r6
2878 #define u_2                                      r5
2879 #define u_3                                      r6
2880 #define v_2                                      r7
2881 #define v_3                                      r8
2882
2883 #define uv_4                                     r7
2884 #define uv_5                                     r8
2885 #define u_4                                      r7
2886 #define u_5                                      r8
2887 #define v_4                                      r9
2888 #define v_5                                      r10
2889
2890 #define uv_6                                     r9
2891 #define uv_7                                     r10
2892 #define u_6                                      r9
2893 #define u_7                                      r10
2894 #define v_6                                      r11
2895 #define v_7                                      r0
2896
2897 #define pixel_0                                  r3
2898 #define pixel_1                                  r4
2899 #define pixel_2                                  r5
2900 #define pixel_3                                  r6
2901 #define pixel_4                                  r7
2902 #define pixel_5                                  r8
2903 #define pixel_6                                  r9
2904 #define pixel_7                                  r10
2905
/* each packed pair lands in the register of its even-numbered pixel */
2906 #define pixels_a                                 r3
2907 #define pixels_b                                 r5
2908 #define pixels_c                                 r7
2909 #define pixels_d                                 r9
2910
2911 #define texture_ptr                              r12
2912
2913
/*
 * texture_blocks_16bpp
 *
 * In:  r0 = psx_gpu
 *
 * For each of the num_blocks 64-byte blocks starting at
 * psx_gpu + psx_gpu_blocks_offset:
 *   - read eight 16-bit uv values from the first 16 bytes of the block,
 *   - split each into u (low byte) and v (high byte) and form the byte
 *     offset u * 2 + v * 2048,
 *   - load the 16-bit pixel at texture_page_ptr + offset,
 *   - write the eight pixels back over the first 16 bytes.
 *
 * No CLUT and no texture cache update are involved here.  r3-r11 are
 * restored on exit; r0 is reused as v_7 inside the loop, so psx_gpu is
 * only valid during the two prologue loads.  The loop tests its counter
 * at the bottom, so it always runs at least once - presumably callers
 * never get here with num_blocks == 0 (TODO confirm).
 */
2914 .align 3
2915
2916 function(texture_blocks_16bpp)
2917   stmdb sp!, { r3 - r11, r14 }
2918   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2919
2920   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2921   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2922
2923 0:
2924   ldrh uv_0, [block_ptr]
/* nothing below touches the flags until the bne at the bottom */
2925   subs num_blocks, num_blocks, #1
2926
2927   ldrh uv_1, [block_ptr, #2]
2928
/* v is kept in place (v << 8); the later lsl #2 turns it into v * 1024 */
2929   and v_0, uv_0, #0xFF00
2930   and v_1, uv_1, #0xFF00
2931
2932   and u_0, uv_0, #0xFF
2933   and u_1, uv_1, #0xFF
2934
2935   add uv_0, u_0, v_0, lsl #2
2936   ldrh uv_2, [block_ptr, #4]
2937
2938   add uv_1, u_1, v_1, lsl #2
2939   ldrh uv_3, [block_ptr, #6]
2940
/* pixel index -> byte offset (two bytes per pixel) */
2941   add uv_0, uv_0, uv_0
2942   add uv_1, uv_1, uv_1
2943
2944   and v_2, uv_2, #0xFF00
2945   and v_3, uv_3, #0xFF00
2946
2947   and u_2, uv_2, #0xFF
2948   and u_3, uv_3, #0xFF
2949
2950   add uv_2, u_2, v_2, lsl #2
2951   ldrh uv_4, [block_ptr, #8]
2952
2953   add uv_3, u_3, v_3, lsl #2
2954   ldrh uv_5, [block_ptr, #10]
2955
2956   add uv_2, uv_2, uv_2
2957   add uv_3, uv_3, uv_3
2958
2959   and v_4, uv_4, #0xFF00
2960   and v_5, uv_5, #0xFF00
2961
2962   and u_4, uv_4, #0xFF
2963   and u_5, uv_5, #0xFF
2964
2965   add uv_4, u_4, v_4, lsl #2
2966   ldrh uv_6, [block_ptr, #12]
2967
2968   add uv_5, u_5, v_5, lsl #2
2969   ldrh uv_7, [block_ptr, #14]
2970
2971   add uv_4, uv_4, uv_4
2972   ldrh pixel_0, [texture_ptr, uv_0]
2973
2974   add uv_5, uv_5, uv_5
2975   ldrh pixel_1, [texture_ptr, uv_1]
2976
2977   and v_6, uv_6, #0xFF00
2978   ldrh pixel_2, [texture_ptr, uv_2]
2979
/* v_7 is r0: this is where psx_gpu gets overwritten */
2980   and v_7, uv_7, #0xFF00
2981   ldrh pixel_3, [texture_ptr, uv_3]
2982
2983   and u_6, uv_6, #0xFF
2984   ldrh pixel_4, [texture_ptr, uv_4]
2985
2986   and u_7, uv_7, #0xFF
2987   ldrh pixel_5, [texture_ptr, uv_5]
2988
2989   add uv_6, u_6, v_6, lsl #2
2990   add uv_7, u_7, v_7, lsl #2
2991
2992   add uv_6, uv_6, uv_6
2993   add uv_7, uv_7, uv_7
2994
/* pack two 16-bit pixels per word, even pixel in the low half */
2995   orr pixels_a, pixel_0, pixel_1, lsl #16
2996   orr pixels_b, pixel_2, pixel_3, lsl #16
2997
2998   ldrh pixel_6, [texture_ptr, uv_6]
2999   orr pixels_c, pixel_4, pixel_5, lsl #16
3000
3001   ldrh pixel_7, [texture_ptr, uv_7]
3002   orr pixels_d, pixel_6, pixel_7, lsl #16
3003
3004   stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
3005   add block_ptr, block_ptr, #64
3006
3007   bne 0b
3008
3009   ldmia sp!, { r3 - r11, pc }
3010
3011
/*
 * Register aliases for the shade_blocks_*_textured_modulated_* routines.
 *
 * The #undefs clear names that earlier sections may have left defined
 * (undefining a name that is not defined is harmless).
 *
 * Overlaps to keep in mind when reading the builder:
 *   - r2 is color_ptr, colors_scalar, mask_msb_ptr and c_32 in turn;
 *   - r3 is colors_scalar_compare and later block_ptr_store;
 *   - block_ptr_load_a is r0, the same register as psx_gpu;
 *   - zero_mask is q4, the same register as texels;
 *   - fb_pixels is q8, the same register as pixels_r;
 *   - pixels_gb_low is q9, i.e. the pixels_g_low (d18) / pixels_b_low (d19)
 *     pair, which is also pixels_g;
 *   - colors_rg (q5) = colors_r/colors_g (d10/d11), colors_b_dm_bits (q6) =
 *     colors_b/draw_mask_bits (d12/d13), texels_rg (q7) = texels_r/texels_g
 *     (d14/d15).
 */
3012 #undef num_blocks
3013
3014 #undef test_mask
3015 #undef texels
3016 #undef pixels_b
3017 #undef pixels
3018 #undef d64_1
3019 #undef d64_4
3020 #undef d64_128
3021 #undef draw_mask
3022 #undef msb_mask
3023 #undef msb_mask_low
3024 #undef msb_mask_high
3025 #undef fb_pixels
3026
3027 #undef c_32
3028 #undef fb_ptr
3029 #undef mask_msb_ptr
3030
3031 #define psx_gpu                                  r0
3032 #define num_blocks                               r1
3033 #define color_ptr                                r2
3034 #define colors_scalar                            r2
3035 #define colors_scalar_compare                    r3
3036 #define mask_msb_ptr                             r2
3037
3038 #define block_ptr_load_a                         r0
3039 #define block_ptr_store                          r3
3040 #define block_ptr_load_b                         r12
3041 #define c_32                                     r2
3042
3043 #define c_48                                     r4
3044 #define fb_ptr                                   r14
3045 #define draw_mask_bits_scalar                    r5
3046
/* q4-q7 (d8-d15) are callee-saved under AAPCS; see save_abi_regs() */
3047 #define d128_0x07                                q0
3048 #define d128_0x1F                                q1
3049 #define d128_0x8000                              q2
3050 #define test_mask                                q3
3051 #define texels                                   q4
3052 #define colors_rg                                q5
3053 #define colors_b_dm_bits                         q6
3054 #define texels_rg                                q7
3055 #define pixels_r                                 q8
3056 #define pixels_g                                 q9
3057 #define pixels_b                                 q10
3058 #define pixels                                   q11
3059 #define zero_mask                                q4
3060 #define draw_mask                                q12
3061 #define msb_mask                                 q13
3062
3063 #define fb_pixels                                q8
3064
3065 #define pixels_gb_low                            q9
3066
3067 #define colors_r                                 d10
3068 #define colors_g                                 d11
3069 #define colors_b                                 d12
3070 #define draw_mask_bits                           d13
3071 #define texels_r                                 d14
3072 #define texels_g                                 d15
3073 #define pixels_r_low                             d16
3074 #define pixels_g_low                             d18
3075 #define pixels_b_low                             d19
3076 #define msb_mask_low                             d26
3077 #define msb_mask_high                            d27
3078
/* byte constants used as multipliers when repacking the channels */
3079 #define d64_1                                    d28
3080 #define d64_4                                    d29
3081 #define d64_128                                  d30
3082 #define texels_b                                 d31
3083
/*
 * Building blocks for shade_blocks_textured_modulated_builder.  The suffix
 * of each macro name is chosen by token pasting from the builder's
 * arguments: target (indirect/direct), shading (shaded/unshaded) and
 * dithering (dithered/undithered).  An empty macro means "nothing to do
 * for this variant".
 */

/*
 * Target prologue, indirect: results are written back into the block array
 * at psx_gpu + psx_gpu_blocks_offset; c_48 is the post-increment applied
 * after each pixel store.
 */
3084 #define shade_blocks_textured_modulated_prologue_indirect()                    \
3085   mov c_48, #48;                                                               \
3086   add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset                         \
3087
/*
 * Target prologue, direct: broadcast the 16-bit value at
 * psx_gpu + psx_gpu_mask_msb_offset into every lane of msb_mask.
 */
3088 #define shade_blocks_textured_modulated_prologue_direct()                      \
3089   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3090   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]            \
3091
3092
/* Shading prologue, shaded: empty, colors are loaded per block instead. */
3093 #define shade_blocks_textured_modulated_prologue_shaded(dithering, target)     \
3094   
/*
 * Undithered shortcut: when the 32-bit word at psx_gpu_triangle_color_offset
 * equals 0x00808080, branch to shade_blocks_textured_unmodulated_<target>
 * instead of running the modulated path.  This is a plain branch, so it
 * must happen before anything has been pushed.  Uses r2 and r3.
 */
3095 #define shade_blocks_textured_false_modulation_check_undithered(target)        \
3096   ldr colors_scalar, [psx_gpu, #psx_gpu_triangle_color_offset];                \
3097   movw colors_scalar_compare, #0x8080;                                         \
3098                                                                                \
3099   movt colors_scalar_compare, #0x80;                                           \
3100   cmp colors_scalar, colors_scalar_compare;                                    \
3101   beq shade_blocks_textured_unmodulated_##target                               \
3102
/* Dithered: no shortcut, the modulated path always runs. */
3103 #define shade_blocks_textured_false_modulation_check_dithered(target)          \
3104
/*
 * Shading prologue, unshaded: run the shortcut check, then broadcast bytes
 * 0, 1 and 2 of the triangle color to all lanes of colors_r, colors_g and
 * colors_b.  colors_r is written last because it is also the source.
 */
3105 #define shade_blocks_textured_modulated_prologue_unshaded(dithering, target)   \
3106   shade_blocks_textured_false_modulation_check_##dithering(target);            \
3107   add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset;                      \
3108   vld1.u32 { colors_r[] }, [color_ptr, :32];                                   \
3109   vdup.u8 colors_g, colors_r[1];                                               \
3110   vdup.u8 colors_b, colors_r[2];                                               \
3111   vdup.u8 colors_r, colors_r[0]                                                \
3112
3113
/*
 * Accumulator preload.  Dithered: load 16 bytes at block_ptr_load_b into
 * the given channel accumulator; the same address is read for every
 * channel and only the "last" form advances the pointer (by c_32).
 * Undithered: nothing is loaded, the "last" form just adds 32 so the
 * pointer stays in step.
 */
3114 #define shade_blocks_textured_modulated_load_dithered(target)                  \
3115   vld1.u32 { target }, [block_ptr_load_b, :128]                                \
3116
3117 #define shade_blocks_textured_modulated_load_last_dithered(target)             \
3118   vld1.u32 { target }, [block_ptr_load_b, :128], c_32                          \
3119
3120 #define shade_blocks_textured_modulated_load_undithered(target)                \
3121
3122 #define shade_blocks_textured_modulated_load_last_undithered(target)           \
3123   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3124
/*
 * Modulate one channel: widening 8-bit multiply of texel by color.
 * Dithered adds the product to the preloaded accumulator (vmlal),
 * undithered overwrites it (vmull).
 */
3125 #define shade_blocks_textured_modulate_dithered(channel)                       \
3126   vmlal.u8 pixels_##channel, texels_##channel, colors_##channel                \
3127
3128 #define shade_blocks_textured_modulate_undithered(channel)                     \
3129   vmull.u8 pixels_##channel, texels_##channel, colors_##channel                \
3130
3131
/*
 * Draw mask handling.  Indirect: store draw_mask at block_ptr_store with
 * writeback (the offset argument is unused).  Direct: load fb_ptr from
 * block_ptr_load_b + offset - 64, read the 16 bytes it points at and use
 * vbit to put them into pixels wherever draw_mask is set, so that the
 * following store leaves those framebuffer pixels unchanged.
 */
3132 #define shade_blocks_textured_modulated_store_draw_mask_indirect(offset)       \
3133   vst1.u32 { draw_mask }, [block_ptr_store, :128]!                             \
3134
3135 #define shade_blocks_textured_modulated_store_draw_mask_direct(offset)         \
3136   ldr fb_ptr, [block_ptr_load_b, #(offset - 64)];                              \
3137   vld1.u32 { fb_pixels }, [fb_ptr];                                            \
3138   vbit.u16 pixels, fb_pixels, draw_mask                                        \
3139
/*
 * Pixel store.  Indirect: into the block, then advance block_ptr_store by
 * c_48.  Direct: to the address in fb_ptr.
 */
3140 #define shade_blocks_textured_modulated_store_pixels_indirect()                \
3141   vst1.u32 { pixels }, [block_ptr_store, :128], c_48                           \
3142
3143 #define shade_blocks_textured_modulated_store_pixels_direct()                  \
3144   vst1.u32 { pixels }, [fb_ptr]                                                \
3145
3146
/*
 * Per-block color loads.  Shaded: colors_r/colors_g come from
 * block_ptr_load_b and colors_b/draw_mask_bits from block_ptr_load_a, each
 * advancing by c_32.  Unshaded: colors were set up in the prologue, so only
 * the draw mask bits are fetched (as a scalar from block_ptr_load_a + 8)
 * and both pointers are bumped by 32.
 */
3147 #define shade_blocks_textured_modulated_load_rg_shaded()                       \
3148   vld1.u32 { colors_r, colors_g }, [block_ptr_load_b, :128], c_32              \
3149
3150 #define shade_blocks_textured_modulated_load_rg_unshaded()                     \
3151   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3152
3153 #define shade_blocks_textured_modulated_load_bdm_shaded()                      \
3154   vld1.u32 { colors_b, draw_mask_bits }, [block_ptr_load_a, :128], c_32        \
3155
3156 #define shade_blocks_textured_modulated_load_bdm_unshaded()                    \
3157   ldr draw_mask_bits_scalar, [block_ptr_load_a, #8];                           \
3158   add block_ptr_load_a, block_ptr_load_a, #32                                  \
3159
/* Broadcast the low 16 draw mask bits to every 16-bit lane of draw_mask. */
3160 #define shade_blocks_textured_modulated_expand_draw_mask_shaded()              \
3161   vdup.u16 draw_mask, draw_mask_bits[0]                                        \
3162
3163 #define shade_blocks_textured_modulated_expand_draw_mask_unshaded()            \
3164   vdup.u16 draw_mask, draw_mask_bits_scalar                                    \
3165
3166
/* Direct only: OR msb_mask into every output pixel. */
3167 #define shade_blocks_textured_modulated_apply_msb_mask_indirect()              \
3168
3169 #define shade_blocks_textured_modulated_apply_msb_mask_direct()                \
3170   vorr.u16 pixels, pixels, msb_mask                                            \
3171
3172
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Emits shade_blocks_<shading>_textured_modulated_<dithering>_<target>.
 *
 * In:  r0 = psx_gpu
 *
 * Block layout as accessed by this code (64 bytes per block, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   +0   eight 16-bit texels
 *   +16  colors_r, colors_g, 8 bytes each         (shaded only)
 *   +32  colors_b, 8 bytes                        (shaded only)
 *   +40  16-bit draw mask bits
 *   +44  framebuffer pointer                      (direct only)
 *   +48  eight 16-bit accumulator start values    (dithered only)
 *
 * Per block:
 *   - split each texel into 5-bit r (bits 0-4), g (5-9) and b (10-14);
 *   - multiply every channel by its 8-bit color (adding the +48 values when
 *     dithered), shift right by 4 with unsigned saturation to 8 bits and
 *     reduce to 5 bits;
 *   - repack as r | g << 5 | b << 10 on top of bit 15 of the texel (and
 *     msb_mask for the direct target);
 *   - draw_mask = lanes where (mask bits & test_mask) != 0, ORed with lanes
 *     whose texel is 0;
 *   - indirect: store draw_mask at +0 and the pixels at +16 of the block;
 *     direct: merge with the framebuffer contents under draw_mask and store
 *     to the framebuffer pointer.
 *
 * The loop is software pipelined: the first block is modulated before the
 * loop, each iteration packs/stores block N-1 while modulating block N, and
 * label 1 drains the last block.  num_blocks is decremented before it is
 * first tested, so the code presumably relies on num_blocks >= 1 (TODO
 * confirm against callers).
 *
 * r4, r5 and lr are saved and restored.
 *
 * NOTE(review): q4-q7 are written here; they are only preserved when
 * save_abi_regs() expands to a vpush, which the "#if 0" at the top of the
 * file currently disables.
 * NOTE(review): save_abi_regs() runs before the shading prologue, and the
 * unshaded/undithered prologue can branch away to the unmodulated routine.
 * That is balanced only while save_abi_regs() is empty.
 */
3173 #define shade_blocks_textured_modulated_builder(shading, dithering, target)    \
3174 .align 3;                                                                      \
3175                                                                                \
3176 function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
3177   save_abi_regs();                                                             \
3178   shade_blocks_textured_modulated_prologue_##shading(dithering, target);       \
3179   stmdb sp!, { r4 - r5, lr };                                                  \
3180   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3181   /* 128-bit test_mask comes from offset 0 of the psx_gpu struct */            \
3182   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
3183                                                                                \
3184   shade_blocks_textured_modulated_prologue_##target();                         \
3185                                                                                \
3186   add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset;                       \
3187   mov c_32, #32;                                                               \
3188                                                                                \
3189   add block_ptr_load_b, block_ptr_load_a, #16;                                 \
3190   vmov.u8 d64_1, #1;                                                           \
3191   vmov.u8 d64_4, #4;                                                           \
3192   vmov.u8 d64_128, #128;                                                       \
3193   /* prime the pipeline with the first block */                                \
3194   vld1.u32 { texels }, [block_ptr_load_a, :128], c_32;                         \
3195   vmov.u8 d128_0x07, #0x07;                                                    \
3196                                                                                \
3197   shade_blocks_textured_modulated_load_rg_##shading();                         \
3198   vmov.u8 d128_0x1F, #0x1F;                                                    \
3199                                                                                \
3200   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3201   vmov.u16 d128_0x8000, #0x8000;                                               \
3202                                                                                \
3203   vmovn.u16 texels_r, texels;                                                  \
3204   vshrn.u16 texels_g, texels, #5;                                              \
3205                                                                                \
3206   vshrn.u16 texels_b, texels, #7;                                              \
3207   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3208                                                                                \
3209   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3210   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3211                                                                                \
3212   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3213   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3214                                                                                \
3215   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3216   vshr.u8 texels_b, texels_b, #3;                                              \
3217                                                                                \
3218   shade_blocks_textured_modulate_##dithering(r);                               \
3219   shade_blocks_textured_modulate_##dithering(g);                               \
3220   shade_blocks_textured_modulate_##dithering(b);                               \
3221                                                                                \
3222   vand.u16 pixels, texels, d128_0x8000;                                        \
3223   vceq.u16 zero_mask, texels, #0;                                              \
3224                                                                                \
3225   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3226   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3227   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3228                                                                                \
3229   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3230   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3231   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3232   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3233   /* a single block skips the steady-state loop */                             \
3234   subs num_blocks, num_blocks, #1;                                             \
3235   beq 1f;                                                                      \
3236                                                                                \
3237  .align 3;                                                                     \
3238   /* loop: finish the previous block while starting the next one */           \
3239  0:                                                                            \
3240   vld1.u32 { texels }, [block_ptr_load_a, :128], c_32;                         \
3241   shade_blocks_textured_modulated_load_rg_##shading();                         \
3242   vshrn.u16 texels_g, texels, #5;                                              \
3243                                                                                \
3244   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3245   vshrn.u16 texels_b, texels, #7;                                              \
3246                                                                                \
3247   pld [block_ptr_load_a];                                                      \
3248   vmovn.u16 texels_r, texels;                                                  \
3249   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3250   /* g and b still carry a << 3, so * 4 and * 128 give << 5 and << 10 */       \
3251   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3252   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3253   shade_blocks_textured_modulated_store_draw_mask_##target(-4);                \
3254                                                                                \
3255   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3256   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3257                                                                                \
3258   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3259   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3260                                                                                \
3261   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3262   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3263                                                                                \
3264   shade_blocks_textured_modulated_store_pixels_##target();                     \
3265   vshr.u8 texels_b, texels_b, #3;                                              \
3266                                                                                \
3267   shade_blocks_textured_modulate_##dithering(r);                               \
3268   shade_blocks_textured_modulate_##dithering(g);                               \
3269   shade_blocks_textured_modulate_##dithering(b);                               \
3270                                                                                \
3271   vand.u16 pixels, texels, d128_0x8000;                                        \
3272   vceq.u16 zero_mask, texels, #0;                                              \
3273                                                                                \
3274   subs num_blocks, num_blocks, #1;                                             \
3275                                                                                \
3276   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3277   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3278   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3279                                                                                \
3280   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3281   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3282   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3283   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3284                                                                                \
3285   bne 0b;                                                                      \
3286   /* drain: pack and store the last block */                                   \
3287  1:                                                                            \
3288   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3289   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3290   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3291                                                                                \
3292   shade_blocks_textured_modulated_store_draw_mask_##target(28);                \
3293   shade_blocks_textured_modulated_store_pixels_##target();                     \
3294                                                                                \
3295   ldmia sp!, { r4 - r5, lr };                                                  \
3296   restore_abi_regs();                                                          \
3297   bx lr                                                                        \
3298
3299
/*
 * Instantiate all eight variants:
 * {shaded, unshaded} x {dithered, undithered} x {direct, indirect}.
 * Each expansion defines
 * shade_blocks_<shading>_textured_modulated_<dithering>_<target>.
 */
3300 shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3301 shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3302 shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3303 shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3304
3305 shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3306 shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3307 shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3308 shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3309
3310
/*
 * Register aliases for the shade_blocks_textured_unmodulated_* routines.
 *
 * Overlaps to keep in mind:
 *   - block_ptr_load is r0, the same register as psx_gpu;
 *   - mask_msb_ptr and c_64 are both r2;
 *   - color_ptr, draw_mask_store_ptr and fb_ptr are all r3;
 *   - draw_mask_bits_ptr and draw_mask_ptr are both r12;
 *   - pixel_store_ptr and fb_ptr_next are both r14 (lr).
 *
 * NOTE(review): the modulated section spells its color aliases colors_r /
 * colors_g / colors_b; color_r / color_g / color_b below are presumably
 * left over from an earlier section and are harmless if never defined.
 */
3311 #undef c_64
3312 #undef fb_ptr
3313 #undef color_ptr
3314
3315 #undef color_r
3316 #undef color_g
3317 #undef color_b
3318
3319 #undef test_mask
3320 #undef pixels
3321 #undef draw_mask
3322 #undef zero_mask
3323 #undef fb_pixels
3324 #undef msb_mask
3325 #undef msb_mask_low
3326 #undef msb_mask_high
3327
3328 #define psx_gpu                                  r0
3329 #define num_blocks                               r1
3330 #define mask_msb_ptr                             r2
3331 #define color_ptr                                r3
3332
3333 #define block_ptr_load                           r0
3334 #define draw_mask_store_ptr                      r3
3335 #define draw_mask_bits_ptr                       r12
3336 #define draw_mask_ptr                            r12
3337 #define pixel_store_ptr                          r14
3338
3339 #define fb_ptr_cmp                               r4
3340
3341 #define fb_ptr                                   r3
3342 #define fb_ptr_next                              r14
3343
3344 #define c_64                                     r2
3345
/* q4-q7 (d8-d15) are callee-saved under AAPCS; see save_abi_regs() */
3346 #define test_mask                                q0
3347 #define pixels                                   q1
3348 #define draw_mask                                q2
3349 #define zero_mask                                q3
3350 #define draw_mask_combined                       q4
3351 #define fb_pixels                                q5
3352 #define fb_pixels_next                           q6
3353 #define msb_mask                                 q7
3354
/* d4/d5 are the halves of draw_mask (q2), d14/d15 those of msb_mask (q7) */
3355 #define draw_mask_low                            d4
3356 #define draw_mask_high                           d5
3357 #define msb_mask_low                             d14
3358 #define msb_mask_high                            d15
3359
/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * In:  r0 = psx_gpu
 *
 * For each of the num_blocks 64-byte blocks starting at
 * psx_gpu + psx_gpu_blocks_offset:
 *   - copy the eight 16-bit pixels at +0 to +16 unchanged;
 *   - broadcast the 16-bit value at +40 to all lanes, test it against
 *     test_mask (128 bits read from offset 0 of the psx_gpu struct) and OR
 *     in the lanes whose pixel is 0;
 *   - store that combined draw mask at +0, replacing the source pixels.
 *
 * The loop is software pipelined: the mask of block N-1 is combined and
 * stored while block N is being loaded, and label 1 writes the mask of the
 * last block.  num_blocks is decremented before it is first tested, so the
 * code presumably relies on num_blocks >= 1 (TODO confirm against callers).
 *
 * r4 is saved and restored although nothing in this routine uses it.
 * draw_mask_combined is q4, which is only preserved when save_abi_regs()
 * expands to a vpush (disabled by the "#if 0" at the top of the file).
 */
3360 .align 3
3361 function(shade_blocks_textured_unmodulated_indirect)
3362   stmdb sp!, { r4, r14 }
3363   save_abi_regs()
3364   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3365
3366   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3367   add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3368
3369   vld1.u32 { test_mask }, [psx_gpu, :128]
3370   add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset
3371
3372   mov c_64, #64
/* block_ptr_load is r0: psx_gpu is gone from here on */
3373   add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3374
/* first block: load, test and copy; its mask is stored one step later */
3375   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3376   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3377    [draw_mask_bits_ptr, :16], c_64
3378   vceq.u16 zero_mask, pixels, #0
3379
3380   vtst.u16 draw_mask, draw_mask, test_mask
3381   vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64
3382
3383   subs num_blocks, num_blocks, #1
3384   beq 1f
3385
3386  0:
3387   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
/* combine the previous block's masks before they are overwritten below */
3388   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3389
3390   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3391    [draw_mask_bits_ptr, :16], c_64
3392   vceq.u16 zero_mask, pixels, #0
3393
3394   vtst.u16 draw_mask, draw_mask, test_mask
3395   vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64
3396
3397   vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
3398   subs num_blocks, num_blocks, #1
3399
3400   bne 0b
3401
3402  1:
/* drain: mask of the last block */
3403   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3404   vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
3405
3406   restore_abi_regs()
3407   ldmia sp!, { r4, pc }
3408
3409
/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In:  r0 = psx_gpu
 *
 * Walks num_blocks 64-byte block entries starting at
 * psx_gpu + psx_gpu_blocks_offset. For each entry the eight 16-bit texels at
 * entry offset 0 are ORed with msb_mask (the 16-bit value at
 * psx_gpu_mask_msb_offset replicated to every lane) and written to the
 * address stored at entry offset 44. A destination lane is left unchanged
 * when its texel is zero, or when the 16-bit word at entry offset 40 shares
 * a set bit with that lane of test_mask (the first 16 bytes of psx_gpu).
 *
 * The loop is software pipelined: normally the destination row of entry N+1
 * is read before the row of entry N is written. When the two destination
 * addresses are within 14 bytes of each other the rows overlap, so path 4
 * writes row N first and only then reads row N+1.
 * NOTE(review): the first entry is loaded before the count is tested, so
 * num_blocks is assumed to be at least 1 - confirm against the callers.
 */
3410 .align 3
3411
3412 function(shade_blocks_textured_unmodulated_direct)
3413   stmdb sp!, { r4, r14 }
3414   save_abi_regs()
3415   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3416
3417   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3418   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3419
       /* mask_msb_ptr and c_64 share r2: load msb_mask before r2 is reused */
3420   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
3421   mov c_64, #64
3422
3423   vld1.u32 { test_mask }, [psx_gpu, :128]
       /* block_ptr_load shares r0 with psx_gpu: psx_gpu is gone after this */
3424   add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3425
       /* prime the pipeline with entry 0 */
3426   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3427    [draw_mask_bits_ptr, :16], c_64
3428   ldr fb_ptr_next, [block_ptr_load, #44]
3429
3430   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3431   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3432   vceq.u16 zero_mask, pixels, #0
3433   vtst.u16 draw_mask, draw_mask, test_mask
3434
3435   subs num_blocks, num_blocks, #1
3436   beq 1f
3437
3438  0:
3439   mov fb_ptr, fb_ptr_next
3440   ldr fb_ptr_next, [block_ptr_load, #44]
3441
       /* zero_mask was taken from the raw texels, before this OR */
3442   vorr.u16 pixels, pixels, msb_mask
3443
3444   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3445   vmov fb_pixels, fb_pixels_next
3446
3447   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3448    [draw_mask_bits_ptr, :16], c_64
       /* take the texel wherever the combined mask is clear */
3449   vbif.u16 fb_pixels, pixels, draw_mask_combined
3450
       /* (fb_ptr_next - fb_ptr + 14) <= 28 unsigned <=> rows overlap */
3451   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3452   pld [fb_ptr_next, #64]
3453
3454   add fb_ptr_cmp, fb_ptr_cmp, #14
3455   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3456
3457   cmp fb_ptr_cmp, #28
3458   bls 4f
3459
3460   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3461   vceq.u16 zero_mask, pixels, #0
3462
3463   vst1.u16 { fb_pixels }, [fb_ptr]
3464   vtst.u16 draw_mask, draw_mask, test_mask
3465
3466  3:
3467   subs num_blocks, num_blocks, #1
3468   bne 0b
3469
3470  1:
       /*
        * Last (or only) row. The loop applies msb_mask at the top of the
        * following iteration, which never runs for this row, so it has to be
        * applied here; otherwise the final row is written without the mask
        * bit that every other row receives.
        */
3471   vorr.u16 draw_mask_combined, draw_mask, zero_mask
       vorr.u16 pixels, pixels, msb_mask
3472   vbif.u16 fb_pixels_next, pixels, draw_mask_combined
3473
3474   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
3475
3476   restore_abi_regs()
3477   ldmia sp!, { r4, pc }
3478
3479  4:
       /* overlap path: write row N before reading row N+1 */
3480   vst1.u16 { fb_pixels }, [fb_ptr]
3481   vceq.u16 zero_mask, pixels, #0
3482
3483   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3484   vtst.u16 draw_mask, draw_mask, test_mask
3485
3486   bal 3b
3487
3488
/*
 * shade_blocks_unshaded_untextured_indirect
 *
 * Does nothing: returns to the caller immediately without touching any
 * register or memory.
 */
3489 function(shade_blocks_unshaded_untextured_indirect)
3490   bx lr
3491
/*
 * shade_blocks_unshaded_untextured_direct
 *
 * In:  r0 = psx_gpu
 *
 * Walks num_blocks 64-byte block entries starting at
 * psx_gpu + psx_gpu_blocks_offset. One row of eight 16-bit values is read
 * once from offset 16 of the first entry, ORed with msb_mask (the 16-bit
 * value at psx_gpu_mask_msb_offset replicated to every lane) and then used
 * for every entry: it is written to the address stored at entry offset 44,
 * replacing destination bits wherever the 128-bit mask at entry offset 0 is
 * clear.
 *
 * The loop is software pipelined: normally the destination row of entry N+1
 * is read before the row of entry N is written. When the two destination
 * addresses are within 14 bytes of each other the rows overlap, so path 4
 * writes row N first and only then reads row N+1.
 * NOTE(review): the first entry is loaded before the count is tested, so
 * num_blocks is assumed to be at least 1 - confirm against the callers.
 */
3492 .align 3
3493
3494 function(shade_blocks_unshaded_untextured_direct)
3495   stmdb sp!, { r4, r14 }
3496   save_abi_regs()
3497   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
3498
3499   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3500   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3501
3502   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
3503   add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3504
       /* block_ptr_load shares r0 with psx_gpu: psx_gpu is gone after this */
3505   add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
       /* color_ptr shares r3 with fb_ptr; this is its only use */
3506   vld1.u16 { pixels }, [color_ptr, :128]
3507
3508   mov c_64, #64
3509   vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64
3510
3511   vorr.u16 pixels, pixels, msb_mask
3512   subs num_blocks, num_blocks, #1
3513
       /* the flags set by subs above are still intact at the beq below */
3514   ldr fb_ptr_next, [block_ptr_load], #64
3515
3516   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3517   beq 1f
3518
3519  0:
3520   vmov fb_pixels, fb_pixels_next
3521   mov fb_ptr, fb_ptr_next
3522   ldr fb_ptr_next, [block_ptr_load], #64
3523
3524   vbif.u16 fb_pixels, pixels, draw_mask
3525   vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64
3526
       /* (fb_ptr_next - fb_ptr + 14) <= 28 unsigned <=> rows overlap */
3527   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3528   add fb_ptr_cmp, fb_ptr_cmp, #14
3529   cmp fb_ptr_cmp, #28
3530   bls 4f
3531
3532   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3533   vst1.u16 { fb_pixels }, [fb_ptr]
3534
3535  3:
3536   subs num_blocks, num_blocks, #1
3537   bne 0b
3538
3539  1:
       /* last (or only) row */
3540   vbif.u16 fb_pixels_next, pixels, draw_mask
3541   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
3542
3543   restore_abi_regs()
3544   ldmia sp!, { r4, pc }
3545
3546  4:
       /* overlap path: write row N before reading row N+1 */
3547   vst1.u16 { fb_pixels }, [fb_ptr]
3548   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3549   bal 3b
3550
3551
/*
 * Register aliases for the blend_blocks_* builders that follow. The core
 * register names used by the shade_blocks_* routines above are released and
 * rebound first.
 */
3552 #undef draw_mask_ptr
3553 #undef c_64
3554 #undef fb_ptr
3555 #undef fb_ptr_next
3556 #undef fb_ptr_cmp
3557
/*
 * Core registers. psx_gpu and draw_mask_ptr share r0; msb_mask_ptr and c_64
 * share r2.
 * NOTE(review): msb_mask_ptr is defined here, but the builders visible below
 * use mask_msb_ptr, which is still bound to r2 by the earlier alias block.
 */
3558 #define psx_gpu                                  r0
3559 #define num_blocks                               r1
3560 #define msb_mask_ptr                             r2
3561 #define pixel_ptr                                r3
3562 #define draw_mask_ptr                            r0
3563 #define c_64                                     r2
3564 #define fb_ptr                                   r12
3565 #define fb_ptr_next                              r14
3566 #define fb_ptr_cmp                               r4
3567
3568 #undef msb_mask
3569 #undef draw_mask
3570 #undef pixels
3571 #undef fb_pixels
3572 #undef d128_0x8000
3573 #undef msb_mask_low
3574 #undef msb_mask_high
3575 #undef draw_mask_next
3576 #undef pixels_g
3577 #undef blend_pixels
3578 #undef fb_pixels_next
3579
/* NEON quad registers: first group of names */
3580 #define msb_mask                                 q0
3581 #define draw_mask                                q1
3582 #define pixels                                   q2
3583 #define fb_pixels                                q3
3584 #define blend_pixels                             q4
3585 #define pixels_no_msb                            q5
3586 #define blend_mask                               q6
3587 #define fb_pixels_no_msb                         q7
3588 #define d128_0x8000                              q8
3589 #define d128_0x0421                              q9
3590 #define fb_pixels_next                           q10
3591 #define blend_pixels_next                        q11
3592 #define pixels_next                              q12
3593 #define draw_mask_next                           q13
3594 #define write_mask                               q14
3595
/*
 * Second group of names. These reuse q5 and q7-q13 from the group above
 * (e.g. q8 is both d128_0x8000 and d128_0x7C1F, q10 is both fb_pixels_next
 * and fb_pixels_rb), so a routine can only use one name per register at a
 * time.
 */
3596 #define pixels_rb                                q5
3597 #define pixels_mg                                q7
3598 #define pixels_g                                 q7
3599 #define d128_0x7C1F                              q8
3600 #define d128_0x03E0                              q9
3601 #define fb_pixels_rb                             q10
3602 #define fb_pixels_g                              q11
3603 #define fb_pixels_masked                         q11
3604 #define d128_0x83E0                              q15
3605 #define pixels_fourth                            q7
3606 #define d128_0x1C07                              q12
3607 #define d128_0x00E0                              q13
3608 #define d128_0x80E0                              q13
3609
/* d0/d1 are the two halves of q0 (msb_mask) */
3610 #define msb_mask_low                             d0
3611 #define msb_mask_high                            d1
3612
/*
 * Per-variant fragments spliced into blend_blocks_average_builder via token
 * pasting on its texturing (textured/untextured) and mask_evaluate (on/off)
 * arguments.
 */

/* textured: blend_mask lane = all ones where bit 15 of the source lane is set */
3613 #define blend_blocks_average_set_blend_mask_textured(source)                   \
3614   vclt.s16 blend_mask, source, #0                                              \
3615
/* textured: set bit 15 in every lane of blend_pixels */
3616 #define blend_blocks_average_set_stp_bit_textured()                            \
3617   vorr.u16 blend_pixels, #0x8000                                               \
3618
/* textured: where blend_mask is clear, replace blend_pixels by the source */
3619 #define blend_blocks_average_combine_textured(source)                          \
3620   vbif.u16 blend_pixels, source, blend_mask                                    \
3621   
/* untextured: the three fragments above expand to nothing */
3622 #define blend_blocks_average_set_blend_mask_untextured(source)                 \
3623
3624 #define blend_blocks_average_set_stp_bit_untextured()                          \
3625
3626 #define blend_blocks_average_combine_untextured(source)                        \
3627
/* on: write_mask lane = all ones where bit 15 of fb_pixels_next is set */
3628 #define blend_blocks_average_mask_set_on()                                     \
3629   vclt.s16 write_mask, fb_pixels_next, #0                                      \
3630
/* on: draw_mask for the current row = fetched mask OR write_mask */
3631 #define blend_blocks_average_mask_copy_on()                                    \
3632   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
3633
/* on: fold write_mask into draw_mask_next in place (used for the final row) */
3634 #define blend_blocks_average_mask_copy_b_on()                                  \
3635   vorr.u16 draw_mask_next, draw_mask_next, write_mask                          \
3636
/* off: no write_mask is computed ... */
3637 #define blend_blocks_average_mask_set_off()                                    \
3638
/* ... the fetched mask is used unchanged ... */
3639 #define blend_blocks_average_mask_copy_off()                                   \
3640   vmov draw_mask, draw_mask_next                                               \
3641
/* ... and nothing is folded into draw_mask_next */
3642 #define blend_blocks_average_mask_copy_b_off()                                 \
3643
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>, taking r0 = psx_gpu.
 *
 * The emitted routine walks num_blocks 64-byte block entries starting at
 * psx_gpu + psx_gpu_blocks_offset. For each entry it reads a 128-bit draw
 * mask from entry offset 0, eight 16-bit source pixels from entry offset 16
 * and a destination address from entry offset 44 (pixel_ptr + 28), then per
 * 16-bit lane computes
 *
 *   blend = vhadd(fb & ~0x8000, (pix & ~0x8000) - ((pix ^ fb) & 0x0421))
 *
 * i.e. a halving add after removing, from the source, the low bit of each
 * 5-bit field in which source and destination differ. The textured variant
 * then forces bit 15 on and substitutes the unblended source in lanes whose
 * source bit 15 is clear. msb_mask (16-bit value at psx_gpu_mask_msb_offset,
 * replicated) is ORed in and the result replaces destination bits wherever
 * the draw mask is clear. With mask_evaluate = on the draw mask is first
 * widened by write_mask, which is set for destination lanes whose bit 15 is
 * already set.
 *
 * The loop is software pipelined: row N+1 is read and pre-processed before
 * row N is written, unless the two destination addresses are within 14 bytes
 * of each other, in which case path 2 writes row N before reading row N+1.
 * write_mask always has to be derived from the most recently loaded
 * fb_pixels_next; path 2 therefore re-evaluates it after its reload, the
 * same as the main path does.
 *
 * NOTE(review): the first entry is loaded before the count is tested, so
 * num_blocks is assumed to be at least 1 - confirm against the callers.
 */
3644 #define blend_blocks_average_builder(texturing, mask_evaluate)                 \
3645 .align 3;                                                                      \
3646                                                                                \
3647 function(blend_blocks_##texturing##_average_##mask_evaluate)                   \
3648   stmdb sp!, { r4, r14 };                                                      \
3649   save_abi_regs();                                                             \
3650   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3651   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3652                                                                                \
3653   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3654   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
3655                                                                                \
3656   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3657   mov c_64, #64;                                                               \
3658                                                                                \
3659   vmov.u16 d128_0x8000, #0x8000;                                               \
3660   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
3661   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3662                                                                                \
3663   vmov.u16 d128_0x0421, #0x0400;                                               \
3664   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
3665                                                                                \
3666   vorr.u16 d128_0x0421, #0x0021;                                               \
3667   vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
3668                                                                                \
3669   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3670   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3671   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3672   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3673   blend_blocks_average_mask_set_##mask_evaluate();                             \
3674   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3675                                                                                \
3676   subs num_blocks, num_blocks, #1;                                             \
3677   beq 1f;                                                                      \
3678                                                                                \
3679  0:                                                                            \
3680   mov fb_ptr, fb_ptr_next;                                                     \
3681   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3682                                                                                \
3683   vmov pixels, pixels_next;                                                    \
3684   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
3685                                                                                \
3686   vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
3687                                                                                \
3688   blend_blocks_average_mask_copy_##mask_evaluate();                            \
3689   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
3690                                                                                \
3691   blend_blocks_average_set_blend_mask_##texturing(pixels);                     \
3692   blend_blocks_average_set_stp_bit_##texturing();                              \
3693   vmov fb_pixels, fb_pixels_next;                                              \
3694   blend_blocks_average_combine_##texturing(pixels);                            \
3695                                                                                \
       /* |fb_ptr_next - fb_ptr| <= 14: rows overlap, use path 2 */                 \
3696   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3697   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3698   cmp fb_ptr_cmp, #28;                                                         \
3699   bls 2f;                                                                      \
3700                                                                                \
3701   vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
3702   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3703                                                                                \
3704   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3705   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3706                                                                                \
3707   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3708   vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
3709                                                                                \
3710   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3711   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3712   blend_blocks_average_mask_set_##mask_evaluate();                             \
3713   vst1.u16 { fb_pixels }, [fb_ptr];                                            \
3714                                                                                \
3715  3:                                                                            \
3716   subs num_blocks, num_blocks, #1;                                             \
3717   bne 0b;                                                                      \
3718                                                                                \
3719  1:                                                                            \
       /* final (or only) row */                                                    \
3720   blend_blocks_average_mask_copy_b_##mask_evaluate();                          \
3721   vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
3722                                                                                \
3723   blend_blocks_average_set_blend_mask_##texturing(pixels_next);                \
3724   blend_blocks_average_set_stp_bit_##texturing();                              \
3725   blend_blocks_average_combine_##texturing(pixels_next);                       \
3726                                                                                \
3727   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3728   vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next;                       \
3729   vst1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
3730                                                                                \
3731   restore_abi_regs();                                                          \
3732   ldmia sp!, { r4, pc };                                                       \
3733                                                                                \
3734  2:                                                                            \
       /* overlap path: store row N before reloading row N+1 */                     \
3735   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3736   vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
3737   vst1.u16 { fb_pixels }, [fb_ptr];                                            \
3738                                                                                \
3739   vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
       /* write_mask must follow the reloaded row, as in the main path */           \
       blend_blocks_average_mask_set_##mask_evaluate();                             \
3740   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3741   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3742   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3743   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3744   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3745                                                                                \
3746   bal 3b                                                                       \
3747
/* instantiate all four texturing x mask_evaluate combinations */
3748 blend_blocks_average_builder(textured, off)
3749 blend_blocks_average_builder(untextured, off)
3750 blend_blocks_average_builder(textured, on)
3751 blend_blocks_average_builder(untextured, on)
3752
3753
/*
 * Per-variant fragments spliced into the blend_blocks_*_add builders via
 * token pasting on their mask_evaluate (on/off) argument.
 */

/* on: write_mask lane = all ones where bit 15 of fb_pixels is set */
3754 #define blend_blocks_add_mask_set_on()                                         \
3755   vclt.s16 write_mask, fb_pixels, #0                                           \
3756
/* on: widen draw_mask by write_mask */
3757 #define blend_blocks_add_mask_copy_on()                                        \
3758   vorr.u16 draw_mask, draw_mask, write_mask                                    \
3759
/* off: both fragments expand to nothing, draw_mask is used as loaded */
3760 #define blend_blocks_add_mask_set_off()                                        \
3761
3762 #define blend_blocks_add_mask_copy_off()                                       \
3763
3764
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>, taking r0 = psx_gpu.
 *
 * The emitted routine walks num_blocks 64-byte block entries starting at
 * psx_gpu + psx_gpu_blocks_offset. For each entry it reads a 128-bit draw
 * mask from entry offset 0, eight 16-bit source pixels from entry offset 16
 * and a destination address from entry offset 44 (pixel_ptr + 28). Per lane:
 *   - blend_mask is set where source bit 15 is set; the destination pixel is
 *     ANDed with it, so lanes with source bit 15 clear add against zero;
 *   - the 0x7C1F fields of source and masked destination are added and
 *     clamped per byte to 0x7C1F (vmin.u8);
 *   - the 0x03E0 field of the masked destination is added to the 0x83E0
 *     fields of (source | msb_mask) and clamped to 0x83E0 (vmin.u16);
 *   - the two partial results are ORed together, and destination lanes whose
 *     draw mask is set keep their original contents (vbit).
 * With mask_evaluate = on the draw mask is first widened by write_mask, set
 * for destination lanes whose bit 15 is already set.
 *
 * The loop is software pipelined: row N+1 is read and pre-processed before
 * row N is written, unless the two destination addresses are within 14 bytes
 * of each other, in which case path 2 writes row N before reading row N+1.
 *
 * NOTE(review): the first entry is loaded before the count is tested, so
 * num_blocks is assumed to be at least 1 - confirm against the callers.
 */
3765 #define blend_blocks_add_textured_builder(mask_evaluate)                       \
3766 .align 3;                                                                      \
3767                                                                                \
3768 function(blend_blocks_textured_add_##mask_evaluate)                            \
3769   stmdb sp!, { r4, r14 };                                                      \
3770   save_abi_regs();                                                             \
3771   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3772   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3773                                                                                \
3774   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3775   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
3776                                                                                \
3777   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3778   mov c_64, #64;                                                               \
3779                                                                                \
3780   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3781   vmov.u16 d128_0x03E0, #0x0300;                                               \
3782   vmov.u16 d128_0x83E0, #0x8000;                                               \
3783   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3784   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3785   vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0;                              \
3786                                                                                \
3787   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3788   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3789   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3790   vclt.s16 blend_mask, pixels, #0;                                             \
3791   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3792   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3793   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3794                                                                                \
3795   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3796   vorr.u16 pixels, pixels, msb_mask;                                           \
3797   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3798   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
3799   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3800   vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
3801   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3802   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
3803   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3804   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
3805                                                                                \
3806   subs num_blocks, num_blocks, #1;                                             \
3807   beq 1f;                                                                      \
3808                                                                                \
3809  0:                                                                            \
3810   mov fb_ptr, fb_ptr_next;                                                     \
3811                                                                                \
3812   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3813                                                                                \
3814   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3815   vclt.s16 blend_mask, pixels, #0;                                             \
3816                                                                                \
3817   vorr.u16 pixels, pixels, msb_mask;                                           \
3818   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3819   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
3820                                                                                \
       /* |fb_ptr_next - fb_ptr| <= 14: rows overlap, use path 2 */                 \
3821   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3822   pld [fb_ptr_next, #64];                                                      \
3823                                                                                \
3825   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3826                                                                                \
3827   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3828   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3829                                                                                \
3830   cmp fb_ptr_cmp, #28;                                                         \
3831   bls 2f;                                                                      \
3832                                                                                \
3833   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3834   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3835   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3836   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3837   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3838   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3839   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3840                                                                                \
3841  3:                                                                            \
3842   vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
3843   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3844   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
3845   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3846   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
3847                                                                                \
3848   subs num_blocks, num_blocks, #1;                                             \
3849   bne 0b;                                                                      \
3850                                                                                \
3851  1:                                                                            \
       /* final (or only) row */                                                    \
3852   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3853   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3854   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
3855                                                                                \
3856   restore_abi_regs();                                                          \
3857   ldmia sp!, { r4, pc };                                                       \
3858                                                                                \
3859  2:                                                                            \
       /* overlap path: store row N before reloading row N+1 */                     \
3860   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3861   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3862                                                                                \
3863   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3864   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3865   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3866   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3867   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3868   bal 3b                                                                       \
3869
3870
3871 #define blend_blocks_add_untextured_builder(mask_evaluate)                     \
3872 .align 3;                                                                      \
3873                                                                                \
3874 function(blend_blocks_untextured_add_##mask_evaluate)                          \
3875   stmdb sp!, { r4, r14 };                                                      \
3876   save_abi_regs();                                                             \
3877   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3878   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3879                                                                                \
3880   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3881   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
3882                                                                                \
3883   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3884   mov c_64, #64;                                                               \
3885                                                                                \
3886   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3887   vmov.u16 d128_0x03E0, #0x0300;                                               \
3888   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3889   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3890                                                                                \
3891   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3892   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3893   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3894   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3895   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3896   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3897                                                                                \
3898   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3899   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3900   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3901   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3902   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3903   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3904   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3905   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3906                                                                                \
3907   subs num_blocks, num_blocks, #1;                                             \
3908   beq 1f;                                                                      \
3909                                                                                \
3910  0:                                                                            \
3911   mov fb_ptr, fb_ptr_next;                                                     \
3912                                                                                \
3913   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3914                                                                                \
3915   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3916                                                                                \
3917   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3918   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3919   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3920                                                                                \
3921   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3922   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3923                                                                                \
3924   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3925   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3926   cmp fb_ptr_cmp, #28;                                                         \
3927   bls 2f;                                                                      \
3928                                                                                \
3929   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3930   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3931   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3932   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3933   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3934   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3935                                                                                \
3936  3:                                                                            \
3937   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3938   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3939   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3940   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3941   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3942                                                                                \
3943   subs num_blocks, num_blocks, #1;                                             \
3944   bne 0b;                                                                      \
3945                                                                                \
3946  1:                                                                            \
3947   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3948   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3949   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3950   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
3951                                                                                \
3952   restore_abi_regs();                                                          \
3953   ldmia sp!, { r4, pc };                                                       \
3954                                                                                \
3955  2:                                                                            \
3956   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3957   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3958                                                                                \
3959   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3960   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3961   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3962   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3963   bal 3b                                                                       \
3964
3965
/* Instantiate the textured and untextured add builders, once with the
 * argument off and once with the argument on. */
3966 blend_blocks_add_textured_builder(off)
3967 blend_blocks_add_textured_builder(on)
3968 blend_blocks_add_untextured_builder(off)
3969 blend_blocks_add_untextured_builder(on)
3970
/* Textured hook: blend_mask becomes all-ones in every 16-bit lane of
 * pixels_next that is negative as a signed value (bit 15 set), else zero. */
3971 #define blend_blocks_subtract_set_blend_mask_textured()                        \
3972   vclt.s16 blend_mask, pixels_next, #0                                         \
3973
/* Textured hook: wherever blend_mask bits are clear, the bits of `pixels`
 * replace the blended result. */
3974 #define blend_blocks_subtract_combine_textured()                               \
3975   vbif.u16 blend_pixels, pixels, blend_mask                                    \
3976
/* Textured hook: set bit 15 in every lane of the blended result. */
3977 #define blend_blocks_subtract_set_stp_textured()                               \
3978   vorr.u16 blend_pixels, #0x8000                                               \
3979
/* Textured hook: pixels = pixels_next with the msb_mask bits OR-ed in. */
3980 #define blend_blocks_subtract_msb_mask_textured()                              \
3981   vorr.u16 pixels, pixels_next, msb_mask                                       \
3982
/* Untextured hook: expands to nothing (no blend mask is computed). */
3983 #define blend_blocks_subtract_set_blend_mask_untextured()                      \
3984
/* Untextured hook: expands to nothing (the blended result is used as is). */
3985 #define blend_blocks_subtract_combine_untextured()                             \
3986
/* Untextured hook: OR the msb_mask bits into the blended result. */
3987 #define blend_blocks_subtract_set_stp_untextured()                             \
3988   vorr.u16 blend_pixels, blend_pixels, msb_mask                                \
3989
/* Untextured hook: expands to nothing. */
3990 #define blend_blocks_subtract_msb_mask_untextured()                            \
3991
3992
/* Mask evaluation on: write_mask becomes all-ones in every 16-bit lane of
 * fb_pixels that has bit 15 set (signed compare against zero). */
3993 #define blend_blocks_subtract_mask_set_on()                                    \
3994   vclt.s16 write_mask, fb_pixels, #0                                           \
3995
/* Mask evaluation on: draw_mask = draw_mask_next OR write_mask. */
3996 #define blend_blocks_subtract_mask_copy_on()                                   \
3997   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
3998
/* Mask evaluation off: expands to nothing (write_mask is not computed). */
3999 #define blend_blocks_subtract_mask_set_off()                                   \
4000
/* Mask evaluation off: draw_mask is a plain copy of draw_mask_next. */
4001 #define blend_blocks_subtract_mask_copy_off()                                  \
4002   vmov draw_mask, draw_mask_next                                               \
4003
4004
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Expands to the function blend_blocks_<texturing>_subtract_<mask_evaluate>.
 * psx_gpu is the base for every offset used. The function reads the 16-bit
 * block count at psx_gpu_num_blocks_offset and walks that many block records,
 * 64 bytes apart, starting at psx_gpu_blocks_offset. As accessed here, a
 * record holds a draw mask at +0, pixels at +16 and a framebuffer pointer
 * that is loaded from [pixel_ptr, #28].
 *
 * Per block, the 0x7C1F and 0x03E0 bit fields of the block pixels are
 * subtracted from the same fields of the framebuffer pixels with unsigned
 * saturation (per byte for 0x7C1F, per halfword for 0x03E0). The fields are
 * recombined, the texturing hooks adjust the result, and bits selected by
 * draw_mask keep the framebuffer value before the result is stored.
 *
 * texturing:     textured or untextured; selects the
 *                blend_blocks_subtract_*_<texturing>() hooks.
 * mask_evaluate: on or off; selects the
 *                blend_blocks_subtract_mask_*_<mask_evaluate>() hooks.
 *
 * The loop finishes one block while it loads the next, so statement order
 * is significant. The count is decremented before it is tested, so a count
 * of zero is not handled here.
 * NOTE(review): presumably callers only run this with at least one block
 * queued - confirm.
 */
4005 #define blend_blocks_subtract_builder(texturing, mask_evaluate)                \
4006 .align 3;                                                                      \
4007   /* Entry point: save r4 and the return address. */                           \
4008 function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
4009   stmdb sp!, { r4, r14 };                                                      \
4010   save_abi_regs();                                                             \
4011   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4012   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4013   /* Pixels sit 16 bytes into each record; msb_mask is one broadcast u16. */   \
4014   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4015   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4016   /* Draw masks start at +0; c_64 is the 64-byte record stride. */             \
4017   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4018   mov c_64, #64;                                                               \
4019   /* Build the 0x7C1F and 0x03E0 field masks (vmov then vorr). */              \
4020   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4021   vmov.u16 d128_0x03E0, #0x0300;                                               \
4022   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4023   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4024   /* Preload the first record and its framebuffer pixels. */                   \
4025   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
4026   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4027   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
4028   blend_blocks_subtract_set_blend_mask_##texturing();                          \
4029   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4030   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4031   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4032   /* Split both operands by field and subtract with saturation. */             \
4033   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4034   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4035   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4036   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4037   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4038   /* A single block skips the loop and goes straight to the tail. */           \
4039   subs num_blocks, num_blocks, #1;                                             \
4040   beq 1f;                                                                      \
4041   /* Loop: finish the current block while fetching the next one. */            \
4042  0:                                                                            \
4043   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4044   mov fb_ptr, fb_ptr_next;                                                     \
4045   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4046                                                                                \
4047   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
4048   blend_blocks_subtract_msb_mask_##texturing();                                \
4049                                                                                \
4050   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
4051   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4052   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4053   blend_blocks_subtract_set_stp_##texturing();                                 \
4054   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4055   blend_blocks_subtract_combine_##texturing();                                 \
4056   blend_blocks_subtract_set_blend_mask_##texturing();                          \
4057   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4058   /* Go to 2: when fb_ptr_next lies within +/-14 bytes of fb_ptr. */           \
4059   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4060   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4061   cmp fb_ptr_cmp, #28;                                                         \
4062   bls 2f;                                                                      \
4063   /* Far apart: load the next framebuffer pixels before storing. */            \
4064   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4065   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4066   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4067   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4068   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4069   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4070   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4071   /* Both paths rejoin here. */                                                \
4072  3:                                                                            \
4073   subs num_blocks, num_blocks, #1;                                             \
4074   bne 0b;                                                                      \
4075   /* Tail: blend and store the last block, then return. */                     \
4076  1:                                                                            \
4077   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4078                                                                                \
4079   blend_blocks_subtract_msb_mask_##texturing();                                \
4080   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4081   blend_blocks_subtract_set_stp_##texturing();                                 \
4082   blend_blocks_subtract_combine_##texturing();                                 \
4083   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4084   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4085                                                                                \
4086   restore_abi_regs();                                                          \
4087   ldmia sp!, { r4, pc };                                                       \
4088   /* Close together: store the pending result before reloading. */             \
4089  2:                                                                            \
4090   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4091   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4092   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4093   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4094   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4095   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4096   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4097   bal 3b                                                                       \
4098
4099
/* Instantiate the subtract blender for every texturing / mask_evaluate
 * pair; each line emits blend_blocks_<texturing>_subtract_<mask_evaluate>. */
4100 blend_blocks_subtract_builder(textured, off)
4101 blend_blocks_subtract_builder(textured, on)
4102 blend_blocks_subtract_builder(untextured, off)
4103 blend_blocks_subtract_builder(untextured, on)
4104
4105
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Expands to the function blend_blocks_textured_add_fourth_<mask_evaluate>.
 * psx_gpu is the base for every offset used. The function reads the 16-bit
 * block count at psx_gpu_num_blocks_offset and walks that many block records,
 * 64 bytes apart, starting at psx_gpu_blocks_offset (draw mask at +0, pixels
 * at +16, framebuffer pointer loaded from [pixel_ptr, #28]).
 *
 * Per block, the pixels are shifted right by two and masked with 0x1C07 and
 * 0x00E0, which leaves each 5-bit field divided by four at its original
 * position. Those values are added to the framebuffer fields and clamped
 * with vmin against 0x7C1F (per byte) and 0x03E0 (per halfword). 0x8000 is
 * OR-ed into the sum, lanes whose source pixel has bit 15 clear take the
 * source pixel instead, msb_mask is OR-ed in, and bits selected by draw_mask
 * keep the framebuffer value before the result is stored.
 *
 * mask_evaluate: on or off; selects the blend_blocks_add_mask_set_* and
 *                blend_blocks_add_mask_copy_* hooks, which are defined
 *                outside this macro.
 *
 * The loop finishes one block while it loads the next, so statement order
 * is significant. The count is decremented before it is tested, so a count
 * of zero is not handled here.
 * NOTE(review): presumably callers only run this with at least one block
 * queued - confirm.
 */
4106 #define blend_blocks_add_fourth_textured_builder(mask_evaluate)                \
4107 .align 3;                                                                      \
4108   /* Entry point: save r4 and the return address. */                           \
4109 function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
4110   stmdb sp!, { r4, r14 };                                                      \
4111   save_abi_regs();                                                             \
4112   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4113   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4114   /* Pixels sit 16 bytes into each record; msb_mask is one broadcast u16. */   \
4115   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4116   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4117   /* Draw masks start at +0; c_64 is the 64-byte record stride. */             \
4118   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4119   mov c_64, #64;                                                               \
4120   /* Field masks: 0x7C1F/0x03E0 full width, 0x1C07/0x00E0 after >> 2. */       \
4121   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4122   vmov.u16 d128_0x03E0, #0x0300;                                               \
4123   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4124   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4125   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4126   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4127   vorr.u16 d128_0x1C07, #0x0007;                                               \
4128   /* Preload the first record and its framebuffer pixels. */                   \
4129   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4130   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4131   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4132   vclt.s16 blend_mask, pixels, #0;                                             \
4133   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4134   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4135   vshr.s16 pixels_fourth, pixels, #2;                                          \
4136   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4137   /* Add the quartered fields and clamp each field to its maximum. */          \
4138   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4139   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4140   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4141   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4142   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4143   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4144   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4145   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4146   /* A single block skips the loop and goes straight to the tail. */           \
4147   subs num_blocks, num_blocks, #1;                                             \
4148   beq 1f;                                                                      \
4149   /* Loop: finish the current block while fetching the next one. */            \
4150  0:                                                                            \
4151   mov fb_ptr, fb_ptr_next;                                                     \
4152   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4153                                                                                \
4154   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4155   vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
4156   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4157                                                                                \
4158   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4159   vclt.s16 blend_mask, pixels, #0;                                             \
4160   vshr.s16 pixels_fourth, pixels, #2;                                          \
4161   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4162   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4163                                                                                \
4164   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4165   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4166   /* Go to 2: when fb_ptr_next lies within +/-14 bytes of fb_ptr. */           \
4167   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4168   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4169   cmp fb_ptr_cmp, #28;                                                         \
4170   bls 2f;                                                                      \
4171   /* Far apart: load the next framebuffer pixels before storing. */            \
4172   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4173   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4174   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4175   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4176   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4177   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4178   /* Both paths rejoin here. */                                                \
4179  3:                                                                            \
4180   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4181   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4182   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4183   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4184   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4185                                                                                \
4186   subs num_blocks, num_blocks, #1;                                             \
4187   bne 0b;                                                                      \
4188   /* Tail: blend and store the last block, then return. */                     \
4189  1:                                                                            \
4190   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4191   vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
4192   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4193   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4194   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4195   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4196                                                                                \
4197   restore_abi_regs();                                                          \
4198   ldmia sp!, { r4, pc };                                                       \
4199   /* Close together: store the pending result before reloading. */             \
4200  2:                                                                            \
4201   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4202   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4203                                                                                \
4204   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4205   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4206   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4207   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4208   bal 3b                                                                       \
4209
4210
4211
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Expands to the function blend_blocks_untextured_add_fourth_<mask_evaluate>.
 * psx_gpu is the base for every offset used. The function reads the 16-bit
 * block count at psx_gpu_num_blocks_offset and walks that many block records,
 * 64 bytes apart, starting at psx_gpu_blocks_offset (draw mask at +0, pixels
 * at +16, framebuffer pointer loaded from [pixel_ptr, #28]).
 *
 * Per block, the pixels are shifted right by two and masked with 0x1C07 and
 * 0x00E0, which leaves each 5-bit field divided by four at its original
 * position. Those values are added to the framebuffer fields and clamped
 * with vmin against 0x7C1F (per byte) and 0x03E0 (per halfword). msb_mask is
 * OR-ed into the sum, and bits selected by draw_mask keep the framebuffer
 * value before the result is stored.
 *
 * mask_evaluate: on or off; selects the blend_blocks_add_mask_set_* and
 *                blend_blocks_add_mask_copy_* hooks, which are defined
 *                outside this macro.
 *
 * The loop finishes one block while it loads the next, so statement order
 * is significant. The count is decremented before it is tested, so a count
 * of zero is not handled here.
 * NOTE(review): presumably callers only run this with at least one block
 * queued - confirm.
 */
4212 #define blend_blocks_add_fourth_untextured_builder(mask_evaluate)              \
4213 .align 3;                                                                      \
4214   /* Entry point: save r4 and the return address. */                           \
4215 function(blend_blocks_untextured_add_fourth_##mask_evaluate)                   \
4216   stmdb sp!, { r4, r14 };                                                      \
4217   save_abi_regs();                                                             \
4218   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4219   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4220   /* Pixels sit 16 bytes into each record; msb_mask is one broadcast u16. */   \
4221   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4222   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4223   /* Draw masks start at +0; c_64 is the 64-byte record stride. */             \
4224   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4225   mov c_64, #64;                                                               \
4226   /* Field masks: 0x7C1F/0x03E0 full width, 0x1C07/0x00E0 after >> 2. */       \
4227   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4228   vmov.u16 d128_0x03E0, #0x0300;                                               \
4229   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4230   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4231   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4232   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4233   vorr.u16 d128_0x1C07, #0x0007;                                               \
4234   /* Preload the first record and its framebuffer pixels. */                   \
4235   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4236   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4237   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4238   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4239   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4240   vshr.s16 pixels_fourth, pixels, #2;                                          \
4241   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4242   /* Add the quartered fields and clamp each field to its maximum. */          \
4243   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4244   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4245   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4246   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4247   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4248   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4249   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4250   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4251   /* A single block skips the loop and goes straight to the tail. */           \
4252   subs num_blocks, num_blocks, #1;                                             \
4253   beq 1f;                                                                      \
4254   /* Loop: finish the current block while fetching the next one. */            \
4255  0:                                                                            \
4256   mov fb_ptr, fb_ptr_next;                                                     \
4257   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4258                                                                                \
4259   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4260                                                                                \
4261   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4262   vshr.s16 pixels_fourth, pixels, #2;                                          \
4263   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4264   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4265                                                                                \
4266   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4267   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4268   /* Go to 2: when fb_ptr_next lies within +/-14 bytes of fb_ptr. */           \
4269   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4270   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4271   cmp fb_ptr_cmp, #28;                                                         \
4272   bls 2f;                                                                      \
4273   /* Far apart: load the next framebuffer pixels before storing. */            \
4274   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4275   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4276   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4277   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4278   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4279   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4280   /* Both paths rejoin here. */                                                \
4281  3:                                                                            \
4282   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4283   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4284   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4285   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4286   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4287                                                                                \
4288   subs num_blocks, num_blocks, #1;                                             \
4289   bne 0b;                                                                      \
4290   /* Tail: blend and store the last block, then return. */                     \
4291  1:                                                                            \
4292   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4293   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4294   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4295   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4296                                                                                \
4297   restore_abi_regs();                                                          \
4298   ldmia sp!, { r4, pc };                                                       \
4299   /* Close together: store the pending result before reloading. */             \
4300  2:                                                                            \
4301   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4302   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4303                                                                                \
4304   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4305   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4306   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4307   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4308   bal 3b                                                                       \
4309
4310
/* Instantiate the quarter-add blenders, textured and untextured, each with
 * mask_evaluate off and on. */
4311 blend_blocks_add_fourth_textured_builder(off)
4312 blend_blocks_add_fourth_textured_builder(on)
4313 blend_blocks_add_fourth_untextured_builder(off)
4314 blend_blocks_add_fourth_untextured_builder(on)
4315
4316 // TODO: Optimize this more. Need a scene that actually uses it for
4317 // confirmation..
4318
4319 .align 3
4320
// blend_blocks_textured_unblended_on
//
// Copies the pixels of every queued block to the framebuffer without
// blending, while keeping framebuffer pixels that already have bit 15 set.
//
// In:    psx_gpu = GPU state. Its num_blocks field must be at least 1: the
//        count is decremented before it is first tested, so 0 would wrap.
// Out:   nothing; framebuffer memory is updated in place.
// Stack: pushes r4 and r14 and returns by popping into pc. save_abi_regs()
//        and restore_abi_regs() are defined empty near the top of the file.
//
// Block layout used here (64-byte stride, relative to psx_gpu_blocks_offset):
//   +0  draw mask (128-bit aligned load)
//   +16 pixels    (128-bit aligned load)
//   +44 framebuffer pointer for the block (pixel_ptr + 28)
//
// The loop is software pipelined: the loads for the next block are issued
// right after the store of the current one, and label 1 finishes the last
// block that is still in flight.
4321 function(blend_blocks_textured_unblended_on)         
4322   stmdb sp!, { r4, r14 }
4323   save_abi_regs()
4324   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
4325   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
4326
4327   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
// Replicate the 16-bit mask_msb value into every lane of msb_mask.
4328   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
4329
4330   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
// Post-increment stride for both block pointers.
4331   mov c_64, #64
4332
// Prologue of the pipeline: fetch everything for the first block.
4333   ldr fb_ptr, [pixel_ptr, #28]
4334   vld1.u16 { fb_pixels }, [fb_ptr]
4335   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
// write_mask lane = all ones where the framebuffer pixel has bit 15 set.
4336   vclt.s16 write_mask, fb_pixels, #0
4337   vld1.u32 { pixels }, [pixel_ptr, :128], c_64
4338
// A single block goes straight to the epilogue.
4339   subs num_blocks, num_blocks, #1
4340   beq 1f
4341
4342  0:
// Force the configured msb bits on in the new pixels.
4343   vorr.u16 pixels, pixels, msb_mask
// Protect lanes that are masked by either the block or the framebuffer.
4344   vorr.u16 draw_mask, draw_mask, write_mask
// Take bits from pixels only where the combined mask is 0.
4345   vbif.u16 fb_pixels, pixels, draw_mask
4346   vst1.u16 { fb_pixels }, [fb_ptr]
4347
// pixel_ptr was already advanced, so this fetches the next block.
4348   ldr fb_ptr, [pixel_ptr, #28]
4349   vld1.u16 { fb_pixels }, [fb_ptr]
4350   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
4351   vclt.s16 write_mask, fb_pixels, #0
4352   vld1.u32 { pixels }, [pixel_ptr, :128], c_64
4353
4354   subs num_blocks, num_blocks, #1
4355   bne 0b
4356  
4357  1:
// Epilogue: same merge and store for the final block.
4358   vorr.u16 pixels, pixels, msb_mask
4359   vorr.u16 draw_mask, draw_mask, write_mask
4360   vbif.u16 fb_pixels, pixels, draw_mask
4361   vst1.u16 { fb_pixels }, [fb_ptr]
4362
4363   restore_abi_regs()
4364   ldmia sp!, { r4, pc }
4365
4366
// The _off counterpart of blend_blocks_textured_unblended_on: it performs no
// work and returns to the caller immediately.
4367 function(blend_blocks_textured_unblended_off)
4368   bx lr
4369
4370
// warmup
//
// In:  r0 = number of iterations
//      r1 = start address; the :128 qualifier requires 16-byte alignment
//
// Issues one NEON load per iteration and advances r1 by 64 bytes each time.
// The loaded values are never used, so the only effect is the memory reads
// themselves - presumably to pull the data into the cache (TODO confirm the
// intent against the callers). Returns immediately when r0 is 0.
// Leaf routine: no stack use; modifies r0, r1, r3, the flags and the NEON
// registers named in the load.
4371 function(warmup)
// r3 = post-increment stride in bytes.
4372   mov r3, #64
4373   cmp r0, #0
4374   bxeq lr
4375
4376  0:
4377   vld1.u32 { u_whole_8, v_whole_8 }, [r1, :128], r3
4378
4379   subs r0, r0, #1
4380   bne 0b
4381
4382   bx lr
4383
// Register names used by render_block_fill_body. Earlier meanings of these
// names are dropped first so they can be rebound here.
4384 #undef vram_ptr
4385 #undef color
4386 #undef width
4387 #undef height
4388 #undef pitch
4389
4390 #define vram_ptr                                          r0
4391 #define color                                             r1
4392 #define width                                             r2
4393 #define height                                            r3
4394
// pitch shares r1 with color; color must be consumed before pitch is written.
4395 #define pitch                                             r1
4396
4397 #define num_width                                         r12
4398
4399 #undef colors_a
4400 #undef colors_b
4401
4402 #define colors_a                                          q0
4403 #define colors_b                                          q1
4404
4405 .align 3
4406
// render_block_fill_body
//
// Fills a rectangle of 16-bit pixels with a single color.
//
// In:  vram_ptr (r0) = address of the top-left pixel; the :256 qualifier
//                      requires 32-byte alignment for every store
//      color    (r1) = value replicated into every 16-bit lane
//      width    (r2) = pixels per row; must be a non-zero multiple of 16,
//                      otherwise the inner counter never reaches zero
//      height   (r3) = number of rows; must be non-zero
//
// Rows are 2048 bytes apart. Leaf routine: no stack use; modifies r0, r1,
// r3, r12, q0, q1 and the flags.
4407 function(render_block_fill_body)
// color is read here, which frees r1 for reuse as pitch below.
4408   vdup.u16 colors_a, color
4409   mov pitch, #2048
4410
4411   vmov colors_b, colors_a
// pitch = bytes from the end of one filled row to the start of the next.
4412   sub pitch, pitch, width, lsl #1
4413
4414   mov num_width, width
4415
// Label 0 is the target of both the inner (row) and outer (rectangle) loop.
4416  0:  
// Store 16 pixels (32 bytes) and advance vram_ptr past them.
4417   vst1.u32 { colors_a, colors_b }, [vram_ptr, :256]!
4418
4419   subs num_width, num_width, #16
4420   bne 0b
4421
// Row done: skip to the next row and reload the per-row counter.
4422   add vram_ptr, vram_ptr, pitch
4423   mov num_width, width
4424
4425   subs height, height, #1
4426   bne 0b
4427
4428   bx lr
4429  
4430
// ARM core register names for the sprite setup code that follows.
// Several names are bound to the same register (for example r4 is v,
// num_blocks, left_block_mask and dirty_textures_mask), so only one of the
// meanings of a register can be live at any point; the code using them is
// responsible for keeping that straight.
4431 #undef x
4432 #undef y
4433 #undef width
4434 #undef height
4435 #undef fb_ptr
4436 #undef texture_mask
4437 #undef num_blocks
4438 #undef temp
4439 #undef dirty_textures_mask
4440 #undef clut_ptr
4441 #undef current_texture_mask
4442
4443 #define psx_gpu                                           r0
4444 #define x                                                 r1
4445 #define y                                                 r2
4446 #define u                                                 r3
4447 #define v                                                 r4
4448 #define width                                             r5
4449 #define height                                            r6
4450 #define offset_u                                          r8
4451 #define offset_v                                          r9
4452 #define offset_u_right                                    r10
4453 #define width_rounded                                     r11
4454 #define height_rounded                                    r12
4455
// Second set of names for r1 - r12 and r14, used by the tile/column macros.
4456 #define texture_offset_base                               r1
4457 #define tile_width                                        r2
4458 #define tile_height                                       r3
4459 #define num_blocks                                        r4
4460 #define block                                             r5
4461 #define sub_tile_height                                   r6
4462 #define fb_ptr                                            r7
4463 #define texture_mask                                      r8
4464 #define column_data                                       r9
4465 #define texture_offset                                    r10
4466 #define tiles_remaining                                   r11
4467 #define fb_ptr_advance_column                             r12
4468 #define texture_block_ptr                                 r14
4469
// temp shares r14 with texture_block_ptr.
4470 #define temp                                              r14
4471
4472 #define texture_page_ptr                                  r3
4473 #define left_block_mask                                   r4
4474 #define right_block_mask                                  r5
4475 #define texture_mask_rev                                  r10
4476 #define control_mask                                      r11
4477
// Names used by the setup_sprite_tiled_initialize_* macros.
4478 #define dirty_textures_mask                               r4
4479 #define clut_ptr                                          r5
4480 #define current_texture_mask                              r6
4481
4482
// NEON register names for the sprite setup code that follows.
4483 #undef texels
4484 #undef clut_low_a
4485 #undef clut_low_b
4486 #undef clut_high_a
4487 #undef clut_high_b
4488 #undef clut_a
4489 #undef clut_b
4490 #undef texels_low
4491 #undef texels_high
4492
4493 #define texels                                            d0
// q1 viewed as a whole; d2 and d3 below are its two halves.
4494 #define draw_masks_fb_ptrs                                q1
4495
4496 #define draw_mask_fb_ptr_left                             d2
4497 #define draw_mask_fb_ptr_right                            d3
4498
// The _a/_b names reuse d2/d3 and add d10/d11, the two halves of q5.
4499 #define draw_mask_fb_ptr_left_a                           d2
4500 #define draw_mask_fb_ptr_left_b                           d3
4501 #define draw_mask_fb_ptr_right_a                          d10
4502 #define draw_mask_fb_ptr_right_b                          d11
4503 #define draw_masks_fb_ptrs2                               q5
4504
// d4/d5 are the halves of clut_a (q2), d6/d7 the halves of clut_b (q3).
4505 #define clut_low_a                                        d4
4506 #define clut_low_b                                        d5
4507 #define clut_high_a                                       d6
4508 #define clut_high_b                                       d7
4509
4510 #define block_masks                                       d8
4511 #define block_masks_shifted                               d9
4512
4513 #define clut_a                                            q2
4514 #define clut_b                                            q3
4515
4516 #define texels_low                                        d12
4517 #define texels_high                                       d13
4518
// d14/d15 are the halves of texels_wide (q7).
4519 #define texels_wide_low                                   d14
4520 #define texels_wide_high                                  d15
4521 #define texels_wide                                       q7
4521 #define texels_wide                                       q7
4522
4523
// setup_sprite_flush_blocks
//
// Helper reached with blgt from setup_sprite_tile_add_blocks when the block
// buffer would overflow. Calls flush_render_block_buffer with r0 (psx_gpu)
// untouched, then points block back at the start of the block array.
//
// Preserved across the call: q1 - q5 (draw mask/fb pointer pairs, CLUT
// tables and block masks per the register names above), r0 - r3, r12, r14
// and whatever EXTRA_UNSAVED_REGS names (it is defined outside this part of
// the file). On return block = psx_gpu + psx_gpu_blocks_offset.
4524 setup_sprite_flush_blocks:
4525   vpush { q1 - q5 }
4526
4527   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
4528   bl flush_render_block_buffer
4529   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
4530
4531   vpop { q1 - q5 }
4532
// psx_gpu (r0) has been restored above, so it is valid as a base again.
4533   add block, psx_gpu, #psx_gpu_blocks_offset
4534   bx lr
4535
4536
// setup_sprite_update_texture_4bpp_cache
//
// Wrapper reached with blne from setup_sprite_tiled_initialize_4bpp.
// Preserves r0 - r3 around a call to update_texture_4bpp_cache and returns
// by popping the saved r14 into pc.
//
// NOTE(review): unlike the 8bpp wrapper this one does not save
// EXTRA_UNSAVED_REGS - confirm that is intentional.
// NOTE(review): no NEON registers are saved, yet clut_a/clut_b (q2/q3) are
// already loaded at the call site - presumably update_texture_4bpp_cache
// leaves them alone; verify against its implementation.
4537 setup_sprite_update_texture_4bpp_cache:
4538   stmdb sp!, { r0 - r3, r14 }
4539   bl update_texture_4bpp_cache
4540   ldmia sp!, { r0 - r3, pc }
4541
4542
// setup_sprite_update_texture_8bpp_cache
//
// Wrapper reached with blne from setup_sprite_tiled_initialize_8bpp.
// Preserves r0 - r3 and the registers named by EXTRA_UNSAVED_REGS (defined
// outside this part of the file) around a call to update_texture_8bpp_cache
// and returns by popping the saved r14 into pc.
4543 setup_sprite_update_texture_8bpp_cache:
4544   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 }
4545   bl update_texture_8bpp_cache
4546   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc }
4547
4548
/*
 * setup_sprite_tiled_initialize_4bpp()
 *
 * Prepares 4bpp sprite rendering:
 *  - loads 32 bytes from clut_ptr into clut_a/clut_b and de-interleaves them
 *    with vuzp.u8, so clut_a holds the even (low) bytes and clut_b the odd
 *    (high) bytes of the 16 two-byte entries; these are the tables used by
 *    the vtbl lookups in the 4bpp tile macros;
 *  - calls setup_sprite_update_texture_4bpp_cache when the current texture
 *    mask overlaps the 4bpp dirty mask.
 * Writes r4, r5, r6 (dirty_textures_mask, clut_ptr, current_texture_mask),
 * q2, q3 and the flags. The NEON instructions between tst and blne do not
 * change the flags.
 */
4549 #define setup_sprite_tiled_initialize_4bpp()                                   \
4550   ldr dirty_textures_mask,                                                     \
4551    [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset];                        \
4552   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset];                           \
4553                                                                                \
4554   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset];   \
4555   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128];                               \
4556                                                                                \
4557   tst current_texture_mask, dirty_textures_mask;                               \
4558   vuzp.u8 clut_a, clut_b;                                                      \
4559                                                                                \
4560   blne setup_sprite_update_texture_4bpp_cache                                  \
4561
/*
 * setup_sprite_tiled_initialize_8bpp()
 *
 * Calls setup_sprite_update_texture_8bpp_cache when the current texture mask
 * overlaps the 8bpp dirty mask. No CLUT is loaded on this path.
 * Writes r4, r6 (dirty_textures_mask, current_texture_mask) and the flags.
 */
4562 #define setup_sprite_tiled_initialize_8bpp()                                   \
4563   ldr dirty_textures_mask,                                                     \
4564    [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset];                        \
4565   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset];   \
4566                                                                                \
4567   tst current_texture_mask, dirty_textures_mask;                               \
4568   blne setup_sprite_update_texture_8bpp_cache                                  \
4569
4570
/*
 * Operand fragments pasted into setup_sprite_tile_add_blocks as the last
 * operand of add/mov: _single yields sub_tile_height (one block per row),
 * _double yields sub_tile_height shifted left by one (two blocks per row).
 */
4571 #define setup_sprite_block_count_single()                                      \
4572   sub_tile_height                                                              \
4573
4574 #define setup_sprite_block_count_double()                                      \
4575   sub_tile_height, lsl #1                                                      \
4576
/*
 * setup_sprite_tile_add_blocks(type)   type = single | double
 *
 * Reserves room for the blocks of the tile about to be emitted.
 * num_blocks is increased by the tile block count; if the result is greater
 * than MAX_BLOCKS (signed compare), num_blocks is reset to just this tile
 * count and setup_sprite_flush_blocks is called, which also rewinds block to
 * the start of the block array. The tile macros store num_blocks back to
 * psx_gpu only after their loop, so at flush time the in-memory count is
 * still the one from before this tile.
 * Writes num_blocks, the flags and, when flushing, block.
 */
4577 #define setup_sprite_tile_add_blocks(type)                                     \
4578   add num_blocks, num_blocks, setup_sprite_block_count_##type();               \
4579   cmp num_blocks, #MAX_BLOCKS;                                                 \
4580                                                                                \
4581   movgt num_blocks, setup_sprite_block_count_##type();                         \
4582   blgt setup_sprite_flush_blocks                                               \
4583
4584
/*
 * setup_sprite_tile_full_4bpp(edge)
 *
 * Emits two 64-byte blocks (left and right 8-texel halves) for each of
 * sub_tile_height rows of a 4bpp tile. The edge argument is not used.
 *
 * For every block:
 *   - 8 texel bytes are loaded from
 *     texture_page_ptr + (offset & texture_mask), where offset is
 *     texture_offset for the left half and texture_offset + 8 for the right;
 *   - the two vtbl lookups fetch the low and high byte of each CLUT entry
 *     and vst2.u8 interleaves them into 16 bytes at block + 0;
 *   - the draw mask / framebuffer pointer pair (lane 0 as prepared by the
 *     caller, lane 1 = fb_ptr) is stored at block + 40.
 *
 * Per row fb_ptr advances by 2048 bytes in total and texture_offset by 0x10;
 * after the loop texture_offset gets another 0xF00, so a 16-row tile moves
 * it by 0x1000. num_blocks is written back to psx_gpu at the end.
 * Uses local label 4. Writes block, fb_ptr, texture_offset,
 * texture_block_ptr, sub_tile_height (0 on exit), num_blocks, d0, d2[1],
 * d3[1], d12, d13 and the flags.
 */
4585 #define setup_sprite_tile_full_4bpp(edge)                                      \
4586   setup_sprite_tile_add_blocks(double);                                        \
4587                                                                                \
4588  4:                                                                            \
  /* left half of the row */                                                  \
4589   and texture_block_ptr, texture_offset, texture_mask;                         \
4590   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4591                                                                                \
4592   pld [fb_ptr];                                                                \
4593   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4594   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4595                                                                                \
4596   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4597   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4598                                                                                \
4599   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
  /* address of the right half is prepared while the left one is finished */  \
4600   add texture_block_ptr, texture_offset, #8;                                   \
4601                                                                                \
4602   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4603   add block, block, #40;                                                       \
4604                                                                                \
4605   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4606   add fb_ptr, fb_ptr, #16;                                                     \
4607                                                                                \
4608   vst1.u32 { draw_mask_fb_ptr_left }, [block, :64];                            \
4609   add block, block, #24;                                                       \
4610                                                                                \
  /* right half of the row, 8 pixels (16 bytes) further right */              \
4611   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4612   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4613                                                                                \
4614   pld [fb_ptr];                                                                \
4615   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4616   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4617                                                                                \
4618   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4619   add block, block, #40;                                                       \
4620                                                                                \
4621   add texture_offset, texture_offset, #0x10;                                   \
  /* back to the left half, one 2048-byte row further down */                 \
4622   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4623                                                                                \
4624   vst1.u32 { draw_mask_fb_ptr_right }, [block, :64];                           \
4625   add block, block, #24;                                                       \
4626                                                                                \
4627   subs sub_tile_height, sub_tile_height, #1;                                   \
4628   bne 4b;                                                                      \
4629                                                                                \
4630   add texture_offset, texture_offset, #0xF00;                                  \
4631   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4632
4633   
/*
 * setup_sprite_tile_half_4bpp(edge)   edge = left | right
 *
 * Emits one 64-byte block for each of sub_tile_height rows of a 4bpp tile.
 * Which 8-texel half is read is decided by the caller through
 * texture_offset and fb_ptr (see the column edge pre-adjust macros).
 *
 * For every block:
 *   - 8 texel bytes are loaded from
 *     texture_page_ptr + (texture_offset & texture_mask);
 *   - the two vtbl lookups fetch the low and high byte of each CLUT entry
 *     and vst2.u8 interleaves them into 16 bytes at block + 0;
 *   - draw_mask_fb_ptr_<edge> (lane 0 as prepared by the caller,
 *     lane 1 = fb_ptr) is stored at block + 40.
 *
 * Per row fb_ptr advances by 2048 bytes and texture_offset by 0x10; after
 * the loop texture_offset gets another 0xF00, so a 16-row tile moves it by
 * 0x1000. num_blocks is written back to psx_gpu at the end.
 * texture_block_ptr is only meaningful up to the texel load; it is
 * recomputed from texture_offset on every iteration.
 * Uses local label 4. Writes block, fb_ptr, texture_offset,
 * texture_block_ptr, sub_tile_height (0 on exit), num_blocks, d0, lane 1 of
 * draw_mask_fb_ptr_<edge>, d12, d13 and the flags.
 */
4634 #define setup_sprite_tile_half_4bpp(edge)                                      \
4635   setup_sprite_tile_add_blocks(single);                                        \
4636                                                                                \
4637  4:                                                                            \
4638   and texture_block_ptr, texture_offset, texture_mask;                         \
4639   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4640                                                                                \
4641   pld [fb_ptr];                                                                \
4642   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4643   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4644                                                                                \
4645   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4646   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4647                                                                                \
4648   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4649   add block, block, #40;                                                       \
4650                                                                                \
4652   vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64];                          \
4653                                                                                \
4654   add block, block, #24;                                                       \
4655   add texture_offset, texture_offset, #0x10;                                   \
4656                                                                                \
4657   add fb_ptr, fb_ptr, #2048;                                                   \
4658   subs sub_tile_height, sub_tile_height, #1;                                   \
4659                                                                                \
4660   bne 4b;                                                                      \
4661                                                                                \
4662   add texture_offset, texture_offset, #0xF00;                                  \
4663   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4664  
4665  
/*
 * setup_sprite_tile_full_8bpp(edge)
 *
 * Emits two 64-byte blocks (left and right 8-texel halves) for each of
 * sub_tile_height rows of an 8bpp tile. The edge argument is not used.
 *
 * block is biased by +16 for the duration of the loop and restored
 * afterwards, so relative to the start of each block:
 *   - the 8 raw texel bytes loaded from
 *     texture_page_ptr + (offset & texture_mask) are stored at +16
 *     (offset is texture_offset, or texture_offset + 8 for the right half);
 *     no CLUT lookup is done here;
 *   - the draw mask / framebuffer pointer pair (lane 0 as prepared by the
 *     caller, lane 1 = fb_ptr) is stored at +40.
 *
 * Per row fb_ptr advances by 2048 bytes in total and texture_offset by 0x10;
 * after the loop texture_offset gets another 0xF00. num_blocks is written
 * back to psx_gpu at the end.
 * Uses local label 4. Writes block, fb_ptr, texture_offset,
 * texture_block_ptr, sub_tile_height (0 on exit), num_blocks, d0, d2[1],
 * d3[1] and the flags.
 */
4666 #define setup_sprite_tile_full_8bpp(edge)                                      \
4667   setup_sprite_tile_add_blocks(double);                                        \
4668   add block, block, #16;                                                       \
4669                                                                                \
4670  4:                                                                            \
  /* left half of the row */                                                  \
4671   and texture_block_ptr, texture_offset, texture_mask;                         \
4672   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4673                                                                                \
4674   pld [fb_ptr];                                                                \
4675   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4676   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4677                                                                                \
4678   add texture_block_ptr, texture_offset, #8;                                   \
4679   vst1.u32 { texels }, [block, :64];                                           \
4680                                                                                \
4681   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4682   add block, block, #24;                                                       \
4683                                                                                \
4684   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4685                                                                                \
4686   add fb_ptr, fb_ptr, #16;                                                     \
4687   vst1.u32 { draw_mask_fb_ptr_left }, [block, :64];                            \
4688                                                                                \
  /* right half of the row, 8 pixels (16 bytes) further right */              \
4689   add block, block, #40;                                                       \
4690   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4691   pld [fb_ptr];                                                                \
4692                                                                                \
4693   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4694   vst1.u32 { texels }, [block, :64];                                           \
4695   add block, block, #24;                                                       \
4696                                                                                \
4697   add texture_offset, texture_offset, #0x10;                                   \
  /* back to the left half, one 2048-byte row further down */                 \
4698   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4699                                                                                \
4700   vst1.u32 { draw_mask_fb_ptr_right }, [block, :64];                           \
4701   add block, block, #40;                                                       \
4702                                                                                \
4703   subs sub_tile_height, sub_tile_height, #1;                                   \
4704   bne 4b;                                                                      \
4705                                                                                \
  /* undo the +16 bias applied before the loop */                             \
4706   sub block, block, #16;                                                       \
4707   add texture_offset, texture_offset, #0xF00;                                  \
4708   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4709
4710   
/*
 * setup_sprite_tile_half_8bpp(edge)   edge = left | right
 *
 * Emits one 64-byte block for each of sub_tile_height rows of an 8bpp tile.
 * Which 8-texel half is read is decided by the caller through
 * texture_offset and fb_ptr (see the column edge pre-adjust macros).
 *
 * block is biased by +16 for the duration of the loop and restored
 * afterwards, so relative to the start of each block the 8 raw texel bytes
 * go to +16 and draw_mask_fb_ptr_<edge> (lane 1 = fb_ptr) goes to +40.
 * No CLUT lookup is done here.
 *
 * Per row fb_ptr advances by 2048 bytes and texture_offset by 0x10; after
 * the loop texture_offset gets another 0xF00. num_blocks is written back to
 * psx_gpu at the end.
 * Uses local label 4. Writes block, fb_ptr, texture_offset,
 * texture_block_ptr, sub_tile_height (0 on exit), num_blocks, d0, lane 1 of
 * draw_mask_fb_ptr_<edge> and the flags.
 */
4711 #define setup_sprite_tile_half_8bpp(edge)                                      \
4712   setup_sprite_tile_add_blocks(single);                                        \
4713   add block, block, #16;                                                       \
4714                                                                                \
4715  4:                                                                            \
4716   and texture_block_ptr, texture_offset, texture_mask;                         \
4717   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4718   pld [fb_ptr];                                                                \
4719                                                                                \
4720   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4721   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4722                                                                                \
4723   vst1.u32 { texels }, [block, :64];                                           \
4724   add block, block, #24;                                                       \
4725                                                                                \
4726   vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64];                          \
4727   add block, block, #40;                                                       \
4728                                                                                \
4729   add texture_offset, texture_offset, #0x10;                                   \
4730   add fb_ptr, fb_ptr, #2048;                                                   \
4731                                                                                \
4732   subs sub_tile_height, sub_tile_height, #1;                                   \
4733   bne 4b;                                                                      \
4734                                                                                \
  /* undo the +16 bias applied before the loop */                             \
4735   sub block, block, #16;                                                       \
4736   add texture_offset, texture_offset, #0xF00;                                  \
4737   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4738
4739  
/*
 * Column edge adjustment macros, selected by token pasting from the column
 * height macros as <pre|post>_adjust_<half|full>(edge).
 *
 * pre_adjust sets texture_offset for the column from texture_offset_base;
 * post_adjust undoes any fb_ptr shift made by the matching pre_adjust.
 *
 *   half, right: texel source starts 8 bytes into the tile and fb_ptr is
 *                moved 16 bytes (8 pixels) right for the column, then moved
 *                back afterwards.
 *   half, left : texel source starts at the tile base; nothing to undo.
 *   full       : texel source starts at the tile base; nothing to undo.
 *                The edge argument is ignored.
 */
4740 #define setup_sprite_tile_column_edge_pre_adjust_half_right()                  \
4741   add texture_offset, texture_offset_base, #8;                                 \
4742   add fb_ptr, fb_ptr, #16                                                      \
4743
4744 #define setup_sprite_tile_column_edge_pre_adjust_half_left()                   \
4745   mov texture_offset, texture_offset_base                                      \
4746
4747 #define setup_sprite_tile_column_edge_pre_adjust_half(edge)                    \
4748   setup_sprite_tile_column_edge_pre_adjust_half_##edge()                       \
4749
4750 #define setup_sprite_tile_column_edge_pre_adjust_full(edge)                    \
4751   mov texture_offset, texture_offset_base                                      \
4752
4753 #define setup_sprite_tile_column_edge_post_adjust_half_right()                 \
4754   sub fb_ptr, fb_ptr, #16                                                      \
4755
4756 #define setup_sprite_tile_column_edge_post_adjust_half_left()                  \
4757
4758 #define setup_sprite_tile_column_edge_post_adjust_half(edge)                   \
4759   setup_sprite_tile_column_edge_post_adjust_half_##edge()                      \
4760
4761 #define setup_sprite_tile_column_edge_post_adjust_full(edge)                   \
4762
4763
/*
 * setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,
 *                                        x4mode)
 *
 * Emits one column that is a single tile high: column_data is used directly
 * as the row count. edge_mode (half | full), texture_mode (4bpp | 8bpp) and
 * x4mode (a name suffix, empty for the macros in this part of the file)
 * select the tile and edge adjustment macros by token pasting.
 */
4764 #define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,  \
4765  x4mode)                                                                       \
4766   mov sub_tile_height, column_data;                                            \
4767   setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
4768   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4769   setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
4770
/*
 * setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,
 *                                       x4mode)
 *
 * Emits one column that spans two or more tiles vertically. column_data is
 * unpacked as:
 *   bits  0-7  : rows in the first tile
 *   bits  8-15 : rows in the last tile
 *   bits 16+   : number of tiles after the first one
 * Every tile between the first and the last is emitted with 16 rows.
 * The packed count must be at least 1; with exactly 1 the middle loop is
 * skipped. Uses local labels 2 and 3 (the tile macros use 4).
 * Writes sub_tile_height and tiles_remaining in addition to whatever the
 * expanded tile macros write.
 */
4771 #define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,   \
4772  x4mode)                                                                       \
4773   and sub_tile_height, column_data, #0xFF;                                     \
4774   mov tiles_remaining, column_data, lsr #16;                                   \
4775   setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
4776   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4777                                                                                \
  /* only the last tile left? then skip the full-height middle tiles */       \
4778   subs tiles_remaining, tiles_remaining, #1;                                   \
4779   beq 2f;                                                                      \
4780                                                                                \
4781  3:                                                                            \
4782   mov sub_tile_height, #16;                                                    \
4783   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4784   subs tiles_remaining, tiles_remaining, #1;                                   \
4785   bne 3b;                                                                      \
4786                                                                                \
4787  2:                                                                            \
  /* row count of the last tile is byte 1 of column_data */                   \
4788   uxtb sub_tile_height, column_data, ror #8;                                   \
4789   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4790   setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
4791
4792
/*
 * setup_sprite_column_data_single / _multi
 *
 * Build column_data for the matching column height macro and load
 * texture_page_ptr from psx_gpu.
 *
 *   single: column_data = height.
 *   multi : column_data = (16 - offset_v)
 *                       | (((height_rounded & 0xF) + 1) << 8)
 *                       | ((tile_height - 1) << 16)
 *
 * Register sharing to be aware of: column_data is r9 (same as offset_v) and
 * texture_page_ptr is r3 (same as tile_height), so in the multi variant
 * tile_height has to be merged into column_data before texture_page_ptr is
 * loaded. The multi variant also modifies height_rounded and tile_height.
 */
4793 #define setup_sprite_column_data_single()                                      \
4794   mov column_data, height;                                                     \
4795   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]            \
4796
4797 #define setup_sprite_column_data_multi()                                       \
4798   and height_rounded, height_rounded, #0xF;                                    \
4799   rsb column_data, offset_v, #16;                                              \
4800                                                                                \
4801   add height_rounded, height_rounded, #1;                                      \
4802   sub tile_height, tile_height, #1;                                            \
4803                                                                                \
4804   orr column_data, column_data, tile_height, lsl #16;                          \
4805   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset];           \
4806                                                                                \
4807   orr column_data, column_data, height_rounded, lsl #8                         \
4808
/*
 * Draw mask setup for a column. Each macro fills every byte of
 * draw_mask_fb_ptr_left / draw_mask_fb_ptr_right with one byte of
 * block_masks; the tile macros later overwrite lane 1 with fb_ptr, leaving
 * the replicated mask byte in lane 0.
 *
 *   left column : bytes 0 and 1 of block_masks
 *   right column: bytes 4 and 5 of block_masks
 *
 * The _advance_column variant additionally computes
 *   fb_ptr_advance_column = 32 - height * 2048
 * which, added to fb_ptr after a column of height rows, moves it back up to
 * the first row and 32 bytes (16 pixels) to the right.
 */
4809 #define setup_sprite_setup_left_draw_mask_fb_ptr()                             \
4810   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4811   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
4812
4813 #define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column()              \
4814   mov fb_ptr_advance_column, #32;                                              \
4815   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4816                                                                                \
4817   sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11;           \
4818   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
4819
4820 #define setup_sprite_setup_right_draw_mask_fb_ptr()                            \
4821   vdup.u8 draw_mask_fb_ptr_left, block_masks[4];                               \
4822   vdup.u8 draw_mask_fb_ptr_right, block_masks[5]                               \
4823
/*
 * setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, edge,
 *                                       x4mode)
 *
 * Defines the label
 *   setup_sprite_<tm>_single_<multi_height>_<edge_mode>_<edge><x4mode>
 * and the code for a sprite that is a single tile column wide.
 *
 * Because the one column is both the left and the right edge, the two
 * 32-bit lanes of block_masks are OR-ed together (vext.32 by one element
 * swaps the lanes) before the left mask setup reads bytes 0 and 1.
 *
 * The code ends by popping r4 - r11 and pc; the matching push is expected
 * to have been done by the code that branches here (not shown in this part
 * of the file).
 */
4824 #define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode,     \
4825  edge, x4mode)                                                                 \
4826  setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode:     \
4827   setup_sprite_column_data_##multi_height();                                   \
4828   vext.32 block_masks_shifted, block_masks, block_masks, #1;                   \
4829   vorr.u32 block_masks, block_masks, block_masks_shifted;                      \
4830   setup_sprite_setup_left_draw_mask_fb_ptr##x4mode();                          \
4831                                                                                \
4832   setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
4833   restore_abi_regs();                                                          \
4834   ldmia sp!, { r4 - r11, pc }                                                  \
4835
/*
 * setup_sprite_tiled_advance_column()
 *
 * Steps texture_offset_base to the next tile column: adds 0x100 and, if
 * that cleared bits 8-11 (the add carried out of them), subtracts 0x1000
 * again so the column index in bits 8-11 wraps around without changing the
 * higher bits. Writes texture_offset_base and the flags.
 */
4836 #define setup_sprite_tiled_advance_column()                                    \
4837   add texture_offset_base, texture_offset_base, #0x100;                        \
4838   tst texture_offset_base, #0xF00;                                             \
4839   subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00)             \
4840
/*
 * setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,
 *                                      right_mode, x4mode)
 *
 * Defines the label
 *   setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode><x4mode>
 * and the code for a sprite that is two or more tile columns wide
 * (tile_width must be at least 2 on entry).
 *
 * Sequence:
 *   1. leftmost column, using left_mode with edge "right" and the left
 *      block masks;
 *   2. tile_width - 2 interior columns, always "full", with both draw mask
 *      pairs zeroed;
 *   3. rightmost column, using right_mode with edge "left" and the right
 *      block masks.
 * Between columns fb_ptr is moved with fb_ptr_advance_column and
 * texture_offset_base with setup_sprite_tiled_advance_column.
 *
 * Uses local labels 0 and 1. The code ends by popping r4 - r11 and pc; the
 * matching push is expected to have been done by the code that branches
 * here (not shown in this part of the file).
 */
4841 #define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,      \
4842  right_mode, x4mode)                                                           \
4843  setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
4844   setup_sprite_column_data_##multi_height();                                   \
4845                                                                                \
4846   setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode();           \
4847                                                                                \
4848   setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
4849                                                                                \
  /* flags from subs survive the add and are tested by beq below */           \
4850   subs tile_width, tile_width, #2;                                             \
4851   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4852                                                                                \
4853   beq 1f;                                                                      \
4854                                                                                \
  /* interior columns are drawn with an all-zero draw mask */                 \
4855   vmov.u8 draw_masks_fb_ptrs, #0;                                              \
4856   vmov.u8 draw_masks_fb_ptrs2, #0;                                             \
4857                                                                                \
4858  0:                                                                            \
4859   setup_sprite_tiled_advance_column();                                         \
4860   setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode);      \
4861   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4862   subs tile_width, tile_width, #1;                                             \
4863   bne 0b;                                                                      \
4864                                                                                \
4865  1:                                                                            \
4866   setup_sprite_setup_right_draw_mask_fb_ptr##x4mode();                         \
4867                                                                                \
4868   setup_sprite_tiled_advance_column();                                         \
4869   setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
4870   restore_abi_regs();                                                          \
4871   ldmia sp!, { r4 - r11, pc }                                                  \
4872
4873
/*
 * Hooks that setup_sprite_tiled_builder reaches by pasting its x4mode
 * argument onto the macro name; these are the forms selected when that
 * suffix is empty.
 */

/* No adjustment is needed in this mode: expands to nothing. */
4874 #define setup_sprite_offset_u_adjust()                                         \
4875
/* Reduce left_block_mask to its bits 0-7. */
4876 #define setup_sprite_get_left_block_mask()                                     \
4877   and left_block_mask, left_block_mask, #0xFF                                  \
4878
/* Sets Z when left_block_mask equals 0xFF (all eight bits set). */
4879 #define setup_sprite_compare_left_block_mask()                                 \
4880   cmp left_block_mask, #0xFF                                                   \
4881
/* Replace right_block_mask with its bits 8-15, zero-extended. */
4882 #define setup_sprite_get_right_block_mask()                                    \
4883   uxtb right_block_mask, right_block_mask, ror #8                              \
4884
/* Sets Z when right_block_mask equals 0xFF (all eight bits set). */
4885 #define setup_sprite_compare_right_block_mask()                                \
4886   cmp right_block_mask, #0xFF                                                  \
4887
4888
4889
4890 /* 4x stuff */
/*
 * fb_ptr2 is a second name for the column_data register.  The *_4x tile
 * macros push column_data before they use fb_ptr2 as a scratch pointer and
 * pop it again once their loop has finished.
 */
4891 #define fb_ptr2 column_data
4892
/*
 * _4x form of the offset_u adjustment hook.  Moves fb_ptr back by a further
 * offset_u * 2 bytes (using offset_u before it is doubled), then sets
 * offset_u_right = offset_u_right * 2 + 1 and offset_u = offset_u * 2.
 */
4893 #define setup_sprite_offset_u_adjust_4x()                                      \
4894   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
4895   lsl offset_u_right, #1;                                                      \
4896   lsl offset_u, #1;                                                            \
4897   add offset_u_right, #1                                                       \
4898
/*
 * _4x forms of the block-mask hooks.  They work on 16-bit halves of the mask
 * words where the un-suffixed forms work on single bytes.
 */

/* Replace left_block_mask with its bits 0-15, sign-extended to 32 bits. */
4899 #define setup_sprite_get_left_block_mask_4x()                                  \
4900   sxth left_block_mask, left_block_mask                                        \
4901
/*
 * Sets Z when left_block_mask is 0xFFFFFFFF, which after the sxth above means
 * all sixteen extracted bits were set.
 */
4902 #define setup_sprite_compare_left_block_mask_4x()                              \
4903   cmp left_block_mask, #0xFFFFFFFF                                             \
4904
/* Replace right_block_mask with its bits 16-31, sign-extended to 32 bits. */
4905 #define setup_sprite_get_right_block_mask_4x()                                 \
4906   sxth right_block_mask, right_block_mask, ror #16                             \
4907
/*
 * Sets Z when right_block_mask is 0xFFFFFFFF, which after the sxth above
 * means all sixteen extracted bits were set.
 */
4908 #define setup_sprite_compare_right_block_mask_4x()                             \
4909   cmp right_block_mask, #0xFFFFFFFF                                            \
4910
4911
/*
 * Double every 16-bit element of texels_.  The register is copied into both
 * texels_wide_low and texels_wide_high and the two copies are zipped, so
 * texels_wide_low ends up with the first half of the elements, each one
 * twice in a row, and texels_wide_high with the second half.
 */
4912 #define widen_texels_16bpp(texels_)                                            \
4913   vmov texels_wide_low, texels_;                                               \
4914   vmov texels_wide_high, texels_;                                              \
4915   vzip.16 texels_wide_low, texels_wide_high                                    \
4916
/* Same as widen_texels_16bpp() but doubling every 8-bit element. */
4917 #define widen_texels_8bpp(texels_)                                             \
4918   vmov texels_wide_low, texels_;                                               \
4919   vmov texels_wide_high, texels_;                                              \
4920   vzip.8 texels_wide_low, texels_wide_high                                     \
4921
/*
 * Fill in one block entry and step block_ to the next one.
 *   texels_           stored at block_ + 0 (128-bit aligned access)
 *   draw_mask_fb_ptr_ stored at block_ + 40 (64-bit aligned access) after its
 *                     32-bit lane 1 has been overwritten with fb_ptr_; lane 0
 *                     is written as it stands
 * block_ advances by 40 + 24 = 64 bytes in total.
 */
4922 #define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)         \
4923   vst1.u32 { texels_ }, [block_, :128];                                        \
4924   add block_, block_, #40;                                                     \
4925                                                                                \
4926   vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
4927   vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64];                               \
4928   add block_, block_, #24                                                      \
4929
4930 /* assumes 16-byte offset already added to block_ */
/*
 * Fill in one block entry and step block_ to the next one.  With the 16-byte
 * bias mentioned above, texels_ lands at entry offset 16 and the
 * draw_mask_fb_ptr_ pair (lane 1 overwritten with fb_ptr_) at entry offset
 * 40.  block_ advances by 24 + 40 = 64 bytes, so the bias carries over to the
 * next entry.
 */
4931 #define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)          \
4932   vst1.u32 { texels_ }, [block_, :64];                                         \
4933   add block_, block_, #24;                                                     \
4934                                                                                \
4935   vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
4936   vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64];                               \
4937   add block_, block_, #40                                                      \
4938
/*
 * Write four block entries for the pixels held in texels_low / texels_high.
 * Each half is widened (every 16-bit element doubled) into texels_wide and
 * written twice:
 *   widened texels_low   -> fb_ptr            and fb_ptr + 1024 * 2
 *   widened texels_high  -> fb_ptr + 8 * 2    and fb_ptr + 8 * 2 + 1024 * 2
 * The first pair is tagged with draw_mask_fb_ptr_a_, the second with
 * draw_mask_fb_ptr_b_.  fb_ptr_tmp is a scratch register; fb_ptr itself is
 * not modified.  block advances by four entries.
 */
4939 #define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,             \
4940  draw_mask_fb_ptr_b_)                                                          \
4941   widen_texels_16bpp(texels_low);                                              \
4942   add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
4943                                                                                \
4944   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr);          \
4945                                                                                \
4946   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);      \
4947   widen_texels_16bpp(texels_high);                                             \
4948                                                                                \
4949   add fb_ptr_tmp, fb_ptr, #8*2;                                                \
4950   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);      \
4951                                                                                \
4952   add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
4953   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)       \
4954
/*
 * Write four block entries for the texel bytes held in texels.  The bytes are
 * widened once (every byte doubled) into texels_wide_low / texels_wide_high,
 * and each half is written twice:
 *   texels_wide_low   -> fb_ptr            and fb_ptr + 1024 * 2
 *   texels_wide_high  -> fb_ptr + 8 * 2    and fb_ptr + 8 * 2 + 1024 * 2
 * The first pair is tagged with draw_mask_fb_ptr_a_, the second with
 * draw_mask_fb_ptr_b_.  fb_ptr_tmp is a scratch register; fb_ptr itself is
 * not modified.  block must already carry the 16-byte bias that
 * write_block_8bpp() expects, and advances by four entries.
 */
4955 #define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,              \
4956  draw_mask_fb_ptr_b_)                                                          \
4957   widen_texels_8bpp(texels);                                                   \
4958   add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
4959                                                                                \
4960   write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr);       \
4961   write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);   \
4962                                                                                \
4963   add fb_ptr_tmp, fb_ptr, #8*2;                                                \
4964   write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);  \
4965                                                                                \
4966   add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
4967   write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)   \
4968
4969
/*
 * One-time setup for the 4bpp path: fetch the CLUT pointer from psx_gpu, load
 * clut_a / clut_b from it (128-bit aligned access) and de-interleave them
 * byte-wise, leaving the even-numbered bytes in clut_a and the odd-numbered
 * bytes in clut_b.
 * NOTE(review): presumably this splits 16-bit CLUT entries into the low-byte
 * and high-byte tables used by the vtbl lookups in the 4bpp tile macros;
 * the register aliases are defined elsewhere, so confirm there.
 */
4970 #define setup_sprite_tiled_initialize_4bpp_4x()                                \
4971   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset];                           \
4972   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128];                               \
4973                                                                                \
4974   vuzp.u8 clut_a, clut_b                                                       \
4975
/* The 8bpp path needs no setup: expands to nothing. */
4976 #define setup_sprite_tiled_initialize_8bpp_4x()                                \
4977
4978
/*
 * Operand fragments, not complete instructions: each expands to a shifted
 * register operand, sub_tile_height * 4 for "single" and sub_tile_height * 8
 * for "double".  This matches the tile macros, which emit four block entries
 * per do_texture_block_*_4x() call and make one (half) or two (full) such
 * calls per loop pass.
 * NOTE(review): presumably consumed by setup_sprite_tile_add_blocks(), which
 * is defined outside this part of the file.
 */
4979 #define setup_sprite_block_count_single_4x()                                   \
4980   sub_tile_height, lsl #2                                                      \
4981
4982 #define setup_sprite_block_count_double_4x()                                   \
4983   sub_tile_height, lsl #(1+2)                                                  \
4984
/*
 * Emit 4x block entries for a tile column in which both 8-byte halves of
 * every texture row are drawn, going through the CLUT tables.  edge is not
 * used.  Each pass of loop 4 handles one texture row:
 *   - load 8 texel bytes from texture_page_ptr + (texture_offset &
 *     texture_mask), look them up in the clut_low / clut_high tables with
 *     vtbl and zip the two byte streams together;
 *   - do_texture_block_16bpp_4x() writes four entries for them, tagged with
 *     the draw_mask_fb_ptr_left_a / _left_b registers;
 *   - the same is done for the 8 bytes at texture_offset + 8, with fb_ptr
 *     moved on by 16 * 2 bytes and the draw_mask_fb_ptr_right_a / _right_b
 *     registers;
 *   - texture_offset += 0x10, and fb_ptr ends the pass 2048 * 2 bytes beyond
 *     where it started.
 * sub_tile_height is the pass counter; it is tested at the bottom of the
 * loop, so it must be non-zero on entry.  column_data is kept on the stack
 * while the loop runs because fb_ptr2, the scratch pointer handed to
 * do_texture_block_16bpp_4x(), is the same register.  After the loop
 * texture_offset gets a further 0xF00 added and num_blocks is stored back
 * into psx_gpu.
 * NOTE(review): setup_sprite_tile_add_blocks() is defined elsewhere;
 * presumably it accounts for the sub_tile_height * 8 entries written here.
 */
4985 #define setup_sprite_tile_full_4bpp_4x(edge)                                   \
4986   setup_sprite_tile_add_blocks(double_4x);                                     \
4987   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
4988                                                                                \
4989  4:                                                                            \
4990   and texture_block_ptr, texture_offset, texture_mask;                         \
4991   pld [fb_ptr];                                                                \
4992                                                                                \
4993   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4994   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4995                                                                                \
4996   add texture_block_ptr, texture_offset, #8;                                   \
4997   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4998                                                                                \
4999   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
5000   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5001                                                                                \
5002   vzip.8 texels_low, texels_high;                                              \
5003   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                  \
5004    draw_mask_fb_ptr_left_b);                                                   \
5005                                                                                \
5006   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5007   pld [fb_ptr, #2048];                                                         \
5008                                                                                \
5009   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5010   add fb_ptr, fb_ptr, #16*2;                                                   \
5011                                                                                \
5012   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
5013   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5014                                                                                \
5015   vzip.8 texels_low, texels_high;                                              \
5016   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                 \
5017    draw_mask_fb_ptr_right_b);                                                  \
5018                                                                                \
5019   add texture_offset, texture_offset, #0x10;                                   \
5020   add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
5021                                                                                \
5022   subs sub_tile_height, sub_tile_height, #1;                                   \
5023   bne 4b;                                                                      \
5024                                                                                \
5025   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5026   add texture_offset, texture_offset, #0xF00;                                  \
5027   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5028
5029
/*
 * Emit 4x block entries for a tile column in which only one 8-byte half of
 * every texture row is drawn, going through the CLUT tables.  edge (left or
 * right) only selects which draw-mask register pair is used,
 * draw_mask_fb_ptr_<edge>_a / _b; nothing in this macro offsets
 * texture_offset or fb_ptr for the chosen half, so they are used as they
 * stand on entry.  Each pass of loop 4 handles one texture row:
 *   - load 8 texel bytes from texture_page_ptr + (texture_offset &
 *     texture_mask); texture_block_ptr is not needed again after the load
 *     and is recomputed at the top of the next pass;
 *   - look the bytes up in the clut_low / clut_high tables with vtbl and zip
 *     the two byte streams together;
 *   - do_texture_block_16bpp_4x() writes four entries for them;
 *   - texture_offset += 0x10 and fb_ptr += 2048 * 2.
 * sub_tile_height is the pass counter; it is tested at the bottom of the
 * loop, so it must be non-zero on entry.  column_data is kept on the stack
 * while the loop runs because fb_ptr2, the scratch pointer handed to
 * do_texture_block_16bpp_4x(), is the same register.  After the loop
 * texture_offset gets a further 0xF00 added and num_blocks is stored back
 * into psx_gpu.
 * NOTE(review): setup_sprite_tile_add_blocks() is defined elsewhere;
 * presumably it accounts for the sub_tile_height * 4 entries written here.
 */
5030 #define setup_sprite_tile_half_4bpp_4x(edge)                                   \
5031   setup_sprite_tile_add_blocks(single_4x);                                     \
5032   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5033                                                                                \
5034  4:                                                                            \
5035   and texture_block_ptr, texture_offset, texture_mask;                         \
5036   pld [fb_ptr];                                                                \
5037                                                                                \
5038   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5039   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5040                                                                                \
5042   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
5043                                                                                \
5044   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5045   add texture_offset, texture_offset, #0x10;                                   \
5046                                                                                \
5047   vzip.8 texels_low, texels_high;                                              \
5048   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,              \
5049    draw_mask_fb_ptr_##edge##_b);                                               \
5050                                                                                \
5051   pld [fb_ptr, #2048];                                                         \
5052   add fb_ptr, fb_ptr, #2048 * 2;                                               \
5053                                                                                \
5054   subs sub_tile_height, sub_tile_height, #1;                                   \
5055   bne 4b;                                                                      \
5056                                                                                \
5057   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5058   add texture_offset, texture_offset, #0xF00;                                  \
5059   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5060
5061
/*
 * Emit 4x block entries for a tile column in which both 8-byte halves of
 * every texture row are drawn, storing the texel bytes directly (no CLUT
 * lookup).  edge is not used.  block is biased by +16 for the duration of
 * the loop, as write_block_8bpp() requires, and the bias is removed
 * afterwards.  Each pass of loop 4 handles one texture row:
 *   - load 8 texel bytes from texture_page_ptr + (texture_offset &
 *     texture_mask) and let do_texture_block_8bpp_4x() write four entries,
 *     tagged with the draw_mask_fb_ptr_left_a / _left_b registers;
 *   - do the same for the 8 bytes at texture_offset + 8, with fb_ptr moved on
 *     by 16 * 2 bytes and the draw_mask_fb_ptr_right_a / _right_b registers;
 *   - texture_offset += 0x10, and fb_ptr ends the pass 2048 * 2 bytes beyond
 *     where it started.
 * sub_tile_height is the pass counter; it is tested at the bottom of the
 * loop, so it must be non-zero on entry.  column_data is kept on the stack
 * while the loop runs because fb_ptr2 is the same register.  After the loop
 * texture_offset gets a further 0xF00 added and num_blocks is stored back
 * into psx_gpu.
 */
5062 #define setup_sprite_tile_full_8bpp_4x(edge)                                   \
5063   setup_sprite_tile_add_blocks(double_4x);                                     \
5064   add block, block, #16;                                                       \
5065   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5066                                                                                \
5067  4:                                                                            \
5068   and texture_block_ptr, texture_offset, texture_mask;                         \
5069   pld [fb_ptr];                                                                \
5070                                                                                \
5071   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5072   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5073                                                                                \
5074   add texture_block_ptr, texture_offset, #8;                                   \
5075   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                   \
5076    draw_mask_fb_ptr_left_b);                                                   \
5077                                                                                \
5078   pld [fb_ptr, #2048];                                                         \
5079   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
5080                                                                                \
5081   add fb_ptr, fb_ptr, #16*2;                                                   \
5082   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5083                                                                                \
5084   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5085                                                                                \
5086   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                  \
5087    draw_mask_fb_ptr_right_b);                                                  \
5088                                                                                \
5089   add texture_offset, texture_offset, #0x10;                                   \
5090   add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
5091                                                                                \
5092   subs sub_tile_height, sub_tile_height, #1;                                   \
5093   bne 4b;                                                                      \
5094                                                                                \
5095   sub block, block, #16;                                                       \
5096   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5097   add texture_offset, texture_offset, #0xF00;                                  \
5098   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5099
5100   
/*
 * Emit 4x block entries for a tile column in which only one 8-byte half of
 * every texture row is drawn, storing the texel bytes directly (no CLUT
 * lookup).  edge (left or right) only selects which draw-mask register pair
 * is used, draw_mask_fb_ptr_<edge>_a / _b; nothing in this macro offsets
 * texture_offset or fb_ptr for the chosen half.  block is biased by +16 for
 * the duration of the loop, as write_block_8bpp() requires, and the bias is
 * removed afterwards.  Each pass of loop 4 loads 8 texel bytes from
 * texture_page_ptr + (texture_offset & texture_mask), lets
 * do_texture_block_8bpp_4x() write four entries, then adds 0x10 to
 * texture_offset and 2048 * 2 to fb_ptr.
 * sub_tile_height is the pass counter; it is tested at the bottom of the
 * loop, so it must be non-zero on entry.  column_data is kept on the stack
 * while the loop runs because fb_ptr2 is the same register.  After the loop
 * texture_offset gets a further 0xF00 added and num_blocks is stored back
 * into psx_gpu.
 */
5101 #define setup_sprite_tile_half_8bpp_4x(edge)                                   \
5102   setup_sprite_tile_add_blocks(single_4x);                                     \
5103   add block, block, #16;                                                       \
5104   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5105                                                                                \
5106  4:                                                                            \
5107   and texture_block_ptr, texture_offset, texture_mask;                         \
5108   pld [fb_ptr];                                                                \
5109                                                                                \
5110   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5111   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5112                                                                                \
5113   pld [fb_ptr, #2048];                                                         \
5114   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,               \
5115    draw_mask_fb_ptr_##edge##_b);                                               \
5116                                                                                \
5117   add texture_offset, texture_offset, #0x10;                                   \
5118   add fb_ptr, fb_ptr, #2048 * 2;                                               \
5119                                                                                \
5120   subs sub_tile_height, sub_tile_height, #1;                                   \
5121   bne 4b;                                                                      \
5122                                                                                \
5123   sub block, block, #16;                                                       \
5124   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5125   add texture_offset, texture_offset, #0xF00;                                  \
5126   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5127
5128  
/*
 * Per-column hooks for 4x mode that set texture_offset (and, for a right
 * half, shift fb_ptr) before a column is drawn, and undo the fb_ptr shift
 * afterwards.  The (edge) forms dispatch on their argument by token pasting.
 */

/* Right half: start 8 bytes into the texture row and 16 * 2 bytes into fb. */
5129 #define setup_sprite_tile_column_edge_pre_adjust_half_right_4x()               \
5130   add texture_offset, texture_offset_base, #8;                                 \
5131   add fb_ptr, fb_ptr, #16 * 2                                                  \
5132
/* Left half: start at the column base; fb_ptr is left alone. */
5133 #define setup_sprite_tile_column_edge_pre_adjust_half_left_4x()                \
5134   mov texture_offset, texture_offset_base                                      \
5135
/* Select the left or right half pre-adjust from edge. */
5136 #define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge)                 \
5137   setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x()                  \
5138
/* Full column: start at the column base; edge is ignored. */
5139 #define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge)                 \
5140   mov texture_offset, texture_offset_base                                      \
5141
/* Right half: take back the 16 * 2 bytes added to fb_ptr by the pre-adjust. */
5142 #define setup_sprite_tile_column_edge_post_adjust_half_right_4x()              \
5143   sub fb_ptr, fb_ptr, #16 * 2                                                  \
5144
/* Left half: nothing to undo, expands to nothing. */
5145 #define setup_sprite_tile_column_edge_post_adjust_half_left_4x()               \
5146
/* Select the left or right half post-adjust from edge. */
5147 #define setup_sprite_tile_column_edge_post_adjust_half_4x(edge)                \
5148   setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x()                 \
5149
/* Full column: nothing to undo, expands to nothing; edge is ignored. */
5150 #define setup_sprite_tile_column_edge_post_adjust_full_4x(edge)                \
5151
5152
/*
 * Draw-mask setup for 4x mode.  block_masks is filled by the builder with
 * left_block_mask in its low 32 bits and right_block_mask in its high 32
 * bits; each macro below broadcasts four consecutive bytes of it, one byte
 * per draw-mask register, across all byte lanes of that register.
 */

/* Load the four draw-mask registers from bytes 0-3 (the left block mask). */
5153 #define setup_sprite_setup_left_draw_mask_fb_ptr_4x()                          \
5154   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
5155   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
5156   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
5157   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
5158
/*
 * As above, and also computes the step that takes fb_ptr from the end of one
 * tile column to the start of the next:
 *   fb_ptr_advance_column = 32 * 2 - (height << 12)
 * The shift amount is written as the expression "11 + 1".  The tile macros
 * move fb_ptr by 2048 * 2 bytes per texture row, which is the same 1 << 12.
 */
5159 #define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x()           \
5160   mov fb_ptr_advance_column, #32 * 2;                                          \
5161   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
5162   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
5163   sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1;       \
5164   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
5165   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
5166
/* Load the four draw-mask registers from bytes 4-7 (the right block mask). */
5167 #define setup_sprite_setup_right_draw_mask_fb_ptr_4x()                         \
5168   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4];                             \
5169   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5];                             \
5170   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6];                            \
5171   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7]                             \
5172
5173
5174 // r0: psx_gpu
5175 // r1: x
5176 // r2: y
5177 // r3: u
5178 // [sp]: v
5179 // [sp + 4]: width
5180 // [sp + 8]: height
5181 // [sp + 12]: color (unused)
5182
5183 #define setup_sprite_tiled_builder(texture_mode, x4mode)                       \
5184                                                                                \
5185 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, full,        \
5186   x4mode);                                                                     \
5187 setup_sprite_tile_column_width_single(texture_mode, multi,  full, none,        \
5188   x4mode);                                                                     \
5189 setup_sprite_tile_column_width_multi(texture_mode,  single, full, full,        \
5190   x4mode);                                                                     \
5191 setup_sprite_tile_column_width_single(texture_mode, single, full, none,        \
5192   x4mode);                                                                     \
5193 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, full,        \
5194   x4mode);                                                                     \
5195 setup_sprite_tile_column_width_single(texture_mode, multi,  half, right,       \
5196   x4mode);                                                                     \
5197 setup_sprite_tile_column_width_multi(texture_mode,  single, half, full,        \
5198   x4mode);                                                                     \
5199 setup_sprite_tile_column_width_single(texture_mode, single, half, right,       \
5200   x4mode);                                                                     \
5201 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, half,        \
5202   x4mode);                                                                     \
5203 setup_sprite_tile_column_width_single(texture_mode, multi,  half, left,        \
5204   x4mode);                                                                     \
5205 setup_sprite_tile_column_width_multi(texture_mode,  single, full, half,        \
5206   x4mode);                                                                     \
5207 setup_sprite_tile_column_width_single(texture_mode, single, half, left,        \
5208   x4mode);                                                                     \
5209 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, half,        \
5210   x4mode);                                                                     \
5211 setup_sprite_tile_column_width_multi(texture_mode,  single, half, half,        \
5212   x4mode);                                                                     \
5213                                                                                \
5214 .align 4;                                                                      \
5215                                                                                \
5216 function(setup_sprite_##texture_mode##x4mode)                                  \
5217   stmdb sp!, { r4 - r11, r14 };                                                \
5218   setup_sprite_tiled_initialize_##texture_mode##x4mode();                      \
5219                                                                                \
5220   ldr v, [sp, #36];                                                            \
5221   and offset_u, u, #0xF;                                                       \
5222                                                                                \
5223   ldr width, [sp, #40];                                                        \
5224   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
5225                                                                                \
5226   ldr height, [sp, #44];                                                       \
5227   add fb_ptr, fb_ptr, y, lsl #11;                                              \
5228                                                                                \
5229   save_abi_regs();                                                             \
5230                                                                                \
5231   add fb_ptr, fb_ptr, x, lsl #1;                                               \
5232   and offset_v, v, #0xF;                                                       \
5233                                                                                \
5234   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
5235   add width_rounded, offset_u, width;                                          \
5236                                                                                \
5237   add height_rounded, offset_v, height;                                        \
5238   add width_rounded, width_rounded, #15;                                       \
5239                                                                                \
5240   add height_rounded, height_rounded, #15;                                     \
5241   mov tile_width, width_rounded, lsr #4;                                       \
5242                                                                                \
5243   /* texture_offset_base = VH-VL-00-00                                       */\
5244   mov texture_offset_base, v, lsl #8;                                          \
5245   and offset_u_right, width_rounded, #0xF;                                     \
5246                                                                                \
5247   /* texture_offset_base = VH-UH-UL-00                                       */\
5248   bfi texture_offset_base, u, #4, #8;                                          \
5249   mov right_block_mask, #0xFFFFFFFE;                                           \
5250                                                                                \
5251   setup_sprite_offset_u_adjust##x4mode();                                      \
5252                                                                                \
5253   /* texture_offset_base = VH-UH-VL-00                                       */\
5254   bfi texture_offset_base, v, #4, #4;                                          \
5255   mov left_block_mask, #0xFFFFFFFF;                                            \
5256                                                                                \
5257   mov tile_height, height_rounded, lsr #4;                                     \
5258   mvn left_block_mask, left_block_mask, lsl offset_u;                          \
5259                                                                                \
5260   /* texture_mask = HH-HL-WH-WL                                              */\
5261   ldrh texture_mask, [psx_gpu, #psx_gpu_texture_mask_width_offset];            \
5262   mov right_block_mask, right_block_mask, lsl offset_u_right;                  \
5263                                                                                \
5264   /* texture_mask_rev = WH-WL-HH-HL                                          */\
5265   rev16 texture_mask_rev, texture_mask;                                        \
5266   vmov block_masks, left_block_mask, right_block_mask;                         \
5267                                                                                \
5268   /* texture_mask = HH-HL-HL-WL                                              */\
5269   bfi texture_mask, texture_mask_rev, #4, #4;                                  \
5270   /* texture_mask_rev = 00-00-00-WH                                          */\
5271   mov texture_mask_rev, texture_mask_rev, lsr #12;                             \
5272                                                                                \
5273   /* texture_mask = HH-WH-HL-WL                                              */\
5274   bfi texture_mask, texture_mask_rev, #8, #4;                                  \
5275   setup_sprite_get_left_block_mask##x4mode();                                  \
5276                                                                                \
5277   mov control_mask, #0;                                                        \
5278   setup_sprite_compare_left_block_mask##x4mode();                              \
5279                                                                                \
5280   setup_sprite_get_right_block_mask##x4mode();                                 \
5281   orreq control_mask, control_mask, #0x4;                                      \
5282                                                                                \
5283   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
5284   setup_sprite_compare_right_block_mask##x4mode();                             \
5285                                                                                \
5286   orreq control_mask, control_mask, #0x8;                                      \
5287   cmp tile_width, #1;                                                          \
5288                                                                                \
5289   add block, psx_gpu, #psx_gpu_blocks_offset;                                  \
5290   orreq control_mask, control_mask, #0x1;                                      \
5291                                                                                \
5292   cmp tile_height, #1;                                                         \
5293   add block, block, num_blocks, lsl #6;                                        \
5294                                                                                \
5295   orreq control_mask, control_mask, #0x2;                                      \
5296   JT_OP_REL(9f, control_mask, temp);                                           \
5297   JT_OP(ldr pc, [pc, control_mask, lsl #2]);                                   \
5298   nop;                                                                         \
5299                                                                                \
5300  9:                                                                            \
5301  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode);   \
5302  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode);  \
5303  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode);  \
5304  .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
5305  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode);   \
5306  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
5307  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode);  \
5308  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
5309  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode);   \
5310  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode);  \
5311  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode);  \
5312  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
5313  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode);   \
5314  .word 0x00000000;                                                             \
5315  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode);  \
5316
5317
/* Expand setup_sprite_tiled_builder for the 4bpp and 8bpp texture modes with
 * an empty x4mode argument, so no suffix is pasted onto the helper macro and
 * jump table target names. */
5318 setup_sprite_tiled_builder(4bpp,);
5319 setup_sprite_tiled_builder(8bpp,);
5320
/* These two aliases are dropped before the _4x expansions.
 * NOTE(review): presumably the _4x helper macros do not use them - confirm
 * against the macro definitions earlier in the file. */
5321 #undef draw_mask_fb_ptr_left
5322 #undef draw_mask_fb_ptr_right
5323
/* Same builder again; this time _4x is pasted onto every helper macro name and
 * every jump table target. */
5324 setup_sprite_tiled_builder(4bpp, _4x);
5325 setup_sprite_tiled_builder(8bpp, _4x);
5326
5327
/*
 * Register aliases for texture_sprite_blocks_8bpp.
 * psx_gpu and block_ptr are both r0, and every texels_NM name reuses the
 * register of its even numbered texel (r6 - r9).
 */
5328 #undef block_ptr
5329 #undef num_blocks
5330 #undef clut_ptr
5331
5332 #define psx_gpu                                           r0
5333 #define block_ptr                                         r0
5334 #define num_blocks                                        r1
5335 #define clut_ptr                                          r2
5336 #define texel_shift_mask                                  r3
5337 #define block_pixels_a                                    r4
5338 #define block_pixels_b                                    r5
5339 #define texel_0                                           r6
5340 #define texel_2                                           r7
5341 #define texel_4                                           r8
5342 #define texel_6                                           r9
5343 #define texel_1                                           r10
5344 #define texel_3                                           r11
5345 #define texel_5                                           r12
5346 #define texel_7                                           r14
5347 #define texels_01                                         r6
5348 #define texels_23                                         r7
5349 #define texels_45                                         r8
5350 #define texels_67                                         r9
5351
/*
 * texture_sprite_blocks_8bpp
 *
 * In:       r0 = psx_gpu
 * Clobbers: r0 - r3, r12, flags (r4 - r11 and the return address are
 *           saved and restored on the stack)
 *
 * Walks num_blocks entries of 64 bytes each, starting at
 * psx_gpu + psx_gpu_blocks_offset. For every entry the eight 8-bit indices
 * held in bytes 16..23 are each looked up as a halfword in the table at
 * clut_ptr, and the eight 16-bit results are written to bytes 0..15.
 *
 * The loop tests its counter after the body, so the count read from psx_gpu
 * has to be non-zero. The first index word of the following entry is loaded
 * before that test, which means the final iteration also reads 4 bytes at
 * offset 16 of the entry after the last one (the value is not used).
 */
5352 function(texture_sprite_blocks_8bpp)
5353   stmdb sp!, { r4 - r11, r14 }
  /* 0xFF << 1 selects a single index that has already been doubled, i.e. a
   * byte offset into a table of halfwords. */
5354   movw texel_shift_mask, #(0xFF << 1)
5355
5356   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5357   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
5358
  /* From here on r0 is the block cursor, no longer psx_gpu. */
5359   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
5360   ldr block_pixels_a, [block_ptr, #16]
5361
5362  0:
  /* Shifts of lsl #1, lsr #7, lsr #15 and lsr #23 followed by the mask give
   * byte 0, 1, 2 and 3 of the word, each multiplied by 2. */
5363   and texel_0, texel_shift_mask, block_pixels_a, lsl #1
5364   ldr block_pixels_b, [block_ptr, #20]
5365
5366   and texel_1, texel_shift_mask, block_pixels_a, lsr #7
5367   ldrh texel_0, [clut_ptr, texel_0]
5368
5369   and texel_2, texel_shift_mask, block_pixels_a, lsr #15
5370   ldrh texel_1, [clut_ptr, texel_1]
5371
5372   and texel_3, texel_shift_mask, block_pixels_a, lsr #23
  /* block_pixels_a is fully consumed: preload indices 0..3 of the next entry. */
5373   ldr block_pixels_a, [block_ptr, #(64 + 16)]
5374
5375   ldrh texel_2, [clut_ptr, texel_2]
5376   and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5377
5378   ldrh texel_3, [clut_ptr, texel_3]
5379   and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5380
5381   ldrh texel_4, [clut_ptr, texel_4]
5382   and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5383
5384   ldrh texel_5, [clut_ptr, texel_5]
5385   and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5386
  /* Pack two looked up halfwords per word, lower numbered texel in the low
   * half. */
5387   ldrh texel_6, [clut_ptr, texel_6]
5388   orr texels_01, texel_0, texel_1, lsl #16
5389
5390   ldrh texel_7, [clut_ptr, texel_7]
5391   orr texels_23, texel_2, texel_3, lsl #16
5392
5393   orr texels_45, texel_4, texel_5, lsl #16
5394   str texels_01, [block_ptr, #0]
5395
5396   orr texels_67, texel_6, texel_7, lsl #16
5397   str texels_23, [block_ptr, #4]
5398
  /* The flags set here are consumed by the bne below; the str and add in
   * between are the forms that leave the flags alone. */
5399   subs num_blocks, num_blocks, #1
5400   str texels_45, [block_ptr, #8]
5401
5402   str texels_67, [block_ptr, #12]
5403   add block_ptr, block_ptr, #64
5404
5405   bne 0b
5406
5407   ldmia sp!, { r4 - r11, pc }
5408
5409
/*
 * Register aliases used by setup_sprites_16bpp_flush, setup_sprite_16bpp and
 * setup_sprite_16bpp_4x.
 *
 * Many names map to the same register, so a later name may only be written
 * once the earlier value is dead:
 *   r1  x, texture_offset_base
 *   r2  y, texture_mask, texture_mask_width
 *   r3  u, texture_page_ptr, texture_mask_height
 *   r4  v, num_blocks, left_mask_bits
 *   r5  width, block, right_mask_bits
 *   r8  left_offset, texture_offset
 *   r9  width_rounded, blocks_remaining
 *   r10 right_width, fb_ptr2
 */
5410 #undef width_rounded
5411 #undef texture_mask
5412 #undef num_blocks
5413 #undef texture_offset
5414 #undef texels_low
5415 #undef texels_high
5416 #undef texels_wide_low
5417 #undef texels_wide_high
5418 #undef texels_wide
5419 #undef fb_ptr2
5420 #undef temp
5421
5422 #define psx_gpu                                           r0
5423 #define x                                                 r1
5424 #define y                                                 r2
5425 #define u                                                 r3
5426 #define v                                                 r4
5427 #define width                                             r5
5428 #define height                                            r6
5429 #define left_offset                                       r8
5430 #define width_rounded                                     r9
5431 #define right_width                                       r10
5432
5433 #define block_width                                       r11
5434
5435 #define texture_offset_base                               r1
5436 #define texture_mask                                      r2
5437 #define texture_page_ptr                                  r3
5438 #define num_blocks                                        r4
5439 #define block                                             r5
5440 #define fb_ptr                                            r7
5441 #define texture_offset                                    r8
5442 #define blocks_remaining                                  r9
5443 #define fb_ptr2                                           r10
5444 #define fb_ptr_pitch                                      r12
5445 #define texture_block_ptr                                 r14
5446
5447 #define texture_mask_width                                r2
5448 #define texture_mask_height                               r3
5449 #define left_mask_bits                                    r4
5450 #define right_mask_bits                                   r5
5451
5452
5453 #undef block_masks
5454 #undef block_masks_shifted
5455 #undef texels
5456
/* NEON aliases. draw_mask_fb_ptr and draw_mask_fb_ptr_a are both d2; texels
 * (q2) is the pair texels_low/texels_high (d4/d5) and texels_wide (q3) is the
 * pair texels_wide_low/texels_wide_high (d6/d7). */
5457 #define block_masks                                       d0
5458 #define block_masks_shifted                               d1
5459 #define draw_mask_fb_ptr                                  d2
5460 #define texels                                            q2
5461
5462 #define draw_mask_fb_ptr_a                                d2
5463 #define draw_mask_fb_ptr_b                                d3
5464 #define texels_low                                        d4
5465 #define texels_high                                       d5
5466 #define texels_wide_low                                   d6
5467 #define texels_wide_high                                  d7
5468 #define texels_wide                                       q3
5469
5470
/*
 * setup_sprites_16bpp_flush - local helper, entered with a conditional bl
 * from the sprite setup routines once num_blocks has gone above MAX_BLOCKS.
 *
 * Calls flush_render_block_buffer while keeping d0 - d3, r0 - r3, r12 and lr
 * (plus whatever EXTRA_UNSAVED_REGS expands to; it is defined outside this
 * section) intact, then restarts the caller's block cursor:
 *   block      = psx_gpu + psx_gpu_blocks_offset
 *   num_blocks = block_width
 *
 * Nothing here saves the condition flags or any NEON register other than
 * d0 - d3, so callers must not rely on those across the call.
 */
5471 setup_sprites_16bpp_flush:
5472   vpush { d0 - d3 }
5473
5474   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
5475   bl flush_render_block_buffer
5476   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
5477
5478   vpop { d0 - d3 }
5479
  /* Start filling from the first block again; the row the caller is about to
   * emit is already accounted for in num_blocks. */
5480   add block, psx_gpu, #psx_gpu_blocks_offset
5481   mov num_blocks, block_width
5482
5483   bx lr
5484
/*
 * setup_sprite_16bpp
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u
 *      stack arguments, in order: v, width, height
 *
 * Appends 64-byte block entries after the num_blocks entries already present
 * at psx_gpu + psx_gpu_blocks_offset, one entry per 16 bytes of texture data
 * per sprite row, and writes the updated count back to psx_gpu after each
 * row. An entry written here holds
 *   bytes  0..15  16 bytes loaded from the texture page
 *   bytes 40..43  one mask byte replicated four times
 *   bytes 44..47  the frame buffer address for the entry
 *
 * Derived values:
 *   left_offset   = u & 7
 *   width_rounded = width + 7 + left_offset
 *   block_width   = width_rounded >> 3       entries per row
 *   right_width   = width_rounded & 7
 *   left mask     = ~(0xFF << left_offset)   low byte is used
 *   right mask    = 0xFE << right_width      low byte is used
 *   fb_ptr        = vram_out_ptr + y * 2048 + x * 2 - left_offset * 2
 *   texture offs  = (u * 2 + v * 2048) & ~0xF, ANDed with texture_mask
 *                   (texture_mask_width * 2 + texture_mask_height * 2048)
 *                   for every load
 *
 * The first entry of a row carries the left mask, the last one the right
 * mask and the entries in between a zero mask; a row that is a single entry
 * wide carries left | right.
 *
 * Both row loops test height after the body, and the middle-entry loop only
 * stops on an exact zero.
 * NOTE(review): width and height are therefore assumed to be at least 1 -
 * confirm against the callers.
 */
5485 function(setup_sprite_16bpp)
5486   stmdb sp!, { r4 - r11, r14 }
5487   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5488
  /* Nine registers (36 bytes) were pushed, so the first stack argument is at
   * sp + 36. */
5489   ldr v, [sp, #36]
5490   add fb_ptr, fb_ptr, y, lsl #11
5491
5492   ldr width, [sp, #40]
5493   add fb_ptr, fb_ptr, x, lsl #1
5494
5495   ldr height, [sp, #44]
5496   and left_offset, u, #0x7
5497
  /* x (r1) is dead after the fb_ptr computation; r1 becomes
   * texture_offset_base. */
5498   add texture_offset_base, u, u
5499   add width_rounded, width, #7
5500
5501   add texture_offset_base, texture_offset_base, v, lsl #11
5502   mov left_mask_bits, #0xFF
5503   
5504   ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
5505   add width_rounded, width_rounded, left_offset
5506
5507   ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
  /* Move the frame buffer pointer back by left_offset 16-bit elements, so it
   * lines up with the 16-byte aligned texture block. */
5508   sub fb_ptr, fb_ptr, left_offset, lsl #1
5509
5510   add texture_mask, texture_mask_width, texture_mask_width
5511   mov right_mask_bits, #0xFE
5512
5513   and right_width, width_rounded, #0x7
5514   mvn left_mask_bits, left_mask_bits, lsl left_offset
5515
5516   add texture_mask, texture_mask, texture_mask_height, lsl #11
5517   mov block_width, width_rounded, lsr #3
5518
5519   mov right_mask_bits, right_mask_bits, lsl right_width
5520   movw fb_ptr_pitch, #(2048 + 16)
5521
  /* Within a row fb_ptr advances by 16 * (block_width - 1); adding this pitch
   * afterwards makes the total exactly 2048 per row. */
5522   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
  /* d0 = { left mask, right mask }; r4 and r5 are reused right below. */
5523   vmov block_masks, left_mask_bits, right_mask_bits
5524
5525   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5526   add block, psx_gpu, #psx_gpu_blocks_offset
5527
5528   bic texture_offset_base, texture_offset_base, #0xF
  /* The flags of this compare are consumed by the bne three instructions
   * further down. */
5529   cmp block_width, #1
5530
5531   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
5532   add block, block, num_blocks, lsl #6
5533
5534   bne 0f
5535
  /* Single entry per row: swap the two words of d0 and OR them, so lane 0
   * holds left | right, then replicate its low byte. */
5536   vext.32 block_masks_shifted, block_masks, block_masks, #1
5537   vorr.u32 block_masks, block_masks, block_masks_shifted
5538   vdup.u8 draw_mask_fb_ptr, block_masks[0]
5539
5540  1:
5541   add num_blocks, num_blocks, #1
5542   cmp num_blocks, #MAX_BLOCKS
5543   blgt setup_sprites_16bpp_flush
5544
5545   and texture_block_ptr, texture_offset_base, texture_mask
  /* Flags for the bne at the end of this loop; nothing in between sets them. */
5546   subs height, height, #1
5547
5548   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5549   vld1.u32 { texels }, [texture_block_ptr, :128]
5550
5551   vst1.u32 { texels }, [block, :128]
5552   add block, block, #40
5553
  /* Only the high word is replaced; the low word keeps the replicated mask. */
5554   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5555   pld [fb_ptr]
5556
5557   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5558
5559   add block, block, #24
5560   add texture_offset_base, texture_offset_base, #2048
5561   add fb_ptr, fb_ptr, #2048
5562   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5563   bne 1b
5564
5565   ldmia sp!, { r4 - r11, pc }
5566
  /* Rows that are more than one entry wide: the row loop starts here. */
5567  0:
5568   add num_blocks, num_blocks, block_width
5569   mov texture_offset, texture_offset_base
5570
5571   cmp num_blocks, #MAX_BLOCKS
5572   blgt setup_sprites_16bpp_flush
5573
5574   add texture_offset_base, texture_offset_base, #2048
5575   and texture_block_ptr, texture_offset, texture_mask
5576
5577   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5578   vld1.u32 { texels }, [texture_block_ptr, :128]  
5579
5580   vst1.u32 { texels }, [block, :128]
5581   add block, block, #40
5582
  /* First entry of the row: low byte of the left mask. */
5583   vdup.u8 draw_mask_fb_ptr, block_masks[0]
5584   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5585   pld [fb_ptr]
5586
5587   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
  /* Entries between the first and the last one; zero means skip straight to
   * the last entry (beq 2f below). */
5588   subs blocks_remaining, block_width, #2
5589
5590   add texture_offset, texture_offset, #16
5591   add fb_ptr, fb_ptr, #16
5592
  /* Middle entries are stored with a zero mask. */
5593   vmov.u8 draw_mask_fb_ptr, #0
5594
5595   add block, block, #24
5596   beq 2f
5597
5598  1:
5599   and texture_block_ptr, texture_offset, texture_mask
5600   subs blocks_remaining, blocks_remaining, #1
5601
5602   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5603   vld1.u32 { texels }, [texture_block_ptr, :128]
5604
5605   vst1.u32 { texels }, [block, :128]
5606   add block, block, #40
5607
5608   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5609   pld [fb_ptr]
5610
5611   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5612   
5613   add texture_offset, texture_offset, #16
5614   add fb_ptr, fb_ptr, #16
5615
5616   add block, block, #24
5617   bne 1b
5618
5619  2:
5620   and texture_block_ptr, texture_offset, texture_mask
5621   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5622
5623   vld1.u32 { texels }, [texture_block_ptr, :128]
  /* Last entry of the row: byte 4 of d0 is the low byte of the right mask. */
5624   vdup.u8 draw_mask_fb_ptr, block_masks[4]
5625
5626   vst1.u32 { texels }, [block, :128]
5627   add block, block, #40
5628
5629   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5630   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5631   
5632   add block, block, #24
5633   subs height, height, #1
5634
5635   add fb_ptr, fb_ptr, fb_ptr_pitch
5636   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5637
5638   bne 0b
5639
5640   ldmia sp!, { r4 - r11, pc }
5641
5642
5643 // 4x version
5644 // FIXME: duplicate code with normal version :(
5645 #undef draw_mask_fb_ptr
5646
/*
 * setup_sprite_16bpp_4x
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u
 *      stack arguments, in order: v, width, height
 *
 * Same structure as setup_sprite_16bpp, with these differences visible in
 * the code below:
 *   - the masks are 16 bits wide (0xFFFF and 0xFFFC as start values) and are
 *     shifted by twice left_offset / right_width; their low and high bytes
 *     are replicated into draw_mask_fb_ptr_a and draw_mask_fb_ptr_b
 *   - fb_ptr is moved back by left_offset * 4 bytes, advances 32 bytes per
 *     texture block and 2 * 2048 bytes per source row
 *   - every 16-byte texture block counts as 4 entries in num_blocks
 *   - the entries themselves are written by do_texture_block_16bpp_4x, which
 *     is defined outside this section.
 *     NOTE(review): the loops rely on that macro leaving the condition flags
 *     untouched and on it advancing block - confirm in the macro definition.
 *
 * As in the normal version, the row loops test height after the body.
 */
5647 function(setup_sprite_16bpp_4x)
5648   stmdb sp!, { r4 - r11, r14 }
5649   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5650
  /* Nine registers (36 bytes) were pushed, so the first stack argument is at
   * sp + 36. */
5651   ldr v, [sp, #36]
5652   add fb_ptr, fb_ptr, y, lsl #11
5653
5654   ldr width, [sp, #40]
5655   add fb_ptr, fb_ptr, x, lsl #1
5656
5657   ldr height, [sp, #44]
5658   and left_offset, u, #0x7
5659
5660   add texture_offset_base, u, u
5661   add width_rounded, width, #7
5662
5663   add texture_offset_base, texture_offset_base, v, lsl #11
5664   movw left_mask_bits, #0xFFFF
5665   
5666   ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
5667   add width_rounded, width_rounded, left_offset
5668
  /* left_offset is doubled only after it has been added to width_rounded. */
5669   lsl left_offset, #1
5670
5671   ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
5672   sub fb_ptr, fb_ptr, left_offset, lsl #1
5673
5674   add texture_mask, texture_mask_width, texture_mask_width
5675   movw right_mask_bits, #0xFFFC
5676
5677   and right_width, width_rounded, #0x7
5678   mvn left_mask_bits, left_mask_bits, lsl left_offset
5679
5680   lsl right_width, #1
5681
5682   add texture_mask, texture_mask, texture_mask_height, lsl #11
5683   mov block_width, width_rounded, lsr #3
5684
5685   mov right_mask_bits, right_mask_bits, lsl right_width
5686   movw fb_ptr_pitch, #(2048 + 16) * 2
5687
  /* Within a row fb_ptr advances by 32 * (block_width - 1); adding this pitch
   * afterwards makes the total 2 * 2048 per source row. */
5688   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
5689   vmov block_masks, left_mask_bits, right_mask_bits
5690
5691   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5692   add block, psx_gpu, #psx_gpu_blocks_offset
5693
5694   bic texture_offset_base, texture_offset_base, #0xF
  /* Compared while block_width still counts texture blocks; the flags are
   * consumed by the bne below, after the non-flag-setting lsl. */
5695   cmp block_width, #1
5696
5697   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
5698   add block, block, num_blocks, lsl #6
5699
  /* From here on block_width counts entries: 4 per texture block. */
5700   lsl block_width, #2
5701   bne 0f
5702
  /* Single texture block per row: combine both masks as left | right. */
5703   vext.32 block_masks_shifted, block_masks, block_masks, #1
5704   vorr.u32 block_masks, block_masks, block_masks_shifted
5705   vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
5706   vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5707
5708  1:
5709   add num_blocks, num_blocks, block_width
5710   cmp num_blocks, #MAX_BLOCKS
5711   blgt setup_sprites_16bpp_flush
5712
5713   and texture_block_ptr, texture_offset_base, texture_mask
5714   subs height, height, #1
5715
5716   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5717   vld1.u32 { texels }, [texture_block_ptr, :128]
5718
5719   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5720
5721   add texture_offset_base, texture_offset_base, #2048
5722   add fb_ptr, fb_ptr, #2048*2
5723   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5724   bne 1b
5725
5726   ldmia sp!, { r4 - r11, pc }
5727
  /* Rows that are more than one texture block wide: the row loop starts
   * here. */
5728  0:
5729   add num_blocks, num_blocks, block_width
5730   mov texture_offset, texture_offset_base
5731
  /* Set up before the possible flush; the flush helper preserves d0 - d3. */
5732   vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
5733   vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5734
5735   cmp num_blocks, #MAX_BLOCKS
5736   blgt setup_sprites_16bpp_flush
5737
5738   add texture_offset_base, texture_offset_base, #2048
5739   and texture_block_ptr, texture_offset, texture_mask
5740
5741   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5742   vld1.u32 { texels }, [texture_block_ptr, :128]
5743
5744   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5745
  /* Entries that belong to the middle texture blocks (first and last removed,
   * 4 entries each); zero means go straight to the last block. */
5746   subs blocks_remaining, block_width, #2*4
5747   add texture_offset, texture_offset, #16
5748
  /* Middle texture blocks use zero masks. */
5749   vmov.u8 draw_mask_fb_ptr_a, #0
5750   vmov.u8 draw_mask_fb_ptr_b, #0
5751
5752   add fb_ptr, fb_ptr, #16*2
5753   beq 2f
5754
5755  1:
5756   and texture_block_ptr, texture_offset, texture_mask
5757   subs blocks_remaining, blocks_remaining, #4
5758
5759   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5760   vld1.u32 { texels }, [texture_block_ptr, :128]
5761
5762   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5763   add texture_offset, texture_offset, #16
5764
5765   add fb_ptr, fb_ptr, #16*2
5766   bgt 1b
5767
5768  2:
5769   vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
5770   vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
5771
5772   and texture_block_ptr, texture_offset, texture_mask
5773   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5774
5775   vld1.u32 { texels }, [texture_block_ptr, :128]
5776
5777   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5778   subs height, height, #1
5779
5780   add fb_ptr, fb_ptr, fb_ptr_pitch
5781   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5782
5783   bne 0b
5784
5785   ldmia sp!, { r4 - r11, pc }
5786
5787
/*
 * Register aliases for setup_sprite_untextured.
 * Shared registers: r1 x / color_r, r2 y / color_g, r5 right_width / block,
 * r6 right_mask_bits / blocks_remaining, r8 color / color_b,
 * q2 test_mask / draw_mask.
 */
5788 #undef width
5789 #undef right_width
5790 #undef right_mask_bits
5791 #undef color
5792 #undef height
5793 #undef blocks_remaining
5794 #undef colors
5795 #undef right_mask
5796 #undef test_mask
5797 #undef draw_mask
5798
5799 #define psx_gpu                                           r0
5800 #define x                                                 r1
5801 #define y                                                 r2
5802 #define width                                             r3
5803 #define right_width                                       r5
5804 #define right_mask_bits                                   r6
5805 #define fb_ptr                                            r7
5806 #define color                                             r8
5807 #define height                                            r9
5808 #define fb_ptr_pitch                                      r12
5809
5810 // referenced by setup_sprites_16bpp_flush
5811 #define num_blocks                                        r4
5812 #define block                                             r5
5813 #define block_width                                       r11
5814
5815 #define color_r                                           r1
5816 #define color_g                                           r2
5817 #define color_b                                           r8
5818 #define blocks_remaining                                  r6
5819
5820 #define colors                                            q0
5821 #define right_mask                                        q1
5822 #define test_mask                                         q2
5823 #define draw_mask                                         q2
5824 #define draw_mask_bits_fb_ptr                             d6
5825
5826
5827 .align 3
5828
/*
 * setup_sprite_untextured
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y
 *      width, height and color are read from the 2nd, 3rd and 4th stack
 *      argument; the incoming r3 and the 1st stack argument are not read.
 *
 * If none of RENDER_STATE_MASK_EVALUATE, RENDER_FLAGS_MODULATE_TEXELS and
 * RENDER_FLAGS_BLEND is set in the halfword at psx_gpu_render_state_offset
 * and RENDER_INTERLACE_ENABLED is clear in the byte at
 * psx_gpu_render_mode_offset, control is handed to
 * setup_sprite_untextured_simple before anything but r12 has been modified.
 *
 * Otherwise one 64-byte block entry is appended per 8 elements of every
 * sprite row:
 *   bytes  0..15  eight 16-bit mask lanes: zero for all entries except the
 *                 last one of a row, which gets right_mask
 *   bytes 16..31  the colour replicated into eight 16-bit lanes
 *   bytes 40..47  d6: low word, then the frame buffer address
 * The colour is reduced from 8 bits per channel to 5 bits per channel and
 * packed as r | g << 5 | b << 10.
 *
 * NOTE(review): d6 is only cleared when a row is more than one entry wide, and
 * it is not among the registers the flush helper preserves, so the low word
 * stored at byte 40 is not defined for one-entry rows - confirm that nothing
 * reads it.
 */
5829 function(setup_sprite_untextured)
5830   ldrh r12, [psx_gpu, #psx_gpu_render_state_offset]
5831   tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS         \
5832     | RENDER_FLAGS_BLEND)
  /* The next two instructions execute only if the test above gave zero, so the
   * beq is taken only when both tests gave zero. */
5833   ldrbeq r12, [psx_gpu, #psx_gpu_render_mode_offset]
5834   tsteq r12, #RENDER_INTERLACE_ENABLED
5835   beq setup_sprite_untextured_simple
5836
5837   stmdb sp!, { r4 - r11, r14 }
5838
  /* 36 bytes were pushed: sp + 40 is the second stack argument. */
5839   ldr width, [sp, #40]
5840   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5841
5842   ldr height, [sp, #44]
5843   add fb_ptr, fb_ptr, y, lsl #11
5844
5845   add fb_ptr, fb_ptr, x, lsl #1
5846   sub right_width, width, #1
5847
5848   ldr color, [sp, #48]
  /* right_width = ((width - 1) & 7) + 1, i.e. 1..8. */
5849   and right_width, #7
5850
5851   add block_width, width, #7
5852   add right_width, #1
5853
5854   lsr block_width, #3
5855   mov right_mask_bits, #0xff
5856
5857   sub fb_ptr_pitch, block_width, #1
5858   lsl right_mask_bits, right_width
5859
  /* fb_ptr_pitch = 2048 - 16 * (block_width - 1): what is left of a 2048-byte
   * row after the per-entry advances of 16 bytes. */
5860   lsl fb_ptr_pitch, #3+1
5861   ubfx color_r, color, #3, #5
5862
5863   rsb fb_ptr_pitch, #1024*2
5864   ubfx color_g, color, #11, #5
5865
  /* Eight 16-bit lanes from the start of psx_gpu.
   * NOTE(review): presumably one distinct test bit per lane - confirm against
   * the psx_gpu structure layout. */
5866   vld1.u32 { test_mask }, [psx_gpu, :128]
  /* color_b shares r8 with color, so this extraction has to be the last. */
5867   ubfx color_b, color, #19, #5
5868
5869   vdup.u16 right_mask, right_mask_bits
5870   orr color, color_r, color_b, lsl #10
5871
5872   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5873   orr color, color, color_g, lsl #5
5874
  /* Each lane becomes all ones where right_mask_bits and the lane of
   * test_mask have a bit in common, otherwise zero. */
5875   vtst.u16 right_mask, right_mask, test_mask
5876   add block, psx_gpu, #psx_gpu_blocks_offset
5877
5878   vdup.u16 colors, color
5879   add block, block, num_blocks, lsl #6
5880
5881
5882 setup_sprite_untextured_height_loop:
5883   add num_blocks, block_width
5884   sub blocks_remaining, block_width, #1
5885
5886   cmp num_blocks, #MAX_BLOCKS
5887   blgt setup_sprites_16bpp_flush
5888
  /* Rows that are a single entry wide go straight to the right-edge entry. */
5889   cmp blocks_remaining, #0
5890   ble 1f
5891
  /* q2 was test_mask; that value is no longer needed once right_mask has been
   * computed. */
5892   vmov.u8 draw_mask, #0 /* zero_mask */
5893   vmov.u8 draw_mask_bits_fb_ptr, #0
5894
5895  0:
5896   vst1.u32 { draw_mask }, [block, :128]!
5897   subs blocks_remaining, #1
5898
5899   vst1.u32 { colors }, [block, :128]
5900   add block, block, #24
5901
5902   vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5903   vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
5904   
5905   add block, block, #24
5906   add fb_ptr, #8*2
5907   bgt 0b
5908
5909  1:
5910   vst1.u32 { right_mask }, [block, :128]!
  /* Flags for the bgt at the bottom of the row loop. */
5911   subs height, #1
5912
5913   vst1.u32 { colors }, [block, :128]
5914   add block, block, #24
5915
5916   vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5917   vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
5918   
5919   add block, block, #24
5920   add fb_ptr, fb_ptr_pitch
5921
5922   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5923   bgt setup_sprite_untextured_height_loop
5924
5925   ldmia sp!, { r4 - r11, pc }
5926
5927
5928
/*
 * Register aliases for update_texture_4bpp_cache.
 * texel_block_a / texel_block_b are the two halves of q0, which is also
 * texel_block_expanded_d; the _ab and _cd results reuse q2 and q3.
 */
5929 #undef texture_page_ptr
5930 #undef vram_ptr
5931 #undef dirty_textures_mask
5932 #undef current_texture_mask
5933
5934 #define psx_gpu                                           r0
5935 #define current_texture_page                              r1
5936 #define texture_page_ptr                                  r2
5937 #define vram_ptr_a                                        r3
5938 #define current_texture_page_x                            r12
5939 #define current_texture_page_y                            r4
5940 #define dirty_textures_mask                               r5
5941 #define tile_y                                            r6
5942 #define tile_x                                            r7
5943 #define sub_y                                             r8
5944 #define current_texture_mask                              r9
5945 #define c_4096                                            r10
5946 #define vram_ptr_b                                        r11
5947
5948 #define texel_block_a                                     d0
5949 #define texel_block_b                                     d1
5950 #define texel_block_expanded_a                            q1
5951 #define texel_block_expanded_b                            q2
5952 #define texel_block_expanded_ab                           q2
5953 #define texel_block_expanded_c                            q3
5954 #define texel_block_expanded_d                            q0
5955 #define texel_block_expanded_cd                           q3
5956
/*
 * update_texture_4bpp_cache
 *
 * In:  r0 = psx_gpu
 *
 * Clears the bits of current_texture_mask in the 4bpp dirty-texture mask of
 * psx_gpu, then expands one region of VRAM into the buffer at
 * texture_page_base, turning every source byte into two bytes: the low
 * nibble first, then the high nibble, each in the low 4 bits of its byte.
 *
 * The source region starts at
 *   vram_ptr + (page >> 4) * 2^19 + (page & 0xF) * 128
 * with page = current_texture_page, and is 128 bytes wide by 256 rows of
 * 2048 bytes. It is visited as 16 x 16 tiles of 8 bytes by 16 rows, and the
 * output is written contiguously in that tile order (65536 bytes in total).
 *
 * r4 - r11 and q0 - q3 are saved and restored; r1 - r3 and r12 are clobbered.
 */
5957 function(update_texture_4bpp_cache)
5958   stmdb sp!, { r4 - r11, r14 }
5959   vpush { q0 - q3 }
5960
5961   ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
5962
5963   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
5964   ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
5965
5966   and current_texture_page_x, current_texture_page, #0xF
5967   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
5968
5969   mov current_texture_page_y, current_texture_page, lsr #4
5970   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
5971
5972   add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5973   mov tile_y, #16
5974
5975   add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7  
5976   bic dirty_textures_mask, current_texture_mask
5977   
5978   mov tile_x, #16
5979   str dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
5980
5981   mov sub_y, #8
5982   movw c_4096, #4096
5983
  /* vram_ptr_b runs one 2048-byte row below vram_ptr_a; both step by two rows
   * per load. */
5984   add vram_ptr_b, vram_ptr_a, #2048
5985
5986  0:
5987   vld1.u32 { texel_block_a }, [vram_ptr_a, :64], c_4096
5988   vld1.u32 { texel_block_b }, [vram_ptr_b, :64], c_4096
5989
  /* Widen every byte to 16 bits twice: once as is, once shifted left by 4.
   * texel_block_expanded_d overwrites q0, which holds both source halves, so
   * it is produced last. */
5990   vmovl.u8 texel_block_expanded_a, texel_block_a
5991   vshll.u8 texel_block_expanded_b, texel_block_a, #4
5992   vmovl.u8 texel_block_expanded_c, texel_block_b
5993   vshll.u8 texel_block_expanded_d, texel_block_b, #4
5994
  /* Clearing bits 4..7 leaves the low nibble in bits 0..3 of the unshifted
   * copy and the high nibble in bits 8..11 of the shifted copy. */
5995   vbic.u16 texel_block_expanded_a, #0x00F0
5996   vbic.u16 texel_block_expanded_b, #0x00F0
5997   vbic.u16 texel_block_expanded_c, #0x00F0
5998   vbic.u16 texel_block_expanded_d, #0x00F0
5999
6000   vorr.u16 texel_block_expanded_ab, texel_block_expanded_a,                    \
6001    texel_block_expanded_b
6002   vorr.u16 texel_block_expanded_cd, texel_block_expanded_c,                    \
6003    texel_block_expanded_d
6004
6005   vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd },               \
6006    [texture_page_ptr, :256]!
6007
6008   subs sub_y, sub_y, #1
6009   bne 0b
6010
  /* Next tile in this tile row: 8 bytes to the right, back up the 16 rows
   * that were just consumed. */
6011   mov sub_y, #8
6012   add vram_ptr_a, vram_ptr_a, #8
6013   add vram_ptr_b, vram_ptr_b, #8
6014
6015   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6016   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6017
6018   subs tile_x, tile_x, #1
6019   bne 0b
6020
  /* Next tile row: 16 rows down, back to the left edge (16 tiles of 8
   * bytes). */
6021   mov tile_x, #16
6022   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6023   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6024
6025   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6026   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6027
6028   subs tile_y, tile_y, #1
6029   bne 0b
6030
6031   vpop { q0 - q3 }
6032   ldmia sp!, { r4 - r11, pc }
6033
6034
/*
 * Register aliases for update_texture_8bpp_cache_slice.
 * Here texture_page (r1) is the incoming argument and current_texture_page
 * moves to r5.
 */
6035 #undef current_texture_page
6036
6037 #define psx_gpu                                           r0
6038 #define texture_page                                      r1
6039 #define texture_page_ptr                                  r2
6040 #define vram_ptr_a                                        r3
6041 #define texture_page_x                                    r12
6042 #define texture_page_y                                    r4
6043 #define current_texture_page                              r5
6044 #define tile_y                                            r6
6045 #define tile_x                                            r7
6046 #define sub_y                                             r8
6047 #define c_4096                                            r10
6048 #define vram_ptr_b                                        r11
6049
6050
6051 #undef texels_a
6052 #undef texels_b
6053
6054 #define texels_a                                          q0
6055 #define texels_b                                          q1
6056 #define texels_c                                          q2
6057 #define texels_d                                          q3
6058
6059
/*
 * update_texture_8bpp_cache_slice
 *
 * In:  r0 = psx_gpu, r1 = texture_page
 *
 * Copies one region of VRAM, unmodified, into the buffer at
 * texture_page_base. The source starts at
 *   vram_ptr + (texture_page >> 4) * 2^19 + (texture_page & 0xF) * 128
 * and is 128 bytes wide by 256 rows of 2048 bytes, visited as 16 rows of 8
 * tiles, each tile being 16 bytes by 16 rows.
 *
 * Each tile row produces 2048 contiguous output bytes and is followed by a
 * 2048-byte gap that is not written. When bit 0 of texture_page differs from
 * bit 0 of psx_gpu's current_texture_page, the output starts 2048 bytes into
 * the buffer, i.e. in those gaps.
 *
 * r4 - r11 and q0 - q3 are saved and restored; r1 - r3 and r12 are clobbered.
 */
6060 function(update_texture_8bpp_cache_slice)
6061   stmdb sp!, { r4 - r11, r14 }
6062   vpush { q0 - q3 }
6063
6064   ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
6065   ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
6066
6067   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
6068   mov tile_y, #16
6069
6070   and texture_page_x, texture_page, #0xF
6071   mov texture_page_y, texture_page, lsr #4
6072
6073   add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7  
6074   mov tile_x, #8
6075
6076   add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
6077   eor current_texture_page, current_texture_page, texture_page
6078
  /* Non-zero when the two page numbers differ in bit 0; the flags are consumed
   * by the addne below (the mov in between does not set flags). */
6079   ands current_texture_page, current_texture_page, #0x1
6080   mov sub_y, #4
6081
6082   addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6083   movw c_4096, #4096
6084
  /* vram_ptr_b runs one 2048-byte row below vram_ptr_a; both step by two rows
   * per load. */
6085   add vram_ptr_b, vram_ptr_a, #2048
6086
6087  0:
  /* Four consecutive rows of 16 bytes, taken alternately through the two
   * pointers. */
6088   vld1.u32 { texels_a }, [vram_ptr_a, :128], c_4096
6089   vld1.u32 { texels_b }, [vram_ptr_b, :128], c_4096
6090   vld1.u32 { texels_c }, [vram_ptr_a, :128], c_4096
6091   vld1.u32 { texels_d }, [vram_ptr_b, :128], c_4096
6092
6093   vst1.u32 { texels_a, texels_b }, [texture_page_ptr, :256]!
6094   vst1.u32 { texels_c, texels_d }, [texture_page_ptr, :256]!
6095
6096   subs sub_y, sub_y, #1
6097   bne 0b
6098
  /* Next tile in this tile row: 16 bytes to the right, back up the 16 rows
   * that were just consumed. */
6099   mov sub_y, #4
6100
6101   add vram_ptr_a, vram_ptr_a, #16
6102   add vram_ptr_b, vram_ptr_b, #16
6103
6104   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6105   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6106
6107   subs tile_x, tile_x, #1
6108   bne 0b
6109
  /* Next tile row: 16 rows down, back to the left edge (8 tiles of 16
   * bytes). */
6110   mov tile_x, #8
6111
6112   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6113   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6114
6115   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6116   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6117
6118   subs tile_y, tile_y, #1
  /* Skip the 2048 output bytes that this slice does not write; the flags from
   * the subs above stay valid for the bne. */
6119   add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6120
6121   bne 0b
6122
6123   vpop { q0 - q3 }
6124   ldmia sp!, { r4 - r11, pc }
6125
6126
6127 /* void scale2x_tiles8(void *dst, const void *src, int w8, int h) */
/*
 * In:  r0 = dst, r1 = src, r2 = w8, r3 = h
 *
 * Doubles a block of 16-bit elements in both directions. w8 is the row width
 * in groups of 8 elements (16 source bytes, 32 destination bytes) and h the
 * number of source rows. Source rows are 2048 bytes apart; every source row
 * produces two identical destination rows 2048 bytes apart, so dst moves
 * down by 4096 bytes per source row.
 *
 * Two groups are loaded per inner iteration. When w8 is odd the second group
 * of the last iteration is loaded but not stored, so 16 bytes beyond the end
 * of the row are read. Both loops test their counter after the body, so w8
 * and h have to be at least 1.
 *
 * Register use: r4 = start of the current source row, r12 = second
 * destination row, r14 = groups left in the current row. r4 and the return
 * address are saved on the stack; r0 - r3, r12 and q0 - q3 are clobbered.
 */
6128 function(scale2x_tiles8)
6129   push { r4, r14 }
6130
6131   mov r4, r1
6132   add r12, r0, #1024*2
6133   mov r14, r2
6134
6135 0:
6136   vld1.u16 { q0 }, [r1, :128]!
6137   vld1.u16 { q2 }, [r1, :128]!
  /* Interleaving a register with a copy of itself repeats every element
   * twice: q0/q1 and q2/q3 each end up holding 16 elements. */
6138   vmov q1, q0
6139   vmov q3, q2
6140   vzip.16 q0, q1
6141   vzip.16 q2, q3
6142   subs r14, #2
6143   vst1.u16 { q0, q1 }, [r0, :128]!
6144   vst1.u16 { q0, q1 }, [r12, :128]!
  /* Negative: only one group was left, so the second one is not stored. */
6145   blt 1f
6146   vst1.u16 { q2, q3 }, [r0, :128]!
6147   vst1.u16 { q2, q3 }, [r12, :128]!
6148   bgt 0b
6149 1:
  /* Flags for the bgt at the end of the row loop; the instructions in between
   * are the forms that do not set flags. */
6150   subs r3, #1
6151   mov r14, r2
  /* r0 moved forward by w8 * 32 bytes in this row: undo that and go down two
   * destination rows. */
6152   add r0, #1024*2*2
6153   add r4, #1024*2
6154   sub r0, r0, r2, lsl #4+1
6155   mov r1, r4
6156   add r12, r0, #1024*2
6157   bgt 0b
6158   nop
6159
6160   pop { r4, pc }
6161
6162 // vim:filetype=armasm