gpu_neon: try to make the compiler save some callee-save regs
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
1 /*
2  * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
3  * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation; either version 2 of
8  * the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License for more details.
14  */
15
// Capacity limits.
16 #define MAX_SPANS                                         512
17 #define MAX_BLOCKS                                        64
18 #define MAX_BLOCKS_PER_ROW                                128
19
// Bit values for render state / render flag / interlace fields.
// NOTE(review): which psx_gpu fields these bits apply to is inferred from the
// names only - confirm against the C headers.
20 #define RENDER_STATE_MASK_EVALUATE                        0x20
21 #define RENDER_FLAGS_MODULATE_TEXELS                      0x1
22 #define RENDER_FLAGS_BLEND                                0x2
23 #define RENDER_INTERLACE_ENABLED                          0x1
24
25 #include "psx_gpu_offsets.h"
26
// b_dx is addressed 4 bytes past the b_block_span field offset.
27 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
28
// Offsets inside one edge data record.
// NOTE(review): presumably byte offsets of 16-bit fields - confirm.
29 #define edge_data_left_x_offset                           0
30 #define edge_data_num_blocks_offset                       2
31 #define edge_data_right_mask_offset                       4
32 #define edge_data_y_offset                                6
33
34 .syntax unified
35 .text
36
// save_abi_regs() / restore_abi_regs() are invoked right after the core
// register push and right before the core register pop of the routines in
// this file. The "#if 0" variant pushes and pops q4-q7; the variant that is
// actually compiled expands to nothing, so q4-q7 are not preserved here.
// NOTE(review): q4-q7 (d8-d15) are callee-saved under the ARM procedure call
// standard, so callers presumably have to account for that - confirm.
37 #if 0
38 #define save_abi_regs() \
39   vpush {q4-q7}
40 #define restore_abi_regs() \
41   vpop  {q4-q7}
42 #else
43 #define save_abi_regs()
44 #define restore_abi_regs()
45 #endif
46
// Core register aliases used by compute_all_gradients.
// Several aliases share one register, so an alias is only valid until the
// register is reused under another name.

// Incoming arguments.
47 #define psx_gpu                                           r0
48 #define v_a                                               r1
49 #define v_b                                               r2
50 #define v_c                                               r3
51
// Vertex x, y and b values. The packed pair aliases (x0_x1, x1_x2, ...) reuse
// the register of one of the values they are built from, and y2 and b0 both
// live in r9.
52 #define x0                                                r4
53 #define x1                                                r5
54 #define x2                                                r6
55 #define x0_x1                                             r5
56 #define x1_x2                                             r6
57 #define y0                                                r7
58 #define y1                                                r8
59 #define y2                                                r9
60 #define y0_y1                                             r7
61 #define y1_y2                                             r8
62 #define b0                                                r9
63 #define b1                                                r10
64 #define b2                                                r11
65 #define b0_b1                                             r10
66 #define b1_b2                                             r11
67
68
// Scalar copy of the triangle area reciprocal.
69 #define area_r_s                                          r5
70
// b gradient results as stored at the end of the routine.
71 #define g_bx0                                             r2
72 #define g_bx                                              r3
73 #define g_bx2                                             r4
74 #define g_bx3                                             r5
75 #define b_base                                            r6
76 #define g_by                                              r8
77
// Sign masks of the b gradients.
78 #define gs_bx                                             r7
79 #define gs_by                                             r10
80
// Absolute values of the b gradients (same registers as the results).
81 #define ga_bx                                             g_bx
82 #define ga_by                                             g_by
83
// High and low words of the 64-bit b gradient products; both low words share
// r11.
84 #define gw_bx_h                                           g_bx
85 #define gw_by_h                                           g_by
86
87 #define gw_bx_l                                           r11
88 #define gw_by_l                                           gw_bx_l
89
// Output pointers and post-increment; store_a and store_b reuse the psx_gpu
// and v_a argument registers.
90 #define store_a                                           r0
91 #define store_b                                           r1
92 #define store_inc                                         r5
93
94
// NEON register aliases used by compute_all_gradients. A q alias and the two
// d aliases listed under it name the same storage (qN = d(2N):d(2N+1)).

// Raw vertex data: low half holds the uvrgb bytes, high half holds x and y.
95 #define v0                                                q0
96 #define uvrgb0                                            d0
97 #define x0_y0                                             d1
98
99 #define v1                                                q1
100 #define uvrgb1                                            d2
101 #define x1_y1                                             d3
102
103 #define v2                                                q2
104 #define uvrgb2                                            d4
105 #define x2_y2                                             d5
106
// First operand type: widened uvrg in the low half, broadcast x in the high
// half.
107 #define x0_ab                                             q3
108 #define uvrg_xxxx0                                        q3
109 #define uvrg0                                             d6
110 #define xxxx0                                             d7
111
112 #define x1_ab                                             q4
113 #define uvrg_xxxx1                                        q4
114 #define uvrg1                                             d8
115 #define xxxx1                                             d9
116
117 #define x2_ab                                             q5
118 #define uvrg_xxxx2                                        q5
119 #define uvrg2                                             d10
120 #define xxxx2                                             d11
121
// Second operand type: broadcast y in the low half, widened uvrg in the high
// half.
122 #define y0_ab                                             q6
123 #define yyyy_uvrg0                                        q6
124 #define yyyy0                                             d12
125 #define uvrg0b                                            d13
126
127 #define y1_ab                                             q7
128 #define yyyy_uvrg1                                        q7
129 #define yyyy1                                             d14
130 #define uvrg1b                                            d15
131
132 #define y2_ab                                             q8
133 #define yyyy_uvrg2                                        q8
134 #define yyyy2                                             d16
135 #define uvrg2b                                            d17
136
// 16-bit differences d0..d3 feeding the (d0 * d1) - (d2 * d3) products.
137 #define d0_ab                                             q9
138 #define d0_a                                              d18
139 #define d0_b                                              d19
140
141 #define d1_ab                                             q10
142 #define d1_a                                              d20
143 #define d1_b                                              d21
144
145 #define d2_ab                                             q11
146 #define d2_a                                              d22
147 #define d2_b                                              d23
148
149 #define d3_ab                                             q12
150 #define d3_a                                              d24
151 #define d3_b                                              d25
152
// 32-bit gradient products; these reuse q1 and q4.
153 #define ga_uvrg_x                                         q1
154 #define ga_uvrg_y                                         q4
155
156 #define dx                                                x0_x1
157 #define dy                                                y0_y1
158 #define db                                                b0_b1
159
160 #define uvrg_base                                         q11
161
// Sign masks of the uvrg gradients.
162 #define gs_uvrg_x                                         q5
163 #define gs_uvrg_y                                         q6
164
165 #define g_uvrg_x                                          q1
166 #define ga_uv_x                                           d2
167 #define g_uv_x                                            d2
168 #define ga_rg_x                                           d3
169 #define g_rg_x                                            d3
170
171 #define g_uvrg_y                                          q4
172 #define ga_uv_y                                           d8
173 #define g_uv_y                                            d8
174 #define ga_rg_y                                           d9
175 #define g_rg_y                                            d9
176
// 64-bit products of the gradients with the area reciprocal.
177 #define gw_uv_x                                           q1
178 #define gw_rg_x                                           q2
179 #define gw_uv_y                                           q4
180 #define gw_rg_y                                           q3
181
182 #define w_mask                                            q9
183 #define w_mask_l                                          d18
184
185 #define r_shift                                           q10
186
// Multiples 0..3 of the x gradient; uvrg_dx1 is the same register as
// g_uvrg_x.
187 #define uvrg_dx0                                          q0
188 #define uvrg_dx0l                                         d0
189 #define uvrg_dx0h                                         d1
190
191 #define uvrg_dx1                                          q1
192 #define uvrg_dx1l                                         d2
193 #define uvrg_dx1h                                         d3
194
195 #define uvrg_dx2                                          q2
196 #define uvrg_dx2l                                         d4
197 #define uvrg_dx2h                                         d5
198
199 #define uvrg_dx3                                          q3
200 #define uvrg_dx3l                                         d6
201 #define uvrg_dx3h                                         d7
202
203 #define uvrgb_phase                                       q13
204
205 .align 4
206
207 #include "arm_features.h"
208
// function(name) emits the entry label of a routine through the FUNCTION()
// macro, which is not defined in this file (presumably it comes from
// arm_features.h).
209 #define function(name) FUNCTION(name):
210
// Jump table helpers, selected by TEXRELS_FORBIDDEN.
//  - Not defined: JT_OP() passes its arguments through unchanged, JTE()
//    yields the target itself and JT_OP_REL() expands to nothing.
//  - Defined: JT_OP() expands to nothing, JTE() yields the difference
//    (target - start) and JT_OP_REL() loads table entry index_reg of
//    table_label into temp and adds it to pc.
// NOTE(review): "add pc, pc, temp" depends on the value pc reads as in that
// instruction, so the start label given to JTE() presumably has to match it -
// confirm at the use sites.
211 #ifndef TEXRELS_FORBIDDEN
212
213 #define JT_OP_REL(table_label, index_reg, temp)
214 #define JT_OP(x...) x
215 #define JTE(start, target) target
216
217 #else
218
219 #define JT_OP_REL(table_label, index_reg, temp)                                \
220   adr temp, table_label;                                                       \
221   ldr temp, [temp, index_reg, lsl #2];                                         \
222   add pc, pc, temp                                                             \
223
224 #define JT_OP(x...)
225 #define JTE(start, target) (target - start)
226
227 #endif
228
// When __MACH__ is defined, references to these three symbols are rewritten
// to carry a leading underscore.
229 #ifdef __MACH__
230 #define flush_render_block_buffer _flush_render_block_buffer
231 #define setup_sprite_untextured_simple _setup_sprite_untextured_simple
232 #define update_texture_8bpp_cache _update_texture_8bpp_cache
233 #endif
234
// compute_all_gradients
//
// Computes the x and y gradients of the u, v, r, g and b vertex attributes
// over the triangle (v_a, v_b, v_c) and writes them, together with the
// attribute base values, to psx_gpu starting at psx_gpu_uvrg_offset.
//
// Reads:  psx_gpu->triangle_area, the uvrgb phase and winding fields, and
//         16 bytes from each vertex (uvrg bytes at +0..+3, b byte at +4,
//         16-bit x at +8, 16-bit y at +10).
// Saves:  r4-r11 and lr on the stack; returns by popping them into r4-r11
//         and pc.
// Writes: r0-r3, r12, r14 and NEON q0-q13, d30, d31 without restoring them.
//         q4-q7 are only preserved when save_abi_regs()/restore_abi_regs()
//         expand to a push/pop.
235 @ r0: psx_gpu
236 @ r1: v_a
237 @ r2: v_b
238 @ r3: v_c
239
240 function(compute_all_gradients)
241   // First compute the triangle area reciprocal and shift. The division will
242   // happen concurrently with much of the work which follows.
243   @ r12 = psx_gpu->triangle_area
244   ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
245   stmdb sp!, { r4 - r11, lr }
246   save_abi_regs()
247
248   @ load exponent of 62 into upper half of double
249   movw r4, #0
250   clz r14, r12                       @ r14 = shift
251
252   movt r4, #((62 + 1023) << 4)
253   mov r12, r12, lsl r14              @ r12 = triangle_area_normalized
254
255   @ load area normalized into lower half of double
256   mov r5, r12, lsr #10
257   vmov.f64 d30, r5, r4               @ d30 = (1 << 62) + ta_n
258
259   movt r4, #((1022 + 31) << 4)
260   mov r5, r12, lsl #20
261
262   add r4, r4, r12, lsr #11
263   vmov.f64 d31, r5, r4
264
265   vdiv.f64 d30, d30, d31             @ d30 = ((1 << 62) + ta_n) / ta_n
266
267   // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
268   // ( d0       *  d1      ) - ( d2       *  d3      ) =
269   // ( m0                  ) - ( m1                  ) = gradient
270
271   // This is split to do 12 elements at a time over three sets: a, b, and c.
272   // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
273   // two of the slots are unused.
274
275   // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
276   // is g.
277
278   // First type is:  uvrg bxxx xxxx
279   // Second type is: yyyy ybyy uvrg
280   // Since x_a and y_c are the same, the same variable is used for both.
281
282   vld1.u32 { v0 }, [v_a, :128]       @ v0 = { uvrg0, b0, x0, y0 }
283   ldrsh x0, [v_a, #8]                @ load x0
284
285   vld1.u32 { v1 }, [v_b, :128]       @ v1 = { uvrg1, b1, x1, y1}
286   ldrh x1, [v_b, #8]                 @ load x1
287
288   vld1.u32 { v2 }, [v_c, :128]       @ v2 = { uvrg2, b2, x2, y2 }
289   ldrh x2, [v_c, #8]                 @ load x2
290
291   vmovl.u8 uvrg_xxxx0, uvrgb0        @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
292   ldrh y0, [v_a, #10]                @ load y0
293
294   vmovl.u8 uvrg_xxxx1, uvrgb1        @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
295   ldrh y1, [v_b, #10]                @ load y1
296
297   vmovl.u8 uvrg_xxxx2, uvrgb2        @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
298   ldrh y2, [v_c, #10]                @ load y2
299
300   vmov.u8 uvrg0b, uvrg0              @ uvrg0b = { uv0, rg0 }
301   vdup.u16 xxxx0, x0_y0[0]           @ xxxx0 = { xx0, xx0 }
302
303   orr x1_x2, x1, x2, lsl #16         @ x1_x2 = { x1, x2 }
304   pkhbt x0_x1, x0, x1, lsl #16       @ x0_x1 = { x0, x1 }
305
306   vmov.u8 uvrg1b, uvrg1              @ uvrg1b = { uv1, rg1 }
307   vdup.u16 xxxx1, x1_y1[0]           @ xxxx1 = { xx1, xx1 }
308
309   vmov.u8 uvrg2b, uvrg2              @ uvrg2b = { uv2, rg2 }
310   vdup.u16 xxxx2, x2_y2[0]           @ xxxx2 = { xx2, xx2 }
311
312   ldrb b2, [v_c, #4]                 @ load b2
313   orr y0_y1, y0, y1, lsl #16         @ y0_y1 = { y0, y1 }
314
315   ldrb b1, [v_b, #4]                 @ load b1
316   orr y1_y2, y1, y2, lsl #16         @ y1_y2 = { y1, y2 }
317
318   vdup.u16 yyyy0, x0_y0[1]           @ yyyy0 = { yy0, yy0 }
319   vsub.s16 d0_ab, x1_ab, x0_ab
320
  // b0 shares r9 with y2, so it is loaded only after y1_y2 has been packed.
321   ldrb b0, [v_a, #4]                 @ load b0
322   orr b1_b2, b1, b2, lsl #16         @ b1_b2 = { b1, b2 }
323
324   vdup.u16 yyyy1, x1_y1[1]           @ yyyy1 = { yy1, yy1 }
325   vsub.s16 d2_ab, x2_ab, x1_ab
326
327   vdup.u16 yyyy2, x2_y2[1]           @ yyyy2 = { yy2, yy2 }
328   vsub.s16 d1_ab, y2_ab, y1_ab
329
330   orr b0_b1, b0, b1, lsl #16         @ b0_b1 = { b0, b1 }
331   ssub16 dx, x1_x2, x0_x1            @ dx = { x1 - x0, x2 - x1 }
332
333   ssub16 dy, y1_y2, y0_y1            @ dy = { y1 - y0, y2 - y1 }
334   ssub16 db, b1_b2, b0_b1            @ db = { b1 - b0, b2 - b1 }
335
336   vsub.s16 d3_ab, y1_ab, y0_ab
337   smusdx ga_by, dx, db               @ ga_by = ((x1 - x0) * (b2 - b1)) -
338                                      @         ((x2 - x1) * (b1 - b0))
339   vmull.s16 ga_uvrg_x, d0_a, d1_a
340   smusdx ga_bx, db, dy               @ ga_bx = ((b1 - b0) * (y2 - y1)) -
341                                      @         ((b2 - b1) * (y1 - y0))
342   vmlsl.s16 ga_uvrg_x, d2_a, d3_a
343   movs gs_bx, ga_bx, asr #31
344
345   vmull.s16 ga_uvrg_y, d0_b, d1_b
346   rsbmi ga_bx, ga_bx, #0
347
348   @ r12 = psx_gpu->uvrgb_phase
349   ldr r12, [psx_gpu, #psx_gpu_uvrgb_phase_offset]
350
351   vmlsl.s16 ga_uvrg_y, d2_b, d3_b
352   movs gs_by, ga_by, asr #31
353
354   vshr.u64 d0, d30, #22
355   add b_base, r12, b0, lsl #16
356
357   vdup.u32 uvrgb_phase, r12
358
359   rsbmi ga_by, ga_by, #0
360   vclt.s32 gs_uvrg_x, ga_uvrg_x, #0  @ gs_uvrg_x = ga_uvrg_x < 0
361
362   @ r12 = psx_gpu->triangle_winding
363   ldrb r12, [psx_gpu, #psx_gpu_triangle_winding_offset]
364   vclt.s32 gs_uvrg_y, ga_uvrg_y, #0  @ gs_uvrg_y = ga_uvrg_y < 0
365
366   rsb r12, r12, #0                   @ r12 = -(triangle->winding)
367
368   vdup.u32 w_mask, r12               @ w_mask = { -w, -w, -w, -w }
369   sub r14, r14, #(62 - 12)           @ r14 = shift - (62 - FIXED_BITS)
370
371   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
372   vdup.u32 r_shift, r14              @ r_shift = { shift, shift, shift, shift }
373
374   vadd.u32 uvrg_base, uvrgb_phase
375   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
376
377   vmov area_r_s, s0                  @ area_r_s = triangle_reciprocal
378   vabs.s32 ga_uvrg_y, ga_uvrg_y      @ ga_uvrg_y = abs(ga_uvrg_y)
379
380   vmull.u32 gw_rg_x, ga_rg_x, d0[0]
381   vmull.u32 gw_uv_x, ga_uv_x, d0[0]
382   vmull.u32 gw_rg_y, ga_rg_y, d0[0]
383   vmull.u32 gw_uv_y, ga_uv_y, d0[0]
384
385   vshl.u64 gw_rg_x, gw_rg_x, r_shift
386   vshl.u64 gw_uv_x, gw_uv_x, r_shift
387   vshl.u64 gw_rg_y, gw_rg_y, r_shift
388   vshl.u64 gw_uv_y, gw_uv_y, r_shift
389
  // Fold the winding mask into the sign masks, then apply the sign to the
  // narrowed magnitudes with (value ^ mask) - mask.
390   veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
391   vmovn.u64 g_uv_x, gw_uv_x
392
393   veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
394   vmovn.u64 g_rg_x, gw_rg_x
395
396   veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
397   vmovn.u64 g_uv_y, gw_uv_y
398
399   vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
400   vmovn.u64 g_rg_y, gw_rg_y
401
402   veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
403   mov ga_bx, ga_bx, lsl #13
404
405   vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
406   mov ga_by, ga_by, lsl #13
407
408   vdup.u32 x0_y0, x0
409   umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
410
411   vshl.u32 g_uvrg_x, g_uvrg_x, #4
412   vshl.u32 g_uvrg_y, g_uvrg_y, #4
413
  // uvrg_base -= g_uvrg_x * x0 (x0 was broadcast into x0_y0 above).
414   umull gw_by_l, gw_by_h, ga_by, area_r_s
415   vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]
416
  // uvrg_dx0..3 = { 0, 1, 2, 3 } * g_uvrg_x; uvrg_dx1 is the same register as
  // g_uvrg_x.
417   eor gs_bx, gs_bx, r12
418   vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1
419
420   veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0
421   eor gs_by, gs_by, r12
422
423   rsb r11, r14, #0                   @ r11 = negative shift for scalar lsr
424   add store_a, psx_gpu, #psx_gpu_uvrg_offset
425
426   sub r11, r11, #(32 - 13)
427
428   add store_b, store_a, #16
429   mov store_inc, #32
430
431   vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1
432   vst1.u32 { uvrg_base }, [store_a, :128], store_inc
433
434   vst1.u32 { uvrg_dx1 }, [store_b, :128], store_inc
435   mov g_bx, gw_bx_h, lsr r11
436
437   vst1.u32 { g_uvrg_y }, [store_a, :128], store_inc
438   mov g_by, gw_by_h, lsr r11
439
440   vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l },                     \
441    [store_b, :128], store_inc
442   eor g_bx, g_bx, gs_bx
443
444   vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h },                     \
445    [store_b, :128], store_inc
446   sub g_bx, g_bx, gs_bx
447
448   lsl g_bx, g_bx, #4  
449   eor g_by, g_by, gs_by
450
  // b_base -= g_bx * x0
451   mls b_base, g_bx, x0, b_base
452   sub g_by, g_by, gs_by
453
454   lsl g_by, g_by, #4
455   mov g_bx0, #0
456
457   add g_bx2, g_bx, g_bx
458   add g_bx3, g_bx, g_bx2
459
  // Store { 0, g_bx, 2 * g_bx, 3 * g_bx, b_base, g_by } after the uvrg data.
460   stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
461
462   restore_abi_regs()
463   ldmia sp!, { r4 - r11, pc }
464
465
// Register aliases for the setup_spans macros. Many aliases share a register,
// so each one is only valid until that register is reused under another
// name.

// Incoming arguments (same values as the definitions earlier in the file).
466 #define psx_gpu                                  r0
467 #define v_a                                      r1
468 #define v_b                                      r2
469 #define v_c                                      r3
470
471 #define temp                                     r14
472
// Vertex coordinates. y_a, y_b and y_c take over the registers of the vertex
// pointers v_a, v_b and v_c.
473 #define x_a                                      r4
474 #define x_b                                      r5
475 #define x_c                                      r6
476 #define y_a                                      r1
477 #define y_b                                      r2
478 #define y_c                                      r3
479
480 #define height_minor_a                           r7
481 #define height_minor_b                           r8
482 #define height_major                             r9
483 #define height                                   r9
484
485 #define reciprocal_table_ptr                     r10
486
// Scalar state of the alternate edge; edge_shift_alt shares r10 with
// reciprocal_table_ptr.
487 #define edge_alt_low                             r4
488 #define edge_alt_high                            r5
489 #define edge_dx_dy_alt                           r6
490 #define edge_shift_alt                           r10
491
492 #define edge_dx_dy_alt_low                       r4
493 #define edge_dx_dy_alt_high                      r5
494
// Addresses of span output areas inside psx_gpu; these reuse r4-r6.
495 #define span_edge_data                           r4
496 #define span_uvrg_offset                         r5
497 #define span_b_offset                            r6
498
// clip shares r14 with temp.
499 #define clip                                     r14
500
501 #define b                                        r11
502 #define b_dy                                     r12
503
504
// NEON aliases. A q alias and its _low/_high d aliases name the same storage.
505 #define alternate_x                              q0
506 #define alternate_dx_dy                          q1
507 #define alternate_x_32                           q2
508
509 #define alternate_x_low                          d0
510 #define alternate_x_high                         d1
511 #define alternate_dx_dy_low                      d2
512 #define alternate_dx_dy_high                     d3
513 #define alternate_x_32_low                       d4
514 #define alternate_x_32_high                      d5
515
516 #define left_x                                   q3
517 #define right_x                                  q4
518 #define left_dx_dy                               q5
519 #define right_dx_dy                              q6
520 #define left_edge                                q7
521 #define right_edge                               q8
522
523 #define left_x_low                               d6
524 #define left_x_high                              d7
525 #define right_x_low                              d8
526 #define right_x_high                             d9
527 #define left_dx_dy_low                           d10
528 #define left_dx_dy_high                          d11
529 #define right_dx_dy_low                          d12
530 #define right_dx_dy_high                         d13
531 #define left_edge_low                            d14
532 #define left_edge_high                           d15
533 #define right_edge_low                           d16
534 #define right_edge_high                          d17
535
536 #define y_mid_point                              d18
537 #define c_0x0004                                 d19
538
// c_0x0001 (q13) overlaps c_0xFFFE (d26) and c_0x0007 (d27).
539 #define left_right_x_16                          q11
540 #define span_shifts_y                            q12
541 #define c_0x0001                                 q13
542
543 #define span_shifts                              d24
544 #define y_x4                                     d25
545 #define c_0xFFFE                                 d26
546 #define c_0x0007                                 d27
547
548 #define left_right_x_16_low                      d22
549 #define left_right_x_16_high                     d23
550
// Interpolant values and their per-line steps.
551 #define uvrg                                     q14
552 #define uvrg_dy                                  q15
553
554 #define alternate_x_16                           d4
555
// v_clip shares q3 with left_x (and d6/d7 with height_reciprocals/heights).
556 #define v_clip                                   q3
557 #define v_clip_low                               d6
558
559 #define right_x_32                               q10
560 #define left_x_32                                q11
561 #define alternate_select                         d24
562
563 #define right_x_32_low                           d20
564 #define right_x_32_high                          d21
565 #define left_x_32_low                            d22
566 #define left_x_32_high                           d23
567
// Working registers of the compute_edge_delta_* macros.
568 #define edges_xy                                 q0
569 #define edges_dx_dy                              d2
570 #define edge_shifts                              d3
571 #define edge_shifts_64                           q2
572
573 #define edges_xy_left                            d0
574 #define edges_xy_right                           d1
575
576 #define height_reciprocals                       d6
577 #define heights                                  d7
578
579 #define widths                                   d8
580 #define c_0x01                                   d9
581 #define x_starts                                 d10
582 #define x_ends                                   d11
583
584 #define heights_b                                d12
585 #define edges_dx_dy_64                           q10
586
587 #define edges_dx_dy_64_left                      d20
588 #define edges_dx_dy_64_right                     d21
589
590
// setup_spans_prologue()
// Common entry sequence: pushes r4-r11 and lr, runs save_abi_regs(), then
// loads the signed 16-bit x (offset 8) and y (offset 10) of the three
// vertices. The y loads overwrite the vertex pointers (y_a/y_b/y_c share
// r1-r3 with v_a/v_b/v_c), so every x is loaded before any y.
// Also loads uvrg and uvrg_dy from psx_gpu, loads reciprocal_table_ptr and
// sets every 32-bit lane of c_0x01 to 1. Uses temp as scratch.
// Do not put comments inside the body: the lines are joined by the
// preprocessor.
591 #define setup_spans_prologue()                                                 \
592   stmdb sp!, { r4 - r11, lr };                                                 \
593   save_abi_regs();                                                             \
594                                                                                \
595   ldrsh x_a, [v_a, #8];                                                        \
596   ldrsh x_b, [v_b, #8];                                                        \
597   ldrsh x_c, [v_c, #8];                                                        \
598   ldrsh y_a, [v_a, #10];                                                       \
599   ldrsh y_b, [v_b, #10];                                                       \
600   ldrsh y_c, [v_c, #10];                                                       \
601                                                                                \
602   add temp, psx_gpu, #psx_gpu_uvrg_offset;                                     \
603   vld1.32 { uvrg }, [temp];                                                    \
604   add temp, psx_gpu, #psx_gpu_uvrg_dy_offset;                                  \
605   vld1.32 { uvrg_dy }, [temp];                                                 \
606   ldr reciprocal_table_ptr, [psx_gpu, #psx_gpu_reciprocal_table_ptr_offset];   \
607                                                                                \
608   vmov.u32 c_0x01, #0x01                                                       \
609
// setup_spans_load_b()
// Loads b and b_dy (r11, r12) from their psx_gpu fields.
610 #define setup_spans_load_b()                                                   \
611   ldr b, [psx_gpu, #psx_gpu_b_offset];                                         \
612   ldr b_dy, [psx_gpu, #psx_gpu_b_dy_offset]                                    \
613
// setup_spans_prologue_b()
// Second setup stage. Points span_uvrg_offset, span_edge_data and
// span_b_offset at their areas inside psx_gpu; these aliases reuse r4-r6, so
// x_a, x_b and x_c are no longer available afterwards.
// Broadcasts viewport_start_x into left_edge and viewport_end_x + 1 into
// right_edge (16-bit lanes), and sets the 16-bit constants c_0x0004,
// c_0x0007 and c_0xFFFE.
// c_0x0001 (q13) is only valid up to the vadd: c_0x0007 (d27) and c_0xFFFE
// (d26) are written into the same register afterwards.
// Uses temp as scratch.
614 #define setup_spans_prologue_b()                                               \
615   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
616   add temp, psx_gpu, #psx_gpu_viewport_start_x_offset;                         \
617                                                                                \
618   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
619   vmov.u16 c_0x0004, #0x0004;                                                  \
620                                                                                \
621   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
622   vmov.u16 c_0x0001, #0x0001;                                                  \
623                                                                                \
624   vld1.u16 { left_edge_low[], left_edge_high[] }, [temp];                      \
625   add temp, psx_gpu, #psx_gpu_viewport_end_x_offset;                           \
626                                                                                \
627   vld1.u16 { right_edge_low[], right_edge_high[] }, [temp];                    \
628   vadd.u16 right_edge, right_edge, c_0x0001;                                   \
629                                                                                \
630   vmov.u16 c_0x0007, #0x0007;                                                  \
631   vmvn.u16 c_0xFFFE, #0x0001                                                   \
632
633
// compute_edge_delta_x2()
// Two-edge setup sharing a single height. Expects height, x_starts, x_ends,
// c_0x01 and reciprocal_table_ptr to be set by the caller.
//   temp               = reciprocal_table_ptr[height]
//   heights            = { height, height }
//   widths             = x_ends - x_starts
//   height_reciprocals = table entry >> 10
//   edge_shifts        = table entry with bits 5-7 of each 16-bit lane cleared
//   heights_b          = (heights - 1) + x_starts * heights
//   edges_dx_dy        = widths * height_reciprocals          (32-bit)
//   edges_xy           = heights_b * height_reciprocals       (64-bit)
// NOTE(review): the table entries appear to pack a reciprocal in bits 10 and
// up and a shift amount in the low bits - confirm against the table builder.
634 #define compute_edge_delta_x2()                                                \
635   ldr temp, [reciprocal_table_ptr, height, lsl #2];                            \
636                                                                                \
637   vdup.u32 heights, height;                                                    \
638   vsub.u32 widths, x_ends, x_starts;                                           \
639                                                                                \
640   vdup.u32 edge_shifts, temp;                                                  \
641   vsub.u32 heights_b, heights, c_0x01;                                         \
642   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
643                                                                                \
644   vmla.s32 heights_b, x_starts, heights;                                       \
645   vbic.u16 edge_shifts, #0xE0;                                                 \
646   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
647   vmull.s32 edges_xy, heights_b, height_reciprocals                            \
648
// Scalar scratch aliases for compute_edge_delta_x3. width_alt shares r6 with
// x_c and edge_dx_dy_alt; height_reciprocal_alt and height_b_alt share r11
// and r12 with b and b_dy.
649 #define width_alt                 r6
650 #define height_reciprocal_alt     r11
651 #define height_b_alt              r12
652
// compute_edge_delta_x3(start_c, height_a, height_b)
// Three-edge setup. The two NEON lanes handle the edges with heights height_a
// and height_b in the same way as compute_edge_delta_x2; interleaved scalar
// code handles a third (alternate) edge that starts at start_c, ends at x_c
// and spans height_minor_b lines:
//   edge_shift_alt        = reciprocal_table_ptr[height_minor_b]
//   width_alt             = x_c - start_c
//   height_reciprocal_alt = edge_shift_alt >> 10
//   height_b_alt          = (height_minor_b - 1) + height_minor_b * start_c
//   edge_shift_alt       &= 0x1F
//   edge_dx_dy_alt        = width_alt * height_reciprocal_alt
//   edge_alt_high:low     = height_b_alt * height_reciprocal_alt (signed 64-bit)
// Register reuse to be aware of: the load into edge_shift_alt overwrites
// reciprocal_table_ptr (both r10), width_alt overwrites x_c, and the final
// smull overwrites r4/r5 (x_a/x_b). start_c must therefore not be r6.
653 #define compute_edge_delta_x3(start_c, height_a, height_b)                     \
654   vmov heights, height_a, height_b;                                            \
655   ldr temp, [reciprocal_table_ptr, height_a, lsl #2];                          \
656   vmov.u32 edge_shifts[0], temp;                                               \
657   ldr temp, [reciprocal_table_ptr, height_b, lsl #2];                          \
658   vmov.u32 edge_shifts[1], temp;                                               \
659   ldr edge_shift_alt, [reciprocal_table_ptr, height_minor_b, lsl #2];          \
660                                                                                \
661   vsub.u32 widths, x_ends, x_starts;                                           \
662   sub width_alt, x_c, start_c;                                                 \
663                                                                                \
664   vsub.u32 heights_b, heights, c_0x01;                                         \
665   sub height_b_alt, height_minor_b, #1;                                        \
666                                                                                \
667   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
668   lsr height_reciprocal_alt, edge_shift_alt, #10;                              \
669                                                                                \
670   vmla.s32 heights_b, x_starts, heights;                                       \
671   mla height_b_alt, height_minor_b, start_c, height_b_alt;                     \
672                                                                                \
673   vbic.u16 edge_shifts, #0xE0;                                                 \
674   and edge_shift_alt, edge_shift_alt, #0x1F;                                   \
675                                                                                \
676   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
677   mul edge_dx_dy_alt, width_alt, height_reciprocal_alt;                        \
678                                                                                \
679   vmull.s32 edges_xy, heights_b, height_reciprocals;                           \
680   smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt       \
681
682
// setup_spans_adjust_y_up(): y_x4 -= c_0x0004, computed on 32-bit lanes.
683 #define setup_spans_adjust_y_up()                                              \
684   vsub.u32 y_x4, y_x4, c_0x0004                                                \
685
// setup_spans_adjust_y_down(): y_x4 += c_0x0004, computed on 32-bit lanes.
686 #define setup_spans_adjust_y_down()                                            \
687   vadd.u32 y_x4, y_x4, c_0x0004                                                \
688
// setup_spans_adjust_interpolants_up(): steps the interpolants back by one
// line: uvrg -= uvrg_dy, b -= b_dy.
689 #define setup_spans_adjust_interpolants_up()                                   \
690   vsub.u32 uvrg, uvrg, uvrg_dy;                                                \
691   sub b, b, b_dy                                                               \
692
// setup_spans_adjust_interpolants_down(): steps the interpolants forward by
// one line: uvrg += uvrg_dy, b += b_dy.
693 #define setup_spans_adjust_interpolants_down()                                 \
694   vadd.u32 uvrg, uvrg, uvrg_dy;                                                \
695   add b, b, b_dy                                                               \
696
697
/*
 * Vertical clipping helpers. The callers in this file compute clip as the
 * distance between the starting row and the viewport bound, and invoke
 * setup_spans_clip only when that distance is positive.
 *
 *   setup_spans_clip_interpolants_increment
 *       b += b_dy * clip (mla), uvrg += uvrg_dy * v_clip (vmla)
 *   setup_spans_clip_interpolants_decrement
 *       b -= b_dy * clip (mls), uvrg -= uvrg_dy * v_clip (vmls)
 *   setup_spans_clip_alternate_yes
 *       64-bit edge_alt_high:edge_alt_low += edge_dx_dy_alt * clip (smlal)
 *   setup_spans_clip_alternate_no
 *       expands to nothing
 *   setup_spans_clip(direction, alternate_active)
 *       broadcasts clip into every 32-bit lane of v_clip, expands the two
 *       helpers chosen by token pasting, then widens and accumulates
 *       edges_dx_dy * v_clip_low into the 64-bit lanes of edges_xy.
 *       direction must be increment or decrement, alternate_active must be
 *       yes or no.
 */
698 #define setup_spans_clip_interpolants_increment()                              \
699   mla b, b_dy, clip, b;                                                        \
700   vmla.s32 uvrg, uvrg_dy, v_clip                                               \
701
702 #define setup_spans_clip_interpolants_decrement()                              \
703   mls b, b_dy, clip, b;                                                        \
704   vmls.s32 uvrg, uvrg_dy, v_clip                                               \
705
706 #define setup_spans_clip_alternate_yes()                                       \
707   smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip                      \
708
709 #define setup_spans_clip_alternate_no()                                        \
710
711 #define setup_spans_clip(direction, alternate_active)                          \
712   vdup.u32 v_clip, clip;                                                       \
713   setup_spans_clip_alternate_##alternate_active();                             \
714   setup_spans_clip_interpolants_##direction();                                 \
715   vmlal.s32 edges_xy, edges_dx_dy, v_clip_low                                  \
716
717
/*
 * setup_spans_adjust_edges_alternate_no(left_index, right_index)
 *
 * Turns the two packed edges (edges_xy, edges_dx_dy, edge_shifts) into the
 * left / right working registers used by the span loops.
 *
 *   1. edge_shifts and edges_dx_dy are sign-extended to 64-bit lanes.
 *   2. edges_xy and the widened slopes are shifted by the per-lane shift
 *      amounts (vshl.s64 with a register shift count).
 *   3. left_index / right_index are pasted onto edges_xy_ and
 *      edges_dx_dy_64_ to choose which lane feeds the left edge and which
 *      feeds the right edge.
 *   4. The slope is copied into both the _low and _high half, the _high x
 *      half is set to _low x plus one slope step, and the slope register is
 *      then doubled.
 *
 * NOTE(review): the register aliases are defined outside this view. Assuming
 * the _low / _high names are the two d halves of the same-named q register,
 * step 4 leaves each x register holding rows n and n+1, and one vadd.u64 of
 * the doubled slope advances both halves by two rows. Confirm against the
 * alias definitions.
 */
718 #define setup_spans_adjust_edges_alternate_no(left_index, right_index)         \
719   vmovl.s32 edge_shifts_64, edge_shifts;                                       \
720   vmovl.s32 edges_dx_dy_64, edges_dx_dy;                                       \
721                                                                                \
722   vshl.s64 edges_xy, edges_xy, edge_shifts_64;                                 \
723   vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64;                     \
724                                                                                \
725   vmov left_x_low, edges_xy_##left_index;                                      \
726   vmov right_x_low, edges_xy_##right_index;                                    \
727                                                                                \
728   vmov left_dx_dy_low, edges_dx_dy_64_##left_index;                            \
729   vmov left_dx_dy_high, edges_dx_dy_64_##left_index;                           \
730   vmov right_dx_dy_low, edges_dx_dy_64_##right_index;                          \
731   vmov right_dx_dy_high, edges_dx_dy_64_##right_index;                         \
732                                                                                \
733   vadd.u64 left_x_high, left_x_low, left_dx_dy_low;                            \
734   vadd.u64 right_x_high, right_x_low, right_dx_dy_low;                         \
735                                                                                \
736   vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy;                                 \
737   vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy                               \
738
739
739
/*
 * setup_spans_adjust_edges_alternate_yes(left_index, right_index)
 *
 * Does everything setup_spans_adjust_edges_alternate_no does, then prepares
 * the third (alternate) edge, whose state lives in core registers:
 *
 *   - y_mid_point gets y_b broadcast to every 16-bit lane.
 *   - temp = 32 - edge_shift_alt, used to carry bits between the two words
 *     of a 64-bit shift.
 *   - edge_alt_high:edge_alt_low is shifted left by edge_shift_alt as one
 *     64-bit value and moved into alternate_x_low.
 *   - edge_dx_dy_alt is widened to 64 bits with the same left shift: the
 *     high word is the arithmetic right shift by temp, the low word is the
 *     left shift by edge_shift_alt. The pair goes into both halves of
 *     alternate_dx_dy.
 *   - alternate_x_high = alternate_x_low + one step, and alternate_dx_dy is
 *     doubled, mirroring what is done for the left and right edges.
 *
 * temp is clobbered.
 */
740 #define setup_spans_adjust_edges_alternate_yes(left_index, right_index)        \
741   setup_spans_adjust_edges_alternate_no(left_index, right_index);              \
742                                                                                \
743   vdup.u16 y_mid_point, y_b;                                                   \
744   rsb temp, edge_shift_alt, #32;                                               \
745                                                                                \
746   lsl edge_alt_high, edge_alt_high, edge_shift_alt;                            \
747   orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp;                    \
748   lsl edge_alt_low, edge_alt_low, edge_shift_alt;                              \
749   vmov alternate_x_low, edge_alt_low, edge_alt_high;                           \
750                                                                                \
751   asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp;                               \
752   lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt;                      \
753   vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high;           \
754   vmov alternate_dx_dy_high, alternate_dx_dy_low;                              \
755                                                                                \
756   vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low;             \
757   vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy                   \
758
759
759
/*
 * Alternate-edge selection helpers.
 *
 *   setup_spans_y_select_up / _down
 *       Build a per-lane 16-bit mask in alternate_select: lanes where y_x4 is
 *       less than (up) or greater than (down) y_mid_point, compared as
 *       signed values, become all ones.
 *   setup_spans_alternate_select_left / _right
 *       Where alternate_select is set, overwrite the corresponding bits of
 *       left_right_x_16_low (left) or left_right_x_16_high (right) with
 *       alternate_x_16 (vbit).
 */
760 #define setup_spans_y_select_up()                                              \
761   vclt.s16 alternate_select, y_x4, y_mid_point                                 \
762
763 #define setup_spans_y_select_down()                                            \
764   vcgt.s16 alternate_select, y_x4, y_mid_point                                 \
765
766
767 #define setup_spans_alternate_select_left()                                    \
768   vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select               \
769
770 #define setup_spans_alternate_select_right()                                   \
771   vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select              \
772
773
/*
 * setup_spans_set_x4_alternate_yes(alternate, direction)
 *
 * One iteration of the span loop for triangles that use the alternate edge.
 * alternate must be left or right, direction must be up or down.
 *
 * Edge positions:
 *   - The upper 32 bits of each 64-bit position in alternate_x, left_x and
 *     right_x are narrowed into the _32_low halves, the positions are
 *     advanced by their slope registers, and the narrowing is repeated into
 *     the _32_high halves, followed by a second advance.
 *   - The 32-bit values are narrowed again to 16 bits. Left values go to
 *     left_right_x_16_low and right values to left_right_x_16_high.
 *   - setup_spans_y_select_<direction> builds the mask and
 *     setup_spans_alternate_select_<alternate> substitutes the alternate
 *     edge into the chosen side.
 *   - The whole left_right_x_16 register is clamped from below with
 *     left_edge (vmax.s16) and from above with right_edge (vmin.s16).
 *   - high = high - low + 7. span_shifts = 0xFFFE shifted left by the low
 *     three bits of that value, and high is then divided by 8 (vshr #3).
 *
 * Interpolants:
 *   - Four times in a row, uvrg is stored through span_uvrg_offset (16-byte
 *     aligned, post-incremented), b is stored through span_b_offset
 *     (post-incremented by 4), and both are stepped by
 *     setup_spans_adjust_interpolants_<direction>. These stores are
 *     interleaved with the NEON edge arithmetic above.
 *
 * Output:
 *   - vst4.u16 writes left_right_x_16 and span_shifts_y interleaved through
 *     span_edge_data with writeback, then y_x4 is stepped by
 *     setup_spans_adjust_y_<direction>.
 *
 * NOTE(review): the register aliases are defined outside this view. This
 * presumably yields four 8-byte records matching the edge_data_*_offset
 * constants at the top of the file (left_x, num_blocks, right_mask, y).
 * Confirm against the alias definitions.
 */
774 #define setup_spans_set_x4_alternate_yes(alternate, direction)                 \
775   vshrn.s64 alternate_x_32_low, alternate_x, #32;                              \
776   vshrn.s64 left_x_32_low, left_x, #32;                                        \
777   vshrn.s64 right_x_32_low, right_x, #32;                                      \
778                                                                                \
779   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
780   vadd.u64 left_x, left_x, left_dx_dy;                                         \
781   vadd.u64 right_x, right_x, right_dx_dy;                                      \
782                                                                                \
783   vshrn.s64 alternate_x_32_high, alternate_x, #32;                             \
784   vshrn.s64 left_x_32_high, left_x, #32;                                       \
785   vshrn.s64 right_x_32_high, right_x, #32;                                     \
786                                                                                \
787   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
788   vadd.u64 left_x, left_x, left_dx_dy;                                         \
789   vadd.u64 right_x, right_x, right_dx_dy;                                      \
790                                                                                \
791   vmovn.u32 alternate_x_16, alternate_x_32;                                    \
792   setup_spans_y_select_##direction();                                          \
793   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
794                                                                                \
795   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
796   setup_spans_alternate_select_##alternate();                                  \
797                                                                                \
798   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
799   str b, [span_b_offset], #4;                                                  \
800   setup_spans_adjust_interpolants_##direction();                               \
801                                                                                \
802   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
803                                                                                \
804   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
805   str b, [span_b_offset], #4;                                                  \
806   setup_spans_adjust_interpolants_##direction();                               \
807                                                                                \
808   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
809                                                                                \
810   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
811   str b, [span_b_offset], #4;                                                  \
812   setup_spans_adjust_interpolants_##direction();                               \
813                                                                                \
814   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
815   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
816   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
817                                                                                \
818   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
819   str b, [span_b_offset], #4;                                                  \
820   setup_spans_adjust_interpolants_##direction();                               \
821                                                                                \
822   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
823   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
824                                                                                \
825   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
826                                                                                \
827   setup_spans_adjust_y_##direction()                                           \
828
829
829
/*
 * setup_spans_set_x4_alternate_no(alternate, direction)
 *
 * One iteration of the span loop when no alternate edge is in play. The
 * alternate argument is accepted so that both variants can be invoked the
 * same way, but it is not used in the expansion. direction must be up or
 * down.
 *
 *   - The upper 32 bits of the 64-bit left_x / right_x positions are narrowed
 *     into the _32_low halves, the positions are advanced by their slope
 *     registers, and the narrowing plus advance is repeated for the _32_high
 *     halves.
 *   - The results are narrowed to 16 bits into left_right_x_16_low (left)
 *     and left_right_x_16_high (right), then clamped with left_edge
 *     (vmax.s16) and right_edge (vmin.s16).
 *   - high = high - low + 7. span_shifts = 0xFFFE shifted left by the low
 *     three bits of that value, and high is then divided by 8 (vshr #3).
 *   - Interleaved with the above, uvrg and b are stored through
 *     span_uvrg_offset / span_b_offset four times, each store followed by
 *     setup_spans_adjust_interpolants_<direction>.
 *   - vst4.u16 writes left_right_x_16 and span_shifts_y interleaved through
 *     span_edge_data with writeback, then y_x4 is stepped by
 *     setup_spans_adjust_y_<direction>.
 */
830 #define setup_spans_set_x4_alternate_no(alternate, direction)                  \
831   vshrn.s64 left_x_32_low, left_x, #32;                                        \
832   vshrn.s64 right_x_32_low, right_x, #32;                                      \
833                                                                                \
834   vadd.u64 left_x, left_x, left_dx_dy;                                         \
835   vadd.u64 right_x, right_x, right_dx_dy;                                      \
836                                                                                \
837   vshrn.s64 left_x_32_high, left_x, #32;                                       \
838   vshrn.s64 right_x_32_high, right_x, #32;                                     \
839                                                                                \
840   vadd.u64 left_x, left_x, left_dx_dy;                                         \
841   vadd.u64 right_x, right_x, right_dx_dy;                                      \
842                                                                                \
843   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
844   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
845                                                                                \
846   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
847   str b, [span_b_offset], #4;                                                  \
848   setup_spans_adjust_interpolants_##direction();                               \
849                                                                                \
850   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
851                                                                                \
852   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
853   str b, [span_b_offset], #4;                                                  \
854   setup_spans_adjust_interpolants_##direction();                               \
855                                                                                \
856   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
857                                                                                \
858   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
859   str b, [span_b_offset], #4;                                                  \
860   setup_spans_adjust_interpolants_##direction();                               \
861                                                                                \
862   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
863   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
864   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
865                                                                                \
866   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
867   str b, [span_b_offset], #4;                                                  \
868   setup_spans_adjust_interpolants_##direction();                               \
869                                                                                \
870   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
871   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
872                                                                                \
873   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
874                                                                                \
875   setup_spans_adjust_y_##direction()                                           \
876
877
877
/*
 * Scratch pair for the 64-bit product computed below (r11 = low word,
 * r12 = high word).
 *
 *   setup_spans_alternate_adjust_yes
 *       edge_alt_high:edge_alt_low -= edge_dx_dy_alt * height_minor_a,
 *       computed as a signed 32x32 to 64 multiply followed by a 64-bit
 *       subtract with borrow. Clobbers r11, r12 and the flags.
 *   setup_spans_alternate_adjust_no
 *       expands to nothing.
 */
878 #define edge_adjust_low           r11
879 #define edge_adjust_high          r12
880
881 #define setup_spans_alternate_adjust_yes()                                     \
882   smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a;     \
883   subs edge_alt_low, edge_alt_low, edge_adjust_low;                            \
884   sbc edge_alt_high, edge_alt_high, edge_adjust_high                           \
885
886 #define setup_spans_alternate_adjust_no()                                      \
887
888
/*
 * setup_spans_down(left_index, right_index, alternate, alternate_active)
 *
 * Generates spans walking downwards (increasing y) from y_a.
 *
 *   left_index, right_index  forwarded to setup_spans_adjust_edges_alternate_*
 *   alternate                left, right or none, forwarded to the set_x4 macro
 *   alternate_active         yes or no, selects the alternate-edge variants
 *
 * Steps:
 *   1. Optional alternate edge adjustment, then setup_spans_load_b()
 *      (defined outside this view).
 *   2. Bottom clip: if y_c exceeds viewport_end_y, height is reduced by the
 *      excess and then incremented by one. y_c is overwritten with the
 *      difference.
 *   3. Top clip: clip = viewport_start_y - y_a. If positive, height is
 *      reduced and y_a advanced by clip, and setup_spans_clip(increment)
 *      moves the edges and interpolants forward by clip rows.
 *   4. If height is not positive, nothing is emitted (branch to 1).
 *   5. y_x4 is filled with the four 16-bit values y_a, y_a + 1, y_a + 2 and
 *      y_a + 3, built as two packed 32-bit words. y_a is clobbered.
 *   6. Edges are prepared, setup_spans_prologue_b() runs, height is stored
 *      to psx_gpu num_spans, and the set_x4 macro runs once per four rows
 *      until height is exhausted.
 *
 * Both viewport bounds are loaded sign-extended (ldrsh).
 * Local labels 0, 1 and 2 are used, so the macro can be expanded only where
 * those numeric labels do not conflict with nearby backward or forward
 * references.
 */
889 #define setup_spans_down(left_index, right_index, alternate, alternate_active) \
890   setup_spans_alternate_adjust_##alternate_active();                           \
891   setup_spans_load_b();                                                        \
892                                                                                \
893   ldrsh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset];                       \
894   subs y_c, y_c, temp;                                                         \
895   subgt height, height, y_c;                                                   \
896   addgt height, height, #1;                                                    \
897                                                                                \
898   ldrsh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset];                     \
899   subs clip, temp, y_a;                                                        \
900   ble 0f;                                                                      \
901                                                                                \
902   sub height, height, clip;                                                    \
903   add y_a, y_a, clip;                                                          \
904   setup_spans_clip(increment, alternate_active);                               \
905                                                                                \
906  0:                                                                            \
907   cmp height, #0;                                                              \
908   ble 1f;                                                                      \
909                                                                                \
910   orr temp, y_a, y_a, lsl #16;                                                 \
911   add temp, temp, #(1 << 16);                                                  \
912   add y_a, temp, #2;                                                           \
913   add y_a, y_a, #(2 << 16);                                                    \
914   vmov y_x4, temp, y_a;                                                        \
915                                                                                \
916   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
917    right_index);                                                               \
918   setup_spans_prologue_b();                                                    \
919                                                                                \
920   strh height, [psx_gpu, #psx_gpu_num_spans_offset];                           \
921                                                                                \
922  2:                                                                            \
923   setup_spans_set_x4_alternate_##alternate_active(alternate, down);            \
924   subs height, height, #4;                                                     \
925   bhi 2b;                                                                      \
926                                                                                \
927  1:                                                                            \
928
929
/*
 * Small helpers used by setup_spans_up.
 *
 *   setup_spans_alternate_pre_increment_yes
 *       edge_alt_high:edge_alt_low += edge_dx_dy_alt, with the 32-bit slope
 *       sign-extended to 64 bits (the high word adds slope asr 31 plus
 *       carry). Clobbers the flags.
 *   setup_spans_alternate_pre_increment_no
 *       expands to nothing.
 *   setup_spans_up_decrement_yes
 *       height -= 1, executed only when the flags satisfy LE. The flags come
 *       from whatever instruction precedes the expansion.
 *   setup_spans_up_decrement_no
 *       expands to nothing.
 */
930 #define setup_spans_alternate_pre_increment_yes()                              \
931   adds edge_alt_low, edge_alt_low, edge_dx_dy_alt;                             \
932   adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31                    \
933
934 #define setup_spans_alternate_pre_increment_no()                               \
935
936
937 #define setup_spans_up_decrement_yes()                                         \
938   suble height, height, #1                                                     \
939
940 #define setup_spans_up_decrement_no()                                          \
941
942
/*
 * setup_spans_up(left_index, right_index, alternate, alternate_active)
 *
 * Generates spans walking upwards (decreasing y), starting one row above y_a.
 *
 *   left_index, right_index  forwarded to setup_spans_adjust_edges_alternate_*
 *   alternate                left, right or none, forwarded to the set_x4 macro
 *   alternate_active         yes or no, selects the alternate-edge variants
 *
 * Steps:
 *   1. Optional alternate edge adjustment, setup_spans_load_b() (defined
 *      outside this view), then y_a -= 1.
 *   2. Top clip: temp = viewport_start_y - y_c. If positive, height is
 *      reduced by it. Otherwise, when alternate_active is yes, height is
 *      decremented by one (the suble consumes the flags of the subs).
 *   3. Bottom clip: clip = y_a - viewport_end_y. If positive, height and y_a
 *      are reduced by clip and setup_spans_clip(decrement) moves the edges
 *      and interpolants by clip rows.
 *   4. If height is not positive, nothing is emitted (branch to 1).
 *   5. y_x4 is filled with the four 16-bit values y_a, y_a - 1, y_a - 2 and
 *      y_a - 3, built as two packed 32-bit words. y_a is clobbered.
 *   6. edges_xy is advanced by one slope step (vaddw), the alternate edge is
 *      pre-incremented when active, the edges are prepared, the interpolants
 *      are stepped up once, setup_spans_prologue_b() runs, height is stored
 *      to psx_gpu num_spans, and the set_x4 macro runs once per four rows.
 *
 * The viewport bounds are loaded sign-extended (ldrsh), the same way
 * setup_spans_down loads them, because they take part in signed comparisons
 * against vertex coordinates. For non-negative bounds this is identical to a
 * zero-extending load.
 *
 * Local labels 0, 1 and 2 are used by the expansion.
 */
943 #define setup_spans_up(left_index, right_index, alternate, alternate_active)   \
944   setup_spans_alternate_adjust_##alternate_active();                           \
945   setup_spans_load_b();                                                        \
946   sub y_a, y_a, #1;                                                            \
947                                                                                \
948   ldrsh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset];                     \
949   subs temp, temp, y_c;                                                        \
950   subgt height, height, temp;                                                  \
951   setup_spans_up_decrement_##alternate_active();                               \
952                                                                                \
953   ldrsh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset];                       \
954   subs clip, y_a, temp;                                                        \
955   ble 0f;                                                                      \
956                                                                                \
957   sub height, height, clip;                                                    \
958   sub y_a, y_a, clip;                                                          \
959   setup_spans_clip(decrement, alternate_active);                               \
960                                                                                \
961  0:                                                                            \
962   cmp height, #0;                                                              \
963   ble 1f;                                                                      \
964                                                                                \
965   orr temp, y_a, y_a, lsl #16;                                                 \
966   sub temp, temp, #(1 << 16);                                                  \
967   sub y_a, temp, #2;                                                           \
968   sub y_a, y_a, #(2 << 16);                                                    \
969   vmov y_x4, temp, y_a;                                                        \
970                                                                                \
971   vaddw.s32 edges_xy, edges_xy, edges_dx_dy;                                   \
972                                                                                \
973   setup_spans_alternate_pre_increment_##alternate_active();                    \
974   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
975    right_index);                                                               \
976   setup_spans_adjust_interpolants_up();                                        \
977   setup_spans_prologue_b();                                                    \
978                                                                                \
979   strh height, [psx_gpu, #psx_gpu_num_spans_offset];                           \
980                                                                                \
981  2:                                                                            \
982   setup_spans_set_x4_alternate_##alternate_active(alternate, up);              \
983   subs height, height, #4;                                                     \
984   bhi 2b;                                                                      \
985                                                                                \
986  1:                                                                            \
987
988
/*
 * setup_spans_epilogue()
 *
 * Common function exit: expands restore_abi_regs() (which is empty or a
 * vpop of q4-q7 depending on the conditional near the top of the file), then
 * pops r4-r11 and loads pc from the stack, which returns to the caller.
 * NOTE(review): this presumably mirrors a push of r4-r11 and lr done by
 * setup_spans_prologue, which is defined outside this view.
 */
989 #define setup_spans_epilogue()                                                 \
990   restore_abi_regs();                                                          \
991   ldmia sp!, { r4 - r11, pc }                                                  \
992
993
/*
 * setup_spans_up_up(minor, major)
 *
 * Complete body for a triangle whose spans are all generated by a single
 * upward pass with the alternate edge active.
 *
 *   height_minor_a = y_a - y_b
 *   height_minor_b = y_b - y_c
 *   height         = y_a - y_c
 *   x_starts       = x_a broadcast to both lanes
 *   x_ends         = (x_c, x_b)
 *
 * compute_edge_delta_x3 (defined outside this view) is then expanded, and
 * setup_spans_up is invoked with major as the left index and minor as both
 * the right index and the side that receives the alternate edge.
 *
 * The two entry points below instantiate it with the minor edge on the left
 * and on the right respectively.
 */
994 #define setup_spans_up_up(minor, major)                                        \
995   setup_spans_prologue();                                                      \
996   sub height_minor_a, y_a, y_b;                                                \
997   sub height_minor_b, y_b, y_c;                                                \
998   sub height, y_a, y_c;                                                        \
999                                                                                \
1000   vdup.u32 x_starts, x_a;                                                      \
1001   vmov x_ends, x_c, x_b;                                                       \
1002                                                                                \
1003   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1004   setup_spans_up(major, minor, minor, yes);                                    \
1005   setup_spans_epilogue()                                                       \
1006
1007 function(setup_spans_up_left)
1008   setup_spans_up_up(left, right)
1009
1010 function(setup_spans_up_right)
1011   setup_spans_up_up(right, left)
1012
/*
 * setup_spans_down_down(minor, major)
 *
 * Complete body for a triangle whose spans are all generated by a single
 * downward pass with the alternate edge active.
 *
 *   height_minor_a = y_b - y_a
 *   height_minor_b = y_c - y_b
 *   height         = y_c - y_a
 *   x_starts       = x_a broadcast to both lanes
 *   x_ends         = (x_c, x_b)
 *
 * compute_edge_delta_x3 (defined outside this view) is then expanded, and
 * setup_spans_down is invoked with major as the left index and minor as both
 * the right index and the side that receives the alternate edge.
 *
 * The two entry points below instantiate it with the minor edge on the left
 * and on the right respectively.
 */
1013 #define setup_spans_down_down(minor, major)                                    \
1014   setup_spans_prologue();                                                      \
1015   sub height_minor_a, y_b, y_a;                                                \
1016   sub height_minor_b, y_c, y_b;                                                \
1017   sub height, y_c, y_a;                                                        \
1018                                                                                \
1019   vdup.u32 x_starts, x_a;                                                      \
1020   vmov x_ends, x_c, x_b;                                                       \
1021                                                                                \
1022   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1023   setup_spans_down(major, minor, minor, yes);                                  \
1024   setup_spans_epilogue()                                                       \
1025
1026 function(setup_spans_down_left)
1027   setup_spans_down_down(left, right)
1028
1029 function(setup_spans_down_right)
1030   setup_spans_down_down(right, left)
1031
1032
/*
 * setup_spans_up_flat()
 *
 * Shared tail for the two-edge upward cases: height = y_a - y_c, then
 * compute_edge_delta_x2 (defined outside this view), an upward pass with no
 * alternate edge, and the common epilogue. The caller must already have
 * expanded setup_spans_prologue and loaded x_starts / x_ends.
 *
 * The two entry points differ only in how the x coordinates are paired:
 *   setup_spans_up_a   x_starts = (x_a, x_b), x_ends = x_c in both lanes
 *   setup_spans_up_b   x_starts = x_a in both lanes, x_ends = (x_b, x_c)
 */
1033 #define setup_spans_up_flat()                                                  \
1034   sub height, y_a, y_c;                                                        \
1035                                                                                \
1036   compute_edge_delta_x2();                                                     \
1037   setup_spans_up(left, right, none, no);                                       \
1038   setup_spans_epilogue()                                                       \
1039
1040 function(setup_spans_up_a)
1041   setup_spans_prologue()
1042
1043   vmov x_starts, x_a, x_b
1044   vdup.u32 x_ends, x_c
1045
1046   setup_spans_up_flat()
1047
1048 function(setup_spans_up_b)
1049   setup_spans_prologue()
1050
1051   vdup.u32 x_starts, x_a
1052   vmov x_ends, x_b, x_c
1053
1054   setup_spans_up_flat()
1055
/*
 * setup_spans_down_flat()
 *
 * Shared tail for the two-edge downward cases: height = y_c - y_a, then
 * compute_edge_delta_x2 (defined outside this view), a downward pass with no
 * alternate edge, and the common epilogue. The caller must already have
 * expanded setup_spans_prologue and loaded x_starts / x_ends.
 *
 * The two entry points differ only in how the x coordinates are paired:
 *   setup_spans_down_a   x_starts = (x_a, x_b), x_ends = x_c in both lanes
 *   setup_spans_down_b   x_starts = x_a in both lanes, x_ends = (x_b, x_c)
 */
1056 #define setup_spans_down_flat()                                                \
1057   sub height, y_c, y_a;                                                        \
1058                                                                                \
1059   compute_edge_delta_x2();                                                     \
1060   setup_spans_down(left, right, none, no);                                     \
1061   setup_spans_epilogue()                                                       \
1062
1063 function(setup_spans_down_a)
1064   setup_spans_prologue()
1065
1066   vmov x_starts, x_a, x_b
1067   vdup.u32 x_ends, x_c
1068
1069   setup_spans_down_flat()
1070
1071 function(setup_spans_down_b)
1072   setup_spans_prologue()
1073
1074   vdup.u32 x_starts, x_a
1075   vmov x_ends, x_b, x_c
1076
1077   setup_spans_down_flat()
1078
1079
/*
 * Extra register aliases used only by setup_spans_up_down.
 *
 *   middle_y                  core register that preserves the original y_a
 *   edges_xy_b (q11)          second edge set, positions. d22 / d23 are its
 *                             left / right halves.
 *   edges_dx_dy_and_shifts_b  (q13) second edge set, slopes in d26 and
 *                             shifts in d27
 *   height_increment (d20)    per-lane row count fed to a widening
 *                             multiply-accumulate
 *   edges_dx_dy_and_shifts    (q1) the live slope-and-shift register that
 *                             the second set is copied into
 *
 * setup_spans_up_down_load_edge_set_b() makes the second edge set current by
 * copying edges_xy_b into edges_xy and q13 into q1.
 */
1080 #define middle_y                                          r9
1081
1082 #define edges_xy_b                                        q11
1083 #define edges_dx_dy_b                                     d26
1084 #define edge_shifts_b                                     d27
1085 #define edges_dx_dy_and_shifts_b                          q13
1086 #define height_increment                                  d20
1087
1088 #define edges_dx_dy_and_shifts                            q1
1089
1090 #define edges_xy_b_left                                   d22
1091 #define edges_xy_b_right                                  d23
1092
1093 #define setup_spans_up_down_load_edge_set_b()                                  \
1094   vmov edges_xy, edges_xy_b;                                                   \
1095   vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b                        \
1096
1097
/*
 * setup_spans_up_down
 *
 * Span setup for a triangle that is split at y_a: one pass walks upwards
 * from y_a towards y_b, a second pass walks downwards from y_a towards y_c.
 * Arguments arrive as arranged by setup_spans_prologue (defined outside this
 * view).
 *
 * Two edge sets are prepared up front. The first is used by the upward pass.
 * The second (the _b registers) is assembled from the alternate edge and the
 * right edge and is swapped in before the downward pass.
 *
 * Local labels:
 *   0  end of a clip block (used once per pass)
 *   1  function exit
 *   2  span loop (used once per pass)
 *   3  upward pass is empty: just switch to edge set b
 *   4  start of the downward pass
 *   5  total span count equals MAX_SPANS
 *
 * All viewport bounds are loaded sign-extended (ldrsh), the same way
 * setup_spans_down loads them, because they take part in signed comparisons
 * against vertex coordinates. For non-negative bounds this is identical to a
 * zero-extending load.
 */
1098 function(setup_spans_up_down)
1099   setup_spans_prologue()
1100
1101   // s32 middle_y = y_a;
1102   sub height_minor_a, y_a, y_b
1103   sub height_minor_b, y_c, y_a
1104   sub height_major, y_c, y_b
1105
1106   vmov x_starts, x_a, x_c
1107   vdup.u32 x_ends, x_b
1108
1109   compute_edge_delta_x3(x_a, height_minor_a, height_major)
1110
  /* advance only the second lane of edges_xy, by height_minor_b slope steps */
1111   mov temp, #0
1112   vmov height_increment, temp, height_minor_b
1113   vmlal.s32 edges_xy, edges_dx_dy, height_increment
1114
  /* build edge set b: left comes from the alternate edge, right is shared */
1115   vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1116   vmov edges_xy_b_right, edges_xy_right
1117
1118   vmov edge_shifts_b, edge_shifts
1119   vmov.u32 edge_shifts_b[0], edge_shift_alt
1120
  /* slopes are negated for the opposite direction, lane 0 is the alternate */
1121   vneg.s32 edges_dx_dy_b, edges_dx_dy
1122   vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1123
1124   mov middle_y, y_a
1125
  /* ---- upward pass: starts one row above y_a ---- */
1126   setup_spans_load_b()
1127   sub y_a, y_a, #1
1128
  /* top clip: drop the rows above viewport_start_y */
1129   ldrsh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
1130   subs temp, temp, y_b
1131   subgt height_minor_a, height_minor_a, temp
1132
  /* bottom clip: skip the rows below viewport_end_y */
1133   ldrsh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
1134   subs clip, y_a, temp
1135   ble 0f
1136
1137   sub height_minor_a, height_minor_a, clip
1138   sub y_a, y_a, clip
1139   setup_spans_clip(decrement, no)
1140
1141  0:
1142   cmp height_minor_a, #0
1143   ble 3f
1144
  /* y_x4 = y_a, y_a - 1, y_a - 2, y_a - 3 as four 16-bit lanes */
1145   orr temp, y_a, y_a, lsl #16
1146   sub temp, temp, #(1 << 16)
1147   sub y_a, temp, #2
1148   sub y_a, y_a, #(2 << 16)
1149   vmov y_x4, temp, y_a
1150
1151   vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1152
1153   strh height_minor_a, [psx_gpu, #psx_gpu_num_spans_offset]
1154
  /* extract left / right from set a, then make set b current for later */
1155   setup_spans_adjust_edges_alternate_no(left, right);
1156   setup_spans_adjust_interpolants_up()
1157   setup_spans_up_down_load_edge_set_b()
1158
1159   setup_spans_prologue_b()
1160
1161
1162  2:
1163   setup_spans_set_x4_alternate_no(none, up)
1164   subs height_minor_a, height_minor_a, #4
1165   bhi 2b
1166
  /*
   * The loop always writes four rows, so height_minor_a is now zero or
   * negative. Rewind the three output pointers by the overshoot
   * (8, 16 and 4 bytes per row respectively).
   */
1167   add span_edge_data, span_edge_data, height_minor_a, lsl #3
1168   add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1169   add span_b_offset, span_b_offset, height_minor_a, lsl #2
1170
  /* ---- downward pass: restart from the original y_a and uvrg ---- */
1171  4:
1172   add temp, psx_gpu, #psx_gpu_uvrg_offset
1173   vld1.32 { uvrg }, [temp]
1174   mov y_a, middle_y
1175
1176   setup_spans_load_b()
1177
  /* bottom clip: drop the rows below viewport_end_y */
1178   ldrsh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
1179   subs y_c, y_c, temp
1180   subgt height_minor_b, height_minor_b, y_c
1181   addgt height_minor_b, height_minor_b, #1
1182
  /* top clip: skip the rows above viewport_start_y */
1183   ldrsh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
1184   subs clip, temp, y_a
1185   ble 0f
1186
1187   sub height_minor_b, height_minor_b, clip
1188   add y_a, y_a, clip
1189   setup_spans_clip(increment, no)
1190
1191  0:
1192   cmp height_minor_b, #0
1193   ble 1f
1194
  /* y_x4 = y_a, y_a + 1, y_a + 2, y_a + 3 as four 16-bit lanes */
1195   orr temp, y_a, y_a, lsl #16
1196   add temp, temp, #(1 << 16)
1197   add y_a, temp, #2
1198   add y_a, y_a, #(2 << 16)
1199   vmov y_x4, temp, y_a
1200
1201   setup_spans_adjust_edges_alternate_no(left, right)
1202
  /* accumulate the span count on top of whatever num_spans already holds */
1203   ldrh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1204   add temp, temp, height_minor_b
1205
1206   cmp temp, #MAX_SPANS
1207   beq 5f
1208
1209   strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1210
1211  2:
1212   setup_spans_set_x4_alternate_no(none, down)
1213   subs height_minor_b, height_minor_b, #4
1214   bhi 2b
1215
1216  1:
1217   setup_spans_epilogue()
1218
  /* upward pass produced nothing: only switch to edge set b */
1219  3:
1220   setup_spans_up_down_load_edge_set_b()
1221   setup_spans_prologue_b()
1222   bal 4b
1223
  /*
   * The total is exactly MAX_SPANS: round height_minor_b down to a multiple
   * of four and store the reduced total. The flags set by bics survive the
   * add and strh, so bne re-enters the loop only when rows remain.
   */
1224  5:
1225   // FIXME: overflow corner case
1226   sub temp, temp, height_minor_b
1227   bics height_minor_b, #3
1228   add temp, temp, height_minor_b
1229   strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1230   bne 2b
1231   bal 1b
1232
/*
 * Register alias switch-over. The names below were used by the span setup
 * code above and are released so they can be bound to different registers
 * for the code that follows.
 *
 * Several aliases in this list intentionally share a physical register
 * (for example r2 is span_uvrg_offset, uvrg_dx_ptr and color, r8 is left_x,
 * dither_shift, b_dx4 and right_mask, r10 is dither_offset_ptr, dither_row,
 * block_ptr_b and block_span_ptr), so names sharing a register cannot be
 * live at the same time.
 *
 * uv_dx8 and rg_dx8 are defined twice below with identical values, which the
 * preprocessor accepts silently.
 */
1233 #undef span_uvrg_offset
1234 #undef span_edge_data
1235 #undef span_b_offset
1236 #undef left_x
1237 #undef b
1238
1239 #define psx_gpu                                  r0
1240 #define num_spans                                r1
1241 #define span_uvrg_offset                         r2
1242 #define span_edge_data                           r3
1243 #define span_b_offset                            r4
1244 #define b_dx                                     r5
1245 #define span_num_blocks                          r6
1246 #define y                                        r7
1247 #define left_x                                   r8
1248 #define b                                        r9
1249 #define dither_offset_ptr                        r10
1250 #define block_ptr_a                              r11
1251 #define fb_ptr                                   r12
1252 #define num_blocks                               r14
1253
1254 #define uvrg_dx_ptr                              r2
1255 #define texture_mask_ptr                         r3
1256 #define dither_shift                             r8
1257 #define dither_row                               r10
1258
1259 #define c_32                                     r7
1260 #define b_dx4                                    r8
1261 #define b_dx8                                    r9
1262 #define block_ptr_b                              r10
1263
1264 #define block_span_ptr                           r10
1265 #define right_mask                               r8
1266
1267 #define color                                    r2
1268 #define color_r                                  r3
1269 #define color_g                                  r4
1270 #define color_b                                  r5
1271
1272 #undef uvrg
1273
1274 #define u_block                                  q0
1275 #define v_block                                  q1
1276 #define r_block                                  q2
1277 #define g_block                                  q3
1278 #define b_block                                  q4
1279
1280 #define uv_dx4                                   d10
1281 #define rg_dx4                                   d11
1282 #define uv_dx8                                   d12
1283 #define rg_dx8                                   d13
1284 #define b_whole_8                                d14
1285 #define fb_mask_ptrs                             d15
1286
1287 #define uvrg_dx4                                 q5
1288 #define uvrg_dx8                                 q6
1289 #define uv_dx8                                   d12
1290 #define rg_dx8                                   d13
1291
1292 #define u_whole                                  q8
1293 #define v_whole                                  q9
1294 #define r_whole                                  q10
1295 #define g_whole                                  q11
1296 #define b_whole                                  q12
1297
1298 #define u_whole_low                              d16
1299 #define u_whole_high                             d17
1300 #define v_whole_low                              d18
1301 #define v_whole_high                             d19
1302 #define r_whole_low                              d20
1303 #define r_whole_high                             d21
1304 #define g_whole_low                              d22
1305 #define g_whole_high                             d23
1306 #define b_whole_low                              d24
1307 #define b_whole_high                             d25
1308
1309 #define dx4                                      q13
1310 #define dx8                                      q13
1311
1312 #define u_whole_8                                d26
1313 #define v_whole_8                                d27
1314 #define u_whole_8b                               d24
1315 #define r_whole_8                                d24
1316 #define g_whole_8                                d25
1317
1318 #define uv_whole_8                               q13
1319 #define uv_whole_8b                              q14
1320
1321 #define dither_offsets                           q14
1322 #define texture_mask                             q15
1323 #define texture_mask_u                           d30
1324 #define texture_mask_v                           d31
1325
1326 #define dither_offsets_short                     d28
1327
1328 #define v_left_x                                 q8
1329 #define uvrg                                     q9
1330 #define block_span                               q10
1331
1332 #define uv                                       d18
1333 #define rg                                       d19
1334
1335 #define draw_mask                                q1
1336 #define draw_mask_edge                           q13
1337 #define test_mask                                q0
1338
1339 #define uvrg_dx                                  q3
1340
1341 #define colors                                   q2
1342
1343 #define setup_blocks_texture_swizzled() /* swap u[7:4] with v[3:0], per byte */ \
1344   vand.u8 u_whole_8b, u_whole_8, texture_mask_u; /* d24 scratch = masked u */   \
1345   vsli.u8 u_whole_8, v_whole_8, #4; /* u = v[3:0]:u[3:0] */                     \
1346   vsri.u8 v_whole_8, u_whole_8b, #4 /* v = v[7:4]:u[7:4] */                     \
1347
1348 #define setup_blocks_texture_unswizzled() /* expands to nothing */              \
1349
1350
1351 #define setup_blocks_shaded_textured_builder(swizzling)                        \
1352 .align 3;                                                                      \
1353   /* One routine per swizzling variant: expands spans into 64-byte blocks.   */\
1354 function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
1355   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
1356   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1357                                                                                \
1358   vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128];                                   \
1359   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1360   /* Return immediately when num_spans is zero.                              */\
1361   cmp num_spans, #0;                                                           \
1362   bxeq lr;                                                                     \
1363   /* save_abi_regs() expands to nothing, so q4-q7 are used without saving.   */\
1364   stmdb sp!, { r4 - r11, r14 };                                                \
1365   save_abi_regs();                                                             \
1366   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1367   /* uvrg_dx4 and uvrg_dx8 hold uvrg_dx scaled by 4 and by 8.                */\
1368   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
1369   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1370   /* Replicate the two mask bytes across d30 (u) and d31 (v).                */\
1371   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16];     \
1372   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1373                                                                                \
1374   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1375   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1376                                                                                \
1377   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
1378   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1379   /* Start writing at block index num_blocks (64 bytes per block).           */\
1380   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1381   /* 0: per-span loop. fb_mask_ptrs lane 0 is 0 except in a final block.     */\
1382  0:                                                                            \
1383   vmov.u8 fb_mask_ptrs, #0;                                                    \
1384                                                                                \
1385   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
1386   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1387                                                                                \
1388   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
1389   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
1390   /* A span with zero blocks is skipped (label 1).                           */\
1391   cmp span_num_blocks, #0;                                                     \
1392   beq 1f;                                                                      \
1393                                                                                \
1394   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
1395   add num_blocks, span_num_blocks, num_blocks;                                 \
1396   /* Flush first (label 2) if this span would exceed MAX_BLOCKS.             */\
1397   cmp num_blocks, #MAX_BLOCKS;                                                 \
1398   bgt 2f;                                                                      \
1399   /* 3: fb_ptr = vram_out_ptr + (y << 11) + (left_x << 1).                   */\
1400  3:                                                                            \
1401   ldr b, [span_b_offset];                                                      \
1402   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1403   /* dither_row = 32-bit dither table entry number (y & 3).                  */\
1404   vdup.u32 v_left_x, left_x;                                                   \
1405   and y, y, #0x3;                                                              \
1406                                                                                \
1407   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
1408   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1409   /* b += b_dx * left_x; uvrg gets the same treatment via vmla below.        */\
1410   mla b, b_dx, left_x, b;                                                      \
1411   and dither_shift, left_x, #0x03;                                             \
1412   /* uvrg_dx is rebuilt from uvrg_dx4 because q3 doubles as g_block.         */\
1413   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
1414   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1415   /* Rotate amount for the dither row: 8 * (left_x & 3) bits.                */\
1416   mov dither_shift, dither_shift, lsl #3;                                      \
1417   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1418   /* The flags from this subs are consumed by 'beq 5f' before label 4.       */\
1419   mov c_32, #32;                                                               \
1420   subs span_num_blocks, span_num_blocks, #1;                                   \
1421                                                                                \
1422   mov dither_row, dither_row, ror dither_shift;                                \
1423   mov b_dx4, b_dx, lsl #2;                                                     \
1424                                                                                \
1425   vdup.u32 dither_offsets_short, dither_row;                                   \
1426   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1427   /* Widen the dither bytes to signed 16 bits, shifted left by 4.            */\
1428   vdup.u32 b_block, b;                                                         \
1429   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1430   /* Splat each channel's start value; block_span is added per lane below.   */\
1431   vdup.u32 u_block, uv[0];                                                     \
1432   mov b_dx8, b_dx, lsl #3;                                                     \
1433                                                                                \
1434   vdup.u32 v_block, uv[1];                                                     \
1435   vdup.u32 r_block, rg[0];                                                     \
1436   vdup.u32 g_block, rg[1];                                                     \
1437   /* Five consecutive 16-byte block_span vectors follow: u, v, r, g, b.      */\
1438   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1439                                                                                \
1440   vadd.u32 u_block, u_block, block_span;                                       \
1441   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1442                                                                                \
1443   vadd.u32 v_block, v_block, block_span;                                       \
1444   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1445                                                                                \
1446   vadd.u32 r_block, r_block, block_span;                                       \
1447   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1448                                                                                \
1449   vadd.u32 g_block, g_block, block_span;                                       \
1450   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
1451   /* block_ptr_b runs 16 bytes ahead of block_ptr_a; both step by c_32.      */\
1452   vadd.u32 b_block, b_block, block_span;                                       \
1453   add block_ptr_b, block_ptr_a, #16;                                           \
1454   /* *_whole_low = upper 16 bits of each 32-bit accumulator lane.            */\
1455   vshrn.u32 u_whole_low, u_block, #16;                                         \
1456   vshrn.u32 v_whole_low, v_block, #16;                                         \
1457   vshrn.u32 r_whole_low, r_block, #16;                                         \
1458   vshrn.u32 g_whole_low, g_block, #16;                                         \
1459   /* *_whole_high = upper 16 bits of (lane + 4 * delta), via vaddhn.         */\
1460   vdup.u32 dx4, uv_dx4[0];                                                     \
1461   vshrn.u32 b_whole_low, b_block, #16;                                         \
1462                                                                                \
1463   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1464   vdup.u32 dx4, uv_dx4[1];                                                     \
1465                                                                                \
1466   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1467   vdup.u32 dx4, rg_dx4[0];                                                     \
1468                                                                                \
1469   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1470   vdup.u32 dx4, rg_dx4[1];                                                     \
1471                                                                                \
1472   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1473   vdup.u32 dx4, b_dx4;                                                         \
1474                                                                                \
1475   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1476   vdup.u32 dx8, uv_dx8[0];                                                     \
1477   /* Advance every accumulator by 8 * delta, ready for the next block.       */\
1478   vadd.u32 u_block, u_block, dx8;                                              \
1479   vdup.u32 dx8, uv_dx8[1];                                                     \
1480                                                                                \
1481   vadd.u32 v_block, v_block, dx8;                                              \
1482   vdup.u32 dx8, rg_dx8[0];                                                     \
1483                                                                                \
1484   vadd.u32 r_block, r_block, dx8;                                              \
1485   vdup.u32 dx8, rg_dx8[1];                                                     \
1486                                                                                \
1487   vadd.u32 g_block, g_block, dx8;                                              \
1488   vdup.u32 dx8, b_dx8;                                                         \
1489   /* u/v narrow into d26/d27 (q13), so dx8 (also q13) is finished first.     */\
1490   vadd.u32 b_block, b_block, dx8;                                              \
1491   vmovn.u16 u_whole_8, u_whole;                                                \
1492                                                                                \
1493   vmovn.u16 v_whole_8, v_whole;                                                \
1494   /* b goes to d14 before r/g are narrowed into d24/d25 (b_whole's q12).     */\
1495   vmovn.u16 b_whole_8, b_whole;                                                \
1496   pld [fb_ptr];                                                                \
1497   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1498   /* Mask u and v with texture_mask, then apply this variant's swizzle.      */\
1499   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1500   setup_blocks_texture_##swizzling();                                          \
1501   /* A one-block span skips the loop and goes straight to label 5.           */\
1502   vmovn.u16 r_whole_8, r_whole;                                                \
1503   beq 5f;                                                                      \
1504   /* 4: store the block just computed while preparing the next one.          */\
1505  4:                                                                            \
1506   vmovn.u16 g_whole_8, g_whole;                                                \
1507   vshrn.u32 u_whole_low, u_block, #16;                                         \
1508   /* Block layout: +0 u/v pairs, +16 r and g, +32 b and d15, +48 dither.     */\
1509   vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32;                 \
1510   vshrn.u32 v_whole_low, v_block, #16;                                         \
1511                                                                                \
1512   vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32;                \
1513   vshrn.u32 r_whole_low, r_block, #16;                                         \
1514                                                                                \
1515   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1516   vshrn.u32 g_whole_low, g_block, #16;                                         \
1517                                                                                \
1518   vdup.u32 dx4, uv_dx4[0];                                                     \
1519   vshrn.u32 b_whole_low, b_block, #16;                                         \
1520                                                                                \
1521   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1522   vdup.u32 dx4, uv_dx4[1];                                                     \
1523                                                                                \
1524   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1525   vdup.u32 dx4, rg_dx4[0];                                                     \
1526                                                                                \
1527   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1528   vdup.u32 dx4, rg_dx4[1];                                                     \
1529                                                                                \
1530   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1531   vdup.u32 dx4, b_dx4;                                                         \
1532                                                                                \
1533   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1534   vdup.u32 dx8, uv_dx8[0];                                                     \
1535                                                                                \
1536   vadd.u32 u_block, u_block, dx8;                                              \
1537   vdup.u32 dx8, uv_dx8[1];                                                     \
1538                                                                                \
1539   vadd.u32 v_block, v_block, dx8;                                              \
1540   vdup.u32 dx8, rg_dx8[0];                                                     \
1541                                                                                \
1542   vadd.u32 r_block, r_block, dx8;                                              \
1543   vdup.u32 dx8, rg_dx8[1];                                                     \
1544                                                                                \
1545   vadd.u32 g_block, g_block, dx8;                                              \
1546   vdup.u32 dx8, b_dx8;                                                         \
1547                                                                                \
1548   vadd.u32 b_block, b_block, dx8;                                              \
1549   vmovn.u16 u_whole_8, u_whole;                                                \
1550   /* fb_ptr moves 16 bytes per block.                                        */\
1551   add fb_ptr, fb_ptr, #16;                                                     \
1552   vmovn.u16 v_whole_8, v_whole;                                                \
1553                                                                                \
1554   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1555   vmovn.u16 b_whole_8, b_whole;                                                \
1556                                                                                \
1557   pld [fb_ptr];                                                                \
1558                                                                                \
1559   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1560   subs span_num_blocks, span_num_blocks, #1;                                   \
1561                                                                                \
1562   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1563   setup_blocks_texture_##swizzling();                                          \
1564                                                                                \
1565   vmovn.u16 r_whole_8, r_whole;                                                \
1566   bne 4b;                                                                      \
1567   /* 5: final block of the span: clear u/v where right_mask flags a lane.    */\
1568  5:                                                                            \
1569   vmovn.u16 g_whole_8, g_whole;                                                \
1570   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
1571   /* test_mask is read from the first 16 bytes of psx_gpu.                   */\
1572   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
1573   vdup.u8 draw_mask, right_mask;                                               \
1574   /* 16-bit draw_mask lane is all ones when it shares a bit with test_mask.  */\
1575   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1576   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1577   vzip.u8 u_whole_8, v_whole_8;                                                \
1578   /* vzip above yields the same u/v byte order that vst2 stores in loop 4.   */\
1579   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1580   vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32;                \
1581   vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32;                          \
1582   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1583   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1584   /* 1: step to the next span: uvrg +16, b +4, edge data +8 bytes.           */\
1585  1:                                                                            \
1586   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1587   add span_b_offset, span_b_offset, #4;                                        \
1588                                                                                \
1589   add span_edge_data, span_edge_data, #8;                                      \
1590   subs num_spans, num_spans, #1;                                               \
1591                                                                                \
1592   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1593   bne 0b;                                                                      \
1594   /* Return by popping the saved lr into pc.                                 */\
1595   restore_abi_regs();                                                          \
1596   ldmia sp!, { r4 - r11, pc };                                                 \
1597   /* 2: block buffer would overflow: flush it, then resume at label 3.       */\
1598  2:                                                                            \
1599   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1600   vpush { texture_mask };                                                      \
1601   vpush { uvrg_dx4 };                                                          \
1602   /* Keep r0-r3, r12 and lr (plus EXTRA_UNSAVED_REGS) across the call.       */\
1603   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
1604   bl flush_render_block_buffer;                                                \
1605   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
1606                                                                                \
1607   vpop { uvrg_dx4 };                                                           \
1608   vpop { texture_mask };                                                       \
1609   /* uvrg_dx8 was not saved; rebuild it as uvrg_dx4 + uvrg_dx4.              */\
1610   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1611   vmov.u8 fb_mask_ptrs, #0;                                                    \
1612   /* num_blocks = this span's count; block_ptr_a = start of the block array. */\
1613   mov num_blocks, span_num_blocks;                                             \
1614   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1615   bal 3b                                                                       \
1616
1617
1618 setup_blocks_shaded_textured_builder(swizzled)   /* instantiates setup_blocks_shaded_textured_dithered_swizzled_indirect */
1619 setup_blocks_shaded_textured_builder(unswizzled) /* instantiates setup_blocks_shaded_textured_dithered_unswizzled_indirect */
1620
1621
1622 #define setup_blocks_unshaded_textured_builder(swizzling)                      \
1623 .align 3;                                                                      \
1624                                                                                \
1625 function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
1626   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
1627   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1628                                                                                \
1629   vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128];                                   \
1630   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1631                                                                                \
1632   cmp num_spans, #0;                                                           \
1633   bxeq lr;                                                                     \
1634                                                                                \
1635   stmdb sp!, { r4 - r11, r14 };                                                \
1636   save_abi_regs();                                                             \
1637   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1638                                                                                \
1639   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1640                                                                                \
1641   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16];     \
1642   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1643                                                                                \
1644   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1645   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1646                                                                                \
1647   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1648                                                                                \
1649   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1650                                                                                \
1651  0:                                                                            \
1652   vmov.u8 fb_mask_ptrs, #0;                                                    \
1653                                                                                \
1654   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
1655   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1656                                                                                \
1657   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
1658   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
1659                                                                                \
1660   cmp span_num_blocks, #0;                                                     \
1661   beq 1f;                                                                      \
1662                                                                                \
1663   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
1664   add num_blocks, span_num_blocks, num_blocks;                                 \
1665                                                                                \
1666   cmp num_blocks, #MAX_BLOCKS;                                                 \
1667   bgt 2f;                                                                      \
1668                                                                                \
1669  3:                                                                            \
1670   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1671                                                                                \
1672   vdup.u32 v_left_x, left_x;                                                   \
1673   and y, y, #0x3;                                                              \
1674                                                                                \
1675   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
1676   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1677                                                                                \
1678   and dither_shift, left_x, #0x03;                                             \
1679                                                                                \
1680   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
1681   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1682                                                                                \
1683   mov dither_shift, dither_shift, lsl #3;                                      \
1684   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1685                                                                                \
1686   mov c_32, #32;                                                               \
1687   subs span_num_blocks, span_num_blocks, #1;                                   \
1688                                                                                \
1689   mov dither_row, dither_row, ror dither_shift;                                \
1690                                                                                \
1691   vdup.u32 dither_offsets_short, dither_row;                                   \
1692   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1693                                                                                \
1694   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1695                                                                                \
1696   vdup.u32 u_block, uv[0];                                                     \
1697                                                                                \
1698   vdup.u32 v_block, uv[1];                                                     \
1699   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1700                                                                                \
1701   vadd.u32 u_block, u_block, block_span;                                       \
1702   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1703                                                                                \
1704   vadd.u32 v_block, v_block, block_span;                                       \
1705   add block_ptr_b, block_ptr_a, #16;                                           \
1706                                                                                \
1707   vshrn.u32 u_whole_low, u_block, #16;                                         \
1708   vshrn.u32 v_whole_low, v_block, #16;                                         \
1709                                                                                \
1710   vdup.u32 dx4, uv_dx4[0];                                                     \
1711                                                                                \
1712   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1713   vdup.u32 dx4, uv_dx4[1];                                                     \
1714                                                                                \
1715   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1716   vdup.u32 dx8, uv_dx8[0];                                                     \
1717                                                                                \
1718   vadd.u32 u_block, u_block, dx8;                                              \
1719   vdup.u32 dx8, uv_dx8[1];                                                     \
1720                                                                                \
1721   vadd.u32 v_block, v_block, dx8;                                              \
1722   vmovn.u16 u_whole_8, u_whole;                                                \
1723                                                                                \
1724   vmovn.u16 v_whole_8, v_whole;                                                \
1725                                                                                \
1726   pld [fb_ptr];                                                                \
1727   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1728                                                                                \
1729   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1730   setup_blocks_texture_##swizzling();                                          \
1731                                                                                \
1732   beq 5f;                                                                      \
1733                                                                                \
1734  4:                                                                            \
1735   vshrn.u32 u_whole_low, u_block, #16;                                         \
1736                                                                                \
1737   vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32;                 \
1738   vshrn.u32 v_whole_low, v_block, #16;                                         \
1739                                                                                \
1740   add block_ptr_b, block_ptr_b, #32;                                           \
1741   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1742                                                                                \
1743   vdup.u32 dx4, uv_dx4[0];                                                     \
1744   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1745   vdup.u32 dx4, uv_dx4[1];                                                     \
1746                                                                                \
1747   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1748   vdup.u32 dx8, uv_dx8[0];                                                     \
1749                                                                                \
1750   vadd.u32 u_block, u_block, dx8;                                              \
1751   vdup.u32 dx8, uv_dx8[1];                                                     \
1752                                                                                \
1753   vadd.u32 v_block, v_block, dx8;                                              \
1754   vmovn.u16 u_whole_8, u_whole;                                                \
1755                                                                                \
1756   add fb_ptr, fb_ptr, #16;                                                     \
1757   vmovn.u16 v_whole_8, v_whole;                                                \
1758                                                                                \
1759   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1760   pld [fb_ptr];                                                                \
1761                                                                                \
1762   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1763   subs span_num_blocks, span_num_blocks, #1;                                   \
1764                                                                                \
1765   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1766   setup_blocks_texture_##swizzling();                                          \
1767                                                                                \
1768   bne 4b;                                                                      \
1769                                                                                \
1770  5:                                                                            \
1771   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
1772                                                                                \
1773   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
1774   vdup.u8 draw_mask, right_mask;                                               \
1775                                                                                \
1776   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1777   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1778   vzip.u8 u_whole_8, v_whole_8;                                                \
1779                                                                                \
1780   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1781   add block_ptr_b, block_ptr_b, #32;                                           \
1782   vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32;                          \
1783   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1784   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1785                                                                                \
1786  1:                                                                            \
1787   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1788   add span_edge_data, span_edge_data, #8;                                      \
1789   subs num_spans, num_spans, #1;                                               \
1790                                                                                \
1791   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1792   bne 0b;                                                                      \
1793                                                                                \
1794   restore_abi_regs();                                                          \
1795   ldmia sp!, { r4 - r11, pc };                                                 \
1796                                                                                \
1797  2:                                                                            \
1798   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1799   vpush { texture_mask };                                                      \
1800   vpush { uvrg_dx4 };                                                          \
1801                                                                                \
1802   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
1803   bl flush_render_block_buffer;                                                \
1804   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
1805                                                                                \
1806   vpop { uvrg_dx4 };                                                           \
1807   vpop { texture_mask };                                                       \
1808                                                                                \
1809   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1810   vmov.u8 fb_mask_ptrs, #0;                                                    \
1811                                                                                \
1812   mov num_blocks, span_num_blocks;                                             \
1813   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1814   bal 3b                                                                       \
1815
1816
/*
 * Expand the unshaded/textured builder macro once per texture layout.  The
 * argument is token-pasted into the setup_blocks_texture_##swizzling() hook
 * that the builder invokes for every block.
 */
1817 setup_blocks_unshaded_textured_builder(swizzled)
1818 setup_blocks_unshaded_textured_builder(unswizzled)
1819
1820
1821 .align 3
1822
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * Walks the span edge table inside psx_gpu and appends one block-buffer
 * entry per block of every span, all carrying the same flat color derived
 * from the triangle color field.  This routine does not store through
 * fb_ptr itself; it only records the frame buffer address in each entry.
 *
 * In:   psx_gpu = GPU state structure.
 * Out:  block entries filled in, num_blocks field of psx_gpu updated.
 *
 * Entries are indexed with a 64-byte stride and are written as:
 *   +0   draw mask (zero for all but the last block of a span, edge mask
 *        for the last block)
 *   +16  color vector
 *   +32  { b_whole_8, fb_mask_ptrs }, with the frame buffer address placed
 *        in lane 1 of fb_mask_ptrs
 *
 * If adding a span would push the running block count above MAX_BLOCKS,
 * flush_render_block_buffer is called and filling restarts at entry 0.
 */
1823 function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
  /* Fetch the span count and clear draw_mask (x ^ x == 0). */
1824   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
1825   veor.u32 draw_mask, draw_mask, draw_mask
1826
  /* No spans: return before anything has been pushed. */
1827   cmp num_spans, #0
1828   bxeq lr
1829
1830   stmdb sp!, { r4 - r11, r14 }
  /* Expands to nothing while the "#if 0" near the top of the file is in effect. */
1831   save_abi_regs()
  /* test_mask = the 128 bits located at the very start of the psx_gpu structure. */
1832   vld1.u32 { test_mask }, [psx_gpu, :128]
1833
1834   ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
1835
  /* Keep the top 5 bits of each 8-bit channel (bits 3-7, 11-15, 19-23)... */
1836   ubfx color_r, color, #3, #5
1837   ubfx color_g, color, #11, #5
1838   ubfx color_b, color, #19, #5
1839
  /* ...and pack them as r | g << 5 | b << 10. */
1840   orr color, color_r, color_b, lsl #10
1841   orr color, color, color_g, lsl #5
1842
  /* Replicate the packed color into every 16-bit lane. */
1843   vdup.u16 colors, color
1844
1845   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
1846   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1847
  /* block_ptr_a = first free entry: blocks base + num_blocks * 64. */
1848   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1849   add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1850
  /* 0: top of the per-span loop. */
1851  0:
1852   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1853   ldrh y, [span_edge_data, #edge_data_y_offset]
1854
1855   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
1856
  /* Spans with zero blocks contribute nothing. */
1857   cmp span_num_blocks, #0
1858   beq 1f
1859
1860   ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
1861   add num_blocks, span_num_blocks, num_blocks
1862
  /* Signed compare: take the flush path if the new total exceeds MAX_BLOCKS. */
1863   cmp num_blocks, #MAX_BLOCKS
1864   bgt 2f
1865
  /* 3: emit this span's entries; also the re-entry point after a flush. */
1866  3:
  /* fb_ptr = vram base + y * 2048 + left_x * 2 (completed two lines below). */
1867   add fb_ptr, fb_ptr, y, lsl #11
  /* NOTE(review): the masked y does not appear to be read under this name */
  /* before y is reloaded at 0: - possibly a leftover; confirm against the */
  /* register alias map before removing. */
1868   and y, y, #0x3
1869
1870   add fb_ptr, fb_ptr, left_x, lsl #1
  /* c_32 is the post-increment stride used by every vst1 below. */
1871   mov c_32, #32
1872
  /* The flags set here are consumed by "beq 5f" below; the add, pld and */
  /* vmov in between do not modify them. */
1873   subs span_num_blocks, span_num_blocks, #1
1874
  /* block_ptr_b trails block_ptr_a by 16 bytes and fills the color slot. */
1875   add block_ptr_b, block_ptr_a, #16
1876   pld [fb_ptr]
1877
1878   vmov.u32 fb_mask_ptrs[1], fb_ptr
  /* A single-block span goes straight to the edge-masked final entry. */
1879   beq 5f
1880
  /* 4: one iteration per block except the last one of the span. */
1881  4:
  /* +0: zero draw mask, +16: colors, +32: pointer pair; block_ptr_a ends */
  /* up 64 bytes further on, i.e. at the next entry. */
1882   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_32
1883   vst1.u32 { colors }, [block_ptr_b, :128], c_32
1884   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
1885
  /* Step the frame buffer address by 16 bytes and block_ptr_b by the */
  /* remaining 32 bytes so it points at the next entry's color slot. */
1886   add fb_ptr, fb_ptr, #16
1887   add block_ptr_b, block_ptr_b, #32
1888
1889   pld [fb_ptr]
1890
1891   vmov.u32 fb_mask_ptrs[1], fb_ptr
1892   subs span_num_blocks, span_num_blocks, #1
1893
1894   bne 4b
1895
  /* 5: last block of the span - its draw mask comes from right_mask. */
1896  5:
1897   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]
1898
  /* Broadcast the low byte of right_mask to every byte, then set each */
  /* 16-bit lane to all ones where it shares a bit with test_mask. */
1899   vdup.u8 draw_mask_edge, right_mask
1900   vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1901
1902   vst1.u32 { colors }, [block_ptr_b, :128], c_32
1903   vst1.u32 { draw_mask_edge }, [block_ptr_a, :128], c_32
1904   add block_ptr_b, block_ptr_b, #32
1905   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
1906
  /* 1: advance to the next 8-byte edge record and publish num_blocks. */
1907  1:
1908   add span_edge_data, span_edge_data, #8
1909   subs num_spans, num_spans, #1
1910
  /* strh leaves the flags from the subs above intact for the bne. */
1911   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
1912   bne 0b
1913
1914   restore_abi_regs()
  /* Loading the saved r14 into pc returns to the caller. */
1915   ldmia sp!, { r4 - r11, pc }
1916                                                                            
  /* 2: block buffer would overflow - flush it, then retry this span. */
1917  2:
  /* colors is kept across the call on the stack. */
1918   vpush { colors }
1919
1920   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
1921   bl flush_render_block_buffer
1922   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
1923
1924   vpop { colors }
1925
  /* test_mask and draw_mask are rebuilt here rather than saved. */
1926   vld1.u32 { test_mask }, [psx_gpu, :128]
1927   veor.u32 draw_mask, draw_mask, draw_mask
1928
  /* The buffer now holds only this span: restart the count and the */
  /* write pointer at entry 0, then resume at 3:. */
1929   mov num_blocks, span_num_blocks
1930   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1931   bal 3b
1932
1933
/*
 * Register aliases introduced ahead of the direct-rendering routine.
 * mask_msb_scalar is r14 (the link register); the routine that follows
 * pushes r14 before using it as scratch.  On NEON, q15 is the d30/d31 pair,
 * so msb_mask_low/msb_mask_high name the two halves of msb_mask.
 */
1934 #define mask_msb_scalar                                   r14
1935
1936 #define msb_mask                                          q15
1937
1938 #define pixels_low                                        d16
1939
1940 #define msb_mask_low                                      d30
1941 #define msb_mask_high                                     d31
1941 #define msb_mask_high                                     d31
1942
1943
1944 .align 3
1945
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * Flat-color fill that bypasses the block buffer: for every span in the
 * edge table the packed color is stored straight to the frame buffer
 * address computed from the span's y and left_x.
 *
 * In:   psx_gpu = GPU state structure.
 *
 * The stored 16-bit value is r | g << 5 | b << 10 (top 5 bits of each
 * 8-bit channel of the triangle color) ORed with the mask_msb field.
 * All but the last block of a span are written with one vector store of
 * "colors" each; the last block is trimmed according to right_mask, where
 * set bits suppress stores.
 *
 * Unlike the indirect variant, this routine does not invoke
 * save_abi_regs()/restore_abi_regs().
 */
1946 function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1947   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
1948
  /* No spans: return before anything has been pushed. */
1949   cmp num_spans, #0
1950   bxeq lr
1951
  /* r14 is saved here, which frees it for use as mask_msb_scalar below. */
1952   stmdb sp!, { r4 - r11, r14 }
1953
1954   ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
1955
  /* Top 5 bits of each 8-bit channel: bits 3-7, 11-15 and 19-23. */
1956   ubfx color_r, color, #3, #5
1957   ubfx color_g, color, #11, #5
1958
1959   ldrh mask_msb_scalar, [psx_gpu, #psx_gpu_mask_msb_offset]
1960   ubfx color_b, color, #19, #5
1961
  /* color = r | g << 5 | b << 10 | mask_msb. */
1962   orr color, color_r, color_b, lsl #10
1963   orr color, color, color_g, lsl #5
1964   orr color, color, mask_msb_scalar
1965
  /* Vector copy for whole blocks: the value in every 16-bit lane. */
1966   vdup.u16 colors, color
1967
1968   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
  /* Scalar copy for the edge: the value in both halves of a 32-bit word. */
1969   orr color, color, color, lsl #16
1970
1971
  /* 0: top of the per-span loop. */
1972  0:
1973   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1974   ldrh y, [span_edge_data, #edge_data_y_offset]
1975
1976   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
1977
  /* Spans with zero blocks contribute nothing. */
1978   cmp span_num_blocks, #0
1979   beq 1f
1980
1981   ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
1982
  /* fb_ptr = vram base + y * 2048 + left_x * 2. */
1983   add fb_ptr, fb_ptr, y, lsl #11
  /* Flags from this subs feed "beq 3f"; the add in between leaves them alone. */
1984   subs span_num_blocks, span_num_blocks, #1
1985
1986   add fb_ptr, fb_ptr, left_x, lsl #1
  /* A single-block span has no full blocks to write. */
1987   beq 3f
1988
  /* 2: write every block except the last; fb_ptr is advanced by the store. */
1989  2:
1990   vst1.u32 { colors }, [fb_ptr]!
1991   subs span_num_blocks, span_num_blocks, #1
1992
1993   bne 2b
1994
  /* 3: last block of the span.  Only the low byte of right_mask is read. */
1995  3:
1996   ldrb right_mask, [span_edge_data, #edge_data_right_mask_offset]
1997
  /* An all-zero mask means the entire last block is written (at 5:). */
1998   cmp right_mask, #0x0
1999   beq 5f
2000
  /* Low four mask bits clear: store two words and discard those four bits. */
  /* All three conditional instructions use the flags of this one tst. */
2001   tst right_mask, #0xF
2002   streq color, [fb_ptr], #4
2003   moveq right_mask, right_mask, lsr #4
2004   streq color, [fb_ptr], #4
2005
  /* Next two mask bits clear: store one word and discard those two bits. */
2006   tst right_mask, #0x3
2007   streq color, [fb_ptr], #4
2008   moveq right_mask, right_mask, lsr #2
2009
  /* Next mask bit clear: store a single halfword. */
2010   tst right_mask, #0x1
2011   strheq color, [fb_ptr]
2012
  /* 1: advance to the next 8-byte edge record. */
2013  1:
2014   add span_edge_data, span_edge_data, #8
2015   subs num_spans, num_spans, #1
2016   bne 0b
2017
  /* Loading the saved r14 into pc returns to the caller. */
2018   ldmia sp!, { r4 - r11, pc }
2019                                                                            
  /* 5: unmasked last block - one more full vector store, no write-back. */
2020  5:
2021   vst1.u32 { colors }, [fb_ptr]
2022   bal 1b
2023
2024
/*
 * Register map for the shaded, untextured setup code that follows.  Aliases
 * carried over from the preceding code are dropped first and then re-bound.
 *
 * On NEON each qN register is the pair d(2N), d(2N+1), so several of the
 * names below deliberately refer to the same storage; the overlaps are
 * pointed out next to the definitions.
 *
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the ARM procedure
 * call standard, and save_abi_regs() expands to nothing while the "#if 0"
 * at the top of the file is in effect - confirm that callers tolerate these
 * registers being overwritten.
 */
2025 #undef c_64
2026
/* c_64: general-purpose register for a stride; rg_dx_ptr: address of rg_dx. */
2027 #define c_64                                              r7
2028 #define rg_dx_ptr                                         r2
2029
2030
2031 #undef r_block
2032 #undef g_block
2033 #undef b_block
2034 #undef r_whole
2035 #undef g_whole
2036 #undef b_whole
2037 #undef r_whole_low
2038 #undef r_whole_high
2039 #undef g_whole_low
2040 #undef g_whole_high
2041 #undef b_whole_low
2042 #undef b_whole_high
2043 #undef r_whole_8
2044 #undef g_whole_8
2045 #undef b_whole_8
2046 #undef dither_offsets
2047 #undef rg_dx4
2048 #undef rg_dx8
2049 #undef dx4
2050 #undef dx8
2051 #undef v_left_x
2052 #undef uvrg
2053 #undef block_span
2054 #undef rg
2055 #undef draw_mask
2056 #undef test_mask
2057
/* 32-bit-per-lane channel accumulators. */
2058 #define r_block                                           q0
2059 #define g_block                                           q1
2060 #define b_block                                           q2
2061
/* Narrowed channel values; each q register is split into the */
/* _low/_high halves defined right after. */
2062 #define r_whole                                           q3
2063 #define g_whole                                           q4
2064 #define b_whole                                           q5
2065
2066 #define r_whole_low                                       d6
2067 #define r_whole_high                                      d7
2068 #define g_whole_low                                       d8
2069 #define g_whole_high                                      d9
2070 #define b_whole_low                                       d10
2071 #define b_whole_high                                      d11
2072
/* gb_whole_8 (q6) is the g_whole_8/b_whole_8 pair (d12/d13), so one */
/* q-sized operation can process both channels. */
2073 #define gb_whole_8                                        q6
2074
2075 #define g_whole_8                                         d12
2076 #define b_whole_8                                         d13
2077
2078 #define r_whole_8                                         d14
2079
2080 #define pixels                                            q8
2081
/* rg_dx4/rg_dx8 are the two halves of q9. */
2082 #define rg_dx4                                            d18
2083 #define rg_dx8                                            d19
2084
/* dx4 and dx8 name the same register (q10); test_mask below is q10 too. */
2085 #define dx4                                               q10
2086 #define dx8                                               q10
2087
/* These share storage with the *_whole registers above: v_left_x is */
/* r_whole_low (d6), uvrg is g_whole (q4), block_span is b_whole (q5). */
2088 #define v_left_x                                          d6
2089 #define uvrg                                              q4
2090 #define block_span                                        q5
2091
/* rg is the upper half of uvrg (d9 is the second d register of q4). */
2092 #define rg                                                d9
2093
/* Constant holders; d22/d23 are the two halves of q11. */
2094 #define d64_1                                             d22
2095 #define d64_128                                           d23
2096
2097 #define d128_4                                            q12
2098 #define d128_0x7                                          q13
2099
/* d64_4 is the lower half of d128_4 (d24 is the first d register of q12). */
2100 #define d64_4                                             d24
2101
2102 #define dither_offsets                                    q14
2103 #define draw_mask                                         q15
2104
/* Lower half of dither_offsets (d28 is the first d register of q14). */
2105 #define dither_offsets_low                                d28
2106
/* rg_dx occupies the lower half of r_block (d0 is the first d register of q0). */
2107 #define rg_dx                                             d0
2108 #define test_mask                                         q10
2109
2110
/*
 * Dither hooks selected by the shaded/untextured builder through
 * setup_blocks_shaded_untextured_dither_a_##dithering() and
 * setup_blocks_shaded_untextured_dither_b_##dithering().
 *
 * Step "a" (dithered): unsigned saturating add of the dither offsets to the
 * 8-bit r, g and b values.  The builder adds d128_4 to dither_offsets
 * before this hook is used.
 */
2111 #define setup_blocks_shaded_untextured_dither_a_dithered()                     \
2112   vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low;                           \
2113   vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets;                             \
2114
/*
 * Step "b" (dithered): unsigned saturating subtract of d64_4/d128_4 (the
 * builder loads d128_4 with #4 in every byte) from the same values.
 */
2115 #define setup_blocks_shaded_untextured_dither_b_dithered()                     \
2116   vqsub.u8 r_whole_8, r_whole_8, d64_4;                                        \
2117   vqsub.u8 gb_whole_8, gb_whole_8, d128_4                                      \
2118
/* Undithered variants: both hooks expand to nothing. */
2119 #define setup_blocks_shaded_untextured_dither_a_undithered()                   \
2120
2121 #define setup_blocks_shaded_untextured_dither_b_undithered()                   \
2122
2123
2124 #define setup_blocks_shaded_untextured_indirect_builder(dithering)             \
2125 .align 3;                                                                      \
2126                                                                                \
2127 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
2128   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
2129   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2130                                                                                \
2131   vld1.u32 { rg_dx }, [rg_dx_ptr, :64];                                        \
2132                                                                                \
2133   cmp num_spans, #0;                                                           \
2134   bxeq lr;                                                                     \
2135                                                                                \
2136   stmdb sp!, { r4 - r11, r14 };                                                \
2137   save_abi_regs();                                                             \
2138   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2139                                                                                \
2140   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
2141   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2142                                                                                \
2143   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2144                                                                                \
2145   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
2146   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2147                                                                                \
2148   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2149   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2150                                                                                \
2151   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
2152   vmov.u8 d64_1, #1;                                                           \
2153                                                                                \
2154   vmov.u8 d128_4, #4;                                                          \
2155   vmov.u8 d64_128, #128;                                                       \
2156                                                                                \
2157   vmov.u8 d128_0x7, #0x7;                                                      \
2158                                                                                \
2159  0:                                                                            \
2160   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
2161   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2162                                                                                \
2163   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
2164   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
2165                                                                                \
2166   cmp span_num_blocks, #0;                                                     \
2167   beq 1f;                                                                      \
2168                                                                                \
2169   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
2170   add num_blocks, span_num_blocks, num_blocks;                                 \
2171                                                                                \
2172   cmp num_blocks, #MAX_BLOCKS;                                                 \
2173   bgt 2f;                                                                      \
2174                                                                                \
2175  3:                                                                            \
2176   ldr b, [span_b_offset];                                                      \
2177   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2178                                                                                \
2179   vdup.u32 v_left_x, left_x;                                                   \
2180   and y, y, #0x3;                                                              \
2181                                                                                \
2182   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
2183   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2184                                                                                \
2185   mla b, b_dx, left_x, b;                                                      \
2186   and dither_shift, left_x, #0x03;                                             \
2187                                                                                \
2188   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
2189   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2190                                                                                \
2191   mov dither_shift, dither_shift, lsl #3;                                      \
2192   vmla.u32 rg, rg_dx, v_left_x;                                                \
2193                                                                                \
2194   mov c_64, #64;                                                               \
2195   subs span_num_blocks, span_num_blocks, #1;                                   \
2196                                                                                \
2197   mov dither_row, dither_row, ror dither_shift;                                \
2198   mov b_dx4, b_dx, lsl #2;                                                     \
2199                                                                                \
2200   vdup.u32 dither_offsets, dither_row;                                         \
2201   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2202                                                                                \
2203   vdup.u32 b_block, b;                                                         \
2204   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2205                                                                                \
2206   mov b_dx8, b_dx, lsl #3;                                                     \
2207   vdup.u32 r_block, rg[0];                                                     \
2208   vdup.u32 g_block, rg[1];                                                     \
2209                                                                                \
2210   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2211                                                                                \
2212   vadd.u32 r_block, r_block, block_span;                                       \
2213   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2214                                                                                \
2215   vadd.u32 g_block, g_block, block_span;                                       \
2216   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
2217                                                                                \
2218   vadd.u32 b_block, b_block, block_span;                                       \
2219   add block_ptr_b, block_ptr_a, #16;                                           \
2220                                                                                \
2221   vshrn.u32 r_whole_low, r_block, #16;                                         \
2222   vshrn.u32 g_whole_low, g_block, #16;                                         \
2223   vshrn.u32 b_whole_low, b_block, #16;                                         \
2224   vdup.u32 dx4, rg_dx4[0];                                                     \
2225                                                                                \
2226   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2227   vdup.u32 dx4, rg_dx4[1];                                                     \
2228                                                                                \
2229   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2230   vdup.u32 dx4, b_dx4;                                                         \
2231                                                                                \
2232   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2233   vdup.u32 dx8, rg_dx8[0];                                                     \
2234                                                                                \
2235   vadd.u32 r_block, r_block, dx8;                                              \
2236   vdup.u32 dx8, rg_dx8[1];                                                     \
2237                                                                                \
2238   vadd.u32 g_block, g_block, dx8;                                              \
2239   vdup.u32 dx8, b_dx8;                                                         \
2240                                                                                \
2241   vadd.u32 b_block, b_block, dx8;                                              \
2242                                                                                \
2243   vmovn.u16 r_whole_8, r_whole;                                                \
2244   vmovn.u16 g_whole_8, g_whole;                                                \
2245   vmovn.u16 b_whole_8, b_whole;                                                \
2246                                                                                \
2247   beq 5f;                                                                      \
2248   veor.u32 draw_mask, draw_mask, draw_mask;                                    \
2249                                                                                \
2250  4:                                                                            \
2251   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2252   vshrn.u32 r_whole_low, r_block, #16;                                         \
2253                                                                                \
2254   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2255   vshrn.u32 g_whole_low, g_block, #16;                                         \
2256                                                                                \
2257   vshrn.u32 b_whole_low, b_block, #16;                                         \
2258   str fb_ptr, [block_ptr_a, #44];                                              \
2259                                                                                \
2260   vdup.u32 dx4, rg_dx4[0];                                                     \
2261   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2262   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2263                                                                                \
2264   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2265   vdup.u32 dx4, rg_dx4[1];                                                     \
2266                                                                                \
2267   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2268   vdup.u32 dx4, b_dx4;                                                         \
2269                                                                                \
2270   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2271   vdup.u32 dx8, rg_dx8[0];                                                     \
2272                                                                                \
2273   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2274   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2275   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2276                                                                                \
2277   vadd.u32 r_block, r_block, dx8;                                              \
2278   vdup.u32 dx8, rg_dx8[1];                                                     \
2279                                                                                \
2280   vadd.u32 g_block, g_block, dx8;                                              \
2281   vdup.u32 dx8, b_dx8;                                                         \
2282                                                                                \
2283   vadd.u32 b_block, b_block, dx8;                                              \
2284   add fb_ptr, fb_ptr, #16;                                                     \
2285                                                                                \
2286   vmovn.u16 r_whole_8, r_whole;                                                \
2287   vmovn.u16 g_whole_8, g_whole;                                                \
2288   vmovn.u16 b_whole_8, b_whole;                                                \
2289                                                                                \
2290   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64;                           \
2291   vst1.u32 { pixels }, [block_ptr_b, :128], c_64;                              \
2292                                                                                \
2293   pld [fb_ptr];                                                                \
2294                                                                                \
2295   subs span_num_blocks, span_num_blocks, #1;                                   \
2296   bne 4b;                                                                      \
2297                                                                                \
2298  5:                                                                            \
2299   str fb_ptr, [block_ptr_a, #44];                                              \
2300   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2301                                                                                \
2302   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
2303   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2304                                                                                \
2305   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2306   vdup.u8 draw_mask, right_mask;                                               \
2307                                                                                \
2308   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2309   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
2310                                                                                \
2311   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
2312                                                                                \
2313   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2314   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2315   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2316                                                                                \
2317   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64;                           \
2318   vst1.u32 { pixels }, [block_ptr_b, :128], c_64;                              \
2319                                                                                \
2320  1:                                                                            \
2321   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2322   add span_b_offset, span_b_offset, #4;                                        \
2323                                                                                \
2324   add span_edge_data, span_edge_data, #8;                                      \
2325   subs num_spans, num_spans, #1;                                               \
2326                                                                                \
2327   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
2328   bne 0b;                                                                      \
2329                                                                                \
2330   restore_abi_regs();                                                          \
2331   ldmia sp!, { r4 - r11, pc };                                                 \
2332                                                                                \
2333  2:                                                                            \
2334   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
2335   vpush { rg_dx4 };                                                            \
2336                                                                                \
2337   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
2338   bl flush_render_block_buffer;                                                \
2339   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
2340                                                                                \
2341   vpop { rg_dx4 };                                                             \
2342                                                                                \
2343   vmov.u8 d64_1, #1;                                                           \
2344   vmov.u8 d128_4, #4;                                                          \
2345   vmov.u8 d64_128, #128;                                                       \
2346   vmov.u8 d128_0x7, #0x7;                                                      \
2347                                                                                \
2348   vadd.u32 rg_dx8, rg_dx4, rg_dx4;                                             \
2349                                                                                \
2350   mov num_blocks, span_num_blocks;                                             \
2351   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2352   bal 3b                                                                       \
2353
2354 /* One expansion per dithering variant; the argument is pasted into the dither_a_/dither_b_ helper macro names. */
2355 setup_blocks_shaded_untextured_indirect_builder(undithered)
2356 setup_blocks_shaded_untextured_indirect_builder(dithered)
2357
2358 /* Register aliases for the *_unswizzled_direct builder defined below. */
2359 #undef draw_mask
2360 /* r14 is lr; the builder below pushes it before using this alias. */
2361 #define mask_msb_ptr                                      r14
2362 /* d16/d17 are the two 64-bit halves of q8. */
2363 #define draw_mask                                         q0
2364 #define pixels_low                                        d16
2365 #define pixels_high                                       d17
2366 /* Emits setup_blocks_shaded_untextured_<dithering>_unswizzled_direct, which walks
2367  * the spans, computes 16-bit pixels from the stepped r/g/b values and stores them
2368  * directly at fb_ptr; "dithering" selects the dither_a/dither_b helper macros. */
2369 #define setup_blocks_shaded_untextured_direct_builder(dithering)               \
2370 .align 3;                                                                      \
2371                                                                                \
2372 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
2373   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
2374   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2375                                                                                \
2376   vld1.u32 { rg_dx }, [rg_dx_ptr, :64];                                        \
2377   /* Nothing to draw: return before anything is pushed.                      */\
2378   cmp num_spans, #0;                                                           \
2379   bxeq lr;                                                                     \
2380   /* lr (r14) is pushed and popped into pc on exit; r14 is reused below.     */\
2381   stmdb sp!, { r4 - r11, r14 };                                                \
2382   save_abi_regs();                                                             \
2383   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2384   /* rg_dx4 = rg_dx * 4 (above); rg_dx8 = rg_dx * 8 (below).                 */\
2385   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
2386   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2387                                                                                \
2388   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2389   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2390   /* Byte constants 1, 4, 128 and 0x7, splatted for use by the code below.   */\
2391   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2392   vmov.u8 d64_1, #1;                                                           \
2393                                                                                \
2394   vmov.u8 d128_4, #4;                                                          \
2395   vmov.u8 d64_128, #128;                                                       \
2396   /* Load the 16-bit value at mask_msb_ptr into all msb_mask_low/high lanes. */\
2397   vmov.u8 d128_0x7, #0x7;                                                      \
2398   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
2399   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
2400   /* Per-span loop; num_spans counts down to zero.                           */\
2401  0:                                                                            \
2402   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
2403   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2404                                                                                \
2405   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
2406   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
2407   /* Spans with no blocks are skipped.                                       */\
2408   cmp span_num_blocks, #0;                                                     \
2409   beq 1f;                                                                      \
2410   /* fb_ptr = vram_out_ptr + (y << 11) + (left_x << 1).                      */\
2411   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
2412   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2413                                                                                \
2414   ldr b, [span_b_offset];                                                      \
2415   vdup.u32 v_left_x, left_x;                                                   \
2416   and y, y, #0x3;                                                              \
2417   /* The dither row is the table word selected by y & 3.                     */\
2418   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
2419   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2420   /* b = b + b_dx * left_x.                                                  */\
2421   mla b, b_dx, left_x, b;                                                      \
2422   and dither_shift, left_x, #0x03;                                             \
2423   /* rg_dx = rg_dx4 >> 2 (logical shift); rg += rg_dx * left_x below.        */\
2424   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
2425   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2426                                                                                \
2427   mov dither_shift, dither_shift, lsl #3;                                      \
2428   vmla.u32 rg, rg_dx, v_left_x;                                                \
2429   /* span_num_blocks -= 1; the resulting flags feed the beq 3f below.        */\
2430   subs span_num_blocks, span_num_blocks, #1;                                   \
2431   /* Rotate the dither row right by 8 * (left_x & 3) bits.                   */\
2432   mov dither_row, dither_row, ror dither_shift;                                \
2433   mov b_dx4, b_dx, lsl #2;                                                     \
2434                                                                                \
2435   vdup.u32 dither_offsets, dither_row;                                         \
2436   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2437                                                                                \
2438   vdup.u32 b_block, b;                                                         \
2439   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2440                                                                                \
2441   mov b_dx8, b_dx, lsl #3;                                                     \
2442   vdup.u32 r_block, rg[0];                                                     \
2443   vdup.u32 g_block, rg[1];                                                     \
2444   /* Add three consecutive 128-bit block_span vectors to r, g and b.         */\
2445   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2446                                                                                \
2447   vadd.u32 r_block, r_block, block_span;                                       \
2448   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2449                                                                                \
2450   vadd.u32 g_block, g_block, block_span;                                       \
2451   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
2452   /* NOTE(review): block_ptr_a (used below) is never set in this macro.      */\
2453   vadd.u32 b_block, b_block, block_span;                                       \
2454   add block_ptr_b, block_ptr_a, #16;                                           \
2455   /* Low halves = block >> 16; high halves = (block + dx4) >> 16.            */\
2456   vshrn.u32 r_whole_low, r_block, #16;                                         \
2457   vshrn.u32 g_whole_low, g_block, #16;                                         \
2458   vshrn.u32 b_whole_low, b_block, #16;                                         \
2459   vdup.u32 dx4, rg_dx4[0];                                                     \
2460                                                                                \
2461   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2462   vdup.u32 dx4, rg_dx4[1];                                                     \
2463                                                                                \
2464   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2465   vdup.u32 dx4, b_dx4;                                                         \
2466                                                                                \
2467   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2468   vdup.u32 dx8, rg_dx8[0];                                                     \
2469   /* Step r_block, g_block and b_block by their dx8 deltas.                  */\
2470   vadd.u32 r_block, r_block, dx8;                                              \
2471   vdup.u32 dx8, rg_dx8[1];                                                     \
2472                                                                                \
2473   vadd.u32 g_block, g_block, dx8;                                              \
2474   vdup.u32 dx8, b_dx8;                                                         \
2475                                                                                \
2476   vadd.u32 b_block, b_block, dx8;                                              \
2477   /* Narrow each 16-bit lane to its low byte.                                */\
2478   vmovn.u16 r_whole_8, r_whole;                                                \
2479   vmovn.u16 g_whole_8, g_whole;                                                \
2480   vmovn.u16 b_whole_8, b_whole;                                                \
2481   /* A single-block span skips the full-block loop.                          */\
2482   beq 3f;                                                                      \
2483   /* Full-block loop: each pass stores one 16-byte block at fb_ptr.          */\
2484  2:                                                                            \
2485   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2486   vshrn.u32 r_whole_low, r_block, #16;                                         \
2487                                                                                \
2488   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2489   vshrn.u32 g_whole_low, g_block, #16;                                         \
2490                                                                                \
2491   vshrn.u32 b_whole_low, b_block, #16;                                         \
2492   /* r_whole_8 >>= 3; clear the low 3 bits of each gb_whole_8 byte.          */\
2493   vdup.u32 dx4, rg_dx4[0];                                                     \
2494   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2495   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2496                                                                                \
2497   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2498   vdup.u32 dx4, rg_dx4[1];                                                     \
2499   /* pixels = msb_mask + r * d64_1 + g * d64_4 + b * d64_128.                */\
2500   vmov pixels, msb_mask;                                                       \
2501   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2502   vdup.u32 dx4, b_dx4;                                                         \
2503                                                                                \
2504   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2505   vdup.u32 dx8, rg_dx8[0];                                                     \
2506                                                                                \
2507   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2508   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2509   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2510                                                                                \
2511   vadd.u32 r_block, r_block, dx8;                                              \
2512   vdup.u32 dx8, rg_dx8[1];                                                     \
2513                                                                                \
2514   vadd.u32 g_block, g_block, dx8;                                              \
2515   vdup.u32 dx8, b_dx8;                                                         \
2516                                                                                \
2517   vadd.u32 b_block, b_block, dx8;                                              \
2518                                                                                \
2519   vmovn.u16 r_whole_8, r_whole;                                                \
2520   vmovn.u16 g_whole_8, g_whole;                                                \
2521   vmovn.u16 b_whole_8, b_whole;                                                \
2522                                                                                \
2523   vst1.u32 { pixels }, [fb_ptr]!;                                              \
2524   subs span_num_blocks, span_num_blocks, #1;                                   \
2525   bne 2b;                                                                      \
2526   /* Last block of the span: stored partially according to right_mask.       */\
2527  3:                                                                            \
2528   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2529                                                                                \
2530   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
2531   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2532   /* rbit + clz: right_mask = count of its trailing zero bits.               */\
2533   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2534   rbit right_mask, right_mask;                                                 \
2535   vmov pixels, msb_mask;                                                       \
2536   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2537   clz right_mask, right_mask;                                                  \
2538                                                                                \
2539   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2540   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2541   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2542   /* Dispatch on right_mask via label table (JT_OP/JTE defined elsewhere).   */\
2543   JT_OP_REL(100f, right_mask, temp);                                           \
2544   JT_OP(ldr pc, [pc, right_mask, lsl #2]);                                     \
2545   nop;                                                                         \
2546  100:                                                                          \
2547   nop;                                                                         \
2548   .word JTE(100b, 4f);                                                         \
2549   .word JTE(100b, 5f);                                                         \
2550   .word JTE(100b, 6f);                                                         \
2551   .word JTE(100b, 7f);                                                         \
2552   .word JTE(100b, 8f);                                                         \
2553   .word JTE(100b, 9f);                                                         \
2554   .word JTE(100b, 10f);                                                        \
2555   .word JTE(100b, 11f);                                                        \
2556   /* Labels 4 - 11 store the first 1 - 8 16-bit lanes, respectively.         */\
2557  4:                                                                            \
2558   vst1.u16 { pixels_low[0] }, [fb_ptr];                                        \
2559   bal 1f;                                                                      \
2560                                                                                \
2561  5:                                                                            \
2562   vst1.u32 { pixels_low[0] }, [fb_ptr];                                        \
2563   bal 1f;                                                                      \
2564                                                                                \
2565  6:                                                                            \
2566   vst1.u32 { pixels_low[0] }, [fb_ptr]!;                                       \
2567   vst1.u16 { pixels_low[2] }, [fb_ptr];                                        \
2568   bal 1f;                                                                      \
2569                                                                                \
2570  7:                                                                            \
2571   vst1.u32 { pixels_low }, [fb_ptr];                                           \
2572   bal 1f;                                                                      \
2573                                                                                \
2574  8:                                                                            \
2575   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2576   vst1.u16 { pixels_high[0] }, [fb_ptr];                                       \
2577   bal 1f;                                                                      \
2578                                                                                \
2579  9:                                                                            \
2580   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2581   vst1.u32 { pixels_high[0] }, [fb_ptr]!;                                      \
2582   bal 1f;                                                                      \
2583                                                                                \
2584  10:                                                                           \
2585   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2586   vst1.u32 { pixels_high[0] }, [fb_ptr]!;                                      \
2587   vst1.u16 { pixels_high[2] }, [fb_ptr];                                       \
2588   bal 1f;                                                                      \
2589                                                                                \
2590  11:                                                                           \
2591   vst1.u32 { pixels }, [fb_ptr];                                               \
2592   bal 1f;                                                                      \
2593   /* Next span: advance the uvrg, b and edge data pointers.                  */\
2594  1:                                                                            \
2595   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2596   add span_b_offset, span_b_offset, #4;                                        \
2597                                                                                \
2598   add span_edge_data, span_edge_data, #8;                                      \
2599   subs num_spans, num_spans, #1;                                               \
2600                                                                                \
2601   bne 0b;                                                                      \
2602   /* Restore the saved registers; the saved lr is popped into pc.            */\
2603   restore_abi_regs();                                                          \
2604   ldmia sp!, { r4 - r11, pc }                                                  \
2605
2606 setup_blocks_shaded_untextured_direct_builder(undithered) /* emits setup_blocks_shaded_untextured_undithered_unswizzled_direct */
2607 setup_blocks_shaded_untextured_direct_builder(dithered) /* emits setup_blocks_shaded_untextured_dithered_unswizzled_direct */
2608
2609 /* Register aliases used by texture_blocks_4bpp below. */
2610 #undef psx_gpu
2611 #undef num_blocks
2612 #undef triangle
2613 #undef c_64
2614 /* ARM core registers; uv_1/3/5/7 reuse the registers of uv_01/23/45/67. */
2615 #define psx_gpu                                  r0
2616 #define block_ptr                                r1
2617 #define num_blocks                               r2
2618 #define uv_01                                    r3
2619 #define uv_23                                    r4
2620 #define uv_45                                    r5
2621 #define uv_67                                    r6
2622 #define uv_0                                     r7
2623 #define uv_1                                     r3
2624 #define uv_2                                     r8
2625 #define uv_3                                     r4
2626 #define uv_4                                     r9
2627 #define uv_5                                     r5
2628 #define uv_6                                     r10
2629 #define uv_7                                     r6
2630 #define texture_ptr                              r11
2631 /* Each pixel_N shares its register with uv_N. */
2632 #define pixel_0                                  r7
2633 #define pixel_1                                  r3
2634 #define pixel_2                                  r8
2635 #define pixel_3                                  r4
2636 #define pixel_4                                  r9
2637 #define pixel_5                                  r5
2638 #define pixel_6                                  r10
2639 #define pixel_7                                  r6
2640 /* pixels_a, _b, _c and _d alias pixel_0, pixel_4, pixel_2 and pixel_6. */
2641 #define pixels_a                                 r7
2642 #define pixels_b                                 r9
2643 #define pixels_c                                 r8
2644 #define pixels_d                                 r10
2645 /* c_64 shares r0 with psx_gpu. */
2646 #define c_64                                     r0
2647 /* The two masks below share r5/r6 with uv_45/uv_67. */
2648 #define clut_ptr                                 r12
2649 #define current_texture_mask                     r5
2650 #define dirty_textures_mask                      r6
2651 /* NEON registers. */
2652 #define texels                                   d0
2653 /* d2/d3 are the halves of q1 (clut_a); d4/d5 are the halves of q2 (clut_b). */
2654 #define clut_low_a                               d2
2655 #define clut_low_b                               d3
2656 #define clut_high_a                              d4
2657 #define clut_high_b                              d5
2658
2659 #define clut_a                                   q1
2660 #define clut_b                                   q2
2661
2662 #define texels_low                               d6
2663 #define texels_high                              d7
2664
2665 .align 3
2666
/*
 * texture_blocks_untextured: does nothing and returns immediately.
 * No register or memory is touched.
 */
2667 function(texture_blocks_untextured)
2668   bx lr
2669
2670
2671 .align 3
2672
/*
 * texture_blocks_4bpp
 *
 * In:  r0 = psx_gpu.
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - reads eight 16-bit values from the first 16 bytes of the block and
 *     adds each one, zero-extended, to texture_ptr;
 *   - loads one byte from each of the eight resulting addresses;
 *   - uses those bytes as indices into a 16-entry table built from the
 *     32 bytes at clut_ptr (split into even and odd bytes);
 *   - writes the eight looked-up byte pairs back over the first 16 bytes
 *     of the block.
 *
 * If dirty_textures_4bpp_mask & current_texture_mask is non-zero,
 * update_texture_4bpp_cache(psx_gpu) is called before the loop.
 *
 * r3-r11 are saved and restored; r0-r2, r12, d0-d7 and the flags are
 * overwritten.  The loop tests the count only after the first block, so a
 * count of zero is not handled here.
 */
2673 function(texture_blocks_4bpp)
2674   stmdb sp!, { r3 - r11, r14 }
2675   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2676
2677   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2678   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2679
2680   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2681   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]
2682
2683   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
/* De-interleave: even-numbered bytes end up in clut_a, odd ones in clut_b. */
2684   vuzp.u8 clut_a, clut_b
2685
2686   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
2687   tst dirty_textures_mask, current_texture_mask
2688
/* Branch is taken before r0 is overwritten, so 1: still sees psx_gpu in r0. */
2689   bne 1f
2690   mov c_64, #64
2691
2692 0:
2693   ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2694
/*
 * uxtah adds the low halfword of the source word to texture_ptr; with
 * "ror #16" it adds the high halfword instead.
 */
2695   uxtah uv_0, texture_ptr, uv_01
2696   uxtah uv_1, texture_ptr, uv_01, ror #16
2697
2698   uxtah uv_2, texture_ptr, uv_23
2699   uxtah uv_3, texture_ptr, uv_23, ror #16
2700
2701   uxtah uv_4, texture_ptr, uv_45
2702   ldrb pixel_0, [uv_0]
2703
2704   uxtah uv_5, texture_ptr, uv_45, ror #16
2705   ldrb pixel_1, [uv_1]
2706
2707   uxtah uv_6, texture_ptr, uv_67
2708   ldrb pixel_2, [uv_2]
2709
2710   uxtah uv_7, texture_ptr, uv_67, ror #16
2711   ldrb pixel_3, [uv_3]
2712
2713   ldrb pixel_4, [uv_4]
/*
 * Sets the flags for the "bne 0b" at the bottom of the loop; nothing in
 * between modifies them.
 */
2714   subs num_blocks, num_blocks, #1
2715
/* Pack bytes 0-3 into pixels_a and bytes 4-7 into pixels_b. */
2716   ldrb pixel_5, [uv_5]
2717   orr pixels_a, pixel_0, pixel_1, lsl #8
2718
2719   ldrb pixel_6, [uv_6]
2720   orr pixels_b, pixel_4, pixel_5, lsl #8
2721
2722   ldrb pixel_7, [uv_7]
2723   orr pixels_a, pixels_a, pixel_2, lsl #16
2724
2725   orr pixels_b, pixels_b, pixel_6, lsl #16
2726   orr pixels_a, pixels_a, pixel_3, lsl #24
2727
2728   orr pixels_b, pixels_b, pixel_7, lsl #24
2729   vmov texels, pixels_a, pixels_b
2730
/* Two 16-byte table lookups: one in clut_a (q1), one in clut_b (q2). */
2731   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
2732   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
2733
/* Re-interleave the two byte streams, store 16 bytes, advance by 64. */
2734   vst2.u8 { texels_low, texels_high }, [block_ptr, :128], c_64
2735   bne 0b
2736
2737   ldmia sp!, { r3 - r11, pc }
2738
/*
 * Slow path: refresh the cache, then enter the loop.  Only r1/r2 are
 * preserved around the call.
 * NOTE(review): clut_a/clut_b (q1/q2) were loaded before this call and are
 * used afterwards; this relies on update_texture_4bpp_cache leaving q1/q2
 * and r11 intact - confirm against its definition.
 */
2739 1:
2740   stmdb sp!, { r1 - r2 }  
2741   bl update_texture_4bpp_cache
2742
2743   mov c_64, #64
2744   ldmia sp!, { r1 - r2 }
2745   bal 0b
2746
2747
2748 .align 3
2749
/*
 * texture_blocks_8bpp
 *
 * In:  r0 = psx_gpu.
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - reads eight 16-bit values from the first 16 bytes of the block and
 *     adds each one, zero-extended, to texture_ptr;
 *   - loads one byte from each of the eight resulting addresses;
 *   - doubles each byte and uses it as a byte offset from clut_ptr to load
 *     a halfword;
 *   - writes the eight halfwords back over the first 16 bytes of the block.
 *
 * If dirty_textures_8bpp_mask & current_texture_mask is non-zero,
 * update_texture_8bpp_cache(psx_gpu) is called before the loop.
 *
 * r3-r11 are saved and restored.  The loop tests the count only after the
 * first block, so a count of zero is not handled here.
 */
2750 function(texture_blocks_8bpp)
2751   stmdb sp!, { r3 - r11, r14 }
2752   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2753
2754   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2755   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2756
2757   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2758   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
2759
2760   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset]
2761   tst dirty_textures_mask, current_texture_mask
2762
2763   bne 1f
/* NOTE(review): presumably padding ahead of the loop label - confirm. */
2764   nop
2765
2766 0:
2767   ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2768
/*
 * uxtah adds the low halfword of the source word to texture_ptr; with
 * "ror #16" it adds the high halfword instead.
 */
2769   uxtah uv_0, texture_ptr, uv_01
2770   uxtah uv_1, texture_ptr, uv_01, ror #16
2771
2772   uxtah uv_2, texture_ptr, uv_23
2773   uxtah uv_3, texture_ptr, uv_23, ror #16
2774
2775   uxtah uv_4, texture_ptr, uv_45
2776   ldrb pixel_0, [uv_0]
2777
2778   uxtah uv_5, texture_ptr, uv_45, ror #16
2779   ldrb pixel_1, [uv_1]
2780
2781   uxtah uv_6, texture_ptr, uv_67
2782   ldrb pixel_2, [uv_2]
2783
2784   uxtah uv_7, texture_ptr, uv_67, ror #16
2785   ldrb pixel_3, [uv_3]
2786
/* index * 2: turn each fetched byte into an offset to a 16-bit entry. */
2787   ldrb pixel_4, [uv_4]
2788   add pixel_0, pixel_0, pixel_0
2789
2790   ldrb pixel_5, [uv_5]
2791   add pixel_1, pixel_1, pixel_1
2792
2793   ldrb pixel_6, [uv_6]
2794   add pixel_2, pixel_2, pixel_2
2795
2796   ldrb pixel_7, [uv_7]
2797   add pixel_3, pixel_3, pixel_3
2798
/* Replace each offset with the halfword found at clut_ptr + offset. */
2799   ldrh pixel_0, [clut_ptr, pixel_0]
2800   add pixel_4, pixel_4, pixel_4
2801
2802   ldrh pixel_1, [clut_ptr, pixel_1]
2803   add pixel_5, pixel_5, pixel_5
2804
2805   ldrh pixel_2, [clut_ptr, pixel_2]
2806   add pixel_6, pixel_6, pixel_6
2807
2808   ldrh pixel_3, [clut_ptr, pixel_3]
2809   add pixel_7, pixel_7, pixel_7
2810
2811   ldrh pixel_4, [clut_ptr, pixel_4]
2812   orr pixels_a, pixel_0, pixel_1, lsl #16
2813
2814   ldrh pixel_5, [clut_ptr, pixel_5]
2815   orr pixels_c, pixel_2, pixel_3, lsl #16
2816
2817   ldrh pixel_6, [clut_ptr, pixel_6]
/*
 * Sets the flags for the "bne 0b" below; nothing in between modifies them.
 */
2818   subs num_blocks, num_blocks, #1
2819
2820   ldrh pixel_7, [clut_ptr, pixel_7]
2821   orr pixels_b, pixel_4, pixel_5, lsl #16
2822
2823   orr pixels_d, pixel_6, pixel_7, lsl #16
/*
 * Listed as a, c, b, d because pixels_c is r8 and pixels_b is r9; in
 * memory the words hold pixels 0-1, 2-3, 4-5, 6-7 in that order.
 */
2824   stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d } 
2825
2826   add block_ptr, block_ptr, #64
2827   bne 0b
2828
2829   ldmia sp!, { r3 - r11, pc }
2830
/*
 * Slow path: refresh the cache, then enter the loop.  r1, r2 and r12
 * (clut_ptr) are preserved around the call, plus whatever
 * EXTRA_UNSAVED_REGS expands to (defined outside this section).
 */
2831 1:
2832   stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
2833
2834   bl update_texture_8bpp_cache
2835
2836   ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
2837   bal 0b
2838
2839
/*
 * Drop the 4bpp/8bpp aliases that are rebound with different registers for
 * texture_blocks_16bpp.
 */
2840 #undef uv_0
2841 #undef uv_1
2842 #undef uv_2
2843 #undef uv_3
2844 #undef uv_4
2845 #undef uv_5
2846 #undef uv_6
2847 #undef uv_7
2848
2849 #undef pixel_0
2850 #undef pixel_1
2851 #undef pixel_2
2852 #undef pixel_3
2853 #undef pixel_4
2854 #undef pixel_5
2855 #undef pixel_6
2856 #undef pixel_7
2857
2858 #undef texture_ptr
2859
2860 #undef pixels_a
2861 #undef pixels_b
2862 #undef pixels_c
2863 #undef pixels_d
2864
/* Register aliases for texture_blocks_16bpp. */
2865 #define psx_gpu                                  r0
2866 #define block_ptr                                r1
2867 #define num_blocks                               r2
2868
/*
 * Pairs of texels are processed in a sliding window: uv_N and u_N share a
 * register, and v_N uses the register that the next pair's uv will be
 * loaded into once v_N has been consumed.
 */
2869 #define uv_0                                     r3
2870 #define uv_1                                     r4
2871 #define u_0                                      r3
2872 #define u_1                                      r4
2873 #define v_0                                      r5
2874 #define v_1                                      r6
2875
2876 #define uv_2                                     r5
2877 #define uv_3                                     r6
2878 #define u_2                                      r5
2879 #define u_3                                      r6
2880 #define v_2                                      r7
2881 #define v_3                                      r8
2882
2883 #define uv_4                                     r7
2884 #define uv_5                                     r8
2885 #define u_4                                      r7
2886 #define u_5                                      r8
2887 #define v_4                                      r9
2888 #define v_5                                      r10
2889
/* v_7 is r0, i.e. it overwrites psx_gpu. */
2890 #define uv_6                                     r9
2891 #define uv_7                                     r10
2892 #define u_6                                      r9
2893 #define u_7                                      r10
2894 #define v_6                                      r11
2895 #define v_7                                      r0
2896
2897 #define pixel_0                                  r3
2898 #define pixel_1                                  r4
2899 #define pixel_2                                  r5
2900 #define pixel_3                                  r6
2901 #define pixel_4                                  r7
2902 #define pixel_5                                  r8
2903 #define pixel_6                                  r9
2904 #define pixel_7                                  r10
2905
/* Packed output words, in ascending register order. */
2906 #define pixels_a                                 r3
2907 #define pixels_b                                 r5
2908 #define pixels_c                                 r7
2909 #define pixels_d                                 r9
2910
2911 #define texture_ptr                              r12
2912
2913
2914 .align 3
2915
/*
 * texture_blocks_16bpp
 *
 * In:  r0 = psx_gpu.
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - reads eight 16-bit UV values from the first 16 bytes of the block;
 *   - turns each into a byte offset
 *       ((uv & 0xFF) + ((uv & 0xFF00) << 2)) * 2
 *     which equals 2 * (u + v * 1024) with u = low byte, v = high byte;
 *   - loads the halfword at texture_ptr + offset;
 *   - writes the eight halfwords back over the first 16 bytes of the block.
 *
 * r3-r11 are saved and restored; r0 is overwritten inside the loop (v_7).
 * The loop tests the count only after the first block, so a count of zero
 * is not handled here.
 */
2916 function(texture_blocks_16bpp)
2917   stmdb sp!, { r3 - r11, r14 }
2918   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2919
2920   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2921   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2922
2923 0:
2924   ldrh uv_0, [block_ptr]
/*
 * Sets the flags for the "bne 0b" at the bottom of the loop; every
 * instruction in between is a non-flag-setting form.
 */
2925   subs num_blocks, num_blocks, #1
2926
2927   ldrh uv_1, [block_ptr, #2]
2928
/* Texels 0/1: split into v (still in bits 8-15) and u, then combine. */
2929   and v_0, uv_0, #0xFF00
2930   and v_1, uv_1, #0xFF00
2931
2932   and u_0, uv_0, #0xFF
2933   and u_1, uv_1, #0xFF
2934
2935   add uv_0, u_0, v_0, lsl #2
2936   ldrh uv_2, [block_ptr, #4]
2937
2938   add uv_1, u_1, v_1, lsl #2
2939   ldrh uv_3, [block_ptr, #6]
2940
/* Double the texel index to get a byte offset. */
2941   add uv_0, uv_0, uv_0
2942   add uv_1, uv_1, uv_1
2943
/* Texels 2/3. */
2944   and v_2, uv_2, #0xFF00
2945   and v_3, uv_3, #0xFF00
2946
2947   and u_2, uv_2, #0xFF
2948   and u_3, uv_3, #0xFF
2949
2950   add uv_2, u_2, v_2, lsl #2
2951   ldrh uv_4, [block_ptr, #8]
2952
2953   add uv_3, u_3, v_3, lsl #2
2954   ldrh uv_5, [block_ptr, #10]
2955
2956   add uv_2, uv_2, uv_2
2957   add uv_3, uv_3, uv_3
2958
/* Texels 4/5. */
2959   and v_4, uv_4, #0xFF00
2960   and v_5, uv_5, #0xFF00
2961
2962   and u_4, uv_4, #0xFF
2963   and u_5, uv_5, #0xFF
2964
2965   add uv_4, u_4, v_4, lsl #2
2966   ldrh uv_6, [block_ptr, #12]
2967
2968   add uv_5, u_5, v_5, lsl #2
2969   ldrh uv_7, [block_ptr, #14]
2970
/* Texture fetches for texels 0-5 are interleaved with the math for 6/7. */
2971   add uv_4, uv_4, uv_4
2972   ldrh pixel_0, [texture_ptr, uv_0]
2973
2974   add uv_5, uv_5, uv_5
2975   ldrh pixel_1, [texture_ptr, uv_1]
2976
2977   and v_6, uv_6, #0xFF00
2978   ldrh pixel_2, [texture_ptr, uv_2]
2979
2980   and v_7, uv_7, #0xFF00
2981   ldrh pixel_3, [texture_ptr, uv_3]
2982
2983   and u_6, uv_6, #0xFF
2984   ldrh pixel_4, [texture_ptr, uv_4]
2985
2986   and u_7, uv_7, #0xFF
2987   ldrh pixel_5, [texture_ptr, uv_5]
2988
2989   add uv_6, u_6, v_6, lsl #2
2990   add uv_7, u_7, v_7, lsl #2
2991
2992   add uv_6, uv_6, uv_6
2993   add uv_7, uv_7, uv_7
2994
/* Pack two halfwords per output word. */
2995   orr pixels_a, pixel_0, pixel_1, lsl #16
2996   orr pixels_b, pixel_2, pixel_3, lsl #16
2997
2998   ldrh pixel_6, [texture_ptr, uv_6]
2999   orr pixels_c, pixel_4, pixel_5, lsl #16
3000
3001   ldrh pixel_7, [texture_ptr, uv_7]
3002   orr pixels_d, pixel_6, pixel_7, lsl #16
3003
3004   stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
3005   add block_ptr, block_ptr, #64
3006
3007   bne 0b
3008
3009   ldmia sp!, { r3 - r11, pc }
3010
3011
/*
 * Drop aliases that are rebound below for the
 * shade_blocks_*_textured_modulated_* routines.
 */
3012 #undef num_blocks
3013
3014 #undef test_mask
3015 #undef texels
3016 #undef pixels_b
3017 #undef pixels
3018 #undef d64_1
3019 #undef d64_4
3020 #undef d64_128
3021 #undef draw_mask
3022 #undef msb_mask
3023 #undef msb_mask_low
3024 #undef msb_mask_high
3025 #undef fb_pixels
3026
3027 #undef c_32
3028 #undef fb_ptr
3029 #undef mask_msb_ptr
3030
/*
 * Core register aliases.  r2 is shared by color_ptr, colors_scalar,
 * mask_msb_ptr and c_32; r3 by colors_scalar_compare and block_ptr_store;
 * r0 by psx_gpu and block_ptr_load_a.  The builder uses them at different
 * times.
 */
3031 #define psx_gpu                                  r0
3032 #define num_blocks                               r1
3033 #define color_ptr                                r2
3034 #define colors_scalar                            r2
3035 #define colors_scalar_compare                    r3
3036 #define mask_msb_ptr                             r2
3037
3038 #define block_ptr_load_a                         r0
3039 #define block_ptr_store                          r3
3040 #define block_ptr_load_b                         r12
3041 #define c_32                                     r2
3042
3043 #define c_48                                     r4
3044 #define fb_ptr                                   r14
3045 #define draw_mask_bits_scalar                    r5
3046
/*
 * NEON quad registers.  texels and zero_mask are both q4; pixels_r and
 * fb_pixels are both q8; pixels_g and pixels_gb_low are both q9.
 * q4-q7 (d8-d15) are callee-saved under the AAPCS, and save_abi_regs()
 * expands to nothing in this file's current configuration.
 */
3047 #define d128_0x07                                q0
3048 #define d128_0x1F                                q1
3049 #define d128_0x8000                              q2
3050 #define test_mask                                q3
3051 #define texels                                   q4
3052 #define colors_rg                                q5
3053 #define colors_b_dm_bits                         q6
3054 #define texels_rg                                q7
3055 #define pixels_r                                 q8
3056 #define pixels_g                                 q9
3057 #define pixels_b                                 q10
3058 #define pixels                                   q11
3059 #define zero_mask                                q4
3060 #define draw_mask                                q12
3061 #define msb_mask                                 q13
3062
3063 #define fb_pixels                                q8
3064
3065 #define pixels_gb_low                            q9
3066
/*
 * Double-register views: d10/d11 are the halves of q5, d12/d13 of q6,
 * d14/d15 of q7, d16 the low half of q8, d18/d19 the halves of q9 and
 * d26/d27 the halves of q13.
 */
3067 #define colors_r                                 d10
3068 #define colors_g                                 d11
3069 #define colors_b                                 d12
3070 #define draw_mask_bits                           d13
3071 #define texels_r                                 d14
3072 #define texels_g                                 d15
3073 #define pixels_r_low                             d16
3074 #define pixels_g_low                             d18
3075 #define pixels_b_low                             d19
3076 #define msb_mask_low                             d26
3077 #define msb_mask_high                            d27
3078
3079 #define d64_1                                    d28
3080 #define d64_4                                    d29
3081 #define d64_128                                  d30
3082 #define texels_b                                 d31
3083
/*
 * Target-specific prologue, indirect: c_48 = 48 and block_ptr_store points
 * at the first block.  Must run while r0 still holds psx_gpu.
 */
3084 #define shade_blocks_textured_modulated_prologue_indirect()                    \
3085   mov c_48, #48;                                                               \
3086   add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset                         \
3087
/*
 * Target-specific prologue, direct: load the 16-bit value at
 * psx_gpu + psx_gpu_mask_msb_offset and replicate it into every lane of
 * msb_mask.
 */
3088 #define shade_blocks_textured_modulated_prologue_direct()                      \
3089   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3090   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]            \
3091
3092
/* Shading-specific prologue, shaded: expands to nothing. */
3093 #define shade_blocks_textured_modulated_prologue_shaded(dithering, target)     \
3094   
/*
 * Undithered only: if the word at psx_gpu_triangle_color_offset equals
 * 0x00808080, branch to shade_blocks_textured_unmodulated_<target> instead
 * of continuing.  Clobbers r2, r3 and the flags.
 */
3095 #define shade_blocks_textured_false_modulation_check_undithered(target)        \
3096   ldr colors_scalar, [psx_gpu, #psx_gpu_triangle_color_offset];                \
3097   movw colors_scalar_compare, #0x8080;                                         \
3098                                                                                \
3099   movt colors_scalar_compare, #0x80;                                           \
3100   cmp colors_scalar, colors_scalar_compare;                                    \
3101   beq shade_blocks_textured_unmodulated_##target                               \
3102
/* Dithered: no such check; expands to nothing. */
3103 #define shade_blocks_textured_false_modulation_check_dithered(target)          \
3104
/*
 * Shading-specific prologue, unshaded: run the check for the given
 * dithering mode, then replicate bytes 0, 1 and 2 of the triangle colour
 * word across colors_r, colors_g and colors_b respectively.  colors_r is
 * overwritten last because the other two are derived from it.
 * NOTE(review): the builder expands this before save_abi_regs(), so
 * d10-d12 are written before they could be saved if that macro were ever
 * enabled.
 */
3105 #define shade_blocks_textured_modulated_prologue_unshaded(dithering, target)   \
3106   shade_blocks_textured_false_modulation_check_##dithering(target);            \
3107   add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset;                      \
3108   vld1.u32 { colors_r[] }, [color_ptr, :32];                                   \
3109   vdup.u8 colors_g, colors_r[1];                                               \
3110   vdup.u8 colors_b, colors_r[2];                                               \
3111   vdup.u8 colors_r, colors_r[0]                                                \
3112
3113
/*
 * Dithering-specific helpers.
 *
 * Dithered: each channel accumulator is preloaded with the 16 bytes at
 * block_ptr_load_b (the same data for all three channels); the last load
 * also advances block_ptr_load_b by c_32.  The modulate step then uses
 * vmlal so that texel * colour is added onto the preloaded value.
 *
 * Undithered: nothing is loaded; the pointer is just advanced by 32 and
 * the modulate step uses vmull, which overwrites the accumulator.
 */
3114 #define shade_blocks_textured_modulated_load_dithered(target)                  \
3115   vld1.u32 { target }, [block_ptr_load_b, :128]                                \
3116
3117 #define shade_blocks_textured_modulated_load_last_dithered(target)             \
3118   vld1.u32 { target }, [block_ptr_load_b, :128], c_32                          \
3119
3120 #define shade_blocks_textured_modulated_load_undithered(target)                \
3121
3122 #define shade_blocks_textured_modulated_load_last_undithered(target)           \
3123   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3124
3125 #define shade_blocks_textured_modulate_dithered(channel)                       \
3126   vmlal.u8 pixels_##channel, texels_##channel, colors_##channel                \
3127
3128 #define shade_blocks_textured_modulate_undithered(channel)                     \
3129   vmull.u8 pixels_##channel, texels_##channel, colors_##channel                \
3130
3131
/*
 * Target-specific store helpers.
 *
 * Indirect: draw_mask is written at block_ptr_store (pointer advanced by
 * 16 through writeback), then pixels are written and the pointer advanced
 * by c_48, i.e. to the next 64-byte block.  The offset argument is unused.
 *
 * Direct: fb_ptr is read from block_ptr_load_b + offset - 64, the 16 bytes
 * it points at are loaded, and vbit copies them into pixels wherever
 * draw_mask bits are set; the pixel store then writes the merged result
 * back to fb_ptr.
 */
3132 #define shade_blocks_textured_modulated_store_draw_mask_indirect(offset)       \
3133   vst1.u32 { draw_mask }, [block_ptr_store, :128]!                             \
3134
3135 #define shade_blocks_textured_modulated_store_draw_mask_direct(offset)         \
3136   ldr fb_ptr, [block_ptr_load_b, #(offset - 64)];                              \
3137   vld1.u32 { fb_pixels }, [fb_ptr];                                            \
3138   vbit.u16 pixels, fb_pixels, draw_mask                                        \
3139
3140 #define shade_blocks_textured_modulated_store_pixels_indirect()                \
3141   vst1.u32 { pixels }, [block_ptr_store, :128], c_48                           \
3142
3143 #define shade_blocks_textured_modulated_store_pixels_direct()                  \
3144   vst1.u32 { pixels }, [fb_ptr]                                                \
3145
3146
/*
 * Shading-specific per-block loads.
 *
 * Shaded: colors_r/colors_g come from block_ptr_load_b and
 * colors_b/draw_mask_bits from block_ptr_load_a; both pointers advance by
 * c_32.  draw_mask is lane 0 of draw_mask_bits replicated as 16-bit lanes.
 *
 * Unshaded: colours were set up once in the prologue, so only the pointers
 * advance (by 32).  The draw mask bits are read as a word from
 * block_ptr_load_a + 8 into a core register and replicated from there.
 */
3147 #define shade_blocks_textured_modulated_load_rg_shaded()                       \
3148   vld1.u32 { colors_r, colors_g }, [block_ptr_load_b, :128], c_32              \
3149
3150 #define shade_blocks_textured_modulated_load_rg_unshaded()                     \
3151   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3152
3153 #define shade_blocks_textured_modulated_load_bdm_shaded()                      \
3154   vld1.u32 { colors_b, draw_mask_bits }, [block_ptr_load_a, :128], c_32        \
3155
3156 #define shade_blocks_textured_modulated_load_bdm_unshaded()                    \
3157   ldr draw_mask_bits_scalar, [block_ptr_load_a, #8];                           \
3158   add block_ptr_load_a, block_ptr_load_a, #32                                  \
3159
3160 #define shade_blocks_textured_modulated_expand_draw_mask_shaded()              \
3161   vdup.u16 draw_mask, draw_mask_bits[0]                                        \
3162
3163 #define shade_blocks_textured_modulated_expand_draw_mask_unshaded()            \
3164   vdup.u16 draw_mask, draw_mask_bits_scalar                                    \
3165
3166
/*
 * Target-specific msb mask step: nothing for indirect; for direct, OR
 * msb_mask into pixels.
 */
3167 #define shade_blocks_textured_modulated_apply_msb_mask_indirect()              \
3168
3169 #define shade_blocks_textured_modulated_apply_msb_mask_direct()                \
3170   vorr.u16 pixels, pixels, msb_mask                                            \
3171
3172
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Emits function
 *   shade_blocks_<shading>_textured_modulated_<dithering>_<target>
 * taking psx_gpu in r0.
 *
 *   shading:   shaded | unshaded   (per-block colours vs. one colour)
 *   dithering: dithered | undithered
 *   target:    direct | indirect   (merge into the address stored in the
 *                                   block vs. write back into the block)
 *
 * Per block (blocks are 64 bytes apart), for eight 16-bit texels:
 *   - r = texel & 0x1F, g = (texel >> 5) & 0x1F, b = (texel >> 10) & 0x1F;
 *   - each channel is multiplied by its 8-bit colour (added onto the
 *     preloaded accumulator when dithered), narrowed with an unsigned
 *     saturating shift right by 4, and reduced to its top five bits;
 *   - pixels = (texel & 0x8000) [| msb_mask for direct]
 *              + r + (g << 5) + (b << 10);
 *   - draw_mask = lanes where (draw mask bits & test_mask) != 0, ORed with
 *     lanes where the texel is zero.
 *
 * The code is software-pipelined: the section before label 0 starts the
 * first block, each loop iteration packs and stores the previous block
 * while starting the next one, and label 1 packs and stores the last
 * block.  The "offset" passed to store_draw_mask differs between the loop
 * (-4) and the tail (28) because block_ptr_load_b has advanced by a
 * different amount at the two call sites.
 *
 * The shading prologue runs before the register save; for
 * unshaded/undithered it may branch away to the unmodulated routine, so
 * nothing may be pushed before it.  r4, r5 and lr are saved; r0-r3 and r12
 * are clobbered.  A block count of zero is not handled (the count is
 * decremented before it is first tested).
 */
3173 #define shade_blocks_textured_modulated_builder(shading, dithering, target)    \
3174 .align 3;                                                                      \
3175                                                                                \
3176 function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
3177   shade_blocks_textured_modulated_prologue_##shading(dithering, target);       \
3178   stmdb sp!, { r4 - r5, lr };                                                  \
3179   save_abi_regs();                                                             \
3180   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3181                                                                                \
3182   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
3183                                                                                \
3184   shade_blocks_textured_modulated_prologue_##target();                         \
3185                                                                                \
3186   add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset;                       \
3187   mov c_32, #32;                                                               \
3188                                                                                \
3189   add block_ptr_load_b, block_ptr_load_a, #16;                                 \
3190   vmov.u8 d64_1, #1;                                                           \
3191   vmov.u8 d64_4, #4;                                                           \
3192   vmov.u8 d64_128, #128;                                                       \
3193                                                                                \
3194   vld1.u32 { texels }, [block_ptr_load_a, :128], c_32;                         \
3195   vmov.u8 d128_0x07, #0x07;                                                    \
3196                                                                                \
3197   shade_blocks_textured_modulated_load_rg_##shading();                         \
3198   vmov.u8 d128_0x1F, #0x1F;                                                    \
3199                                                                                \
3200   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3201   vmov.u16 d128_0x8000, #0x8000;                                               \
3202                                                                                \
3203   vmovn.u16 texels_r, texels;                                                  \
3204   vshrn.u16 texels_g, texels, #5;                                              \
3205                                                                                \
3206   vshrn.u16 texels_b, texels, #7;                                              \
3207   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3208                                                                                \
3209   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3210   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3211                                                                                \
3212   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3213   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3214                                                                                \
3215   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3216   vshr.u8 texels_b, texels_b, #3;                                              \
3217                                                                                \
3218   shade_blocks_textured_modulate_##dithering(r);                               \
3219   shade_blocks_textured_modulate_##dithering(g);                               \
3220   shade_blocks_textured_modulate_##dithering(b);                               \
3221                                                                                \
3222   vand.u16 pixels, texels, d128_0x8000;                                        \
3223   vceq.u16 zero_mask, texels, #0;                                              \
3224                                                                                \
3225   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3226   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3227   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3228                                                                                \
3229   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3230   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3231   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3232   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3233                                                                                \
3234   subs num_blocks, num_blocks, #1;                                             \
3235   beq 1f;                                                                      \
3236                                                                                \
3237  .align 3;                                                                     \
3238                                                                                \
3239  0:                                                                            \
3240   vld1.u32 { texels }, [block_ptr_load_a, :128], c_32;                         \
3241   shade_blocks_textured_modulated_load_rg_##shading();                         \
3242   vshrn.u16 texels_g, texels, #5;                                              \
3243                                                                                \
3244   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3245   vshrn.u16 texels_b, texels, #7;                                              \
3246                                                                                \
3247   pld [block_ptr_load_a];                                                      \
3248   vmovn.u16 texels_r, texels;                                                  \
3249   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3250                                                                                \
3251   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3252   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3253   shade_blocks_textured_modulated_store_draw_mask_##target(-4);                \
3254                                                                                \
3255   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3256   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3257                                                                                \
3258   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3259   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3260                                                                                \
3261   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3262   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3263                                                                                \
3264   shade_blocks_textured_modulated_store_pixels_##target();                     \
3265   vshr.u8 texels_b, texels_b, #3;                                              \
3266                                                                                \
3267   shade_blocks_textured_modulate_##dithering(r);                               \
3268   shade_blocks_textured_modulate_##dithering(g);                               \
3269   shade_blocks_textured_modulate_##dithering(b);                               \
3270                                                                                \
3271   vand.u16 pixels, texels, d128_0x8000;                                        \
3272   vceq.u16 zero_mask, texels, #0;                                              \
3273                                                                                \
3274   subs num_blocks, num_blocks, #1;                                             \
3275                                                                                \
3276   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3277   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3278   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3279                                                                                \
3280   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3281   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3282   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3283   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3284                                                                                \
3285   bne 0b;                                                                      \
3286                                                                                \
3287  1:                                                                            \
3288   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3289   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3290   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3291                                                                                \
3292   shade_blocks_textured_modulated_store_draw_mask_##target(28);                \
3293   shade_blocks_textured_modulated_store_pixels_##target();                     \
3294                                                                                \
3295   restore_abi_regs();                                                          \
3296   ldmia sp!, { r4 - r5, pc }                                                   \
3297
3298
/*
 * Instantiate all eight shading x dithering x target combinations:
 * first the four "direct" variants, then the four "indirect" ones.
 */
3299 shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3300 shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3301 shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3302 shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3303
3304 shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3305 shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3306 shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3307 shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3308
3309
/*
 * Drop aliases that are rebound below for the
 * shade_blocks_textured_unmodulated_* routines.
 */
3310 #undef c_64
3311 #undef fb_ptr
3312 #undef color_ptr
3313
3314 #undef color_r
3315 #undef color_g
3316 #undef color_b
3317
3318 #undef test_mask
3319 #undef pixels
3320 #undef draw_mask
3321 #undef zero_mask
3322 #undef fb_pixels
3323 #undef msb_mask
3324 #undef msb_mask_low
3325 #undef msb_mask_high
3326
/*
 * Core register aliases.  Several names share a register and are used at
 * different times: r0 (psx_gpu, block_ptr_load), r2 (mask_msb_ptr, c_64),
 * r3 (color_ptr, draw_mask_store_ptr, fb_ptr), r12 (draw_mask_bits_ptr,
 * draw_mask_ptr) and r14 (pixel_store_ptr, fb_ptr_next).
 */
3327 #define psx_gpu                                  r0
3328 #define num_blocks                               r1
3329 #define mask_msb_ptr                             r2
3330 #define color_ptr                                r3
3331
3332 #define block_ptr_load                           r0
3333 #define draw_mask_store_ptr                      r3
3334 #define draw_mask_bits_ptr                       r12
3335 #define draw_mask_ptr                            r12
3336 #define pixel_store_ptr                          r14
3337
3338 #define fb_ptr_cmp                               r4
3339
3340 #define fb_ptr                                   r3
3341 #define fb_ptr_next                              r14
3342
3343 #define c_64                                     r2
3344
/*
 * NEON registers.  q4-q7 (d8-d15) are callee-saved under the AAPCS, and
 * save_abi_regs() expands to nothing in this file's current configuration.
 */
3345 #define test_mask                                q0
3346 #define pixels                                   q1
3347 #define draw_mask                                q2
3348 #define zero_mask                                q3
3349 #define draw_mask_combined                       q4
3350 #define fb_pixels                                q5
3351 #define fb_pixels_next                           q6
3352 #define msb_mask                                 q7
3353
/* d4/d5 are the halves of q2; d14/d15 are the halves of q7. */
3354 #define draw_mask_low                            d4
3355 #define draw_mask_high                           d5
3356 #define msb_mask_low                             d14
3357 #define msb_mask_high                            d15
3358
3359 .align 3
/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * In:  r0 = psx_gpu.
 *
 * For each of num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - loads the eight 16-bit pixels at block + 0;
 *   - replicates the 16-bit value at block + 40 into every lane and tests
 *     it against test_mask (the 16 bytes at the start of psx_gpu);
 *   - stores the unchanged pixels at block + 16;
 *   - stores (test result | lanes where the pixel is zero) at block + 0.
 *
 * The mask store for a block is deferred until after the next block's
 * pixels have been loaded; label 1 writes the mask for the last block.
 *
 * r4 and lr are saved; r0-r3, r12, q0-q4 and the flags are overwritten.
 * A block count of zero is not handled (the count is decremented before it
 * is first tested).
 */
3360 function(shade_blocks_textured_unmodulated_indirect)
3361   stmdb sp!, { r4, r14 }
3362   save_abi_regs()
3363   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3364
3365   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3366   add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3367
3368   vld1.u32 { test_mask }, [psx_gpu, :128]
3369   add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset
3370
3371   mov c_64, #64
/* block_ptr_load is r0: psx_gpu is no longer available after this line. */
3372   add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3373
/* First block: everything except the combined-mask store. */
3374   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3375   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3376    [draw_mask_bits_ptr, :16], c_64
3377   vceq.u16 zero_mask, pixels, #0
3378
3379   vtst.u16 draw_mask, draw_mask, test_mask
3380   vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64
3381
3382   subs num_blocks, num_blocks, #1
3383   beq 1f
3384
3385  0:
3386   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
/* Combine the previous block's masks before they are overwritten below. */
3387   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3388
3389   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3390    [draw_mask_bits_ptr, :16], c_64
3391   vceq.u16 zero_mask, pixels, #0
3392
3393   vtst.u16 draw_mask, draw_mask, test_mask
3394   vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64
3395
/* Previous block's combined mask goes to that block's offset 0. */
3396   vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
3397   subs num_blocks, num_blocks, #1
3398
3399   bne 0b
3400
/* Tail: write the combined mask for the last block. */
3401  1:
3402   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3403   vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
3404
3405   restore_abi_regs()
3406   ldmia sp!, { r4, pc }
3407
3408
/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In: r0 = psx_gpu.
 *
 * For each of the num_blocks block entries (64-byte stride from
 * psx_gpu + psx_gpu_blocks_offset) the 8 halfwords at entry + 0 are ORed with
 * msb_mask (the halfword at psx_gpu + psx_gpu_mask_msb_offset, broadcast) and
 * merged into the 16 bytes addressed by the pointer stored at entry + 44.
 * A lane keeps its existing destination value when the source halfword is
 * zero, or when the halfword at entry + 40 shares a set bit with the matching
 * lane of the test mask at [psx_gpu].
 *
 * The loop is software-pipelined: the destination of block i + 1 is normally
 * read before block i is written.  When the two destination pointers are
 * within 14 bytes of each other the 16-byte accesses overlap, so label 4
 * stores block i first and only then reads the next destination.
 *
 * Modifies r0-r3, r12, q0-q7; r4 and r14 are saved and restored.
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS and
 * save_abi_regs() currently expands to nothing - confirm callers cope.
 * NOTE(review): assumes num_blocks >= 1 and 2-byte aligned destinations.
 */
.align 3

function(shade_blocks_textured_unmodulated_direct)
  stmdb sp!, { r4, r14 }
  save_abi_regs()
  add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)

  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset

  /* r2 is mask_msb_ptr only for this load; it becomes c_64 right after. */
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
  mov c_64, #64

  vld1.u32 { test_mask }, [psx_gpu, :128]
  add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset

  /* Prime the pipeline with block 0. */
  vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
   [draw_mask_bits_ptr, :16], c_64
  ldr fb_ptr_next, [block_ptr_load, #44]

  vld1.u32 { pixels }, [block_ptr_load, :128], c_64
  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  vceq.u16 zero_mask, pixels, #0
  vtst.u16 draw_mask, draw_mask, test_mask

  subs num_blocks, num_blocks, #1
  beq 1f

  /* Steady state: finish block i while fetching block i + 1. */
 0:
  mov fb_ptr, fb_ptr_next
  ldr fb_ptr_next, [block_ptr_load, #44]

  vorr.u16 pixels, pixels, msb_mask

  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vmov fb_pixels, fb_pixels_next

  vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
   [draw_mask_bits_ptr, :16], c_64
  /* Take the new pixel wherever the combined mask is clear. */
  vbif.u16 fb_pixels, pixels, draw_mask_combined

  sub fb_ptr_cmp, fb_ptr_next, fb_ptr
  pld [fb_ptr_next, #64]

  /* Unsigned (next - cur + 14) <= 28  <=>  -14 <= next - cur <= 14. */
  add fb_ptr_cmp, fb_ptr_cmp, #14
  vld1.u32 { pixels }, [block_ptr_load, :128], c_64

  cmp fb_ptr_cmp, #28
  bls 4f

  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  vceq.u16 zero_mask, pixels, #0

  vst1.u16 { fb_pixels }, [fb_ptr]
  vtst.u16 draw_mask, draw_mask, test_mask

 3:
  subs num_blocks, num_blocks, #1
  bne 0b

 1:
  /*
   * Final block: its pixels never pass through the vorr at the top of the
   * loop, so msb_mask has to be applied here.  zero_mask was already derived
   * from the raw pixels, so the zero test is unaffected.
   */
  vorr.u16 pixels, pixels, msb_mask
  vorr.u16 draw_mask_combined, draw_mask, zero_mask
  vbif.u16 fb_pixels_next, pixels, draw_mask_combined

  vst1.u16 { fb_pixels_next }, [fb_ptr_next]

  restore_abi_regs()
  ldmia sp!, { r4, pc }

  /* Overlapping destinations: store block i before reading block i + 1. */
 4:
  vst1.u16 { fb_pixels }, [fb_ptr]
  vceq.u16 zero_mask, pixels, #0

  vld1.u16 { fb_pixels_next }, [fb_ptr_next]
  vtst.u16 draw_mask, draw_mask, test_mask

  bal 3b
3486
3487
/* No-op variant: returns to the caller immediately without touching any state. */
3488 function(shade_blocks_unshaded_untextured_indirect)
3489   bx lr
3490
/*
 * shade_blocks_unshaded_untextured_direct
 *
 * In: r0 = psx_gpu.
 *
 * Loads one 16-byte pixel vector from the first block entry
 * (psx_gpu + psx_gpu_blocks_offset + 16), ORs in msb_mask (the halfword at
 * psx_gpu + psx_gpu_mask_msb_offset, broadcast) and merges that same vector
 * into the destination of every one of the num_blocks entries (pointer at
 * entry + 44, 64-byte entry stride).  Lanes whose draw mask (the 16 bytes at
 * entry + 0) is set keep the existing destination value.
 *
 * Software-pipelined like the textured variant: the next destination is read
 * before the current one is written, unless the two pointers are within
 * 14 bytes of each other, in which case label 4 stores first.
 *
 * Modifies r0-r3, r12, q1, q2, q5-q7; r4 and r14 are saved and restored.
 * NOTE(review): q5-q7 (d10-d15) are callee-saved under the AAPCS and
 * save_abi_regs() currently expands to nothing - confirm callers cope.
 * NOTE(review): assumes num_blocks >= 1.
 */
3491 .align 3
3492
3493 function(shade_blocks_unshaded_untextured_direct)
3494   stmdb sp!, { r4, r14 }
3495   save_abi_regs()
3496   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
3497
3498   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3499   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3500
3501   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
3502   add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3503
  /* r0 becomes the cursor over the per-block destination pointers (+44). */
3504   add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
3505   vld1.u16 { pixels }, [color_ptr, :128]
3506
3507   mov c_64, #64
3508   vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64
3509
  /* The pixel vector is loop-invariant, so msb_mask is applied once up front. */
3510   vorr.u16 pixels, pixels, msb_mask
3511   subs num_blocks, num_blocks, #1
3512
3513   ldr fb_ptr_next, [block_ptr_load], #64
3514
  /* NEON loads do not affect the flags set by the subs above. */
3515   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3516   beq 1f
3517
  /* Steady state: finish block i while fetching block i + 1. */
3518  0:
3519   vmov fb_pixels, fb_pixels_next
3520   mov fb_ptr, fb_ptr_next
3521   ldr fb_ptr_next, [block_ptr_load], #64
3522
  /* Take the new pixel wherever draw_mask is clear. */
3523   vbif.u16 fb_pixels, pixels, draw_mask
3524   vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64
3525
  /* Unsigned (next - cur + 14) <= 28  <=>  -14 <= next - cur <= 14. */
3526   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3527   add fb_ptr_cmp, fb_ptr_cmp, #14
3528   cmp fb_ptr_cmp, #28
3529   bls 4f
3530
3531   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3532   vst1.u16 { fb_pixels }, [fb_ptr]
3533
3534  3:
3535   subs num_blocks, num_blocks, #1
3536   bne 0b
3537
  /* Drain: write the last block. */
3538  1:
3539   vbif.u16 fb_pixels_next, pixels, draw_mask
3540   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
3541
3542   restore_abi_regs()
3543   ldmia sp!, { r4, pc }
3544
  /* Overlapping destinations: store block i before reading block i + 1. */
3545  4:
3546   vst1.u16 { fb_pixels }, [fb_ptr]
3547   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3548   bal 3b
3549
3550
/*
 * Rebind the register aliases for the blend_blocks_* routines that follow.
 * First drop the core-register names whose binding changes.
 */
3551 #undef draw_mask_ptr
3552 #undef c_64
3553 #undef fb_ptr
3554 #undef fb_ptr_next
3555 #undef fb_ptr_cmp
3556
/*
 * Core registers.  r0 doubles as psx_gpu and draw_mask_ptr, r2 as
 * msb_mask_ptr and c_64.
 * NOTE(review): the builders below address the mask through mask_msb_ptr
 * (the earlier r2 alias, still defined), not through msb_mask_ptr.
 */
3557 #define psx_gpu                                  r0
3558 #define num_blocks                               r1
3559 #define msb_mask_ptr                             r2
3560 #define pixel_ptr                                r3
3561 #define draw_mask_ptr                            r0
3562 #define c_64                                     r2
3563 #define fb_ptr                                   r12
3564 #define fb_ptr_next                              r14
3565 #define fb_ptr_cmp                               r4
3566
/* Drop the NEON names that are rebound below. */
3567 #undef msb_mask
3568 #undef draw_mask
3569 #undef pixels
3570 #undef fb_pixels
3571 #undef d128_0x8000
3572 #undef msb_mask_low
3573 #undef msb_mask_high
3574 #undef draw_mask_next
3575 #undef pixels_g
3576 #undef blend_pixels
3577 #undef fb_pixels_next
3578
/*
 * NEON registers, first group (q0-q14).
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS while
 * save_abi_regs() currently expands to nothing - confirm callers cope.
 */
3579 #define msb_mask                                 q0
3580 #define draw_mask                                q1
3581 #define pixels                                   q2
3582 #define fb_pixels                                q3
3583 #define blend_pixels                             q4
3584 #define pixels_no_msb                            q5
3585 #define blend_mask                               q6
3586 #define fb_pixels_no_msb                         q7
3587 #define d128_0x8000                              q8
3588 #define d128_0x0421                              q9
3589 #define fb_pixels_next                           q10
3590 #define blend_pixels_next                        q11
3591 #define pixels_next                              q12
3592 #define draw_mask_next                           q13
3593 #define write_mask                               q14
3594
/*
 * Second group: additional names that overlay registers already named above
 * (e.g. q5, q7, q8-q13), plus q15.  A routine must not keep two names that
 * map to the same register live at once.
 */
3595 #define pixels_rb                                q5
3596 #define pixels_mg                                q7
3597 #define pixels_g                                 q7
3598 #define d128_0x7C1F                              q8
3599 #define d128_0x03E0                              q9
3600 #define fb_pixels_rb                             q10
3601 #define fb_pixels_g                              q11
3602 #define fb_pixels_masked                         q11
3603 #define d128_0x83E0                              q15
3604 #define pixels_fourth                            q7
3605 #define d128_0x1C07                              q12
3606 #define d128_0x00E0                              q13
3607 #define d128_0x80E0                              q13
3608
/* 64-bit halves of msb_mask (q0), used for the lane-broadcast load. */
3609 #define msb_mask_low                             d0
3610 #define msb_mask_high                            d1
3611
/*
 * Hooks pasted into blend_blocks_average_builder by token concatenation on
 * its texturing (textured/untextured) and mask_evaluate (on/off) arguments.
 *
 * Textured hooks:
 *   set_blend_mask: blend_mask = all ones in lanes where source has bit 15 set.
 *   set_stp_bit:    set bit 15 in every lane of blend_pixels.
 *   combine:        where blend_mask is clear, replace blend_pixels by source.
 */
3612 #define blend_blocks_average_set_blend_mask_textured(source)                   \
3613   vclt.s16 blend_mask, source, #0                                              \
3614
3615 #define blend_blocks_average_set_stp_bit_textured()                            \
3616   vorr.u16 blend_pixels, #0x8000                                               \
3617
3618 #define blend_blocks_average_combine_textured(source)                          \
3619   vbif.u16 blend_pixels, source, blend_mask                                    \
3620   
/* Untextured hooks: all three expand to nothing. */
3621 #define blend_blocks_average_set_blend_mask_untextured(source)                 \
3622
3623 #define blend_blocks_average_set_stp_bit_untextured()                          \
3624
3625 #define blend_blocks_average_combine_untextured(source)                        \
3626
/*
 * mask_evaluate == on:
 *   mask_set:    write_mask = all ones in lanes where fb_pixels_next has
 *                bit 15 set.
 *   mask_copy:   draw_mask = draw_mask_next | write_mask.
 *   mask_copy_b: draw_mask_next |= write_mask.
 */
3627 #define blend_blocks_average_mask_set_on()                                     \
3628   vclt.s16 write_mask, fb_pixels_next, #0                                      \
3629
3630 #define blend_blocks_average_mask_copy_on()                                    \
3631   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
3632
3633 #define blend_blocks_average_mask_copy_b_on()                                  \
3634   vorr.u16 draw_mask_next, draw_mask_next, write_mask                          \
3635
/*
 * mask_evaluate == off: write_mask is never computed; mask_copy just moves
 * draw_mask_next into draw_mask and the other two hooks expand to nothing.
 */
3636 #define blend_blocks_average_mask_set_off()                                    \
3637
3638 #define blend_blocks_average_mask_copy_off()                                   \
3639   vmov draw_mask, draw_mask_next                                               \
3640
3641 #define blend_blocks_average_mask_copy_b_off()                                 \
3642
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>.  In: r0 = psx_gpu.
 *
 * For every one of the num_blocks block entries (64-byte stride from
 * psx_gpu + psx_gpu_blocks_offset) the 8 source halfwords at entry + 16 are
 * blended with the 8 halfwords at the destination pointer stored at
 * entry + 44:
 *   blend = (fb_no_msb + (src_no_msb - ((src ^ fb) & 0x0421))) >> 1
 * where 0x0421 holds bit 0 of each 5-bit field.  msb_mask (the halfword at
 * psx_gpu + psx_gpu_mask_msb_offset, broadcast) is ORed in, and lanes whose
 * draw mask (16 bytes at entry + 0) is set keep the destination value.
 *  - texturing == textured: lanes whose source bit 15 is clear take the
 *    source pixel unblended; blended lanes get bit 15 set.
 *  - mask_evaluate == on: lanes whose destination has bit 15 set are left
 *    untouched as well (write_mask).
 *
 * Software-pipelined: the destination of block i + 1 is normally read before
 * block i is written.  When the two destination pointers are within 14 bytes
 * of each other, label 2 stores block i first and then redoes the whole
 * "next block" preparation - including write_mask - from the fresh data.
 *
 * Modifies r0-r3, r12, q0-q14; r4 and r14 are saved and restored.
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS and
 * save_abi_regs() currently expands to nothing - confirm callers cope.
 * NOTE(review): assumes num_blocks >= 1.
 */
#define blend_blocks_average_builder(texturing, mask_evaluate)                 \
.align 3;                                                                      \
                                                                               \
function(blend_blocks_##texturing##_average_##mask_evaluate)                   \
  stmdb sp!, { r4, r14 };                                                      \
  save_abi_regs();                                                             \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                               \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
                                                                               \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
  mov c_64, #64;                                                               \
                                                                               \
  vmov.u16 d128_0x8000, #0x8000;                                               \
  vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
  ldr fb_ptr_next, [pixel_ptr, #28];                                           \
                                                                               \
  vmov.u16 d128_0x0421, #0x0400;                                               \
  vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
                                                                               \
  vorr.u16 d128_0x0421, #0x0021;                                               \
  vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
                                                                               \
  /* Prepare block 0: src - ((src ^ fb) & 0x0421), fb without bit 15. */       \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
  blend_blocks_average_mask_set_##mask_evaluate();                             \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
                                                                               \
  subs num_blocks, num_blocks, #1;                                             \
  beq 1f;                                                                      \
                                                                               \
  /* Steady state: finish block i while fetching block i + 1. */               \
 0:                                                                            \
  mov fb_ptr, fb_ptr_next;                                                     \
  ldr fb_ptr_next, [pixel_ptr, #28];                                           \
                                                                               \
  vmov pixels, pixels_next;                                                    \
  vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
                                                                               \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
                                                                               \
  blend_blocks_average_mask_copy_##mask_evaluate();                            \
  vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
                                                                               \
  blend_blocks_average_set_blend_mask_##texturing(pixels);                     \
  blend_blocks_average_set_stp_bit_##texturing();                              \
  vmov fb_pixels, fb_pixels_next;                                              \
  blend_blocks_average_combine_##texturing(pixels);                            \
                                                                               \
  /* Unsigned (next - cur + 14) <= 28  <=>  -14 <= next - cur <= 14. */        \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
  add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
  cmp fb_ptr_cmp, #28;                                                         \
  bls 2f;                                                                      \
                                                                               \
  vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
                                                                               \
  vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
                                                                               \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
  vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
                                                                               \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
  blend_blocks_average_mask_set_##mask_evaluate();                             \
  vst1.u16 { fb_pixels }, [fb_ptr];                                            \
                                                                               \
 3:                                                                            \
  subs num_blocks, num_blocks, #1;                                             \
  bne 0b;                                                                      \
                                                                               \
  /* Drain: blend and write the last block. */                                 \
 1:                                                                            \
  blend_blocks_average_mask_copy_b_##mask_evaluate();                          \
  vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
                                                                               \
  blend_blocks_average_set_blend_mask_##texturing(pixels_next);                \
  blend_blocks_average_set_stp_bit_##texturing();                              \
  blend_blocks_average_combine_##texturing(pixels_next);                       \
                                                                               \
  vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
  vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next;                       \
  vst1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
                                                                               \
  restore_abi_regs();                                                          \
  ldmia sp!, { r4, pc };                                                       \
                                                                               \
  /* Overlapping destinations: store block i before reading block i + 1. */    \
 2:                                                                            \
  vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
  vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
  vst1.u16 { fb_pixels }, [fb_ptr];                                            \
                                                                               \
  vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
  /* write_mask must follow the reloaded destination, as on the main path. */  \
  blend_blocks_average_mask_set_##mask_evaluate();                             \
  veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
  vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
  vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
  vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
  vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
                                                                               \
  bal 3b                                                                       \

/* Emit the four variants: blend_blocks_{textured,untextured}_average_{off,on}. */
3747 blend_blocks_average_builder(textured, off)
3748 blend_blocks_average_builder(untextured, off)
3749 blend_blocks_average_builder(textured, on)
3750 blend_blocks_average_builder(untextured, on)
3751
3752
/*
 * Mask-evaluation hooks for the additive blenders, selected by token
 * concatenation on the builders' mask_evaluate argument (on/off).
 *   mask_set:  write_mask = all ones in lanes where fb_pixels has bit 15 set.
 *   mask_copy: draw_mask |= write_mask.
 * The off variants expand to nothing.
 */
3753 #define blend_blocks_add_mask_set_on()                                         \
3754   vclt.s16 write_mask, fb_pixels, #0                                           \
3755
3756 #define blend_blocks_add_mask_copy_on()                                        \
3757   vorr.u16 draw_mask, draw_mask, write_mask                                    \
3758
3759 #define blend_blocks_add_mask_set_off()                                        \
3760
3761 #define blend_blocks_add_mask_copy_off()                                       \
3762
3763
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>.  In: r0 = psx_gpu.
 *
 * For every one of the num_blocks block entries (64-byte stride from
 * psx_gpu + psx_gpu_blocks_offset) the 8 source halfwords at entry + 16 are
 * added to the 8 halfwords at the destination pointer stored at entry + 44:
 *  - the 0x7C1F fields are summed and clamped per byte with vmin.u8;
 *  - the 0x03E0 field is summed together with source bit 15 (after msb_mask
 *    has been ORed into the source) and clamped with vmin.u16 against 0x83E0.
 * The destination only contributes in lanes whose source bit 15 is set
 * (blend_mask); other lanes end up with the source pixel.  Lanes whose draw
 * mask (16 bytes at entry + 0) is set keep the destination value; with
 * mask_evaluate == on, destination lanes with bit 15 set are kept as well.
 *
 * Software-pipelined: the destination of block i + 1 is normally read before
 * block i is written.  When the two destination pointers are within 14 bytes
 * of each other, label 2 stores block i first and then reads the next one.
 *
 * Modifies r0-r3, r12, q0-q11, q14 (mask_evaluate == on), q15; r4 and r14
 * are saved and restored.
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS and
 * save_abi_regs() currently expands to nothing - confirm callers cope.
 * NOTE(review): assumes num_blocks >= 1.
 */
#define blend_blocks_add_textured_builder(mask_evaluate)                       \
.align 3;                                                                      \
                                                                               \
function(blend_blocks_textured_add_##mask_evaluate)                            \
  stmdb sp!, { r4, r14 };                                                      \
  save_abi_regs();                                                             \
  add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
                                                                               \
  add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
  vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
                                                                               \
  add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
  mov c_64, #64;                                                               \
                                                                               \
  /* Build the 0x7C1F, 0x03E0 and 0x83E0 lane constants. */                    \
  vmov.u16 d128_0x7C1F, #0x7C00;                                               \
  vmov.u16 d128_0x03E0, #0x0300;                                               \
  vmov.u16 d128_0x83E0, #0x8000;                                               \
  vorr.u16 d128_0x03E0, #0x00E0;                                               \
  vorr.u16 d128_0x7C1F, #0x001F;                                               \
  vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0;                              \
                                                                               \
  /* Prime the pipeline with block 0. */                                       \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
  ldr fb_ptr_next, [pixel_ptr, #28];                                           \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
  vclt.s16 blend_mask, pixels, #0;                                             \
  vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
  blend_blocks_add_mask_set_##mask_evaluate();                                 \
  vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
                                                                               \
  blend_blocks_add_mask_copy_##mask_evaluate();                                \
  vorr.u16 pixels, pixels, msb_mask;                                           \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
  vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
  vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
                                                                               \
  subs num_blocks, num_blocks, #1;                                             \
  beq 1f;                                                                      \
                                                                               \
  /* Steady state: finish block i while fetching block i + 1. */               \
 0:                                                                            \
  mov fb_ptr, fb_ptr_next;                                                     \
                                                                               \
  ldr fb_ptr_next, [pixel_ptr, #28];                                           \
                                                                               \
  vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
  vclt.s16 blend_mask, pixels, #0;                                             \
                                                                               \
  vorr.u16 pixels, pixels, msb_mask;                                           \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
  vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
                                                                               \
  /* Unsigned (next - cur + 14) <= 28  <=>  -14 <= next - cur <= 14. */        \
  sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
  pld [fb_ptr_next, #64];                                                      \
                                                                               \
  /* Keep the old destination value wherever draw_mask is set. */              \
  vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
                                                                               \
  add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
  vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
                                                                               \
  cmp fb_ptr_cmp, #28;                                                         \
  bls 2f;                                                                      \
                                                                               \
  vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
  blend_blocks_add_mask_set_##mask_evaluate();                                 \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
  blend_blocks_add_mask_copy_##mask_evaluate();                                \
  vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
  vst1.u16 { blend_pixels }, [fb_ptr];                                         \
                                                                               \
 3:                                                                            \
  vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
  vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
  vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
  vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
  vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
                                                                               \
  subs num_blocks, num_blocks, #1;                                             \
  bne 0b;                                                                      \
                                                                               \
  /* Drain: combine and write the last block. */                               \
 1:                                                                            \
  vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
  vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
  vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
                                                                               \
  restore_abi_regs();                                                          \
  ldmia sp!, { r4, pc };                                                       \
                                                                               \
  /* Overlapping destinations: store block i before reading block i + 1. */    \
 2:                                                                            \
  vst1.u16 { blend_pixels }, [fb_ptr];                                         \
  vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
                                                                               \
  vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
  blend_blocks_add_mask_set_##mask_evaluate();                                 \
  vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
  blend_blocks_add_mask_copy_##mask_evaluate();                                \
  vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
  bal 3b                                                                       \

3869
3870 #define blend_blocks_add_untextured_builder(mask_evaluate)                     \
3871 .align 3;                                                                      \
3872                                                                                \
3873 function(blend_blocks_untextured_add_##mask_evaluate)                          \
3874   stmdb sp!, { r4, r14 };                                                      \
3875   save_abi_regs();                                                             \
3876   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3877   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3878                                                                                \
3879   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3880   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
3881                                                                                \
3882   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3883   mov c_64, #64;                                                               \
3884                                                                                \
3885   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3886   vmov.u16 d128_0x03E0, #0x0300;                                               \
3887   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3888   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3889                                                                                \
3890   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3891   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3892   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3893   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3894   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3895   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3896                                                                                \
3897   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3898   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3899   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3900   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3901   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3902   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3903   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3904   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3905                                                                                \
3906   subs num_blocks, num_blocks, #1;                                             \
3907   beq 1f;                                                                      \
3908                                                                                \
3909  0:                                                                            \
3910   mov fb_ptr, fb_ptr_next;                                                     \
3911                                                                                \
3912   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3913                                                                                \
3914   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3915                                                                                \
3916   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3917   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3918   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3919                                                                                \
3920   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3921   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3922                                                                                \
3923   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3924   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3925   cmp fb_ptr_cmp, #28;                                                         \
3926   bls 2f;                                                                      \
3927                                                                                \
3928   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3929   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3930   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3931   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3932   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3933   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3934                                                                                \
3935  3:                                                                            \
3936   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3937   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3938   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3939   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3940   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3941                                                                                \
3942   subs num_blocks, num_blocks, #1;                                             \
3943   bne 0b;                                                                      \
3944                                                                                \
3945  1:                                                                            \
3946   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3947   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3948   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3949   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
3950                                                                                \
3951   restore_abi_regs();                                                          \
3952   ldmia sp!, { r4, pc };                                                       \
3953                                                                                \
3954  2:                                                                            \
3955   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3956   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3957                                                                                \
3958   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3959   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3960   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3961   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3962   bal 3b                                                                       \
3963
3964
/* Instantiate the additive-blend builder macros (defined earlier in the
 * file), once with the argument off and once with on for each of the
 * textured and untextured variants. */
3965 blend_blocks_add_textured_builder(off)
3966 blend_blocks_add_textured_builder(on)
3967 blend_blocks_add_untextured_builder(off)
3968 blend_blocks_add_untextured_builder(on)
3969
/*
 * Helper fragments pasted into blend_blocks_subtract_builder through its
 * texturing argument (textured / untextured).
 */
/* blend_mask lane = all ones where pixels_next has bit 15 set (negative when
 * read as s16), zero otherwise. */
3970 #define blend_blocks_subtract_set_blend_mask_textured()                        \
3971   vclt.s16 blend_mask, pixels_next, #0                                         \
3972
/* Where blend_mask is clear, overwrite blend_pixels with the bits of pixels
 * (vbif inserts wherever the mask bit is 0). */
3973 #define blend_blocks_subtract_combine_textured()                               \
3974   vbif.u16 blend_pixels, pixels, blend_mask                                    \
3975
/* Force bit 15 of every blend_pixels lane. */
3976 #define blend_blocks_subtract_set_stp_textured()                               \
3977   vorr.u16 blend_pixels, #0x8000                                               \
3978
/* pixels = pixels_next | msb_mask. The builder expands this before it reloads
 * pixels_next, so pixels keeps the source pixels of the block being stored. */
3979 #define blend_blocks_subtract_msb_mask_textured()                              \
3980   vorr.u16 pixels, pixels_next, msb_mask                                       \
3981
/* Untextured: expands to nothing (no blend_mask is computed). */
3982 #define blend_blocks_subtract_set_blend_mask_untextured()                      \
3983
/* Untextured: expands to nothing (blend_pixels is used for every lane). */
3984 #define blend_blocks_subtract_combine_untextured()                             \
3985
/* Untextured: OR msb_mask into every blend_pixels lane. */
3986 #define blend_blocks_subtract_set_stp_untextured()                             \
3987   vorr.u16 blend_pixels, blend_pixels, msb_mask                                \
3988
/* Untextured: expands to nothing (no copy of the source pixels is kept). */
3989 #define blend_blocks_subtract_msb_mask_untextured()                            \
3990
3991
/*
 * Helper fragments pasted into blend_blocks_subtract_builder through its
 * mask_evaluate argument (on / off).
 */
/* on: write_mask lane = all ones where the loaded fb_pixels lane has bit 15
 * set (negative when read as s16). */
3992 #define blend_blocks_subtract_mask_set_on()                                    \
3993   vclt.s16 write_mask, fb_pixels, #0                                           \
3994
/* on: draw_mask for the block being stored = its loaded mask OR write_mask. */
3995 #define blend_blocks_subtract_mask_copy_on()                                   \
3996   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
3997
/* off: expands to nothing (write_mask is not computed). */
3998 #define blend_blocks_subtract_mask_set_off()                                   \
3999
/* off: draw_mask for the block being stored is the loaded mask unchanged. */
4000 #define blend_blocks_subtract_mask_copy_off()                                  \
4001   vmov draw_mask, draw_mask_next                                               \
4002
4003
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits function blend_blocks_<texturing>_subtract_<mask_evaluate>.
 *
 * The function walks num_blocks block entries of 64 bytes each, starting at
 * psx_gpu + psx_gpu_blocks_offset. For every entry it reads a draw mask
 * (offset 0), source pixels (offset 16) and a destination pointer
 * (offset 16 + 28 = 44), loads the destination pixels and stores
 *   destination - source   (saturating at 0)
 * per 5-bit field: bits 0-4 and 10-14 are handled together as "rb" with a
 * byte-wise vqsub, bits 5-9 as "g" with a halfword vqsub.
 * Lanes whose draw_mask bits are set keep the destination value (vbit).
 * The mask_msb halfword is loaded replicated into msb_mask_low/high.
 *
 * texturing:     textured | untextured; selects the
 *                blend_blocks_subtract_*_<texturing> helper fragments.
 * mask_evaluate: on | off; selects the
 *                blend_blocks_subtract_mask_*_<mask_evaluate> fragments.
 *
 * The loop is software pipelined: arithmetic for block N+1 is issued before
 * the store of block N, which is why overlapping destinations need the
 * separate path at label 2.
 *
 * NOTE(review): num_blocks is decremented before it is tested, so a count of
 * 0 would wrap around instead of exiting; presumably callers guarantee at
 * least one block.
 */
4004 #define blend_blocks_subtract_builder(texturing, mask_evaluate)                \
4005 .align 3;                                                                      \
4006                                                                                \
4007 function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
4008   stmdb sp!, { r4, r14 };                                                      \
4009   save_abi_regs();                                                             \
4010   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4011   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4012   /* Source pixels sit 16 bytes into each 64-byte block entry. */              \
4013   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4014   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4015   /* Draw masks sit at offset 0; c_64 is the post-increment stride. */         \
4016   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4017   mov c_64, #64;                                                               \
4018   /* Per-lane masks 0x7C1F / 0x03E0, built one immediate byte at a time. */    \
4019   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4020   vmov.u16 d128_0x03E0, #0x0300;                                               \
4021   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4022   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4023   /* Prime the pipeline: fetch block 0 and its destination pixels. */          \
4024   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
4025   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4026   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
4027   blend_blocks_subtract_set_blend_mask_##texturing();                          \
4028   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4029   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4030   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4031   /* Split both pixel sets into rb / g fields and subtract, saturating. */     \
4032   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4033   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4034   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4035   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4036   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4037   /* Single block: skip the loop and go straight to the final store. */        \
4038   subs num_blocks, num_blocks, #1;                                             \
4039   beq 1f;                                                                      \
4040   /* Loop: finish block N (result in blend_pixels) while fetching N+1. */      \
4041  0:                                                                            \
4042   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4043   mov fb_ptr, fb_ptr_next;                                                     \
4044   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4045                                                                                \
4046   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
4047   blend_blocks_subtract_msb_mask_##texturing();                                \
4048                                                                                \
4049   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
4050   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4051   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4052   blend_blocks_subtract_set_stp_##texturing();                                 \
4053   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4054   blend_blocks_subtract_combine_##texturing();                                 \
4055   blend_blocks_subtract_set_blend_mask_##texturing();                          \
4056   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4057   /* Take path 2 when the two destinations are within 14 bytes. */             \
4058   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4059   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4060   cmp fb_ptr_cmp, #28;                                                         \
4061   bls 2f;                                                                      \
4062   /* Far apart: read the next destination before storing this block. */        \
4063   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4064   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4065   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4066   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4067   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4068   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4069   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4070                                                                                \
4071  3:                                                                            \
4072   subs num_blocks, num_blocks, #1;                                             \
4073   bne 0b;                                                                      \
4074   /* Drain: blend and store the last block. */                                 \
4075  1:                                                                            \
4076   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4077                                                                                \
4078   blend_blocks_subtract_msb_mask_##texturing();                                \
4079   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4080   blend_blocks_subtract_set_stp_##texturing();                                 \
4081   blend_blocks_subtract_combine_##texturing();                                 \
4082   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4083   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4084                                                                                \
4085   restore_abi_regs();                                                          \
4086   ldmia sp!, { r4, pc };                                                       \
4087   /* Close together: store first so the reload sees the new pixels. */         \
4088  2:                                                                            \
4089   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4090   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4091   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4092   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4093   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4094   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4095   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4096   bal 3b                                                                       \
4097
4098
/* Emit the four subtractive-blend functions:
 * blend_blocks_{textured,untextured}_subtract_{off,on}. */
4099 blend_blocks_subtract_builder(textured, off)
4100 blend_blocks_subtract_builder(textured, on)
4101 blend_blocks_subtract_builder(untextured, off)
4102 blend_blocks_subtract_builder(untextured, on)
4103
4104
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits function blend_blocks_textured_add_fourth_<mask_evaluate>.
 *
 * The function walks num_blocks block entries of 64 bytes each, starting at
 * psx_gpu + psx_gpu_blocks_offset. For every entry it reads a draw mask
 * (offset 0), source pixels (offset 16) and a destination pointer
 * (offset 16 + 28 = 44), loads the destination pixels and computes
 *   min(destination + (source >> 2), field maximum)
 * per 5-bit field (bits 0-4 and 10-14 as "rb", bits 5-9 as "g"). The source
 * is shifted right by 2 and masked with 0x1C07 / 0x00E0 so that each field
 * is quartered independently.
 *
 * Lane selection before the store:
 *  - bit 15 of the blended value is forced on;
 *  - lanes whose source pixel has bit 15 clear take the source pixel instead
 *    of the blended value (vbif with blend_mask);
 *  - msb_mask is ORed into every lane;
 *  - lanes whose draw_mask bits are set keep the destination value (vbit).
 *
 * mask_evaluate: on | off; pasted into the names of the
 *                blend_blocks_add_mask_set_* / blend_blocks_add_mask_copy_*
 *                fragments, which are defined outside this section.
 *
 * The loop is software pipelined: arithmetic for block N+1 is issued before
 * the store of block N, which is why overlapping destinations need the
 * separate path at label 2.
 *
 * NOTE(review): num_blocks is decremented before it is tested, so a count of
 * 0 would wrap around instead of exiting; presumably callers guarantee at
 * least one block.
 */
4105 #define blend_blocks_add_fourth_textured_builder(mask_evaluate)                \
4106 .align 3;                                                                      \
4107                                                                                \
4108 function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
4109   stmdb sp!, { r4, r14 };                                                      \
4110   save_abi_regs();                                                             \
4111   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4112   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4113   /* Source pixels sit 16 bytes into each 64-byte block entry. */              \
4114   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4115   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4116   /* Draw masks sit at offset 0; c_64 is the post-increment stride. */         \
4117   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4118   mov c_64, #64;                                                               \
4119   /* Per-lane field masks; two-byte values need a vmov plus a vorr. */         \
4120   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4121   vmov.u16 d128_0x03E0, #0x0300;                                               \
4122   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4123   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4124   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4125   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4126   vorr.u16 d128_0x1C07, #0x0007;                                               \
4127   /* Prime the pipeline: fetch block 0 and its destination pixels. */          \
4128   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4129   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4130   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4131   vclt.s16 blend_mask, pixels, #0;                                             \
4132   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4133   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4134   vshr.s16 pixels_fourth, pixels, #2;                                          \
4135   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4136   /* Quarter-intensity source (>> 2, masked per field) is added, clamped. */   \
4137   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4138   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4139   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4140   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4141   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4142   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4143   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4144   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4145   /* Single block: skip the loop and go straight to the final store. */        \
4146   subs num_blocks, num_blocks, #1;                                             \
4147   beq 1f;                                                                      \
4148   /* Loop: finish block N (result in blend_pixels) while fetching N+1. */      \
4149  0:                                                                            \
4150   mov fb_ptr, fb_ptr_next;                                                     \
4151   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4152                                                                                \
4153   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4154   vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
4155   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4156                                                                                \
4157   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4158   vclt.s16 blend_mask, pixels, #0;                                             \
4159   vshr.s16 pixels_fourth, pixels, #2;                                          \
4160   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4161   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4162                                                                                \
4163   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4164   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4165   /* Take path 2 when the two destinations are within 14 bytes. */             \
4166   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4167   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4168   cmp fb_ptr_cmp, #28;                                                         \
4169   bls 2f;                                                                      \
4170   /* Far apart: read the next destination before storing this block. */        \
4171   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4172   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4173   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4174   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4175   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4176   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4177                                                                                \
4178  3:                                                                            \
4179   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4180   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4181   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4182   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4183   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4184                                                                                \
4185   subs num_blocks, num_blocks, #1;                                             \
4186   bne 0b;                                                                      \
4187   /* Drain: blend and store the last block. */                                 \
4188  1:                                                                            \
4189   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4190   vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
4191   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4192   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4193   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4194   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4195                                                                                \
4196   restore_abi_regs();                                                          \
4197   ldmia sp!, { r4, pc };                                                       \
4198   /* Close together: store first so the reload sees the new pixels. */         \
4199  2:                                                                            \
4200   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4201   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4202                                                                                \
4203   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4204   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4205   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4206   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4207   bal 3b                                                                       \
4208
4209
4210
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits function blend_blocks_untextured_add_fourth_<mask_evaluate>.
 *
 * The function walks num_blocks block entries of 64 bytes each, starting at
 * psx_gpu + psx_gpu_blocks_offset. For every entry it reads a draw mask
 * (offset 0), source pixels (offset 16) and a destination pointer
 * (offset 16 + 28 = 44), loads the destination pixels and computes
 *   min(destination + (source >> 2), field maximum)
 * per 5-bit field (bits 0-4 and 10-14 as "rb", bits 5-9 as "g"). The source
 * is shifted right by 2 and masked with 0x1C07 / 0x00E0 so that each field
 * is quartered independently.
 *
 * Every lane uses the blended value (no per-lane source test here);
 * msb_mask is ORed in, and lanes whose draw_mask bits are set keep the
 * destination value (vbit).
 *
 * mask_evaluate: on | off; pasted into the names of the
 *                blend_blocks_add_mask_set_* / blend_blocks_add_mask_copy_*
 *                fragments, which are defined outside this section.
 *
 * The loop is software pipelined: arithmetic for block N+1 is issued before
 * the store of block N, which is why overlapping destinations need the
 * separate path at label 2.
 *
 * NOTE(review): num_blocks is decremented before it is tested, so a count of
 * 0 would wrap around instead of exiting; presumably callers guarantee at
 * least one block.
 */
4211 #define blend_blocks_add_fourth_untextured_builder(mask_evaluate)              \
4212 .align 3;                                                                      \
4213                                                                                \
4214 function(blend_blocks_untextured_add_fourth_##mask_evaluate)                   \
4215   stmdb sp!, { r4, r14 };                                                      \
4216   save_abi_regs();                                                             \
4217   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4218   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4219   /* Source pixels sit 16 bytes into each 64-byte block entry. */              \
4220   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4221   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4222   /* Draw masks sit at offset 0; c_64 is the post-increment stride. */         \
4223   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4224   mov c_64, #64;                                                               \
4225   /* Per-lane field masks; two-byte values need a vmov plus a vorr. */         \
4226   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4227   vmov.u16 d128_0x03E0, #0x0300;                                               \
4228   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4229   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4230   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4231   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4232   vorr.u16 d128_0x1C07, #0x0007;                                               \
4233   /* Prime the pipeline: fetch block 0 and its destination pixels. */          \
4234   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4235   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4236   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4237   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4238   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4239   vshr.s16 pixels_fourth, pixels, #2;                                          \
4240   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4241   /* Quarter-intensity source (>> 2, masked per field) is added, clamped. */   \
4242   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4243   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4244   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4245   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4246   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4247   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4248   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4249   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4250   /* Single block: skip the loop and go straight to the final store. */        \
4251   subs num_blocks, num_blocks, #1;                                             \
4252   beq 1f;                                                                      \
4253   /* Loop: finish block N (result in blend_pixels) while fetching N+1. */      \
4254  0:                                                                            \
4255   mov fb_ptr, fb_ptr_next;                                                     \
4256   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4257                                                                                \
4258   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4259                                                                                \
4260   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4261   vshr.s16 pixels_fourth, pixels, #2;                                          \
4262   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4263   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4264                                                                                \
4265   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4266   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4267   /* Take path 2 when the two destinations are within 14 bytes. */             \
4268   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4269   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4270   cmp fb_ptr_cmp, #28;                                                         \
4271   bls 2f;                                                                      \
4272   /* Far apart: read the next destination before storing this block. */        \
4273   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4274   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4275   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4276   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4277   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4278   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4279                                                                                \
4280  3:                                                                            \
4281   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4282   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4283   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4284   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4285   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4286                                                                                \
4287   subs num_blocks, num_blocks, #1;                                             \
4288   bne 0b;                                                                      \
4289   /* Drain: blend and store the last block. */                                 \
4290  1:                                                                            \
4291   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4292   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4293   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4294   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4295                                                                                \
4296   restore_abi_regs();                                                          \
4297   ldmia sp!, { r4, pc };                                                       \
4298   /* Close together: store first so the reload sees the new pixels. */         \
4299  2:                                                                            \
4300   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4301   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4302                                                                                \
4303   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4304   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4305   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4306   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4307   bal 3b                                                                       \
4308
4309
/* Emit the four quarter-intensity additive-blend functions:
 * blend_blocks_{textured,untextured}_add_fourth_{off,on}. */
4310 blend_blocks_add_fourth_textured_builder(off)
4311 blend_blocks_add_fourth_textured_builder(on)
4312 blend_blocks_add_fourth_untextured_builder(off)
4313 blend_blocks_add_fourth_untextured_builder(on)
4314
4315 // TODO: Optimize this more. Need a scene that actually uses it for
4316 // confirmation..
4317
4318 .align 3
4319
/*
 * blend_blocks_textured_unblended_on(psx_gpu)
 *
 * Writes the queued blocks to the framebuffer without blending. For every
 * block it loads the draw mask (block + 0), the pixels (block + 16) and the
 * framebuffer pointer (block + 44); consecutive blocks are 64 bytes apart.
 * The pixels are ORed with the value replicated from psx_gpu's mask_msb
 * field and inserted into the framebuffer data only where the draw mask bit
 * is clear and the 16-bit value already in the framebuffer is not negative
 * (bit 15 clear).
 *
 * The loop is software pipelined: the loads for the next block are issued at
 * the end of the loop body, and the last block is finished at label 1.
 *
 * NOTE(review): a block count of 0 is not handled - the counter is
 * decremented before it is first tested - so this presumably relies on the
 * caller never passing an empty block list; confirm.
 */
4320 function(blend_blocks_textured_unblended_on)         
4321   stmdb sp!, { r4, r14 }
4322   save_abi_regs()
4323   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
4324   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
4325
4326   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
  /* replicate the 16-bit mask MSB value into every lane of msb_mask */
4327   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
4328
4329   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
4330   mov c_64, #64
4331
  /* prologue: fetch everything needed for the first block */
4332   ldr fb_ptr, [pixel_ptr, #28]
4333   vld1.u16 { fb_pixels }, [fb_ptr]
4334   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
  /* write_mask lanes become all ones where the framebuffer value is < 0 */
4335   vclt.s16 write_mask, fb_pixels, #0
4336   vld1.u32 { pixels }, [pixel_ptr, :128], c_64
4337
4338   subs num_blocks, num_blocks, #1
4339   beq 1f
4340
4341  0:
4342   vorr.u16 pixels, pixels, msb_mask
4343   vorr.u16 draw_mask, draw_mask, write_mask
  /* take bits from pixels wherever the combined mask bit is 0 */
4344   vbif.u16 fb_pixels, pixels, draw_mask
4345   vst1.u16 { fb_pixels }, [fb_ptr]
4346
  /* fetch the next block while the counter is updated */
4347   ldr fb_ptr, [pixel_ptr, #28]
4348   vld1.u16 { fb_pixels }, [fb_ptr]
4349   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
4350   vclt.s16 write_mask, fb_pixels, #0
4351   vld1.u32 { pixels }, [pixel_ptr, :128], c_64
4352
4353   subs num_blocks, num_blocks, #1
4354   bne 0b
4355  
4356  1:
  /* epilogue: finish the block that was loaded last */
4357   vorr.u16 pixels, pixels, msb_mask
4358   vorr.u16 draw_mask, draw_mask, write_mask
4359   vbif.u16 fb_pixels, pixels, draw_mask
4360   vst1.u16 { fb_pixels }, [fb_ptr]
4361
4362   restore_abi_regs()
4363   ldmia sp!, { r4, pc }
4364
4365
/*
 * blend_blocks_textured_unblended_off
 *
 * Does nothing: returns to the caller immediately without touching any
 * register or memory.
 */
4366 function(blend_blocks_textured_unblended_off)
4367   bx lr
4368
4369
/*
 * warmup
 *
 * In:  r0 = number of loads to perform, r1 = start address.
 *
 * Performs r0 vector loads into u_whole_8/v_whole_8, starting at r1 and
 * stepping the address by 64 bytes after each load. The loaded data is not
 * used by this function. Returns immediately when r0 is 0.
 *
 * The :128 qualifier requires every accessed address to be 16-byte aligned.
 * Clobbers r0, r1, r3, the flags and the two destination vector registers.
 *
 * NOTE(review): presumably this exists to pull the touched memory into the
 * cache ahead of use - confirm against the callers.
 */
4370 function(warmup)
4371   mov r3, #64
4372   cmp r0, #0
4373   bxeq lr
4374
4375  0:
4376   vld1.u32 { u_whole_8, v_whole_8 }, [r1, :128], r3
4377
4378   subs r0, r0, #1
4379   bne 0b
4380
4381   bx lr
4382
/*
 * Register names used by render_block_fill_body.
 *
 * color and pitch are both r1: the incoming color has to be consumed before
 * pitch is written.
 */
4383 #undef vram_ptr
4384 #undef color
4385 #undef width
4386 #undef height
4387 #undef pitch
4388
4389 #define vram_ptr                                          r0
4390 #define color                                             r1
4391 #define width                                             r2
4392 #define height                                            r3
4393
4394 #define pitch                                             r1
4395
4396 #define num_width                                         r12
4397
4398 #undef colors_a
4399 #undef colors_b
4400
4401 #define colors_a                                          q0
4402 #define colors_b                                          q1
4403
4404 .align 3
4405
/*
 * render_block_fill_body
 *
 * In:  r0 = vram_ptr  destination of the first pixel
 *      r1 = color     16-bit fill value (low halfword of r1)
 *      r2 = width     in 16-bit pixels
 *      r3 = height    in rows
 *
 * Fills a width x height rectangle with the color. Each store writes 32
 * bytes (16 pixels) and advances vram_ptr; after a row, pitch
 * (2048 - width * 2 bytes) is added so that rows are 2048 bytes apart.
 *
 * The :256 qualifier requires every store address to be 32-byte aligned.
 * Clobbers r0, r1, r3, r12, q0, q1 and the flags.
 *
 * NOTE(review): both counters are tested with "bne" only, so width must be
 * a non-zero multiple of 16 and height must be at least 1 - confirm that
 * the callers guarantee this.
 */
4406 function(render_block_fill_body)
  /* color has to be read here: pitch reuses r1 on the next line */
4407   vdup.u16 colors_a, color
4408   mov pitch, #2048
4409
4410   vmov colors_b, colors_a
4411   sub pitch, pitch, width, lsl #1
4412
4413   mov num_width, width
4414
  /* label 0 is the target of both the per-row and the per-rectangle loop */
4415  0:  
4416   vst1.u32 { colors_a, colors_b }, [vram_ptr, :256]!
4417
4418   subs num_width, num_width, #16
4419   bne 0b
4420
4421   add vram_ptr, vram_ptr, pitch
4422   mov num_width, width
4423
4424   subs height, height, #1
4425   bne 0b
4426
4427   bx lr
4428  
4429
/*
 * Core register names for the sprite setup code.
 *
 * Several names map to the same register (for example x and
 * texture_offset_base are both r1, height, sub_tile_height and
 * current_texture_mask are all r6, texture_block_ptr and temp are both
 * r14). A value is only valid until another name of the same register is
 * written, so the order of the instructions using them matters.
 */
4430 #undef x
4431 #undef y
4432 #undef width
4433 #undef height
4434 #undef fb_ptr
4435 #undef texture_mask
4436 #undef num_blocks
4437 #undef temp
4438 #undef dirty_textures_mask
4439 #undef clut_ptr
4440 #undef current_texture_mask
4441
4442 #define psx_gpu                                           r0
4443 #define x                                                 r1
4444 #define y                                                 r2
4445 #define u                                                 r3
4446 #define v                                                 r4
4447 #define width                                             r5
4448 #define height                                            r6
4449 #define offset_u                                          r8
4450 #define offset_v                                          r9
4451 #define offset_u_right                                    r10
4452 #define width_rounded                                     r11
4453 #define height_rounded                                    r12
4454
4455 #define texture_offset_base                               r1
4456 #define tile_width                                        r2
4457 #define tile_height                                       r3
4458 #define num_blocks                                        r4
4459 #define block                                             r5
4460 #define sub_tile_height                                   r6
4461 #define fb_ptr                                            r7
4462 #define texture_mask                                      r8
4463 #define column_data                                       r9
4464 #define texture_offset                                    r10
4465 #define tiles_remaining                                   r11
4466 #define fb_ptr_advance_column                             r12
4467 #define texture_block_ptr                                 r14
4468
4469 #define temp                                              r14
4470
4471 #define texture_page_ptr                                  r3
4472 #define left_block_mask                                   r4
4473 #define right_block_mask                                  r5
4474 #define texture_mask_rev                                  r10
4475 #define control_mask                                      r11
4476
4477 #define dirty_textures_mask                               r4
4478 #define clut_ptr                                          r5
4479 #define current_texture_mask                              r6
4480
4481
/*
 * NEON register names for the sprite setup code.
 *
 * The q names overlay the d names: draw_masks_fb_ptrs (q1) is d2/d3,
 * clut_a (q2) is clut_low_a/clut_low_b (d4/d5), clut_b (q3) is
 * clut_high_a/clut_high_b (d6/d7), draw_masks_fb_ptrs2 (q5) is d10/d11 and
 * texels_wide (q7) is d14/d15. d2/d3 carry two sets of names
 * (draw_mask_fb_ptr_left/right and draw_mask_fb_ptr_left_a/left_b).
 *
 * NOTE(review): d8-d15 are callee-saved under the AAPCS, while
 * save_abi_regs()/restore_abi_regs() currently expand to nothing (see the
 * "#if 0" at the top of the file) - confirm how the callers cope with that.
 */
4482 #undef texels
4483 #undef clut_low_a
4484 #undef clut_low_b
4485 #undef clut_high_a
4486 #undef clut_high_b
4487 #undef clut_a
4488 #undef clut_b
4489 #undef texels_low
4490 #undef texels_high
4491
4492 #define texels                                            d0
4493 #define draw_masks_fb_ptrs                                q1
4494
4495 #define draw_mask_fb_ptr_left                             d2
4496 #define draw_mask_fb_ptr_right                            d3
4497
4498 #define draw_mask_fb_ptr_left_a                           d2
4499 #define draw_mask_fb_ptr_left_b                           d3
4500 #define draw_mask_fb_ptr_right_a                          d10
4501 #define draw_mask_fb_ptr_right_b                          d11
4502 #define draw_masks_fb_ptrs2                               q5
4503
4504 #define clut_low_a                                        d4
4505 #define clut_low_b                                        d5
4506 #define clut_high_a                                       d6
4507 #define clut_high_b                                       d7
4508
4509 #define block_masks                                       d8
4510 #define block_masks_shifted                               d9
4511
4512 #define clut_a                                            q2
4513 #define clut_b                                            q3
4514
4515 #define texels_low                                        d12
4516 #define texels_high                                       d13
4517
4518 #define texels_wide_low                                   d14
4519 #define texels_wide_high                                  d15
4520 #define texels_wide                                       q7
4521
4522
/*
 * setup_sprite_flush_blocks
 *
 * Calls flush_render_block_buffer while preserving the state of the sprite
 * setup code around the call: q1-q5 (the draw mask / fb pointer vectors,
 * the CLUT tables and block_masks), r0-r3, r12, r14 and whatever
 * EXTRA_UNSAVED_REGS expands to (defined outside this part of the file).
 *
 * On return, block points at the start of psx_gpu's block buffer again.
 * Reached with "blgt" from setup_sprite_tile_add_blocks; returns with
 * "bx lr" using the r14 value restored from the stack.
 */
4523 setup_sprite_flush_blocks:
4524   vpush { q1 - q5 }
4525
4526   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
4527   bl flush_render_block_buffer
4528   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
4529
4530   vpop { q1 - q5 }
4531
  /* the buffer was flushed, so new blocks start at its beginning */
4532   add block, psx_gpu, #psx_gpu_blocks_offset
4533   bx lr
4534
4535
/*
 * setup_sprite_update_texture_4bpp_cache
 *
 * Calls update_texture_4bpp_cache with r0-r3 preserved across the call and
 * returns by loading the saved r14 straight into pc.
 *
 * NOTE(review): unlike the 8bpp wrapper, EXTRA_UNSAVED_REGS is not saved
 * here - confirm that update_texture_4bpp_cache leaves those registers
 * untouched.
 */
4536 setup_sprite_update_texture_4bpp_cache:
4537   stmdb sp!, { r0 - r3, r14 }
4538   bl update_texture_4bpp_cache
4539   ldmia sp!, { r0 - r3, pc }
4540
4541
/*
 * setup_sprite_update_texture_8bpp_cache
 *
 * Calls update_texture_8bpp_cache with r0-r3 and the registers named by
 * EXTRA_UNSAVED_REGS (defined outside this part of the file) preserved
 * across the call, and returns by loading the saved r14 straight into pc.
 */
4542 setup_sprite_update_texture_8bpp_cache:
4543   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 }
4544   bl update_texture_8bpp_cache
4545   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc }
4546
4547
/*
 * setup_sprite_tiled_initialize_4bpp()
 *
 * Loads 32 bytes from psx_gpu's clut_ptr into clut_a/clut_b and
 * de-interleaves them with vuzp.u8, so that clut_a (clut_low_a/b) holds the
 * even-numbered bytes and clut_b (clut_high_a/b) the odd-numbered bytes.
 * Then calls setup_sprite_update_texture_4bpp_cache if any bit of
 * current_texture_mask is also set in the 4bpp dirty textures mask.
 *
 * The flags set by "tst" are consumed by the final "blne"; the vuzp in
 * between does not modify them.
 * Clobbers dirty_textures_mask (r4), clut_ptr (r5), current_texture_mask
 * (r6), the flags and, when the call is taken, r14.
 */
4548 #define setup_sprite_tiled_initialize_4bpp()                                   \
4549   ldr dirty_textures_mask,                                                     \
4550    [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset];                        \
4551   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset];                           \
4552                                                                                \
4553   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset];   \
4554   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128];                               \
4555                                                                                \
4556   tst current_texture_mask, dirty_textures_mask;                               \
4557   vuzp.u8 clut_a, clut_b;                                                      \
4558                                                                                \
4559   blne setup_sprite_update_texture_4bpp_cache                                  \
4560
/*
 * setup_sprite_tiled_initialize_8bpp()
 *
 * Calls setup_sprite_update_texture_8bpp_cache if any bit of
 * current_texture_mask is also set in the 8bpp dirty textures mask.
 * Clobbers dirty_textures_mask (r4), current_texture_mask (r6), the flags
 * and, when the call is taken, r14.
 */
4561 #define setup_sprite_tiled_initialize_8bpp()                                   \
4562   ldr dirty_textures_mask,                                                     \
4563    [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset];                        \
4564   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset];   \
4565                                                                                \
4566   tst current_texture_mask, dirty_textures_mask;                               \
4567   blne setup_sprite_update_texture_8bpp_cache                                  \
4568
4569
/*
 * Operand fragments giving the number of blocks a tile adds: one block per
 * row (single) or two blocks per row (double, sub_tile_height shifted left
 * by one). They are pasted in as the last operand of an instruction.
 */
4570 #define setup_sprite_block_count_single()                                      \
4571   sub_tile_height                                                              \
4572
4573 #define setup_sprite_block_count_double()                                      \
4574   sub_tile_height, lsl #1                                                      \
4575
/*
 * setup_sprite_tile_add_blocks(type)
 *
 * Adds the block count of the coming tile to num_blocks. If the total is
 * greater than MAX_BLOCKS (signed compare), num_blocks is reset to just the
 * new tile's count and setup_sprite_flush_blocks is called, which flushes
 * the buffer and rewinds block.
 * Clobbers the flags and, when the call is taken, r14.
 */
4576 #define setup_sprite_tile_add_blocks(type)                                     \
4577   add num_blocks, num_blocks, setup_sprite_block_count_##type();               \
4578   cmp num_blocks, #MAX_BLOCKS;                                                 \
4579                                                                                \
4580   movgt num_blocks, setup_sprite_block_count_##type();                         \
4581   blgt setup_sprite_flush_blocks                                               \
4582
4583
/*
 * setup_sprite_tile_full_4bpp(edge)
 *
 * Emits two blocks (left and right half of the tile row) for each of the
 * sub_tile_height rows of a tile. The edge argument is not used by this
 * variant.
 *
 * Per block: 8 texel bytes are loaded from
 * texture_page_ptr + (offset & texture_mask), where offset is
 * texture_offset for the left block and texture_offset + 8 for the right
 * one. They are looked up in the low and high CLUT byte tables with vtbl,
 * and the two result vectors are stored interleaved at block + 0 (16
 * bytes). draw_mask_fb_ptr_left/right, with fb_ptr inserted into its upper
 * 32-bit lane, is stored at block + 40. Blocks are 64 bytes apart (40 + 24).
 *
 * fb_ptr moves 16 bytes to the right for the second block and then on to
 * the next row (rows are 2048 bytes apart). texture_offset advances by 0x10
 * per row and by a further 0xF00 after the last row. num_blocks is written
 * back to psx_gpu at the end.
 *
 * Uses local label 4. Clobbers texture_block_ptr (r14), sub_tile_height,
 * texels, texels_low/high and the flags.
 */
4584 #define setup_sprite_tile_full_4bpp(edge)                                      \
4585   setup_sprite_tile_add_blocks(double);                                        \
4586                                                                                \
4587  4:                                                                            \
4588   and texture_block_ptr, texture_offset, texture_mask;                         \
4589   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4590                                                                                \
4591   pld [fb_ptr];                                                                \
4592   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4593   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4594                                                                                \
4595   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4596   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4597                                                                                \
4598   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4599   add texture_block_ptr, texture_offset, #8;                                   \
4600                                                                                \
4601   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4602   add block, block, #40;                                                       \
4603                                                                                \
4604   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4605   add fb_ptr, fb_ptr, #16;                                                     \
4606                                                                                \
4607   vst1.u32 { draw_mask_fb_ptr_left }, [block, :64];                            \
4608   add block, block, #24;                                                       \
4609                                                                                \
4610   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4611   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4612                                                                                \
4613   pld [fb_ptr];                                                                \
4614   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4615   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4616                                                                                \
4617   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4618   add block, block, #40;                                                       \
4619                                                                                \
4620   add texture_offset, texture_offset, #0x10;                                   \
4621   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4622                                                                                \
4623   vst1.u32 { draw_mask_fb_ptr_right }, [block, :64];                           \
4624   add block, block, #24;                                                       \
4625                                                                                \
4626   subs sub_tile_height, sub_tile_height, #1;                                   \
4627   bne 4b;                                                                      \
4628                                                                                \
4629   add texture_offset, texture_offset, #0xF00;                                  \
4630   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4631
4632   
/*
 * setup_sprite_tile_half_4bpp(edge)
 *
 * Emits one block for each of the sub_tile_height rows of a tile, using
 * draw_mask_fb_ptr_<edge> (edge is "left" or "right") as the draw mask /
 * framebuffer pointer vector.
 *
 * Per block: 8 texel bytes are loaded from
 * texture_page_ptr + (texture_offset & texture_mask), looked up in the low
 * and high CLUT byte tables with vtbl, and the two result vectors are
 * stored interleaved at block + 0 (16 bytes). draw_mask_fb_ptr_<edge>, with
 * fb_ptr inserted into its upper 32-bit lane, is stored at block + 40.
 * Blocks are 64 bytes apart (40 + 24).
 *
 * fb_ptr advances by one row (2048 bytes) and texture_offset by 0x10 per
 * block; texture_offset gets a further 0xF00 after the last row. num_blocks
 * is written back to psx_gpu at the end.
 *
 * texture_block_ptr is only needed for the texel load and is recomputed at
 * the top of every iteration.
 *
 * Uses local label 4. Clobbers texture_block_ptr (r14), sub_tile_height,
 * texels, texels_low/high and the flags.
 */
4633 #define setup_sprite_tile_half_4bpp(edge)                                      \
4634   setup_sprite_tile_add_blocks(single);                                        \
4635                                                                                \
4636  4:                                                                            \
4637   and texture_block_ptr, texture_offset, texture_mask;                         \
4638   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4639                                                                                \
4640   pld [fb_ptr];                                                                \
4641   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4642   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4643                                                                                \
4644   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4645   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4646                                                                                \
4647   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4648   add block, block, #40;                                                       \
4649                                                                                \
4651   vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64];                          \
4652                                                                                \
4653   add block, block, #24;                                                       \
4654   add texture_offset, texture_offset, #0x10;                                   \
4655                                                                                \
4656   add fb_ptr, fb_ptr, #2048;                                                   \
4657   subs sub_tile_height, sub_tile_height, #1;                                   \
4658                                                                                \
4659   bne 4b;                                                                      \
4660                                                                                \
4661   add texture_offset, texture_offset, #0xF00;                                  \
4662   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4663  
4664  
/*
 * setup_sprite_tile_full_8bpp(edge)
 *
 * Emits two blocks (left and right half of the tile row) for each of the
 * sub_tile_height rows of a tile. The edge argument is not used by this
 * variant.
 *
 * block is biased by +16 for the duration of the loop and restored
 * afterwards. Per block: 8 texel bytes are loaded from
 * texture_page_ptr + (offset & texture_mask), where offset is
 * texture_offset for the left block and texture_offset + 8 for the right
 * one, and stored unmodified at block + 16. draw_mask_fb_ptr_left/right,
 * with fb_ptr inserted into its upper 32-bit lane, is stored at block + 40.
 * Blocks are 64 bytes apart (24 + 40).
 *
 * fb_ptr moves 16 bytes to the right for the second block and then on to
 * the next row (rows are 2048 bytes apart). texture_offset advances by 0x10
 * per row and by a further 0xF00 after the last row. num_blocks is written
 * back to psx_gpu at the end.
 *
 * Uses local label 4. Clobbers texture_block_ptr (r14), sub_tile_height,
 * texels and the flags.
 */
4665 #define setup_sprite_tile_full_8bpp(edge)                                      \
4666   setup_sprite_tile_add_blocks(double);                                        \
4667   add block, block, #16;                                                       \
4668                                                                                \
4669  4:                                                                            \
4670   and texture_block_ptr, texture_offset, texture_mask;                         \
4671   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4672                                                                                \
4673   pld [fb_ptr];                                                                \
4674   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4675   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4676                                                                                \
4677   add texture_block_ptr, texture_offset, #8;                                   \
4678   vst1.u32 { texels }, [block, :64];                                           \
4679                                                                                \
4680   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4681   add block, block, #24;                                                       \
4682                                                                                \
4683   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4684                                                                                \
4685   add fb_ptr, fb_ptr, #16;                                                     \
4686   vst1.u32 { draw_mask_fb_ptr_left }, [block, :64];                            \
4687                                                                                \
4688   add block, block, #40;                                                       \
4689   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4690   pld [fb_ptr];                                                                \
4691                                                                                \
4692   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4693   vst1.u32 { texels }, [block, :64];                                           \
4694   add block, block, #24;                                                       \
4695                                                                                \
4696   add texture_offset, texture_offset, #0x10;                                   \
4697   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4698                                                                                \
4699   vst1.u32 { draw_mask_fb_ptr_right }, [block, :64];                           \
4700   add block, block, #40;                                                       \
4701                                                                                \
4702   subs sub_tile_height, sub_tile_height, #1;                                   \
4703   bne 4b;                                                                      \
4704                                                                                \
4705   sub block, block, #16;                                                       \
4706   add texture_offset, texture_offset, #0xF00;                                  \
4707   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4708
4709   
/*
 * setup_sprite_tile_half_8bpp(edge)
 *
 * Emits one block for each of the sub_tile_height rows of a tile, using
 * draw_mask_fb_ptr_<edge> (edge is "left" or "right") as the draw mask /
 * framebuffer pointer vector.
 *
 * block is biased by +16 for the duration of the loop and restored
 * afterwards. Per block: 8 texel bytes are loaded from
 * texture_page_ptr + (texture_offset & texture_mask) and stored unmodified
 * at block + 16. draw_mask_fb_ptr_<edge>, with fb_ptr inserted into its
 * upper 32-bit lane, is stored at block + 40. Blocks are 64 bytes apart
 * (24 + 40).
 *
 * fb_ptr advances by one row (2048 bytes) and texture_offset by 0x10 per
 * block; texture_offset gets a further 0xF00 after the last row. num_blocks
 * is written back to psx_gpu at the end.
 *
 * Uses local label 4. Clobbers texture_block_ptr (r14), sub_tile_height,
 * texels and the flags.
 */
4710 #define setup_sprite_tile_half_8bpp(edge)                                      \
4711   setup_sprite_tile_add_blocks(single);                                        \
4712   add block, block, #16;                                                       \
4713                                                                                \
4714  4:                                                                            \
4715   and texture_block_ptr, texture_offset, texture_mask;                         \
4716   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4717   pld [fb_ptr];                                                                \
4718                                                                                \
4719   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4720   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4721                                                                                \
4722   vst1.u32 { texels }, [block, :64];                                           \
4723   add block, block, #24;                                                       \
4724                                                                                \
4725   vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64];                          \
4726   add block, block, #40;                                                       \
4727                                                                                \
4728   add texture_offset, texture_offset, #0x10;                                   \
4729   add fb_ptr, fb_ptr, #2048;                                                   \
4730                                                                                \
4731   subs sub_tile_height, sub_tile_height, #1;                                   \
4732   bne 4b;                                                                      \
4733                                                                                \
4734   sub block, block, #16;                                                       \
4735   add texture_offset, texture_offset, #0xF00;                                  \
4736   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4737
4738  
/*
 * Column edge adjustments, selected by token pasting on the edge mode
 * (half/full) and, for half columns, on the edge (left/right).
 *
 * pre_adjust:  initialises texture_offset from texture_offset_base. A half
 *              column on the right edge additionally starts 8 texel bytes
 *              into the tile and 16 bytes further right in the framebuffer.
 * post_adjust: undoes the framebuffer shift of the half/right case; every
 *              other variant expands to nothing.
 */
4739 #define setup_sprite_tile_column_edge_pre_adjust_half_right()                  \
4740   add texture_offset, texture_offset_base, #8;                                 \
4741   add fb_ptr, fb_ptr, #16                                                      \
4742
4743 #define setup_sprite_tile_column_edge_pre_adjust_half_left()                   \
4744   mov texture_offset, texture_offset_base                                      \
4745
4746 #define setup_sprite_tile_column_edge_pre_adjust_half(edge)                    \
4747   setup_sprite_tile_column_edge_pre_adjust_half_##edge()                       \
4748
4749 #define setup_sprite_tile_column_edge_pre_adjust_full(edge)                    \
4750   mov texture_offset, texture_offset_base                                      \
4751
4752 #define setup_sprite_tile_column_edge_post_adjust_half_right()                 \
4753   sub fb_ptr, fb_ptr, #16                                                      \
4754
4755 #define setup_sprite_tile_column_edge_post_adjust_half_left()                  \
4756
4757 #define setup_sprite_tile_column_edge_post_adjust_half(edge)                   \
4758   setup_sprite_tile_column_edge_post_adjust_half_##edge()                      \
4759
4760 #define setup_sprite_tile_column_edge_post_adjust_full(edge)                   \
4761
4762
/*
 * setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,
 *                                        x4mode)
 *
 * Emits one column that fits in a single tile vertically: column_data holds
 * the row count and is copied to sub_tile_height. The tile macro is
 * selected by pasting edge_mode, texture_mode and x4mode together, and is
 * wrapped in the matching pre/post edge adjustment.
 */
4763 #define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,  \
4764  x4mode)                                                                       \
4765   mov sub_tile_height, column_data;                                            \
4766   setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
4767   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4768   setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
4769
/*
 * setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,
 *                                       x4mode)
 *
 * Emits one column that spans several tiles vertically. column_data is read
 * as: bits 0-7 = rows of the first tile, bits 8-15 = rows of the last tile,
 * bits 16 and up = number of tiles that follow the first one.
 *
 * The first tile is emitted, then (count - 1) tiles of 16 rows each in the
 * loop at label 3, then the last tile at label 2.
 *
 * Uses local labels 2 and 3 (the tile macros use 4). Clobbers
 * sub_tile_height, tiles_remaining and the flags in addition to whatever
 * the tile macro clobbers.
 *
 * NOTE(review): a count of 0 in bits 16 and up is not handled (the counter
 * is decremented before it is first tested) - confirm that this variant is
 * only selected when the column has at least two tiles.
 */
4770 #define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,   \
4771  x4mode)                                                                       \
4772   and sub_tile_height, column_data, #0xFF;                                     \
4773   mov tiles_remaining, column_data, lsr #16;                                   \
4774   setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
4775   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4776                                                                                \
4777   subs tiles_remaining, tiles_remaining, #1;                                   \
4778   beq 2f;                                                                      \
4779                                                                                \
4780  3:                                                                            \
4781   mov sub_tile_height, #16;                                                    \
4782   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4783   subs tiles_remaining, tiles_remaining, #1;                                   \
4784   bne 3b;                                                                      \
4785                                                                                \
4786  2:                                                                            \
4787   uxtb sub_tile_height, column_data, ror #8;                                   \
4788   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4789   setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
4790
4791
/*
 * setup_sprite_column_data_single()
 *
 * column_data = height, and texture_page_ptr is loaded from psx_gpu.
 */
4792 #define setup_sprite_column_data_single()                                      \
4793   mov column_data, height;                                                     \
4794   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]            \
4795
/*
 * setup_sprite_column_data_multi()
 *
 * Packs the per-column row counts into column_data:
 *   bits 0-7   = 16 - offset_v
 *   bits 8-15  = (height_rounded & 0xF) + 1
 *   bits 16+   = tile_height - 1
 * and loads texture_page_ptr from psx_gpu.
 *
 * column_data shares r9 with offset_v and texture_page_ptr shares r3 with
 * tile_height, so offset_v and tile_height are consumed before their
 * registers are overwritten. height_rounded and tile_height are modified.
 */
4796 #define setup_sprite_column_data_multi()                                       \
4797   and height_rounded, height_rounded, #0xF;                                    \
4798   rsb column_data, offset_v, #16;                                              \
4799                                                                                \
4800   add height_rounded, height_rounded, #1;                                      \
4801   sub tile_height, tile_height, #1;                                            \
4802                                                                                \
4803   orr column_data, column_data, tile_height, lsl #16;                          \
4804   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset];           \
4805                                                                                \
4806   orr column_data, column_data, height_rounded, lsl #8                         \
4807
/*
 * Draw mask setup. Each macro replicates one byte of block_masks into every
 * byte of draw_mask_fb_ptr_left and another into draw_mask_fb_ptr_right;
 * the tile macros later overwrite the upper 32-bit lane with fb_ptr.
 *
 * The left variants use bytes 0 and 1 of block_masks, the right variant
 * bytes 4 and 5.
 */
4808 #define setup_sprite_setup_left_draw_mask_fb_ptr()                             \
4809   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4810   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
4811
/*
 * Same as the left variant, and also computes fb_ptr_advance_column =
 * 32 - height * 2048: the amount added to fb_ptr after a column to go back
 * up by height rows and 32 bytes to the right.
 */
4812 #define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column()              \
4813   mov fb_ptr_advance_column, #32;                                              \
4814   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4815                                                                                \
4816   sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11;           \
4817   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
4818
4819 #define setup_sprite_setup_right_draw_mask_fb_ptr()                            \
4820   vdup.u8 draw_mask_fb_ptr_left, block_masks[4];                               \
4821   vdup.u8 draw_mask_fb_ptr_right, block_masks[5]                               \
4822
/*
 * setup_sprite_tile_column_width_single(tm, multi_height, edge_mode, edge,
 *                                       x4mode)
 *
 * Defines the label
 * setup_sprite_<tm>_single_<multi_height>_<edge_mode>_<edge><x4mode> and
 * the code for a sprite that is a single tile column wide.
 *
 * The two 32-bit lanes of block_masks are ORed together (vext swaps the
 * lanes, vorr combines them) before the draw masks are set up, so the one
 * column is drawn with both lanes' masks applied. Presumably the lanes hold
 * the left-edge and right-edge masks - confirm against the code that fills
 * block_masks.
 *
 * The code ends by popping r4-r11 and the return address, so it has to be
 * entered with { r4 - r11, r14 } on top of the stack.
 */
4823 #define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode,     \
4824  edge, x4mode)                                                                 \
4825  setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode:     \
4826   setup_sprite_column_data_##multi_height();                                   \
4827   vext.32 block_masks_shifted, block_masks, block_masks, #1;                   \
4828   vorr.u32 block_masks, block_masks, block_masks_shifted;                      \
4829   setup_sprite_setup_left_draw_mask_fb_ptr##x4mode();                          \
4830                                                                                \
4831   setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
4832   restore_abi_regs();                                                          \
4833   ldmia sp!, { r4 - r11, pc }                                                  \
4834
/*
 * setup_sprite_tiled_advance_column()
 *
 * Advances texture_offset_base by 0x100. If bits 8-11 are zero afterwards
 * (the 4-bit field wrapped around), 0x1000 is subtracted again, so the
 * carry out of that field is undone. Clobbers the flags.
 */
4835 #define setup_sprite_tiled_advance_column()                                    \
4836   add texture_offset_base, texture_offset_base, #0x100;                        \
4837   tst texture_offset_base, #0xF00;                                             \
4838   subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00)             \
4839
/*
 * setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,
 *                                      right_mode, x4mode)
 *
 * Defines the label
 * setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode><x4mode>
 * and the code for a sprite that is at least two tile columns wide.
 *
 *  - The first column is emitted in left_mode with the left draw masks. It
 *    is passed edge "right", i.e. a half column uses the right half of its
 *    tile.
 *  - tile_width - 2 middle columns are emitted in full mode with both draw
 *    mask vectors cleared to zero (loop at label 0; skipped when
 *    tile_width is 2).
 *  - The last column is emitted in right_mode with the right draw masks. It
 *    is passed edge "left".
 *
 * After each column except the last, fb_ptr is advanced by
 * fb_ptr_advance_column and texture_offset_base by
 * setup_sprite_tiled_advance_column().
 *
 * The flags from "subs tile_width" are consumed by "beq 1f"; the "add" in
 * between does not modify them.
 *
 * Uses local labels 0 and 1. The code ends by popping r4-r11 and the return
 * address, so it has to be entered with { r4 - r11, r14 } on top of the
 * stack.
 */
4840 #define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,      \
4841  right_mode, x4mode)                                                           \
4842  setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
4843   setup_sprite_column_data_##multi_height();                                   \
4844                                                                                \
4845   setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode();           \
4846                                                                                \
4847   setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
4848                                                                                \
4849   subs tile_width, tile_width, #2;                                             \
4850   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4851                                                                                \
4852   beq 1f;                                                                      \
4853                                                                                \
4854   vmov.u8 draw_masks_fb_ptrs, #0;                                              \
4855   vmov.u8 draw_masks_fb_ptrs2, #0;                                             \
4856                                                                                \
4857  0:                                                                            \
4858   setup_sprite_tiled_advance_column();                                         \
4859   setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode);      \
4860   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4861   subs tile_width, tile_width, #1;                                             \
4862   bne 0b;                                                                      \
4863                                                                                \
4864  1:                                                                            \
4865   setup_sprite_setup_right_draw_mask_fb_ptr##x4mode();                         \
4866                                                                                \
4867   setup_sprite_tiled_advance_column();                                         \
4868   setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
4869   restore_abi_regs();                                                          \
4870   ldmia sp!, { r4 - r11, pc }                                                  \
4871
4872
/*
 * Normal-resolution variants of the hooks that the sprite dispatcher pastes
 * in with an empty x4mode suffix. The matching _4x variants are defined
 * separately.
 */

/* No extra u-offset adjustment is needed at normal resolution. */
4873 #define setup_sprite_offset_u_adjust()                                         \
4874
/* Keep only bits 0-7 of left_block_mask. */
4875 #define setup_sprite_get_left_block_mask()                                     \
4876   and left_block_mask, left_block_mask, #0xFF                                  \
4877
/* Z is set when all eight left mask bits are set. */
4878 #define setup_sprite_compare_left_block_mask()                                 \
4879   cmp left_block_mask, #0xFF                                                   \
4880
/* Extract bits 8-15 of right_block_mask, zero-extended; flags untouched. */
4881 #define setup_sprite_get_right_block_mask()                                    \
4882   uxtb right_block_mask, right_block_mask, ror #8                              \
4883
/* Z is set when all eight right mask bits are set. */
4884 #define setup_sprite_compare_right_block_mask()                                \
4885   cmp right_block_mask, #0xFF                                                  \
4886
4887
4888
4889 /* 4x stuff */
/*
 * fb_ptr2 is a second name for the column_data register. The 4x tile macros
 * push column_data before using fb_ptr2 as scratch and pop it afterwards.
 */
4890 #define fb_ptr2 column_data
4891
/*
 * 4x variant of the u-offset hook: moves fb_ptr back by a further
 * offset_u * 2 bytes, then doubles offset_u and sets
 * offset_u_right = offset_u_right * 2 + 1.
 */
4892 #define setup_sprite_offset_u_adjust_4x()                                      \
4893   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
4894   lsl offset_u_right, #1;                                                      \
4895   lsl offset_u, #1;                                                            \
4896   add offset_u_right, #1                                                       \
4897
/* Sign-extend bits 0-15 of left_block_mask to 32 bits. */
4898 #define setup_sprite_get_left_block_mask_4x()                                  \
4899   sxth left_block_mask, left_block_mask                                        \
4900
/* Z is set when all sixteen left mask bits were set (sign-extended to -1). */
4901 #define setup_sprite_compare_left_block_mask_4x()                              \
4902   cmp left_block_mask, #0xFFFFFFFF                                             \
4903
/* Sign-extend bits 16-31 of right_block_mask to 32 bits; flags untouched. */
4904 #define setup_sprite_get_right_block_mask_4x()                                 \
4905   sxth right_block_mask, right_block_mask, ror #16                             \
4906
/* Z is set when all sixteen right mask bits were set (sign-extended to -1). */
4907 #define setup_sprite_compare_right_block_mask_4x()                             \
4908   cmp right_block_mask, #0xFFFFFFFF                                            \
4909
4910
/*
 * Double every 16-bit element of texels_: the register is copied to both
 * texels_wide_low and texels_wide_high, and zipping the two identical copies
 * leaves each element twice in a row across texels_wide_low:texels_wide_high.
 */
4911 #define widen_texels_16bpp(texels_)                                            \
4912   vmov texels_wide_low, texels_;                                               \
4913   vmov texels_wide_high, texels_;                                              \
4914   vzip.16 texels_wide_low, texels_wide_high                                    \
4915
/* Same as widen_texels_16bpp, but doubles every 8-bit element of texels_. */
4916 #define widen_texels_8bpp(texels_)                                             \
4917   vmov texels_wide_low, texels_;                                               \
4918   vmov texels_wide_high, texels_;                                              \
4919   vzip.8 texels_wide_low, texels_wide_high                                     \
4920
/*
 * Store one 64-byte block entry and step block_ to the next one.
 *   +0  : texels_ (128-bit store; block_ must be 16-byte aligned)
 *   +40 : draw_mask_fb_ptr_ (64-bit store) after its 32-bit lane 1 has been
 *         overwritten with fb_ptr_; lane 0 is stored as supplied
 * block_ advances by 40 + 24 = 64 bytes in total.
 */
4921 #define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)         \
4922   vst1.u32 { texels_ }, [block_, :128];                                        \
4923   add block_, block_, #40;                                                     \
4924                                                                                \
4925   vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
4926   vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64];                               \
4927   add block_, block_, #24                                                      \
4928
/*
 * 8bpp counterpart of write_block_16bpp. Stores the 64-bit texels_ at the
 * current block_, then 24 bytes later the 64-bit draw_mask_fb_ptr_ with its
 * lane 1 set to fb_ptr_. block_ advances by 24 + 40 = 64 bytes, so it keeps
 * the same offset into the following block.
 */
4929 /* assumes 16-byte offset already added to block_ */
4930 #define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)          \
4931   vst1.u32 { texels_ }, [block_, :64];                                         \
4932   add block_, block_, #24;                                                     \
4933                                                                                \
4934   vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
4935   vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64];                               \
4936   add block_, block_, #40                                                      \
4937
/*
 * Emit four blocks that double the 16-bit texels held in texels_low and
 * texels_high both horizontally (widen_texels_16bpp) and vertically (each
 * widened group is written for two framebuffer addresses 1024*2 bytes
 * apart).
 *   fb_ptr_tmp          - scratch core register for derived fb addresses
 *   draw_mask_fb_ptr_a_ - mask register used for the two texels_low blocks
 *   draw_mask_fb_ptr_b_ - mask register used for the two texels_high blocks
 * Block order (byte offsets from fb_ptr): 0, 1024*2, 8*2, 8*2 + 1024*2.
 * Lane 1 of both mask registers and texels_wide_low/high are clobbered;
 * block advances by four entries. fb_ptr itself is not modified.
 * NOTE(review): texels_wide is presumably the quad register formed by
 * texels_wide_low/texels_wide_high - confirm against the register aliases.
 */
4938 #define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,             \
4939  draw_mask_fb_ptr_b_)                                                          \
4940   widen_texels_16bpp(texels_low);                                              \
4941   add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
4942                                                                                \
4943   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr);          \
4944                                                                                \
4945   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);      \
4946   widen_texels_16bpp(texels_high);                                             \
4947                                                                                \
4948   add fb_ptr_tmp, fb_ptr, #8*2;                                                \
4949   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);      \
4950                                                                                \
4951   add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
4952   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)       \
4953
/*
 * 8bpp counterpart of do_texture_block_16bpp_4x. The 8-bit texels in texels
 * are doubled once (widen_texels_8bpp); texels_wide_low is then written for
 * fb_ptr and fb_ptr + 1024*2, and texels_wide_high for fb_ptr + 8*2 and
 * fb_ptr + 8*2 + 1024*2 (byte offsets).
 *   fb_ptr_tmp          - scratch core register for derived fb addresses
 *   draw_mask_fb_ptr_a_ - mask register used for the texels_wide_low blocks
 *   draw_mask_fb_ptr_b_ - mask register used for the texels_wide_high blocks
 * block must already carry the 16-byte offset write_block_8bpp expects.
 * Lane 1 of both mask registers is clobbered; fb_ptr is not modified.
 */
4954 #define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,              \
4955  draw_mask_fb_ptr_b_)                                                          \
4956   widen_texels_8bpp(texels);                                                   \
4957   add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
4958                                                                                \
4959   write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr);       \
4960   write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);   \
4961                                                                                \
4962   add fb_ptr_tmp, fb_ptr, #8*2;                                                \
4963   write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);  \
4964                                                                                \
4965   add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
4966   write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)   \
4967
4968
/*
 * 4bpp, 4x mode: load the CLUT from psx_gpu->clut_ptr (16-byte aligned)
 * into clut_a/clut_b and de-interleave it by byte, so clut_a ends up with
 * the even-numbered bytes and clut_b with the odd-numbered ones.
 * NOTE(review): the clut_low_* / clut_high_* registers used by the vtbl
 * lookups in the tile macros are presumably the halves of clut_a / clut_b -
 * confirm against the register aliases.
 */
4969 #define setup_sprite_tiled_initialize_4bpp_4x()                                \
4970   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset];                           \
4971   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128];                               \
4972                                                                                \
4973   vuzp.u8 clut_a, clut_b                                                       \
4974
/* 8bpp, 4x mode: nothing to initialize. */
4975 #define setup_sprite_tiled_initialize_8bpp_4x()                                \
4976
4977
/*
 * Operand fragments (not complete instructions) selected by name through
 * setup_sprite_tile_add_blocks(), which is defined outside this excerpt.
 * single_4x: sub_tile_height * 4. This matches the four write_block_* calls
 * that one do_texture_block_*_4x makes per texture row.
 */
4978 #define setup_sprite_block_count_single_4x()                                   \
4979   sub_tile_height, lsl #2                                                      \
4980
/* double_4x: sub_tile_height * 8 (two do_texture_block_*_4x per row). */
4981 #define setup_sprite_block_count_double_4x()                                   \
4982   sub_tile_height, lsl #(1+2)                                                  \
4983
/*
 * Draw sub_tile_height texture rows of a full-width tile from a 4bpp
 * texture in 4x mode. The edge argument is not used by this variant.
 *
 * setup_sprite_tile_add_blocks(double_4x) is expanded first (defined outside
 * this excerpt). column_data is pushed because fb_ptr2 aliases it and is
 * used as scratch inside the loop; it is popped again after the loop.
 *
 * Each pass of loop 4: handles one texture row:
 *  - load 8 bytes from texture_page_ptr + (texture_offset & texture_mask)
 *  - look each byte up in the clut_low and clut_high tables (vtbl.8) and
 *    zip the two results together
 *  - emit 4 blocks via do_texture_block_16bpp_4x with the left mask pair
 *  - do the same for the 8 bytes at ((texture_offset + 8) & texture_mask),
 *    with fb_ptr moved 16*2 bytes to the right and the right mask pair
 *  - texture_offset += 0x10; fb_ptr nets 2048*2 bytes per pass
 *
 * Afterwards texture_offset is bumped by 0xF00 and num_blocks is stored back
 * to psx_gpu. Clobbers texture_block_ptr, the texel registers, lane 1 of the
 * four draw mask registers and the flags.
 */
4984 #define setup_sprite_tile_full_4bpp_4x(edge)                                   \
4985   setup_sprite_tile_add_blocks(double_4x);                                     \
4986   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
4987                                                                                \
4988  4:                                                                            \
4989   and texture_block_ptr, texture_offset, texture_mask;                         \
4990   pld [fb_ptr];                                                                \
4991                                                                                \
4992   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4993   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4994                                                                                \
4995   add texture_block_ptr, texture_offset, #8;                                   \
4996   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4997                                                                                \
4998   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4999   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5000                                                                                \
5001   vzip.8 texels_low, texels_high;                                              \
5002   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                  \
5003    draw_mask_fb_ptr_left_b);                                                   \
5004                                                                                \
5005   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5006   pld [fb_ptr, #2048];                                                         \
5007                                                                                \
5008   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5009   add fb_ptr, fb_ptr, #16*2;                                                   \
5010                                                                                \
5011   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
5012   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5013                                                                                \
5014   vzip.8 texels_low, texels_high;                                              \
5015   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                 \
5016    draw_mask_fb_ptr_right_b);                                                  \
5017                                                                                \
5018   add texture_offset, texture_offset, #0x10;                                   \
5019   add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
5020                                                                                \
5021   subs sub_tile_height, sub_tile_height, #1;                                   \
5022   bne 4b;                                                                      \
5023                                                                                \
5024   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5025   add texture_offset, texture_offset, #0xF00;                                  \
5026   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5027
5028
/*
 * Draw sub_tile_height texture rows of one half of a tile (8 texture bytes
 * per row) from a 4bpp texture in 4x mode.
 *   edge - left or right; selects draw_mask_fb_ptr_<edge>_a/_b
 *
 * setup_sprite_tile_add_blocks(single_4x) is expanded first (defined outside
 * this excerpt). column_data is pushed because fb_ptr2 aliases it and is
 * used as scratch inside the loop; it is popped again after the loop.
 *
 * Each pass of loop 4: loads 8 bytes from
 * texture_page_ptr + (texture_offset & texture_mask), looks each byte up in
 * the clut_low and clut_high tables (vtbl.8), zips the results and emits 4
 * blocks via do_texture_block_16bpp_4x. It then steps texture_offset by
 * 0x10 and fb_ptr by 2048 * 2 bytes.
 *
 * Afterwards texture_offset is bumped by 0xF00 and num_blocks is stored back
 * to psx_gpu.
 *
 * NOTE(review): the second "add texture_block_ptr, texture_page_ptr,
 * texture_block_ptr" (after the vld1) produces a value that is not read
 * again inside this macro before the "and" at 4: overwrites it. It looks
 * like a leftover from the full-tile variant - confirm before removing.
 */
5029 #define setup_sprite_tile_half_4bpp_4x(edge)                                   \
5030   setup_sprite_tile_add_blocks(single_4x);                                     \
5031   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5032                                                                                \
5033  4:                                                                            \
5034   and texture_block_ptr, texture_offset, texture_mask;                         \
5035   pld [fb_ptr];                                                                \
5036                                                                                \
5037   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5038   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5039                                                                                \
5040   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5041   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
5042                                                                                \
5043   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5044   add texture_offset, texture_offset, #0x10;                                   \
5045                                                                                \
5046   vzip.8 texels_low, texels_high;                                              \
5047   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,              \
5048    draw_mask_fb_ptr_##edge##_b);                                               \
5049                                                                                \
5050   pld [fb_ptr, #2048];                                                         \
5051   add fb_ptr, fb_ptr, #2048 * 2;                                               \
5052                                                                                \
5053   subs sub_tile_height, sub_tile_height, #1;                                   \
5054   bne 4b;                                                                      \
5055                                                                                \
5056   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5057   add texture_offset, texture_offset, #0xF00;                                  \
5058   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5059
5060
/*
 * Draw sub_tile_height texture rows of a full-width tile from an 8bpp
 * texture in 4x mode. The edge argument is not used by this variant.
 *
 * setup_sprite_tile_add_blocks(double_4x) is expanded first (defined outside
 * this excerpt). block is offset by 16 bytes for the duration of the loop,
 * as write_block_8bpp requires, and restored afterwards. column_data is
 * pushed because fb_ptr2 aliases it and is used as scratch.
 *
 * Each pass of loop 4: handles one texture row:
 *  - load 8 bytes from texture_page_ptr + (texture_offset & texture_mask)
 *    and emit 4 blocks via do_texture_block_8bpp_4x with the left mask pair
 *  - load the 8 bytes at ((texture_offset + 8) & texture_mask) and emit 4
 *    more blocks with fb_ptr moved 16*2 bytes to the right and the right
 *    mask pair
 *  - texture_offset += 0x10; fb_ptr nets 2048*2 bytes per pass
 *
 * Afterwards texture_offset is bumped by 0xF00 and num_blocks is stored back
 * to psx_gpu.
 */
5061 #define setup_sprite_tile_full_8bpp_4x(edge)                                   \
5062   setup_sprite_tile_add_blocks(double_4x);                                     \
5063   add block, block, #16;                                                       \
5064   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5065                                                                                \
5066  4:                                                                            \
5067   and texture_block_ptr, texture_offset, texture_mask;                         \
5068   pld [fb_ptr];                                                                \
5069                                                                                \
5070   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5071   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5072                                                                                \
5073   add texture_block_ptr, texture_offset, #8;                                   \
5074   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                   \
5075    draw_mask_fb_ptr_left_b);                                                   \
5076                                                                                \
5077   pld [fb_ptr, #2048];                                                         \
5078   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
5079                                                                                \
5080   add fb_ptr, fb_ptr, #16*2;                                                   \
5081   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5082                                                                                \
5083   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5084                                                                                \
5085   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                  \
5086    draw_mask_fb_ptr_right_b);                                                  \
5087                                                                                \
5088   add texture_offset, texture_offset, #0x10;                                   \
5089   add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
5090                                                                                \
5091   subs sub_tile_height, sub_tile_height, #1;                                   \
5092   bne 4b;                                                                      \
5093                                                                                \
5094   sub block, block, #16;                                                       \
5095   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5096   add texture_offset, texture_offset, #0xF00;                                  \
5097   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5098
5099   
/*
 * Draw sub_tile_height texture rows of one half of a tile (8 texture bytes
 * per row) from an 8bpp texture in 4x mode.
 *   edge - left or right; selects draw_mask_fb_ptr_<edge>_a/_b
 *
 * setup_sprite_tile_add_blocks(single_4x) is expanded first (defined outside
 * this excerpt). block is offset by 16 bytes for the duration of the loop,
 * as write_block_8bpp requires, and restored afterwards. column_data is
 * pushed because fb_ptr2 aliases it and is used as scratch.
 *
 * Each pass of loop 4: loads 8 bytes from
 * texture_page_ptr + (texture_offset & texture_mask), emits 4 blocks via
 * do_texture_block_8bpp_4x, then steps texture_offset by 0x10 and fb_ptr by
 * 2048 * 2 bytes.
 *
 * Afterwards texture_offset is bumped by 0xF00 and num_blocks is stored back
 * to psx_gpu.
 */
5100 #define setup_sprite_tile_half_8bpp_4x(edge)                                   \
5101   setup_sprite_tile_add_blocks(single_4x);                                     \
5102   add block, block, #16;                                                       \
5103   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5104                                                                                \
5105  4:                                                                            \
5106   and texture_block_ptr, texture_offset, texture_mask;                         \
5107   pld [fb_ptr];                                                                \
5108                                                                                \
5109   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5110   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5111                                                                                \
5112   pld [fb_ptr, #2048];                                                         \
5113   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,               \
5114    draw_mask_fb_ptr_##edge##_b);                                               \
5115                                                                                \
5116   add texture_offset, texture_offset, #0x10;                                   \
5117   add fb_ptr, fb_ptr, #2048 * 2;                                               \
5118                                                                                \
5119   subs sub_tile_height, sub_tile_height, #1;                                   \
5120   bne 4b;                                                                      \
5121                                                                                \
5122   sub block, block, #16;                                                       \
5123   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5124   add texture_offset, texture_offset, #0xF00;                                  \
5125   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5126
5127  
/*
 * 4x-mode adjustments applied around a column, chosen by mode (half or
 * full) and edge (left or right).
 * NOTE(review): the call sites are in setup_sprite_tile_column_height_*,
 * outside this excerpt - confirm the pre/post pairing there.
 */

/* Right half: start 8 bytes into the texture row, 16*2 bytes right in fb. */
5128 #define setup_sprite_tile_column_edge_pre_adjust_half_right_4x()               \
5129   add texture_offset, texture_offset_base, #8;                                 \
5130   add fb_ptr, fb_ptr, #16 * 2                                                  \
5131
/* Left half: start at the column base; fb_ptr is left alone. */
5132 #define setup_sprite_tile_column_edge_pre_adjust_half_left_4x()                \
5133   mov texture_offset, texture_offset_base                                      \
5134
/* Dispatch on edge to one of the two half-tile pre-adjust macros. */
5135 #define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge)                 \
5136   setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x()                  \
5137
/* Full tile: start at the column base regardless of edge. */
5138 #define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge)                 \
5139   mov texture_offset, texture_offset_base                                      \
5140
/* Right half: undo the 16*2 byte fb_ptr shift made by the pre-adjust. */
5141 #define setup_sprite_tile_column_edge_post_adjust_half_right_4x()              \
5142   sub fb_ptr, fb_ptr, #16 * 2                                                  \
5143
/* Left half: nothing to undo. */
5144 #define setup_sprite_tile_column_edge_post_adjust_half_left_4x()               \
5145
/* Dispatch on edge to one of the two half-tile post-adjust macros. */
5146 #define setup_sprite_tile_column_edge_post_adjust_half_4x(edge)                \
5147   setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x()                 \
5148
/* Full tile: nothing to undo. */
5149 #define setup_sprite_tile_column_edge_post_adjust_full_4x(edge)                \
5150
5151
/*
 * 4x mode: load the four draw mask registers for the left-most column.
 * Each register gets one byte of block_masks (bytes 0-3) replicated into
 * all of its byte lanes. write_block_* later overwrites 32-bit lane 1 with
 * the framebuffer pointer.
 */
5152 #define setup_sprite_setup_left_draw_mask_fb_ptr_4x()                          \
5153   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
5154   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
5155   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
5156   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
5157
/*
 * Same mask setup as setup_sprite_setup_left_draw_mask_fb_ptr_4x,
 * interleaved with computing
 *   fb_ptr_advance_column = 32 * 2 - (height << 12)
 * ("lsl #11 + 1" is a shift by 12). Adding this to fb_ptr after a column
 * moves 64 bytes to the right and height * 4096 bytes back up.
 */
5158 #define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x()           \
5159   mov fb_ptr_advance_column, #32 * 2;                                          \
5160   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
5161   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
5162   sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1;       \
5163   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
5164   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
5165
/*
 * 4x mode: load the four draw mask registers for the right-most column from
 * bytes 4-7 of block_masks, each replicated into all byte lanes.
 */
5166 #define setup_sprite_setup_right_draw_mask_fb_ptr_4x()                         \
5167   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4];                             \
5168   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5];                             \
5169   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6];                            \
5170   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7]                             \
5171
5172
5173 // r0: psx_gpu
5174 // r1: x
5175 // r2: y
5176 // r3: u
5177 // [sp]: v
5178 // [sp + 4]: width
5179 // [sp + 8]: height
5180 // [sp + 12]: color (unused)
5181
5182 #define setup_sprite_tiled_builder(texture_mode, x4mode)                       \
5183                                                                                \
5184 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, full,        \
5185   x4mode);                                                                     \
5186 setup_sprite_tile_column_width_single(texture_mode, multi,  full, none,        \
5187   x4mode);                                                                     \
5188 setup_sprite_tile_column_width_multi(texture_mode,  single, full, full,        \
5189   x4mode);                                                                     \
5190 setup_sprite_tile_column_width_single(texture_mode, single, full, none,        \
5191   x4mode);                                                                     \
5192 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, full,        \
5193   x4mode);                                                                     \
5194 setup_sprite_tile_column_width_single(texture_mode, multi,  half, right,       \
5195   x4mode);                                                                     \
5196 setup_sprite_tile_column_width_multi(texture_mode,  single, half, full,        \
5197   x4mode);                                                                     \
5198 setup_sprite_tile_column_width_single(texture_mode, single, half, right,       \
5199   x4mode);                                                                     \
5200 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, half,        \
5201   x4mode);                                                                     \
5202 setup_sprite_tile_column_width_single(texture_mode, multi,  half, left,        \
5203   x4mode);                                                                     \
5204 setup_sprite_tile_column_width_multi(texture_mode,  single, full, half,        \
5205   x4mode);                                                                     \
5206 setup_sprite_tile_column_width_single(texture_mode, single, half, left,        \
5207   x4mode);                                                                     \
5208 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, half,        \
5209   x4mode);                                                                     \
5210 setup_sprite_tile_column_width_multi(texture_mode,  single, half, half,        \
5211   x4mode);                                                                     \
5212                                                                                \
5213 .align 4;                                                                      \
5214                                                                                \
5215 function(setup_sprite_##texture_mode##x4mode)                                  \
5216   stmdb sp!, { r4 - r11, r14 };                                                \
5217   setup_sprite_tiled_initialize_##texture_mode##x4mode();                      \
5218                                                                                \
5219   ldr v, [sp, #36];                                                            \
5220   and offset_u, u, #0xF;                                                       \
5221                                                                                \
5222   ldr width, [sp, #40];                                                        \
5223   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
5224                                                                                \
5225   ldr height, [sp, #44];                                                       \
5226   add fb_ptr, fb_ptr, y, lsl #11;                                              \
5227                                                                                \
5228   save_abi_regs();                                                             \
5229                                                                                \
5230   add fb_ptr, fb_ptr, x, lsl #1;                                               \
5231   and offset_v, v, #0xF;                                                       \
5232                                                                                \
5233   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
5234   add width_rounded, offset_u, width;                                          \
5235                                                                                \
5236   add height_rounded, offset_v, height;                                        \
5237   add width_rounded, width_rounded, #15;                                       \
5238                                                                                \
5239   add height_rounded, height_rounded, #15;                                     \
5240   mov tile_width, width_rounded, lsr #4;                                       \
5241                                                                                \
5242   /* texture_offset_base = VH-VL-00-00                                       */\
5243   mov texture_offset_base, v, lsl #8;                                          \
5244   and offset_u_right, width_rounded, #0xF;                                     \
5245                                                                                \
5246   /* texture_offset_base = VH-UH-UL-00                                       */\
5247   bfi texture_offset_base, u, #4, #8;                                          \
5248   mov right_block_mask, #0xFFFFFFFE;                                           \
5249                                                                                \
5250   setup_sprite_offset_u_adjust##x4mode();                                      \
5251                                                                                \
5252   /* texture_offset_base = VH-UH-VL-00                                       */\
5253   bfi texture_offset_base, v, #4, #4;                                          \
5254   mov left_block_mask, #0xFFFFFFFF;                                            \
5255                                                                                \
5256   mov tile_height, height_rounded, lsr #4;                                     \
5257   mvn left_block_mask, left_block_mask, lsl offset_u;                          \
5258                                                                                \
5259   /* texture_mask = HH-HL-WH-WL                                              */\
5260   ldrh texture_mask, [psx_gpu, #psx_gpu_texture_mask_width_offset];            \
5261   mov right_block_mask, right_block_mask, lsl offset_u_right;                  \
5262                                                                                \
5263   /* texture_mask_rev = WH-WL-HH-HL                                          */\
5264   rev16 texture_mask_rev, texture_mask;                                        \
5265   vmov block_masks, left_block_mask, right_block_mask;                         \
5266                                                                                \
5267   /* texture_mask = HH-HL-HL-WL                                              */\
5268   bfi texture_mask, texture_mask_rev, #4, #4;                                  \
5269   /* texture_mask_rev = 00-00-00-WH                                          */\
5270   mov texture_mask_rev, texture_mask_rev, lsr #12;                             \
5271                                                                                \
5272   /* texture_mask = HH-WH-HL-WL                                              */\
5273   bfi texture_mask, texture_mask_rev, #8, #4;                                  \
5274   setup_sprite_get_left_block_mask##x4mode();                                  \
5275                                                                                \
5276   mov control_mask, #0;                                                        \
5277   setup_sprite_compare_left_block_mask##x4mode();                              \
5278                                                                                \
5279   setup_sprite_get_right_block_mask##x4mode();                                 \
5280   orreq control_mask, control_mask, #0x4;                                      \
5281                                                                                \
5282   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
5283   setup_sprite_compare_right_block_mask##x4mode();                             \
5284                                                                                \
5285   orreq control_mask, control_mask, #0x8;                                      \
5286   cmp tile_width, #1;                                                          \
5287                                                                                \
5288   add block, psx_gpu, #psx_gpu_blocks_offset;                                  \
5289   orreq control_mask, control_mask, #0x1;                                      \
5290                                                                                \
5291   cmp tile_height, #1;                                                         \
5292   add block, block, num_blocks, lsl #6;                                        \
5293                                                                                \
5294   orreq control_mask, control_mask, #0x2;                                      \
5295   JT_OP_REL(9f, control_mask, temp);                                           \
5296   JT_OP(ldr pc, [pc, control_mask, lsl #2]);                                   \
5297   nop;                                                                         \
5298                                                                                \
5299  9:                                                                            \
5300  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode);   \
5301  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode);  \
5302  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode);  \
5303  .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
5304  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode);   \
5305  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
5306  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode);  \
5307  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
5308  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode);   \
5309  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode);  \
5310  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode);  \
5311  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
5312  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode);   \
5313  .word 0x00000000;                                                             \
5314  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode);  \
5315
5316
/*
 * Instantiate the tiled sprite setup code for the 4bpp and 8bpp texture
 * modes. The second macro argument is left empty here; presumably it is the
 * suffix pasted onto the generated symbol names (x4mode) - TODO confirm
 * against the macro definition.
 */
5317 setup_sprite_tiled_builder(4bpp,);
5318 setup_sprite_tiled_builder(8bpp,);
5319
/*
 * These two aliases are dropped before the _4x variants are instantiated;
 * presumably the 4x helpers do not use them or define their own - TODO
 * confirm.
 */
5320 #undef draw_mask_fb_ptr_left
5321 #undef draw_mask_fb_ptr_right
5322
/* Same builder again, this time with the _4x suffix as second argument. */
5323 setup_sprite_tiled_builder(4bpp, _4x);
5324 setup_sprite_tiled_builder(8bpp, _4x);
5325
5326
5327 #undef block_ptr
5328 #undef num_blocks
5329 #undef clut_ptr
5330
/*
 * Register aliases for texture_sprite_blocks_8bpp.
 * psx_gpu and block_ptr share r0, and texels_01 .. texels_67 reuse the
 * registers of texel_0/2/4/6 (r6-r9), so each of those names is only valid
 * until its alias is written.
 */
5331 #define psx_gpu                                           r0
5332 #define block_ptr                                         r0
5333 #define num_blocks                                        r1
5334 #define clut_ptr                                          r2
5335 #define texel_shift_mask                                  r3
5336 #define block_pixels_a                                    r4
5337 #define block_pixels_b                                    r5
5338 #define texel_0                                           r6
5339 #define texel_2                                           r7
5340 #define texel_4                                           r8
5341 #define texel_6                                           r9
5342 #define texel_1                                           r10
5343 #define texel_3                                           r11
5344 #define texel_5                                           r12
5345 #define texel_7                                           r14
5346 #define texels_01                                         r6
5347 #define texels_23                                         r7
5348 #define texels_45                                         r8
5349 #define texels_67                                         r9
5350
/*
 * texture_sprite_blocks_8bpp
 *
 * In:  r0 = psx_gpu
 *
 * Walks num_blocks 64-byte entries of the block array that starts at
 * psx_gpu + psx_gpu_blocks_offset. For every entry, the eight index bytes
 * stored at +16..+23 are each looked up as a halfword in the table at
 * clut_ptr, and the eight 16-bit results are written back, packed in pairs,
 * to +0..+15 of the same entry.
 *
 * The counter is tested at the bottom of the loop, so the body always runs
 * at least once; a num_blocks of zero is not handled here.
 * The first index word of the following entry is fetched ahead of time, so
 * the final iteration also reads 4 bytes at +16 of the entry after the last
 * one processed.
 *
 * r4-r11 are saved and restored; the function returns by popping the saved
 * lr into pc. r12 is used as scratch without being saved.
 */
5351 function(texture_sprite_blocks_8bpp)
5352   stmdb sp!, { r4 - r11, r14 }
  /* Mask that isolates one index byte already multiplied by 2, i.e. a
   * ready-to-use byte offset into a table of halfwords. */
5353   movw texel_shift_mask, #(0xFF << 1)
5354
5355   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5356   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
5357
  /* r0 stops being psx_gpu here and becomes the entry cursor. */
5358   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
  /* Preload index bytes 0-3 of the first entry. */
5359   ldr block_pixels_a, [block_ptr, #16]
5360
5361  0:
  /* Indices 0-3 come from block_pixels_a, 4-7 from block_pixels_b. Each
   * shift amount (lsl #1, lsr #7, #15, #23) lines byte n up with the mask
   * so the result is index * 2. Table loads are interleaved with the index
   * extraction. */
5362   and texel_0, texel_shift_mask, block_pixels_a, lsl #1
5363   ldr block_pixels_b, [block_ptr, #20]
5364
5365   and texel_1, texel_shift_mask, block_pixels_a, lsr #7
5366   ldrh texel_0, [clut_ptr, texel_0]
5367
5368   and texel_2, texel_shift_mask, block_pixels_a, lsr #15
5369   ldrh texel_1, [clut_ptr, texel_1]
5370
5371   and texel_3, texel_shift_mask, block_pixels_a, lsr #23
  /* block_pixels_a is fully consumed: fetch the next entry's first index
   * word (entries are 64 bytes apart). */
5372   ldr block_pixels_a, [block_ptr, #(64 + 16)]
5373
5374   ldrh texel_2, [clut_ptr, texel_2]
5375   and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5376
5377   ldrh texel_3, [clut_ptr, texel_3]
5378   and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5379
5380   ldrh texel_4, [clut_ptr, texel_4]
5381   and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5382
5383   ldrh texel_5, [clut_ptr, texel_5]
5384   and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5385
  /* Pack pairs: even texel in the low halfword, odd texel in the high one.
   * The destination names alias the even texel registers. */
5386   ldrh texel_6, [clut_ptr, texel_6]
5387   orr texels_01, texel_0, texel_1, lsl #16
5388
5389   ldrh texel_7, [clut_ptr, texel_7]
5390   orr texels_23, texel_2, texel_3, lsl #16
5391
5392   orr texels_45, texel_4, texel_5, lsl #16
5393   str texels_01, [block_ptr, #0]
5394
5395   orr texels_67, texel_6, texel_7, lsl #16
5396   str texels_23, [block_ptr, #4]
5397
  /* The flags set here are consumed by the bne below; the stores and the
   * add in between do not modify them. */
5398   subs num_blocks, num_blocks, #1
5399   str texels_45, [block_ptr, #8]
5400
5401   str texels_67, [block_ptr, #12]
5402   add block_ptr, block_ptr, #64
5403
5404   bne 0b
5405
5406   ldmia sp!, { r4 - r11, pc }
5407
5408
5409 #undef width_rounded
5410 #undef texture_mask
5411 #undef num_blocks
5412 #undef texture_offset
5413 #undef texels_low
5414 #undef texels_high
5415 #undef texels_wide_low
5416 #undef texels_wide_high
5417 #undef texels_wide
5418 #undef fb_ptr2
5419 #undef temp
5420
/*
 * Register aliases shared by setup_sprites_16bpp_flush, setup_sprite_16bpp
 * and setup_sprite_16bpp_4x.
 *
 * Many names map to the same register (for example r4 is v, num_blocks and
 * left_mask_bits; r5 is width, block and right_mask_bits; r1 is x and
 * texture_offset_base), so a value is lost as soon as another alias of its
 * register is written.
 */
5421 #define psx_gpu                                           r0
5422 #define x                                                 r1
5423 #define y                                                 r2
5424 #define u                                                 r3
5425 #define v                                                 r4
5426 #define width                                             r5
5427 #define height                                            r6
5428 #define left_offset                                       r8
5429 #define width_rounded                                     r9
5430 #define right_width                                       r10
5431
5432 #define block_width                                       r11
5433
5434 #define texture_offset_base                               r1
5435 #define texture_mask                                      r2
5436 #define texture_page_ptr                                  r3
5437 #define num_blocks                                        r4
5438 #define block                                             r5
5439 #define fb_ptr                                            r7
5440 #define texture_offset                                    r8
5441 #define blocks_remaining                                  r9
5442 #define fb_ptr2                                           r10
5443 #define fb_ptr_pitch                                      r12
5444 #define texture_block_ptr                                 r14
5445
5446 #define texture_mask_width                                r2
5447 #define texture_mask_height                               r3
5448 #define left_mask_bits                                    r4
5449 #define right_mask_bits                                   r5
5450
5451
5452 #undef block_masks
5453 #undef block_masks_shifted
5454 #undef texels
5455
5456 #define block_masks                                       d0
5457 #define block_masks_shifted                               d1
5458 #define draw_mask_fb_ptr                                  d2
5459 #define texels                                            q2
5460
5461 #define draw_mask_fb_ptr_a                                d2
5462 #define draw_mask_fb_ptr_b                                d3
5463 #define texels_low                                        d4
5464 #define texels_high                                       d5
5465 #define texels_wide_low                                   d6
5466 #define texels_wide_high                                  d7
5467 #define texels_wide                                       q3
5468
5469
/*
 * setup_sprites_16bpp_flush
 *
 * Local helper, entered with "blgt" from the sprite setup routines in this
 * file when adding block_width more blocks would exceed MAX_BLOCKS.
 *
 * Calls flush_render_block_buffer with r0 (psx_gpu) unchanged, preserving
 * d0-d3 and r0-r3, r12, lr (plus whatever EXTRA_UNSAVED_REGS expands to;
 * that macro is defined outside this section) around the call.
 *
 * On return to the caller:
 *   block      (r5) = psx_gpu + psx_gpu_blocks_offset (start of the array)
 *   num_blocks (r4) = block_width (r11)
 * i.e. the caller's pending blocks become the first ones of an empty buffer.
 * Other NEON registers are not saved here.
 */
5470 setup_sprites_16bpp_flush:
5471   vpush { d0 - d3 }
5472
5473   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
5474   bl flush_render_block_buffer
5475   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
5476
5477   vpop { d0 - d3 }
5478
5479   add block, psx_gpu, #psx_gpu_blocks_offset
5480   mov num_blocks, block_width
5481
  /* lr was restored by the ldmia above, so this returns to the blgt site. */
5482   bx lr
5483
/*
 * setup_sprite_16bpp
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u
 *      stack: v, width, height (read at sp+36 / +40 / +44 after the nine
 *      registers pushed on entry)
 *
 * Appends 64-byte blocks to the array at psx_gpu + psx_gpu_blocks_offset,
 * one per 16 bytes of texture data and per row. For each block it writes:
 *   +0  : 16 bytes loaded from the texture page
 *   +40 : 8 bytes - a mask byte replicated over the low word, and the
 *         framebuffer address in the high word
 *
 * Derived values:
 *   left_offset    = u & 7
 *   width_rounded  = width + 7 + left_offset
 *   block_width    = width_rounded >> 3        (blocks per row)
 *   right_width    = width_rounded & 7
 *   left mask      = ~(0xFF << left_offset)    (low byte used)
 *   right mask     = 0xFE << right_width       (low byte used)
 *   fb_ptr         = vram_out_ptr + (y << 11) + (x << 1) - (left_offset << 1)
 *   texture offset = ((u * 2) + (v << 11)) with the low 4 bits cleared,
 *                    ANDed with (mask_width * 2 + (mask_height << 11))
 *                    before every load
 *
 * Whenever the running block count would exceed MAX_BLOCKS the buffer is
 * flushed through setup_sprites_16bpp_flush. num_blocks is written back to
 * psx_gpu after every row.
 *
 * Both row loops test height at the bottom, so at least one row is always
 * emitted.
 */
5484 function(setup_sprite_16bpp)
5485   stmdb sp!, { r4 - r11, r14 }
5486   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5487
5488   ldr v, [sp, #36]
5489   add fb_ptr, fb_ptr, y, lsl #11
5490
5491   ldr width, [sp, #40]
5492   add fb_ptr, fb_ptr, x, lsl #1
5493
5494   ldr height, [sp, #44]
5495   and left_offset, u, #0x7
5496
  /* r1 (x) is dead from here on and becomes texture_offset_base. */
5497   add texture_offset_base, u, u
5498   add width_rounded, width, #7
5499
5500   add texture_offset_base, texture_offset_base, v, lsl #11
  /* r4 (v) has been consumed; it now holds the left mask seed. */
5501   mov left_mask_bits, #0xFF
5502   
5503   ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
5504   add width_rounded, width_rounded, left_offset
5505
5506   ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
  /* Start drawing left_offset 16-bit pixels earlier; the left mask covers
   * them. */
5507   sub fb_ptr, fb_ptr, left_offset, lsl #1
5508
5509   add texture_mask, texture_mask_width, texture_mask_width
5510   mov right_mask_bits, #0xFE
5511
5512   and right_width, width_rounded, #0x7
5513   mvn left_mask_bits, left_mask_bits, lsl left_offset
5514
5515   add texture_mask, texture_mask, texture_mask_height, lsl #11
5516   mov block_width, width_rounded, lsr #3
5517
5518   mov right_mask_bits, right_mask_bits, lsl right_width
  /* fb_ptr_pitch = 2048 + 16 - block_width * 16: distance from the last
   * block of a row to the first block of the next row. */
5519   movw fb_ptr_pitch, #(2048 + 16)
5520
5521   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
  /* d0 word 0 = left mask, word 1 = right mask. */
5522   vmov block_masks, left_mask_bits, right_mask_bits
5523
5524   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5525   add block, psx_gpu, #psx_gpu_blocks_offset
5526
5527   bic texture_offset_base, texture_offset_base, #0xF
  /* Flags from this compare are consumed by the bne a few lines below; the
   * ldr and add in between leave them untouched. */
5528   cmp block_width, #1
5529
5530   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
5531   add block, block, num_blocks, lsl #6
5532
5533   bne 0f
5534
  /* ---- single block per row: combine left and right masks ---- */
  /* vext swaps the two words of d0, the vorr makes both words left|right,
   * and the low byte of that is replicated across d2. */
5535   vext.32 block_masks_shifted, block_masks, block_masks, #1
5536   vorr.u32 block_masks, block_masks, block_masks_shifted
5537   vdup.u8 draw_mask_fb_ptr, block_masks[0]
5538
5539  1:
5540   add num_blocks, num_blocks, #1
5541   cmp num_blocks, #MAX_BLOCKS
5542   blgt setup_sprites_16bpp_flush
5543
5544   and texture_block_ptr, texture_offset_base, texture_mask
  /* Row counter; flags are consumed by the bne at the end of the loop. */
5545   subs height, height, #1
5546
5547   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5548   vld1.u32 { texels }, [texture_block_ptr, :128]
5549
5550   vst1.u32 { texels }, [block, :128]
5551   add block, block, #40
5552
  /* High word of d2 = framebuffer address; low word keeps the mask bytes. */
5553   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5554   pld [fb_ptr]
5555
5556   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5557
  /* 40 + 24 = 64: advance to the next block entry. */
5558   add block, block, #24
5559   add texture_offset_base, texture_offset_base, #2048
5560   add fb_ptr, fb_ptr, #2048
5561   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5562   bne 1b
5563
5564   ldmia sp!, { r4 - r11, pc }
5565
  /* ---- two or more blocks per row ---- */
5566  0:
5567   add num_blocks, num_blocks, block_width
5568   mov texture_offset, texture_offset_base
5569
5570   cmp num_blocks, #MAX_BLOCKS
5571   blgt setup_sprites_16bpp_flush
5572
5573   add texture_offset_base, texture_offset_base, #2048
5574   and texture_block_ptr, texture_offset, texture_mask
5575
5576   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5577   vld1.u32 { texels }, [texture_block_ptr, :128]  
5578
5579   vst1.u32 { texels }, [block, :128]
5580   add block, block, #40
5581
  /* First block of the row: low byte of the left mask. */
5582   vdup.u8 draw_mask_fb_ptr, block_masks[0]
5583   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5584   pld [fb_ptr]
5585
5586   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
  /* Number of interior blocks (row minus first and last); zero skips the
   * interior loop via the beq below. */
5587   subs blocks_remaining, block_width, #2
5588
5589   add texture_offset, texture_offset, #16
5590   add fb_ptr, fb_ptr, #16
5591
  /* Interior blocks carry an all-zero mask. */
5592   vmov.u8 draw_mask_fb_ptr, #0
5593
5594   add block, block, #24
5595   beq 2f
5596
5597  1:
5598   and texture_block_ptr, texture_offset, texture_mask
5599   subs blocks_remaining, blocks_remaining, #1
5600
5601   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5602   vld1.u32 { texels }, [texture_block_ptr, :128]
5603
5604   vst1.u32 { texels }, [block, :128]
5605   add block, block, #40
5606
5607   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5608   pld [fb_ptr]
5609
5610   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5611   
5612   add texture_offset, texture_offset, #16
5613   add fb_ptr, fb_ptr, #16
5614
5615   add block, block, #24
5616   bne 1b
5617
  /* Last block of the row: byte 4 of d0 is the low byte of the right mask. */
5618  2:
5619   and texture_block_ptr, texture_offset, texture_mask
5620   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5621
5622   vld1.u32 { texels }, [texture_block_ptr, :128]
5623   vdup.u8 draw_mask_fb_ptr, block_masks[4]
5624
5625   vst1.u32 { texels }, [block, :128]
5626   add block, block, #40
5627
5628   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5629   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5630   
5631   add block, block, #24
5632   subs height, height, #1
5633
  /* Step to the first block of the next framebuffer line. */
5634   add fb_ptr, fb_ptr, fb_ptr_pitch
5635   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5636
5637   bne 0b
5638
5639   ldmia sp!, { r4 - r11, pc }
5640
5641
5642 // 4x version
5643 // FIXME: duplicate code with normal version :(
5644 #undef draw_mask_fb_ptr
5645
/*
 * setup_sprite_16bpp_4x
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u
 *      stack: v, width, height (read at sp+36 / +40 / +44 after the nine
 *      registers pushed on entry)
 *
 * Same structure as setup_sprite_16bpp, with these differences visible in
 * the code below:
 *   - masks are 16 bits wide (seeds 0xFFFF / 0xFFFC) and left_offset and
 *     right_width are doubled before being used as shift counts;
 *   - framebuffer steps are doubled (32 bytes per source block, 4096 bytes
 *     per source row, pitch based on (2048 + 16) * 2);
 *   - block_width is multiplied by 4 after the single/multi decision, so
 *     num_blocks grows by 4 per source block;
 *   - the block entries themselves are written by the
 *     do_texture_block_16bpp_4x macro, which is defined outside this
 *     section (presumably it emits four blocks and advances "block" -
 *     TODO confirm), given fb_ptr2 and the two mask registers d2 / d3.
 *
 * setup_sprites_16bpp_flush is used when the running block count would
 * exceed MAX_BLOCKS. num_blocks is written back to psx_gpu after every
 * source row. At least one row is always processed.
 */
5646 function(setup_sprite_16bpp_4x)
5647   stmdb sp!, { r4 - r11, r14 }
5648   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5649
5650   ldr v, [sp, #36]
5651   add fb_ptr, fb_ptr, y, lsl #11
5652
5653   ldr width, [sp, #40]
5654   add fb_ptr, fb_ptr, x, lsl #1
5655
5656   ldr height, [sp, #44]
5657   and left_offset, u, #0x7
5658
5659   add texture_offset_base, u, u
5660   add width_rounded, width, #7
5661
5662   add texture_offset_base, texture_offset_base, v, lsl #11
5663   movw left_mask_bits, #0xFFFF
5664   
5665   ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
5666   add width_rounded, width_rounded, left_offset
5667
  /* From here left_offset holds twice (u & 7); width_rounded above was
   * computed with the undoubled value. */
5668   lsl left_offset, #1
5669
5670   ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
  /* Net effect: fb_ptr -= (u & 7) * 4 bytes. */
5671   sub fb_ptr, fb_ptr, left_offset, lsl #1
5672
5673   add texture_mask, texture_mask_width, texture_mask_width
5674   movw right_mask_bits, #0xFFFC
5675
5676   and right_width, width_rounded, #0x7
5677   mvn left_mask_bits, left_mask_bits, lsl left_offset
5678
5679   lsl right_width, #1
5680
5681   add texture_mask, texture_mask, texture_mask_height, lsl #11
  /* block_width is still the source block count here (scaled by 4 later). */
5682   mov block_width, width_rounded, lsr #3
5683
5684   mov right_mask_bits, right_mask_bits, lsl right_width
5685   movw fb_ptr_pitch, #(2048 + 16) * 2
5686
5687   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
5688   vmov block_masks, left_mask_bits, right_mask_bits
5689
5690   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5691   add block, psx_gpu, #psx_gpu_blocks_offset
5692
5693   bic texture_offset_base, texture_offset_base, #0xF
  /* Flags from this compare are consumed by the bne after the lsl below;
   * none of the instructions in between set flags. */
5694   cmp block_width, #1
5695
5696   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
5697   add block, block, num_blocks, lsl #6
5698
  /* block_width now counts output blocks: 4 per source block. */
5699   lsl block_width, #2
5700   bne 0f
5701
  /* ---- one source block per row: combine left and right masks ---- */
5702   vext.32 block_masks_shifted, block_masks, block_masks, #1
5703   vorr.u32 block_masks, block_masks, block_masks_shifted
  /* Bytes 0 and 1 of the combined 16-bit mask go to d2 and d3. */
5704   vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
5705   vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5706
5707  1:
5708   add num_blocks, num_blocks, block_width
5709   cmp num_blocks, #MAX_BLOCKS
5710   blgt setup_sprites_16bpp_flush
5711
5712   and texture_block_ptr, texture_offset_base, texture_mask
  /* Row counter; consumed by the bne at the end of this loop (assumes the
   * macro expansion below leaves the flags alone - TODO confirm). */
5713   subs height, height, #1
5714
5715   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5716   vld1.u32 { texels }, [texture_block_ptr, :128]
5717
5718   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5719
5720   add texture_offset_base, texture_offset_base, #2048
5721   add fb_ptr, fb_ptr, #2048*2
5722   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5723   bne 1b
5724
5725   ldmia sp!, { r4 - r11, pc }
5726
  /* ---- two or more source blocks per row ---- */
5727  0:
5728   add num_blocks, num_blocks, block_width
5729   mov texture_offset, texture_offset_base
5730
5731   vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
5732   vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5733
5734   cmp num_blocks, #MAX_BLOCKS
5735   blgt setup_sprites_16bpp_flush
5736
5737   add texture_offset_base, texture_offset_base, #2048
5738   and texture_block_ptr, texture_offset, texture_mask
5739
5740   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5741   vld1.u32 { texels }, [texture_block_ptr, :128]
5742
5743   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5744
  /* Interior output blocks = block_width - 8 (first and last source block
   * excluded, 4 output blocks each); zero skips the interior loop. */
5745   subs blocks_remaining, block_width, #2*4
5746   add texture_offset, texture_offset, #16
5747
  /* Interior blocks carry all-zero masks. */
5748   vmov.u8 draw_mask_fb_ptr_a, #0
5749   vmov.u8 draw_mask_fb_ptr_b, #0
5750
5751   add fb_ptr, fb_ptr, #16*2
5752   beq 2f
5753
5754  1:
5755   and texture_block_ptr, texture_offset, texture_mask
5756   subs blocks_remaining, blocks_remaining, #4
5757
5758   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5759   vld1.u32 { texels }, [texture_block_ptr, :128]
5760
5761   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5762   add texture_offset, texture_offset, #16
5763
5764   add fb_ptr, fb_ptr, #16*2
5765   bgt 1b
5766
  /* Last source block of the row: bytes 4 and 5 of d0 are the two bytes of
   * the right mask. */
5767  2:
5768   vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
5769   vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
5770
5771   and texture_block_ptr, texture_offset, texture_mask
5772   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5773
5774   vld1.u32 { texels }, [texture_block_ptr, :128]
5775
5776   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5777   subs height, height, #1
5778
5779   add fb_ptr, fb_ptr, fb_ptr_pitch
5780   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5781
5782   bne 0b
5783
5784   ldmia sp!, { r4 - r11, pc }
5785
5786
5787 #undef width
5788 #undef right_width
5789 #undef right_mask_bits
5790 #undef color
5791 #undef height
5792 #undef blocks_remaining
5793 #undef colors
5794 #undef right_mask
5795 #undef test_mask
5796 #undef draw_mask
5797
/*
 * Register aliases for setup_sprite_untextured.
 * Several names share a register: r5 is right_width and block, r6 is
 * right_mask_bits and blocks_remaining, r8 is color and color_b, r1 / r2
 * are x / y and later color_r / color_g, and q2 is both test_mask and
 * draw_mask.
 */
5798 #define psx_gpu                                           r0
5799 #define x                                                 r1
5800 #define y                                                 r2
5801 #define width                                             r3
5802 #define right_width                                       r5
5803 #define right_mask_bits                                   r6
5804 #define fb_ptr                                            r7
5805 #define color                                             r8
5806 #define height                                            r9
5807 #define fb_ptr_pitch                                      r12
5808
5809 // referenced by setup_sprites_16bpp_flush
5810 #define num_blocks                                        r4
5811 #define block                                             r5
5812 #define block_width                                       r11
5813
5814 #define color_r                                           r1
5815 #define color_g                                           r2
5816 #define color_b                                           r8
5817 #define blocks_remaining                                  r6
5818
5819 #define colors                                            q0
5820 #define right_mask                                        q1
5821 #define test_mask                                         q2
5822 #define draw_mask                                         q2
5823 #define draw_mask_bits_fb_ptr                             d6
5824
5825
5826 .align 3
5827
/*
 * setup_sprite_untextured
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y; r3 is overwritten before being read
 *      stack: width, height, color are read at sp+40 / +44 / +48 after the
 *      nine registers pushed on entry (the word at sp+36 is not used)
 *
 * Fast path: if none of RENDER_STATE_MASK_EVALUATE, RENDER_FLAGS_MODULATE_TEXELS
 * and RENDER_FLAGS_BLEND is set in the halfword at psx_gpu_render_state_offset
 * and RENDER_INTERLACE_ENABLED is clear in the byte at
 * psx_gpu_render_mode_offset, control is handed to
 * setup_sprite_untextured_simple with all arguments untouched.
 *
 * Otherwise one 64-byte block is appended per 8 pixels and per row:
 *   +0  : 16 bytes of mask (all zero for interior blocks, right_mask for
 *         the last block of a row)
 *   +16 : 16 bytes holding the 15-bit colour replicated eight times
 *   +40 : 8 bytes from d6; the high word is the framebuffer address
 *
 * Derived values:
 *   block_width = (width + 7) >> 3
 *   right_width = ((width - 1) & 7) + 1
 *   right_mask  = vtst(dup16(0xff << right_width), 16 bytes at psx_gpu+0)
 *   colour      = bits 7..3, 15..11 and 23..19 of the colour argument packed
 *                 as r | g << 5 | b << 10
 *   fb_ptr_pitch = 2048 - (block_width - 1) * 16
 *
 * NOTE(review): d6 is zeroed only when a row has more than one block. With
 * block_width == 1 the low word stored at +40 is whatever d6 held on entry;
 * confirm that consumers of these blocks ignore that word.
 */
5828 function(setup_sprite_untextured)
5829   ldrh r12, [psx_gpu, #psx_gpu_render_state_offset]
5830   tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS         \
5831     | RENDER_FLAGS_BLEND)
  /* The next two instructions only execute when the test above found no
   * bits set, so the beq is taken only if both tests come out zero. */
5832   ldrbeq r12, [psx_gpu, #psx_gpu_render_mode_offset]
5833   tsteq r12, #RENDER_INTERLACE_ENABLED
5834   beq setup_sprite_untextured_simple
5835
5836   stmdb sp!, { r4 - r11, r14 }
5837
5838   ldr width, [sp, #40]
5839   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5840
5841   ldr height, [sp, #44]
5842   add fb_ptr, fb_ptr, y, lsl #11
5843
5844   add fb_ptr, fb_ptr, x, lsl #1
5845   sub right_width, width, #1
5846
5847   ldr color, [sp, #48]
5848   and right_width, #7
5849
5850   add block_width, width, #7
5851   add right_width, #1
5852
5853   lsr block_width, #3
5854   mov right_mask_bits, #0xff
5855
5856   sub fb_ptr_pitch, block_width, #1
5857   lsl right_mask_bits, right_width
5858
  /* (block_width - 1) * 16 bytes, then pitch = 2048 - that. */
5859   lsl fb_ptr_pitch, #3+1
  /* x and y (r1 / r2) are dead now; reuse them for colour components. */
5860   ubfx color_r, color, #3, #5
5861
5862   rsb fb_ptr_pitch, #1024*2
5863   ubfx color_g, color, #11, #5
5864
  /* 16 bytes from the very start of the psx_gpu structure (presumably its
   * test_mask field - TODO confirm against psx_gpu_offsets.h). */
5865   vld1.u32 { test_mask }, [psx_gpu, :128]
5866   ubfx color_b, color, #19, #5
5867
5868   vdup.u16 right_mask, right_mask_bits
5869   orr color, color_r, color_b, lsl #10
5870
5871   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5872   orr color, color, color_g, lsl #5
5873
5874   vtst.u16 right_mask, right_mask, test_mask
  /* r5 (right_width) is dead; it becomes the block cursor. */
5875   add block, psx_gpu, #psx_gpu_blocks_offset
5876
5877   vdup.u16 colors, color
5878   add block, block, num_blocks, lsl #6
5879
5880
5881 setup_sprite_untextured_height_loop:
5882   add num_blocks, block_width
5883   sub blocks_remaining, block_width, #1
5884
5885   cmp num_blocks, #MAX_BLOCKS
5886   blgt setup_sprites_16bpp_flush
5887
  /* Rows that are a single block wide go straight to the last-block code. */
5888   cmp blocks_remaining, #0
5889   ble 1f
5890
  /* q2 held test_mask until here; it is no longer needed after the vtst. */
5891   vmov.u8 draw_mask, #0 /* zero_mask */
5892   vmov.u8 draw_mask_bits_fb_ptr, #0
5893
  /* Interior blocks of the row. */
5894  0:
5895   vst1.u32 { draw_mask }, [block, :128]!
5896   subs blocks_remaining, #1
5897
5898   vst1.u32 { colors }, [block, :128]
  /* block was post-incremented by 16 above: 16 + 24 = offset 40. */
5899   add block, block, #24
5900
5901   vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5902   vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
5903   
5904   add block, block, #24
5905   add fb_ptr, #8*2
5906   bgt 0b
5907
  /* Last block of the row, using right_mask. */
5908  1:
5909   vst1.u32 { right_mask }, [block, :128]!
  /* Row counter; flags are consumed by the bgt at the end of the loop. */
5910   subs height, #1
5911
5912   vst1.u32 { colors }, [block, :128]
5913   add block, block, #24
5914
5915   vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5916   vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
5917   
5918   add block, block, #24
5919   add fb_ptr, fb_ptr_pitch
5920
5921   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5922   bgt setup_sprite_untextured_height_loop
5923
5924   ldmia sp!, { r4 - r11, pc }
5925
5926
5927
5928 #undef texture_page_ptr
5929 #undef vram_ptr
5930 #undef dirty_textures_mask
5931 #undef current_texture_mask
5932
/*
 * Register aliases for update_texture_4bpp_cache.
 * texel_block_expanded_ab / _cd alias _b (q2) / _c (q3): the combined
 * result overwrites one of its inputs. texel_block_expanded_d lives in q4,
 * which is a callee-saved register (d8-d9) under the AAPCS.
 */
5933 #define psx_gpu                                           r0
5934 #define current_texture_page                              r1
5935 #define texture_page_ptr                                  r2
5936 #define vram_ptr_a                                        r3
5937 #define current_texture_page_x                            r12
5938 #define current_texture_page_y                            r4
5939 #define dirty_textures_mask                               r5
5940 #define tile_y                                            r6
5941 #define tile_x                                            r7
5942 #define sub_y                                             r8
5943 #define current_texture_mask                              r9
5944 #define c_4096                                            r10
5945 #define vram_ptr_b                                        r11
5946
5947 #define texel_block_a                                     d0
5948 #define texel_block_b                                     d1
5949 #define texel_block_expanded_a                            q1
5950 #define texel_block_expanded_b                            q2
5951 #define texel_block_expanded_ab                           q2
5952 #define texel_block_expanded_c                            q3
5953 #define texel_block_expanded_d                            q4
5954 #define texel_block_expanded_cd                           q3
5955
/*
 * update_texture_4bpp_cache
 *
 * In:  r0 = psx_gpu
 *
 * Clears the bits of current_texture_mask in dirty_textures_4bpp_mask and
 * then expands a 256-line region of VRAM into the buffer at
 * texture_page_base, one byte per 4-bit texel.
 *
 * Source start: vram_ptr + ((page >> 4) << 19) + ((page & 0xF) << 7), where
 * page is the byte at psx_gpu_current_texture_page_offset.
 *
 * Traversal (innermost first):
 *   sub_y  : 8 iterations, each reading 8 bytes from two consecutive
 *            2048-byte lines and writing 32 bytes; both source pointers
 *            then step down two lines (4096 bytes)
 *   tile_x : 16 columns of 8 source bytes
 *   tile_y : 16 rows of 16 lines
 *
 * Nibble expansion per source byte B: the 16-bit lane becomes
 * (B & 0x0F) | ((B >> 4) << 8), i.e. low nibble in the first output byte
 * and high nibble in the second.
 *
 * Preserves r4-r11 and q0-q4. q0-q3 were already preserved by this routine
 * (presumably for callers that keep live NEON state - TODO confirm); q4 is
 * added because it is written below and is callee-saved under the AAPCS.
 */
5956 function(update_texture_4bpp_cache)
5957   stmdb sp!, { r4 - r11, r14 }
5958   vpush { q0 - q4 }
5959
5960   ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
5961
5962   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
5963   ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
5964
5965   and current_texture_page_x, current_texture_page, #0xF
5966   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
5967
5968   mov current_texture_page_y, current_texture_page, lsr #4
5969   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
5970
5971   add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5972   mov tile_y, #16
5973
5974   add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7  
  /* Mark the current texture as clean and store the mask back. */
5975   bic dirty_textures_mask, current_texture_mask
5976   
5977   mov tile_x, #16
5978   str dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
5979
5980   mov sub_y, #8
  /* Post-increment step for both source pointers: two VRAM lines. */
5981   movw c_4096, #4096
5982
  /* Second pointer runs one line (2048 bytes) below the first. */
5983   add vram_ptr_b, vram_ptr_a, #2048
5984
5985  0:
5986   vld1.u32 { texel_block_a }, [vram_ptr_a, :64], c_4096
5987   vld1.u32 { texel_block_b }, [vram_ptr_b, :64], c_4096
5988
  /* a/c: byte in a 16-bit lane; b/d: byte << 4 in a 16-bit lane. */
5989   vmovl.u8 texel_block_expanded_a, texel_block_a
5990   vshll.u8 texel_block_expanded_b, texel_block_a, #4
5991   vmovl.u8 texel_block_expanded_c, texel_block_b
5992   vshll.u8 texel_block_expanded_d, texel_block_b, #4
5993
  /* Clearing bits 7..4 leaves the low nibble in a/c (bits 3..0) and the
   * high nibble in b/d (bits 11..8). */
5994   vbic.u16 texel_block_expanded_a, #0x00F0
5995   vbic.u16 texel_block_expanded_b, #0x00F0
5996   vbic.u16 texel_block_expanded_c, #0x00F0
5997   vbic.u16 texel_block_expanded_d, #0x00F0
5998
5999   vorr.u16 texel_block_expanded_ab, texel_block_expanded_a,                    \
6000    texel_block_expanded_b
6001   vorr.u16 texel_block_expanded_cd, texel_block_expanded_c,                    \
6002    texel_block_expanded_d
6003
6004   vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd },               \
6005    [texture_page_ptr, :256]!
6006
6007   subs sub_y, sub_y, #1
6008   bne 0b
6009
  /* Column done: the pointers moved down 8 * 4096 = 16 lines; go back up
   * and 8 bytes to the right. */
6010   mov sub_y, #8
6011   add vram_ptr_a, vram_ptr_a, #8
6012   add vram_ptr_b, vram_ptr_b, #8
6013
6014   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6015   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6016
6017   subs tile_x, tile_x, #1
6018   bne 0b
6019
  /* Row of tiles done: go down 16 lines and back left by 16 * 8 bytes. */
6020   mov tile_x, #16
6021   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6022   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6023
6024   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6025   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6026
6027   subs tile_y, tile_y, #1
6028   bne 0b
6029
  /* Must mirror the vpush at entry. */
6030   vpop { q0 - q4 }
6031   ldmia sp!, { r4 - r11, pc }
6032
6033
6034 #undef current_texture_page
6035
/*
 * Register aliases for update_texture_8bpp_cache_slice.
 * texture_page (r1) is the second argument; current_texture_page (r5) is
 * read from psx_gpu.
 */
6036 #define psx_gpu                                           r0
6037 #define texture_page                                      r1
6038 #define texture_page_ptr                                  r2
6039 #define vram_ptr_a                                        r3
6040 #define texture_page_x                                    r12
6041 #define texture_page_y                                    r4
6042 #define current_texture_page                              r5
6043 #define tile_y                                            r6
6044 #define tile_x                                            r7
6045 #define sub_y                                             r8
6046 #define c_4096                                            r10
6047 #define vram_ptr_b                                        r11
6048
6049
6050 #undef texels_a
6051 #undef texels_b
6052
6053 #define texels_a                                          q0
6054 #define texels_b                                          q1
6055 #define texels_c                                          q2
6056 #define texels_d                                          q3
6057
6058
/*
 * update_texture_8bpp_cache_slice
 *
 * In:  r0 = psx_gpu, r1 = texture_page
 *
 * Copies a 256-line by 128-byte region of VRAM into the buffer at
 * texture_page_base, rearranged into 16-byte by 16-line tiles.
 *
 * Source start: vram_ptr + ((texture_page & 0xF) << 7)
 *                        + ((texture_page >> 4) << 19).
 *
 * Destination: 16 chunks of 2048 bytes (8 tiles of 256 bytes), each
 * followed by a 2048-byte gap that is skipped. If bit 0 of texture_page
 * differs from bit 0 of the byte at psx_gpu_current_texture_page_offset,
 * writing starts 2048 bytes into the buffer, i.e. in the gaps (presumably
 * so two calls with adjacent pages interleave - TODO confirm with callers).
 *
 * Traversal (innermost first):
 *   sub_y  : 4 iterations, each copying 16 bytes from four consecutive
 *            2048-byte lines (64 bytes written)
 *   tile_x : 8 columns of 16 source bytes
 *   tile_y : 16 rows of 16 lines
 *
 * Preserves r4-r11 and q0-q3.
 */
6059 function(update_texture_8bpp_cache_slice)
6060   stmdb sp!, { r4 - r11, r14 }
6061   vpush { q0 - q3 }
6062
6063   ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
6064   ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
6065
6066   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
6067   mov tile_y, #16
6068
6069   and texture_page_x, texture_page, #0xF
6070   mov texture_page_y, texture_page, lsr #4
6071
6072   add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7  
6073   mov tile_x, #8
6074
6075   add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
6076   eor current_texture_page, current_texture_page, texture_page
6077
  /* Non-zero when the two page numbers differ in bit 0; the addne below
   * then selects the second 2048-byte half of each destination pair. */
6078   ands current_texture_page, current_texture_page, #0x1
6079   mov sub_y, #4
6080
6081   addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
  /* Post-increment step for both source pointers: two VRAM lines. */
6082   movw c_4096, #4096
6083
  /* Second pointer runs one line (2048 bytes) below the first. */
6084   add vram_ptr_b, vram_ptr_a, #2048
6085
6086  0:
  /* Lines n, n+1, n+2, n+3 in that order: a, b, then a and b again after
   * their post-increment. */
6087   vld1.u32 { texels_a }, [vram_ptr_a, :128], c_4096
6088   vld1.u32 { texels_b }, [vram_ptr_b, :128], c_4096
6089   vld1.u32 { texels_c }, [vram_ptr_a, :128], c_4096
6090   vld1.u32 { texels_d }, [vram_ptr_b, :128], c_4096
6091
6092   vst1.u32 { texels_a, texels_b }, [texture_page_ptr, :256]!
6093   vst1.u32 { texels_c, texels_d }, [texture_page_ptr, :256]!
6094
6095   subs sub_y, sub_y, #1
6096   bne 0b
6097
  /* Column done: the pointers moved down 4 * 2 * 4096 = 16 lines; go back
   * up and 16 bytes to the right. */
6098   mov sub_y, #4
6099
6100   add vram_ptr_a, vram_ptr_a, #16
6101   add vram_ptr_b, vram_ptr_b, #16
6102
6103   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6104   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6105
6106   subs tile_x, tile_x, #1
6107   bne 0b
6108
  /* Row of tiles done: go down 16 lines and back left by 8 * 16 bytes. */
6109   mov tile_x, #8
6110
6111   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6112   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6113
6114   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6115   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6116
  /* Flags from the subs are consumed by the bne; the add in between (which
   * skips the 2048-byte gap in the destination) does not set flags. */
6117   subs tile_y, tile_y, #1
6118   add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
6119
6120   bne 0b
6121
6122   vpop { q0 - q3 }
6123   ldmia sp!, { r4 - r11, pc }
6124
6125
6126 /* void scale2x_tiles8(void *dst, const void *src, int w8, int h) */
/*
 * In:  r0 = dst, r1 = src, r2 = w8, r3 = h
 *
 * Doubles an image of 16-bit elements in both directions. Each source
 * element is duplicated horizontally (vzip of a register with a copy of
 * itself) and each output row is stored twice, at dst and at dst + 2048.
 * w8 counts groups of 8 source elements (16 bytes) per row; h counts source
 * rows. The source row stride is 2048 bytes, and dst advances by 4096 bytes
 * per source row.
 *
 * Groups are handled two at a time. When w8 is odd, the stores for the
 * second group of the last pair are skipped.
 * NOTE(review): in that case the second 16-byte load from src has already
 * been issued, so 16 bytes past the last group of the row are still read.
 *
 * Both loops test their counter at the bottom, so one pair of loads/stores
 * and one row are always processed even if w8 or h is zero.
 *
 * Clobbers r0-r3, r12, q0-q3 and flags; r4 is saved and restored.
 */
6127 function(scale2x_tiles8)
6128   push { r4, r14 }
6129
  /* r4 = start of the current source row, r12 = second output row,
   * r14 = groups left in this row. */
6130   mov r4, r1
6131   add r12, r0, #1024*2
6132   mov r14, r2
6133
6134 0:
6135   vld1.u16 { q0 }, [r1, :128]!
6136   vld1.u16 { q2 }, [r1, :128]!
6137   vmov q1, q0
6138   vmov q3, q2
  /* Interleaving a register with its copy yields a0 a0 a1 a1 ... across
   * the register pair. */
6139   vzip.16 q0, q1
6140   vzip.16 q2, q3
  /* Flags from this subs drive both the blt and the bgt below; the NEON
   * stores in between do not change them. */
6141   subs r14, #2
6142   vst1.u16 { q0, q1 }, [r0, :128]!
6143   vst1.u16 { q0, q1 }, [r12, :128]!
  /* Negative count: only one group was left, skip the second one. */
6144   blt 1f
6145   vst1.u16 { q2, q3 }, [r0, :128]!
6146   vst1.u16 { q2, q3 }, [r12, :128]!
6147   bgt 0b
6148 1:
  /* Row counter; flags are consumed by the bgt at the end, none of the
   * instructions in between set flags. */
6149   subs r3, #1
6150   mov r14, r2
  /* dst: down two output rows (2 * 2048 bytes)... */
6151   add r0, #1024*2*2
6152   add r4, #1024*2
  /* ...and back left by w8 * 32 bytes. NOTE(review): for odd w8 the row
   * loop advanced r0 by (w8 + 1) * 16... verify: it advanced by 32 bytes per
   * stored group, which equals w8 * 32, so this rewinds exactly. */
6153   sub r0, r0, r2, lsl #4+1
6154   mov r1, r4
6155   add r12, r0, #1024*2
6156   bgt 0b
6157   nop
6158
6159   pop { r4, pc }
6160
6161 // vim:filetype=armasm