cdrom: change pause timing again
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
1 /*
2  * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
3  * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation; either version 2 of
8  * the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License for more details.
14  */
15
16 #define RENDER_INTERLACE_ENABLED                          0x1
17
18 #include "psx_gpu.h"
19 #include "psx_gpu_offsets.h"
20
21 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
22
23 #define edge_data_left_x_offset                           0
24 #define edge_data_num_blocks_offset                       2
25 #define edge_data_right_mask_offset                       4
26 #define edge_data_y_offset                                6
27
28 .syntax unified
29 .text
30
31 #if 0 /* disabled variant: would preserve q4-q7 in routines that invoke these hooks */
32 #define save_abi_regs() \
33   vpush {q4-q7}
34 #define restore_abi_regs() \
35   vpop  {q4-q7}
36 #else /* active variant: both hooks expand to nothing, so q4-q7 are not saved */
37 #define save_abi_regs()
38 #define restore_abi_regs()
39 #endif /* NOTE(review): AAPCS lists d8-d15 (q4-q7) as callee-saved; confirm callers tolerate the clobber */
40
41 #define psx_gpu                                           r0
42 #define v_a                                               r1
43 #define v_b                                               r2
44 #define v_c                                               r3
45
46 #define x0                                                r4
47 #define x1                                                r5
48 #define x2                                                r6
49 #define x0_x1                                             r5
50 #define x1_x2                                             r6
51 #define y0                                                r7
52 #define y1                                                r8
53 #define y2                                                r9
54 #define y0_y1                                             r7
55 #define y1_y2                                             r8
56 #define b0                                                r9
57 #define b1                                                r10
58 #define b2                                                r11
59 #define b0_b1                                             r10
60 #define b1_b2                                             r11
61
62
63 #define area_r_s                                          r5
64
65 #define g_bx0                                             r2
66 #define g_bx                                              r3
67 #define g_bx2                                             r4
68 #define g_bx3                                             r5
69 #define b_base                                            r6
70 #define g_by                                              r8
71
72 #define gs_bx                                             r7
73 #define gs_by                                             r10
74
75 #define ga_bx                                             g_bx
76 #define ga_by                                             g_by
77
78 #define gw_bx_h                                           g_bx
79 #define gw_by_h                                           g_by
80
81 #define gw_bx_l                                           r11
82 #define gw_by_l                                           gw_bx_l
83
84 #define store_a                                           r0
85 #define store_b                                           r1
86 #define store_inc                                         r5
87
88
89 #define v0                                                q0
90 #define uvrgb0                                            d0
91 #define x0_y0                                             d1
92
93 #define v1                                                q1
94 #define uvrgb1                                            d2
95 #define x1_y1                                             d3
96
97 #define v2                                                q2
98 #define uvrgb2                                            d4
99 #define x2_y2                                             d5
100
101 #define x0_ab                                             q3
102 #define uvrg_xxxx0                                        q3
103 #define uvrg0                                             d6
104 #define xxxx0                                             d7
105
106 #define x1_ab                                             q4
107 #define uvrg_xxxx1                                        q4
108 #define uvrg1                                             d8
109 #define xxxx1                                             d9
110
111 #define x2_ab                                             q5
112 #define uvrg_xxxx2                                        q5
113 #define uvrg2                                             d10
114 #define xxxx2                                             d11
115
116 #define y0_ab                                             q6
117 #define yyyy_uvrg0                                        q6
118 #define yyyy0                                             d12
119 #define uvrg0b                                            d13
120
121 #define y1_ab                                             q7
122 #define yyyy_uvrg1                                        q7
123 #define yyyy1                                             d14
124 #define uvrg1b                                            d15
125
126 #define y2_ab                                             q8
127 #define yyyy_uvrg2                                        q8
128 #define yyyy2                                             d16
129 #define uvrg2b                                            d17
130
131 #define d0_ab                                             q9
132 #define d0_a                                              d18
133 #define d0_b                                              d19
134
135 #define d1_ab                                             q10
136 #define d1_a                                              d20
137 #define d1_b                                              d21
138
139 #define d2_ab                                             q11
140 #define d2_a                                              d22
141 #define d2_b                                              d23
142
143 #define d3_ab                                             q12
144 #define d3_a                                              d24
145 #define d3_b                                              d25
146
147 #define ga_uvrg_x                                         q1
148 #define ga_uvrg_y                                         q4
149
150 #define dx                                                x0_x1
151 #define dy                                                y0_y1
152 #define db                                                b0_b1
153
154 #define uvrg_base                                         q11
155
156 #define gs_uvrg_x                                         q5
157 #define gs_uvrg_y                                         q6
158
159 #define g_uvrg_x                                          q1
160 #define ga_uv_x                                           d2
161 #define g_uv_x                                            d2
162 #define ga_rg_x                                           d3
163 #define g_rg_x                                            d3
164
165 #define g_uvrg_y                                          q4
166 #define ga_uv_y                                           d8
167 #define g_uv_y                                            d8
168 #define ga_rg_y                                           d9
169 #define g_rg_y                                            d9
170
171 #define gw_uv_x                                           q1
172 #define gw_rg_x                                           q2
173 #define gw_uv_y                                           q4
174 #define gw_rg_y                                           q3
175
176 #define w_mask                                            q9
177 #define w_mask_l                                          d18
178
179 #define r_shift                                           q10
180
181 #define uvrg_dx0                                          q0
182 #define uvrg_dx0l                                         d0
183 #define uvrg_dx0h                                         d1
184
185 #define uvrg_dx1                                          q1
186 #define uvrg_dx1l                                         d2
187 #define uvrg_dx1h                                         d3
188
189 #define uvrg_dx2                                          q2
190 #define uvrg_dx2l                                         d4
191 #define uvrg_dx2h                                         d5
192
193 #define uvrg_dx3                                          q3
194 #define uvrg_dx3l                                         d6
195 #define uvrg_dx3h                                         d7
196
197 #define uvrgb_phase                                       q13
198
199 .align 4
200
201 #include "arm_features.h"
202
203 #define function(name) FUNCTION(name): /* emits the entry label through FUNCTION(), which is not defined in this chunk (presumably arm_features.h - verify) */
204
205 #ifndef TEXRELS_FORBIDDEN /* jump-table helpers, variant used when TEXRELS_FORBIDDEN is not defined */
206
207 #define JT_OP_REL(table_label, index_reg, temp) /* expands to nothing in this variant */
208 #define JT_OP(x...) x /* passes its argument tokens through unchanged */
209 #define JTE(start, target) target /* table entry is the target itself; start is ignored */
210
211 #else /* TEXRELS_FORBIDDEN: entries are (target - start) offsets and dispatch is pc-relative */
212
213 #define JT_OP_REL(table_label, index_reg, temp)                                \
214   adr temp, table_label;                /* temp = address of the table */      \
215   ldr temp, [temp, index_reg, lsl #2];  /* temp = table[index_reg] */          \
216   add pc, pc, temp                      /* branch: pc += table entry */        \
217
218 #define JT_OP(x...) /* argument tokens are dropped in this variant */
219 #define JTE(start, target) (target - start) /* NOTE(review): start presumably has to match the pc value read by the add above - verify at the use sites */
220
221 #endif
222
223 #ifdef __MACH__
224 #define flush_render_block_buffer _flush_render_block_buffer
225 #define update_texture_8bpp_cache _update_texture_8bpp_cache
226 #endif
227
228 @ r0: psx_gpu (reads triangle_area, uvrgb_phase, triangle_winding; results are
229 @     stored from psx_gpu_uvrg_offset onward)
230 @ r1: v_a, r2: v_b, r3: v_c (128-bit aligned; b at +4, x at +8, y at +10)
231 @ Clobbers r0-r3, r12, flags, q0-q13 and q15; r4-r11 are saved and restored.
232
233 function(compute_all_gradients)
234   // First compute the triangle area reciprocal and shift. The division will
235   // happen concurrently with much of the work which follows.
236   @ r12 = psx_gpu->triangle_area
237   ldr r12, [psx_gpu, #psx_gpu_triangle_area_offset]
238   stmdb sp!, { r4 - r11, lr }
239   save_abi_regs()
240
241   @ load exponent of 62 into upper half of double
242   movw r4, #0
243   clz r14, r12                       @ r14 = shift
244
245   movt r4, #((62 + 1023) << 4)
246   mov r12, r12, lsl r14              @ r12 = triangle_area_normalized
247
248   @ load area normalized into lower half of double
249   mov r5, r12, lsr #10
250   vmov.f64 d30, r5, r4               @ d30 = (1 << 62) + ta_n
251
252   movt r4, #((1022 + 31) << 4)       @ exponent field for 2^30 (adjusted below)
253   mov r5, r12, lsl #20               @ low 11 bits of ta_n -> top of low word
254
255   add r4, r4, r12, lsr #11           @ bits 31..11 of ta_n; bit 31 lands on the
256   vmov.f64 d31, r5, r4               @ exponent field: d31 = (double)ta_n
257
258   vdiv.f64 d30, d30, d31             @ d30 = ((1 << 62) + ta_n) / ta_n
259
260   // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
261   // ( d0       *  d1      ) - ( d2       *  d3      ) =
262   // ( m0                  ) - ( m1                  ) = gradient
263
264   // This is split to do 12 elements at a time over three sets: a, b, and c.
265   // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
266   // two of the slots are unused.
267
268   // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
269   // is g.
270
271   // First type is:  uvrg bxxx xxxx
272   // Second type is: yyyy ybyy uvrg
273   // Since x_a and y_c are the same, the same variable is used for both.
274
275   vld1.u32 { v0 }, [v_a, :128]       @ v0 = { uvrg0, b0, x0, y0 }
276   ldrsh x0, [v_a, #8]                @ load x0
277
278   vld1.u32 { v1 }, [v_b, :128]       @ v1 = { uvrg1, b1, x1, y1}
279   ldrh x1, [v_b, #8]                 @ load x1
280
281   vld1.u32 { v2 }, [v_c, :128]       @ v2 = { uvrg2, b2, x2, y2 }
282   ldrh x2, [v_c, #8]                 @ load x2
283
284   vmovl.u8 uvrg_xxxx0, uvrgb0        @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
285   ldrh y0, [v_a, #10]                @ load y0
286
287   vmovl.u8 uvrg_xxxx1, uvrgb1        @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
288   ldrh y1, [v_b, #10]                @ load y1
289
290   vmovl.u8 uvrg_xxxx2, uvrgb2        @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
291   ldrh y2, [v_c, #10]                @ load y2
292
293   vmov.u8 uvrg0b, uvrg0              @ uvrg0b = { uv0, rg0 }
294   vdup.u16 xxxx0, x0_y0[0]           @ xxxx0 = { xx0, xx0 }
295
296   orr x1_x2, x1, x2, lsl #16         @ x1_x2 = { x1, x2 }
297   pkhbt x0_x1, x0, x1, lsl #16       @ x0_x1 = { x0, x1 }
298
299   vmov.u8 uvrg1b, uvrg1              @ uvrg1b = { uv1, rg1 }
300   vdup.u16 xxxx1, x1_y1[0]           @ xxxx1 = { xx1, xx1 }
301
302   vmov.u8 uvrg2b, uvrg2              @ uvrg2b = { uv2, rg2 }
303   vdup.u16 xxxx2, x2_y2[0]           @ xxxx2 = { xx2, xx2 }
304
305   ldrb b2, [v_c, #4]                 @ load b2
306   orr y0_y1, y0, y1, lsl #16         @ y0_y1 = { y0, y1 }
307
308   ldrb b1, [v_b, #4]                 @ load b1
309   orr y1_y2, y1, y2, lsl #16         @ y1_y2 = { y1, y2 }
310
311   vdup.u16 yyyy0, x0_y0[1]           @ yyyy0 = { yy0, yy0 }
312   vsub.s16 d0_ab, x1_ab, x0_ab
313
314   ldrb b0, [v_a, #4]                 @ load b0
315   orr b1_b2, b1, b2, lsl #16         @ b1_b2 = { b1, b2 }
316
317   vdup.u16 yyyy1, x1_y1[1]           @ yyyy1 = { yy1, yy1 }
318   vsub.s16 d2_ab, x2_ab, x1_ab
319
320   vdup.u16 yyyy2, x2_y2[1]           @ yyyy2 = { yy2, yy2 }
321   vsub.s16 d1_ab, y2_ab, y1_ab
322
323   orr b0_b1, b0, b1, lsl #16         @ b0_b1 = { b0, b1 }
324   ssub16 dx, x1_x2, x0_x1            @ dx = { x1 - x0, x2 - x1 }
325
326   ssub16 dy, y1_y2, y0_y1            @ dy = { y1 - y0, y2 - y1 }
327   ssub16 db, b1_b2, b0_b1            @ db = { b1 - b0, b2 - b1 }
328
329   vsub.s16 d3_ab, y1_ab, y0_ab
330   smusdx ga_by, dx, db               @ ga_by = ((x1 - x0) * (b2 - b1)) -
331                                      @         ((x2 - x1) * (b1 - b0))
332   vmull.s16 ga_uvrg_x, d0_a, d1_a
333   smusdx ga_bx, db, dy               @ ga_bx = ((b1 - b0) * (y2 - y1)) -
334                                      @         ((b2 - b1) * (y1 - y0))
335   vmlsl.s16 ga_uvrg_x, d2_a, d3_a
336   movs gs_bx, ga_bx, asr #31
337
338   vmull.s16 ga_uvrg_y, d0_b, d1_b
339   rsbmi ga_bx, ga_bx, #0
340
341   @ r12 = psx_gpu->uvrgb_phase
342   ldr r12, [psx_gpu, #psx_gpu_uvrgb_phase_offset]
343
344   vmlsl.s16 ga_uvrg_y, d2_b, d3_b
345   movs gs_by, ga_by, asr #31
346
347   vshr.u64 d0, d30, #22              @ d0[0] = bits 53..22 of the quotient
348   add b_base, r12, b0, lsl #16       @ b_base = phase + (b0 << 16)
349
350   vdup.u32 uvrgb_phase, r12
351
352   rsbmi ga_by, ga_by, #0
353   vclt.s32 gs_uvrg_x, ga_uvrg_x, #0  @ gs_uvrg_x = ga_uvrg_x < 0
354
355   @ r12 = psx_gpu->triangle_winding
356   ldrb r12, [psx_gpu, #psx_gpu_triangle_winding_offset]
357   vclt.s32 gs_uvrg_y, ga_uvrg_y, #0  @ gs_uvrg_y = ga_uvrg_y < 0
358
359   rsb r12, r12, #0                   @ r12 = -(triangle->winding)
360
361   vdup.u32 w_mask, r12               @ w_mask = { -w, -w, -w, -w }
362   sub r14, r14, #(62 - 12)           @ r14 = shift - (62 - FIXED_BITS)
363
364   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
365   vdup.u32 r_shift, r14              @ r_shift = { shift, shift*, shift, shift* }
366                                      @ * - vshl.u64: ignored by hw
367   vadd.u32 uvrg_base, uvrgb_phase
368   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
369
370   vmov area_r_s, s0                  @ area_r_s = triangle_reciprocal
371   vabs.s32 ga_uvrg_y, ga_uvrg_y      @ ga_uvrg_y = abs(ga_uvrg_y)
372
373   vmull.u32 gw_rg_x, ga_rg_x, d0[0]
374   vmull.u32 gw_uv_x, ga_uv_x, d0[0]
375   vmull.u32 gw_rg_y, ga_rg_y, d0[0]
376   vmull.u32 gw_uv_y, ga_uv_y, d0[0]
377
378   vshl.u64 gw_rg_x, gw_rg_x, r_shift
379   vshl.u64 gw_uv_x, gw_uv_x, r_shift
380   vshl.u64 gw_rg_y, gw_rg_y, r_shift
381   vshl.u64 gw_uv_y, gw_uv_y, r_shift
382
383   veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
384   vmovn.u64 g_uv_x, gw_uv_x
385
386   veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
387   vmovn.u64 g_rg_x, gw_rg_x
388
389   veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
390   vmovn.u64 g_uv_y, gw_uv_y
391
392   vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
393   vmovn.u64 g_rg_y, gw_rg_y
394
395   veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
396   mov ga_bx, ga_bx, lsl #13
397
398   vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
399   mov ga_by, ga_by, lsl #13
400
401   vdup.u32 x0_y0, x0                 @ x0_y0 = { x0, x0 } as 32-bit lanes
402   umull gw_bx_l, gw_bx_h, ga_bx, area_r_s @ 64-bit product; only the high word is used
403
404   vshl.u32 g_uvrg_x, g_uvrg_x, #4
405   vshl.u32 g_uvrg_y, g_uvrg_y, #4
406
407   umull gw_by_l, gw_by_h, ga_by, area_r_s
408   vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0] @ uvrg_base -= g_uvrg_x * x0
409
410   eor gs_bx, gs_bx, r12
411   vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1 @ dx2 = 2 * dx1 (dx1 is g_uvrg_x, q1)
412
413   veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0 @ dx0 = 0
414   eor gs_by, gs_by, r12
415
416   rsb r11, r14, #0                   @ r11 = negative shift for scalar lsr
417   add store_a, psx_gpu, #psx_gpu_uvrg_offset
418
419   sub r11, r11, #(32 - 13)           @ umull high word is >> 32; undo the lsl 13
420
421   add store_b, store_a, #16
422   mov store_inc, #32
423
424   vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1 @ dx3 = 3 * dx1
425   vst1.u32 { uvrg_base }, [store_a, :128], store_inc
426
427   vst1.u32 { uvrg_dx1 }, [store_b, :128], store_inc
428   mov g_bx, gw_bx_h, lsr r11
429
430   vst1.u32 { g_uvrg_y }, [store_a, :128], store_inc
431   mov g_by, gw_by_h, lsr r11
432
433   vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l },                     \
434    [store_b, :128], store_inc
435   eor g_bx, g_bx, gs_bx
436
437   vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h },                     \
438    [store_b, :128], store_inc
439   sub g_bx, g_bx, gs_bx
440
441   lsl g_bx, g_bx, #4  
442   eor g_by, g_by, gs_by
443
444   mls b_base, g_bx, x0, b_base       @ b_base -= g_bx * x0
445   sub g_by, g_by, gs_by
446
447   lsl g_by, g_by, #4
448   mov g_bx0, #0
449
450   add g_bx2, g_bx, g_bx
451   add g_bx3, g_bx, g_bx2
452
453   stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
454
455   restore_abi_regs()
456   ldmia sp!, { r4 - r11, pc }
457
458
459 #define psx_gpu                                  r0
460 #define v_a                                      r1
461 #define v_b                                      r2
462 #define v_c                                      r3
463
464 #define temp                                     r14
465
466 #define x_a                                      r4
467 #define x_b                                      r5
468 #define x_c                                      r6
469 #define y_a                                      r1
470 #define y_b                                      r2
471 #define y_c                                      r3
472
473 #define height_minor_a                           r7
474 #define height_minor_b                           r8
475 #define height_major                             r9
476 #define height                                   r9
477
478 #define reciprocal_table_ptr                     r10
479
480 #define edge_alt_low                             r4
481 #define edge_alt_high                            r5
482 #define edge_dx_dy_alt                           r6
483 #define edge_shift_alt                           r10
484
485 #define edge_dx_dy_alt_low                       r4
486 #define edge_dx_dy_alt_high                      r5
487
488 #define span_edge_data                           r4
489 #define span_uvrg_offset                         r5
490 #define span_b_offset                            r6
491
492 #define clip                                     r14
493
494 #define b                                        r11
495 #define b_dy                                     r12
496
497
498 #define alternate_x                              q0
499 #define alternate_dx_dy                          q1
500 #define alternate_x_32                           q2
501
502 #define alternate_x_low                          d0
503 #define alternate_x_high                         d1
504 #define alternate_dx_dy_low                      d2
505 #define alternate_dx_dy_high                     d3
506 #define alternate_x_32_low                       d4
507 #define alternate_x_32_high                      d5
508
509 #define left_x                                   q3
510 #define right_x                                  q4
511 #define left_dx_dy                               q5
512 #define right_dx_dy                              q6
513 #define left_edge                                q7
514 #define right_edge                               q8
515
516 #define left_x_low                               d6
517 #define left_x_high                              d7
518 #define right_x_low                              d8
519 #define right_x_high                             d9
520 #define left_dx_dy_low                           d10
521 #define left_dx_dy_high                          d11
522 #define right_dx_dy_low                          d12
523 #define right_dx_dy_high                         d13
524 #define left_edge_low                            d14
525 #define left_edge_high                           d15
526 #define right_edge_low                           d16
527 #define right_edge_high                          d17
528
529 #define y_mid_point                              d18
530 #define c_0x0004                                 d19
531
532 #define left_right_x_16                          q11
533 #define span_shifts_y                            q12
534 #define c_0x0001                                 q13
535
536 #define span_shifts                              d24
537 #define y_x4                                     d25
538 #define c_0xFFFE                                 d26
539 #define c_0x0007                                 d27
540
541 #define left_right_x_16_low                      d22
542 #define left_right_x_16_high                     d23
543
544 #define uvrg                                     q14
545 #define uvrg_dy                                  q15
546
547 #define alternate_x_16                           d4
548
549 #define v_clip                                   q3
550 #define v_clip_low                               d6
551
552 #define right_x_32                               q10
553 #define left_x_32                                q11
554 #define alternate_select                         d24
555
556 #define right_x_32_low                           d20
557 #define right_x_32_high                          d21
558 #define left_x_32_low                            d22
559 #define left_x_32_high                           d23
560
561 #define tmp_max_blocks                           d20
562
563 #define edges_xy                                 q0
564 #define edges_dx_dy                              d2
565 #define edge_shifts                              d3
566 #define edge_shifts_64                           q2
567
568 #define edges_xy_left                            d0
569 #define edges_xy_right                           d1
570
571 #define height_reciprocals                       d6
572 #define heights                                  d7
573
574 #define widths                                   d8
575 #define c_0x01                                   d9
576 #define x_starts                                 d10
577 #define x_ends                                   d11
578
579 #define heights_b                                d12
580 #define edges_dx_dy_64                           q10
581
582 #define edges_dx_dy_64_left                      d20
583 #define edges_dx_dy_64_right                     d21
584
585
586 #define setup_spans_prologue()          /* save regs, load vertex x/y, uvrg */ \
587   stmdb sp!, { r4 - r11, lr };          /* push r4-r11 and lr */               \
588   save_abi_regs();                                                             \
589                                                                                \
590   ldrsh x_a, [v_a, #8];                 /* x: signed 16-bit at vertex + 8 */   \
591   ldrsh x_b, [v_b, #8];                                                        \
592   ldrsh x_c, [v_c, #8];                                                        \
593   ldrsh y_a, [v_a, #10];                /* y_a..y_c live in r1-r3, so each */ \
594   ldrsh y_b, [v_b, #10];                /* load replaces its vertex ptr */     \
595   ldrsh y_c, [v_c, #10];                                                       \
596                                                                                \
597   add temp, psx_gpu, #psx_gpu_uvrg_offset;                                     \
598   vld1.32 { uvrg }, [temp];             /* uvrg = 16 bytes from psx_gpu */     \
599   add temp, psx_gpu, #psx_gpu_uvrg_dy_offset;                                  \
600   vld1.32 { uvrg_dy }, [temp];          /* uvrg_dy likewise */                 \
601   ldr reciprocal_table_ptr, [psx_gpu, #psx_gpu_reciprocal_table_ptr_offset];   \
602                                                                                \
603   vmov.u32 c_0x01, #0x01                /* c_0x01 = 1 in each u32 lane */      \
604
605 #define setup_spans_load_b()            /* load b and b_dy from psx_gpu */     \
606   ldr b, [psx_gpu, #psx_gpu_b_offset];                                         \
607   ldr b_dy, [psx_gpu, #psx_gpu_b_dy_offset]                                    \
608
609 #define setup_spans_prologue_b()        /* span pointers, edges, constants */  \
610   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
611   add temp, psx_gpu, #psx_gpu_viewport_start_x_offset;                         \
612                                                                                \
613   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
614   vmov.u16 c_0x0004, #0x0004;           /* 4 in each u16 lane of d19 */        \
615                                                                                \
616   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
617   vmov.u16 c_0x0001, #0x0001;           /* 1 in each u16 lane of q13 */        \
618                                                                                \
619   vld1.u16 { left_edge_low[], left_edge_high[] }, [temp]; /* broadcast u16 */  \
620   add temp, psx_gpu, #psx_gpu_viewport_end_x_offset;                           \
621                                                                                \
622   vld1.u16 { right_edge_low[], right_edge_high[] }, [temp];                    \
623   vadd.u16 right_edge, right_edge, c_0x0001; /* viewport end_x + 1 */          \
624   /* d26/d27 alias q13: c_0x0001 is overwritten below */                       \
625   vmov.u16 c_0x0007, #0x0007;                                                  \
626   vmvn.u16 c_0xFFFE, #0x0001            /* ~0x0001 = 0xFFFE per u16 lane */    \
627
628
629 #define compute_edge_delta_x2()         /* two edges over one shared height */ \
630   ldr temp, [reciprocal_table_ptr, height, lsl #2]; /* temp = table[height] */ \
631                                                                                \
632   vdup.u32 heights, height;                                                    \
633   vsub.u32 widths, x_ends, x_starts;                                           \
634                                                                                \
635   vdup.u32 edge_shifts, temp;                                                  \
636   vsub.u32 heights_b, heights, c_0x01;                                         \
637   vshr.u32 height_reciprocals, edge_shifts, #10; /* reciprocal: entry >> 10 */ \
638                                                                                \
639   vmla.s32 heights_b, x_starts, heights;                                       \
640   vbic.u16 edge_shifts, #0xE0;          /* clear bits 5-7 of each u16 lane */  \
641   vmul.s32 edges_dx_dy, widths, height_reciprocals; /* width * reciprocal */   \
642   vmull.s32 edges_xy, heights_b, height_reciprocals /* 64-bit products */      \
643
644 #define width_alt                 r6
645 #define height_reciprocal_alt     r11
646 #define height_b_alt              r12
647
648 #define compute_edge_delta_x3(start_c, height_a, height_b)                     \
649   vmov heights, height_a, height_b;                                            \
650   ldr temp, [reciprocal_table_ptr, height_a, lsl #2];                          \
651   vmov.u32 edge_shifts[0], temp;                                               \
652   ldr temp, [reciprocal_table_ptr, height_b, lsl #2];                          \
653   vmov.u32 edge_shifts[1], temp;                                               \
654   ldr edge_shift_alt, [reciprocal_table_ptr, height_minor_b, lsl #2];          \
655   /* edge_shift_alt is r10: the load above overwrites the table pointer */     \
656   vsub.u32 widths, x_ends, x_starts;                                           \
657   sub width_alt, x_c, start_c;                                                 \
658   /* each NEON op here is paired with its scalar twin for the alt edge */      \
659   vsub.u32 heights_b, heights, c_0x01;                                         \
660   sub height_b_alt, height_minor_b, #1;                                        \
661   /* r11 and r12 (scratch here) are the b and b_dy registers */                \
662   vshr.u32 height_reciprocals, edge_shifts, #10;                               \
663   lsr height_reciprocal_alt, edge_shift_alt, #10;                              \
664                                                                                \
665   vmla.s32 heights_b, x_starts, heights;                                       \
666   mla height_b_alt, height_minor_b, start_c, height_b_alt;                     \
667                                                                                \
668   vbic.u16 edge_shifts, #0xE0;                                                 \
669   and edge_shift_alt, edge_shift_alt, #0x1F;                                   \
670                                                                                \
671   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
672   mul edge_dx_dy_alt, width_alt, height_reciprocal_alt;                        \
673                                                                                \
674   vmull.s32 edges_xy, heights_b, height_reciprocals;                           \
675   smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt       \
676
677
678 #define setup_spans_adjust_y_up()       /* y_x4 -= 0x00040004 per u32 lane */  \
679   vsub.u32 y_x4, y_x4, c_0x0004                                                \
680
681 #define setup_spans_adjust_y_down()     /* y_x4 += 0x00040004 per u32 lane */  \
682   vadd.u32 y_x4, y_x4, c_0x0004                                                \
683
684 #define setup_spans_adjust_interpolants_up() /* uvrg, b minus one dy step */   \
685   vsub.u32 uvrg, uvrg, uvrg_dy;                                                \
686   sub b, b, b_dy                                                               \
687
688 #define setup_spans_adjust_interpolants_down() /* uvrg, b plus one dy step */  \
689   vadd.u32 uvrg, uvrg, uvrg_dy;                                                \
690   add b, b, b_dy                                                               \
691
692
693 #define setup_spans_clip_interpolants_increment() /* += dy * clip */           \
694   mla b, b_dy, clip, b;                                                        \
695   vmla.s32 uvrg, uvrg_dy, v_clip                                               \
696
/* Skip "clip" scanlines backward in one go: b -= b_dy * clip and
   uvrg -= uvrg_dy * v_clip. v_clip is clip broadcast to every 32-bit lane
   by setup_spans_clip() before this macro is expanded. */
697 #define setup_spans_clip_interpolants_decrement()                              \
698   mls b, b_dy, clip, b;                                                        \
699   vmls.s32 uvrg, uvrg_dy, v_clip                                               \
700
/* Alternate (third) edge is live: advance its 64-bit x accumulator, kept in
   the ARM register pair edge_alt_low/edge_alt_high, by the signed product
   edge_dx_dy_alt * clip. */
701 #define setup_spans_clip_alternate_yes()                                       \
702   smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip                      \
703
/* No alternate edge: expands to nothing. */
704 #define setup_spans_clip_alternate_no()                                        \
705
/* Skip the "clip" scanlines that fall outside the viewport.
 *
 * direction:        increment or decrement - pasted into
 *                   setup_spans_clip_interpolants_<direction>.
 * alternate_active: yes or no - pasted into
 *                   setup_spans_clip_alternate_<alternate_active>.
 *
 * Broadcasts clip into v_clip, advances the alternate edge (when active) and
 * the interpolants, then adds edges_dx_dy * v_clip_low to the 64-bit
 * edges_xy accumulators with a widening multiply-accumulate. Note that the
 * edge accumulators are advanced by +clip for both directions; only the
 * interpolant update depends on the direction argument. */
706 #define setup_spans_clip(direction, alternate_active)                          \
707   vdup.u32 v_clip, clip;                                                       \
708   setup_spans_clip_alternate_##alternate_active();                             \
709   setup_spans_clip_interpolants_##direction();                                 \
710   vmlal.s32 edges_xy, edges_dx_dy, v_clip_low                                  \
711
712
/* Turn the two edge accumulators in edges_xy into the left/right stepping
 * state used by the span emit loop.
 *
 * left_index / right_index: suffix pasted onto edges_xy_ and edges_dx_dy_64_
 * to pick which computed edge becomes the left and which the right one.
 *
 * Steps:
 *  - sign-extend the per-edge shifts and slopes to 64 bits and shift both
 *    the positions and the slopes left by the per-edge shift amount;
 *  - copy the chosen position into the low half of left_x / right_x and the
 *    chosen slope into both halves of left_dx_dy / right_dx_dy;
 *  - high half of x = low half + slope, so each x register carries two
 *    consecutive scanlines;
 *  - double the slopes so one vadd.u64 advances both lanes by two scanlines.
 * The *_low / *_high names are presumably the D halves of the matching Q
 * registers; those aliases are defined outside this chunk. */
713 #define setup_spans_adjust_edges_alternate_no(left_index, right_index)         \
714   vmovl.s32 edge_shifts_64, edge_shifts;                                       \
715   vmovl.s32 edges_dx_dy_64, edges_dx_dy;                                       \
716                                                                                \
717   vshl.s64 edges_xy, edges_xy, edge_shifts_64;                                 \
718   vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64;                     \
719                                                                                \
720   vmov left_x_low, edges_xy_##left_index;                                      \
721   vmov right_x_low, edges_xy_##right_index;                                    \
722                                                                                \
723   vmov left_dx_dy_low, edges_dx_dy_64_##left_index;                            \
724   vmov left_dx_dy_high, edges_dx_dy_64_##left_index;                           \
725   vmov right_dx_dy_low, edges_dx_dy_64_##right_index;                          \
726   vmov right_dx_dy_high, edges_dx_dy_64_##right_index;                         \
727                                                                                \
728   vadd.u64 left_x_high, left_x_low, left_dx_dy_low;                            \
729   vadd.u64 right_x_high, right_x_low, right_dx_dy_low;                         \
730                                                                                \
731   vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy;                                 \
732   vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy                               \
733
734
/* Same as setup_spans_adjust_edges_alternate_no(), plus the stepping state
 * for the alternate (third) edge, whose values live in ARM registers.
 *
 *  - y_mid_point = y_b broadcast to every 16-bit lane (compared against
 *    y_x4 by setup_spans_y_select_up/down);
 *  - the 64-bit pair edge_alt_high:edge_alt_low is shifted left by
 *    edge_shift_alt by hand (temp = 32 - shift supplies the bits carried
 *    from the low into the high word) and moved into alternate_x_low;
 *  - edge_dx_dy_alt is widened to a shifted 64-bit value: the high word is
 *    an arithmetic shift right by temp, the low word a shift left by
 *    edge_shift_alt;
 *  - as for the left/right edges, the high x lane is low + slope and the
 *    slope is doubled afterwards.
 * edge_shift_alt was masked to 0..31 by the preceding edge-delta macro, so
 * temp is in 1..32; register-specified LSR/ASR by 32 yield 0 / sign fill. */
735 #define setup_spans_adjust_edges_alternate_yes(left_index, right_index)        \
736   setup_spans_adjust_edges_alternate_no(left_index, right_index);              \
737                                                                                \
738   vdup.u16 y_mid_point, y_b;                                                   \
739   rsb temp, edge_shift_alt, #32;                                               \
740                                                                                \
741   lsl edge_alt_high, edge_alt_high, edge_shift_alt;                            \
742   orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp;                    \
743   lsl edge_alt_low, edge_alt_low, edge_shift_alt;                              \
744   vmov alternate_x_low, edge_alt_low, edge_alt_high;                           \
745                                                                                \
746   asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp;                               \
747   lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt;                      \
748   vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high;           \
749   vmov alternate_dx_dy_high, alternate_dx_dy_low;                              \
750                                                                                \
751   vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low;             \
752   vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy                   \
753
754
/* Upward traversal: set every 16-bit lane of alternate_select to all ones
   where the scanline y in y_x4 is (signed) less than y_mid_point, i.e. on
   the rows where the alternate edge replaces the minor edge. */
755 #define setup_spans_y_select_up()                                              \
756   vclt.s16 alternate_select, y_x4, y_mid_point                                 \
757
/* Downward traversal: set every 16-bit lane of alternate_select to all ones
   where the scanline y in y_x4 is (signed) greater than y_mid_point, i.e. on
   the rows where the alternate edge replaces the minor edge. */
758 #define setup_spans_y_select_down()                                            \
759   vcgt.s16 alternate_select, y_x4, y_mid_point                                 \
760
761
/* Overwrite the left x values (left_right_x_16_low) with the alternate edge
   x values on the lanes flagged in alternate_select. */
762 #define setup_spans_alternate_select_left()                                    \
763   vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select               \
764
/* Overwrite the right x values (left_right_x_16_high) with the alternate
   edge x values on the lanes flagged in alternate_select. */
765 #define setup_spans_alternate_select_right()                                   \
766   vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select              \
767
768
/* Emit one group of four spans while three edges (left, right, alternate)
 * are live.
 *
 * alternate: left or right - pasted into setup_spans_alternate_select_<x>,
 *            chooses which side takes the alternate edge on flagged rows.
 * direction: up or down - pasted into the y_select, adjust_interpolants and
 *            adjust_y helpers.
 *
 * Output per group: four uvrg vectors (16 bytes each, post-incremented
 * span_uvrg_offset), four b words (post-incremented span_b_offset) and one
 * vst4.u16 of four 16-bit values per span to span_edge_data. The four
 * values presumably are left x, block count, right mask and y, matching the
 * edge_data_*_offset constants at the top of the file; this relies on the
 * left_right_x_16 / span_shifts_y register pairings defined outside this
 * chunk - TODO confirm.
 *
 * The vector edge maths is interleaved with the interpolant stores,
 * presumably for scheduling; keep the statement order when editing. */
769 #define setup_spans_set_x4_alternate_yes(alternate, direction)                 \
  /* Integer x (top 32 bits of each accumulator) for scanlines 0 and 1. */     \
770   vshrn.s64 alternate_x_32_low, alternate_x, #32;                              \
771   vshrn.s64 left_x_32_low, left_x, #32;                                        \
772   vshrn.s64 right_x_32_low, right_x, #32;                                      \
773                                                                                \
774   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
775   vadd.u64 left_x, left_x, left_dx_dy;                                         \
776   vadd.u64 right_x, right_x, right_dx_dy;                                      \
777                                                                                \
  /* Same for scanlines 2 and 3, then step on for the next group. */           \
778   vshrn.s64 alternate_x_32_high, alternate_x, #32;                             \
779   vshrn.s64 left_x_32_high, left_x, #32;                                       \
780   vshrn.s64 right_x_32_high, right_x, #32;                                     \
781                                                                                \
782   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
783   vadd.u64 left_x, left_x, left_dx_dy;                                         \
784   vadd.u64 right_x, right_x, right_dx_dy;                                      \
785                                                                                \
  /* Narrow to 16 bits and swap in the alternate edge on flagged rows. */      \
786   vmovn.u32 alternate_x_16, alternate_x_32;                                    \
787   setup_spans_y_select_##direction();                                          \
788   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
789                                                                                \
790   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
791   setup_spans_alternate_select_##alternate();                                  \
792                                                                                \
  /* Span 0 interpolants. */                                                   \
793   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
794   str b, [span_b_offset], #4;                                                  \
795   setup_spans_adjust_interpolants_##direction();                               \
796                                                                                \
  /* Clamp both x sets to the left viewport edge. */                           \
797   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
798                                                                                \
  /* Span 1 interpolants. */                                                   \
799   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
800   str b, [span_b_offset], #4;                                                  \
801   setup_spans_adjust_interpolants_##direction();                               \
802                                                                                \
  /* Clamp both x sets to the right viewport edge. */                          \
803   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
804                                                                                \
  /* Span 2 interpolants. */                                                   \
805   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
806   str b, [span_b_offset], #4;                                                  \
807   setup_spans_adjust_interpolants_##direction();                               \
808                                                                                \
  /* width = right - left; keep (width + 7) and its low three bits. */         \
809   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
810   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
811   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
812                                                                                \
  /* Span 3 interpolants. */                                                   \
813   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
814   str b, [span_b_offset], #4;                                                  \
815   setup_spans_adjust_interpolants_##direction();                               \
816                                                                                \
  /* Blocks = (width + 7) >> 3 capped at MAX_BLOCKS_PER_ROW; the mask is */    \
  /* c_0xFFFE shifted left by (width + 7) & 7 in each lane. */                 \
817   vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW;                                \
818   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
819   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
820   vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks;         \
821                                                                                \
  /* Interleaved store: one 4 x u16 record per span. */                        \
822   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
823                                                                                \
824   setup_spans_adjust_y_##direction()                                           \
825
826
/* Emit one group of four spans when only the left and right edges are live.
 *
 * alternate: unused by this variant (callers pass none); it exists so both
 *            set_x4 variants share one signature for token pasting.
 * direction: up or down - pasted into the adjust_interpolants and adjust_y
 *            helpers.
 *
 * Output per group: four uvrg vectors (16 bytes each), four b words and one
 * vst4.u16 of four 16-bit values per span to span_edge_data. The four
 * values presumably are left x, block count, right mask and y, matching the
 * edge_data_*_offset constants at the top of the file; the register
 * pairings involved are defined outside this chunk - TODO confirm.
 *
 * The vector edge maths is interleaved with the interpolant stores,
 * presumably for scheduling; keep the statement order when editing. */
827 #define setup_spans_set_x4_alternate_no(alternate, direction)                  \
  /* Integer x (top 32 bits of each accumulator) for scanlines 0 and 1. */     \
828   vshrn.s64 left_x_32_low, left_x, #32;                                        \
829   vshrn.s64 right_x_32_low, right_x, #32;                                      \
830                                                                                \
831   vadd.u64 left_x, left_x, left_dx_dy;                                         \
832   vadd.u64 right_x, right_x, right_dx_dy;                                      \
833                                                                                \
  /* Same for scanlines 2 and 3, then step on for the next group. */           \
834   vshrn.s64 left_x_32_high, left_x, #32;                                       \
835   vshrn.s64 right_x_32_high, right_x, #32;                                     \
836                                                                                \
837   vadd.u64 left_x, left_x, left_dx_dy;                                         \
838   vadd.u64 right_x, right_x, right_dx_dy;                                      \
839                                                                                \
840   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
841   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
842                                                                                \
  /* Span 0 interpolants. */                                                   \
843   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
844   str b, [span_b_offset], #4;                                                  \
845   setup_spans_adjust_interpolants_##direction();                               \
846                                                                                \
  /* Clamp both x sets to the left viewport edge. */                           \
847   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
848                                                                                \
  /* Span 1 interpolants. */                                                   \
849   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
850   str b, [span_b_offset], #4;                                                  \
851   setup_spans_adjust_interpolants_##direction();                               \
852                                                                                \
  /* Clamp both x sets to the right viewport edge. */                          \
853   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
854                                                                                \
  /* Span 2 interpolants. */                                                   \
855   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
856   str b, [span_b_offset], #4;                                                  \
857   setup_spans_adjust_interpolants_##direction();                               \
858                                                                                \
  /* width = right - left; keep (width + 7) and its low three bits. */         \
859   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
860   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
861   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
862                                                                                \
  /* Span 3 interpolants. */                                                   \
863   vst1.u32 { uvrg }, [span_uvrg_offset, :128]!;                                \
864   str b, [span_b_offset], #4;                                                  \
865   setup_spans_adjust_interpolants_##direction();                               \
866                                                                                \
  /* Blocks = (width + 7) >> 3 capped at MAX_BLOCKS_PER_ROW; the mask is */    \
  /* c_0xFFFE shifted left by (width + 7) & 7 in each lane. */                 \
867   vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW;                                \
868   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
869   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
870   vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks;         \
871                                                                                \
  /* Interleaved store: one 4 x u16 record per span. */                        \
872   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
873                                                                                \
874   setup_spans_adjust_y_##direction()                                           \
875
876
/* Scratch ARM register pair receiving the 64-bit product below. */
877 #define edge_adjust_low           r11
878 #define edge_adjust_high          r12
879
/* Alternate edge is live: subtract the signed 64-bit product
   edge_dx_dy_alt * height_minor_a from the 64-bit accumulator
   edge_alt_high:edge_alt_low (subs/sbc propagate the borrow). */
880 #define setup_spans_alternate_adjust_yes()                                     \
881   smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a;     \
882   subs edge_alt_low, edge_alt_low, edge_adjust_low;                            \
883   sbc edge_alt_high, edge_alt_high, edge_adjust_high                           \
884
/* No alternate edge: expands to nothing. */
885 #define setup_spans_alternate_adjust_no()                                      \
886
887
/* Generate the span list for a triangle part walked from y_a downwards.
 *
 * left_index / right_index: edge suffixes forwarded to
 *                           setup_spans_adjust_edges_alternate_*.
 * alternate:                left, right or none - side replaced by the
 *                           alternate edge (forwarded to set_x4).
 * alternate_active:         yes or no - selects the three-edge or two-edge
 *                           helper variants by token pasting.
 *
 * Expects height, y_a, y_c and the edge state from compute_edge_delta_*.
 * Writes psx_gpu->num_spans (16-bit) and the span arrays. Uses local labels
 * 0, 1 and 2; falls through at 1 into whatever follows the expansion.
 * setup_spans_load_b() and setup_spans_prologue_b() are defined outside
 * this chunk. */
888 #define setup_spans_down(left_index, right_index, alternate, alternate_active) \
889   setup_spans_alternate_adjust_##alternate_active();                           \
890   setup_spans_load_b();                                                        \
891                                                                                \
  /* Bottom clip: if y_c is below viewport_end_y, height becomes */            \
  /* height - (y_c - viewport_end_y) + 1. */                                   \
892   ldrsh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset];                       \
893   subs y_c, y_c, temp;                                                         \
894   subgt height, height, y_c;                                                   \
895   addgt height, height, #1;                                                    \
896                                                                                \
  /* Top clip: clip = rows of the triangle above viewport_start_y. */          \
897   ldrsh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset];                     \
898   subs clip, temp, y_a;                                                        \
899   ble 0f;                                                                      \
900                                                                                \
901   sub height, height, clip;                                                    \
902   add y_a, y_a, clip;                                                          \
903   setup_spans_clip(increment, alternate_active);                               \
904                                                                                \
905  0:                                                                            \
  /* Nothing left to draw. */                                                  \
906   cmp height, #0;                                                              \
907   ble 1f;                                                                      \
908                                                                                \
  /* Pack y, y+1 into temp and y+2, y+3 into y_a as 16-bit halves, then */     \
  /* move both into y_x4. The cmp/movgt pair clamps height to 512; the */      \
  /* add between them does not touch the flags. */                             \
909   orr temp, y_a, y_a, lsl #16;                                                 \
910   cmp height, #512;                                                            \
911   add temp, temp, #(1 << 16);                                                  \
912   movgt height, #512;                                                          \
913   add y_a, temp, #2;                                                           \
914   add y_a, y_a, #(2 << 16);                                                    \
915   vmov y_x4, temp, y_a;                                                        \
916                                                                                \
917   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
918    right_index);                                                               \
919   setup_spans_prologue_b();                                                    \
920                                                                                \
921   strh height, [psx_gpu, #psx_gpu_num_spans_offset];                           \
922                                                                                \
  /* Four spans per pass; loops while more than four remained (unsigned). */   \
923  2:                                                                            \
924   setup_spans_set_x4_alternate_##alternate_active(alternate, down);            \
925   subs height, height, #4;                                                     \
926   bhi 2b;                                                                      \
927                                                                                \
928  1:                                                                            \
929
930
/* Alternate edge is live: add the sign-extended 32-bit slope edge_dx_dy_alt
   once to the 64-bit accumulator edge_alt_high:edge_alt_low. The high word
   adds the slope's sign (asr #31) plus the carry from the low word. */
931 #define setup_spans_alternate_pre_increment_yes()                              \
932   adds edge_alt_low, edge_alt_low, edge_dx_dy_alt;                             \
933   adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31                    \
934
/* No alternate edge: expands to nothing. */
935 #define setup_spans_alternate_pre_increment_no()                               \
936
937
/* Alternate edge is live: drop one row from height when the preceding
   flag-setting subtraction was <= 0. In setup_spans_up() that subtraction is
   viewport_start_y - y_c, so this fires when the top was not clipped. */
938 #define setup_spans_up_decrement_yes()                                         \
939   suble height, height, #1                                                     \
940
/* No alternate edge: expands to nothing. */
941 #define setup_spans_up_decrement_no()                                          \
942
943
/* Generate the span list for a triangle part walked from y_a - 1 upwards.
 *
 * left_index / right_index: edge suffixes forwarded to
 *                           setup_spans_adjust_edges_alternate_*.
 * alternate:                left, right or none - side replaced by the
 *                           alternate edge (forwarded to set_x4).
 * alternate_active:         yes or no - selects the three-edge or two-edge
 *                           helper variants by token pasting.
 *
 * Expects height, y_a, y_c and the edge state from compute_edge_delta_*.
 * Writes psx_gpu->num_spans (16-bit) and the span arrays. Uses local labels
 * 0, 1 and 2; falls through at 1 into whatever follows the expansion.
 * NOTE(review): the viewport bounds are loaded unsigned (ldrh) here while
 * setup_spans_down() uses ldrsh - confirm the bounds are never negative. */
944 #define setup_spans_up(left_index, right_index, alternate, alternate_active)   \
945   setup_spans_alternate_adjust_##alternate_active();                           \
946   setup_spans_load_b();                                                        \
947   sub y_a, y_a, #1;                                                            \
948                                                                                \
  /* Top clip: remove the rows above viewport_start_y. The flags of this */    \
  /* subs are also consumed by setup_spans_up_decrement_yes(). */              \
949   ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset];                      \
950   subs temp, temp, y_c;                                                        \
951   subgt height, height, temp;                                                  \
952   setup_spans_up_decrement_##alternate_active();                               \
953                                                                                \
  /* Bottom clip: clip = rows of the triangle below viewport_end_y. */         \
954   ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset];                        \
955   subs clip, y_a, temp;                                                        \
956   ble 0f;                                                                      \
957                                                                                \
958   sub height, height, clip;                                                    \
959   sub y_a, y_a, clip;                                                          \
960   setup_spans_clip(decrement, alternate_active);                               \
961                                                                                \
962  0:                                                                            \
  /* Nothing left to draw. */                                                  \
963   cmp height, #0;                                                              \
964   ble 1f;                                                                      \
965                                                                                \
  /* Pack y, y-1 into temp and y-2, y-3 into y_a as 16-bit halves, then */     \
  /* move both into y_x4. The cmp/movgt pair clamps height to 512; the */      \
  /* sub between them does not touch the flags. */                             \
966   orr temp, y_a, y_a, lsl #16;                                                 \
967   cmp height, #512;                                                            \
968   sub temp, temp, #(1 << 16);                                                  \
969   movgt height, #512;                                                          \
970   sub y_a, temp, #2;                                                           \
971   sub y_a, y_a, #(2 << 16);                                                    \
972   vmov y_x4, temp, y_a;                                                        \
973                                                                                \
  /* Advance both edge accumulators by one sign-extended slope step. */        \
974   vaddw.s32 edges_xy, edges_xy, edges_dx_dy;                                   \
975                                                                                \
976   setup_spans_alternate_pre_increment_##alternate_active();                    \
977   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
978    right_index);                                                               \
979   setup_spans_adjust_interpolants_up();                                        \
980   setup_spans_prologue_b();                                                    \
981                                                                                \
982   strh height, [psx_gpu, #psx_gpu_num_spans_offset];                           \
983                                                                                \
  /* Four spans per pass; loops while more than four remained (unsigned). */   \
984  2:                                                                            \
985   setup_spans_set_x4_alternate_##alternate_active(alternate, up);              \
986   subs height, height, #4;                                                     \
987   bhi 2b;                                                                      \
988                                                                                \
989  1:                                                                            \
990
991
/* Common function exit: expand restore_abi_regs() (empty unless the "#if 0"
   near the top of the file is enabled) and pop r4-r11 plus the return
   address straight into pc. This presumably mirrors a push of r4-r11, lr in
   setup_spans_prologue(), which is defined outside this chunk. */
992 #define setup_spans_epilogue()                                                 \
993   restore_abi_regs();                                                          \
994   ldmia sp!, { r4 - r11, pc }                                                  \
995
996
/* Full function body for a triangle traversed upwards from y_a through y_b
 * to y_c, with a third (alternate) edge taking over past y_b.
 *
 * minor: left or right - side carrying the short edge, which is also the
 *        side replaced by the alternate edge.
 * major: the opposite side.
 *
 * Computes the row counts (height_minor_a = y_a - y_b, height_minor_b =
 * y_b - y_c, height = y_a - y_c), loads the edge end points, derives the
 * three slopes with compute_edge_delta_x3() and emits the spans with
 * setup_spans_up(). The delta macro is passed height_major while this code
 * sets height; presumably both names alias one register - TODO confirm. */
997 #define setup_spans_up_up(minor, major)                                        \
998   setup_spans_prologue();                                                      \
999   sub height_minor_a, y_a, y_b;                                                \
1000   sub height_minor_b, y_b, y_c;                                                \
1001   sub height, y_a, y_c;                                                        \
1002                                                                                \
1003   vdup.u32 x_starts, x_a;                                                      \
1004   vmov x_ends, x_c, x_b;                                                       \
1005                                                                                \
1006   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1007   setup_spans_up(major, minor, minor, yes);                                    \
1008   setup_spans_epilogue()                                                       \
1009
/* Entry point: upward three-edge traversal with the minor edge on the left.
   function() is defined outside this chunk (presumably emits the label). */
1010 function(setup_spans_up_left)
1011   setup_spans_up_up(left, right)
1012
/* Entry point: upward three-edge traversal with the minor edge on the right.
   function() is defined outside this chunk (presumably emits the label). */
1013 function(setup_spans_up_right)
1014   setup_spans_up_up(right, left)
1015
/* Full function body for a triangle traversed downwards from y_a through
 * y_b to y_c, with a third (alternate) edge taking over past y_b.
 *
 * minor: left or right - side carrying the short edge, which is also the
 *        side replaced by the alternate edge.
 * major: the opposite side.
 *
 * Computes the row counts (height_minor_a = y_b - y_a, height_minor_b =
 * y_c - y_b, height = y_c - y_a), loads the edge end points, derives the
 * three slopes with compute_edge_delta_x3() and emits the spans with
 * setup_spans_down(). The delta macro is passed height_major while this
 * code sets height; presumably both names alias one register - TODO
 * confirm. */
1016 #define setup_spans_down_down(minor, major)                                    \
1017   setup_spans_prologue();                                                      \
1018   sub height_minor_a, y_b, y_a;                                                \
1019   sub height_minor_b, y_c, y_b;                                                \
1020   sub height, y_c, y_a;                                                        \
1021                                                                                \
1022   vdup.u32 x_starts, x_a;                                                      \
1023   vmov x_ends, x_c, x_b;                                                       \
1024                                                                                \
1025   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1026   setup_spans_down(major, minor, minor, yes);                                  \
1027   setup_spans_epilogue()                                                       \
1028
/* Entry point: downward three-edge traversal with the minor edge on the
   left. function() is defined outside this chunk. */
1029 function(setup_spans_down_left)
1030   setup_spans_down_down(left, right)
1031
/* Entry point: downward three-edge traversal with the minor edge on the
   right. function() is defined outside this chunk. */
1032 function(setup_spans_down_right)
1033   setup_spans_down_down(right, left)
1034
1035
/* Shared tail for the two-edge upward cases: height = y_a - y_c, derive the
   two slopes with compute_edge_delta_x2(), emit the spans upwards without an
   alternate edge and return. The caller must already have expanded
   setup_spans_prologue() and loaded x_starts / x_ends. */
1036 #define setup_spans_up_flat()                                                  \
1037   sub height, y_a, y_c;                                                        \
1038                                                                                \
1039   compute_edge_delta_x2();                                                     \
1040   setup_spans_up(left, right, none, no);                                       \
1041   setup_spans_epilogue()                                                       \
1042
/* Entry point: two-edge upward traversal whose edges start at two different
   x positions (x_a, x_b) and both end at x_c. */
1043 function(setup_spans_up_a)
1044   setup_spans_prologue()
1045
1046   vmov x_starts, x_a, x_b
1047   vdup.u32 x_ends, x_c
1048
1049   setup_spans_up_flat()
1050
/* Entry point: two-edge upward traversal whose edges both start at x_a and
   end at two different x positions (x_b, x_c). */
1051 function(setup_spans_up_b)
1052   setup_spans_prologue()
1053
1054   vdup.u32 x_starts, x_a
1055   vmov x_ends, x_b, x_c
1056
1057   setup_spans_up_flat()
1058
/* Shared tail for the two-edge downward cases: height = y_c - y_a, derive
   the two slopes with compute_edge_delta_x2(), emit the spans downwards
   without an alternate edge and return. The caller must already have
   expanded setup_spans_prologue() and loaded x_starts / x_ends. */
1059 #define setup_spans_down_flat()                                                \
1060   sub height, y_c, y_a;                                                        \
1061                                                                                \
1062   compute_edge_delta_x2();                                                     \
1063   setup_spans_down(left, right, none, no);                                     \
1064   setup_spans_epilogue()                                                       \
1065
/* Entry point: two-edge downward traversal whose edges start at two
   different x positions (x_a, x_b) and both end at x_c. */
1066 function(setup_spans_down_a)
1067   setup_spans_prologue()
1068
1069   vmov x_starts, x_a, x_b
1070   vdup.u32 x_ends, x_c
1071
1072   setup_spans_down_flat()
1073
/* Entry point: two-edge downward traversal whose edges both start at x_a
   and end at two different x positions (x_b, x_c). */
1074 function(setup_spans_down_b)
1075   setup_spans_prologue()
1076
1077   vdup.u32 x_starts, x_a
1078   vmov x_ends, x_b, x_c
1079
1080   setup_spans_down_flat()
1081
1082
/* Extra register aliases used by setup_spans_up_down.
 *
 * middle_y keeps the starting scanline (y_a) across the upward pass.
 * The "_b" names hold the second edge set, used for the downward pass:
 * q11 = d22:d23 (edges_xy_b_left / _right) and q13 = d26:d27
 * (edges_dx_dy_b / edge_shifts_b), so each Q alias moves both of its D
 * halves at once. edges_dx_dy_and_shifts (q1 = d2:d3) presumably covers
 * edges_dx_dy and edge_shifts in the same way; those two aliases are
 * defined outside this chunk - TODO confirm. */
1083 #define middle_y                                          r9
1084
1085 #define edges_xy_b                                        q11
1086 #define edges_dx_dy_b                                     d26
1087 #define edge_shifts_b                                     d27
1088 #define edges_dx_dy_and_shifts_b                          q13
1089 #define height_increment                                  d20
1090
1091 #define edges_dx_dy_and_shifts                            q1
1092
1093 #define edges_xy_b_left                                   d22
1094 #define edges_xy_b_right                                  d23
1095
/* Make the saved second edge set current: copy edges_xy_b into edges_xy and
   the combined slope/shift register edges_dx_dy_and_shifts_b into
   edges_dx_dy_and_shifts. */
1096 #define setup_spans_up_down_load_edge_set_b()                                  \
1097   vmov edges_xy, edges_xy_b;                                                   \
1098   vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b                        \
1099
1100
// Entry point for a triangle whose starting vertex a lies vertically between
// b (above) and c (below): spans are first generated upwards from y_a - 1
// towards y_b, then downwards from y_a towards y_c.
//
// Two edge sets are prepared up front. The first one drives the upward pass;
// the second one (the "_b" registers) is loaded afterwards for the downward
// pass. psx_gpu->num_spans receives the upward count first and the sum of
// both passes later.
//
// Local labels: 0 = after clipping (one per pass), 2 = emit loops (one per
// pass), 4 = start of the downward pass, 3 = upward pass skipped,
// 5 = span-count corner case, 1 = exit.
//
// NOTE(review): unlike setup_spans_up/down there is no clamp of the row
// counts to 512 here, and the viewport bounds are loaded with ldrh while
// setup_spans_down uses ldrsh - confirm both are intended.
1101 function(setup_spans_up_down)
1102   setup_spans_prologue()
1103
1104   // s32 middle_y = y_a;
1105   sub height_minor_a, y_a, y_b
1106   sub height_minor_b, y_c, y_a
1107   sub height_major, y_c, y_b
1108
1109   vmov x_starts, x_a, x_c
1110   vdup.u32 x_ends, x_b
1111
1112   compute_edge_delta_x3(x_a, height_minor_a, height_major)
1113
  // Advance only the second edge accumulator (lane 1) by height_minor_b
  // slope steps; lane 0 is multiplied by zero.
1114   mov temp, #0
1115   vmov height_increment, temp, height_minor_b
1116   vmlal.s32 edges_xy, edges_dx_dy, height_increment
1117
  // Build edge set b: its left x is the alternate edge accumulator, its
  // right x is the edge just advanced above.
1118   vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1119   vmov edges_xy_b_right, edges_xy_right
1120
  // Shifts for set b: same as set a except lane 0, which takes the
  // alternate edge shift.
1121   vmov edge_shifts_b, edge_shifts
1122   vmov.u32 edge_shifts_b[0], edge_shift_alt
1123
  // Slopes for set b: negated set a slopes, with lane 0 replaced by the
  // alternate edge slope.
1124   vneg.s32 edges_dx_dy_b, edges_dx_dy
1125   vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1126
1127   mov middle_y, y_a
1128   
1129   setup_spans_load_b()
1130   sub y_a, y_a, #1
1131
  // Upward pass, top clip: remove the rows above viewport_start_y.
1132   ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
1133   subs temp, temp, y_b
1134   subgt height_minor_a, height_minor_a, temp
1135
  // Upward pass, bottom clip: skip the rows below viewport_end_y.
1136   ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
1137   subs clip, y_a, temp
1138   ble 0f
1139
1140   sub height_minor_a, height_minor_a, clip
1141   sub y_a, y_a, clip
1142   setup_spans_clip(decrement, no)
1143
1144  0:                                                                
  // No visible rows in the upper part: go load edge set b and continue
  // with the downward pass.
1145   cmp height_minor_a, #0
1146   ble 3f
1147
  // y_x4 = { y, y-1, y-2, y-3 } as 16-bit lanes.
1148   orr temp, y_a, y_a, lsl #16
1149   sub temp, temp, #(1 << 16)
1150   sub y_a, temp, #2
1151   sub y_a, y_a, #(2 << 16)
1152   vmov y_x4, temp, y_a
1153
1154   vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1155
1156   strh height_minor_a, [psx_gpu, #psx_gpu_num_spans_offset]
1157
  // Convert set a into left/right stepping state, then reload the shared
  // edge registers with set b for the downward pass.
1158   setup_spans_adjust_edges_alternate_no(left, right); 
1159   setup_spans_adjust_interpolants_up()
1160   setup_spans_up_down_load_edge_set_b()
1161
1162   setup_spans_prologue_b()
1163
1164
  // Upward emit loop: four spans per pass.
1165  2: 
1166   setup_spans_set_x4_alternate_no(none, up)
1167   subs height_minor_a, height_minor_a, #4
1168   bhi 2b
1169
  // height_minor_a is now in -3..0: rewind the three output pointers over
  // the surplus spans of the last group (8, 16 and 4 bytes per span).
1170   add span_edge_data, span_edge_data, height_minor_a, lsl #3
1171   add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1172   add span_b_offset, span_b_offset, height_minor_a, lsl #2
1173
  // Downward pass: restart the interpolants from the values stored in the
  // psx_gpu structure and from the middle scanline.
1174  4:
1175   add temp, psx_gpu, #psx_gpu_uvrg_offset
1176   vld1.32 { uvrg }, [temp]
1177   mov y_a, middle_y
1178   
1179   setup_spans_load_b()
1180
  // Bottom clip: if y_c is below viewport_end_y, the row count becomes
  // height_minor_b - (y_c - viewport_end_y) + 1.
1181   ldrh temp, [psx_gpu, #psx_gpu_viewport_end_y_offset]
1182   subs y_c, y_c, temp
1183   subgt height_minor_b, height_minor_b, y_c
1184   addgt height_minor_b, height_minor_b, #1
1185
  // Top clip: skip the rows above viewport_start_y.
1186   ldrh temp, [psx_gpu, #psx_gpu_viewport_start_y_offset]
1187   subs clip, temp, y_a
1188   ble 0f
1189
1190   sub height_minor_b, height_minor_b, clip
1191   add y_a, y_a, clip
1192   setup_spans_clip(increment, no)
1193
1194  0:
1195   cmp height_minor_b, #0
1196   ble 1f
1197
  // y_x4 = { y, y+1, y+2, y+3 } as 16-bit lanes.
1198   orr temp, y_a, y_a, lsl #16
1199   add temp, temp, #(1 << 16) 
1200   add y_a, temp, #2
1201   add y_a, y_a, #(2 << 16)
1202   vmov y_x4, temp, y_a
1203
1204   setup_spans_adjust_edges_alternate_no(left, right)
1205
  // Total span count = count already stored + rows of the downward pass.
  // NOTE(review): when the upward pass was skipped (label 3) nothing was
  // stored by this function, so this relies on the caller having
  // initialised num_spans - TODO confirm.
1206   ldrh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1207   add temp, temp, height_minor_b
1208
  // Only the exact value MAX_SPANS is diverted to the corner-case path.
1209   cmp temp, #MAX_SPANS
1210   beq 5f
1211
1212   strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1213
  // Downward emit loop: four spans per pass.
1214  2:                                                     
1215   setup_spans_set_x4_alternate_no(none, down)
1216   subs height_minor_b, height_minor_b, #4
1217   bhi 2b
1218
1219  1:
1220   setup_spans_epilogue()
1221
  // Upward pass produced no spans: make edge set b current and run the
  // second prologue before joining the downward pass.
1222  3:
1223   setup_spans_up_down_load_edge_set_b()
1224   setup_spans_prologue_b()
1225   bal 4b
1226
1227  5:
1228   // FIXME: overflow corner case
  // Total hit MAX_SPANS: round the downward row count down to a multiple of
  // four and store the reduced total. The flags set by bics survive the add
  // and strh, so bne re-enters the downward loop only if rows remain.
1229   sub temp, temp, height_minor_b
1230   bics height_minor_b, #3
1231   add temp, temp, height_minor_b
1232   strh temp, [psx_gpu, #psx_gpu_num_spans_offset]
1233   bne 2b
1234   bal 1b
1235
/* Retire the register names used by the setup_spans code above and bind a
 * new set of ARM register aliases for the code that follows (not visible in
 * this chunk).
 *
 * Several names below map onto the same register (for example r2 is
 * span_uvrg_offset, uvrg_dx_ptr and color; r10 is dither_offset_ptr,
 * dither_row, block_ptr_b and block_span_ptr), so names sharing a register
 * presumably are never live at the same time - verify against each user. */
1236 #undef span_uvrg_offset
1237 #undef span_edge_data
1238 #undef span_b_offset
1239 #undef left_x
1240 #undef b
1241
1242 #define psx_gpu                                  r0
1243 #define num_spans                                r1
1244 #define span_uvrg_offset                         r2
1245 #define span_edge_data                           r3
1246 #define span_b_offset                            r4
1247 #define b_dx                                     r5
1248 #define span_num_blocks                          r6
1249 #define y                                        r7
1250 #define left_x                                   r8
1251 #define b                                        r9
1252 #define dither_offset_ptr                        r10
1253 #define block_ptr_a                              r11
1254 #define fb_ptr                                   r12
1255 #define num_blocks                               r14
1256
1257 #define uvrg_dx_ptr                              r2
1258 #define texture_mask_ptr                         r3
1259 #define dither_shift                             r8
1260 #define dither_row                               r10
1261
1262 #define c_32                                     r7
1263 #define b_dx4                                    r8
1264 #define b_dx8                                    r9
1265 #define block_ptr_b                              r10
1266
1267 #define block_span_ptr                           r10
1268 #define right_mask                               r8
1269
1270 #define color                                    r2
1271 #define color_r                                  r3
1272 #define color_g                                  r4
1273 #define color_b                                  r5
1274
1275 #undef uvrg
1276
1277 #define u_block                                  q0
1278 #define v_block                                  q1
1279 #define r_block                                  q2
1280 #define g_block                                  q3
1281 #define b_block                                  q4
1282
1283 #define uv_dx4                                   d10
1284 #define rg_dx4                                   d11
1285 #define uv_dx8                                   d12
1286 #define rg_dx8                                   d13
1287 #define b_whole_8                                d14
1288 #define fb_mask_ptrs                             d15
1289
1290 #define uvrg_dx4                                 q5
1291 #define uvrg_dx8                                 q6
1292 #define uv_dx8                                   d12
1293 #define rg_dx8                                   d13
1294
1295 #define u_whole                                  q8
1296 #define v_whole                                  q9
1297 #define r_whole                                  q10
1298 #define g_whole                                  q11
1299 #define b_whole                                  q12
1300
1301 #define u_whole_low                              d16
1302 #define u_whole_high                             d17
1303 #define v_whole_low                              d18
1304 #define v_whole_high                             d19
1305 #define r_whole_low                              d20
1306 #define r_whole_high                             d21
1307 #define g_whole_low                              d22
1308 #define g_whole_high                             d23
1309 #define b_whole_low                              d24
1310 #define b_whole_high                             d25
1311
1312 #define dx4                                      q13
1313 #define dx8                                      q13
1314
1315 #define u_whole_8                                d26
1316 #define v_whole_8                                d27
1317 #define u_whole_8b                               d24
1318 #define r_whole_8                                d24
1319 #define g_whole_8                                d25
1320
1321 #define uv_whole_8                               q13
1322 #define uv_whole_8b                              q14
1323
1324 #define dither_offsets                           q14
1325 #define texture_mask                             q15
1326 #define texture_mask_u                           d30
1327 #define texture_mask_v                           d31
1328
1329 #define dither_offsets_short                     d28
1330
1331 #define v_left_x                                 q8
1332 #define uvrg                                     q9
1333 #define block_span                               q10
1334
1335 #define uv                                       d18
1336 #define rg                                       d19
1337
1338 #define draw_mask                                q1
1339 #define draw_mask_edge                           q13
1340 #define test_mask                                q0
1341
1342 #define uvrg_dx                                  q3
1343
1344 #define colors                                   q2
1345 /* Swizzled variant: exchanges the high nibble of u with the low nibble of v. */
1346 #define setup_blocks_texture_swizzled()                                        \
1347   vand.u8 u_whole_8b, u_whole_8, texture_mask_u; /* copy of masked u */        \
1348   vsli.u8 u_whole_8, v_whole_8, #4; /* u = (v << 4) | (u & 0x0f) */            \
1349   vsri.u8 v_whole_8, u_whole_8b, #4 /* v = (v & 0xf0) | (u >> 4) */            \
1350
1351 #define setup_blocks_texture_unswizzled() /* expands to nothing */             \
1352
1353 /* Generator for setup_blocks_shaded_textured_dithered_<swizzling>_indirect. */
1354 #define setup_blocks_shaded_textured_builder(swizzling)                        \
1355 .align 3;                                                                      \
1356   /* Expands each span into 64-byte block records of 8 u/v/r/g/b samples. */   \
1357 function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
1358   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
1359   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1360                                                                                \
1361   vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128];                                   \
1362   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1363   /* Nothing queued: return before touching the stack. */                      \
1364   cmp num_spans, #0;                                                           \
1365   bxeq lr;                                                                     \
1366   /* NOTE(review): save_abi_regs() is empty, yet q4-q7 are written below. */   \
1367   stmdb sp!, { r4 - r11, r14 };                                                \
1368   save_abi_regs();                                                             \
1369   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1370   /* uvrg_dx4 and uvrg_dx8 hold uvrg_dx scaled by 4 and by 8. */               \
1371   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
1372   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1373   /* Broadcast the two mask bytes to every lane of d30 and d31. */             \
1374   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16];     \
1375   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1376                                                                                \
1377   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1378   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1379                                                                                \
1380   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
1381   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1382   /* block_ptr_a = blocks + num_blocks * 64 (64 bytes per block). */           \
1383   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1384   /* Per-span loop. */                                                         \
1385  0:                                                                            \
1386   vmov.u8 fb_mask_ptrs, #0;                                                    \
1387                                                                                \
1388   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
1389   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1390                                                                                \
1391   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
1392   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
1393   /* A span with no blocks only advances the span pointers (label 1). */       \
1394   cmp span_num_blocks, #0;                                                     \
1395   beq 1f;                                                                      \
1396                                                                                \
1397   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
1398   add num_blocks, span_num_blocks, num_blocks;                                 \
1399   /* More than MAX_BLOCKS queued: flush at 2, which resumes at 3. */           \
1400   cmp num_blocks, #MAX_BLOCKS;                                                 \
1401   bgt 2f;                                                                      \
1402                                                                                \
1403  3:                                                                            \
1404   ldr b, [span_b_offset];                                                      \
1405   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1406   /* Each y step is 2048 bytes (y << 11); the x offset is left_x * 2. */       \
1407   vdup.u32 v_left_x, left_x;                                                   \
1408   and y, y, #0x3;                                                              \
1409                                                                                \
1410   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
1411   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1412   /* b += b_dx * left_x; dither_shift becomes (left_x & 3) * 8. */             \
1413   mla b, b_dx, left_x, b;                                                      \
1414   and dither_shift, left_x, #0x03;                                             \
1415   /* uvrg_dx (q3) doubles as g_block, so rebuild it from uvrg_dx4. */          \
1416   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
1417   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1418                                                                                \
1419   mov dither_shift, dither_shift, lsl #3;                                      \
1420   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1421   /* c_32 reuses r7 (y is dead). The subs flags feed the beq 5f below. */      \
1422   mov c_32, #32;                                                               \
1423   subs span_num_blocks, span_num_blocks, #1;                                   \
1424   /* Rotate so that byte (left_x & 3) of the dither row lands in byte 0. */    \
1425   mov dither_row, dither_row, ror dither_shift;                                \
1426   mov b_dx4, b_dx, lsl #2;                                                     \
1427                                                                                \
1428   vdup.u32 dither_offsets_short, dither_row;                                   \
1429   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1430   /* dither_offsets = sign-extended dither bytes << 4, in 16-bit lanes. */     \
1431   vdup.u32 b_block, b;                                                         \
1432   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1433                                                                                \
1434   vdup.u32 u_block, uv[0];                                                     \
1435   mov b_dx8, b_dx, lsl #3;                                                     \
1436                                                                                \
1437   vdup.u32 v_block, uv[1];                                                     \
1438   vdup.u32 r_block, rg[0];                                                     \
1439   vdup.u32 g_block, rg[1];                                                     \
1440   /* Add five consecutive block_span vectors, in u, v, r, g, b order. */       \
1441   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1442                                                                                \
1443   vadd.u32 u_block, u_block, block_span;                                       \
1444   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1445                                                                                \
1446   vadd.u32 v_block, v_block, block_span;                                       \
1447   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1448                                                                                \
1449   vadd.u32 r_block, r_block, block_span;                                       \
1450   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1451                                                                                \
1452   vadd.u32 g_block, g_block, block_span;                                       \
1453   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
1454                                                                                \
1455   vadd.u32 b_block, b_block, block_span;                                       \
1456   add block_ptr_b, block_ptr_a, #16;                                           \
1457   /* *_whole_low = block >> 16; *_whole_high = (block + dx4) >> 16. */         \
1458   vshrn.u32 u_whole_low, u_block, #16;                                         \
1459   vshrn.u32 v_whole_low, v_block, #16;                                         \
1460   vshrn.u32 r_whole_low, r_block, #16;                                         \
1461   vshrn.u32 g_whole_low, g_block, #16;                                         \
1462                                                                                \
1463   vdup.u32 dx4, uv_dx4[0];                                                     \
1464   vshrn.u32 b_whole_low, b_block, #16;                                         \
1465                                                                                \
1466   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1467   vdup.u32 dx4, uv_dx4[1];                                                     \
1468                                                                                \
1469   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1470   vdup.u32 dx4, rg_dx4[0];                                                     \
1471                                                                                \
1472   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1473   vdup.u32 dx4, rg_dx4[1];                                                     \
1474                                                                                \
1475   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1476   vdup.u32 dx4, b_dx4;                                                         \
1477                                                                                \
1478   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1479   vdup.u32 dx8, uv_dx8[0];                                                     \
1480   /* Advance all five accumulators by their dx8 step for the next block. */    \
1481   vadd.u32 u_block, u_block, dx8;                                              \
1482   vdup.u32 dx8, uv_dx8[1];                                                     \
1483                                                                                \
1484   vadd.u32 v_block, v_block, dx8;                                              \
1485   vdup.u32 dx8, rg_dx8[0];                                                     \
1486                                                                                \
1487   vadd.u32 r_block, r_block, dx8;                                              \
1488   vdup.u32 dx8, rg_dx8[1];                                                     \
1489                                                                                \
1490   vadd.u32 g_block, g_block, dx8;                                              \
1491   vdup.u32 dx8, b_dx8;                                                         \
1492                                                                                \
1493   vadd.u32 b_block, b_block, dx8;                                              \
1494   vmovn.u16 u_whole_8, u_whole;                                                \
1495                                                                                \
1496   vmovn.u16 v_whole_8, v_whole;                                                \
1497   /* Narrow b first: r_whole_8 / g_whole_8 (d24 / d25) overlay b_whole. */     \
1498   vmovn.u16 b_whole_8, b_whole;                                                \
1499   pld [fb_ptr];                                                                \
1500   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1501   /* AND u and v with texture_mask, then apply the swizzle variant. */         \
1502   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1503   setup_blocks_texture_##swizzling();                                          \
1504   /* Z is still from the subs above: set means a single-block span. */         \
1505   vmovn.u16 r_whole_8, r_whole;                                                \
1506   beq 5f;                                                                      \
1507   /* Interior blocks: store block N while computing block N + 1. */            \
1508  4:                                                                            \
1509   vmovn.u16 g_whole_8, g_whole;                                                \
1510   vshrn.u32 u_whole_low, u_block, #16;                                         \
1511   /* vst2 interleaves the u and v bytes; every store steps by c_32. */         \
1512   vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32;                 \
1513   vshrn.u32 v_whole_low, v_block, #16;                                         \
1514                                                                                \
1515   vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32;                \
1516   vshrn.u32 r_whole_low, r_block, #16;                                         \
1517                                                                                \
1518   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1519   vshrn.u32 g_whole_low, g_block, #16;                                         \
1520                                                                                \
1521   vdup.u32 dx4, uv_dx4[0];                                                     \
1522   vshrn.u32 b_whole_low, b_block, #16;                                         \
1523                                                                                \
1524   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1525   vdup.u32 dx4, uv_dx4[1];                                                     \
1526                                                                                \
1527   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1528   vdup.u32 dx4, rg_dx4[0];                                                     \
1529                                                                                \
1530   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1531   vdup.u32 dx4, rg_dx4[1];                                                     \
1532                                                                                \
1533   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1534   vdup.u32 dx4, b_dx4;                                                         \
1535                                                                                \
1536   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1537   vdup.u32 dx8, uv_dx8[0];                                                     \
1538                                                                                \
1539   vadd.u32 u_block, u_block, dx8;                                              \
1540   vdup.u32 dx8, uv_dx8[1];                                                     \
1541                                                                                \
1542   vadd.u32 v_block, v_block, dx8;                                              \
1543   vdup.u32 dx8, rg_dx8[0];                                                     \
1544                                                                                \
1545   vadd.u32 r_block, r_block, dx8;                                              \
1546   vdup.u32 dx8, rg_dx8[1];                                                     \
1547                                                                                \
1548   vadd.u32 g_block, g_block, dx8;                                              \
1549   vdup.u32 dx8, b_dx8;                                                         \
1550                                                                                \
1551   vadd.u32 b_block, b_block, dx8;                                              \
1552   vmovn.u16 u_whole_8, u_whole;                                                \
1553   /* fb_ptr moves on by 16 bytes for the next block. */                        \
1554   add fb_ptr, fb_ptr, #16;                                                     \
1555   vmovn.u16 v_whole_8, v_whole;                                                \
1556                                                                                \
1557   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1558   vmovn.u16 b_whole_8, b_whole;                                                \
1559                                                                                \
1560   pld [fb_ptr];                                                                \
1561                                                                                \
1562   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1563   subs span_num_blocks, span_num_blocks, #1;                                   \
1564                                                                                \
1565   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1566   setup_blocks_texture_##swizzling();                                          \
1567                                                                                \
1568   vmovn.u16 r_whole_8, r_whole;                                                \
1569   bne 4b;                                                                      \
1570   /* Last block of the span: right_mask is applied before storing. */          \
1571  5:                                                                            \
1572   vmovn.u16 g_whole_8, g_whole;                                                \
1573   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
1574   /* test_mask is the 128-bit vector at offset 0 of psx_gpu. */                \
1575   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
1576   vdup.u8 draw_mask, right_mask;                                               \
1577   /* draw_mask lane = all ones where its bits overlap test_mask. */            \
1578   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1579   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1580   vzip.u8 u_whole_8, v_whole_8;                                                \
1581   /* vzip gives the vst2 byte order; vbic zeroes u/v where draw_mask set. */   \
1582   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1583   vst1.u32 { r_whole_8, g_whole_8 }, [block_ptr_b, :128], c_32;                \
1584   vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32;                          \
1585   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1586   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1587   /* Step to the next span: uvrg 16 bytes, b 4 bytes, edge data 8 bytes. */    \
1588  1:                                                                            \
1589   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1590   add span_b_offset, span_b_offset, #4;                                        \
1591                                                                                \
1592   add span_edge_data, span_edge_data, #8;                                      \
1593   subs num_spans, num_spans, #1;                                               \
1594   /* strh does not touch the flags, so bne still tests num_spans. */           \
1595   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1596   bne 0b;                                                                      \
1597                                                                                \
1598   restore_abi_regs();                                                          \
1599   ldmia sp!, { r4 - r11, pc };                                                 \
1600   /* Overflow path: flush the block buffer and refill it from its base. */     \
1601  2:                                                                            \
1602   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1603   vpush { texture_mask };                                                      \
1604   vpush { uvrg_dx4 };                                                          \
1605                                                                                \
1606   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */         \
1607   bl flush_render_block_buffer;                                                \
1608   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 };                              \
1609                                                                                \
1610   vpop { uvrg_dx4 };                                                           \
1611   vpop { texture_mask };                                                       \
1612   /* uvrg_dx8 was not saved; rebuild it as uvrg_dx4 + uvrg_dx4. */             \
1613   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1614   vmov.u8 fb_mask_ptrs, #0;                                                    \
1615   /* Restart the count at this span; assumes the flush consumed the rest. */   \
1616   mov num_blocks, span_num_blocks;                                             \
1617   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1618   bal 3b                                                                       \
1619
1620 /* Emit the shaded/textured/dithered routine in both swizzling variants. */
1621 setup_blocks_shaded_textured_builder(swizzled)
1622 setup_blocks_shaded_textured_builder(unswizzled)
1623
1624
1625 #define setup_blocks_unshaded_textured_builder(swizzling)                      \
1626 .align 3;                                                                      \
1627                                                                                \
1628 function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
1629   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
1630   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1631                                                                                \
1632   vld1.u32 { uvrg_dx }, [uvrg_dx_ptr, :128];                                   \
1633   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1634                                                                                \
1635   cmp num_spans, #0;                                                           \
1636   bxeq lr;                                                                     \
1637                                                                                \
1638   stmdb sp!, { r4 - r11, r14 };                                                \
1639   save_abi_regs();                                                             \
1640   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1641                                                                                \
1642   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1643                                                                                \
1644   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [texture_mask_ptr, :16];     \
1645   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1646                                                                                \
1647   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1648   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1649                                                                                \
1650   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1651                                                                                \
1652   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1653                                                                                \
1654  0:                                                                            \
1655   vmov.u8 fb_mask_ptrs, #0;                                                    \
1656                                                                                \
1657   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
1658   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1659                                                                                \
1660   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
1661   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
1662                                                                                \
1663   cmp span_num_blocks, #0;                                                     \
1664   beq 1f;                                                                      \
1665                                                                                \
1666   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
1667   add num_blocks, span_num_blocks, num_blocks;                                 \
1668                                                                                \
1669   cmp num_blocks, #MAX_BLOCKS;                                                 \
1670   bgt 2f;                                                                      \
1671                                                                                \
1672  3:                                                                            \
1673   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1674                                                                                \
1675   vdup.u32 v_left_x, left_x;                                                   \
1676   and y, y, #0x3;                                                              \
1677                                                                                \
1678   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
1679   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1680                                                                                \
1681   and dither_shift, left_x, #0x03;                                             \
1682                                                                                \
1683   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
1684   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1685                                                                                \
1686   mov dither_shift, dither_shift, lsl #3;                                      \
1687   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1688                                                                                \
1689   mov c_32, #32;                                                               \
1690   subs span_num_blocks, span_num_blocks, #1;                                   \
1691                                                                                \
1692   mov dither_row, dither_row, ror dither_shift;                                \
1693                                                                                \
1694   vdup.u32 dither_offsets_short, dither_row;                                   \
1695   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1696                                                                                \
1697   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1698                                                                                \
1699   vdup.u32 u_block, uv[0];                                                     \
1700                                                                                \
1701   vdup.u32 v_block, uv[1];                                                     \
1702   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1703                                                                                \
1704   vadd.u32 u_block, u_block, block_span;                                       \
1705   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
1706                                                                                \
1707   vadd.u32 v_block, v_block, block_span;                                       \
1708   add block_ptr_b, block_ptr_a, #16;                                           \
1709                                                                                \
1710   vshrn.u32 u_whole_low, u_block, #16;                                         \
1711   vshrn.u32 v_whole_low, v_block, #16;                                         \
1712                                                                                \
1713   vdup.u32 dx4, uv_dx4[0];                                                     \
1714                                                                                \
1715   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1716   vdup.u32 dx4, uv_dx4[1];                                                     \
1717                                                                                \
1718   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1719   vdup.u32 dx8, uv_dx8[0];                                                     \
1720                                                                                \
1721   vadd.u32 u_block, u_block, dx8;                                              \
1722   vdup.u32 dx8, uv_dx8[1];                                                     \
1723                                                                                \
1724   vadd.u32 v_block, v_block, dx8;                                              \
1725   vmovn.u16 u_whole_8, u_whole;                                                \
1726                                                                                \
1727   vmovn.u16 v_whole_8, v_whole;                                                \
1728                                                                                \
1729   pld [fb_ptr];                                                                \
1730   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1731                                                                                \
1732   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1733   setup_blocks_texture_##swizzling();                                          \
1734                                                                                \
1735   beq 5f;                                                                      \
1736                                                                                \
1737  4:                                                                            \
1738   vshrn.u32 u_whole_low, u_block, #16;                                         \
1739                                                                                \
1740   vst2.u8 { u_whole_8, v_whole_8 }, [block_ptr_a, :128], c_32;                 \
1741   vshrn.u32 v_whole_low, v_block, #16;                                         \
1742                                                                                \
1743   add block_ptr_b, block_ptr_b, #32;                                           \
1744   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1745                                                                                \
1746   vdup.u32 dx4, uv_dx4[0];                                                     \
1747   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1748   vdup.u32 dx4, uv_dx4[1];                                                     \
1749                                                                                \
1750   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1751   vdup.u32 dx8, uv_dx8[0];                                                     \
1752                                                                                \
1753   vadd.u32 u_block, u_block, dx8;                                              \
1754   vdup.u32 dx8, uv_dx8[1];                                                     \
1755                                                                                \
1756   vadd.u32 v_block, v_block, dx8;                                              \
1757   vmovn.u16 u_whole_8, u_whole;                                                \
1758                                                                                \
1759   add fb_ptr, fb_ptr, #16;                                                     \
1760   vmovn.u16 v_whole_8, v_whole;                                                \
1761                                                                                \
1762   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1763   pld [fb_ptr];                                                                \
1764                                                                                \
1765   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1766   subs span_num_blocks, span_num_blocks, #1;                                   \
1767                                                                                \
1768   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1769   setup_blocks_texture_##swizzling();                                          \
1770                                                                                \
1771   bne 4b;                                                                      \
1772                                                                                \
1773  5:                                                                            \
1774   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
1775                                                                                \
1776   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
1777   vdup.u8 draw_mask, right_mask;                                               \
1778                                                                                \
1779   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1780   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1781   vzip.u8 u_whole_8, v_whole_8;                                                \
1782                                                                                \
1783   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1784   add block_ptr_b, block_ptr_b, #32;                                           \
1785   vst1.u32 { uv_whole_8 }, [block_ptr_a, :128], c_32;                          \
1786   vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32;                      \
1787   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32;             \
1788                                                                                \
1789  1:                                                                            \
1790   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1791   add span_edge_data, span_edge_data, #8;                                      \
1792   subs num_spans, num_spans, #1;                                               \
1793                                                                                \
1794   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
1795   bne 0b;                                                                      \
1796                                                                                \
1797   restore_abi_regs();                                                          \
1798   ldmia sp!, { r4 - r11, pc };                                                 \
1799                                                                                \
1800  2:                                                                            \
1801   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1802   vpush { texture_mask };                                                      \
1803   vpush { uvrg_dx4 };                                                          \
1804                                                                                \
1805   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 }; /* r14=num_blocks */         \
1806   bl flush_render_block_buffer;                                                \
1807   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12 };                              \
1808                                                                                \
1809   vpop { uvrg_dx4 };                                                           \
1810   vpop { texture_mask };                                                       \
1811                                                                                \
1812   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1813   vmov.u8 fb_mask_ptrs, #0;                                                    \
1814                                                                                \
1815   mov num_blocks, span_num_blocks;                                             \
1816   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1817   bal 3b                                                                       \
1818
1819
/*
 * Instantiate the unshaded textured block builder once per texture layout.
 * The argument is token-pasted into setup_blocks_texture_<arg>() inside the
 * builder body.
 */
1820 setup_blocks_unshaded_textured_builder(swizzled)
1821 setup_blocks_unshaded_textured_builder(unswizzled)
1822
1823
1824 .align 3
1825
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_indirect
 *
 * For every span described in the span edge data of psx_gpu, appends one
 * 64-byte block record per 8-pixel block to the block buffer located at
 * psx_gpu + psx_gpu_blocks_offset.  Every block carries the same flat
 * colour derived from triangle_color.
 *
 * In:  psx_gpu = GPU state.  Reads num_spans, num_blocks, triangle_color,
 *      vram_out_ptr, the span edge data array and the 16 bytes at the very
 *      start of the structure (loaded as test_mask).
 * Out: num_blocks in psx_gpu is rewritten after every span.
 *
 * Fields of a block record written here (byte offsets in the record):
 *    0..15  draw mask: all zero for interior blocks, edge mask for the
 *           last block of a span
 *   16..31  colour replicated into each 16-bit lane
 *   32..47  b_whole_8 and fb_mask_ptrs; only fb_mask_ptrs[1] (address of
 *           the block in the framebuffer) is set by this function, the
 *           remaining lanes are stored with whatever they already hold
 *   48..63  not written here
 *
 * If adding a span would push num_blocks above MAX_BLOCKS,
 * flush_render_block_buffer is called first and the buffer is refilled
 * from its start.
 */
1826 function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1827   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
       /* Interior blocks of a span use an all-zero draw mask. */
1828   veor.u32 draw_mask, draw_mask, draw_mask
1829
       /* Early out before anything has been pushed on the stack. */
1830   cmp num_spans, #0
1831   bxeq lr
1832
1833   stmdb sp!, { r4 - r11, r14 }
       /* save_abi_regs() is currently defined empty near the top of the file. */
1834   save_abi_regs()
1835   vld1.u32 { test_mask }, [psx_gpu, :128]
1836
1837   ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
1838
       /* Keep the top 5 bits of each 8-bit channel (bits 3-7, 11-15, 19-23)
        * and pack them as r | g << 5 | b << 10. */
1839   ubfx color_r, color, #3, #5
1840   ubfx color_g, color, #11, #5
1841   ubfx color_b, color, #19, #5
1842
1843   orr color, color_r, color_b, lsl #10
1844   orr color, color, color_g, lsl #5
1845
1846   vdup.u16 colors, color
1847
1848   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
1849   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1850
       /* block_ptr_a = first free record; records are 64 bytes (lsl #6). */
1851   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1852   add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1853
       /* 0: per-span loop.  Each edge data entry is 8 bytes. */
1854  0:
1855   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1856   ldrh y, [span_edge_data, #edge_data_y_offset]
1857
1858   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
1859
       /* Spans with no blocks are skipped entirely. */
1860   cmp span_num_blocks, #0
1861   beq 1f
1862
1863   ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
1864   add num_blocks, span_num_blocks, num_blocks
1865
       /* num_blocks already includes this span; flush first if it no longer
        * fits (signed greater-than compare). */
1866   cmp num_blocks, #MAX_BLOCKS
1867   bgt 2f
1868
       /* 3: emit the blocks of this span (also re-entered after a flush).
        * fb_ptr = vram_out_ptr + y * 2048 + left_x * 2. */
1869  3:
1870   add fb_ptr, fb_ptr, y, lsl #11
       /* NOTE(review): the masked y is not read again before y is reloaded
        * at 0: - this looks like a leftover from the dithered variants,
        * which index the dither table with it. */
1871   and y, y, #0x3
1872
1873   add fb_ptr, fb_ptr, left_x, lsl #1
1874   mov c_32, #32
1875
       /* Z is set when the span has exactly one block.  The add, pld and
        * vmov below do not touch the flags, so beq 5f still tests this subs. */
1876   subs span_num_blocks, span_num_blocks, #1
1877
1878   add block_ptr_b, block_ptr_a, #16
1879   pld [fb_ptr]
1880
1881   vmov.u32 fb_mask_ptrs[1], fb_ptr
1882   beq 5f
1883
       /* 4: interior blocks.  block_ptr_a advances 2 * 32 bytes per block;
        * block_ptr_b advances 32 via the store plus 32 via the add. */
1884  4:
1885   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_32
1886   vst1.u32 { colors }, [block_ptr_b, :128], c_32
1887   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
1888
       /* Next block starts 16 bytes (8 pixels of 2 bytes) further on. */
1889   add fb_ptr, fb_ptr, #16
1890   add block_ptr_b, block_ptr_b, #32
1891
1892   pld [fb_ptr]
1893
1894   vmov.u32 fb_mask_ptrs[1], fb_ptr
1895   subs span_num_blocks, span_num_blocks, #1
1896
1897   bne 4b
1898
       /* 5: last block of the span.  Its draw mask is built by replicating
        * the low byte of right_mask into every byte and testing it against
        * test_mask per 16-bit lane (lane = all ones where any bit matches). */
1899  5:
1900   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset]
1901
1902   vdup.u8 draw_mask_edge, right_mask
1903   vtst.u16 draw_mask_edge, draw_mask_edge, test_mask
1904
1905   vst1.u32 { colors }, [block_ptr_b, :128], c_32
1906   vst1.u32 { draw_mask_edge }, [block_ptr_a, :128], c_32
1907   add block_ptr_b, block_ptr_b, #32
1908   vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32
1909
       /* 1: advance to the next span.  strh does not alter the flags, so
        * bne 0b still tests the subs on num_spans. */
1910  1:
1911   add span_edge_data, span_edge_data, #8
1912   subs num_spans, num_spans, #1
1913
1914   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
1915   bne 0b
1916
1917   restore_abi_regs()
1918   ldmia sp!, { r4 - r11, pc }
1919                                                                            
       /* 2: block buffer would overflow.  colors is preserved across the
        * call with vpush/vpop; test_mask and draw_mask are rebuilt afterwards
        * (presumably because the callee may clobber NEON registers). */
1920  2:
1921   vpush { colors }
1922
1923   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
1924   bl flush_render_block_buffer
1925   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
1926
1927   vpop { colors }
1928
1929   vld1.u32 { test_mask }, [psx_gpu, :128]
1930   veor.u32 draw_mask, draw_mask, draw_mask
1931
       /* The buffer restarts at its base and holds only the current span. */
1932   mov num_blocks, span_num_blocks
1933   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1934   bal 3b
1935
1936
/*
 * Extra register aliases.  r14 is the link register, so mask_msb_scalar may
 * only be written once lr has been saved on the stack.  msb_mask_low (d30)
 * and msb_mask_high (d31) are the two 64-bit halves of msb_mask (q15);
 * pixels_low (d16) is the low half of q8.
 */
1937 #define mask_msb_scalar                                   r14
1938
1939 #define msb_mask                                          q15
1940
1941 #define pixels_low                                        d16
1942
1943 #define msb_mask_low                                      d30
1944 #define msb_mask_high                                     d31
1945
1946
1947 .align 3
1948
/*
 * setup_blocks_unshaded_untextured_undithered_unswizzled_direct
 *
 * Flat-colour span fill that stores pixels straight to the framebuffer;
 * no block records are written and num_blocks is not touched.
 *
 * In:  psx_gpu = GPU state.  Reads num_spans, triangle_color, mask_msb,
 *      vram_out_ptr and the span edge data array.
 *
 * For each span, span_num_blocks - 1 full 8-pixel groups are stored with
 * NEON, then the final group is written according to the low byte of
 * right_mask: zero means all 8 pixels, otherwise 4, 2 and 1 pixel stores
 * are issued for the clear low-order bits.
 */
1949 function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1950   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset]
1951
       /* Early out before anything has been pushed on the stack. */
1952   cmp num_spans, #0
1953   bxeq lr
1954
       /* r14 is saved here, which frees it for mask_msb_scalar below. */
1955   stmdb sp!, { r4 - r11, r14 }
1956
1957   ldr color, [psx_gpu, #psx_gpu_triangle_color_offset]
1958
       /* Keep the top 5 bits of each 8-bit channel and pack them as
        * r | g << 5 | b << 10, then OR in the mask_msb value. */
1959   ubfx color_r, color, #3, #5
1960   ubfx color_g, color, #11, #5
1961
1962   ldrh mask_msb_scalar, [psx_gpu, #psx_gpu_mask_msb_offset]
1963   ubfx color_b, color, #19, #5
1964
1965   orr color, color_r, color_b, lsl #10
1966   orr color, color, color_g, lsl #5
1967   orr color, color, mask_msb_scalar
1968
       /* colors = pixel value in every 16-bit lane (for the NEON stores). */
1969   vdup.u16 colors, color
1970
1971   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
       /* color = same pixel in both halfwords, so one str writes 2 pixels. */
1972   orr color, color, color, lsl #16
1973
1974
       /* 0: per-span loop.  Each edge data entry is 8 bytes. */
1975  0:
1976   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset]
1977   ldrh y, [span_edge_data, #edge_data_y_offset]
1978
1979   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
1980
       /* Spans with no blocks are skipped entirely. */
1981   cmp span_num_blocks, #0
1982   beq 1f
1983
1984   ldrh left_x, [span_edge_data, #edge_data_left_x_offset]
1985
       /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2.  The second add does
        * not set flags, so beq 3f tests the subs (Z = span has one block). */
1986   add fb_ptr, fb_ptr, y, lsl #11
1987   subs span_num_blocks, span_num_blocks, #1
1988
1989   add fb_ptr, fb_ptr, left_x, lsl #1
1990   beq 3f
1991
       /* 2: full groups; the store post-increments fb_ptr by its own size. */
1992  2:
1993   vst1.u32 { colors }, [fb_ptr]!
1994   subs span_num_blocks, span_num_blocks, #1
1995
1996   bne 2b
1997
       /* 3: final group.  Only one byte of right_mask is read (its low 8
        * bits on a little-endian target).  A clear bit means the pixel is
        * written. */
1998  3:
1999   ldrb right_mask, [span_edge_data, #edge_data_right_mask_offset]
2000
       /* Nothing masked: store the whole group at 5:. */
2001   cmp right_mask, #0x0
2002   beq 5f
2003
       /* Low 4 bits clear: write 4 pixels (two word stores) and drop those
        * bits.  moveq has no S suffix, so all three predicated instructions
        * use the flags from this tst.
        * NOTE(review): the 4/2/1 cascade assumes the clear bits form one run
        * starting at bit 0 - confirm against the code producing right_mask. */
2004   tst right_mask, #0xF
2005   streq color, [fb_ptr], #4
2006   moveq right_mask, right_mask, lsr #4
2007   streq color, [fb_ptr], #4
2008
       /* Next 2 bits clear: write 2 pixels and drop those bits. */
2009   tst right_mask, #0x3
2010   streq color, [fb_ptr], #4
2011   moveq right_mask, right_mask, lsr #2
2012
       /* Next bit clear: write a single pixel. */
2013   tst right_mask, #0x1
2014   strheq color, [fb_ptr]
2015
       /* 1: advance to the next span. */
2016  1:
2017   add span_edge_data, span_edge_data, #8
2018   subs num_spans, num_spans, #1
2019   bne 0b
2020
2021   ldmia sp!, { r4 - r11, pc }
2022                                                                            
       /* 5: unmasked final group, stored in one go (no write-back needed
        * because fb_ptr is recomputed for the next span). */
2023  5:
2024   vst1.u32 { colors }, [fb_ptr]
2025   bal 1b
2026
2027
/*
 * Register aliases for the shaded untextured block setup routines.
 * Names that may already be defined are #undef'd before being re-mapped.
 * Several aliases deliberately share storage (qN overlays d(2N) and
 * d(2N+1)); the overlaps are pointed out next to the definitions.
 */
2028 #undef c_64
2029
2030 #define c_64                                              r7
2031 #define rg_dx_ptr                                         r2
2032
2033
2034 #undef r_block
2035 #undef g_block
2036 #undef b_block
2037 #undef r_whole
2038 #undef g_whole
2039 #undef b_whole
2040 #undef r_whole_low
2041 #undef r_whole_high
2042 #undef g_whole_low
2043 #undef g_whole_high
2044 #undef b_whole_low
2045 #undef b_whole_high
2046 #undef r_whole_8
2047 #undef g_whole_8
2048 #undef b_whole_8
2049 #undef dither_offsets
2050 #undef rg_dx4
2051 #undef rg_dx8
2052 #undef dx4
2053 #undef dx8
2054 #undef v_left_x
2055 #undef uvrg
2056 #undef block_span
2057 #undef rg
2058 #undef draw_mask
2059 #undef test_mask
2060
/* Per-channel values for one block, four 32-bit lanes each. */
2061 #define r_block                                           q0
2062 #define g_block                                           q1
2063 #define b_block                                           q2
2064
/* Narrowed per-channel values, eight 16-bit lanes each. */
2065 #define r_whole                                           q3
2066 #define g_whole                                           q4
2067 #define b_whole                                           q5
2068
/* 64-bit halves of r_whole (q3), g_whole (q4) and b_whole (q5). */
2069 #define r_whole_low                                       d6
2070 #define r_whole_high                                      d7
2071 #define g_whole_low                                       d8
2072 #define g_whole_high                                      d9
2073 #define b_whole_low                                       d10
2074 #define b_whole_high                                      d11
2075
/* gb_whole_8 (q6) is the pair g_whole_8 (d12) : b_whole_8 (d13). */
2076 #define gb_whole_8                                        q6
2077
2078 #define g_whole_8                                         d12
2079 #define b_whole_8                                         d13
2080
2081 #define r_whole_8                                         d14
2082
2083 #define pixels                                            q8
2084
/* rg_dx4 and rg_dx8 are the two halves of q9. */
2085 #define rg_dx4                                            d18
2086 #define rg_dx8                                            d19
2087
/* dx4 and dx8 share q10 (test_mask below uses q10 as well). */
2088 #define dx4                                               q10
2089 #define dx8                                               q10
2090
/* Span setup temporaries overlaying the *_whole registers: v_left_x is
 * r_whole_low (d6), uvrg is g_whole (q4), block_span is b_whole (q5). */
2091 #define v_left_x                                          d6
2092 #define uvrg                                              q4
2093 #define block_span                                        q5
2094
/* rg (d9) is the high half of uvrg (q4). */
2095 #define rg                                                d9
2096
/* d64_1 and d64_128 are the two halves of q11. */
2097 #define d64_1                                             d22
2098 #define d64_128                                           d23
2099
2100 #define d128_4                                            q12
2101 #define d128_0x7                                          q13
2102
/* d64_4 (d24) is the low half of d128_4 (q12). */
2103 #define d64_4                                             d24
2104
2105 #define dither_offsets                                    q14
2106 #define draw_mask                                         q15
2107
/* dither_offsets_low (d28) is the low half of dither_offsets (q14). */
2108 #define dither_offsets_low                                d28
2109
/* rg_dx (d0) is the low half of r_block (q0); test_mask shares q10 with
 * dx4/dx8. */
2110 #define rg_dx                                             d0
2111 #define test_mask                                         q10
2112
2113
/*
 * Dither hooks for the shaded untextured builder, selected by pasting the
 * builder argument (dithered / undithered) onto the macro name.
 *
 * dither_a_dithered: unsigned saturating add of the dither offsets to the
 * 8-bit r lanes (low half of the offsets) and to the g/b lanes.
 */
2114 #define setup_blocks_shaded_untextured_dither_a_dithered()                     \
2115   vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low;                           \
2116   vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets;                             \
2117
/*
 * dither_b_dithered: unsigned saturating subtract of d64_4 / d128_4 from the
 * same lanes.  The shaded untextured builder sets every byte of d128_4 to 4
 * and pre-adds it to dither_offsets, so a followed by b applies
 * (offset + 4) - 4 with saturation at both ends.  The last line has no
 * trailing ';' - the invocation site supplies it.
 */
2118 #define setup_blocks_shaded_untextured_dither_b_dithered()                     \
2119   vqsub.u8 r_whole_8, r_whole_8, d64_4;                                        \
2120   vqsub.u8 gb_whole_8, gb_whole_8, d128_4                                      \
2121
/* The undithered variants expand to nothing. */
2122 #define setup_blocks_shaded_untextured_dither_a_undithered()                   \
2123
2124 #define setup_blocks_shaded_untextured_dither_b_undithered()                   \
2125
2126
2127 #define setup_blocks_shaded_untextured_indirect_builder(dithering)             \
2128 .align 3;                                                                      \
2129                                                                                \
2130 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
2131   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
2132   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2133                                                                                \
2134   vld1.u32 { rg_dx }, [rg_dx_ptr, :64];                                        \
2135                                                                                \
2136   cmp num_spans, #0;                                                           \
2137   bxeq lr;                                                                     \
2138                                                                                \
2139   stmdb sp!, { r4 - r11, r14 };                                                \
2140   save_abi_regs();                                                             \
2141   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2142                                                                                \
2143   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
2144   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2145                                                                                \
2146   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2147                                                                                \
2148   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
2149   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2150                                                                                \
2151   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2152   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2153                                                                                \
2154   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
2155   vmov.u8 d64_1, #1;                                                           \
2156                                                                                \
2157   vmov.u8 d128_4, #4;                                                          \
2158   vmov.u8 d64_128, #128;                                                       \
2159                                                                                \
2160   vmov.u8 d128_0x7, #0x7;                                                      \
2161                                                                                \
2162  0:                                                                            \
2163   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
2164   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2165                                                                                \
2166   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
2167   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
2168                                                                                \
2169   cmp span_num_blocks, #0;                                                     \
2170   beq 1f;                                                                      \
2171                                                                                \
2172   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
2173   add num_blocks, span_num_blocks, num_blocks;                                 \
2174                                                                                \
2175   cmp num_blocks, #MAX_BLOCKS;                                                 \
2176   bgt 2f;                                                                      \
2177                                                                                \
2178  3:                                                                            \
2179   ldr b, [span_b_offset];                                                      \
2180   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2181                                                                                \
2182   vdup.u32 v_left_x, left_x;                                                   \
2183   and y, y, #0x3;                                                              \
2184                                                                                \
2185   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
2186   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2187                                                                                \
2188   mla b, b_dx, left_x, b;                                                      \
2189   and dither_shift, left_x, #0x03;                                             \
2190                                                                                \
2191   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
2192   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2193                                                                                \
2194   mov dither_shift, dither_shift, lsl #3;                                      \
2195   vmla.u32 rg, rg_dx, v_left_x;                                                \
2196                                                                                \
2197   mov c_64, #64;                                                               \
2198   subs span_num_blocks, span_num_blocks, #1;                                   \
2199                                                                                \
2200   mov dither_row, dither_row, ror dither_shift;                                \
2201   mov b_dx4, b_dx, lsl #2;                                                     \
2202                                                                                \
2203   vdup.u32 dither_offsets, dither_row;                                         \
2204   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2205                                                                                \
2206   vdup.u32 b_block, b;                                                         \
2207   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2208                                                                                \
2209   mov b_dx8, b_dx, lsl #3;                                                     \
2210   vdup.u32 r_block, rg[0];                                                     \
2211   vdup.u32 g_block, rg[1];                                                     \
2212                                                                                \
2213   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2214                                                                                \
2215   vadd.u32 r_block, r_block, block_span;                                       \
2216   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2217                                                                                \
2218   vadd.u32 g_block, g_block, block_span;                                       \
2219   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
2220                                                                                \
2221   vadd.u32 b_block, b_block, block_span;                                       \
2222   add block_ptr_b, block_ptr_a, #16;                                           \
2223                                                                                \
2224   vshrn.u32 r_whole_low, r_block, #16;                                         \
2225   vshrn.u32 g_whole_low, g_block, #16;                                         \
2226   vshrn.u32 b_whole_low, b_block, #16;                                         \
2227   vdup.u32 dx4, rg_dx4[0];                                                     \
2228                                                                                \
2229   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2230   vdup.u32 dx4, rg_dx4[1];                                                     \
2231                                                                                \
2232   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2233   vdup.u32 dx4, b_dx4;                                                         \
2234                                                                                \
2235   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2236   vdup.u32 dx8, rg_dx8[0];                                                     \
2237                                                                                \
2238   vadd.u32 r_block, r_block, dx8;                                              \
2239   vdup.u32 dx8, rg_dx8[1];                                                     \
2240                                                                                \
2241   vadd.u32 g_block, g_block, dx8;                                              \
2242   vdup.u32 dx8, b_dx8;                                                         \
2243                                                                                \
2244   vadd.u32 b_block, b_block, dx8;                                              \
2245                                                                                \
2246   vmovn.u16 r_whole_8, r_whole;                                                \
2247   vmovn.u16 g_whole_8, g_whole;                                                \
2248   vmovn.u16 b_whole_8, b_whole;                                                \
2249                                                                                \
2250   beq 5f;                                                                      \
2251   veor.u32 draw_mask, draw_mask, draw_mask;                                    \
2252                                                                                \
2253  4:                                                                            \
2254   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2255   vshrn.u32 r_whole_low, r_block, #16;                                         \
2256                                                                                \
2257   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2258   vshrn.u32 g_whole_low, g_block, #16;                                         \
2259                                                                                \
2260   vshrn.u32 b_whole_low, b_block, #16;                                         \
2261   str fb_ptr, [block_ptr_a, #44];                                              \
2262                                                                                \
2263   vdup.u32 dx4, rg_dx4[0];                                                     \
2264   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2265   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2266                                                                                \
2267   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2268   vdup.u32 dx4, rg_dx4[1];                                                     \
2269                                                                                \
2270   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2271   vdup.u32 dx4, b_dx4;                                                         \
2272                                                                                \
2273   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2274   vdup.u32 dx8, rg_dx8[0];                                                     \
2275                                                                                \
2276   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2277   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2278   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2279                                                                                \
2280   vadd.u32 r_block, r_block, dx8;                                              \
2281   vdup.u32 dx8, rg_dx8[1];                                                     \
2282                                                                                \
2283   vadd.u32 g_block, g_block, dx8;                                              \
2284   vdup.u32 dx8, b_dx8;                                                         \
2285                                                                                \
2286   vadd.u32 b_block, b_block, dx8;                                              \
2287   add fb_ptr, fb_ptr, #16;                                                     \
2288                                                                                \
2289   vmovn.u16 r_whole_8, r_whole;                                                \
2290   vmovn.u16 g_whole_8, g_whole;                                                \
2291   vmovn.u16 b_whole_8, b_whole;                                                \
2292                                                                                \
2293   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64;                           \
2294   vst1.u32 { pixels }, [block_ptr_b, :128], c_64;                              \
2295                                                                                \
2296   pld [fb_ptr];                                                                \
2297                                                                                \
2298   subs span_num_blocks, span_num_blocks, #1;                                   \
2299   bne 4b;                                                                      \
2300                                                                                \
2301  5:                                                                            \
2302   str fb_ptr, [block_ptr_a, #44];                                              \
2303   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2304                                                                                \
2305   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
2306   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2307                                                                                \
2308   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2309   vdup.u8 draw_mask, right_mask;                                               \
2310                                                                                \
2311   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2312   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
2313                                                                                \
2314   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
2315                                                                                \
2316   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2317   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2318   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2319                                                                                \
2320   vst1.u32 { draw_mask }, [block_ptr_a, :128], c_64;                           \
2321   vst1.u32 { pixels }, [block_ptr_b, :128], c_64;                              \
2322                                                                                \
2323  1:                                                                            \
2324   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2325   add span_b_offset, span_b_offset, #4;                                        \
2326                                                                                \
2327   add span_edge_data, span_edge_data, #8;                                      \
2328   subs num_spans, num_spans, #1;                                               \
2329                                                                                \
2330   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
2331   bne 0b;                                                                      \
2332                                                                                \
2333   restore_abi_regs();                                                          \
2334   ldmia sp!, { r4 - r11, pc };                                                 \
2335                                                                                \
2336  2:                                                                            \
2337   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
2338   vpush { rg_dx4 };                                                            \
2339                                                                                \
2340   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
2341   bl flush_render_block_buffer;                                                \
2342   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
2343                                                                                \
2344   vpop { rg_dx4 };                                                             \
2345                                                                                \
2346   vmov.u8 d64_1, #1;                                                           \
2347   vmov.u8 d128_4, #4;                                                          \
2348   vmov.u8 d64_128, #128;                                                       \
2349   vmov.u8 d128_0x7, #0x7;                                                      \
2350                                                                                \
2351   vadd.u32 rg_dx8, rg_dx4, rg_dx4;                                             \
2352                                                                                \
2353   mov num_blocks, span_num_blocks;                                             \
2354   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2355   bal 3b                                                                       \
2356
2357 /* Expand the indirect builder once per variant. The argument is token-pasted into the dither_a_ and dither_b_ helper macro names. */
2358 setup_blocks_shaded_untextured_indirect_builder(undithered)
2359 setup_blocks_shaded_untextured_indirect_builder(dithered)
2360
2361
2362 #undef draw_mask
2363 /* r14 is lr. The direct builder that follows pushes r14 before it uses this alias. */
2364 #define mask_msb_ptr                                      r14
2365 /* NEON aliases: draw_mask is rebound to q0, pixels_low and pixels_high name d16 and d17. */
2366 #define draw_mask                                         q0
2367 #define pixels_low                                        d16
2368 #define pixels_high                                       d17
2369 /* Direct shaded untextured span renderer, built once per dithering variant.
2370  * Each span is stepped in 8-pixel blocks whose r, g and b are packed on top of
2371  * the mask_msb value and written through fb_ptr. The dither_a/b helpers live elsewhere. */
2372 #define setup_blocks_shaded_untextured_direct_builder(dithering)               \
2373 .align 3;                                                                      \
2374   /* No spans: return through bxeq before any register is pushed.            */\
2375 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
2376   ldrh num_spans, [psx_gpu, #psx_gpu_num_spans_offset];                        \
2377   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2378                                                                                \
2379   vld1.u32 { rg_dx }, [rg_dx_ptr, :64];                                        \
2380                                                                                \
2381   cmp num_spans, #0;                                                           \
2382   bxeq lr;                                                                     \
2383   /* Save r4-r11 and lr. The epilogue pops the saved lr straight into pc.    */\
2384   stmdb sp!, { r4 - r11, r14 };                                                \
2385   save_abi_regs();                                                             \
2386   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2387   /* rg_dx4 and rg_dx8 are rg_dx shifted left by 2 and 3 (x4 and x8).        */\
2388   ldr b_dx, [psx_gpu, #psx_gpu_b_dx_offset];                                   \
2389   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2390                                                                                \
2391   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2392   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2393   /* Byte constants 1, 4, 128 and 7 feed the pixel packing further down.     */\
2394   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2395   vmov.u8 d64_1, #1;                                                           \
2396                                                                                \
2397   vmov.u8 d128_4, #4;                                                          \
2398   vmov.u8 d64_128, #128;                                                       \
2399   /* The all-lanes vld1 copies one 16-bit mask_msb value to every lane.      */\
2400   vmov.u8 d128_0x7, #0x7;                                                      \
2401   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
2402   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
2403   /* Span loop: one pass per 8-byte edge record, counted by num_spans.       */\
2404  0:                                                                            \
2405   ldrh span_num_blocks, [span_edge_data, #edge_data_num_blocks_offset];        \
2406   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2407                                                                                \
2408   ldrh y, [span_edge_data, #edge_data_y_offset];                               \
2409   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
2410   /* A span with zero blocks skips to the per-span epilogue at label 1.      */\
2411   cmp span_num_blocks, #0;                                                     \
2412   beq 1f;                                                                      \
2413   /* fb_ptr = vram_out_ptr + y * 2048 + left_x * 2, in bytes.                */\
2414   ldrh left_x, [span_edge_data, #edge_data_left_x_offset];                     \
2415   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2416   /* The dither row is the table word selected by the low 2 bits of y.       */\
2417   ldr b, [span_b_offset];                                                      \
2418   vdup.u32 v_left_x, left_x;                                                   \
2419   and y, y, #0x3;                                                              \
2420                                                                                \
2421   ldr dither_row, [dither_offset_ptr, y, lsl #2];                              \
2422   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2423   /* Move b and rg to the span start: value += step * left_x.                */\
2424   mla b, b_dx, left_x, b;                                                      \
2425   and dither_shift, left_x, #0x03;                                             \
2426   /* rg_dx is recomputed from rg_dx4 with a logical shift right by 2.        */\
2427   vld1.u32 { uvrg }, [span_uvrg_offset, :128];                                 \
2428   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2429   /* dither_shift = (left_x & 3) * 8 rotates the row by whole bytes.         */\
2430   mov dither_shift, dither_shift, lsl #3;                                      \
2431   vmla.u32 rg, rg_dx, v_left_x;                                                \
2432   /* The subs below sets Z when this span has exactly one block.             */\
2433   subs span_num_blocks, span_num_blocks, #1;                                   \
2434   /* No later instruction updates the flags before the beq 3f below.         */\
2435   mov dither_row, dither_row, ror dither_shift;                                \
2436   mov b_dx4, b_dx, lsl #2;                                                     \
2437                                                                                \
2438   vdup.u32 dither_offsets, dither_row;                                         \
2439   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2440                                                                                \
2441   vdup.u32 b_block, b;                                                         \
2442   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2443                                                                                \
2444   mov b_dx8, b_dx, lsl #3;                                                     \
2445   vdup.u32 r_block, rg[0];                                                     \
2446   vdup.u32 g_block, rg[1];                                                     \
2447   /* Three consecutive block_span vectors are added to r, g and b in turn.   */\
2448   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2449                                                                                \
2450   vadd.u32 r_block, r_block, block_span;                                       \
2451   vld1.u32 { block_span }, [block_span_ptr, :128]!;                            \
2452                                                                                \
2453   vadd.u32 g_block, g_block, block_span;                                       \
2454   vld1.u32 { block_span }, [block_span_ptr, :128];                             \
2455   /* NOTE(review): block_ptr_b set below has no other visible use here.      */\
2456   vadd.u32 b_block, b_block, block_span;                                       \
2457   add block_ptr_b, block_ptr_a, #16;                                           \
2458   /* First block: take bits 31..16 of each lane, at +0 and at +dx4.          */\
2459   vshrn.u32 r_whole_low, r_block, #16;                                         \
2460   vshrn.u32 g_whole_low, g_block, #16;                                         \
2461   vshrn.u32 b_whole_low, b_block, #16;                                         \
2462   vdup.u32 dx4, rg_dx4[0];                                                     \
2463                                                                                \
2464   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2465   vdup.u32 dx4, rg_dx4[1];                                                     \
2466                                                                                \
2467   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2468   vdup.u32 dx4, b_dx4;                                                         \
2469                                                                                \
2470   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2471   vdup.u32 dx8, rg_dx8[0];                                                     \
2472   /* Step every channel on by dx8 ready for the next block.                  */\
2473   vadd.u32 r_block, r_block, dx8;                                              \
2474   vdup.u32 dx8, rg_dx8[1];                                                     \
2475                                                                                \
2476   vadd.u32 g_block, g_block, dx8;                                              \
2477   vdup.u32 dx8, b_dx8;                                                         \
2478                                                                                \
2479   vadd.u32 b_block, b_block, dx8;                                              \
2480   /* vmovn keeps the low byte of each 16-bit lane as the 8-bit channel.      */\
2481   vmovn.u16 r_whole_8, r_whole;                                                \
2482   vmovn.u16 g_whole_8, g_whole;                                                \
2483   vmovn.u16 b_whole_8, b_whole;                                                \
2484   /* A one-block span goes straight to the partial tail store at 3.          */\
2485   beq 3f;                                                                      \
2486   /* Full-block loop: finish and store one block, start the next one.        */\
2487  2:                                                                            \
2488   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2489   vshrn.u32 r_whole_low, r_block, #16;                                         \
2490                                                                                \
2491   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2492   vshrn.u32 g_whole_low, g_block, #16;                                         \
2493                                                                                \
2494   vshrn.u32 b_whole_low, b_block, #16;                                         \
2495   /* Pack: r >> 3 keeps 5 bits, g and b get their low 3 bits cleared.        */\
2496   vdup.u32 dx4, rg_dx4[0];                                                     \
2497   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2498   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2499                                                                                \
2500   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2501   vdup.u32 dx4, rg_dx4[1];                                                     \
2502                                                                                \
2503   vmov pixels, msb_mask;                                                       \
2504   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2505   vdup.u32 dx4, b_dx4;                                                         \
2506                                                                                \
2507   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2508   vdup.u32 dx8, rg_dx8[0];                                                     \
2509   /* pixels = msb_mask + r * 1 + g * 4 + b * 128 in 16-bit lanes.            */\
2510   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2511   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2512   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2513                                                                                \
2514   vadd.u32 r_block, r_block, dx8;                                              \
2515   vdup.u32 dx8, rg_dx8[1];                                                     \
2516                                                                                \
2517   vadd.u32 g_block, g_block, dx8;                                              \
2518   vdup.u32 dx8, b_dx8;                                                         \
2519                                                                                \
2520   vadd.u32 b_block, b_block, dx8;                                              \
2521                                                                                \
2522   vmovn.u16 r_whole_8, r_whole;                                                \
2523   vmovn.u16 g_whole_8, g_whole;                                                \
2524   vmovn.u16 b_whole_8, b_whole;                                                \
2525   /* Store the finished block and post-increment fb_ptr past it.             */\
2526   vst1.u32 { pixels }, [fb_ptr]!;                                              \
2527   subs span_num_blocks, span_num_blocks, #1;                                   \
2528   bne 2b;                                                                      \
2529   /* Tail: the last block of a span may hold fewer than 8 pixels.            */\
2530  3:                                                                            \
2531   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2532                                                                                \
2533   ldrh right_mask, [span_edge_data, #edge_data_right_mask_offset];             \
2534   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2535   /* rbit then clz yields the count of trailing zero bits in right_mask.     */\
2536   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2537   rbit right_mask, right_mask;                                                 \
2538   vmov pixels, msb_mask;                                                       \
2539   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2540   clz right_mask, right_mask;                                                  \
2541                                                                                \
2542   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2543   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2544   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2545   /* Branch on that count. JT_OP, JT_OP_REL and JTE are defined elsewhere.   */\
2546   JT_OP_REL(100f, right_mask, temp);                                           \
2547   JT_OP(ldr pc, [pc, right_mask, lsl #2]);                                     \
2548   nop;                                                                         \
2549  100:                                                                          \
2550   nop;                                                                         \
2551   .word JTE(100b, 4f);                                                         \
2552   .word JTE(100b, 5f);                                                         \
2553   .word JTE(100b, 6f);                                                         \
2554   .word JTE(100b, 7f);                                                         \
2555   .word JTE(100b, 8f);                                                         \
2556   .word JTE(100b, 9f);                                                         \
2557   .word JTE(100b, 10f);                                                        \
2558   .word JTE(100b, 11f);                                                        \
2559   /* Targets 4 to 11 store 1 to 8 pixels respectively, then exit to 1.       */\
2560  4:                                                                            \
2561   vst1.u16 { pixels_low[0] }, [fb_ptr];                                        \
2562   bal 1f;                                                                      \
2563   /* Assumes pixels_low and pixels_high are the halves of pixels - confirm.  */\
2564  5:                                                                            \
2565   vst1.u32 { pixels_low[0] }, [fb_ptr];                                        \
2566   bal 1f;                                                                      \
2567                                                                                \
2568  6:                                                                            \
2569   vst1.u32 { pixels_low[0] }, [fb_ptr]!;                                       \
2570   vst1.u16 { pixels_low[2] }, [fb_ptr];                                        \
2571   bal 1f;                                                                      \
2572                                                                                \
2573  7:                                                                            \
2574   vst1.u32 { pixels_low }, [fb_ptr];                                           \
2575   bal 1f;                                                                      \
2576                                                                                \
2577  8:                                                                            \
2578   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2579   vst1.u16 { pixels_high[0] }, [fb_ptr];                                       \
2580   bal 1f;                                                                      \
2581                                                                                \
2582  9:                                                                            \
2583   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2584   vst1.u32 { pixels_high[0] }, [fb_ptr]!;                                      \
2585   bal 1f;                                                                      \
2586                                                                                \
2587  10:                                                                           \
2588   vst1.u32 { pixels_low }, [fb_ptr]!;                                          \
2589   vst1.u32 { pixels_high[0] }, [fb_ptr]!;                                      \
2590   vst1.u16 { pixels_high[2] }, [fb_ptr];                                       \
2591   bal 1f;                                                                      \
2592                                                                                \
2593  11:                                                                           \
2594   vst1.u32 { pixels }, [fb_ptr];                                               \
2595   bal 1f;                                                                      \
2596   /* Span epilogue: advance the three span pointers and loop if any remain.  */\
2597  1:                                                                            \
2598   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2599   add span_b_offset, span_b_offset, #4;                                        \
2600                                                                                \
2601   add span_edge_data, span_edge_data, #8;                                      \
2602   subs num_spans, num_spans, #1;                                               \
2603                                                                                \
2604   bne 0b;                                                                      \
2605   /* Pop the saved registers. Loading pc returns to the caller.              */\
2606   restore_abi_regs();                                                          \
2607   ldmia sp!, { r4 - r11, pc }                                                  \
2608
2609 setup_blocks_shaded_untextured_direct_builder(undithered)  /* emits setup_blocks_shaded_untextured_undithered_unswizzled_direct */
2610 setup_blocks_shaded_untextured_direct_builder(dithered)  /* emits setup_blocks_shaded_untextured_dithered_unswizzled_direct */
2611
2612
2613 #undef psx_gpu
2614 #undef num_blocks
2615 #undef triangle
2616 #undef c_64
2617 /* Register aliases for the texture_blocks_* routines that follow. Several names deliberately share one register. */
2618 #define psx_gpu                                  r0
2619 #define block_ptr                                r1
2620 #define num_blocks                               r2
2621 #define uv_01                                    r3
2622 #define uv_23                                    r4
2623 #define uv_45                                    r5
2624 #define uv_67                                    r6
2625 #define uv_0                                     r7
2626 #define uv_1                                     r3
2627 #define uv_2                                     r8
2628 #define uv_3                                     r4
2629 #define uv_4                                     r9
2630 #define uv_5                                     r5
2631 #define uv_6                                     r10
2632 #define uv_7                                     r6
2633 #define texture_ptr                              r11
2634 /* Each pixel_N uses the same register as the matching uv_N above. */
2635 #define pixel_0                                  r7
2636 #define pixel_1                                  r3
2637 #define pixel_2                                  r8
2638 #define pixel_3                                  r4
2639 #define pixel_4                                  r9
2640 #define pixel_5                                  r5
2641 #define pixel_6                                  r10
2642 #define pixel_7                                  r6
2643 /* pixels_a, b, c and d map to r7, r9, r8 and r10, the registers of pixel_0, pixel_4, pixel_2 and pixel_6. */
2644 #define pixels_a                                 r7
2645 #define pixels_b                                 r9
2646 #define pixels_c                                 r8
2647 #define pixels_d                                 r10
2648 /* c_64 shares r0 with psx_gpu. */
2649 #define c_64                                     r0
2650 /* current_texture_mask and dirty_textures_mask overlap uv_45 and uv_67 (r5 and r6). */
2651 #define clut_ptr                                 r12
2652 #define current_texture_mask                     r5
2653 #define dirty_textures_mask                      r6
2654
2655 #define texels                                   d0
2656 /* d2 and d3 are the two halves of q1 (clut_a). d4 and d5 are the two halves of q2 (clut_b). */
2657 #define clut_low_a                               d2
2658 #define clut_low_b                               d3
2659 #define clut_high_a                              d4
2660 #define clut_high_b                              d5
2661
2662 #define clut_a                                   q1
2663 #define clut_b                                   q2
2664 /* d6 and d7 are the two halves of q3. */
2665 #define texels_low                               d6
2666 #define texels_high                              d7
2667
/*
 * texture_blocks_untextured
 *
 * No-op: returns to the caller immediately without touching any register
 * or memory.
 */
2668 .align 3
2669
2670 function(texture_blocks_untextured)
2671   bx lr
2672
2673
/*
 * texture_blocks_4bpp
 *
 * In:  r0 = psx_gpu.
 *
 * Walks num_blocks 64-byte blocks starting at psx_gpu + psx_gpu_blocks_offset.
 * For every block it reads eight 16-bit offsets from the start of the block,
 * fetches one byte at texture_page_ptr + offset for each of them, translates
 * the eight bytes through the CLUT and writes the eight 16-bit results back
 * over the start of the same block.
 *
 * The CLUT (32 bytes at clut_ptr) is loaded once. vuzp.u8 splits it into the
 * even bytes (clut_a) and the odd bytes (clut_b) so that each half can be
 * used as a 16-entry vtbl table; vst2.u8 interleaves the two lookup results
 * again when storing. vtbl returns 0 for any index above 15, so the fetched
 * bytes are presumably already 4-bit indices - TODO confirm against the
 * texture cache layout.
 *
 * If dirty_textures_4bpp_mask and current_texture_mask have a bit in common,
 * update_texture_4bpp_cache is called before the loop runs.
 *
 * Saves r3-r11 and lr, returns by popping pc. r0, r1, r2 and r12 are
 * clobbered. The count is decremented before it is tested, so a num_blocks
 * of 0 is not handled here.
 *
 * NOTE(review): clut_a/clut_b (q1/q2) are loaded before the call to
 * update_texture_4bpp_cache and used after it; this relies on the callee
 * leaving d2-d5 untouched - confirm.
 */
2674 .align 3
2675
2676 function(texture_blocks_4bpp)
2677   stmdb sp!, { r3 - r11, r14 }
2678   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2679
2680   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2681   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2682
       /* Load the whole CLUT; vuzp below separates low and high bytes. */
2683   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2684   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128]
2685
2686   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
2687   vuzp.u8 clut_a, clut_b
2688
2689   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
2690   tst dirty_textures_mask, current_texture_mask
2691
       /* Any common bit: go to 1: and update the cache before looping. */
2692   bne 1f
       /* r0 becomes the block stride; psx_gpu is not needed past this point. */
2693   mov c_64, #64
2694
2695 0:
       /* Four words = eight packed 16-bit texture offsets. */
2696   ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2697
       /*
        * uxtah adds the zero-extended low halfword (or the high halfword
        * when combined with ror #16) to texture_ptr. The byte loads are
        * interleaved with the address computations.
        */
2698   uxtah uv_0, texture_ptr, uv_01
2699   uxtah uv_1, texture_ptr, uv_01, ror #16
2700
2701   uxtah uv_2, texture_ptr, uv_23
2702   uxtah uv_3, texture_ptr, uv_23, ror #16
2703
2704   uxtah uv_4, texture_ptr, uv_45
2705   ldrb pixel_0, [uv_0]
2706
2707   uxtah uv_5, texture_ptr, uv_45, ror #16
2708   ldrb pixel_1, [uv_1]
2709
2710   uxtah uv_6, texture_ptr, uv_67
2711   ldrb pixel_2, [uv_2]
2712
2713   uxtah uv_7, texture_ptr, uv_67, ror #16
2714   ldrb pixel_3, [uv_3]
2715
2716   ldrb pixel_4, [uv_4]
       /*
        * Sets the flags tested by the bne at the bottom of the loop; no
        * instruction in between updates them.
        */
2717   subs num_blocks, num_blocks, #1
2718
       /* Pack texels 0-3 into pixels_a and 4-7 into pixels_b, one byte each. */
2719   ldrb pixel_5, [uv_5]
2720   orr pixels_a, pixel_0, pixel_1, lsl #8
2721
2722   ldrb pixel_6, [uv_6]
2723   orr pixels_b, pixel_4, pixel_5, lsl #8
2724
2725   ldrb pixel_7, [uv_7]
2726   orr pixels_a, pixels_a, pixel_2, lsl #16
2727
2728   orr pixels_b, pixels_b, pixel_6, lsl #16
2729   orr pixels_a, pixels_a, pixel_3, lsl #24
2730
2731   orr pixels_b, pixels_b, pixel_7, lsl #24
2732   vmov texels, pixels_a, pixels_b
2733
       /* One lookup per CLUT byte half, using the eight texels as indices. */
2734   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
2735   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
2736
       /* Interleave low/high bytes into eight halfwords; advance by 64. */
2737   vst2.u8 { texels_low, texels_high }, [block_ptr, :128], c_64
2738   bne 0b
2739
2740   ldmia sp!, { r3 - r11, pc }
2741
2742 1:
       /*
        * Reached only from the bne above, so r0 still holds psx_gpu (c_64 has
        * not been written yet). block_ptr and num_blocks are kept across the
        * call; c_64 is set afterwards because the normal path was skipped.
        */
2743   stmdb sp!, { r1 - r2 }  
2744   bl update_texture_4bpp_cache
2745
2746   mov c_64, #64
2747   ldmia sp!, { r1 - r2 }
2748   bal 0b
2749
2750
/*
 * texture_blocks_8bpp
 *
 * In:  r0 = psx_gpu.
 *
 * Walks num_blocks 64-byte blocks starting at psx_gpu + psx_gpu_blocks_offset.
 * For every block it reads eight 16-bit offsets from the start of the block,
 * fetches one byte at texture_page_ptr + offset for each of them, uses that
 * byte (doubled) as a byte offset into the table at clut_ptr to load a 16-bit
 * value, and writes the eight 16-bit results back over the start of the
 * block.
 *
 * If dirty_textures_8bpp_mask and current_texture_mask have a bit in common,
 * update_texture_8bpp_cache is called before the loop runs.
 *
 * Saves r3-r11 and lr, returns by popping pc. r1, r2 and r12 are clobbered;
 * r0 is not written by this routine itself. The count is decremented before
 * it is tested, so a num_blocks of 0 is not handled here.
 *
 * NOTE(review): the slow path pushes r1, r2, r12 plus whatever
 * EXTRA_UNSAVED_REGS expands to (defined outside this section) on top of the
 * ten words pushed at entry. Whether sp is 8-byte aligned at the bl therefore
 * depends on that macro - confirm.
 */
2751 .align 3
2752
2753 function(texture_blocks_8bpp)
2754   stmdb sp!, { r3 - r11, r14 }
2755   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2756
2757   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2758   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2759
2760   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]
2761   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
2762
2763   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset]
2764   tst dirty_textures_mask, current_texture_mask
2765
       /* Any common bit: go to 1: and update the cache before looping. */
2766   bne 1f
2767   nop
2768
2769 0:
       /* Four words = eight packed 16-bit texture offsets. */
2770   ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2771
       /*
        * uxtah adds the zero-extended low halfword (or the high halfword
        * when combined with ror #16) to texture_ptr.
        */
2772   uxtah uv_0, texture_ptr, uv_01
2773   uxtah uv_1, texture_ptr, uv_01, ror #16
2774
2775   uxtah uv_2, texture_ptr, uv_23
2776   uxtah uv_3, texture_ptr, uv_23, ror #16
2777
2778   uxtah uv_4, texture_ptr, uv_45
2779   ldrb pixel_0, [uv_0]
2780
2781   uxtah uv_5, texture_ptr, uv_45, ror #16
2782   ldrb pixel_1, [uv_1]
2783
2784   uxtah uv_6, texture_ptr, uv_67
2785   ldrb pixel_2, [uv_2]
2786
2787   uxtah uv_7, texture_ptr, uv_67, ror #16
2788   ldrb pixel_3, [uv_3]
2789
       /* Double every index: the table at clut_ptr holds 16-bit entries. */
2790   ldrb pixel_4, [uv_4]
2791   add pixel_0, pixel_0, pixel_0
2792
2793   ldrb pixel_5, [uv_5]
2794   add pixel_1, pixel_1, pixel_1
2795
2796   ldrb pixel_6, [uv_6]
2797   add pixel_2, pixel_2, pixel_2
2798
2799   ldrb pixel_7, [uv_7]
2800   add pixel_3, pixel_3, pixel_3
2801
       /* Replace each index with its table entry, interleaved with packing. */
2802   ldrh pixel_0, [clut_ptr, pixel_0]
2803   add pixel_4, pixel_4, pixel_4
2804
2805   ldrh pixel_1, [clut_ptr, pixel_1]
2806   add pixel_5, pixel_5, pixel_5
2807
2808   ldrh pixel_2, [clut_ptr, pixel_2]
2809   add pixel_6, pixel_6, pixel_6
2810
2811   ldrh pixel_3, [clut_ptr, pixel_3]
2812   add pixel_7, pixel_7, pixel_7
2813
2814   ldrh pixel_4, [clut_ptr, pixel_4]
2815   orr pixels_a, pixel_0, pixel_1, lsl #16
2816
2817   ldrh pixel_5, [clut_ptr, pixel_5]
2818   orr pixels_c, pixel_2, pixel_3, lsl #16
2819
2820   ldrh pixel_6, [clut_ptr, pixel_6]
       /*
        * Sets the flags tested by the bne at the bottom of the loop; no
        * instruction in between updates them.
        */
2821   subs num_blocks, num_blocks, #1
2822
2823   ldrh pixel_7, [clut_ptr, pixel_7]
2824   orr pixels_b, pixel_4, pixel_5, lsl #16
2825
2826   orr pixels_d, pixel_6, pixel_7, lsl #16
       /*
        * Register order is r7, r8, r9, r10, which holds pixels 0-1, 2-3, 4-5
        * and 6-7 respectively, so the pixels land in memory in order.
        */
2827   stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d } 
2828
2829   add block_ptr, block_ptr, #64
2830   bne 0b
2831
2832   ldmia sp!, { r3 - r11, pc }
2833
2834 1:
       /*
        * r0 still holds psx_gpu here. block_ptr, num_blocks and clut_ptr are
        * kept across the call and the loop is entered afterwards.
        */
2835   stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
2836
2837   bl update_texture_8bpp_cache
2838
2839   ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
2840   bal 0b
2841
2842
/* Release the 4bpp/8bpp aliases that are rebound differently below. */
2843 #undef uv_0
2844 #undef uv_1
2845 #undef uv_2
2846 #undef uv_3
2847 #undef uv_4
2848 #undef uv_5
2849 #undef uv_6
2850 #undef uv_7
2851
2852 #undef pixel_0
2853 #undef pixel_1
2854 #undef pixel_2
2855 #undef pixel_3
2856 #undef pixel_4
2857 #undef pixel_5
2858 #undef pixel_6
2859 #undef pixel_7
2860
2861 #undef texture_ptr
2862
2863 #undef pixels_a
2864 #undef pixels_b
2865 #undef pixels_c
2866 #undef pixels_d
2867
/*
 * Register aliases used by texture_blocks_16bpp.
 *
 * uv_N and u_N share a register. v_N is a temporary that borrows the
 * register which will hold uv_(N+2) a little later (v_0 is r5 = uv_2, and
 * so on); v_6 uses r11 and v_7 uses r0, so psx_gpu is overwritten once v_7
 * is computed. pixel_N again shares its register with uv_N, and
 * pixels_a..pixels_d are the even pixel registers r3/r5/r7/r9.
 */
2868 #define psx_gpu                                  r0
2869 #define block_ptr                                r1
2870 #define num_blocks                               r2
2871
2872 #define uv_0                                     r3
2873 #define uv_1                                     r4
2874 #define u_0                                      r3
2875 #define u_1                                      r4
2876 #define v_0                                      r5
2877 #define v_1                                      r6
2878
2879 #define uv_2                                     r5
2880 #define uv_3                                     r6
2881 #define u_2                                      r5
2882 #define u_3                                      r6
2883 #define v_2                                      r7
2884 #define v_3                                      r8
2885
2886 #define uv_4                                     r7
2887 #define uv_5                                     r8
2888 #define u_4                                      r7
2889 #define u_5                                      r8
2890 #define v_4                                      r9
2891 #define v_5                                      r10
2892
2893 #define uv_6                                     r9
2894 #define uv_7                                     r10
2895 #define u_6                                      r9
2896 #define u_7                                      r10
2897 #define v_6                                      r11
2898 #define v_7                                      r0
2899
2900 #define pixel_0                                  r3
2901 #define pixel_1                                  r4
2902 #define pixel_2                                  r5
2903 #define pixel_3                                  r6
2904 #define pixel_4                                  r7
2905 #define pixel_5                                  r8
2906 #define pixel_6                                  r9
2907 #define pixel_7                                  r10
2908
2909 #define pixels_a                                 r3
2910 #define pixels_b                                 r5
2911 #define pixels_c                                 r7
2912 #define pixels_d                                 r9
2913
2914 #define texture_ptr                              r12
2915
2916
/*
 * texture_blocks_16bpp
 *
 * In:  r0 = psx_gpu.
 *
 * Walks num_blocks 64-byte blocks starting at psx_gpu + psx_gpu_blocks_offset.
 * For every block it reads eight 16-bit uv values from the start of the
 * block, converts each one into a byte offset, loads the 16-bit pixel at
 * texture_page_ptr + offset and writes the eight pixels back over the start
 * of the block.
 *
 * Offset computation per uv value:
 *   u = uv & 0xFF, v = uv & 0xFF00 (left in place, i.e. row << 8)
 *   offset = (u + (v << 2)) * 2 = 2 * u + 2048 * (uv >> 8)
 *
 * Unlike the 4bpp/8bpp routines there is no dirty-texture check here.
 *
 * Saves r3-r11 and lr, returns by popping pc. r0 (reused as v_7), r1, r2 and
 * r12 are clobbered. The count is decremented before it is tested, so a
 * num_blocks of 0 is not handled here.
 */
2917 .align 3
2918
2919 function(texture_blocks_16bpp)
2920   stmdb sp!, { r3 - r11, r14 }
2921   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2922
2923   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
2924   ldr texture_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
2925
2926 0:
2927   ldrh uv_0, [block_ptr]
       /*
        * Sets the flags tested by the bne at the bottom of the loop; no
        * instruction in between updates them.
        */
2928   subs num_blocks, num_blocks, #1
2929
2930   ldrh uv_1, [block_ptr, #2]
2931
       /* Pixels 0 and 1: split into u and (row << 8), then rebuild. */
2932   and v_0, uv_0, #0xFF00
2933   and v_1, uv_1, #0xFF00
2934
2935   and u_0, uv_0, #0xFF
2936   and u_1, uv_1, #0xFF
2937
2938   add uv_0, u_0, v_0, lsl #2
2939   ldrh uv_2, [block_ptr, #4]
2940
2941   add uv_1, u_1, v_1, lsl #2
2942   ldrh uv_3, [block_ptr, #6]
2943
       /* Double the pixel index to obtain a byte offset. */
2944   add uv_0, uv_0, uv_0
2945   add uv_1, uv_1, uv_1
2946
       /* Pixels 2 and 3. */
2947   and v_2, uv_2, #0xFF00
2948   and v_3, uv_3, #0xFF00
2949
2950   and u_2, uv_2, #0xFF
2951   and u_3, uv_3, #0xFF
2952
2953   add uv_2, u_2, v_2, lsl #2
2954   ldrh uv_4, [block_ptr, #8]
2955
2956   add uv_3, u_3, v_3, lsl #2
2957   ldrh uv_5, [block_ptr, #10]
2958
2959   add uv_2, uv_2, uv_2
2960   add uv_3, uv_3, uv_3
2961
       /* Pixels 4 and 5. */
2962   and v_4, uv_4, #0xFF00
2963   and v_5, uv_5, #0xFF00
2964
2965   and u_4, uv_4, #0xFF
2966   and u_5, uv_5, #0xFF
2967
2968   add uv_4, u_4, v_4, lsl #2
2969   ldrh uv_6, [block_ptr, #12]
2970
2971   add uv_5, u_5, v_5, lsl #2
2972   ldrh uv_7, [block_ptr, #14]
2973
       /*
        * From here on the texture loads for earlier pixels are interleaved
        * with the address computation for pixels 6 and 7. Each load
        * overwrites the offset register it used.
        */
2974   add uv_4, uv_4, uv_4
2975   ldrh pixel_0, [texture_ptr, uv_0]
2976
2977   add uv_5, uv_5, uv_5
2978   ldrh pixel_1, [texture_ptr, uv_1]
2979
2980   and v_6, uv_6, #0xFF00
2981   ldrh pixel_2, [texture_ptr, uv_2]
2982
       /* v_7 is r0: psx_gpu is overwritten here. */
2983   and v_7, uv_7, #0xFF00
2984   ldrh pixel_3, [texture_ptr, uv_3]
2985
2986   and u_6, uv_6, #0xFF
2987   ldrh pixel_4, [texture_ptr, uv_4]
2988
2989   and u_7, uv_7, #0xFF
2990   ldrh pixel_5, [texture_ptr, uv_5]
2991
2992   add uv_6, u_6, v_6, lsl #2
2993   add uv_7, u_7, v_7, lsl #2
2994
2995   add uv_6, uv_6, uv_6
2996   add uv_7, uv_7, uv_7
2997
       /* Pack two 16-bit pixels per word. */
2998   orr pixels_a, pixel_0, pixel_1, lsl #16
2999   orr pixels_b, pixel_2, pixel_3, lsl #16
3000
3001   ldrh pixel_6, [texture_ptr, uv_6]
3002   orr pixels_c, pixel_4, pixel_5, lsl #16
3003
3004   ldrh pixel_7, [texture_ptr, uv_7]
3005   orr pixels_d, pixel_6, pixel_7, lsl #16
3006
       /* Overwrite the eight uv values with the eight pixels; next block. */
3007   stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
3008   add block_ptr, block_ptr, #64
3009
3010   bne 0b
3011
3012   ldmia sp!, { r3 - r11, pc }
3013
3014
/* Release aliases that are rebound for the modulated shading routines. */
3015 #undef num_blocks
3016
3017 #undef test_mask
3018 #undef texels
3019 #undef pixels_b
3020 #undef pixels
3021 #undef d64_1
3022 #undef d64_4
3023 #undef d64_128
3024 #undef draw_mask
3025 #undef msb_mask
3026 #undef msb_mask_low
3027 #undef msb_mask_high
3028 #undef fb_pixels
3029
3030 #undef c_32
3031 #undef fb_ptr
3032 #undef mask_msb_ptr
3033
/*
 * Register aliases used by shade_blocks_textured_modulated_builder and its
 * helper macros.
 *
 * Core registers are heavily shared: color_ptr, colors_scalar, mask_msb_ptr
 * and c_32 are all r2; colors_scalar_compare and block_ptr_store are both
 * r3; block_ptr_load_a is r0, so it replaces psx_gpu. fb_ptr is r14 (lr).
 */
3034 #define psx_gpu                                  r0
3035 #define num_blocks                               r1
3036 #define color_ptr                                r2
3037 #define colors_scalar                            r2
3038 #define colors_scalar_compare                    r3
3039 #define mask_msb_ptr                             r2
3040
3041 #define block_ptr_load_a                         r0
3042 #define block_ptr_store                          r3
3043 #define block_ptr_load_b                         r12
3044 #define c_32                                     r2
3045
3046 #define c_48                                     r4
3047 #define fb_ptr                                   r14
3048 #define draw_mask_bits_scalar                    r5
3049
/*
 * NEON registers. Deliberate overlaps: zero_mask shares q4 with texels,
 * fb_pixels shares q8 with pixels_r, and pixels_gb_low shares q9 with
 * pixels_g.
 */
3050 #define d128_0x07                                q0
3051 #define d128_0x1F                                q1
3052 #define d128_0x8000                              q2
3053 #define test_mask                                q3
3054 #define texels                                   q4
3055 #define colors_rg                                q5
3056 #define colors_b_dm_bits                         q6
3057 #define texels_rg                                q7
3058 #define pixels_r                                 q8
3059 #define pixels_g                                 q9
3060 #define pixels_b                                 q10
3061 #define pixels                                   q11
3062 #define zero_mask                                q4
3063 #define draw_mask                                q12
3064 #define msb_mask                                 q13
3065
3066 #define fb_pixels                                q8
3067
3068 #define pixels_gb_low                            q9
3069
/*
 * D-register views of the Q registers above: d10/d11 form colors_rg (q5),
 * d12/d13 form colors_b_dm_bits (q6), d14/d15 form texels_rg (q7), d16 is
 * the low half of pixels_r (q8), d18/d19 form pixels_gb_low (q9) and
 * d26/d27 form msb_mask (q13).
 */
3070 #define colors_r                                 d10
3071 #define colors_g                                 d11
3072 #define colors_b                                 d12
3073 #define draw_mask_bits                           d13
3074 #define texels_r                                 d14
3075 #define texels_g                                 d15
3076 #define pixels_r_low                             d16
3077 #define pixels_g_low                             d18
3078 #define pixels_b_low                             d19
3079 #define msb_mask_low                             d26
3080 #define msb_mask_high                            d27
3081
3082 #define d64_1                                    d28
3083 #define d64_4                                    d29
3084 #define d64_128                                  d30
3085 #define texels_b                                 d31
3086
/*
 * Helper macros for shade_blocks_textured_modulated_builder. The builder
 * selects one macro of each family by token-pasting its shading
 * (shaded/unshaded), dithering (dithered/undithered) and target
 * (direct/indirect) arguments onto the macro name. Statements inside a
 * macro are separated by ';' because every macro expands to a single line.
 */

/* Indirect target: c_48 = 48 and block_ptr_store = first block. */
3087 #define shade_blocks_textured_modulated_prologue_indirect()                    \
3088   mov c_48, #48;                                                               \
3089   add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset                         \
3090
/*
 * Direct target: load the 16-bit value at psx_gpu + psx_gpu_mask_msb_offset
 * and replicate it into every lane of msb_mask.
 */
3091 #define shade_blocks_textured_modulated_prologue_direct()                      \
3092   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3093   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]            \
3094
3095
3096 #define shade_blocks_textured_modulated_prologue_shaded(dithering, target)     \
3097   
/*
 * (The shaded prologue above is empty.)
 *
 * Undithered only: if the triangle color word equals 0x00808080, branch to
 * shade_blocks_textured_unmodulated_<target> instead of continuing. The
 * dithered variant of this check is empty.
 */
3098 #define shade_blocks_textured_false_modulation_check_undithered(target)        \
3099   ldr colors_scalar, [psx_gpu, #psx_gpu_triangle_color_offset];                \
3100   movw colors_scalar_compare, #0x8080;                                         \
3101                                                                                \
3102   movt colors_scalar_compare, #0x80;                                           \
3103   cmp colors_scalar, colors_scalar_compare;                                    \
3104   beq shade_blocks_textured_unmodulated_##target                               \
3105
3106 #define shade_blocks_textured_false_modulation_check_dithered(target)          \
3107
/*
 * Unshaded prologue: run the check above, then load the 32-bit triangle
 * color into colors_r and replicate its bytes 1, 2 and 0 across colors_g,
 * colors_b and colors_r. colors_r is overwritten last because the other
 * two read from it.
 */
3108 #define shade_blocks_textured_modulated_prologue_unshaded(dithering, target)   \
3109   shade_blocks_textured_false_modulation_check_##dithering(target);            \
3110   add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset;                      \
3111   vld1.u32 { colors_r[] }, [color_ptr, :32];                                   \
3112   vdup.u8 colors_g, colors_r[1];                                               \
3113   vdup.u8 colors_b, colors_r[2];                                               \
3114   vdup.u8 colors_r, colors_r[0]                                                \
3115
3116
/*
 * Dithered: preload target with the 16 bytes at block_ptr_load_b. The
 * "last" form also advances block_ptr_load_b by c_32.
 */
3117 #define shade_blocks_textured_modulated_load_dithered(target)                  \
3118   vld1.u32 { target }, [block_ptr_load_b, :128]                                \
3119
3120 #define shade_blocks_textured_modulated_load_last_dithered(target)             \
3121   vld1.u32 { target }, [block_ptr_load_b, :128], c_32                          \
3122
/*
 * Undithered: nothing is preloaded; the "last" form only advances
 * block_ptr_load_b by 32 so that both variants keep the pointer in step.
 */
3123 #define shade_blocks_textured_modulated_load_undithered(target)                \
3124
3125 #define shade_blocks_textured_modulated_load_last_undithered(target)           \
3126   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3127
/* Dithered: texel * color is accumulated onto the preloaded value. */
3128 #define shade_blocks_textured_modulate_dithered(channel)                       \
3129   vmlal.u8 pixels_##channel, texels_##channel, colors_##channel                \
3130
/* Undithered: plain widening multiply texel * color. */
3131 #define shade_blocks_textured_modulate_undithered(channel)                     \
3132   vmull.u8 pixels_##channel, texels_##channel, colors_##channel                \
3133
3134
/*
 * Indirect: store draw_mask at block_ptr_store and advance the pointer by
 * the 16 bytes written. The offset argument is unused.
 */
3135 #define shade_blocks_textured_modulated_store_draw_mask_indirect(offset)       \
3136   vst1.u32 { draw_mask }, [block_ptr_store, :128]!                             \
3137
/*
 * Direct: fetch the framebuffer pointer from the block (offset is relative
 * to block_ptr_load_b + 64), read the current framebuffer pixels and copy
 * them into every bit position of pixels where draw_mask is set.
 */
3138 #define shade_blocks_textured_modulated_store_draw_mask_direct(offset)         \
3139   ldr fb_ptr, [block_ptr_load_b, #(offset - 64)];                              \
3140   vld1.u32 { fb_pixels }, [fb_ptr];                                            \
3141   vbit.u16 pixels, fb_pixels, draw_mask                                        \
3142
/* Indirect: store pixels after the draw mask, then advance by c_48. */
3143 #define shade_blocks_textured_modulated_store_pixels_indirect()                \
3144   vst1.u32 { pixels }, [block_ptr_store, :128], c_48                           \
3145
/* Direct: store pixels at the framebuffer pointer loaded above. */
3146 #define shade_blocks_textured_modulated_store_pixels_direct()                  \
3147   vst1.u32 { pixels }, [fb_ptr]                                                \
3148
3149
/*
 * Shaded: load the r and g colors from the block and advance by c_32.
 * Unshaded: only advance block_ptr_load_b by the same amount.
 */
3150 #define shade_blocks_textured_modulated_load_rg_shaded()                       \
3151   vld1.u32 { colors_r, colors_g }, [block_ptr_load_b, :128], c_32              \
3152
3153 #define shade_blocks_textured_modulated_load_rg_unshaded()                     \
3154   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3155
/*
 * Shaded: load colors_b and draw_mask_bits with one 16-byte load.
 * Unshaded: read the 32-bit word at block_ptr_load_a + 8 into a core
 * register. Both advance block_ptr_load_a by 32.
 */
3156 #define shade_blocks_textured_modulated_load_bdm_shaded()                      \
3157   vld1.u32 { colors_b, draw_mask_bits }, [block_ptr_load_a, :128], c_32        \
3158
3159 #define shade_blocks_textured_modulated_load_bdm_unshaded()                    \
3160   ldr draw_mask_bits_scalar, [block_ptr_load_a, #8];                           \
3161   add block_ptr_load_a, block_ptr_load_a, #32                                  \
3162
/* Replicate the low 16 bits of the draw mask word into every lane. */
3163 #define shade_blocks_textured_modulated_expand_draw_mask_shaded()              \
3164   vdup.u16 draw_mask, draw_mask_bits[0]                                        \
3165
3166 #define shade_blocks_textured_modulated_expand_draw_mask_unshaded()            \
3167   vdup.u16 draw_mask, draw_mask_bits_scalar                                    \
3168
3169
/* msb_mask is ORed into the pixels for the direct target only. */
3170 #define shade_blocks_textured_modulated_apply_msb_mask_indirect()              \
3171
3172 #define shade_blocks_textured_modulated_apply_msb_mask_direct()                \
3173   vorr.u16 pixels, pixels, msb_mask                                            \
3174
3175
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Emits the function
 *   shade_blocks_<shading>_textured_modulated_<dithering>_<target>
 * where
 *   shading   = shaded   : r/g/b colors are loaded from every block
 *             = unshaded : one triangle color is loaded once in the prologue
 *   dithering = dithered   : texel * color is accumulated (vmlal) onto 16
 *                            bytes preloaded from the block
 *             = undithered : plain vmull
 *   target    = direct   : merge the result into memory at the pointer
 *                          stored in the block
 *             = indirect : write the draw mask and the pixels back into the
 *                          block buffer
 *
 * In:  r0 = psx_gpu. r4, r5 and lr are saved; the function returns with
 *      bx lr. r0-r3 and r12 are clobbered.
 *
 * Block layout as implied by the pointer arithmetic (64-byte stride):
 *   +0   eight 16-bit texels
 *   +16  r and g colors, eight bytes each    (read when shaded)
 *   +32  b colors, eight bytes               (read when shaded)
 *   +40  draw mask bits
 *   +44  destination pointer                 (read when direct)
 *   +48  16 bytes preloaded into pixels_r/g/b (read when dithered)
 * For the indirect target the draw mask is written to +0 and the pixels to
 * +16 of the same block.
 *
 * Per pixel:
 *   r5/g5/b5 = texel bits 0-4 / 5-9 / 10-14
 *   acc      = c5 * color (+ preload when dithered)
 *   out8     = acc >> 4, taken as signed and saturated to 0..255
 *   r = out8 >> 3;  g, b = out8 with the low three bits cleared
 *   pixel    = (texel & 0x8000) [| msb_mask when direct]
 *              + r + g * 4 + b * 128
 *   draw_mask lane = all ones if (draw_mask_bits & test_mask) != 0 or the
 *                    texel is 0; for the direct target such lanes keep the
 *                    value already in memory.
 * test_mask is the 16 bytes at offset 0 of psx_gpu.
 *
 * The code is software pipelined: the part before label 0 processes the
 * first block up to the final packing step, every loop iteration packs and
 * stores block i while starting block i + 1, and label 1 packs and stores
 * the last block. A single block skips the loop through beq 1f. The count
 * is decremented before it is tested, so a num_blocks of 0 is not handled.
 * The offset passed to store_draw_mask (-4 in the loop, 28 at the end)
 * compensates for how far block_ptr_load_b has already advanced.
 *
 * NOTE(review): save_abi_regs() runs before the unshaded/undithered
 * prologue, which may branch away to
 * shade_blocks_textured_unmodulated_<target>. That is harmless only while
 * save_abi_regs() expands to nothing, as it does with the #if 0 setting at
 * the top of the file; the order would need revisiting if it is enabled.
 * NOTE(review): q4-q7 are written here without being saved for the same
 * reason - confirm that callers do not rely on d8-d15 being preserved.
 */
3176 #define shade_blocks_textured_modulated_builder(shading, dithering, target)    \
3177 .align 3;                                                                      \
3178                                                                                \
3179 function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
3180   save_abi_regs();                                                             \
3181   shade_blocks_textured_modulated_prologue_##shading(dithering, target);       \
3182   stmdb sp!, { r4 - r5, lr };                                                  \
3183   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3184                                                                                \
3185   vld1.u32 { test_mask }, [psx_gpu, :128];                                     \
3186                                                                                \
3187   shade_blocks_textured_modulated_prologue_##target();                         \
3188                                                                                \
3189   add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset;                       \
3190   mov c_32, #32;                                                               \
3191                                                                                \
3192   add block_ptr_load_b, block_ptr_load_a, #16;                                 \
3193   vmov.u8 d64_1, #1;                                                           \
3194   vmov.u8 d64_4, #4;                                                           \
3195   vmov.u8 d64_128, #128;                                                       \
3196                                                                                \
3197   vld1.u32 { texels }, [block_ptr_load_a, :128], c_32;                         \
3198   vmov.u8 d128_0x07, #0x07;                                                    \
3199                                                                                \
3200   shade_blocks_textured_modulated_load_rg_##shading();                         \
3201   vmov.u8 d128_0x1F, #0x1F;                                                    \
3202                                                                                \
3203   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3204   vmov.u16 d128_0x8000, #0x8000;                                               \
3205                                                                                \
3206   vmovn.u16 texels_r, texels;                                                  \
3207   vshrn.u16 texels_g, texels, #5;                                              \
3208                                                                                \
3209   vshrn.u16 texels_b, texels, #7;                                              \
3210   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3211                                                                                \
3212   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3213   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3214                                                                                \
3215   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3216   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3217                                                                                \
3218   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3219   vshr.u8 texels_b, texels_b, #3;                                              \
3220                                                                                \
3221   shade_blocks_textured_modulate_##dithering(r);                               \
3222   shade_blocks_textured_modulate_##dithering(g);                               \
3223   shade_blocks_textured_modulate_##dithering(b);                               \
3224                                                                                \
3225   vand.u16 pixels, texels, d128_0x8000;                                        \
3226   vceq.u16 zero_mask, texels, #0;                                              \
3227                                                                                \
3228   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3229   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3230   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3231                                                                                \
3232   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3233   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3234   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3235   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3236                                                                                \
3237   subs num_blocks, num_blocks, #1;                                             \
3238   beq 1f;                                                                      \
3239                                                                                \
3240  .align 3;                                                                     \
3241                                                                                \
3242  0:                                                                            \
3243   vld1.u32 { texels }, [block_ptr_load_a, :128], c_32;                         \
3244   shade_blocks_textured_modulated_load_rg_##shading();                         \
3245   vshrn.u16 texels_g, texels, #5;                                              \
3246                                                                                \
3247   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3248   vshrn.u16 texels_b, texels, #7;                                              \
3249                                                                                \
3250   pld [block_ptr_load_a];                                                      \
3251   vmovn.u16 texels_r, texels;                                                  \
3252   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3253                                                                                \
3254   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3255   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3256   shade_blocks_textured_modulated_store_draw_mask_##target(-4);                \
3257                                                                                \
3258   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3259   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3260                                                                                \
3261   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3262   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3263                                                                                \
3264   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3265   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3266                                                                                \
3267   shade_blocks_textured_modulated_store_pixels_##target();                     \
3268   vshr.u8 texels_b, texels_b, #3;                                              \
3269                                                                                \
3270   shade_blocks_textured_modulate_##dithering(r);                               \
3271   shade_blocks_textured_modulate_##dithering(g);                               \
3272   shade_blocks_textured_modulate_##dithering(b);                               \
3273                                                                                \
3274   vand.u16 pixels, texels, d128_0x8000;                                        \
3275   vceq.u16 zero_mask, texels, #0;                                              \
3276                                                                                \
3277   subs num_blocks, num_blocks, #1;                                             \
3278                                                                                \
3279   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3280   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3281   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3282                                                                                \
3283   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3284   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3285   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3286   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3287                                                                                \
3288   bne 0b;                                                                      \
3289                                                                                \
3290  1:                                                                            \
3291   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3292   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3293   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3294                                                                                \
3295   shade_blocks_textured_modulated_store_draw_mask_##target(28);                \
3296   shade_blocks_textured_modulated_store_pixels_##target();                     \
3297                                                                                \
3298   ldmia sp!, { r4 - r5, lr };                                                  \
3299   restore_abi_regs();                                                          \
3300   bx lr                                                                        \
3301
3302
/*
 * Instantiate all eight shading x dithering x target combinations, e.g.
 * shade_blocks_shaded_textured_modulated_dithered_direct.
 */
3303 shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3304 shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3305 shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3306 shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3307
3308 shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3309 shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3310 shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3311 shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3312
3313
/* Release aliases that are rebound for the unmodulated shading routines. */
3314 #undef c_64
3315 #undef fb_ptr
3316 #undef color_ptr
3317
3318 #undef color_r
3319 #undef color_g
3320 #undef color_b
3321
3322 #undef test_mask
3323 #undef pixels
3324 #undef draw_mask
3325 #undef zero_mask
3326 #undef fb_pixels
3327 #undef msb_mask
3328 #undef msb_mask_low
3329 #undef msb_mask_high
3330
/*
 * Register aliases used by the shade_blocks_textured_unmodulated_* routines.
 *
 * Shared core registers: block_ptr_load replaces psx_gpu in r0; c_64 and
 * mask_msb_ptr are both r2; color_ptr, draw_mask_store_ptr and fb_ptr are
 * all r3; draw_mask_bits_ptr and draw_mask_ptr are both r12;
 * pixel_store_ptr and fb_ptr_next are both r14 (lr).
 */
3331 #define psx_gpu                                  r0
3332 #define num_blocks                               r1
3333 #define mask_msb_ptr                             r2
3334 #define color_ptr                                r3
3335
3336 #define block_ptr_load                           r0
3337 #define draw_mask_store_ptr                      r3
3338 #define draw_mask_bits_ptr                       r12
3339 #define draw_mask_ptr                            r12
3340 #define pixel_store_ptr                          r14
3341
3342 #define fb_ptr_cmp                               r4
3343
3344 #define fb_ptr                                   r3
3345 #define fb_ptr_next                              r14
3346
3347 #define c_64                                     r2
3348
3349 #define test_mask                                q0
3350 #define pixels                                   q1
3351 #define draw_mask                                q2
3352 #define zero_mask                                q3
3353 #define draw_mask_combined                       q4
3354 #define fb_pixels                                q5
3355 #define fb_pixels_next                           q6
3356 #define msb_mask                                 q7
3357
/* d4/d5 are the halves of draw_mask (q2); d14/d15 those of msb_mask (q7). */
3358 #define draw_mask_low                            d4
3359 #define draw_mask_high                           d5
3360 #define msb_mask_low                             d14
3361 #define msb_mask_high                            d15
3362
/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * In: r0 = psx_gpu.
 *
 * Walks the block array at psx_gpu + psx_gpu_blocks_offset (64 bytes per
 * block, count read from psx_gpu_num_blocks_offset).  For every block it
 *   - loads eight 16-bit values from block offset 0,
 *   - copies them unchanged to block offset 16,
 *   - broadcasts the halfword at block offset 40 to all lanes and tests it
 *     against test_mask (the first 16 bytes of psx_gpu),
 *   - ORs that result with an "equals zero" mask of the loaded values and
 *     stores the combined mask over block offset 0.
 *
 * The loop is software pipelined: the loads for block n+1 are issued before
 * the combined mask of block n is stored.
 * NOTE(review): the first block is processed before the count is tested, so
 * this assumes num_blocks >= 1 - confirm against callers.
 */
3363 .align 3
3364 function(shade_blocks_textured_unmodulated_indirect)
3365   stmdb sp!, { r4, r14 }
3366   save_abi_regs()
3367   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3368
3369   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3370   add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3371
3372   vld1.u32 { test_mask }, [psx_gpu, :128]
3373   add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset
3374
3375   mov c_64, #64
  /* block_ptr_load shares r0 with psx_gpu; psx_gpu is not read after this. */
3376   add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3377
  /* Prologue: fetch and evaluate block 0. */
3378   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3379   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3380    [draw_mask_bits_ptr, :16], c_64
3381   vceq.u16 zero_mask, pixels, #0
3382
3383   vtst.u16 draw_mask, draw_mask, test_mask
3384   vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64
3385
3386   subs num_blocks, num_blocks, #1
3387   beq 1f
3388
  /* Steady state: combine the masks of block n while loading block n+1. */
3389  0:
3390   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3391   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3392
3393   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3394    [draw_mask_bits_ptr, :16], c_64
3395   vceq.u16 zero_mask, pixels, #0
3396
3397   vtst.u16 draw_mask, draw_mask, test_mask
3398   vst1.u32 { pixels }, [pixel_store_ptr, :128], c_64
3399
3400   vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
3401   subs num_blocks, num_blocks, #1
3402
3403   bne 0b
3404
  /* Epilogue: store the combined mask of the last block. */
3405  1:
3406   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3407   vst1.u32 { draw_mask_combined }, [draw_mask_store_ptr, :128], c_64
3408
3409   restore_abi_regs()
3410   ldmia sp!, { r4, pc }
3411
3412
/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In: r0 = psx_gpu.
 *
 * For each queued block (64-byte stride, count read from
 * psx_gpu_num_blocks_offset, assumed >= 1 - TODO confirm against callers)
 * the eight 16-bit pixels at block offset 0 are ORed with the halfword
 * broadcast from psx_gpu_mask_msb_offset and merged into the eight
 * framebuffer pixels addressed by the pointer at block offset 44.  A lane is
 * left untouched when its source pixel is zero, or when the broadcast
 * halfword at block offset 40 has the matching test_mask bit set.
 *
 * The loop is software pipelined: the framebuffer read for block n+1 is
 * normally issued before the write for block n.  When the two 16-byte spans
 * overlap, that order would read stale data, so label 4 stores first and
 * loads second.
 */
3413 .align 3
3414
3415 function(shade_blocks_textured_unmodulated_direct)
3416   stmdb sp!, { r4, r14 }
3417   save_abi_regs()
3418   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3419
3420   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3421   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3422
3423   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
  /* c_64 shares r2 with mask_msb_ptr, which is no longer needed. */
3424   mov c_64, #64
3425
3426   vld1.u32 { test_mask }, [psx_gpu, :128]
3427   add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3428
  /* Prologue: fetch block 0 and its framebuffer pixels. */
3429   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3430    [draw_mask_bits_ptr, :16], c_64
3431   ldr fb_ptr_next, [block_ptr_load, #44]
3432
3433   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3434   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3435   vceq.u16 zero_mask, pixels, #0
3436   vtst.u16 draw_mask, draw_mask, test_mask
3437
3438   subs num_blocks, num_blocks, #1
3439   beq 1f
3440
3441  0:
3442   mov fb_ptr, fb_ptr_next
3443   ldr fb_ptr_next, [block_ptr_load, #44]
3444
  /* zero_mask was taken from the raw pixels, before the msb is forced. */
3445   vorr.u16 pixels, pixels, msb_mask
3446
3447   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3448   vmov fb_pixels, fb_pixels_next
3449
3450   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3451    [draw_mask_bits_ptr, :16], c_64
  /* Take the source pixel wherever the combined mask bit is clear. */
3452   vbif.u16 fb_pixels, pixels, draw_mask_combined
3453
3454   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3455   pld [fb_ptr_next, #64]
3456
  /* Unsigned (next - cur + 14) <= 28  <=>  |next - cur| <= 14 bytes, i.e.
   * the two 16-byte spans overlap (assuming halfword-aligned pointers). */
3457   add fb_ptr_cmp, fb_ptr_cmp, #14
3458   vld1.u32 { pixels }, [block_ptr_load, :128], c_64
3459
3460   cmp fb_ptr_cmp, #28
3461   bls 4f
3462
3463   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3464   vceq.u16 zero_mask, pixels, #0
3465
3466   vst1.u16 { fb_pixels }, [fb_ptr]
3467   vtst.u16 draw_mask, draw_mask, test_mask
3468
3469  3:
3470   subs num_blocks, num_blocks, #1
3471   bne 0b
3472
3473  1:
  /* The final (or only) block never passes through the loop head, so the
   * msb mask has to be applied here as well; otherwise that block would be
   * written without it. */
  vorr.u16 pixels, pixels, msb_mask
3474   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3475   vbif.u16 fb_pixels_next, pixels, draw_mask_combined
3476
3477   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
3478
3479   restore_abi_regs()
3480   ldmia sp!, { r4, pc }
3481
  /* Overlapping spans: write block n before reading block n+1. */
3482  4:
3483   vst1.u16 { fb_pixels }, [fb_ptr]
3484   vceq.u16 zero_mask, pixels, #0
3485
3486   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3487   vtst.u16 draw_mask, draw_mask, test_mask
3488
3489   bal 3b
3490
3491
/* shade_blocks_unshaded_untextured_indirect: returns immediately without
 * reading or writing any state. */
3492 function(shade_blocks_unshaded_untextured_indirect)
3493   bx lr
3494
/*
 * shade_blocks_unshaded_untextured_direct
 *
 * In: r0 = psx_gpu.
 *
 * Loads one vector of eight 16-bit pixels from offset 16 of the first block,
 * ORs in the halfword broadcast from psx_gpu_mask_msb_offset, and writes
 * that same vector to the framebuffer span of every queued block (pointer at
 * block offset 44, 64-byte block stride).  Lanes whose bits are set in the
 * 16-byte draw mask at block offset 0 keep their framebuffer contents.
 *
 * The framebuffer read for block n+1 is normally issued before the write
 * for block n; label 4 reverses that order when the two 16-byte spans
 * overlap.
 * NOTE(review): the first block is fetched before the count is tested, so
 * this assumes num_blocks >= 1 - confirm against callers.
 */
3495 .align 3
3496
3497 function(shade_blocks_unshaded_untextured_direct)
3498   stmdb sp!, { r4, r14 }
3499   save_abi_regs()
3500   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
3501
3502   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
3503   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3504
3505   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]
3506   add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3507
  /* block_ptr_load shares r0 with psx_gpu and points at the fb pointer
   * field of block 0. */
3508   add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
3509   vld1.u16 { pixels }, [color_ptr, :128]
3510
3511   mov c_64, #64
3512   vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64
3513
3514   vorr.u16 pixels, pixels, msb_mask
  /* The flags set here are consumed by the beq below; the intervening
   * loads do not modify them. */
3515   subs num_blocks, num_blocks, #1
3516
3517   ldr fb_ptr_next, [block_ptr_load], #64
3518
3519   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3520   beq 1f
3521
3522  0:
3523   vmov fb_pixels, fb_pixels_next
3524   mov fb_ptr, fb_ptr_next
3525   ldr fb_ptr_next, [block_ptr_load], #64
3526
  /* Take the source pixel wherever the draw mask bit is clear, then fetch
   * the mask of the next block. */
3527   vbif.u16 fb_pixels, pixels, draw_mask
3528   vld1.u16 { draw_mask }, [draw_mask_ptr, :128], c_64
3529
  /* Unsigned (next - cur + 14) <= 28  <=>  |next - cur| <= 14 bytes, i.e.
   * the two 16-byte spans overlap (assuming halfword-aligned pointers). */
3530   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3531   add fb_ptr_cmp, fb_ptr_cmp, #14
3532   cmp fb_ptr_cmp, #28
3533   bls 4f
3534
3535   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3536   vst1.u16 { fb_pixels }, [fb_ptr]
3537
3538  3:
3539   subs num_blocks, num_blocks, #1
3540   bne 0b
3541
  /* Epilogue: merge and store the last block. */
3542  1:
3543   vbif.u16 fb_pixels_next, pixels, draw_mask
3544   vst1.u16 { fb_pixels_next }, [fb_ptr_next]
3545
3546   restore_abi_regs()
3547   ldmia sp!, { r4, pc }
3548
  /* Overlapping spans: write block n before reading block n+1. */
3549  4:
3550   vst1.u16 { fb_pixels }, [fb_ptr]
3551   vld1.u16 { fb_pixels_next }, [fb_ptr_next]
3552   bal 3b
3553
3554
/*
 * Register aliases for the blend_blocks_* builders that follow.  Aliases
 * from the shade_blocks section are released first so they can be rebound.
 */
3555 #undef draw_mask_ptr
3556 #undef c_64
3557 #undef fb_ptr
3558 #undef fb_ptr_next
3559 #undef fb_ptr_cmp
3560
/* Core registers.  draw_mask_ptr shares r0 with psx_gpu, and c_64 shares r2
 * with msb_mask_ptr. */
3561 #define psx_gpu                                  r0
3562 #define num_blocks                               r1
3563 #define msb_mask_ptr                             r2
3564 #define pixel_ptr                                r3
3565 #define draw_mask_ptr                            r0
3566 #define c_64                                     r2
3567 #define fb_ptr                                   r12
3568 #define fb_ptr_next                              r14
3569 #define fb_ptr_cmp                               r4
3570
3571 #undef msb_mask
3572 #undef draw_mask
3573 #undef pixels
3574 #undef fb_pixels
3575 #undef d128_0x8000
3576 #undef msb_mask_low
3577 #undef msb_mask_high
3578 #undef draw_mask_next
3579 #undef pixels_g
3580 #undef blend_pixels
3581 #undef fb_pixels_next
3582
/* NEON quad registers as named by the average-blend builder. */
3583 #define msb_mask                                 q0
3584 #define draw_mask                                q1
3585 #define pixels                                   q2
3586 #define fb_pixels                                q3
3587 #define blend_pixels                             q4
3588 #define pixels_no_msb                            q5
3589 #define blend_mask                               q6
3590 #define fb_pixels_no_msb                         q7
3591 #define d128_0x8000                              q8
3592 #define d128_0x0421                              q9
3593 #define fb_pixels_next                           q10
3594 #define blend_pixels_next                        q11
3595 #define pixels_next                              q12
3596 #define draw_mask_next                           q13
3597 #define write_mask                               q14
3598
/* Further names for q5..q15.  These reuse registers already named above,
 * so a builder must not mix names that map to the same register. */
3599 #define pixels_rb                                q5
3600 #define pixels_mg                                q7
3601 #define pixels_g                                 q7
3602 #define d128_0x7C1F                              q8
3603 #define d128_0x03E0                              q9
3604 #define fb_pixels_rb                             q10
3605 #define fb_pixels_g                              q11
3606 #define fb_pixels_masked                         q11
3607 #define d128_0x83E0                              q15
3608 #define pixels_fourth                            q7
3609 #define d128_0x1C07                              q12
3610 #define d128_0x00E0                              q13
3611 #define d128_0x80E0                              q13
3612
/* d0/d1 are the two halves of q0 (msb_mask), used for the broadcast load. */
3613 #define msb_mask_low                             d0
3614 #define msb_mask_high                            d1
3615
/*
 * Per-variant fragments pasted into blend_blocks_average_builder.
 *
 * Textured variants: blend_mask marks lanes whose source pixel has bit 15
 * set; blended lanes get bit 15 forced on, and lanes with a clear blend_mask
 * take the source pixel unchanged.  The untextured variants expand to
 * nothing.
 */
3616 #define blend_blocks_average_set_blend_mask_textured(source)                   \
3617   vclt.s16 blend_mask, source, #0                                              \
3618
3619 #define blend_blocks_average_set_stp_bit_textured()                            \
3620   vorr.u16 blend_pixels, #0x8000                                               \
3621
3622 #define blend_blocks_average_combine_textured(source)                          \
3623   vbif.u16 blend_pixels, source, blend_mask                                    \
3624   
3625 #define blend_blocks_average_set_blend_mask_untextured(source)                 \
3626
3627 #define blend_blocks_average_set_stp_bit_untextured()                          \
3628
3629 #define blend_blocks_average_combine_untextured(source)                        \
3630
/*
 * Mask evaluation "on": write_mask marks framebuffer lanes whose bit 15 is
 * already set, and it is ORed into the draw mask so those lanes are kept.
 * Mask evaluation "off": the draw mask is used as loaded.
 */
3631 #define blend_blocks_average_mask_set_on()                                     \
3632   vclt.s16 write_mask, fb_pixels_next, #0                                      \
3633
3634 #define blend_blocks_average_mask_copy_on()                                    \
3635   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
3636
3637 #define blend_blocks_average_mask_copy_b_on()                                  \
3638   vorr.u16 draw_mask_next, draw_mask_next, write_mask                          \
3639
3640 #define blend_blocks_average_mask_set_off()                                    \
3641
3642 #define blend_blocks_average_mask_copy_off()                                   \
3643   vmov draw_mask, draw_mask_next                                               \
3644
3645 #define blend_blocks_average_mask_copy_b_off()                                 \
3646
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>.
 * In: r0 = psx_gpu.
 *
 * For each queued block (64-byte stride, count read from
 * psx_gpu_num_blocks_offset, assumed >= 1 - TODO confirm against callers)
 * the eight source pixels at block offset 16 are averaged with the eight
 * framebuffer pixels addressed by the pointer at block offset 44:
 *
 *     blend = (fb_no_msb + (src_no_msb - ((src ^ fb) & 0x0421))) >> 1
 *
 * vhadd performs the add-and-halve.  0x0421 selects the lowest bit of each
 * 5-bit field; removing the differing low bits first makes every field sum
 * even, so halving cannot shift a bit into the neighbouring field.
 * The result is ORed with the broadcast mask_msb halfword and written to the
 * lanes whose draw mask (16 bytes at block offset 0) is clear.
 *
 * The loop is software pipelined: values for block n+1 (the *_next
 * registers) are prepared while block n is finished.  Label 2 handles two
 * consecutive blocks whose 16-byte framebuffer spans overlap by storing
 * block n before loading block n+1.
 *
 * NOTE(review): the vsub result can wrap below zero when the source word is
 * smaller than the subtracted low bits; with the unsigned vhadd that would
 * leave bit 15 set in the blend - verify against the C reference.
 */
3647 #define blend_blocks_average_builder(texturing, mask_evaluate)                 \
3648 .align 3;                                                                      \
3649                                                                                \
3650 function(blend_blocks_##texturing##_average_##mask_evaluate)                   \
3651   stmdb sp!, { r4, r14 };                                                      \
3652   save_abi_regs();                                                             \
3653   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3654   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3655                                                                                \
3656   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3657   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
3658                                                                                \
  /* draw_mask_ptr shares r0 with psx_gpu; psx_gpu is not read after this. */  \
3659   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3660   mov c_64, #64;                                                               \
3661                                                                                \
3662   vmov.u16 d128_0x8000, #0x8000;                                               \
3663   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
  /* pixel_ptr is block + 16, so +28 reaches the pointer at block offset 44. */\
3664   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3665                                                                                \
3666   vmov.u16 d128_0x0421, #0x0400;                                               \
3667   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
3668                                                                                \
3669   vorr.u16 d128_0x0421, #0x0021;                                               \
3670   vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
3671                                                                                \
3672   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3673   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3674   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3675   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3676   blend_blocks_average_mask_set_##mask_evaluate();                             \
3677   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3678                                                                                \
3679   subs num_blocks, num_blocks, #1;                                             \
3680   beq 1f;                                                                      \
3681                                                                                \
3682  0:                                                                            \
3683   mov fb_ptr, fb_ptr_next;                                                     \
3684   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3685                                                                                \
3686   vmov pixels, pixels_next;                                                    \
3687   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
3688                                                                                \
3689   vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
3690                                                                                \
3691   blend_blocks_average_mask_copy_##mask_evaluate();                            \
3692   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
3693                                                                                \
3694   blend_blocks_average_set_blend_mask_##texturing(pixels);                     \
3695   blend_blocks_average_set_stp_bit_##texturing();                              \
3696   vmov fb_pixels, fb_pixels_next;                                              \
3697   blend_blocks_average_combine_##texturing(pixels);                            \
3698                                                                                \
  /* Unsigned (next - cur + 14) <= 28  <=>  spans within 14 bytes: overlap. */ \
3699   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3700   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3701   cmp fb_ptr_cmp, #28;                                                         \
3702   bls 2f;                                                                      \
3703                                                                                \
3704   vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
3705   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3706                                                                                \
3707   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3708   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3709                                                                                \
3710   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3711   vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
3712                                                                                \
3713   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3714   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3715   blend_blocks_average_mask_set_##mask_evaluate();                             \
3716   vst1.u16 { fb_pixels }, [fb_ptr];                                            \
3717                                                                                \
3718  3:                                                                            \
3719   subs num_blocks, num_blocks, #1;                                             \
3720   bne 0b;                                                                      \
3721                                                                                \
  /* Epilogue: finish and store the last block. */                             \
3722  1:                                                                            \
3723   blend_blocks_average_mask_copy_b_##mask_evaluate();                          \
3724   vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
3725                                                                                \
3726   blend_blocks_average_set_blend_mask_##texturing(pixels_next);                \
3727   blend_blocks_average_set_stp_bit_##texturing();                              \
3728   blend_blocks_average_combine_##texturing(pixels_next);                       \
3729                                                                                \
3730   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3731   vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next;                       \
3732   vst1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
3733                                                                                \
3734   restore_abi_regs();                                                          \
3735   ldmia sp!, { r4, pc };                                                       \
3736                                                                                \
  /* Overlapping spans: store block n first, then read block n+1. */           \
3737  2:                                                                            \
3738   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3739   vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
3740   vst1.u16 { fb_pixels }, [fb_ptr];                                            \
3741                                                                                \
3742   vld1.u16 { fb_pixels_next }, [fb_ptr_next];                                  \
  /* Refresh write_mask from the pixels just read, as the non-overlap path */  \
  /* does; without this the next block would reuse the previous mask. */       \
  blend_blocks_average_mask_set_##mask_evaluate();                             \
3743   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3744   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3745   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3746   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3747   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3748                                                                                \
3749   bal 3b                                                                       \
3750
/* Emit the four average-blend routines: textured/untextured, each with mask
 * evaluation off and on. */
3751 blend_blocks_average_builder(textured, off)
3752 blend_blocks_average_builder(untextured, off)
3753 blend_blocks_average_builder(textured, on)
3754 blend_blocks_average_builder(untextured, on)
3755
3756
/*
 * Mask-evaluation fragments pasted into the additive blend builders.
 * "on": write_mask marks framebuffer lanes whose bit 15 is set, and it is
 * ORed into draw_mask so those lanes keep their contents.
 * "off": both fragments expand to nothing.
 */
3757 #define blend_blocks_add_mask_set_on()                                         \
3758   vclt.s16 write_mask, fb_pixels, #0                                           \
3759
3760 #define blend_blocks_add_mask_copy_on()                                        \
3761   vorr.u16 draw_mask, draw_mask, write_mask                                    \
3762
3763 #define blend_blocks_add_mask_set_off()                                        \
3764
3765 #define blend_blocks_add_mask_copy_off()                                       \
3766
3767
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>.
 * In: r0 = psx_gpu.
 *
 * For each queued block (64-byte stride, count read from
 * psx_gpu_num_blocks_offset, assumed >= 1 - TODO confirm against callers)
 * the eight source pixels at block offset 16 are added, with clamping, to
 * the eight framebuffer pixels addressed by the pointer at block offset 44.
 * The framebuffer term is only included in lanes whose source pixel has
 * bit 15 set (blend_mask); other lanes end up with the source pixel alone.
 *
 * The 0x7C1F fields and the 0x03E0 field are summed in separate registers
 * so an overflow can be clamped: vmin.u8 limits the low byte to 0x1F and
 * the high byte to 0x7C, vmin.u16 limits the 0x03E0 field while letting
 * bit 15 of the source (after ORing msb_mask) pass through (0x83E0).
 * Lanes whose draw_mask is set keep the framebuffer value (vbit).
 *
 * The loop is software pipelined; label 2 handles two consecutive blocks
 * whose 16-byte framebuffer spans overlap by storing block n before loading
 * block n+1.
 */
3768 #define blend_blocks_add_textured_builder(mask_evaluate)                       \
3769 .align 3;                                                                      \
3770                                                                                \
3771 function(blend_blocks_textured_add_##mask_evaluate)                            \
3772   stmdb sp!, { r4, r14 };                                                      \
3773   save_abi_regs();                                                             \
3774   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3775   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3776                                                                                \
3777   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3778   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
3779                                                                                \
  /* draw_mask_ptr shares r0 with psx_gpu; psx_gpu is not read after this. */  \
3780   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3781   mov c_64, #64;                                                               \
3782                                                                                \
  /* Build the 0x7C1F, 0x03E0 and 0x83E0 constants in all lanes. */            \
3783   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3784   vmov.u16 d128_0x03E0, #0x0300;                                               \
3785   vmov.u16 d128_0x83E0, #0x8000;                                               \
3786   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3787   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3788   vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0;                              \
3789                                                                                \
3790   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
  /* pixel_ptr is block + 16, so +28 reaches the pointer at block offset 44. */\
3791   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3792   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3793   vclt.s16 blend_mask, pixels, #0;                                             \
3794   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3795   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3796   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3797                                                                                \
3798   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3799   vorr.u16 pixels, pixels, msb_mask;                                           \
3800   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3801   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
3802   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3803   vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
3804   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3805   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
3806   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3807   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
3808                                                                                \
3809   subs num_blocks, num_blocks, #1;                                             \
3810   beq 1f;                                                                      \
3811                                                                                \
3812  0:                                                                            \
3813   mov fb_ptr, fb_ptr_next;                                                     \
3814                                                                                \
3815   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3816                                                                                \
3817   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3818   vclt.s16 blend_mask, pixels, #0;                                             \
3819                                                                                \
3820   vorr.u16 pixels, pixels, msb_mask;                                           \
3821   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3822   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
3823                                                                                \
  /* Computed once; a second identical sub a few lines later was removed. */   \
3824   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3825   pld [fb_ptr_next, #64];                                                      \
3826                                                                                \
  /* Lanes with draw_mask set keep the framebuffer value of block n. */        \
3828   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3829                                                                                \
  /* Unsigned (next - cur + 14) <= 28  <=>  spans within 14 bytes: overlap. */ \
3830   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3831   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3832                                                                                \
3833   cmp fb_ptr_cmp, #28;                                                         \
3834   bls 2f;                                                                      \
3835                                                                                \
3836   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3837   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3838   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3839   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3840   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3841   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3842   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3843                                                                                \
3844  3:                                                                            \
3845   vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
3846   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3847   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
3848   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3849   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
3850                                                                                \
3851   subs num_blocks, num_blocks, #1;                                             \
3852   bne 0b;                                                                      \
3853                                                                                \
  /* Epilogue: recombine, mask and store the last block. */                    \
3854  1:                                                                            \
3855   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3856   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3857   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
3858                                                                                \
3859   restore_abi_regs();                                                          \
3860   ldmia sp!, { r4, pc };                                                       \
3861                                                                                \
  /* Overlapping spans: store block n first, then read block n+1. */           \
3862  2:                                                                            \
3863   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3864   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3865                                                                                \
3866   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3867   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3868   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3869   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3870   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3871   bal 3b                                                                       \
3872
3873
3874 #define blend_blocks_add_untextured_builder(mask_evaluate)                     \
3875 .align 3;                                                                      \
3876                                                                                \
3877 function(blend_blocks_untextured_add_##mask_evaluate)                          \
3878   stmdb sp!, { r4, r14 };                                                      \
3879   save_abi_regs();                                                             \
3880   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3881   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
3882                                                                                \
3883   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3884   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
3885                                                                                \
3886   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3887   mov c_64, #64;                                                               \
3888                                                                                \
3889   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3890   vmov.u16 d128_0x03E0, #0x0300;                                               \
3891   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3892   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3893                                                                                \
3894   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3895   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3896   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3897   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3898   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3899   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3900                                                                                \
3901   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3902   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3903   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3904   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3905   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3906   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3907   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3908   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3909                                                                                \
3910   subs num_blocks, num_blocks, #1;                                             \
3911   beq 1f;                                                                      \
3912                                                                                \
3913  0:                                                                            \
3914   mov fb_ptr, fb_ptr_next;                                                     \
3915                                                                                \
3916   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
3917                                                                                \
3918   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
3919                                                                                \
3920   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3921   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3922   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3923                                                                                \
3924   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3925   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
3926                                                                                \
3927   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3928   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3929   cmp fb_ptr_cmp, #28;                                                         \
3930   bls 2f;                                                                      \
3931                                                                                \
3932   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3933   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3934   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3935   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3936   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3937   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3938                                                                                \
3939  3:                                                                            \
3940   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3941   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3942   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3943   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3944   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3945                                                                                \
3946   subs num_blocks, num_blocks, #1;                                             \
3947   bne 0b;                                                                      \
3948                                                                                \
3949  1:                                                                            \
3950   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3951   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3952   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3953   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
3954                                                                                \
3955   restore_abi_regs();                                                          \
3956   ldmia sp!, { r4, pc };                                                       \
3957                                                                                \
3958  2:                                                                            \
3959   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
3960   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3961                                                                                \
3962   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
3963   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3964   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3965   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3966   bal 3b                                                                       \
3967
3968
/* Expand each additive-blend builder macro once per mask_evaluate variant
 * (off, on). */
3969 blend_blocks_add_textured_builder(off)
3970 blend_blocks_add_textured_builder(on)
3971 blend_blocks_add_untextured_builder(off)
3972 blend_blocks_add_untextured_builder(on)
3973
/* Fragments spliced into blend_blocks_subtract_builder by token pasting: the
 * builder's texturing argument picks the _textured / _untextured suffix and
 * its mask_evaluate argument picks the _on / _off suffix. */

/* textured: blend_mask lanes become all ones where pixels_next has bit 15 set
 * (signed compare against zero). */
3974 #define blend_blocks_subtract_set_blend_mask_textured()                        \
3975   vclt.s16 blend_mask, pixels_next, #0                                         \
3976
/* textured: lanes whose blend_mask is clear take `pixels` instead of the
 * blended value. */
3977 #define blend_blocks_subtract_combine_textured()                               \
3978   vbif.u16 blend_pixels, pixels, blend_mask                                    \
3979
/* textured: set bit 15 in every lane of the blended result. */
3980 #define blend_blocks_subtract_set_stp_textured()                               \
3981   vorr.u16 blend_pixels, #0x8000                                               \
3982
/* textured: pixels = pixels_next with msb_mask OR-ed in; this is the value
 * used for lanes that are not blended. */
3983 #define blend_blocks_subtract_msb_mask_textured()                              \
3984   vorr.u16 pixels, pixels_next, msb_mask                                       \
3985
/* untextured: expands to nothing (no blend mask is computed). */
3986 #define blend_blocks_subtract_set_blend_mask_untextured()                      \
3987
/* untextured: expands to nothing (every lane keeps the blended value). */
3988 #define blend_blocks_subtract_combine_untextured()                             \
3989
/* untextured: OR msb_mask into the blended result. */
3990 #define blend_blocks_subtract_set_stp_untextured()                             \
3991   vorr.u16 blend_pixels, blend_pixels, msb_mask                                \
3992
/* untextured: expands to nothing (`pixels` is not needed). */
3993 #define blend_blocks_subtract_msb_mask_untextured()                            \
3994
3995
/* mask_evaluate on: write_mask lanes become all ones where the pixel just
 * read into fb_pixels has bit 15 set. */
3996 #define blend_blocks_subtract_mask_set_on()                                    \
3997   vclt.s16 write_mask, fb_pixels, #0                                           \
3998
/* mask_evaluate on: draw_mask = the block's draw mask OR write_mask. */
3999 #define blend_blocks_subtract_mask_copy_on()                                   \
4000   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
4001
/* mask_evaluate off: expands to nothing (no write mask is computed). */
4002 #define blend_blocks_subtract_mask_set_off()                                   \
4003
/* mask_evaluate off: draw_mask = the block's draw mask, unchanged. */
4004 #define blend_blocks_subtract_mask_copy_off()                                  \
4005   vmov draw_mask, draw_mask_next                                               \
4006
4007
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits the function blend_blocks_<texturing>_subtract_<mask_evaluate>. It
 * walks num_blocks queued blocks (64-byte stride, starting at
 * psx_gpu + psx_gpu_blocks_offset) and, for each block, writes
 * "destination - source" to the address the block carries, with every colour
 * field saturating at zero.
 *
 * Block fields touched here: +0 draw mask, +16 source pixels, +44 destination
 * pointer (read as [pixel_ptr, #28] while pixel_ptr points at +16).
 *
 * texturing:     textured   - only source lanes with bit 15 set are blended;
 *                             the other lanes are written as the source pixel
 *                             OR msb_mask.
 *                untextured - every lane is blended, then msb_mask is OR-ed in.
 * mask_evaluate: on  - destination pixels whose bit 15 is set are left as is.
 *                off - only the block's own draw mask is applied.
 * In both cases lanes whose draw mask bits are set keep the destination pixel.
 *
 * The loop is software pipelined: the subtraction for block N+1 is computed
 * while the result for block N is still waiting to be stored. When the two
 * destinations lie within 14 bytes of each other, the pending store is done
 * before the next destination is read (label 2), so the read observes it.
 *
 * NOTE(review): num_blocks is decremented before it is first tested, so the
 * caller is presumably required to queue at least one block - confirm.
 * NOTE(review): register aliases (fb_ptr, pixels, ...) are defined outside
 * this section; their widths are not visible here.
 */
4008 #define blend_blocks_subtract_builder(texturing, mask_evaluate)                \
4009 .align 3;                                                                      \
4010                                                                                \
4011 function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
4012   stmdb sp!, { r4, r14 };                                                      \
4013   save_abi_regs();                                                             \
4014   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4015   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4016   /* pixel_ptr = block 0 + 16; msb_mask = one halfword copied to all lanes */  \
4017   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4018   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4019   /* draw_mask_ptr = block 0 + 0; c_64 = byte stride between blocks */         \
4020   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4021   mov c_64, #64;                                                               \
4022   /* per-lane masks: 0x7C1F = bits 0-4 and 10-14, 0x03E0 = bits 5-9 */         \
4023   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4024   vmov.u16 d128_0x03E0, #0x0300;                                               \
4025   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4026   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4027   /* prologue: load block 0 and start its subtraction */                       \
4028   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
4029   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4030   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
4031   blend_blocks_subtract_set_blend_mask_##texturing();                          \
4032   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4033   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4034   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4035                                                                                \
4036   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4037   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4038   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4039   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;  /* one field per byte */    \
4040   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;  /* field spans both bytes */  \
4041   /* one block only: skip the loop and go straight to the final store */       \
4042   subs num_blocks, num_blocks, #1;                                             \
4043   beq 1f;                                                                      \
4044                                                                                \
4045  0:  /* loop: finish block N while starting block N+1 */                       \
4046   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4047   mov fb_ptr, fb_ptr_next;                                                     \
4048   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4049                                                                                \
4050   vld1.u32 { draw_mask_next }, [draw_mask_ptr, :128], c_64;                    \
4051   blend_blocks_subtract_msb_mask_##texturing();                                \
4052                                                                                \
4053   vld1.u32 { pixels_next }, [pixel_ptr, :128], c_64;                           \
4054   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4055   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4056   blend_blocks_subtract_set_stp_##texturing();                                 \
4057   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4058   blend_blocks_subtract_combine_##texturing();                                 \
4059   blend_blocks_subtract_set_blend_mask_##texturing();                          \
4060   vbit.u16 blend_pixels, fb_pixels, draw_mask;  /* masked lanes keep dest */   \
4061   /* taken when (next - current + 14) <= 28 unsigned: within 14 bytes */       \
4062   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4063   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4064   cmp fb_ptr_cmp, #28;                                                         \
4065   bls 2f;                                                                      \
4066   /* fast path: read the next destination before storing block N */            \
4067   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4068   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4069   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4070   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4071   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4072   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4073   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4074                                                                                \
4075  3:                                                                            \
4076   subs num_blocks, num_blocks, #1;                                             \
4077   bne 0b;                                                                      \
4078                                                                                \
4079  1:  /* epilogue: combine and store the last block */                          \
4080   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4081                                                                                \
4082   blend_blocks_subtract_msb_mask_##texturing();                                \
4083   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4084   blend_blocks_subtract_set_stp_##texturing();                                 \
4085   blend_blocks_subtract_combine_##texturing();                                 \
4086   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4087   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4088                                                                                \
4089   restore_abi_regs();                                                          \
4090   ldmia sp!, { r4, pc };                                                       \
4091                                                                                \
4092  2:  /* close destinations: store block N first, then read the next one */     \
4093   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4094   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4095   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4096   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4097   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4098   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4099   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4100   bal 3b                                                                       \
4101
4102
/* Emit the four subtractive-blend functions:
 * blend_blocks_{textured,untextured}_subtract_{off,on}. */
4103 blend_blocks_subtract_builder(textured, off)
4104 blend_blocks_subtract_builder(textured, on)
4105 blend_blocks_subtract_builder(untextured, off)
4106 blend_blocks_subtract_builder(untextured, on)
4107
4108
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits the function blend_blocks_textured_add_fourth_<mask_evaluate>. For
 * each of num_blocks queued blocks (64-byte stride, starting at
 * psx_gpu + psx_gpu_blocks_offset) it adds one quarter of the source colour
 * to the destination pixels and clamps each colour field to its maximum.
 *
 * Block fields touched here: +0 draw mask, +16 source pixels, +44 destination
 * pointer (read as [pixel_ptr, #28] while pixel_ptr points at +16).
 *
 * The quarter is formed by shifting the pixel right by 2 and masking with
 * 0x1C07 / 0x00E0, which keeps the upper three bits of each 5-bit field.
 *
 * Only source lanes with bit 15 set are blended (and get bit 15 set in the
 * result); the other lanes are written as the plain source pixel. msb_mask is
 * OR-ed into every lane, and lanes whose draw_mask bits are set keep the
 * destination pixel.
 *
 * mask_evaluate selects the blend_blocks_add_mask_set_* / _copy_* fragments,
 * which are defined outside this section.
 *
 * The loop is software pipelined; when consecutive destinations lie within
 * 14 bytes of each other the pending store is done before the next
 * destination is read (label 2).
 *
 * NOTE(review): num_blocks is decremented before it is first tested, so the
 * caller is presumably required to queue at least one block - confirm.
 */
4109 #define blend_blocks_add_fourth_textured_builder(mask_evaluate)                \
4110 .align 3;                                                                      \
4111                                                                                \
4112 function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
4113   stmdb sp!, { r4, r14 };                                                      \
4114   save_abi_regs();                                                             \
4115   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4116   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4117   /* pixel_ptr = block 0 + 16; msb_mask = one halfword copied to all lanes */  \
4118   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4119   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4120   /* draw_mask_ptr = block 0 + 0; c_64 = byte stride between blocks */         \
4121   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4122   mov c_64, #64;                                                               \
4123   /* masks: full fields 0x7C1F / 0x03E0, quarter fields 0x1C07 / 0x00E0 */     \
4124   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4125   vmov.u16 d128_0x03E0, #0x0300;                                               \
4126   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4127   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4128   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4129   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4130   vorr.u16 d128_0x1C07, #0x0007;                                               \
4131   /* prologue: load block 0 and start its addition */                          \
4132   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4133   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4134   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4135   vclt.s16 blend_mask, pixels, #0;  /* lanes with bit 15 set */                \
4136   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4137   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4138   vshr.s16 pixels_fourth, pixels, #2;  /* fields / 4 once masked below */      \
4139   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4140                                                                                \
4141   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4142   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4143   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4144   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4145   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4146   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4147   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;  /* per-byte clamp */       \
4148   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4149   /* one block only: skip the loop and go straight to the final store */       \
4150   subs num_blocks, num_blocks, #1;                                             \
4151   beq 1f;                                                                      \
4152                                                                                \
4153  0:  /* loop: finish block N while starting block N+1 */                       \
4154   mov fb_ptr, fb_ptr_next;                                                     \
4155   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4156   /* block N: merge fields, set bit 15, restore the unblended lanes */         \
4157   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4158   vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
4159   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4160   /* block N+1: source pixels, blend mask and quarter-intensity fields */      \
4161   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4162   vclt.s16 blend_mask, pixels, #0;                                             \
4163   vshr.s16 pixels_fourth, pixels, #2;                                          \
4164   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4165   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4166                                                                                \
4167   vbit.u16 blend_pixels, fb_pixels, draw_mask;  /* masked lanes keep dest */   \
4168   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4169   /* taken when (next - current + 14) <= 28 unsigned: within 14 bytes */       \
4170   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4171   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4172   cmp fb_ptr_cmp, #28;                                                         \
4173   bls 2f;                                                                      \
4174   /* fast path: read the next destination before storing block N */            \
4175   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4176   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4177   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4178   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4179   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4180   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4181                                                                                \
4182  3:                                                                            \
4183   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4184   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4185   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4186   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4187   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4188                                                                                \
4189   subs num_blocks, num_blocks, #1;                                             \
4190   bne 0b;                                                                      \
4191                                                                                \
4192  1:  /* epilogue: combine and store the last block */                          \
4193   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4194   vorr.u16 blend_pixels, #0x8000;  /* stp */                                   \
4195   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4196   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4197   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4198   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4199                                                                                \
4200   restore_abi_regs();                                                          \
4201   ldmia sp!, { r4, pc };                                                       \
4202                                                                                \
4203  2:  /* close destinations: store block N first, then read the next one */     \
4204   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4205   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4206                                                                                \
4207   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4208   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4209   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4210   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4211   bal 3b                                                                       \
4212
4213
4214
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits the function blend_blocks_untextured_add_fourth_<mask_evaluate>. For
 * each of num_blocks queued blocks (64-byte stride, starting at
 * psx_gpu + psx_gpu_blocks_offset) it adds one quarter of the source colour
 * to the destination pixels and clamps each colour field to its maximum.
 *
 * Block fields touched here: +0 draw mask, +16 source pixels, +44 destination
 * pointer (read as [pixel_ptr, #28] while pixel_ptr points at +16).
 *
 * The quarter is formed by shifting the pixel right by 2 and masking with
 * 0x1C07 / 0x00E0, which keeps the upper three bits of each 5-bit field.
 *
 * Every lane is blended; msb_mask is OR-ed into the result, and lanes whose
 * draw_mask bits are set keep the destination pixel.
 *
 * mask_evaluate selects the blend_blocks_add_mask_set_* / _copy_* fragments,
 * which are defined outside this section.
 *
 * The loop is software pipelined; when consecutive destinations lie within
 * 14 bytes of each other the pending store is done before the next
 * destination is read (label 2).
 *
 * NOTE(review): num_blocks is decremented before it is first tested, so the
 * caller is presumably required to queue at least one block - confirm.
 */
4215 #define blend_blocks_add_fourth_untextured_builder(mask_evaluate)              \
4216 .align 3;                                                                      \
4217                                                                                \
4218 function(blend_blocks_untextured_add_fourth_##mask_evaluate)                   \
4219   stmdb sp!, { r4, r14 };                                                      \
4220   save_abi_regs();                                                             \
4221   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4222   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
4223   /* pixel_ptr = block 0 + 16; msb_mask = one halfword copied to all lanes */  \
4224   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
4225   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16];           \
4226   /* draw_mask_ptr = block 0 + 0; c_64 = byte stride between blocks */         \
4227   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4228   mov c_64, #64;                                                               \
4229   /* masks: full fields 0x7C1F / 0x03E0, quarter fields 0x1C07 / 0x00E0 */     \
4230   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4231   vmov.u16 d128_0x03E0, #0x0300;                                               \
4232   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4233   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4234   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4235   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4236   vorr.u16 d128_0x1C07, #0x0007;                                               \
4237   /* prologue: load block 0 and start its addition */                          \
4238   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4239   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4240   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4241   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4242   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4243   vshr.s16 pixels_fourth, pixels, #2;  /* fields / 4 once masked below */      \
4244   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4245                                                                                \
4246   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4247   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4248   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4249   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4250   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4251   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4252   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;  /* per-byte clamp */       \
4253   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4254   /* one block only: skip the loop and go straight to the final store */       \
4255   subs num_blocks, num_blocks, #1;                                             \
4256   beq 1f;                                                                      \
4257                                                                                \
4258  0:  /* loop: finish block N while starting block N+1 */                       \
4259   mov fb_ptr, fb_ptr_next;                                                     \
4260   ldr fb_ptr_next, [pixel_ptr, #28];                                           \
4261                                                                                \
4262   vld1.u32 { pixels }, [pixel_ptr, :128], c_64;                                \
4263                                                                                \
4264   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4265   vshr.s16 pixels_fourth, pixels, #2;                                          \
4266   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4267   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4268                                                                                \
4269   vbit.u16 blend_pixels, fb_pixels, draw_mask;  /* masked lanes keep dest */   \
4270   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64;                         \
4271   /* taken when (next - current + 14) <= 28 unsigned: within 14 bytes */       \
4272   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4273   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4274   cmp fb_ptr_cmp, #28;                                                         \
4275   bls 2f;                                                                      \
4276   /* fast path: read the next destination before storing block N */            \
4277   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4278   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4279   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4280   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4281   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4282   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4283                                                                                \
4284  3:                                                                            \
4285   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4286   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4287   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4288   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4289   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4290                                                                                \
4291   subs num_blocks, num_blocks, #1;                                             \
4292   bne 0b;                                                                      \
4293                                                                                \
4294  1:  /* epilogue: combine and store the last block */                          \
4295   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4296   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4297   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4298   vst1.u16 { blend_pixels }, [fb_ptr_next];                                    \
4299                                                                                \
4300   restore_abi_regs();                                                          \
4301   ldmia sp!, { r4, pc };                                                       \
4302                                                                                \
4303  2:  /* close destinations: store block N first, then read the next one */     \
4304   vst1.u16 { blend_pixels }, [fb_ptr];                                         \
4305   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4306                                                                                \
4307   vld1.u16 { fb_pixels }, [fb_ptr_next];                                       \
4308   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4309   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4310   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4311   bal 3b                                                                       \
4312
4313 // Instantiate the add-fourth blend builder macros once for each argument value (off, on).
4314 blend_blocks_add_fourth_textured_builder(off)
4315 blend_blocks_add_fourth_textured_builder(on)
4316 blend_blocks_add_fourth_untextured_builder(off)
4317 blend_blocks_add_fourth_untextured_builder(on)
4318
4319 // TODO: Optimize this more. Need a scene that actually uses it for
4320 // confirmation..
4321 // blend_blocks_textured_unblended_on(psx_gpu): write the pixels of each queued block, OR'd with mask_msb, to its fb_ptr.
4322 .align 3
4323 // Bits stay untouched where draw_mask is set; lanes whose framebuffer pixel has bit 15 set are protected entirely.
4324 function(blend_blocks_textured_unblended_on)         
4325   stmdb sp!, { r4, r14 }
4326   save_abi_regs()
4327   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
4328   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
4329   // pixel data is read from offset 16 of each block, the draw mask from offset 0
4330   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
4331   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [mask_msb_ptr, :16]  // replicate the 16-bit mask_msb to all lanes
4332   // consecutive blocks are 64 bytes apart (c_64 is the post-increment used by the loads)
4333   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
4334   mov c_64, #64
4335   // prime the pipeline with block 0; fb_ptr is the word at pixel_ptr + 28
4336   ldr fb_ptr, [pixel_ptr, #28]
4337   vld1.u16 { fb_pixels }, [fb_ptr]
4338   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
4339   vclt.s16 write_mask, fb_pixels, #0  // all-ones in lanes where the framebuffer pixel has bit 15 set
4340   vld1.u32 { pixels }, [pixel_ptr, :128], c_64
4341   // NOTE(review): a num_blocks of 0 would wrap around here; presumably callers never pass 0
4342   subs num_blocks, num_blocks, #1
4343   beq 1f
4344   // steady state: finish and store the current block, then fetch the next one
4345  0:
4346   vorr.u16 pixels, pixels, msb_mask
4347   vorr.u16 draw_mask, draw_mask, write_mask
4348   vbif.u16 fb_pixels, pixels, draw_mask  // insert pixels bits wherever the combined mask is 0
4349   vst1.u16 { fb_pixels }, [fb_ptr]
4350
4351   ldr fb_ptr, [pixel_ptr, #28]
4352   vld1.u16 { fb_pixels }, [fb_ptr]
4353   vld1.u32 { draw_mask }, [draw_mask_ptr, :128], c_64
4354   vclt.s16 write_mask, fb_pixels, #0
4355   vld1.u32 { pixels }, [pixel_ptr, :128], c_64
4356
4357   subs num_blocks, num_blocks, #1
4358   bne 0b
4359   // last block: same combine and store, nothing further to fetch
4360  1:
4361   vorr.u16 pixels, pixels, msb_mask
4362   vorr.u16 draw_mask, draw_mask, write_mask
4363   vbif.u16 fb_pixels, pixels, draw_mask
4364   vst1.u16 { fb_pixels }, [fb_ptr]
4365
4366   restore_abi_regs()
4367   ldmia sp!, { r4, pc }
4368
4369 // blend_blocks_textured_unblended_off: no-op, returns to the caller immediately.
4370 function(blend_blocks_textured_unblended_off)
4371   bx lr
4372
4373 // warmup: r0 = iteration count, r1 = 128-bit aligned address. Issues r0 NEON loads, stepping r1 by 64 bytes.
4374 function(warmup)
4375   mov r3, #64                         // post-increment applied to r1 after every load
4376   cmp r0, #0
4377   bxeq lr                             // zero iterations requested: return at once
4378   // The loaded values are not used in this routine; presumably the loads only pull data into cache (TODO confirm).
4379  0:
4380   vld1.u32 { u_whole_8, v_whole_8 }, [r1, :128], r3
4381
4382   subs r0, r0, #1
4383   bne 0b
4384
4385   bx lr
4386 // Register aliases for render_block_fill_body; any earlier definitions of these names are dropped first.
4387 #undef vram_ptr
4388 #undef color
4389 #undef width
4390 #undef height
4391 #undef pitch
4392
4393 #define vram_ptr                                          r0
4394 #define color                                             r1
4395 #define width                                             r2
4396 #define height                                            r3
4397 // pitch shares r1 with color: the routine reads color once (vdup) before it writes pitch.
4398 #define pitch                                             r1
4399
4400 #define num_width                                         r12
4401
4402 #undef colors_a
4403 #undef colors_b
4404 // colors_a and colors_b (q0, q1) together form the 32-byte store operand.
4405 #define colors_a                                          q0
4406 #define colors_b                                          q1
4407 // render_block_fill_body(vram_ptr, color, width, height): fill a width x height rectangle of 16-bit pixels with color.
4408 .align 3
4409 // Rows are 2048 bytes apart. NOTE(review): loops end only on an exact zero, so width must be a nonzero multiple of 16 and height nonzero.
4410 function(render_block_fill_body)
4411   vdup.u16 colors_a, color            // color (r1) is consumed here, before r1 is reused as pitch
4412   mov pitch, #2048
4413
4414   vmov colors_b, colors_a
4415   sub pitch, pitch, width, lsl #1     // bytes from the end of one row to the start of the next
4416
4417   mov num_width, width
4418   // label 0 serves both loops: 16 pixels (32 bytes) per store; vram_ptr must be 32-byte aligned (:256)
4419  0:  
4420   vst1.u32 { colors_a, colors_b }, [vram_ptr, :256]!
4421
4422   subs num_width, num_width, #16
4423   bne 0b
4424   // row finished: move to the next row and reload the per-row pixel counter
4425   add vram_ptr, vram_ptr, pitch
4426   mov num_width, width
4427
4428   subs height, height, #1
4429   bne 0b
4430
4431   bx lr
4432  
4433 // Register aliases for the sprite setup code that follows; names that may have older meanings are undefined first.
4434 #undef x
4435 #undef y
4436 #undef width
4437 #undef height
4438 #undef fb_ptr
4439 #undef texture_mask
4440 #undef num_blocks
4441 #undef temp
4442 #undef dirty_textures_mask
4443 #undef clut_ptr
4444 #undef current_texture_mask
4445 // Several names below share one register (e.g. x and texture_offset_base are both r1), so they cannot be live together.
4446 #define psx_gpu                                           r0
4447 #define x                                                 r1
4448 #define y                                                 r2
4449 #define u                                                 r3
4450 #define v                                                 r4
4451 #define width                                             r5
4452 #define height                                            r6
4453 #define offset_u                                          r8
4454 #define offset_v                                          r9
4455 #define offset_u_right                                    r10
4456 #define width_rounded                                     r11
4457 #define height_rounded                                    r12
4458 // Names used by the tile emitting macros.
4459 #define texture_offset_base                               r1
4460 #define tile_width                                        r2
4461 #define tile_height                                       r3
4462 #define num_blocks                                        r4
4463 #define block                                             r5
4464 #define sub_tile_height                                   r6
4465 #define fb_ptr                                            r7
4466 #define texture_mask                                      r8
4467 #define column_data                                       r9
4468 #define texture_offset                                    r10
4469 #define tiles_remaining                                   r11
4470 #define fb_ptr_advance_column                             r12
4471 #define texture_block_ptr                                 r14
4472 // r14 is the link register: texture_block_ptr and temp are scratch uses of it.
4473 #define temp                                              r14
4474
4475 #define texture_page_ptr                                  r3
4476 #define left_block_mask                                   r4
4477 #define right_block_mask                                  r5
4478 #define texture_mask_rev                                  r10
4479 #define control_mask                                      r11
4480 // Names used by the setup_sprite_tiled_initialize_* macros.
4481 #define dirty_textures_mask                               r4
4482 #define clut_ptr                                          r5
4483 #define current_texture_mask                              r6
4484
4485 // NEON aliases. Each q register overlaps two d registers (q1 = d2:d3, q2 = d4:d5, q3 = d6:d7, q5 = d10:d11, q7 = d14:d15).
4486 #undef texels
4487 #undef clut_low_a
4488 #undef clut_low_b
4489 #undef clut_high_a
4490 #undef clut_high_b
4491 #undef clut_a
4492 #undef clut_b
4493 #undef texels_low
4494 #undef texels_high
4495
4496 #define texels                                            d0
4497 #define draw_masks_fb_ptrs                                q1
4498 // The two halves of draw_masks_fb_ptrs (q1).
4499 #define draw_mask_fb_ptr_left                             d2
4500 #define draw_mask_fb_ptr_right                            d3
4501 // Alternative _a/_b naming: left_a/left_b are d2/d3 again, right_a/right_b are the halves of draw_masks_fb_ptrs2 (q5).
4502 #define draw_mask_fb_ptr_left_a                           d2
4503 #define draw_mask_fb_ptr_left_b                           d3
4504 #define draw_mask_fb_ptr_right_a                          d10
4505 #define draw_mask_fb_ptr_right_b                          d11
4506 #define draw_masks_fb_ptrs2                               q5
4507 // d4-d7 are the halves of clut_a (q2) and clut_b (q3); they are the table operands of the vtbl lookups.
4508 #define clut_low_a                                        d4
4509 #define clut_low_b                                        d5
4510 #define clut_high_a                                       d6
4511 #define clut_high_b                                       d7
4512
4513 #define block_masks                                       d8
4514 #define block_masks_shifted                               d9
4515
4516 #define clut_a                                            q2
4517 #define clut_b                                            q3
4518
4519 #define texels_low                                        d12
4520 #define texels_high                                       d13
4521 // texels_wide (q7) is d14:d15.
4522 #define texels_wide_low                                   d14
4523 #define texels_wide_high                                  d15
4524 #define texels_wide                                       q7
4524 #define texels_wide                                       q7
4525 // Calls flush_render_block_buffer while preserving q1-q5, r0-r3, r12 and lr (plus EXTRA_UNSAVED_REGS),
4526 // then points block back at the first entry of the psx_gpu block array. Condition flags are not preserved by this code.
4527 setup_sprite_flush_blocks:
4528   vpush { q1 - q5 }
4529
4530   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
4531   bl flush_render_block_buffer
4532   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
4533
4534   vpop { q1 - q5 }
4535
4536   add block, psx_gpu, #psx_gpu_blocks_offset
4537   bx lr
4538 // Calls update_texture_4bpp_cache with r0-r3 preserved; returns by popping the saved lr straight into pc.
4539 // NOTE(review): pushes five words and no EXTRA_UNSAVED_REGS, unlike the 8bpp wrapper; confirm the callee needs neither.
4540 setup_sprite_update_texture_4bpp_cache:
4541   stmdb sp!, { r0 - r3, r14 }
4542   bl update_texture_4bpp_cache
4543   ldmia sp!, { r0 - r3, pc }
4544
4545 // Calls update_texture_8bpp_cache with r0-r3 and EXTRA_UNSAVED_REGS preserved; returns by popping lr into pc.
4546 setup_sprite_update_texture_8bpp_cache:
4547   stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 }
4548   bl update_texture_8bpp_cache
4549   ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc }
4550
4551 /* 4bpp setup: load the CLUT into byte lookup tables and update the 4bpp texture cache if it is dirty. */
4552 #define setup_sprite_tiled_initialize_4bpp()                                   \
4553   ldr dirty_textures_mask,                                                     \
4554    [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset];                        \
4555   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset];                           \
4556   /* the CLUT read here is 32 bytes: sixteen 16-bit entries */                 \
4557   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset];   \
4558   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128];                               \
4559   /* vuzp.u8: clut_a = low bytes, clut_b = high bytes of the CLUT */           \
4560   tst current_texture_mask, dirty_textures_mask;                               \
4561   vuzp.u8 clut_a, clut_b;                                                      \
4562   /* refresh the cache only if the current texture is marked dirty */          \
4563   blne setup_sprite_update_texture_4bpp_cache                                  \
4564 /* 8bpp setup: no CLUT tables are built here, only the dirty check and cache update. */
4565 #define setup_sprite_tiled_initialize_8bpp()                                   \
4566   ldr dirty_textures_mask,                                                     \
4567    [psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset];                        \
4568   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset];   \
4569   /* same dirty test as the 4bpp variant, 8bpp mask and updater */             \
4570   tst current_texture_mask, dirty_textures_mask;                               \
4571   blne setup_sprite_update_texture_8bpp_cache                                  \
4572
4573 /* Operand fragments: a tile adds sub_tile_height blocks (single) or twice that many (double). */
4574 #define setup_sprite_block_count_single()                                      \
4575   sub_tile_height                                                              \
4576
4577 #define setup_sprite_block_count_double()                                      \
4578   sub_tile_height, lsl #1                                                      \
4579 /* Account for the blocks of the coming tile; if the total would exceed MAX_BLOCKS, flush first. */
4580 #define setup_sprite_tile_add_blocks(type)                                     \
4581   add num_blocks, num_blocks, setup_sprite_block_count_##type();               \
4582   cmp num_blocks, #MAX_BLOCKS;                                                 \
4583   /* on overflow: restart the count at this tile and flush */                  \
4584   movgt num_blocks, setup_sprite_block_count_##type();                         \
4585   blgt setup_sprite_flush_blocks                                               \
4586
4587 /* Full-width 4bpp tile: for each of sub_tile_height rows, queue a left and a right 8-pixel block. edge is unused. */
4588 #define setup_sprite_tile_full_4bpp(edge)                                      \
4589   setup_sprite_tile_add_blocks(double);                                        \
4590   /* one pass of loop 4 = one row = a left and a right block */                \
4591  4:                                                                            \
4592   and texture_block_ptr, texture_offset, texture_mask;                         \
4593   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4594                                                                                \
4595   pld [fb_ptr];                                                                \
4596   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4597   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4598   /* CLUT lookup per byte; vst2 re-interleaves low/high bytes */               \
4599   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4600   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4601                                                                                \
4602   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4603   add texture_block_ptr, texture_offset, #8;                                   \
4604                                                                                \
4605   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4606   add block, block, #40;                                                       \
4607                                                                                \
4608   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4609   add fb_ptr, fb_ptr, #16;                                                     \
4610   /* block + 40: draw mask word followed by the fb_ptr word */                 \
4611   vst1.u32 { draw_mask_fb_ptr_left }, [block, :64];                            \
4612   add block, block, #24;                                                       \
4613                                                                                \
4614   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4615   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4616                                                                                \
4617   pld [fb_ptr];                                                                \
4618   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4619   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4620                                                                                \
4621   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4622   add block, block, #40;                                                       \
4623   /* next row: texture +0x10, framebuffer +2048 bytes overall */               \
4624   add texture_offset, texture_offset, #0x10;                                   \
4625   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4626                                                                                \
4627   vst1.u32 { draw_mask_fb_ptr_right }, [block, :64];                           \
4628   add block, block, #24;                                                       \
4629                                                                                \
4630   subs sub_tile_height, sub_tile_height, #1;                                   \
4631   bne 4b;                                                                      \
4632                                                                                \
4633   add texture_offset, texture_offset, #0xF00;                                  \
4634   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4635
4636 /* Half-width 4bpp tile: queue one 8-pixel block per row for sub_tile_height rows, using the edge (left/right) mask. */
4637 #define setup_sprite_tile_half_4bpp(edge)                                      \
4638   setup_sprite_tile_add_blocks(single);                                        \
4639   /* one pass of loop 4 = one row = a single 8-pixel block */                  \
4640  4:                                                                            \
4641   and texture_block_ptr, texture_offset, texture_mask;                         \
4642   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4643                                                                                \
4644   pld [fb_ptr];                                                                \
4645   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4646   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4647   /* CLUT lookup per byte; vst2 re-interleaves low/high bytes */               \
4648   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4649   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4650                                                                                \
4651   vst2.u8 { texels_low, texels_high }, [block, :128];                          \
4652   add block, block, #40;                                                       \
4653                                                                                \
4654   /* block + 40: draw mask word followed by the fb_ptr word */                 \
4655   vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64];                          \
4656                                                                                \
4657   add block, block, #24;                                                       \
4658   add texture_offset, texture_offset, #0x10;                                   \
4659                                                                                \
4660   add fb_ptr, fb_ptr, #2048;                                                   \
4661   subs sub_tile_height, sub_tile_height, #1;                                   \
4662                                                                                \
4663   bne 4b;                                                                      \
4664                                                                                \
4665   add texture_offset, texture_offset, #0xF00;                                  \
4666   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4667  
4668 /* Full-width 8bpp tile: for each of sub_tile_height rows, queue a left and a right block of 8 raw texel bytes. edge is unused. */
4669 #define setup_sprite_tile_full_8bpp(edge)                                      \
4670   setup_sprite_tile_add_blocks(double);                                        \
4671   add block, block, #16;                                                       \
4672   /* block is biased by +16 in this loop: texels live at +16 */                \
4673  4:                                                                            \
4674   and texture_block_ptr, texture_offset, texture_mask;                         \
4675   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4676                                                                                \
4677   pld [fb_ptr];                                                                \
4678   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4679   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4680   /* 8bpp: the eight texel bytes are stored as loaded, no CLUT */              \
4681   add texture_block_ptr, texture_offset, #8;                                   \
4682   vst1.u32 { texels }, [block, :64];                                           \
4683                                                                                \
4684   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4685   add block, block, #24;                                                       \
4686                                                                                \
4687   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4688                                                                                \
4689   add fb_ptr, fb_ptr, #16;                                                     \
4690   vst1.u32 { draw_mask_fb_ptr_left }, [block, :64];                            \
4691                                                                                \
4692   add block, block, #40;                                                       \
4693   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4694   pld [fb_ptr];                                                                \
4695                                                                                \
4696   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4697   vst1.u32 { texels }, [block, :64];                                           \
4698   add block, block, #24;                                                       \
4699                                                                                \
4700   add texture_offset, texture_offset, #0x10;                                   \
4701   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4702                                                                                \
4703   vst1.u32 { draw_mask_fb_ptr_right }, [block, :64];                           \
4704   add block, block, #40;                                                       \
4705                                                                                \
4706   subs sub_tile_height, sub_tile_height, #1;                                   \
4707   bne 4b;                                                                      \
4708   /* undo the +16 bias so block points at a block start again */               \
4709   sub block, block, #16;                                                       \
4710   add texture_offset, texture_offset, #0xF00;                                  \
4711   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4712
4713 /* Half-width 8bpp tile: queue one block of 8 raw texel bytes per row, using the edge (left/right) mask register. */
4714 #define setup_sprite_tile_half_8bpp(edge)                                      \
4715   setup_sprite_tile_add_blocks(single);                                        \
4716   add block, block, #16;                                                       \
4717   /* block is biased by +16 in this loop: texels live at +16 */                \
4718  4:                                                                            \
4719   and texture_block_ptr, texture_offset, texture_mask;                         \
4720   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4721   pld [fb_ptr];                                                                \
4722                                                                                \
4723   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4724   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4725   /* 8bpp: the eight texel bytes are stored as loaded, no CLUT */              \
4726   vst1.u32 { texels }, [block, :64];                                           \
4727   add block, block, #24;                                                       \
4728                                                                                \
4729   vst1.u32 { draw_mask_fb_ptr_##edge }, [block, :64];                          \
4730   add block, block, #40;                                                       \
4731                                                                                \
4732   add texture_offset, texture_offset, #0x10;                                   \
4733   add fb_ptr, fb_ptr, #2048;                                                   \
4734                                                                                \
4735   subs sub_tile_height, sub_tile_height, #1;                                   \
4736   bne 4b;                                                                      \
4737   /* undo the +16 bias so block points at a block start again */               \
4738   sub block, block, #16;                                                       \
4739   add texture_offset, texture_offset, #0xF00;                                  \
4740   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
4741
4742 /* Pre-adjust: seed texture_offset for a column. A right-half tile starts 8 bytes into the texture row and 16 bytes into fb_ptr. */
4743 #define setup_sprite_tile_column_edge_pre_adjust_half_right()                  \
4744   add texture_offset, texture_offset_base, #8;                                 \
4745   add fb_ptr, fb_ptr, #16                                                      \
4746
4747 #define setup_sprite_tile_column_edge_pre_adjust_half_left()                   \
4748   mov texture_offset, texture_offset_base                                      \
4749 /* Dispatch on edge (left or right) by token pasting. */
4750 #define setup_sprite_tile_column_edge_pre_adjust_half(edge)                    \
4751   setup_sprite_tile_column_edge_pre_adjust_half_##edge()                       \
4752
4753 #define setup_sprite_tile_column_edge_pre_adjust_full(edge)                    \
4754   mov texture_offset, texture_offset_base                                      \
4755 /* Post-adjust: take back the 16-byte fb_ptr shift of a right-half column; the other cases expand to nothing. */
4756 #define setup_sprite_tile_column_edge_post_adjust_half_right()                 \
4757   sub fb_ptr, fb_ptr, #16                                                      \
4758
4759 #define setup_sprite_tile_column_edge_post_adjust_half_left()                  \
4760
4761 #define setup_sprite_tile_column_edge_post_adjust_half(edge)                   \
4762   setup_sprite_tile_column_edge_post_adjust_half_##edge()                      \
4763
4764 #define setup_sprite_tile_column_edge_post_adjust_full(edge)                   \
4765
4766 /* One column, one tile high: column_data is the row count. x4mode is pasted onto the helper macro names as a suffix. */
4767 #define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,  \
4768  x4mode)                                                                       \
4769   mov sub_tile_height, column_data;                                            \
4770   setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
4771   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4772   setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
4773 /* One column, several tiles high. column_data: bits 0-7 first tile rows, bits 8-15 last tile rows, bits 16+ remaining tiles. */
4774 #define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,   \
4775  x4mode)                                                                       \
4776   and sub_tile_height, column_data, #0xFF;                                     \
4777   mov tiles_remaining, column_data, lsr #16;                                   \
4778   setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
4779   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4780   /* middle tiles (if any) are always 16 rows tall */                          \
4781   subs tiles_remaining, tiles_remaining, #1;                                   \
4782   beq 2f;                                                                      \
4783                                                                                \
4784  3:                                                                            \
4785   mov sub_tile_height, #16;                                                    \
4786   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4787   subs tiles_remaining, tiles_remaining, #1;                                   \
4788   bne 3b;                                                                      \
4789   /* last tile: row count comes from bits 8-15 of column_data */               \
4790  2:                                                                            \
4791   uxtb sub_tile_height, column_data, ror #8;                                   \
4792   setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
4793   setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
4794
4795 /* Single-tile-high case: column_data is simply height; texture_page_ptr is loaded from psx_gpu. */
4796 #define setup_sprite_column_data_single()                                      \
4797   mov column_data, height;                                                     \
4798   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]            \
4799 /* Multi-tile-high case: pack the first/last tile row counts and the tile count into column_data. */
4800 #define setup_sprite_column_data_multi()                                       \
4801   and height_rounded, height_rounded, #0xF;                                    \
4802   rsb column_data, offset_v, #16;                                              \
4803   /* low byte: rows in the first tile (16 - offset_v) */                       \
4804   add height_rounded, height_rounded, #1;                                      \
4805   sub tile_height, tile_height, #1;                                            \
4806   /* bits 16+: tile_height - 1, the tiles left after the first */              \
4807   orr column_data, column_data, tile_height, lsl #16;                          \
4808   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset];           \
4809   /* bits 8-15: rows in the last tile, (height_rounded & 15) + 1 */            \
4810   orr column_data, column_data, height_rounded, lsl #8                         \
4811 /* Left column: replicate block_masks bytes 0 and 1 through the left/right mask registers (word 1 is later replaced by fb_ptr). */
4812 #define setup_sprite_setup_left_draw_mask_fb_ptr()                             \
4813   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4814   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
4815 /* As above, and also computes the fb_ptr step that leads from the end of one column to the start of the next. */
4816 #define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column()              \
4817   mov fb_ptr_advance_column, #32;                                              \
4818   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4819   /* 32 - height * 2048: back to the top row, 16 pixels right */               \
4820   sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11;           \
4821   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
4822 /* Right column: same registers, filled from block_masks bytes 4 and 5. */
4823 #define setup_sprite_setup_right_draw_mask_fb_ptr()                            \
4824   vdup.u8 draw_mask_fb_ptr_left, block_masks[4];                               \
4825   vdup.u8 draw_mask_fb_ptr_right, block_masks[5]                               \
4826 /* Defines a labelled routine for a sprite one tile column wide; it pops r4-r11 and pc, presumably pushed by its caller (TODO confirm). */
4827 #define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode,     \
4828  edge, x4mode)                                                                 \
4829  setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode:     \
4830   setup_sprite_column_data_##multi_height();                                   \
4831   vext.32 block_masks_shifted, block_masks, block_masks, #1;                   \
4832   vorr.u32 block_masks, block_masks, block_masks_shifted;                      \
4833   setup_sprite_setup_left_draw_mask_fb_ptr##x4mode();                          \
4834   /* a lone column carries both the left and the right masks */                \
4835   setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
4836   restore_abi_regs();                                                          \
4837   ldmia sp!, { r4 - r11, pc }                                                  \
4838 /* Step texture_offset_base by 0x100 to the next column; if bits 8-11 wrap to zero, subtract 0x1000 so the step stays within those bits. */
4839 #define setup_sprite_tiled_advance_column()                                    \
4840   add texture_offset_base, texture_offset_base, #0x100;                        \
4841   tst texture_offset_base, #0xF00;                                             \
4842   subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00)             \
4843 /* Defines a labelled routine for a sprite two or more tile columns wide: first column, tile_width - 2 interior columns, last column. */
4844 #define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,      \
4845  right_mode, x4mode)                                                           \
4846  setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
4847   setup_sprite_column_data_##multi_height();                                   \
4848   /* first (leftmost) column */                                                \
4849   setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode();           \
4850                                                                                \
4851   setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
4852                                                                                \
4853   subs tile_width, tile_width, #2;                                             \
4854   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4855   /* tile_width == 2: there are no interior columns */                         \
4856   beq 1f;                                                                      \
4857   /* interior columns are drawn with all-zero draw masks */                    \
4858   vmov.u8 draw_masks_fb_ptrs, #0;                                              \
4859   vmov.u8 draw_masks_fb_ptrs2, #0;                                             \
4860                                                                                \
4861  0:                                                                            \
4862   setup_sprite_tiled_advance_column();                                         \
4863   setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode);      \
4864   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4865   subs tile_width, tile_width, #1;                                             \
4866   bne 0b;                                                                      \
4867                                                                                \
4868  1:                                                                            \
4869   setup_sprite_setup_right_draw_mask_fb_ptr##x4mode();                         \
4870   /* last (rightmost) column */                                                \
4871   setup_sprite_tiled_advance_column();                                         \
4872   setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
4873   restore_abi_regs();                                                          \
4874   ldmia sp!, { r4 - r11, pc }                                                  \
4875
4876
/*
 * Non-4x variant of the offset_u adjustment hook: expands to nothing.
 * The _4x counterpart does the actual work.
 */
4877 #define setup_sprite_offset_u_adjust()                                         \
4878
/* Non-4x: keep only the low 8 bits of left_block_mask. */
4879 #define setup_sprite_get_left_block_mask()                                     \
4880   and left_block_mask, left_block_mask, #0xFF                                  \
4881
/*
 * Non-4x: compare left_block_mask with 0xFF.  Only sets the flags; the
 * sprite setup function consumes the EQ result with a later orreq.
 */
4882 #define setup_sprite_compare_left_block_mask()                                 \
4883   cmp left_block_mask, #0xFF                                                   \
4884
/*
 * Non-4x: replace right_block_mask with its bits 8-15, zero-extended
 * (rotate right by 8, then take the low byte).  Does not touch the flags.
 */
4885 #define setup_sprite_get_right_block_mask()                                    \
4886   uxtb right_block_mask, right_block_mask, ror #8                              \
4887
/*
 * Non-4x: compare right_block_mask with 0xFF.  Only sets the flags; the
 * sprite setup function consumes the EQ result with a later orreq.
 */
4888 #define setup_sprite_compare_right_block_mask()                                \
4889   cmp right_block_mask, #0xFF                                                  \
4890
4891
4892
4893 /* 4x stuff */
/*
 * fb_ptr2 is a second name for the register called column_data.  The 4x
 * tile macros push column_data before using fb_ptr2 as a scratch pointer and
 * pop it again afterwards.
 */
4894 #define fb_ptr2 column_data
4895
/*
 * 4x variant of the offset_u adjustment hook:
 *   fb_ptr         -= offset_u * 2   (on top of the same subtraction already
 *                                     done by the sprite setup function)
 *   offset_u_right  = offset_u_right * 2 + 1
 *   offset_u        = offset_u * 2
 * fb_ptr is adjusted first, while offset_u still holds its original value.
 */
4896 #define setup_sprite_offset_u_adjust_4x()                                      \
4897   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
4898   lsl offset_u_right, #1;                                                      \
4899   lsl offset_u, #1;                                                            \
4900   add offset_u_right, #1                                                       \
4901
/* 4x: sign-extend the low 16 bits of left_block_mask to 32 bits. */
4902 #define setup_sprite_get_left_block_mask_4x()                                  \
4903   sxth left_block_mask, left_block_mask                                        \
4904
/*
 * 4x: compare left_block_mask with 0xFFFFFFFF (all ones after the sign
 * extension above).  Only sets the flags for the following orreq.
 */
4905 #define setup_sprite_compare_left_block_mask_4x()                              \
4906   cmp left_block_mask, #0xFFFFFFFF                                             \
4907
/*
 * 4x: replace right_block_mask with its bits 16-31, sign-extended to
 * 32 bits (rotate right by 16, then sign-extend the low halfword).
 * Does not touch the flags.
 */
4908 #define setup_sprite_get_right_block_mask_4x()                                 \
4909   sxth right_block_mask, right_block_mask, ror #16                             \
4910
/*
 * 4x: compare right_block_mask with 0xFFFFFFFF.  Only sets the flags for
 * the following orreq.
 */
4911 #define setup_sprite_compare_right_block_mask_4x()                             \
4912   cmp right_block_mask, #0xFFFFFFFF                                            \
4913
4914
/*
 * Double every 16-bit element of texels_: the register is copied into both
 * texels_wide_low and texels_wide_high, and zipping the two identical copies
 * leaves each source element twice in a row across the low/high pair.
 *
 *  texels_ - source register; left unmodified
 */
4915 #define widen_texels_16bpp(texels_)                                            \
4916   vmov texels_wide_low, texels_;                                               \
4917   vmov texels_wide_high, texels_;                                              \
4918   vzip.16 texels_wide_low, texels_wide_high                                    \
4919
/*
 * Double every byte of texels_: the register is copied into both
 * texels_wide_low and texels_wide_high, and zipping the two identical copies
 * leaves each source byte twice in a row across the low/high pair.
 *
 *  texels_ - source register; left unmodified
 */
4920 #define widen_texels_8bpp(texels_)                                             \
4921   vmov texels_wide_low, texels_;                                               \
4922   vmov texels_wide_high, texels_;                                              \
4923   vzip.8 texels_wide_low, texels_wide_high                                     \
4924
/*
 * Fill in one render block for 16bpp texels and step to the next block.
 *
 *  texels_           - stored at block_ (address hinted as 128-bit aligned)
 *  block_            - block pointer; advanced by 40 + 24 = 64 bytes in total,
 *                      the same stride the setup function uses
 *                      (num_blocks << 6)
 *  draw_mask_fb_ptr_ - lane 1 (32-bit) is overwritten with fb_ptr_, then the
 *                      whole 64-bit register is stored at block_ + 40
 *  fb_ptr_           - framebuffer pointer recorded in the block
 */
4925 #define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)         \
4926   vst1.u32 { texels_ }, [block_, :128];                                        \
4927   add block_, block_, #40;                                                     \
4928                                                                                \
4929   vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
4930   vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64];                               \
4931   add block_, block_, #24                                                      \
4932
4933 /* assumes 16-byte offset already added to block_ */
/*
 * Fill in one render block for 8bpp texels and step to the next block.
 * block_ must already point 16 bytes into the block on entry and points
 * 16 bytes into the following block on exit (24 + 40 = 64 bytes advanced).
 *
 *  texels_           - stored at block_ (address hinted as 64-bit aligned)
 *  block_            - block pointer, see above
 *  draw_mask_fb_ptr_ - lane 1 (32-bit) is overwritten with fb_ptr_, then the
 *                      whole 64-bit register is stored 24 bytes further on
 *  fb_ptr_           - framebuffer pointer recorded in the block
 */
4934 #define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)          \
4935   vst1.u32 { texels_ }, [block_, :64];                                         \
4936   add block_, block_, #24;                                                     \
4937                                                                                \
4938   vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
4939   vst1.u32 { draw_mask_fb_ptr_ }, [block_, :64];                               \
4940   add block_, block_, #40                                                      \
4941
/*
 * Emit four 16bpp blocks from texels_low / texels_high, each source texel
 * doubled horizontally by widen_texels_16bpp:
 *
 *   widened texels_low   -> fb_ptr             and fb_ptr + 1024*2
 *   widened texels_high  -> fb_ptr + 8*2       and fb_ptr + 8*2 + 1024*2
 *
 * The 1024*2 byte step equals the row stride the setup function applies
 * (y << 11), so each pair appears to cover two consecutive framebuffer rows.
 *
 *  fb_ptr_tmp          - scratch register, clobbered
 *  draw_mask_fb_ptr_a_ - mask register used for the first pair of blocks
 *  draw_mask_fb_ptr_b_ - mask register used for the second pair of blocks
 *
 * Lane 1 of both mask registers is overwritten by write_block_16bpp; block
 * advances by four blocks; fb_ptr itself is not modified.
 */
4942 #define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,             \
4943  draw_mask_fb_ptr_b_)                                                          \
4944   widen_texels_16bpp(texels_low);                                              \
4945   add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
4946                                                                                \
4947   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr);          \
4948                                                                                \
4949   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);      \
4950   widen_texels_16bpp(texels_high);                                             \
4951                                                                                \
4952   add fb_ptr_tmp, fb_ptr, #8*2;                                                \
4953   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);      \
4954                                                                                \
4955   add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
4956   write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)       \
4957
/*
 * Emit four 8bpp blocks from texels, each source byte doubled horizontally
 * by a single widen_texels_8bpp:
 *
 *   texels_wide_low   -> fb_ptr             and fb_ptr + 1024*2
 *   texels_wide_high  -> fb_ptr + 8*2       and fb_ptr + 8*2 + 1024*2
 *
 * The 1024*2 byte step equals the row stride the setup function applies
 * (y << 11), so each pair appears to cover two consecutive framebuffer rows.
 *
 *  fb_ptr_tmp          - scratch register, clobbered
 *  draw_mask_fb_ptr_a_ - mask register used for the first pair of blocks
 *  draw_mask_fb_ptr_b_ - mask register used for the second pair of blocks
 *
 * block must already carry the 16-byte offset write_block_8bpp expects.
 * Lane 1 of both mask registers is overwritten; fb_ptr is not modified.
 */
4958 #define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,              \
4959  draw_mask_fb_ptr_b_)                                                          \
4960   widen_texels_8bpp(texels);                                                   \
4961   add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
4962                                                                                \
4963   write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr);       \
4964   write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);   \
4965                                                                                \
4966   add fb_ptr_tmp, fb_ptr, #8*2;                                                \
4967   write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);  \
4968                                                                                \
4969   add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
4970   write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)   \
4971
4972
/*
 * One-time setup for 4bpp sprites in 4x mode: fetch the CLUT pointer from
 * psx_gpu, load clut_a / clut_b from it (address hinted as 128-bit aligned)
 * and de-interleave the bytes, so that the even-numbered bytes end up in
 * clut_a and the odd-numbered bytes in clut_b.
 * Presumably these are the low and high bytes of the 16-bit palette entries
 * consumed as clut_low_* / clut_high_* by the tile macros -- confirm against
 * the register alias definitions.
 */
4973 #define setup_sprite_tiled_initialize_4bpp_4x()                                \
4974   ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset];                           \
4975   vld1.u32 { clut_a, clut_b }, [clut_ptr, :128];                               \
4976                                                                                \
4977   vuzp.u8 clut_a, clut_b                                                       \
4978
/* 8bpp sprites in 4x mode need no one-time setup: expands to nothing. */
4979 #define setup_sprite_tiled_initialize_8bpp_4x()                                \
4980
4981
/*
 * Operand fragment, not a complete instruction: sub_tile_height * 4.
 * Selected by name through setup_sprite_tile_add_blocks(single_4x) in the
 * half-tile macros; that helper is defined outside this section, so how the
 * fragment is spliced in should be checked there.
 */
4982 #define setup_sprite_block_count_single_4x()                                   \
4983   sub_tile_height, lsl #2                                                      \
4984
/*
 * Operand fragment, not a complete instruction: sub_tile_height * 8
 * (shift by 1 + 2).  Selected by name through
 * setup_sprite_tile_add_blocks(double_4x) in the full-tile macros; that
 * helper is defined outside this section.
 */
4985 #define setup_sprite_block_count_double_4x()                                   \
4986   sub_tile_height, lsl #(1+2)                                                  \
4987
/*
 * 4x mode, 4bpp texture, full-width tile.  The edge argument is not used.
 *
 * After accounting for the new blocks (setup_sprite_tile_add_blocks), the
 * loop at label 4 runs once per texture row until sub_tile_height reaches
 * zero.  Each pass:
 *   - loads 8 bytes from texture_page_ptr + (texture_offset & texture_mask),
 *     looks them up in the low and high CLUT tables and zips the two results
 *     into texels_low / texels_high;
 *   - emits four blocks for them using the left pair of draw masks;
 *   - does the same for the 8 bytes at texture_offset + 8 with fb_ptr moved
 *     on by 16*2 bytes, using the right pair of draw masks;
 *   - advances texture_offset by 0x10 and fb_ptr by a further (2048 - 16)*2
 *     bytes, i.e. 4096 bytes per pass in total.
 *
 * column_data is pushed for the duration of the loop because fb_ptr2, the
 * scratch pointer handed to do_texture_block_16bpp_4x, is the same register;
 * the push/pop moves sp by 8 bytes.  On exit texture_offset is advanced by
 * 0xF00 and num_blocks is stored back to psx_gpu.
 *
 * Clobbers texture_block_ptr, texels, texels_low/high, fb_ptr2 (inside the
 * loop) and the flags.  Uses the numeric local label 4.
 */
4988 #define setup_sprite_tile_full_4bpp_4x(edge)                                   \
4989   setup_sprite_tile_add_blocks(double_4x);                                     \
4990   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
4991                                                                                \
4992  4:                                                                            \
4993   and texture_block_ptr, texture_offset, texture_mask;                         \
4994   pld [fb_ptr];                                                                \
4995                                                                                \
4996   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4997   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
4998                                                                                \
4999   add texture_block_ptr, texture_offset, #8;                                   \
5000   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
5001                                                                                \
5002   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
5003   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5004                                                                                \
5005   vzip.8 texels_low, texels_high;                                              \
5006   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                  \
5007    draw_mask_fb_ptr_left_b);                                                   \
5008                                                                                \
5009   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5010   pld [fb_ptr, #2048];                                                         \
5011                                                                                \
5012   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5013   add fb_ptr, fb_ptr, #16*2;                                                   \
5014                                                                                \
5015   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
5016   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5017                                                                                \
5018   vzip.8 texels_low, texels_high;                                              \
5019   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                 \
5020    draw_mask_fb_ptr_right_b);                                                  \
5021                                                                                \
5022   add texture_offset, texture_offset, #0x10;                                   \
5023   add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
5024                                                                                \
5025   subs sub_tile_height, sub_tile_height, #1;                                   \
5026   bne 4b;                                                                      \
5027                                                                                \
5028   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5029   add texture_offset, texture_offset, #0xF00;                                  \
5030   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5031
5032
/*
 * 4x mode, 4bpp texture, half-width tile.
 *
 *  edge - pasted into the draw mask names, selecting the pair
 *         draw_mask_fb_ptr_<edge>_a / draw_mask_fb_ptr_<edge>_b
 *
 * After accounting for the new blocks, the loop at label 4 runs once per
 * texture row until sub_tile_height reaches zero.  Each pass loads 8 bytes
 * from texture_page_ptr + (texture_offset & texture_mask), looks them up in
 * the low and high CLUT tables, zips the results into texels_low/high, emits
 * four blocks, then advances texture_offset by 0x10 and fb_ptr by 2048*2
 * bytes.
 *
 * column_data is pushed for the duration of the loop because fb_ptr2 is the
 * same register.  On exit texture_offset is advanced by 0xF00 and num_blocks
 * is stored back to psx_gpu.
 *
 * NOTE(review): the second "add texture_block_ptr, texture_page_ptr, ..."
 * right after the vld1 is not read again inside this macro (the register is
 * recomputed at the top of the loop); it looks like a leftover -- confirm
 * nothing after the macro depends on it before removing.
 *
 * Clobbers texture_block_ptr, texels, texels_low/high, fb_ptr2 (inside the
 * loop) and the flags.  Uses the numeric local label 4.
 */
5033 #define setup_sprite_tile_half_4bpp_4x(edge)                                   \
5034   setup_sprite_tile_add_blocks(single_4x);                                     \
5035   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5036                                                                                \
5037  4:                                                                            \
5038   and texture_block_ptr, texture_offset, texture_mask;                         \
5039   pld [fb_ptr];                                                                \
5040                                                                                \
5041   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5042   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5043                                                                                \
5044   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5045   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
5046                                                                                \
5047   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
5048   add texture_offset, texture_offset, #0x10;                                   \
5049                                                                                \
5050   vzip.8 texels_low, texels_high;                                              \
5051   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,              \
5052    draw_mask_fb_ptr_##edge##_b);                                               \
5053                                                                                \
5054   pld [fb_ptr, #2048];                                                         \
5055   add fb_ptr, fb_ptr, #2048 * 2;                                               \
5056                                                                                \
5057   subs sub_tile_height, sub_tile_height, #1;                                   \
5058   bne 4b;                                                                      \
5059                                                                                \
5060   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5061   add texture_offset, texture_offset, #0xF00;                                  \
5062   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5063
5064
/*
 * 4x mode, 8bpp texture, full-width tile.  The edge argument is not used.
 *
 * block is biased by +16 bytes around the loop because write_block_8bpp
 * expects that offset; the bias is removed again before returning.
 *
 * The loop at label 4 runs once per texture row until sub_tile_height
 * reaches zero.  Each pass:
 *   - loads 8 bytes from texture_page_ptr + (texture_offset & texture_mask)
 *     and emits four blocks for them using the left pair of draw masks;
 *   - loads the 8 bytes at texture_offset + 8 (masked the same way), moves
 *     fb_ptr on by 16*2 bytes and emits four blocks using the right pair;
 *   - advances texture_offset by 0x10 and fb_ptr by a further (2048 - 16)*2
 *     bytes, i.e. 4096 bytes per pass in total.
 * No CLUT lookup happens here; the loaded bytes are stored as texels.
 *
 * column_data is pushed for the duration of the loop because fb_ptr2 is the
 * same register.  On exit texture_offset is advanced by 0xF00 and num_blocks
 * is stored back to psx_gpu.
 *
 * Clobbers texture_block_ptr, texels, fb_ptr2 (inside the loop) and the
 * flags.  Uses the numeric local label 4.
 */
5065 #define setup_sprite_tile_full_8bpp_4x(edge)                                   \
5066   setup_sprite_tile_add_blocks(double_4x);                                     \
5067   add block, block, #16;                                                       \
5068   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5069                                                                                \
5070  4:                                                                            \
5071   and texture_block_ptr, texture_offset, texture_mask;                         \
5072   pld [fb_ptr];                                                                \
5073                                                                                \
5074   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5075   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5076                                                                                \
5077   add texture_block_ptr, texture_offset, #8;                                   \
5078   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                   \
5079    draw_mask_fb_ptr_left_b);                                                   \
5080                                                                                \
5081   pld [fb_ptr, #2048];                                                         \
5082   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
5083                                                                                \
5084   add fb_ptr, fb_ptr, #16*2;                                                   \
5085   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5086                                                                                \
5087   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5088                                                                                \
5089   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                  \
5090    draw_mask_fb_ptr_right_b);                                                  \
5091                                                                                \
5092   add texture_offset, texture_offset, #0x10;                                   \
5093   add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
5094                                                                                \
5095   subs sub_tile_height, sub_tile_height, #1;                                   \
5096   bne 4b;                                                                      \
5097                                                                                \
5098   sub block, block, #16;                                                       \
5099   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5100   add texture_offset, texture_offset, #0xF00;                                  \
5101   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5102
5103   
/*
 * 4x mode, 8bpp texture, half-width tile.
 *
 *  edge - pasted into the draw mask names, selecting the pair
 *         draw_mask_fb_ptr_<edge>_a / draw_mask_fb_ptr_<edge>_b
 *
 * block is biased by +16 bytes around the loop because write_block_8bpp
 * expects that offset; the bias is removed again before returning.
 *
 * The loop at label 4 runs once per texture row until sub_tile_height
 * reaches zero.  Each pass loads 8 bytes from
 * texture_page_ptr + (texture_offset & texture_mask), emits four blocks for
 * them, then advances texture_offset by 0x10 and fb_ptr by 2048*2 bytes.
 *
 * column_data is pushed for the duration of the loop because fb_ptr2 is the
 * same register.  On exit texture_offset is advanced by 0xF00 and num_blocks
 * is stored back to psx_gpu.
 *
 * Clobbers texture_block_ptr, texels, fb_ptr2 (inside the loop) and the
 * flags.  Uses the numeric local label 4.
 */
5104 #define setup_sprite_tile_half_8bpp_4x(edge)                                   \
5105   setup_sprite_tile_add_blocks(single_4x);                                     \
5106   add block, block, #16;                                                       \
5107   str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
5108                                                                                \
5109  4:                                                                            \
5110   and texture_block_ptr, texture_offset, texture_mask;                         \
5111   pld [fb_ptr];                                                                \
5112                                                                                \
5113   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
5114   vld1.u32 { texels }, [texture_block_ptr, :64];                               \
5115                                                                                \
5116   pld [fb_ptr, #2048];                                                         \
5117   do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,               \
5118    draw_mask_fb_ptr_##edge##_b);                                               \
5119                                                                                \
5120   add texture_offset, texture_offset, #0x10;                                   \
5121   add fb_ptr, fb_ptr, #2048 * 2;                                               \
5122                                                                                \
5123   subs sub_tile_height, sub_tile_height, #1;                                   \
5124   bne 4b;                                                                      \
5125                                                                                \
5126   sub block, block, #16;                                                       \
5127   ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
5128   add texture_offset, texture_offset, #0xF00;                                  \
5129   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]                       \
5130
5131  
/*
 * 4x, half tile, right edge: start texture_offset 8 past texture_offset_base
 * and move fb_ptr forward by 16*2 bytes.  The matching post-adjust macro
 * takes the 16*2 bytes off again.
 */
5132 #define setup_sprite_tile_column_edge_pre_adjust_half_right_4x()               \
5133   add texture_offset, texture_offset_base, #8;                                 \
5134   add fb_ptr, fb_ptr, #16 * 2                                                  \
5135
/*
 * 4x, half tile, left edge: texture_offset starts at texture_offset_base;
 * fb_ptr is left alone.
 */
5136 #define setup_sprite_tile_column_edge_pre_adjust_half_left_4x()                \
5137   mov texture_offset, texture_offset_base                                      \
5138
/*
 * Dispatch to the half-tile pre-adjust macro for the given edge
 * (variants for "left" and "right" are defined in this section).
 */
5139 #define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge)                 \
5140   setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x()                  \
5141
/*
 * 4x, full tile: texture_offset starts at texture_offset_base regardless of
 * the edge argument, which is not used.
 */
5142 #define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge)                 \
5143   mov texture_offset, texture_offset_base                                      \
5144
/*
 * 4x, half tile, right edge: take back the 16*2 bytes that the matching
 * pre-adjust macro added to fb_ptr.
 */
5145 #define setup_sprite_tile_column_edge_post_adjust_half_right_4x()              \
5146   sub fb_ptr, fb_ptr, #16 * 2                                                  \
5147
/*
 * 4x, half tile, left edge: nothing to undo (the pre-adjust did not move
 * fb_ptr), so this expands to nothing.
 */
5148 #define setup_sprite_tile_column_edge_post_adjust_half_left_4x()               \
5149
/*
 * Dispatch to the half-tile post-adjust macro for the given edge
 * (variants for "left" and "right" are defined in this section).
 */
5150 #define setup_sprite_tile_column_edge_post_adjust_half_4x(edge)                \
5151   setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x()                 \
5152
/* 4x, full tile: no post-adjustment; expands to nothing.  edge is unused. */
5153 #define setup_sprite_tile_column_edge_post_adjust_full_4x(edge)                \
5154
5155
/*
 * 4x: load the four draw mask registers for the left column by replicating
 * bytes 0-3 of block_masks, one byte per register, across all byte lanes:
 *   left_a <- byte 0, left_b <- byte 1, right_a <- byte 2, right_b <- byte 3
 */
5156 #define setup_sprite_setup_left_draw_mask_fb_ptr_4x()                          \
5157   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
5158   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
5159   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
5160   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
5161
/*
 * 4x: same draw mask setup as the plain left-column variant (bytes 0-3 of
 * block_masks replicated into left_a, left_b, right_a, right_b), interleaved
 * with the computation of the per-column framebuffer step:
 *
 *   fb_ptr_advance_column = 32*2 - (height << (11 + 1))
 *
 * i.e. 64 bytes forward minus 4096 bytes per row of height.  This is the
 * value the multi-column code adds to fb_ptr after each column.
 */
5162 #define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x()           \
5163   mov fb_ptr_advance_column, #32 * 2;                                          \
5164   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
5165   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
5166   sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1;       \
5167   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
5168   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
5169
/*
 * 4x: load the four draw mask registers for the right column by replicating
 * bytes 4-7 of block_masks, one byte per register, across all byte lanes:
 *   left_a <- byte 4, left_b <- byte 5, right_a <- byte 6, right_b <- byte 7
 */
5170 #define setup_sprite_setup_right_draw_mask_fb_ptr_4x()                         \
5171   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4];                             \
5172   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5];                             \
5173   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6];                            \
5174   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7]                             \
5175
5176
5177 // r0: psx_gpu
5178 // r1: x
5179 // r2: y
5180 // r3: u
5181 // [sp]: v
5182 // [sp + 4]: width
5183 // [sp + 8]: height
5184 // [sp + 12]: color (unused)
5185
5186 #define setup_sprite_tiled_builder(texture_mode, x4mode)                       \
5187                                                                                \
5188 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, full,        \
5189   x4mode);                                                                     \
5190 setup_sprite_tile_column_width_single(texture_mode, multi,  full, none,        \
5191   x4mode);                                                                     \
5192 setup_sprite_tile_column_width_multi(texture_mode,  single, full, full,        \
5193   x4mode);                                                                     \
5194 setup_sprite_tile_column_width_single(texture_mode, single, full, none,        \
5195   x4mode);                                                                     \
5196 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, full,        \
5197   x4mode);                                                                     \
5198 setup_sprite_tile_column_width_single(texture_mode, multi,  half, right,       \
5199   x4mode);                                                                     \
5200 setup_sprite_tile_column_width_multi(texture_mode,  single, half, full,        \
5201   x4mode);                                                                     \
5202 setup_sprite_tile_column_width_single(texture_mode, single, half, right,       \
5203   x4mode);                                                                     \
5204 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, half,        \
5205   x4mode);                                                                     \
5206 setup_sprite_tile_column_width_single(texture_mode, multi,  half, left,        \
5207   x4mode);                                                                     \
5208 setup_sprite_tile_column_width_multi(texture_mode,  single, full, half,        \
5209   x4mode);                                                                     \
5210 setup_sprite_tile_column_width_single(texture_mode, single, half, left,        \
5211   x4mode);                                                                     \
5212 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, half,        \
5213   x4mode);                                                                     \
5214 setup_sprite_tile_column_width_multi(texture_mode,  single, half, half,        \
5215   x4mode);                                                                     \
5216                                                                                \
5217 .align 4;                                                                      \
5218                                                                                \
5219 function(setup_sprite_##texture_mode##x4mode)                                  \
5220   stmdb sp!, { r4 - r11, r14 };                                                \
5221   setup_sprite_tiled_initialize_##texture_mode##x4mode();                      \
5222                                                                                \
5223   ldr v, [sp, #36];                                                            \
5224   and offset_u, u, #0xF;                                                       \
5225                                                                                \
5226   ldr width, [sp, #40];                                                        \
5227   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset];                         \
5228                                                                                \
5229   ldr height, [sp, #44];                                                       \
5230   add fb_ptr, fb_ptr, y, lsl #11;                                              \
5231                                                                                \
5232   save_abi_regs();                                                             \
5233                                                                                \
5234   add fb_ptr, fb_ptr, x, lsl #1;                                               \
5235   and offset_v, v, #0xF;                                                       \
5236                                                                                \
5237   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
5238   add width_rounded, offset_u, width;                                          \
5239                                                                                \
5240   add height_rounded, offset_v, height;                                        \
5241   add width_rounded, width_rounded, #15;                                       \
5242                                                                                \
5243   add height_rounded, height_rounded, #15;                                     \
5244   mov tile_width, width_rounded, lsr #4;                                       \
5245                                                                                \
5246   /* texture_offset_base = VH-VL-00-00                                       */\
5247   mov texture_offset_base, v, lsl #8;                                          \
5248   and offset_u_right, width_rounded, #0xF;                                     \
5249                                                                                \
5250   /* texture_offset_base = VH-UH-UL-00                                       */\
5251   bfi texture_offset_base, u, #4, #8;                                          \
5252   mov right_block_mask, #0xFFFFFFFE;                                           \
5253                                                                                \
5254   setup_sprite_offset_u_adjust##x4mode();                                      \
5255                                                                                \
5256   /* texture_offset_base = VH-UH-VL-00                                       */\
5257   bfi texture_offset_base, v, #4, #4;                                          \
5258   mov left_block_mask, #0xFFFFFFFF;                                            \
5259                                                                                \
5260   mov tile_height, height_rounded, lsr #4;                                     \
5261   mvn left_block_mask, left_block_mask, lsl offset_u;                          \
5262                                                                                \
5263   /* texture_mask = HH-HL-WH-WL                                              */\
5264   ldrh texture_mask, [psx_gpu, #psx_gpu_texture_mask_width_offset];            \
5265   mov right_block_mask, right_block_mask, lsl offset_u_right;                  \
5266                                                                                \
5267   /* texture_mask_rev = WH-WL-HH-HL                                          */\
5268   rev16 texture_mask_rev, texture_mask;                                        \
5269   vmov block_masks, left_block_mask, right_block_mask;                         \
5270                                                                                \
5271   /* texture_mask = HH-HL-HL-WL                                              */\
5272   bfi texture_mask, texture_mask_rev, #4, #4;                                  \
5273   /* texture_mask_rev = 00-00-00-WH                                          */\
5274   mov texture_mask_rev, texture_mask_rev, lsr #12;                             \
5275                                                                                \
5276   /* texture_mask = HH-WH-HL-WL                                              */\
5277   bfi texture_mask, texture_mask_rev, #8, #4;                                  \
5278   setup_sprite_get_left_block_mask##x4mode();                                  \
5279                                                                                \
5280   mov control_mask, #0;                                                        \
5281   setup_sprite_compare_left_block_mask##x4mode();                              \
5282                                                                                \
5283   setup_sprite_get_right_block_mask##x4mode();                                 \
5284   orreq control_mask, control_mask, #0x4;                                      \
5285                                                                                \
5286   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset];                      \
5287   setup_sprite_compare_right_block_mask##x4mode();                             \
5288                                                                                \
5289   orreq control_mask, control_mask, #0x8;                                      \
5290   cmp tile_width, #1;                                                          \
5291                                                                                \
5292   add block, psx_gpu, #psx_gpu_blocks_offset;                                  \
5293   orreq control_mask, control_mask, #0x1;                                      \
5294                                                                                \
5295   cmp tile_height, #1;                                                         \
5296   add block, block, num_blocks, lsl #6;                                        \
5297                                                                                \
5298   orreq control_mask, control_mask, #0x2;                                      \
5299   JT_OP_REL(9f, control_mask, temp);                                           \
5300   JT_OP(ldr pc, [pc, control_mask, lsl #2]);                                   \
5301   nop;                                                                         \
5302                                                                                \
5303  9:                                                                            \
5304  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode);   \
5305  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode);  \
5306  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode);  \
5307  .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
5308  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode);   \
5309  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
5310  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode);  \
5311  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
5312  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode);   \
5313  .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode);  \
5314  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode);  \
5315  .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
5316  .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode);   \
5317  .word 0x00000000;                                                             \
5318  .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode);  \
5319
5320
/* Expand the tiled sprite setup code for 4bpp and 8bpp textures. The second
 * macro argument is left empty here, so nothing is appended to the helper
 * and jump table target names that the macro builds by token pasting. */
5321 setup_sprite_tiled_builder(4bpp,);
5322 setup_sprite_tiled_builder(8bpp,);
5323
/* NOTE(review): these two aliases are removed before the _4x expansions;
 * presumably the _4x helpers do not use them or expect them to be undefined.
 * Confirm against the helper macros defined earlier in the file. */
5324 #undef draw_mask_fb_ptr_left
5325 #undef draw_mask_fb_ptr_right
5326
/* Expand the same builder again with _4x as the second argument, which gets
 * pasted onto the helper and jump table target names. */
5327 setup_sprite_tiled_builder(4bpp, _4x);
5328 setup_sprite_tiled_builder(8bpp, _4x);
5329
5330
/* Register aliases for texture_sprite_blocks_8bpp. block_ptr shares r0 with
 * psx_gpu, and every texels_NN pair shares its register with the even
 * numbered texel it is assembled from (texels_01 / texel_0 = r6 up to
 * texels_67 / texel_6 = r9). */
5331 #undef block_ptr
5332 #undef num_blocks
5333 #undef clut_ptr
5334
5335 #define psx_gpu                                           r0
5336 #define block_ptr                                         r0
5337 #define num_blocks                                        r1
5338 #define clut_ptr                                          r2
5339 #define texel_shift_mask                                  r3
5340 #define block_pixels_a                                    r4
5341 #define block_pixels_b                                    r5
5342 #define texel_0                                           r6
5343 #define texel_2                                           r7
5344 #define texel_4                                           r8
5345 #define texel_6                                           r9
5346 #define texel_1                                           r10
5347 #define texel_3                                           r11
5348 #define texel_5                                           r12
5349 #define texel_7                                           r14
5350 #define texels_01                                         r6
5351 #define texels_23                                         r7
5352 #define texels_45                                         r8
5353 #define texels_67                                         r9
5354
/*
 * texture_sprite_blocks_8bpp(psx_gpu)
 *
 * Replaces the eight 8 bit palette indices stored in every queued block by
 * the eight 16 bit pixels they select, in place.
 *
 * In:     r0 = psx_gpu
 * Reads:  num_blocks (halfword) and clut_ptr from psx_gpu; for each 64 byte
 *         block, the eight index bytes at block + 16 .. block + 23
 * Writes: for each block, eight halfwords at block + 0 .. block + 15, each
 *         loaded from clut_ptr + 2 * index
 * Regs:   r4 - r11 and the return address are saved and restored here;
 *         r0 - r3 and r12 are overwritten
 *
 * The loop is software pipelined: the first index word of the following
 * block is fetched while the current block is still being looked up, so on
 * the final pass the word at offset 16 of the block just past the last one
 * is read and then discarded.
 *
 * The loop tests its counter at the bottom, so a block count of zero is
 * rejected up front. Without that check the counter would wrap around and
 * the loop would keep rewriting memory far past the queued blocks.
 */
function(texture_sprite_blocks_8bpp)
  stmdb sp!, { r4 - r11, r14 }
  movw texel_shift_mask, #(0xFF << 1)

  ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
  ldr clut_ptr, [psx_gpu, #psx_gpu_clut_ptr_offset]

  /* Empty queue: return before any block is read or written */
  cmp num_blocks, #0
  beq 1f

  add block_ptr, psx_gpu, #psx_gpu_blocks_offset
  ldr block_pixels_a, [block_ptr, #16]

 0:
  /* texel_N = index byte N * 2, a byte offset into the halfword table.
   * The table loads are interleaved with the extraction of later indices. */
  and texel_0, texel_shift_mask, block_pixels_a, lsl #1
  ldr block_pixels_b, [block_ptr, #20]

  and texel_1, texel_shift_mask, block_pixels_a, lsr #7
  ldrh texel_0, [clut_ptr, texel_0]

  and texel_2, texel_shift_mask, block_pixels_a, lsr #15
  ldrh texel_1, [clut_ptr, texel_1]

  and texel_3, texel_shift_mask, block_pixels_a, lsr #23
  /* block_pixels_a is consumed: fetch the first index word of the next block */
  ldr block_pixels_a, [block_ptr, #(64 + 16)]

  ldrh texel_2, [clut_ptr, texel_2]
  and texel_4, texel_shift_mask, block_pixels_b, lsl #1

  ldrh texel_3, [clut_ptr, texel_3]
  and texel_5, texel_shift_mask, block_pixels_b, lsr #7

  ldrh texel_4, [clut_ptr, texel_4]
  and texel_6, texel_shift_mask, block_pixels_b, lsr #15

  ldrh texel_5, [clut_ptr, texel_5]
  and texel_7, texel_shift_mask, block_pixels_b, lsr #23

  /* Pack two pixels per word (even pixel in the low half) and store them
   * over offsets 0 - 15 of the block */
  ldrh texel_6, [clut_ptr, texel_6]
  orr texels_01, texel_0, texel_1, lsl #16

  ldrh texel_7, [clut_ptr, texel_7]
  orr texels_23, texel_2, texel_3, lsl #16

  orr texels_45, texel_4, texel_5, lsl #16
  str texels_01, [block_ptr, #0]

  orr texels_67, texel_6, texel_7, lsl #16
  str texels_23, [block_ptr, #4]

  /* The stores and the add below leave the flags alone, so the bne still
   * sees the result of this subs */
  subs num_blocks, num_blocks, #1
  str texels_45, [block_ptr, #8]

  str texels_67, [block_ptr, #12]
  add block_ptr, block_ptr, #64

  bne 0b

 1:
  ldmia sp!, { r4 - r11, pc }
5411
5412
/* Register aliases used by setup_sprites_16bpp_flush, setup_sprite_16bpp and
 * setup_sprite_16bpp_4x. Several names map onto the same register
 * (r1: x / texture_offset_base, r2: y / texture_mask / texture_mask_width,
 * r3: u / texture_page_ptr / texture_mask_height, r4: v / num_blocks /
 * left_mask_bits, r5: width / block / right_mask_bits, r8: left_offset /
 * texture_offset, r9: width_rounded / blocks_remaining, r10: right_width /
 * fb_ptr2), so a name is only valid until its register is written under
 * another name. */
5413 #undef width_rounded
5414 #undef texture_mask
5415 #undef num_blocks
5416 #undef texture_offset
5417 #undef texels_low
5418 #undef texels_high
5419 #undef texels_wide_low
5420 #undef texels_wide_high
5421 #undef texels_wide
5422 #undef fb_ptr2
5423 #undef temp
5424
5425 #define psx_gpu                                           r0
5426 #define x                                                 r1
5427 #define y                                                 r2
5428 #define u                                                 r3
5429 #define v                                                 r4
5430 #define width                                             r5
5431 #define height                                            r6
5432 #define left_offset                                       r8
5433 #define width_rounded                                     r9
5434 #define right_width                                       r10
5435
5436 #define block_width                                       r11
5437
5438 #define texture_offset_base                               r1
5439 #define texture_mask                                      r2
5440 #define texture_page_ptr                                  r3
5441 #define num_blocks                                        r4
5442 #define block                                             r5
5443 #define fb_ptr                                            r7
5444 #define texture_offset                                    r8
5445 #define blocks_remaining                                  r9
5446 #define fb_ptr2                                           r10
5447 #define fb_ptr_pitch                                      r12
5448 #define texture_block_ptr                                 r14
5449
5450 #define texture_mask_width                                r2
5451 #define texture_mask_height                               r3
5452 #define left_mask_bits                                    r4
5453 #define right_mask_bits                                   r5
5454
5455
5456 #undef block_masks
5457 #undef block_masks_shifted
5458 #undef texels
5459
     /* NEON aliases. texels (q2) is the same storage as texels_low and
      * texels_high (d4 / d5), texels_wide (q3) is the same storage as
      * texels_wide_low and texels_wide_high (d6 / d7), and draw_mask_fb_ptr
      * shares d2 with draw_mask_fb_ptr_a. */
5460 #define block_masks                                       d0
5461 #define block_masks_shifted                               d1
5462 #define draw_mask_fb_ptr                                  d2
5463 #define texels                                            q2
5464
5465 #define draw_mask_fb_ptr_a                                d2
5466 #define draw_mask_fb_ptr_b                                d3
5467 #define texels_low                                        d4
5468 #define texels_high                                       d5
5469 #define texels_wide_low                                   d6
5470 #define texels_wide_high                                  d7
5471 #define texels_wide                                       q3
5472
5473
/*
 * setup_sprites_16bpp_flush
 *
 * Local helper, entered with a conditional bl from the sprite setup loops
 * once the pending block count has gone above MAX_BLOCKS. It calls
 * flush_render_block_buffer and then restarts the block queue for the
 * caller.
 *
 * Kept across the call: d0 - d3, r0 - r3, r12, lr and whatever
 *                       EXTRA_UNSAVED_REGS expands to
 * On return:            block      = start of the psx_gpu block array
 *                       num_blocks = block_width
 */
setup_sprites_16bpp_flush:
  vpush { d0 - d3 }
  push { r0 - r3, EXTRA_UNSAVED_REGS r12, lr }

  bl flush_render_block_buffer

  pop { r0 - r3, EXTRA_UNSAVED_REGS r12, lr }
  vpop { d0 - d3 }

  /* Only the row that is about to be written counts as pending now, and it
   * goes to the head of the block array */
  mov num_blocks, block_width
  add block, psx_gpu, #psx_gpu_blocks_offset

  bx lr
5487
/*
 * setup_sprite_16bpp(psx_gpu, x, y, u, v, width, height)
 *
 * Queues the render blocks for a 16bpp textured sprite, one 64 byte block
 * per span of eight texels.
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y, r3 = u;
 *      v, width and height are read from the caller stack (sp + 36, + 40 and
 *      + 44 once the nine registers below have been pushed)
 *
 * Every block receives 16 bytes of texels at offset 0, a draw mask byte
 * replicated over offsets 40 - 43 and the framebuffer pointer at offset 44.
 * The block count in psx_gpu is written back after every sprite row, and
 * setup_sprites_16bpp_flush is called whenever the running count would go
 * above MAX_BLOCKS.
 *
 * Many aliases share a register (r4 is v, then left_mask_bits, then
 * num_blocks, for instance), and loads are interleaved with arithmetic, so
 * the instruction order below is significant.
 */
5488 function(setup_sprite_16bpp)
5489   stmdb sp!, { r4 - r11, r14 }
5490   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5491
       /* fb_ptr = vram_out_ptr + y * 2048 + x * 2 */
5492   ldr v, [sp, #36]
5493   add fb_ptr, fb_ptr, y, lsl #11
5494
5495   ldr width, [sp, #40]
5496   add fb_ptr, fb_ptr, x, lsl #1
5497
       /* left_offset = number of texels between the previous multiple of
        * eight and u */
5498   ldr height, [sp, #44]
5499   and left_offset, u, #0x7
5500
       /* texture_offset_base = u * 2 + v * 2048; this reuses the register
        * of x, which is not needed any more */
5501   add texture_offset_base, u, u
5502   add width_rounded, width, #7
5503
5504   add texture_offset_base, texture_offset_base, v, lsl #11
5505   mov left_mask_bits, #0xFF
5506   
       /* width_rounded = width + 7 + left_offset */
5507   ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
5508   add width_rounded, width_rounded, left_offset
5509
       /* Move the destination back by left_offset pixels so that it lines
        * up with the eight texel aligned source block */
5510   ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
5511   sub fb_ptr, fb_ptr, left_offset, lsl #1
5512
       /* texture_mask = mask_width * 2 + mask_height * 2048, the same
        * layout as texture_offset_base */
5513   add texture_mask, texture_mask_width, texture_mask_width
5514   mov right_mask_bits, #0xFE
5515
       /* left_mask_bits = ~(0xFF << left_offset): its low byte has the
        * lowest left_offset bits set */
5516   and right_width, width_rounded, #0x7
5517   mvn left_mask_bits, left_mask_bits, lsl left_offset
5518
       /* block_width = number of eight texel blocks per sprite row */
5519   add texture_mask, texture_mask, texture_mask_height, lsl #11
5520   mov block_width, width_rounded, lsr #3
5521
       /* right_mask_bits = 0xFE << right_width; only its low byte is used
        * further down */
5522   mov right_mask_bits, right_mask_bits, lsl right_width
5523   movw fb_ptr_pitch, #(2048 + 16)
5524
       /* fb_ptr_pitch = 2048 - 16 * (block_width - 1): the multi block path
        * advances fb_ptr by 16 for every block but the last one, so adding
        * the pitch lands on the next line.
        * block_masks lane 0 = left mask, lane 1 = right mask. */
5525   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
5526   vmov block_masks, left_mask_bits, right_mask_bits
5527
       /* num_blocks and block overwrite the registers of the two mask
        * values, which now live in block_masks */
5528   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5529   add block, psx_gpu, #psx_gpu_blocks_offset
5530
       /* Align the source offset down to a 16 byte (eight texel) block */
5531   bic texture_offset_base, texture_offset_base, #0xF
5532   cmp block_width, #1
5533
       /* block = address of the first free 64 byte block */
5534   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
5535   add block, block, num_blocks, lsl #6
5536
       /* More than one block per row: use the general path at the second
        * label 0 */
5537   bne 0f
5538
       /* One block per row: a single mask byte, left OR right, covers both
        * edges and stays in draw_mask_fb_ptr for all rows */
5539   vext.32 block_masks_shifted, block_masks, block_masks, #1
5540   vorr.u32 block_masks, block_masks, block_masks_shifted
5541   vdup.u8 draw_mask_fb_ptr, block_masks[0]
5542
       /* Single block path, one pass per sprite row */
5543  1:
5544   add num_blocks, num_blocks, #1
5545   cmp num_blocks, #MAX_BLOCKS
5546   blgt setup_sprites_16bpp_flush
5547
       /* The flags set by this subs are consumed by the bne at the end of
        * the pass; nothing in between changes them */
5548   and texture_block_ptr, texture_offset_base, texture_mask
5549   subs height, height, #1
5550
5551   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5552   vld1.u32 { texels }, [texture_block_ptr, :128]
5553
5554   vst1.u32 { texels }, [block, :128]
5555   add block, block, #40
5556
       /* Upper word of the 64 bit store = destination pointer */
5557   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5558   pld [fb_ptr]
5559
5560   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5561
       /* 40 + 24 = 64: block now points at the next block. Source and
        * destination both move down one 2048 byte line. */
5562   add block, block, #24
5563   add texture_offset_base, texture_offset_base, #2048
5564   add fb_ptr, fb_ptr, #2048
5565   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5566   bne 1b
5567
5568   ldmia sp!, { r4 - r11, pc }
5569
       /* Multi block path, one pass per sprite row: a left edge block,
        * block_width - 2 interior blocks and a right edge block */
5570  0:
5571   add num_blocks, num_blocks, block_width
5572   mov texture_offset, texture_offset_base
5573
5574   cmp num_blocks, #MAX_BLOCKS
5575   blgt setup_sprites_16bpp_flush
5576
5577   add texture_offset_base, texture_offset_base, #2048
5578   and texture_block_ptr, texture_offset, texture_mask
5579
5580   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5581   vld1.u32 { texels }, [texture_block_ptr, :128]  
5582
5583   vst1.u32 { texels }, [block, :128]
5584   add block, block, #40
5585
       /* Left edge block: replicate the low byte of the left mask */
5586   vdup.u8 draw_mask_fb_ptr, block_masks[0]
5587   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5588   pld [fb_ptr]
5589
       /* blocks_remaining = interior block count; zero skips the loop */
5590   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5591   subs blocks_remaining, block_width, #2
5592
5593   add texture_offset, texture_offset, #16
5594   add fb_ptr, fb_ptr, #16
5595
       /* Interior blocks are stored with an all zero mask */
5596   vmov.u8 draw_mask_fb_ptr, #0
5597
5598   add block, block, #24
5599   beq 2f
5600
       /* Interior blocks */
5601  1:
5602   and texture_block_ptr, texture_offset, texture_mask
5603   subs blocks_remaining, blocks_remaining, #1
5604
5605   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5606   vld1.u32 { texels }, [texture_block_ptr, :128]
5607
5608   vst1.u32 { texels }, [block, :128]
5609   add block, block, #40
5610
5611   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5612   pld [fb_ptr]
5613
5614   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5615   
5616   add texture_offset, texture_offset, #16
5617   add fb_ptr, fb_ptr, #16
5618
5619   add block, block, #24
5620   bne 1b
5621
       /* Right edge block: byte 4 of block_masks is the low byte of the
        * right mask */
5622  2:
5623   and texture_block_ptr, texture_offset, texture_mask
5624   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5625
5626   vld1.u32 { texels }, [texture_block_ptr, :128]
5627   vdup.u8 draw_mask_fb_ptr, block_masks[4]
5628
5629   vst1.u32 { texels }, [block, :128]
5630   add block, block, #40
5631
5632   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5633   vst1.u32 { draw_mask_fb_ptr }, [block, :64]
5634   
5635   add block, block, #24
5636   subs height, height, #1
5637
       /* Next destination line; publish the new block count */
5638   add fb_ptr, fb_ptr, fb_ptr_pitch
5639   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5640
5641   bne 0b
5642
5643   ldmia sp!, { r4 - r11, pc }
5644
5645
5646 // 4x version
5647 // FIXME: duplicate code with normal version :(
5648 #undef draw_mask_fb_ptr
5649
/*
 * setup_sprite_16bpp_4x(psx_gpu, x, y, u, v, width, height)
 *
 * Variant of setup_sprite_16bpp with the same argument layout (r0 - r3 plus
 * v, width and height at sp + 36, + 40 and + 44 after the push).
 *
 * Differences that are visible in this function:
 *  - the edge masks are 16 bits wide with two bits per source texel
 *    (left_offset and right_width are doubled before they are used as shift
 *    counts), and both mask bytes are replicated into draw_mask_fb_ptr_a
 *    and draw_mask_fb_ptr_b
 *  - the block count grows by four per eight texel source block
 *    (block_width is multiplied by four)
 *  - the destination advances 32 bytes per source block and 4096 bytes per
 *    source row
 *  - the blocks themselves are written by the do_texture_block_16bpp_4x
 *    macro, which is defined outside this part of the file
 *
 * NOTE(review): x and y are scaled exactly as in the normal version (x * 2,
 * y * 2048), so they are presumably already given in output coordinates.
 * Confirm against the callers.
 */
5650 function(setup_sprite_16bpp_4x)
5651   stmdb sp!, { r4 - r11, r14 }
5652   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5653
       /* fb_ptr = vram_out_ptr + y * 2048 + x * 2 */
5654   ldr v, [sp, #36]
5655   add fb_ptr, fb_ptr, y, lsl #11
5656
5657   ldr width, [sp, #40]
5658   add fb_ptr, fb_ptr, x, lsl #1
5659
5660   ldr height, [sp, #44]
5661   and left_offset, u, #0x7
5662
       /* texture_offset_base = u * 2 + v * 2048 */
5663   add texture_offset_base, u, u
5664   add width_rounded, width, #7
5665
5666   add texture_offset_base, texture_offset_base, v, lsl #11
5667   movw left_mask_bits, #0xFFFF
5668   
5669   ldrb texture_mask_width, [psx_gpu, #psx_gpu_texture_mask_width_offset]
5670   add width_rounded, width_rounded, left_offset
5671
       /* From here on left_offset holds twice the texel offset */
5672   lsl left_offset, #1
5673
       /* Moves the destination back by 4 * (u & 7) bytes, that is two
        * output pixels per skipped source texel */
5674   ldrb texture_mask_height, [psx_gpu, #psx_gpu_texture_mask_height_offset]
5675   sub fb_ptr, fb_ptr, left_offset, lsl #1
5676
5677   add texture_mask, texture_mask_width, texture_mask_width
5678   movw right_mask_bits, #0xFFFC
5679
       /* left_mask_bits = ~(0xFFFF << (2 * (u & 7))) */
5680   and right_width, width_rounded, #0x7
5681   mvn left_mask_bits, left_mask_bits, lsl left_offset
5682
5683   lsl right_width, #1
5684
       /* block_width = source blocks per row, for now */
5685   add texture_mask, texture_mask, texture_mask_height, lsl #11
5686   mov block_width, width_rounded, lsr #3
5687
       /* right_mask_bits = 0xFFFC << (2 * right_width) */
5688   mov right_mask_bits, right_mask_bits, lsl right_width
5689   movw fb_ptr_pitch, #(2048 + 16) * 2
5690
       /* fb_ptr_pitch = 4096 - 32 * (block_width - 1), using the source
        * block count */
5691   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
5692   vmov block_masks, left_mask_bits, right_mask_bits
5693
5694   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5695   add block, psx_gpu, #psx_gpu_blocks_offset
5696
5697   bic texture_offset_base, texture_offset_base, #0xF
5698   cmp block_width, #1
5699
5700   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_ptr_offset]
5701   add block, block, num_blocks, lsl #6
5702
       /* block_width now counts output blocks. This shift has no s suffix,
        * so the bne still tests the cmp above. */
5703   lsl block_width, #2
5704   bne 0f
5705
       /* One source block per row: OR the left and right masks together
        * and keep both mask bytes for all rows */
5706   vext.32 block_masks_shifted, block_masks, block_masks, #1
5707   vorr.u32 block_masks, block_masks, block_masks_shifted
5708   vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
5709   vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5710
       /* Single source block path, one pass per source row */
5711  1:
5712   add num_blocks, num_blocks, block_width
5713   cmp num_blocks, #MAX_BLOCKS
5714   blgt setup_sprites_16bpp_flush
5715
       /* NOTE(review): the bne at the end of the pass relies on the macro
        * below leaving the flags of this subs untouched; confirm against
        * the macro definition */
5716   and texture_block_ptr, texture_offset_base, texture_mask
5717   subs height, height, #1
5718
5719   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5720   vld1.u32 { texels }, [texture_block_ptr, :128]
5721
5722   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5723
       /* One source line down, 4096 bytes down in the destination */
5724   add texture_offset_base, texture_offset_base, #2048
5725   add fb_ptr, fb_ptr, #2048*2
5726   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5727   bne 1b
5728
5729   ldmia sp!, { r4 - r11, pc }
5730
       /* Multi block path, one pass per source row */
5731  0:
5732   add num_blocks, num_blocks, block_width
5733   mov texture_offset, texture_offset_base
5734
5735   vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
5736   vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
5737
5738   cmp num_blocks, #MAX_BLOCKS
5739   blgt setup_sprites_16bpp_flush
5740
5741   add texture_offset_base, texture_offset_base, #2048
5742   and texture_block_ptr, texture_offset, texture_mask
5743
5744   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5745   vld1.u32 { texels }, [texture_block_ptr, :128]
5746
5747   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5748
       /* blocks_remaining = 4 * (interior source blocks); zero skips the
        * interior loop */
5749   subs blocks_remaining, block_width, #2*4
5750   add texture_offset, texture_offset, #16
5751
       /* Interior blocks use all zero masks */
5752   vmov.u8 draw_mask_fb_ptr_a, #0
5753   vmov.u8 draw_mask_fb_ptr_b, #0
5754
5755   add fb_ptr, fb_ptr, #16*2
5756   beq 2f
5757
       /* Interior source blocks; the counter goes down by four per pass */
5758  1:
5759   and texture_block_ptr, texture_offset, texture_mask
5760   subs blocks_remaining, blocks_remaining, #4
5761
5762   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5763   vld1.u32 { texels }, [texture_block_ptr, :128]
5764
5765   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5766   add texture_offset, texture_offset, #16
5767
5768   add fb_ptr, fb_ptr, #16*2
5769   bgt 1b
5770
       /* Right edge source block */
5771  2:
5772   vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
5773   vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
5774
5775   and texture_block_ptr, texture_offset, texture_mask
5776   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5777
5778   vld1.u32 { texels }, [texture_block_ptr, :128]
5779
5780   do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
5781   subs height, height, #1
5782
       /* Next destination line pair; publish the new block count */
5783   add fb_ptr, fb_ptr, fb_ptr_pitch
5784   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5785
5786   bne 0b
5787
5788   ldmia sp!, { r4 - r11, pc }
5789
5790
/* Register aliases for setup_sprite_untextured_512. right_width shares r5
 * with block, right_mask_bits shares r6 with blocks_remaining, color shares
 * r8 with color_b, and color_r / color_g reuse the registers of x and y.
 * test_mask and draw_mask are both q2. */
5791 #undef width
5792 #undef right_width
5793 #undef right_mask_bits
5794 #undef color
5795 #undef height
5796 #undef blocks_remaining
5797 #undef colors
5798 #undef right_mask
5799 #undef test_mask
5800 #undef draw_mask
5801
5802 #define psx_gpu                                           r0
5803 #define x                                                 r1
5804 #define y                                                 r2
5805 #define width                                             r3
5806 #define right_width                                       r5
5807 #define right_mask_bits                                   r6
5808 #define fb_ptr                                            r7
5809 #define color                                             r8
5810 #define height                                            r9
5811 #define fb_ptr_pitch                                      r12
5812
5813 // referenced by setup_sprites_16bpp_flush
5814 #define num_blocks                                        r4
5815 #define block                                             r5
5816 #define block_width                                       r11
5817
5818 #define color_r                                           r1
5819 #define color_g                                           r2
5820 #define color_b                                           r8
5821 #define blocks_remaining                                  r6
5822
5823 #define colors                                            q0
5824 #define right_mask                                        q1
5825 #define test_mask                                         q2
5826 #define draw_mask                                         q2
5827 #define draw_mask_bits_fb_ptr                             d6
5828
5829
5830 .align 3
5831
/*
 * setup_sprite_untextured_512(psx_gpu, x, y, u, v, width, height, color)
 *
 * Queues the render blocks for a flat colored rectangle, one 64 byte block
 * per span of eight pixels.
 *
 * In:  r0 = psx_gpu, r1 = x, r2 = y; width, height and color are read from
 *      the caller stack (sp + 40, + 44 and + 48 after the push). The value
 *      passed in r3 and the stack slot at sp + 36 are not read.
 *
 * color is reduced to 15 bits: bits 3 - 7, 11 - 15 and 19 - 23 of the
 * argument are packed as bits 0 - 4, 5 - 9 and 10 - 14 of each pixel.
 *
 * Every block receives a 16 byte draw mask at offset 0, eight copies of the
 * pixel at offset 16 and the framebuffer pointer at offset 44. The block
 * count in psx_gpu is written back after every row, and
 * setup_sprites_16bpp_flush is called whenever the running count would go
 * above MAX_BLOCKS.
 */
5832 function(setup_sprite_untextured_512)
5833   stmdb sp!, { r4 - r11, r14 }
5834
5835   ldr width, [sp, #40]
5836   ldr fb_ptr, [psx_gpu, #psx_gpu_vram_out_ptr_offset]
5837
       /* fb_ptr = vram_out_ptr + y * 2048 + x * 2 */
5838   ldr height, [sp, #44]
5839   add fb_ptr, fb_ptr, y, lsl #11
5840
5841   add fb_ptr, fb_ptr, x, lsl #1
5842   sub right_width, width, #1
5843
       /* right_width = ((width - 1) & 7) + 1, a value from 1 to 8 */
5844   ldr color, [sp, #48]
5845   and right_width, #7
5846
       /* block_width = (width + 7) >> 3, blocks per row */
5847   add block_width, width, #7
5848   add right_width, #1
5849
5850   lsr block_width, #3
5851   mov right_mask_bits, #0xff
5852
       /* right_mask_bits = 0xff << right_width */
5853   sub fb_ptr_pitch, block_width, #1
5854   lsl right_mask_bits, right_width
5855
       /* fb_ptr_pitch = 2048 - 16 * (block_width - 1) */
5856   lsl fb_ptr_pitch, #3+1
5857   ubfx color_r, color, #3, #5
5858
5859   rsb fb_ptr_pitch, #1024*2
5860   ubfx color_g, color, #11, #5
5861
       /* NOTE(review): test_mask is the first 16 bytes of psx_gpu,
        * presumably one distinct bit per 16 bit lane so that the vtst
        * below turns right_mask_bits into a per lane mask. Confirm against
        * the structure layout. */
5862   vld1.u32 { test_mask }, [psx_gpu, :128]
5863   ubfx color_b, color, #19, #5
5864
5865   vdup.u16 right_mask, right_mask_bits
5866   orr color, color_r, color_b, lsl #10
5867
5868   ldrh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5869   orr color, color, color_g, lsl #5
5870
       /* right_mask lane = all ones where the lane's test bit is set in
        * right_mask_bits, zero elsewhere */
5871   vtst.u16 right_mask, right_mask, test_mask
5872   add block, psx_gpu, #psx_gpu_blocks_offset
5873
       /* block = address of the first free 64 byte block */
5874   vdup.u16 colors, color
5875   add block, block, num_blocks, lsl #6
5876
5877
       /* One pass per row */
5878 setup_sprite_untextured_height_loop:
5879   add num_blocks, block_width
5880   sub blocks_remaining, block_width, #1
5881
5882   cmp num_blocks, #MAX_BLOCKS
5883   blgt setup_sprites_16bpp_flush
5884
       /* A row that is one block wide only has the right edge block */
5885   cmp blocks_remaining, #0
5886   ble 1f
5887
       /* NOTE(review): these two registers are only cleared on this path.
        * For a one block wide sprite the low word stored at offset 40 is
        * whatever d6 held on entry; confirm that consumers of untextured
        * blocks ignore that word. */
5888   vmov.u8 draw_mask, #0 /* zero_mask */
5889   vmov.u8 draw_mask_bits_fb_ptr, #0
5890
       /* Full blocks: all zero draw mask */
5891  0:
5892   vst1.u32 { draw_mask }, [block, :128]!
5893   subs blocks_remaining, #1
5894
5895   vst1.u32 { colors }, [block, :128]
5896   add block, block, #24
5897
5898   vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5899   vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
5900   
       /* 16 + 24 + 24 = 64: next block, eight pixels further right */
5901   add block, block, #24
5902   add fb_ptr, #8*2
5903   bgt 0b
5904
       /* Right edge block: draw mask taken from right_mask */
5905  1:
5906   vst1.u32 { right_mask }, [block, :128]!
5907   subs height, #1
5908
5909   vst1.u32 { colors }, [block, :128]
5910   add block, block, #24
5911
5912   vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
5913   vst1.u32 { draw_mask_bits_fb_ptr }, [block, :64]
5914   
5915   add block, block, #24
5916   add fb_ptr, fb_ptr_pitch
5917
       /* The adds and the strh leave the flags alone, so the bgt tests the
        * height countdown above */
5918   strh num_blocks, [psx_gpu, #psx_gpu_num_blocks_offset]
5919   bgt setup_sprite_untextured_height_loop
5920
5921   ldmia sp!, { r4 - r11, pc }
5922
5923
5924
/* Register aliases for update_texture_4bpp_cache. Among the NEON names,
 * texel_block_expanded_ab is the same register as texel_block_expanded_b
 * (q2), texel_block_expanded_cd the same as texel_block_expanded_c (q3), and
 * texel_block_expanded_d (q0) overlaps both input registers texel_block_a
 * and texel_block_b (d0 / d1). */
5925 #undef texture_page_ptr
5926 #undef vram_ptr
5927 #undef dirty_textures_mask
5928 #undef current_texture_mask
5929
5930 #define psx_gpu                                           r0
5931 #define current_texture_page                              r1
5932 #define texture_page_ptr                                  r2
5933 #define vram_ptr_a                                        r3
5934 #define current_texture_page_x                            r12
5935 #define current_texture_page_y                            r4
5936 #define dirty_textures_mask                               r5
5937 #define tile_y                                            r6
5938 #define tile_x                                            r7
5939 #define sub_y                                             r8
5940 #define current_texture_mask                              r9
5941 #define c_4096                                            r10
5942 #define vram_ptr_b                                        r11
5943
5944 #define texel_block_a                                     d0
5945 #define texel_block_b                                     d1
5946 #define texel_block_expanded_a                            q1
5947 #define texel_block_expanded_b                            q2
5948 #define texel_block_expanded_ab                           q2
5949 #define texel_block_expanded_c                            q3
5950 #define texel_block_expanded_d                            q0
5951 #define texel_block_expanded_cd                           q3
5952
/*
 * update_texture_4bpp_cache(psx_gpu)
 *
 * Rebuilds the cached copy of the current texture page from VRAM. Every
 * source byte holds two 4 bit texels and is expanded to two bytes, one
 * texel in the low nibble of each. The result is stored as 16 x 16 tiles of
 * 16 x 16 texels, 256 bytes per tile.
 *
 * In:  r0 = psx_gpu
 * Reads current_texture_page, texture_page_base, vram_ptr,
 * current_texture_mask and dirty_textures_4bpp_mask from psx_gpu, and
 * writes dirty_textures_4bpp_mask back with the bits of
 * current_texture_mask cleared.
 *
 * r4 - r11 and q0 - q3 are saved and restored; r1 - r3 and r12 are
 * overwritten.
 */
5953 function(update_texture_4bpp_cache)
5954   stmdb sp!, { r4 - r11, r14 }
5955   vpush { q0 - q3 }
5956
5957   ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
5958
5959   ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
5960   ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
5961
       /* Low four bits of the page number select the column, the remaining
        * bits the row */
5962   and current_texture_page_x, current_texture_page, #0xF
5963   ldr current_texture_mask, [psx_gpu, #psx_gpu_current_texture_mask_offset]
5964
5965   mov current_texture_page_y, current_texture_page, lsr #4
5966   ldr dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
5967
       /* One page row = 256 lines of 2048 bytes (1 << 19), one page column
        * = 128 bytes */
5968   add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5969   mov tile_y, #16
5970
5971   add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7  
5972   bic dirty_textures_mask, current_texture_mask
5973   
5974   mov tile_x, #16
5975   str dirty_textures_mask, [psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset]
5976
       /* sub_y counts pairs of lines: eight pairs make one 16 line tile.
        * Each pointer steps by 4096 bytes, that is two lines. */
5977   mov sub_y, #8
5978   movw c_4096, #4096
5979
       /* vram_ptr_b walks the lines in between those of vram_ptr_a */
5980   add vram_ptr_b, vram_ptr_a, #2048
5981
       /* All three loop levels branch back to this label; the counters are
        * reloaded right after they run out */
5982  0:
5983   vld1.u32 { texel_block_a }, [vram_ptr_a, :64], c_4096
5984   vld1.u32 { texel_block_b }, [vram_ptr_b, :64], c_4096
5985
       /* a / c: byte widened to 16 bits; b / d: byte shifted left by four
        * into 16 bits. The write to texel_block_expanded_d replaces both
        * inputs and therefore has to come last. */
5986   vmovl.u8 texel_block_expanded_a, texel_block_a
5987   vshll.u8 texel_block_expanded_b, texel_block_a, #4
5988   vmovl.u8 texel_block_expanded_c, texel_block_b
5989   vshll.u8 texel_block_expanded_d, texel_block_b, #4
5990
       /* Clearing bits 4 - 7 leaves the low nibble in a / c (bits 0 - 3)
        * and the high nibble in b / d (bits 8 - 11) */
5991   vbic.u16 texel_block_expanded_a, #0x00F0
5992   vbic.u16 texel_block_expanded_b, #0x00F0
5993   vbic.u16 texel_block_expanded_c, #0x00F0
5994   vbic.u16 texel_block_expanded_d, #0x00F0
5995
       /* Each 16 bit lane = low nibble in byte 0, high nibble in byte 1 */
5996   vorr.u16 texel_block_expanded_ab, texel_block_expanded_a,                    \
5997    texel_block_expanded_b
5998   vorr.u16 texel_block_expanded_cd, texel_block_expanded_c,                    \
5999    texel_block_expanded_d
6000
       /* Two expanded lines (32 bytes) per store */
6001   vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd },               \
6002    [texture_page_ptr, :256]!
6003
6004   subs sub_y, sub_y, #1
6005   bne 0b
6006
       /* Tile finished: go 8 source bytes (16 texels) to the right and
        * back up the 16 lines that were just read */
6007   mov sub_y, #8
6008   add vram_ptr_a, vram_ptr_a, #8
6009   add vram_ptr_b, vram_ptr_b, #8
6010
6011   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
6012   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
6013
6014   subs tile_x, tile_x, #1
6015   bne 0b
6016
       /* Tile row finished: go 16 lines down and 16 tiles * 8 bytes back
        * to the left edge of the page */
6017   mov tile_x, #16
6018   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
6019   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
6020
6021   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
6022   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
6023
6024   subs tile_y, tile_y, #1
6025   bne 0b
6026
6027   vpop { q0 - q3 }
6028   ldmia sp!, { r4 - r11, pc }
6029
6030
/* Register aliases for update_texture_8bpp_cache_slice. texture_page is r1,
 * the second argument register, and current_texture_page moves from r1 to
 * r5 for this function. */
6031 #undef current_texture_page
6032
6033 #define psx_gpu                                           r0
6034 #define texture_page                                      r1
6035 #define texture_page_ptr                                  r2
6036 #define vram_ptr_a                                        r3
6037 #define texture_page_x                                    r12
6038 #define texture_page_y                                    r4
6039 #define current_texture_page                              r5
6040 #define tile_y                                            r6
6041 #define tile_x                                            r7
6042 #define sub_y                                             r8
6043 #define c_4096                                            r10
6044 #define vram_ptr_b                                        r11
6045
6046
6047 #undef texels_a
6048 #undef texels_b
6049
     /* Four consecutive source lines of one tile, 16 bytes each */
6050 #define texels_a                                          q0
6051 #define texels_b                                          q1
6052 #define texels_c                                          q2
6053 #define texels_d                                          q3
6054
6055
/*
 * update_texture_8bpp_cache_slice(psx_gpu, texture_page)
 *
 * Copies a 128 byte wide, 256 line region of VRAM into the texture cache,
 * rearranged into tiles of 16 bytes x 16 lines (256 bytes per tile, lines
 * stored back to back).
 *
 * In:  r0 = psx_gpu, r1 = texture_page
 *      source      = vram_ptr + (texture_page & 0xF) * 128
 *                             + (texture_page >> 4) * 256 * 2048
 *      destination = texture_page_base, plus 2048 bytes when bit 0 of
 *                    texture_page differs from bit 0 of
 *                    current_texture_page
 *
 * Each row of 8 tiles (2048 bytes) is followed by a 2048 byte gap in the
 * destination. NOTE(review): the gap is presumably filled by the call for
 * the neighbouring page; confirm against the callers.
 *
 * r4 - r11 and q0 - q3 are saved and restored; r2, r3 and r12 are
 * overwritten.
 */
function(update_texture_8bpp_cache_slice)
  push { r4 - r11, lr }
  vpush { q0 - q3 }

  /* Source pointers: vram_ptr_a reads lines 0, 2, 4 ... of a tile and
   * vram_ptr_b reads lines 1, 3, 5 ... */
  ldr vram_ptr_a, [psx_gpu, #psx_gpu_vram_ptr_offset]
  and texture_page_x, texture_page, #0xF
  lsr texture_page_y, texture_page, #4
  add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7
  add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
  add vram_ptr_b, vram_ptr_a, #2048

  /* Destination pointer, moved by one tile row when the page parities
   * differ */
  ldr texture_page_ptr, [psx_gpu, #psx_gpu_texture_page_base_offset]
  ldrb current_texture_page, [psx_gpu, #psx_gpu_current_texture_page_offset]
  eor current_texture_page, current_texture_page, texture_page
  ands current_texture_page, current_texture_page, #0x1
  addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)

  /* Post increment for the loads: 4096 bytes = two lines */
  movw c_4096, #4096
  mov tile_y, #16

  /* One pass per row of tiles */
 0:
  mov tile_x, #8

  /* One pass per tile */
 1:
  mov sub_y, #4

  /* One pass per four lines of the tile */
 2:
  vld1.u32 { texels_a }, [vram_ptr_a, :128], c_4096
  vld1.u32 { texels_b }, [vram_ptr_b, :128], c_4096
  vld1.u32 { texels_c }, [vram_ptr_a, :128], c_4096
  vld1.u32 { texels_d }, [vram_ptr_b, :128], c_4096

  vst1.u32 { texels_a, texels_b }, [texture_page_ptr, :256]!
  vst1.u32 { texels_c, texels_d }, [texture_page_ptr, :256]!

  subs sub_y, sub_y, #1
  bne 2b

  /* Back up the 16 lines just read, then one tile (16 bytes) to the right */
  sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
  sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
  add vram_ptr_a, vram_ptr_a, #16
  add vram_ptr_b, vram_ptr_b, #16

  subs tile_x, tile_x, #1
  bne 1b

  /* Down one row of tiles and back to the left edge (8 tiles * 16 bytes) */
  add vram_ptr_a, vram_ptr_a, #(16 * 2048)
  add vram_ptr_b, vram_ptr_b, #(16 * 2048)
  sub vram_ptr_a, vram_ptr_a, #(8 * 16)
  sub vram_ptr_b, vram_ptr_b, #(8 * 16)

  /* Leave the 2048 byte gap after this row of tiles */
  add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)

  subs tile_y, tile_y, #1
  bne 0b

  vpop { q0 - q3 }
  pop { r4 - r11, pc }
6121
6122
/*
 * void scale2x_tiles8(void *dst, const void *src, int w8, int h)
 *
 * Doubles an image of 16 bit pixels in both directions: every source pixel
 * is written twice in a row, and every doubled row is written to two
 * consecutive destination lines.
 *
 * In:  r0 = dst, r1 = src, r2 = w8 (row width in units of 8 pixels),
 *      r3 = h (number of source rows)
 *
 * Source and destination lines are 2048 bytes apart. All NEON accesses
 * carry a :128 alignment hint, so dst and src have to be 16 byte aligned.
 *
 * Tiles are handled two at a time. When w8 is odd the second tile of the
 * last pair is still loaded, but it is not stored. Both loops run their
 * body before testing the counter, so w8 and h are expected to be at
 * least 1.
 *
 * r4 and the return address are saved and restored; r0 - r3, r12 and
 * q0 - q3 are overwritten.
 */
function(scale2x_tiles8)
  push { r4, lr }

  mov lr, r2                       /* lr = tiles left in this row          */
  add ip, r0, #2048                /* ip = second of the two output lines  */
  mov r4, r1                       /* r4 = start of the current source row */

0:
  pld [r1, #2048]
  vld1.u16 { q0 }, [r1, :128]!
  vld1.u16 { q2 }, [r1, :128]!
  subs lr, lr, #2

  /* Zipping a register with a copy of itself repeats every pixel */
  vmov q1, q0
  vzip.16 q0, q1
  vmov q3, q2
  vzip.16 q2, q3

  vst1.u16 { q0, q1 }, [r0, :128]!
  vst1.u16 { q0, q1 }, [ip, :128]!
  blt 1f                           /* odd w8: the second tile is not part of the row */
  vst1.u16 { q2, q3 }, [r0, :128]!
  vst1.u16 { q2, q3 }, [ip, :128]!
  bgt 0b

1:
  add r4, r4, #2048                /* next source row                      */
  add r0, r0, #4096                /* two output lines down ...            */
  sub r0, r0, r2, lsl #5           /* ... and w8 * 32 bytes back to column 0 */
  mov r1, r4
  add ip, r0, #2048
  mov lr, r2
  subs r3, r3, #1
  bgt 0b
  nop

  pop { r4, pc }
6158
6159 // vim:filetype=armasm