294685aee8271fcdf4ef402e54d7a07dbfac5963
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
1 /*
2  * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of
7  * the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  */
14 /* Capacity limits. Nothing in this part of the file uses them; presumably they mirror the C-side psx_gpu definitions - TODO confirm. */
15 #define MAX_SPANS                                         512
16 #define MAX_BLOCKS                                        64
17 #define MAX_BLOCKS_PER_ROW                                128
18 /* Byte offsets of psx_gpu fields, used as [ psx_gpu, #offset ] immediates. Presumably they must match the C structure layout - TODO confirm. The first group is spaced 16 bytes apart. */
19 #define psx_gpu_test_mask_offset                          0
20 #define psx_gpu_uvrg_offset                               16
21 #define psx_gpu_uvrg_dx_offset                            32
22 #define psx_gpu_uvrg_dy_offset                            48
23 #define psx_gpu_u_block_span_offset                       64
24 #define psx_gpu_v_block_span_offset                       80
25 #define psx_gpu_r_block_span_offset                       96
26 #define psx_gpu_g_block_span_offset                       112
27 #define psx_gpu_b_block_span_offset                       128
28 /* b_dx is b_block_span + 4, i.e. the second word of b_block_span; compute_all_gradients stores the b x-gradient in that word. */
29 #define psx_gpu_b_dx_offset                               132
30 /* Fields laid out at 4-byte granularity. */
31 #define psx_gpu_b_offset                                  144
32 #define psx_gpu_b_dy_offset                               148
33 #define psx_gpu_triangle_area_offset                      152
34 #define psx_gpu_texture_window_settings_offset            156
35 #define psx_gpu_current_texture_mask_offset               160
36 #define psx_gpu_viewport_mask_offset                      164
37 #define psx_gpu_dirty_textures_4bpp_mask_offset           168
38 #define psx_gpu_dirty_textures_8bpp_mask_offset           172
39 #define psx_gpu_dirty_textures_8bpp_alternate_mask_offset 176
40 #define psx_gpu_triangle_color_offset                     180
41 #define psx_gpu_dither_table_offset                       184
42 #define psx_gpu_render_block_handler_offset               200
43 #define psx_gpu_texture_page_ptr_offset                   204
44 #define psx_gpu_texture_page_base_offset                  208
45 #define psx_gpu_clut_ptr_offset                           212
46 #define psx_gpu_vram_ptr_offset                           216
47 /* Fields spaced 2 bytes apart. */
48 #define psx_gpu_render_state_base_offset                  220
49 #define psx_gpu_render_state_offset                       222
50 #define psx_gpu_num_spans_offset                          224
51 #define psx_gpu_num_blocks_offset                         226
52 #define psx_gpu_offset_x_offset                           228
53 #define psx_gpu_offset_y_offset                           230
54 #define psx_gpu_clut_settings_offset                      232
55 #define psx_gpu_texture_settings_offset                   234
56 #define psx_gpu_viewport_start_x_offset                   236
57 #define psx_gpu_viewport_start_y_offset                   238
58 #define psx_gpu_viewport_end_x_offset                     240
59 #define psx_gpu_viewport_end_y_offset                     242
60 #define psx_gpu_mask_msb_offset                           244
61 /* Fields spaced 1 byte apart. */
62 #define psx_gpu_triangle_winding_offset                   246
63 #define psx_gpu_display_area_draw_enable_offset           247
64 #define psx_gpu_current_texture_page_offset               248
65 #define psx_gpu_last_8bpp_texture_page_offset             249
66 #define psx_gpu_texture_mask_width_offset                 250
67 #define psx_gpu_texture_mask_height_offset                251
68 #define psx_gpu_texture_window_x_offset                   252
69 #define psx_gpu_texture_window_y_offset                   253
70 #define psx_gpu_primitive_type_offset                     254
71
72 #define psx_gpu_reserved_a_offset                         255
73 /* Large arrays. The gaps are consistent with MAX_SPANS entries of 16 bytes (span_uvrg_offset) and of 8 bytes (span_edge_data) - presumably per-span records; TODO confirm. */
74 #define psx_gpu_blocks_offset                             0x0100
75 #define psx_gpu_span_uvrg_offset_offset                   0x2100
76 #define psx_gpu_span_edge_data_offset                     0x4100
77 #define psx_gpu_span_b_offset_offset                      0x5100
78 /* Field offsets inside one span edge_data entry. */
79 #define edge_data_left_x_offset                           0
80 #define edge_data_num_blocks_offset                       2
81 #define edge_data_right_mask_offset                       4
82 #define edge_data_y_offset                                6
83
84 /* Register aliases for compute_all_gradients. Many names deliberately share a register: an alias is only valid until another alias of the same register is written. */
85 #define psx_gpu                                           r0
86 #define v_a                                               r1
87 #define v_b                                               r2
88 #define v_c                                               r3
89 /* Scalar coordinates and blue values. Each packed { low, high } halfword pair reuses the register of one of its inputs. */
90 #define x0                                                r4
91 #define x1                                                r5
92 #define x2                                                r6
93 #define x0_x1                                             r5
94 #define x1_x2                                             r6
95 #define y0                                                r7
96 #define y1                                                r8
97 #define y2                                                r9
98 #define y0_y1                                             r7
99 #define y1_y2                                             r8
100 #define b0                                                r9
101 #define b1                                                r10
102 #define b2                                                r11
103 #define b0_b1                                             r10
104 #define b1_b2                                             r11
105
106
107 #define area_r_s                                          r5
108 /* Scalar working registers for the blue gradient and the final stores. */
109 #define g_bx0                                             r2
110 #define g_bx                                              r3
111 #define g_bx2                                             r4
112 #define g_bx3                                             r5
113 #define b_base                                            r6
114 #define g_by                                              r8
115
116 #define gs_bx                                             r7
117 #define gs_by                                             r10
118
119 #define ga_bx                                             g_bx
120 #define ga_by                                             g_by
121
122 #define gw_bx_h                                           g_bx
123 #define gw_by_h                                           g_by
124
125 #define gw_bx_l                                           r11
126 #define gw_by_l                                           gw_bx_l
127
128 #define store_a                                           r0
129 #define store_b                                           r1
130 #define store_inc                                         r5
131
132 /* NEON views of the three input vertices: each q register holds one 16-byte vertex. */
133 #define v0                                                q0
134 #define uvrgb0                                            d0
135 #define x0_y0                                             d1
136
137 #define v1                                                q1
138 #define uvrgb1                                            d2
139 #define x1_y1                                             d3
140
141 #define v2                                                q2
142 #define uvrgb2                                            d4
143 #define x2_y2                                             d5
144 /* q3 - q8 hold the two operand layouts: { uvrg, xxxx } in q3 - q5 and { yyyy, uvrg } in q6 - q8. */
145 #define x0_ab                                             q3
146 #define uvrg_xxxx0                                        q3
147 #define uvrg0                                             d6
148 #define xxxx0                                             d7
149
150 #define x1_ab                                             q4
151 #define uvrg_xxxx1                                        q4
152 #define uvrg1                                             d8
153 #define xxxx1                                             d9
154
155 #define x2_ab                                             q5
156 #define uvrg_xxxx2                                        q5
157 #define uvrg2                                             d10
158 #define xxxx2                                             d11
159
160 #define y0_ab                                             q6
161 #define yyyy_uvrg0                                        q6
162 #define yyyy0                                             d12
163 #define uvrg0b                                            d13
164
165 #define y1_ab                                             q7
166 #define yyyy_uvrg1                                        q7
167 #define yyyy1                                             d14
168 #define uvrg1b                                            d15
169
170 #define y2_ab                                             q8
171 #define yyyy_uvrg2                                        q8
172 #define yyyy2                                             d16
173 #define uvrg2b                                            d17
174 /* The four 16-bit difference vectors d0 .. d3 of the gradient formula; _a is the low half, _b the high half. */
175 #define d0_ab                                             q9
176 #define d0_a                                              d18
177 #define d0_b                                              d19
178
179 #define d1_ab                                             q10
180 #define d1_a                                              d20
181 #define d1_b                                              d21
182
183 #define d2_ab                                             q11
184 #define d2_a                                              d22
185 #define d2_b                                              d23
186
187 #define d3_ab                                             q12
188 #define d3_a                                              d24
189 #define d3_b                                              d25
190 /* ga_* = unscaled gradients, gs_* = sign masks, gw_* = 64-bit products with the area reciprocal, g_* = final gradients. */
191 #define ga_uvrg_x                                         q1
192 #define ga_uvrg_y                                         q4
193
194 #define dx                                                x0_x1
195 #define dy                                                y0_y1
196 #define db                                                b0_b1
197
198 #define uvrg_base                                         q11
199
200 #define gs_uvrg_x                                         q5
201 #define gs_uvrg_y                                         q6
202
203 #define g_uvrg_x                                          q1
204 #define ga_uv_x                                           d2
205 #define g_uv_x                                            d2
206 #define ga_rg_x                                           d3
207 #define g_rg_x                                            d3
208
209 #define g_uvrg_y                                          q4
210 #define ga_uv_y                                           d8
211 #define g_uv_y                                            d8
212 #define ga_rg_y                                           d9
213 #define g_rg_y                                            d9
214 /* gw_uv_x / gw_uv_y overlap the ga_* source registers (q1 / q4), so the rg products must be formed first. */
215 #define gw_uv_x                                           q1
216 #define gw_rg_x                                           q2
217 #define gw_uv_y                                           q4
218 #define gw_rg_y                                           q3
219
220 #define w_mask                                            q9
221 #define w_mask_l                                          d18
222
223 #define r_shift                                           q10
224 /* uvrg_dx0 .. uvrg_dx3 hold 0, 1, 2 and 3 times the x gradient; uvrg_dx1 is the same register as g_uvrg_x. */
225 #define uvrg_dx0                                          q0
226 #define uvrg_dx0l                                         d0
227 #define uvrg_dx0h                                         d1
228
229 #define uvrg_dx1                                          q1
230 #define uvrg_dx1l                                         d2
231 #define uvrg_dx1h                                         d3
232
233 #define uvrg_dx2                                          q2
234 #define uvrg_dx2l                                         d4
235 #define uvrg_dx2h                                         d5
236
237 #define uvrg_dx3                                          q3
238 #define uvrg_dx3l                                         d6
239 #define uvrg_dx3h                                         d7
240
241
242 .align 4
243 /* load_pointer(register, pointer): put the address of the symbol pointer into register. Without __PIC__ this is a movw / movt immediate pair; with __PIC__ it is a literal-pool load. NOTE(review): =pointer is still an absolute address, so the __PIC__ variant presumably needs a load-time relocation - confirm. */
244 /* FIXME: users of this should be in psx_gpu instead */
245 #ifndef __PIC__
246 #define load_pointer(register, pointer)                                        \
247   movw register, :lower16:pointer;                                             \
248   movt register, :upper16:pointer;                                             \
249
250 #else
251 #define load_pointer(register, pointer)                                        \
252   ldr  register, =pointer                                                      \
253
254 #endif
255 /* function(name): export name as a global symbol and place its label; the code of the routine follows the macro invocation. */
256 #define function(name)                                                         \
257   .global name;                                                                \
258   name:                                                                        \
259
260 @ compute_all_gradients: compute the x and y gradients of u, v, r, g and b over
261 @ the triangle (v_a, v_b, v_c) and store uvrg, uvrg_dx, uvrg_dy, the u/v/r/g/b
262 @ block spans ({ 0, 1, 2, 3 } * dx), b and b_dy into psx_gpu.
263 @ In: r0 = psx_gpu, r1 = v_a, r2 = v_b, r3 = v_c. No result register is set.
264 @ Preserves r4 - r11 and d8 - d15; r0 - r3, r12, flags and other NEON regs are clobbered.
265 function(compute_all_gradients)
266   // First compute the triangle area reciprocal and shift. The division will
267   // happen concurrently with much of the work which follows.
268   @ r12 = psx_gpu->triangle_area
269   ldr r12, [ psx_gpu, #psx_gpu_triangle_area_offset ]
270   stmdb sp!, { r4 - r11, lr }
271   vpush { d8 - d15 }                 @ q4 - q7 are callee-saved and written below
272   @ load exponent of 62 into upper half of double
273   movw r4, #0
274   clz r14, r12                       @ r14 = shift
275
276   movt r4, #((62 + 1023) << 4)
277   mov r12, r12, lsl r14              @ r12 = triangle_area_normalized
278
279   @ load area normalized into lower half of double
280   mov r5, r12, lsr #10
281   vmov.f64 d30, r5, r4               @ d30 = (1 << 62) + ta_n
282
283   movt r4, #((1022 + 31) << 4)
284   mov r5, r12, lsl #20
285
286   add r4, r4, r12, lsr #11
287   vmov.f64 d31, r5, r4               @ d31 = ta_n as a double
288
289   vdiv.f64 d30, d30, d31             @ d30 = ((1 << 62) + ta_n) / ta_n
290
291   // ((x1 - x0) * (y2 - y1)) - ((x2 - x1) * (y1 - y0)) =
292   // ( d0       *  d1      ) - ( d2       *  d3      ) =
293   // ( m0                  ) - ( m1                  ) = gradient
294
295   // This is split to do 12 elements at a time over three sets: a, b, and c.
296   // Technically we only need to do 10 elements (uvrgb_x and uvrgb_y), so
297   // two of the slots are unused.
298
299   // Inputs are all 16-bit signed. The m0/m1 results are 32-bit signed, as
300   // is g.
301
302   // First type is:  uvrg bxxx xxxx
303   // Second type is: yyyy ybyy uvrg
304   // Since x_a and y_c are the same, one variable is used for both.
305
306   vld1.u32 { v0 }, [ v_a, : 128 ]    @ v0 = { uvrg0, b0, x0, y0 }
307   ldrsh x0, [ v_a, #8 ]              @ load x0 (signed: reused as a multiplier)
308
309   vld1.u32 { v1 }, [ v_b, : 128 ]    @ v1 = { uvrg1, b1, x1, y1}
310   ldrh x1, [ v_b, #8 ]               @ load x1
311
312   vld1.u32 { v2 }, [ v_c, : 128 ]    @ v2 = { uvrg2, b2, x2, y2 }
313   ldrh x2, [ v_c, #8 ]               @ load x2
314
315   vmovl.u8 uvrg_xxxx0, uvrgb0        @ uvrg_xxxx0 = { uv0, rg0, b0-, -- }
316   ldrh y0, [ v_a, #10 ]              @ load y0
317
318   vmovl.u8 uvrg_xxxx1, uvrgb1        @ uvrg_xxxx1 = { uv1, rg1, b1-, -- }
319   ldrh y1, [ v_b, #10 ]              @ load y1
320
321   vmovl.u8 uvrg_xxxx2, uvrgb2        @ uvrg_xxxx2 = { uv2, rg2, b2-, -- }
322   ldrh y2, [ v_c, #10 ]              @ load y2
323
324   vmov.u8 uvrg0b, uvrg0              @ uvrg0b = { uv0, rg0 }
325   vdup.u16 xxxx0, x0_y0[0]           @ xxxx0 = { xx0, xx0 }
326
327   orr x1_x2, x1, x2, lsl #16         @ x1_x2 = { x1, x2 }
328   pkhbt x0_x1, x0, x1, lsl #16       @ x0_x1 = { x0, x1 }
329
330   vmov.u8 uvrg1b, uvrg1              @ uvrg1b = { uv1, rg1 }
331   vdup.u16 xxxx1, x1_y1[0]           @ xxxx1 = { xx1, xx1 }
332
333   vmov.u8 uvrg2b, uvrg2              @ uvrg2b = { uv2, rg2 }
334   vdup.u16 xxxx2, x2_y2[0]           @ xxxx2 = { xx2, xx2 }
335
336   ldrb b2, [ v_c, #4 ]               @ load b2
337   orr y0_y1, y0, y1, lsl #16         @ y0_y1 = { y0, y1 }
338
339   ldrb b1, [ v_b, #4 ]               @ load b1
340   orr y1_y2, y1, y2, lsl #16         @ y1_y2 = { y1, y2 }
341
342   vdup.u16 yyyy0, x0_y0[1]           @ yyyy0 = { yy0, yy0 }
343   vsub.s16 d0_ab, x1_ab, x0_ab
344
345   ldrb b0, [ v_a, #4 ]               @ load b0 (into the register y2 used)
346   orr b1_b2, b1, b2, lsl #16         @ b1_b2 = { b1, b2 }
347
348   vdup.u16 yyyy1, x1_y1[1]           @ yyyy1 = { yy1, yy1 }
349   vsub.s16 d2_ab, x2_ab, x1_ab
350
351   vdup.u16 yyyy2, x2_y2[1]           @ yyyy2 = { yy2, yy2 }
352   vsub.s16 d1_ab, y2_ab, y1_ab
353
354   orr b0_b1, b0, b1, lsl #16         @ b0_b1 = { b0, b1 }
355   ssub16 dx, x1_x2, x0_x1            @ dx = { x1 - x0, x2 - x1 }
356
357   ssub16 dy, y1_y2, y0_y1            @ dy = { y1 - y0, y2 - y1 }
358   ssub16 db, b1_b2, b0_b1            @ db = { b1 - b0, b2 - b1 }
359
360   vsub.s16 d3_ab, y1_ab, y0_ab
361   smusdx ga_by, dx, db               @ ga_by = ((x1 - x0) * (b2 - b1)) -
362                                      @         ((x2 - x1) * (b1 - b0))
363   vmull.s16 ga_uvrg_x, d0_a, d1_a
364   smusdx ga_bx, db, dy               @ ga_bx = ((b1 - b0) * (y2 - y1)) -
365                                      @         ((b2 - b1) * (y1 - y0))
366   vmlsl.s16 ga_uvrg_x, d2_a, d3_a
367   movs gs_bx, ga_bx, asr #31         @ gs_bx = sign mask of ga_bx, sets N
368
369   vmull.s16 ga_uvrg_y, d0_b, d1_b
370   rsbmi ga_bx, ga_bx, #0             @ ga_bx = abs(ga_bx)
371
372   vmlsl.s16 ga_uvrg_y, d2_b, d3_b
373   movs gs_by, ga_by, asr #31         @ gs_by = sign mask of ga_by, sets N
374
375   vshr.u64 d0, d30, #22              @ low word of d0 = area reciprocal
376   mov b_base, b0, lsl #16
377
378   rsbmi ga_by, ga_by, #0             @ ga_by = abs(ga_by)
379   vclt.s32 gs_uvrg_x, ga_uvrg_x, #0  @ gs_uvrg_x = ga_uvrg_x < 0
380
381   @ r12 = psx_gpu->triangle_winding
382   ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ]
383   vclt.s32 gs_uvrg_y, ga_uvrg_y, #0  @ gs_uvrg_y = ga_uvrg_y < 0
384
385   add b_base, b_base, #0x8000        @ b_base = (b0 << 16) + 0x8000
386   rsb r12, r12, #0                   @ r12 = -(psx_gpu->triangle_winding)
387
388   vdup.u32 w_mask, r12               @ w_mask = { -w, -w, -w, -w }
389   sub r14, r14, #(62 - 12)           @ r14 = shift - (62 - FIXED_BITS)
390
391   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
392   vdup.u32 r_shift, r14              @ r_shift = { shift, shift, shift, shift }
393
394   vorr.u32 uvrg_base, #0x8000
395   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
396
397   vmov area_r_s, s0                  @ area_r_s = triangle_reciprocal
398   vabs.s32 ga_uvrg_y, ga_uvrg_y      @ ga_uvrg_y = abs(ga_uvrg_y)
399   // The rg products go first: each uv product overwrites its own source q register.
400   vmull.u32 gw_rg_x, ga_rg_x, d0[0]
401   vmull.u32 gw_uv_x, ga_uv_x, d0[0]
402   vmull.u32 gw_rg_y, ga_rg_y, d0[0]
403   vmull.u32 gw_uv_y, ga_uv_y, d0[0]
404
405   vshl.u64 gw_rg_x, gw_rg_x, r_shift
406   vshl.u64 gw_uv_x, gw_uv_x, r_shift
407   vshl.u64 gw_rg_y, gw_rg_y, r_shift
408   vshl.u64 gw_uv_y, gw_uv_y, r_shift
409
410   veor.u32 gs_uvrg_x, gs_uvrg_x, w_mask
411   vmovn.u64 g_uv_x, gw_uv_x
412
413   veor.u32 gs_uvrg_y, gs_uvrg_y, w_mask
414   vmovn.u64 g_rg_x, gw_rg_x
415
416   veor.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x  @ (g ^ s) - s negates where s = -1
417   vmovn.u64 g_uv_y, gw_uv_y
418
419   vsub.u32 g_uvrg_x, g_uvrg_x, gs_uvrg_x
420   vmovn.u64 g_rg_y, gw_rg_y
421
422   veor.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
423   mov ga_bx, ga_bx, lsl #13
424
425   vsub.u32 g_uvrg_y, g_uvrg_y, gs_uvrg_y
426   mov ga_by, ga_by, lsl #13
427
428   vdup.u32 x0_y0, x0
429   umull gw_bx_l, gw_bx_h, ga_bx, area_r_s
430
431   vshl.u32 g_uvrg_x, g_uvrg_x, #4
432   vshl.u32 g_uvrg_y, g_uvrg_y, #4
433
434   umull gw_by_l, gw_by_h, ga_by, area_r_s
435   vmls.s32 uvrg_base, ga_uvrg_x, x0_y0[0]  @ uvrg_base -= g_uvrg_x * x0
436
437   eor gs_bx, gs_bx, r12
438   vadd.u32 uvrg_dx2, uvrg_dx1, uvrg_dx1    @ uvrg_dx2 = 2 * uvrg_dx1
439
440   veor.u32 uvrg_dx0, uvrg_dx0, uvrg_dx0    @ uvrg_dx0 = 0
441   eor gs_by, gs_by, r12
442
443   rsb r11, r14, #0                   @ r11 = negative shift for scalar lsr
444   add store_a, psx_gpu, #psx_gpu_uvrg_offset  @ psx_gpu pointer is dead from here
445
446   sub r11, r11, #(32 - 13)           @ high word is already >> 32; undo lsl #13
447
448   add store_b, store_a, #16
449   mov store_inc, #32
450
451   vadd.u32 uvrg_dx3, uvrg_dx2, uvrg_dx1    @ uvrg_dx3 = 3 * uvrg_dx1
452   vst1.u32 { uvrg_base }, [ store_a, : 128 ], store_inc   @ psx_gpu->uvrg
453
454   vst1.u32 { uvrg_dx1 }, [ store_b, : 128 ], store_inc    @ psx_gpu->uvrg_dx
455   mov g_bx, gw_bx_h, lsr r11
456
457   vst1.u32 { g_uvrg_y }, [ store_a, : 128 ], store_inc    @ psx_gpu->uvrg_dy
458   mov g_by, gw_by_h, lsr r11
459   // Interleaved store: u_block_span and v_block_span = { 0, dx, 2 * dx, 3 * dx }.
460   vst4.u32 { uvrg_dx0l, uvrg_dx1l, uvrg_dx2l, uvrg_dx3l },                     \
461    [ store_b, : 128 ], store_inc
462   eor g_bx, g_bx, gs_bx
463   // Same for r_block_span and g_block_span.
464   vst4.u32 { uvrg_dx0h, uvrg_dx1h, uvrg_dx2h, uvrg_dx3h },                     \
465    [ store_b, : 128 ], store_inc
466   sub g_bx, g_bx, gs_bx
467
468   lsl g_bx, g_bx, #4                 @ same << 4 as the NEON gradients
469   eor g_by, g_by, gs_by
470
471   mls b_base, g_bx, x0, b_base       @ b_base -= g_bx * x0
472   sub g_by, g_by, gs_by
473
474   lsl g_by, g_by, #4
475   mov g_bx0, #0
476
477   add g_bx2, g_bx, g_bx              @ g_bx2 = 2 * g_bx (overwrites x0)
478   add g_bx3, g_bx, g_bx2             @ g_bx3 = 3 * g_bx
479   // store_b is now at b_block_span: write { 0, dx, 2 * dx, 3 * dx }, b and b_dy.
480   stmia store_b, { g_bx0, g_bx, g_bx2, g_bx3, b_base, g_by }
481   vpop { d8 - d15 }                  @ restore callee-saved NEON registers
482   ldmia sp!, { r4 - r11, pc }
483
484 /* Register aliases for the setup_spans macros. Many names share a register, so an alias is only valid until another alias of the same register is written. */
485 #define psx_gpu                                  r0
486 #define v_a                                      r1
487 #define v_b                                      r2
488 #define v_c                                      r3
489
490 #define temp                                     r14
491 /* y_a / y_b / y_c reuse the vertex pointer registers, so loading a y value destroys the matching vertex pointer. */
492 #define x_a                                      r4
493 #define x_b                                      r5
494 #define x_c                                      r6
495 #define y_a                                      r1
496 #define y_b                                      r2
497 #define y_c                                      r3
498
499 #define height_minor_a                           r7
500 #define height_minor_b                           r8
501 #define height_major                             r9
502 #define height                                   r9
503
504 #define reciprocal_table_ptr                     r10
505 /* Scalar registers of the third (alternate) edge; they overlap x_a, x_b, x_c and reciprocal_table_ptr. */
506 #define edge_alt_low                             r4
507 #define edge_alt_high                            r5
508 #define edge_dx_dy_alt                           r6
509 #define edge_shift_alt                           r10
510
511 #define edge_dx_dy_alt_low                       r4
512 #define edge_dx_dy_alt_high                      r5
513
514 #define span_edge_data                           r4
515 #define span_uvrg_offset                         r5
516 #define span_b_offset                            r6
517
518 #define clip                                     r14
519 /* NOTE(review): b is also the ARM branch mnemonic; a plain b instruction preprocessed while this alias is defined would be rewritten to r11 - confirm it is undefined again or never used. */
520 #define b                                        r11
521 #define b_dy                                     r12
522
523
524 #define alternate_x                              q0
525 #define alternate_dx_dy                          q1
526 #define alternate_x_32                           q2
527
528 #define alternate_x_low                          d0
529 #define alternate_x_high                         d1
530 #define alternate_dx_dy_low                      d2
531 #define alternate_dx_dy_high                     d3
532 #define alternate_x_32_low                       d4
533 #define alternate_x_32_high                      d5
534
535 #define left_x                                   q3
536 #define right_x                                  q4
537 #define left_dx_dy                               q5
538 #define right_dx_dy                              q6
539 #define left_edge                                q7
540 #define right_edge                               q8
541
542 #define left_x_low                               d6
543 #define left_x_high                              d7
544 #define right_x_low                              d8
545 #define right_x_high                             d9
546 #define left_dx_dy_low                           d10
547 #define left_dx_dy_high                          d11
548 #define right_dx_dy_low                          d12
549 #define right_dx_dy_high                         d13
550 #define left_edge_low                            d14
551 #define left_edge_high                           d15
552 #define right_edge_low                           d16
553 #define right_edge_high                          d17
554
555 #define y_mid_point                              d18
556 #define c_0x0004                                 d19
557
558 #define left_right_x_16                          q11
559 #define span_shifts_y                            q12
560 #define c_0x0001                                 q13
561 /* d24 / d25 are the halves of q12 (span_shifts_y) and d26 / d27 the halves of q13 (c_0x0001): writing c_0xFFFE or c_0x0007 overwrites c_0x0001. */
562 #define span_shifts                              d24
563 #define y_x4                                     d25
564 #define c_0xFFFE                                 d26
565 #define c_0x0007                                 d27
566
567 #define left_right_x_16_low                      d22
568 #define left_right_x_16_high                     d23
569
570 #define uvrg                                     q14
571 #define uvrg_dy                                  q15
572
573 #define alternate_x_16                           d4
574
575 #define v_clip                                   q3
576 #define v_clip_low                               d6
577
578 #define right_x_32                               q10
579 #define left_x_32                                q11
580 #define alternate_select                         d24
581
582 #define right_x_32_low                           d20
583 #define right_x_32_high                          d21
584 #define left_x_32_low                            d22
585 #define left_x_32_high                           d23
586 /* Aliases used by the compute_edge_delta_x2 / compute_edge_delta_x3 macros. */
587 #define edges_xy                                 q0
588 #define edges_dx_dy                              d2
589 #define edge_shifts                              d3
590 #define edge_shifts_64                           q2
591
592 #define edges_xy_left                            d0
593 #define edges_xy_right                           d1
594
595 #define height_reciprocals                       d6
596 #define heights                                  d7
597
598 #define widths                                   d8
599 #define c_0x01                                   d9
600 #define x_starts                                 d10
601 #define x_ends                                   d11
602
603 #define heights_b                                d12
604 #define edges_dx_dy_64                           q10
605
606 #define edges_dx_dy_64_left                      d20
607 #define edges_dx_dy_64_right                     d21
608
609 /* setup_spans_prologue(): push r4 - r11 and lr, load the signed 16-bit x (offset 8) and y (offset 10) of the three vertices, load psx_gpu uvrg and uvrg_dy into q14 / q15, point reciprocal_table_ptr at reciprocal_table and set every 32-bit lane of c_0x01 to 1. All x values are loaded before the y values because y_a / y_b / y_c overwrite the vertex pointers. NOTE(review): c_0x01 is d9 and only core registers are pushed here; d8 - d15 are callee-saved under AAPCS - confirm they are saved elsewhere. */
610 #define setup_spans_prologue()                                                 \
611   stmdb sp!, { r4 - r11, lr };                                                 \
612                                                                                \
613   ldrsh x_a, [ v_a, #8 ];                                                      \
614   ldrsh x_b, [ v_b, #8 ];                                                      \
615   ldrsh x_c, [ v_c, #8 ];                                                      \
616   ldrsh y_a, [ v_a, #10 ];                                                     \
617   ldrsh y_b, [ v_b, #10 ];                                                     \
618   ldrsh y_c, [ v_c, #10 ];                                                     \
619                                                                                \
620   add temp, psx_gpu, #psx_gpu_uvrg_offset;                                     \
621   vld1.32 { uvrg }, [ temp ];                                                  \
622   add temp, psx_gpu, #psx_gpu_uvrg_dy_offset;                                  \
623   vld1.32 { uvrg_dy }, [ temp ];                                               \
624   load_pointer(reciprocal_table_ptr, reciprocal_table);                        \
625                                                                                \
626   vmov.u32 c_0x01, #0x01                                                       \
627 /* setup_spans_load_b(): load the 32-bit psx_gpu b and b_dy words into the b (r11) and b_dy (r12) registers. */
628 #define setup_spans_load_b()                                                   \
629   ldr b, [ psx_gpu, #psx_gpu_b_offset ];                                       \
630   ldr b_dy, [ psx_gpu, #psx_gpu_b_dy_offset ]                                  \
631 /* setup_spans_prologue_b(): point span_uvrg_offset, span_edge_data and span_b_offset at the matching psx_gpu arrays, broadcast the 16-bit viewport_start_x into every lane of left_edge and viewport_end_x + 1 into every lane of right_edge, and set up the 16-bit lane constants 0x0004, 0x0007 and 0xFFFE. Statement order matters: c_0x0007 and c_0xFFFE are the two halves of c_0x0001, so c_0x0001 is consumed by the vadd before they are written. */
632 #define setup_spans_prologue_b()                                               \
633   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
634   add temp, psx_gpu, #psx_gpu_viewport_start_x_offset;                         \
635                                                                                \
636   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
637   vmov.u16 c_0x0004, #0x0004;                                                  \
638                                                                                \
639   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
640   vmov.u16 c_0x0001, #0x0001;                                                  \
641                                                                                \
642   vld1.u16 { left_edge_low[], left_edge_high[] }, [ temp ];                    \
643   add temp, psx_gpu, #psx_gpu_viewport_end_x_offset;                           \
644                                                                                \
645   vld1.u16 { right_edge_low[], right_edge_high[] }, [ temp ];                  \
646   vadd.u16 right_edge, right_edge, c_0x0001;                                   \
647                                                                                \
648   vmov.u16 c_0x0007, #0x0007;                                                  \
649   vmvn.u16 c_0xFFFE, #0x0001                                                   \
650
651 /* compute_edge_delta_x2(): for two edges sharing the same height, look up the 32-bit reciprocal_table entry for height, then form widths = x_ends - x_starts, heights_b = (height - 1) + x_starts * height, edges_dx_dy = widths * (entry >> 12) and the 64-bit edges_xy = heights_b * (entry >> 12). edge_shifts keeps the entry with bits 5 - 7 of each halfword cleared. x_starts, x_ends and c_0x01 are read as set by the caller. */
652 #define compute_edge_delta_x2()                                                \
653   ldr temp, [ reciprocal_table_ptr, height, lsl #2 ];                          \
654                                                                                \
655   vdup.u32 heights, height;                                                    \
656   vsub.u32 widths, x_ends, x_starts;                                           \
657                                                                                \
658   vdup.u32 edge_shifts, temp;                                                  \
659   vsub.u32 heights_b, heights, c_0x01;                                         \
660   vshr.u32 height_reciprocals, edge_shifts, #12;                               \
661                                                                                \
662   vmla.s32 heights_b, x_starts, heights;                                       \
663   vbic.u16 edge_shifts, #0xE0;                                                 \
664   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
665   vmull.s32 edges_xy, heights_b, height_reciprocals                            \
666 /* Scalar aliases for the third edge in compute_edge_delta_x3; width_alt shares r6 with x_c and edge_dx_dy_alt. */
667 #define width_alt                 r6
668 #define height_reciprocal_alt     r11
669 #define height_b_alt              r12
670 /* compute_edge_delta_x3(start_c, height_a, height_b): same computation as compute_edge_delta_x2 for two edges with separate heights height_a / height_b (NEON lanes 0 and 1), interleaved with a scalar copy for a third edge that runs from start_c to x_c over height_minor_b. The scalar path takes the reciprocal from entry >> 12 and the shift from entry & 0x1F. It overwrites x_c (width_alt, edge_dx_dy_alt) and, through the final smull, x_a and x_b (edge_alt_low / edge_alt_high). */
671 #define compute_edge_delta_x3(start_c, height_a, height_b)                     \
672   vmov.u32 heights, height_a, height_b;                                        \
673   ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ];                        \
674   vmov.u32 edge_shifts[0], temp;                                               \
675   ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ];                        \
676   vmov.u32 edge_shifts[1], temp;                                               \
677   ldr edge_shift_alt, [ reciprocal_table_ptr, height_minor_b, lsl #2 ];        \
678                                                                                \
679   vsub.u32 widths, x_ends, x_starts;                                           \
680   sub width_alt, x_c, start_c;                                                 \
681                                                                                \
682   vsub.u32 heights_b, heights, c_0x01;                                         \
683   sub height_b_alt, height_minor_b, #1;                                        \
684                                                                                \
685   vshr.u32 height_reciprocals, edge_shifts, #12;                               \
686   lsr height_reciprocal_alt, edge_shift_alt, #12;                              \
687                                                                                \
688   vmla.s32 heights_b, x_starts, heights;                                       \
689   mla height_b_alt, height_minor_b, start_c, height_b_alt;                     \
690                                                                                \
691   vbic.u16 edge_shifts, #0xE0;                                                 \
692   and edge_shift_alt, edge_shift_alt, #0x1F;                                   \
693                                                                                \
694   vmul.s32 edges_dx_dy, widths, height_reciprocals;                            \
695   mul edge_dx_dy_alt, width_alt, height_reciprocal_alt;                        \
696                                                                                \
697   vmull.s32 edges_xy, heights_b, height_reciprocals;                           \
698   smull edge_alt_low, edge_alt_high, height_b_alt, height_reciprocal_alt       \
699
700
/*
 * Step the packed per-span y values in y_x4 by c_0x0004, downwards in value
 * for the "up" direction and upwards for "down".  Invoked once per group of
 * four spans by setup_spans_set_x4_alternate_*.
 * NOTE(review): c_0x0004 presumably holds 4 in every 16-bit lane - confirm
 * against its definition.
 */
701 #define setup_spans_adjust_y_up()                                              \
702   vsub.u32 y_x4, y_x4, c_0x0004                                                \
703
704 #define setup_spans_adjust_y_down()                                            \
705   vadd.u32 y_x4, y_x4, c_0x0004                                                \
706
/*
 * Advance the interpolants by one line: the NEON vector uvrg moves by
 * uvrg_dy and the core register b moves by b_dy.  The "up" variant
 * subtracts the per-line delta, the "down" variant adds it.
 */
707 #define setup_spans_adjust_interpolants_up()                                   \
708   vsub.u32 uvrg, uvrg, uvrg_dy;                                                \
709   sub b, b, b_dy                                                               \
710
711 #define setup_spans_adjust_interpolants_down()                                 \
712   vadd.u32 uvrg, uvrg, uvrg_dy;                                                \
713   add b, b, b_dy                                                               \
714
715
/*
 * Skip the interpolants over clipped lines in a single step:
 *   increment: b += b_dy * clip, uvrg += uvrg_dy * v_clip
 *   decrement: b -= b_dy * clip, uvrg -= uvrg_dy * v_clip
 * clip is the scalar line count and v_clip its NEON broadcast
 * (set up by setup_spans_clip).
 */
716 #define setup_spans_clip_interpolants_increment()                              \
717   mla b, b_dy, clip, b;                                                        \
718   vmla.s32 uvrg, uvrg_dy, v_clip                                               \
719
720 #define setup_spans_clip_interpolants_decrement()                              \
721   mls b, b_dy, clip, b;                                                        \
722   vmls.s32 uvrg, uvrg_dy, v_clip                                               \
723
/*
 * Alternate-edge part of clipping.  "yes" advances the 64-bit value in
 * edge_alt_high:edge_alt_low by edge_dx_dy_alt * clip (signed multiply
 * accumulate); "no" expands to nothing because no alternate edge is in use.
 */
724 #define setup_spans_clip_alternate_yes()                                       \
725   smlal edge_alt_low, edge_alt_high, edge_dx_dy_alt, clip                      \
726
727 #define setup_spans_clip_alternate_no()                                        \
728
/*
 * setup_spans_clip(direction, alternate_active): skip `clip` lines.
 *
 *   direction         increment or decrement, selects how the interpolants
 *                     (uvrg, b) are moved.
 *   alternate_active  yes or no, whether the scalar alternate edge is
 *                     advanced as well.
 *
 * The vector edges are always advanced by +edges_dx_dy * clip, whatever the
 * direction.  Writes v_clip, edges_xy, uvrg, b and (if active) edge_alt_*.
 */
729 #define setup_spans_clip(direction, alternate_active)                          \
730   vdup.u32 v_clip, clip;                                                       \
731   setup_spans_clip_alternate_##alternate_active();                             \
732   setup_spans_clip_interpolants_##direction();                                 \
  /* widening multiply accumulate into the 64-bit edge positions */            \
733   vmlal.s32 edges_xy, edges_dx_dy, v_clip_low                                  \
734
735
/*
 * setup_spans_adjust_edges_alternate_no(left_index, right_index)
 *
 * Turns the output of compute_edge_delta_* into the left/right edge walkers
 * used by setup_spans_set_x4_alternate_*:
 *   1. widen edge_shifts and edges_dx_dy to 64-bit lanes,
 *   2. shift the 64-bit positions and steps left by the per-edge count,
 *   3. copy the lane chosen by left_index / right_index (token-pasted onto
 *      edges_xy_ and edges_dx_dy_64_) into the left and right registers,
 *   4. make the _high half of each x register one line ahead of the _low
 *      half and double the step.
 * NOTE(review): step 4 assumes left_x / right_x / left_dx_dy / right_dx_dy
 * name the register pairs formed by their _low and _high halves, so that one
 * 64-bit vector add then advances two lines at once - confirm against the
 * register definitions.
 */
736 #define setup_spans_adjust_edges_alternate_no(left_index, right_index)         \
737   vmovl.s32 edge_shifts_64, edge_shifts;                                       \
738   vmovl.s32 edges_dx_dy_64, edges_dx_dy;                                       \
739                                                                                \
740   vshl.s64 edges_xy, edges_xy, edge_shifts_64;                                 \
741   vshl.s64 edges_dx_dy_64, edges_dx_dy_64, edge_shifts_64;                     \
742                                                                                \
743   vmov left_x_low, edges_xy_##left_index;                                      \
744   vmov right_x_low, edges_xy_##right_index;                                    \
745                                                                                \
  /* the same step value goes into both halves of each dx_dy register */       \
746   vmov left_dx_dy_low, edges_dx_dy_64_##left_index;                            \
747   vmov left_dx_dy_high, edges_dx_dy_64_##left_index;                           \
748   vmov right_dx_dy_low, edges_dx_dy_64_##right_index;                          \
749   vmov right_dx_dy_high, edges_dx_dy_64_##right_index;                         \
750                                                                                \
  /* high half = low half advanced by one step */                              \
751   vadd.u64 left_x_high, left_x_low, left_dx_dy_low;                            \
752   vadd.u64 right_x_high, right_x_low, right_dx_dy_low;                         \
753                                                                                \
  /* double the steps */                                                       \
754   vadd.u64 left_dx_dy, left_dx_dy, left_dx_dy;                                 \
755   vadd.u64 right_dx_dy, right_dx_dy, right_dx_dy                               \
756
757
/*
 * setup_spans_adjust_edges_alternate_yes(left_index, right_index)
 *
 * Does everything setup_spans_adjust_edges_alternate_no does, then prepares
 * the third (alternate) edge the same way, starting from core registers:
 *   - y_mid_point receives y_b in every 16-bit lane (compared against y_x4
 *     by setup_spans_y_select_*),
 *   - edge_alt_high:edge_alt_low is shifted left by edge_shift_alt as one
 *     64-bit quantity and moved to alternate_x_low,
 *   - edge_dx_dy_alt is sign-extended to 64 bits, shifted the same way and
 *     placed in both halves of alternate_dx_dy,
 *   - alternate_x_high is made one step ahead and the step is doubled.
 * Clobbers temp (holds 32 - edge_shift_alt) and the edge_alt and
 * edge_dx_dy_alt core registers.
 */
758 #define setup_spans_adjust_edges_alternate_yes(left_index, right_index)        \
759   setup_spans_adjust_edges_alternate_no(left_index, right_index);              \
760                                                                                \
761   vdup.u16 y_mid_point, y_b;                                                   \
762   rsb temp, edge_shift_alt, #32;                                               \
763                                                                                \
  /* 64-bit left shift of high:low, carrying the top bits of low into high */  \
764   lsl edge_alt_high, edge_alt_high, edge_shift_alt;                            \
765   orr edge_alt_high, edge_alt_high, edge_alt_low, lsr temp;                    \
766   lsl edge_alt_low, edge_alt_low, edge_shift_alt;                              \
767   vmov alternate_x_low, edge_alt_low, edge_alt_high;                           \
768                                                                                \
  /* high word = arithmetic shift right by (32 - shift), keeps the sign */     \
769   asr edge_dx_dy_alt_high, edge_dx_dy_alt, temp;                               \
770   lsl edge_dx_dy_alt_low, edge_dx_dy_alt, edge_shift_alt;                      \
771   vmov alternate_dx_dy_low, edge_dx_dy_alt_low, edge_dx_dy_alt_high;           \
772   vmov alternate_dx_dy_high, alternate_dx_dy_low;                              \
773                                                                                \
774   vadd.u64 alternate_x_high, alternate_x_low, alternate_dx_dy_low;             \
775   vadd.u64 alternate_dx_dy, alternate_dx_dy, alternate_dx_dy                   \
776
777
/*
 * Build the per-lane mask alternate_select from signed 16-bit compares of
 * y_x4 with y_mid_point: lanes with y < y_mid_point when walking up, lanes
 * with y > y_mid_point when walking down.  The mask is consumed by
 * setup_spans_alternate_select_*.
 */
778 #define setup_spans_y_select_up()                                              \
779   vclt.s16 alternate_select, y_x4, y_mid_point                                 \
780
781 #define setup_spans_y_select_down()                                            \
782   vcgt.s16 alternate_select, y_x4, y_mid_point                                 \
783
784
/*
 * Where alternate_select has bits set, overwrite the x values with those of
 * the alternate edge (alternate_x_16).  "left" patches left_right_x_16_low,
 * "right" patches left_right_x_16_high, other bits are left untouched (VBIT).
 */
785 #define setup_spans_alternate_select_left()                                    \
786   vbit.u16 left_right_x_16_low, alternate_x_16, alternate_select               \
787
788 #define setup_spans_alternate_select_right()                                   \
789   vbit.u16 left_right_x_16_high, alternate_x_16, alternate_select              \
790
791
/*
 * setup_spans_set_x4_alternate_yes(alternate, direction)
 *
 * Emits the data for four consecutive spans, with an alternate edge active.
 *
 *   alternate  left or right: which side takes the alternate edge on the
 *              lanes picked by setup_spans_y_select_<direction>.
 *   direction  up or down: passed on to the y-select, interpolant-adjust and
 *              y-adjust helper macros.
 *
 * Per invocation:
 *   - the upper 32 bits of each 64-bit edge position are taken twice (with a
 *     step in between) to obtain four x values per edge, then narrowed to
 *     16 bits,
 *   - uvrg (16 bytes) and b (4 bytes) are stored four times with
 *     post-increment, stepping the interpolants after each store,
 *   - x values are clamped to [left_edge, right_edge],
 *   - from width = right - left the macro derives (width + 7) >> 3, kept in
 *     left_right_x_16_high, and c_0xFFFE << ((width + 7) & 7), kept in
 *     span_shifts,
 *   - the result is written interleaved with vst4.u16 to span_edge_data,
 *   - y_x4 is stepped for the next group.
 * The stores are spread between the arithmetic, presumably for scheduling.
 */
792 #define setup_spans_set_x4_alternate_yes(alternate, direction)                 \
  /* x for the first two lines = upper 32 bits of the 64-bit positions */      \
793   vshrn.s64 alternate_x_32_low, alternate_x, #32;                              \
794   vshrn.s64 left_x_32_low, left_x, #32;                                        \
795   vshrn.s64 right_x_32_low, right_x, #32;                                      \
796                                                                                \
797   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
798   vadd.u64 left_x, left_x, left_dx_dy;                                         \
799   vadd.u64 right_x, right_x, right_dx_dy;                                      \
800                                                                                \
  /* x for the next two lines */                                               \
801   vshrn.s64 alternate_x_32_high, alternate_x, #32;                             \
802   vshrn.s64 left_x_32_high, left_x, #32;                                       \
803   vshrn.s64 right_x_32_high, right_x, #32;                                     \
804                                                                                \
805   vadd.u64 alternate_x, alternate_x, alternate_dx_dy;                          \
806   vadd.u64 left_x, left_x, left_dx_dy;                                         \
807   vadd.u64 right_x, right_x, right_dx_dy;                                      \
808                                                                                \
  /* narrow to 16 bits and splice in the alternate edge where selected */      \
809   vmovn.u32 alternate_x_16, alternate_x_32;                                    \
810   setup_spans_y_select_##direction();                                          \
811   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
812                                                                                \
813   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
814   setup_spans_alternate_select_##alternate();                                  \
815                                                                                \
816   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
817   str b, [ span_b_offset ], #4;                                                \
818   setup_spans_adjust_interpolants_##direction();                               \
819                                                                                \
  /* clamp against the left bound */                                           \
820   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
821                                                                                \
822   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
823   str b, [ span_b_offset ], #4;                                                \
824   setup_spans_adjust_interpolants_##direction();                               \
825                                                                                \
  /* clamp against the right bound */                                          \
826   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
827                                                                                \
828   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
829   str b, [ span_b_offset ], #4;                                                \
830   setup_spans_adjust_interpolants_##direction();                               \
831                                                                                \
  /* width + 7, and its low three bits */                                      \
832   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
833   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
834   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
835                                                                                \
836   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
837   str b, [ span_b_offset ], #4;                                                \
838   setup_spans_adjust_interpolants_##direction();                               \
839                                                                                \
  /* (width + 7) >> 3, and c_0xFFFE shifted left by the remainder */           \
840   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
841   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
842                                                                                \
843   vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!;            \
844                                                                                \
845   setup_spans_adjust_y_##direction()                                           \
846
847
/*
 * setup_spans_set_x4_alternate_no(alternate, direction)
 *
 * Emits the data for four consecutive spans when only the left and right
 * edges are in use.  The `alternate` argument is accepted for symmetry with
 * the _yes variant and is not referenced in the body.
 *
 *   direction  up or down: passed on to the interpolant-adjust and y-adjust
 *              helper macros.
 *
 * Per invocation: four x values per edge are produced (upper 32 bits of the
 * 64-bit positions, narrowed to 16 bits), uvrg and b are stored four times
 * while being stepped, x is clamped to [left_edge, right_edge],
 * (width + 7) >> 3 and c_0xFFFE << ((width + 7) & 7) are derived from
 * width = right - left, and the result is written interleaved with vst4.u16
 * to span_edge_data before y_x4 is stepped.
 */
848 #define setup_spans_set_x4_alternate_no(alternate, direction)                  \
  /* x for the first two lines */                                              \
849   vshrn.s64 left_x_32_low, left_x, #32;                                        \
850   vshrn.s64 right_x_32_low, right_x, #32;                                      \
851                                                                                \
852   vadd.u64 left_x, left_x, left_dx_dy;                                         \
853   vadd.u64 right_x, right_x, right_dx_dy;                                      \
854                                                                                \
  /* x for the next two lines */                                               \
855   vshrn.s64 left_x_32_high, left_x, #32;                                       \
856   vshrn.s64 right_x_32_high, right_x, #32;                                     \
857                                                                                \
858   vadd.u64 left_x, left_x, left_dx_dy;                                         \
859   vadd.u64 right_x, right_x, right_dx_dy;                                      \
860                                                                                \
861   vmovn.u32 left_right_x_16_low, left_x_32;                                    \
862   vmovn.u32 left_right_x_16_high, right_x_32;                                  \
863                                                                                \
864   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
865   str b, [ span_b_offset ], #4;                                                \
866   setup_spans_adjust_interpolants_##direction();                               \
867                                                                                \
  /* clamp against the left bound */                                           \
868   vmax.s16 left_right_x_16, left_right_x_16, left_edge;                        \
869                                                                                \
870   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
871   str b, [ span_b_offset ], #4;                                                \
872   setup_spans_adjust_interpolants_##direction();                               \
873                                                                                \
  /* clamp against the right bound */                                          \
874   vmin.s16 left_right_x_16, left_right_x_16, right_edge;                       \
875                                                                                \
876   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
877   str b, [ span_b_offset ], #4;                                                \
878   setup_spans_adjust_interpolants_##direction();                               \
879                                                                                \
  /* width + 7, and its low three bits */                                      \
880   vsub.u16 left_right_x_16_high, left_right_x_16_high, left_right_x_16_low;    \
881   vadd.u16 left_right_x_16_high, left_right_x_16_high, c_0x0007;               \
882   vand.u16 span_shifts, left_right_x_16_high, c_0x0007;                        \
883                                                                                \
884   vst1.u32 { uvrg }, [ span_uvrg_offset, :128 ]!;                              \
885   str b, [ span_b_offset ], #4;                                                \
886   setup_spans_adjust_interpolants_##direction();                               \
887                                                                                \
  /* c_0xFFFE shifted left by the remainder, and (width + 7) >> 3 */           \
888   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
889   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
890                                                                                \
891   vst4.u16 { left_right_x_16, span_shifts_y }, [ span_edge_data ]!;            \
892                                                                                \
893   setup_spans_adjust_y_##direction()                                           \
894
895
/*
 * Scratch pair for the 64-bit product below.  r11 / r12 are the same
 * registers that compute_edge_delta_x3 uses as height_reciprocal_alt and
 * height_b_alt.
 */
896 #define edge_adjust_low           r11
897 #define edge_adjust_high          r12
898
/*
 * "yes": subtract edge_dx_dy_alt * height_minor_a (signed, 64-bit) from the
 * alternate edge position held in edge_alt_high:edge_alt_low.  The subs/sbc
 * pair propagates the borrow and leaves the flags modified.
 * "no": expands to nothing.
 */
899 #define setup_spans_alternate_adjust_yes()                                     \
900   smull edge_adjust_low, edge_adjust_high, edge_dx_dy_alt, height_minor_a;     \
901   subs edge_alt_low, edge_alt_low, edge_adjust_low;                            \
902   sbc edge_alt_high, edge_alt_high, edge_adjust_high                           \
903
904 #define setup_spans_alternate_adjust_no()                                      \
905
906
/*
 * setup_spans_down(left_index, right_index, alternate, alternate_active)
 *
 * Generates spans walking towards larger y, starting at y_a.
 *
 *   left_index, right_index  lane suffixes forwarded to
 *                            setup_spans_adjust_edges_alternate_*.
 *   alternate                left / right / none, forwarded to
 *                            setup_spans_set_x4_alternate_*.
 *   alternate_active         yes or no, selects the variants that also
 *                            track the alternate edge.
 *
 * Steps: clip the bottom against viewport_end_y (shortens height), clip the
 * top against viewport_start_y (skips `clip` lines via setup_spans_clip),
 * bail out to label 1 if nothing is left, build the four packed y values,
 * store height as num_spans, then emit spans four at a time.
 * Clobbers temp, clip, y_a, y_c, height and the flags.  Uses the local
 * numeric labels 0, 1 and 2.  setup_spans_load_b and setup_spans_prologue_b
 * are defined elsewhere in the file.
 */
907 #define setup_spans_down(left_index, right_index, alternate, alternate_active) \
908   setup_spans_alternate_adjust_##alternate_active();                           \
909   setup_spans_load_b();                                                        \
910                                                                                \
  /* if y_c > viewport_end_y: height = height - (y_c - end_y) + 1 */           \
911   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ];                     \
912   subs y_c, y_c, temp;                                                         \
913   subgt height, height, y_c;                                                   \
914   addgt height, height, #1;                                                    \
915                                                                                \
  /* clip = viewport_start_y - y_a, only acted upon when positive */           \
916   ldrsh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ];                   \
917   subs clip, temp, y_a;                                                        \
918   ble 0f;                                                                      \
919                                                                                \
920   sub height, height, clip;                                                    \
921   add y_a, y_a, clip;                                                          \
922   setup_spans_clip(increment, alternate_active);                               \
923                                                                                \
924  0:                                                                            \
925   cmp height, #0;                                                              \
926   ble 1f;                                                                      \
927                                                                                \
  /* pack y_a, y_a + 1 into temp and y_a + 2, y_a + 3 into y_a as 16-bit */    \
  /* halves (y_a is overwritten), then move both words into y_x4         */    \
928   orr temp, y_a, y_a, lsl #16;                                                 \
929   add temp, temp, #(1 << 16);                                                  \
930   add y_a, temp, #2;                                                           \
931   add y_a, y_a, #(2 << 16);                                                    \
932   vmov.u32 y_x4, temp, y_a;                                                    \
933                                                                                \
934   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
935    right_index);                                                               \
936   setup_spans_prologue_b();                                                    \
937                                                                                \
938   strh height, [ psx_gpu, #psx_gpu_num_spans_offset ];                         \
939                                                                                \
  /* four spans per pass, repeat while more than four were remaining */        \
940  2:                                                                            \
941   setup_spans_set_x4_alternate_##alternate_active(alternate, down);            \
942   subs height, height, #4;                                                     \
943   bhi 2b;                                                                      \
944                                                                                \
945  1:                                                                            \
946
947
/*
 * "yes": advance the 64-bit alternate edge position by one step, i.e. add
 * edge_dx_dy_alt sign-extended to 64 bits (asr #31 supplies the high word,
 * adds/adc carry between the halves).  Modifies the flags.
 * "no": expands to nothing.
 */
948 #define setup_spans_alternate_pre_increment_yes()                              \
949   adds edge_alt_low, edge_alt_low, edge_dx_dy_alt;                             \
950   adc edge_alt_high, edge_alt_high, edge_dx_dy_alt, asr #31                    \
951
952 #define setup_spans_alternate_pre_increment_no()                               \
953
954
/*
 * "yes": conditionally drop one line from height.  The LE condition tests
 * whatever flags are current at the point of expansion; in setup_spans_up
 * those come from `subs temp, temp, y_c`, so the decrement happens when
 * viewport_start_y - y_c <= 0.
 * "no": expands to nothing.
 */
955 #define setup_spans_up_decrement_yes()                                         \
956   suble height, height, #1                                                     \
957
958 #define setup_spans_up_decrement_no()                                          \
959
960
/*
 * setup_spans_up(left_index, right_index, alternate, alternate_active)
 *
 * Generates spans walking towards smaller y, starting at y_a - 1.
 *
 *   left_index, right_index  lane suffixes forwarded to
 *                            setup_spans_adjust_edges_alternate_*.
 *   alternate                left / right / none, forwarded to
 *                            setup_spans_set_x4_alternate_*.
 *   alternate_active         yes or no, selects the variants that also
 *                            track the alternate edge.
 *
 * Steps: clip the far end against viewport_start_y (shortens height), clip
 * the start against viewport_end_y (skips `clip` lines via
 * setup_spans_clip), bail out to label 1 if nothing is left, build the four
 * packed y values, pre-advance the edges and interpolants by one line, store
 * height as num_spans, then emit spans four at a time.
 * Clobbers temp, clip, y_a, height and the flags.  Uses the local numeric
 * labels 0, 1 and 2.  setup_spans_load_b and setup_spans_prologue_b are
 * defined elsewhere in the file.
 */
961 #define setup_spans_up(left_index, right_index, alternate, alternate_active)   \
962   setup_spans_alternate_adjust_##alternate_active();                           \
963   setup_spans_load_b();                                                        \
964   sub y_a, y_a, #1;                                                            \
965                                                                                \
  /* if viewport_start_y > y_c: height -= start_y - y_c.  The flags set  */    \
  /* here are also tested by setup_spans_up_decrement_yes                */    \
966   ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ];                    \
967   subs temp, temp, y_c;                                                        \
968   subgt height, height, temp;                                                  \
969   setup_spans_up_decrement_##alternate_active();                               \
970                                                                                \
  /* clip = y_a - viewport_end_y, only acted upon when positive */             \
971   ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ];                      \
972   subs clip, y_a, temp;                                                        \
973   ble 0f;                                                                      \
974                                                                                \
975   sub height, height, clip;                                                    \
976   sub y_a, y_a, clip;                                                          \
977   setup_spans_clip(decrement, alternate_active);                               \
978                                                                                \
979  0:                                                                            \
980   cmp height, #0;                                                              \
981   ble 1f;                                                                      \
982                                                                                \
  /* pack y_a, y_a - 1 into temp and y_a - 2, y_a - 3 into y_a as 16-bit */    \
  /* halves (y_a is overwritten), then move both words into y_x4         */    \
983   orr temp, y_a, y_a, lsl #16;                                                 \
984   sub temp, temp, #(1 << 16);                                                  \
985   sub y_a, temp, #2;                                                           \
986   sub y_a, y_a, #(2 << 16);                                                    \
987   vmov.u32 y_x4, temp, y_a;                                                    \
988                                                                                \
  /* advance both vector edges by one step before the first span */            \
989   vaddw.s32 edges_xy, edges_xy, edges_dx_dy;                                   \
990                                                                                \
991   setup_spans_alternate_pre_increment_##alternate_active();                    \
992   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
993    right_index);                                                               \
994   setup_spans_adjust_interpolants_up();                                        \
995   setup_spans_prologue_b();                                                    \
996                                                                                \
997   strh height, [ psx_gpu, #psx_gpu_num_spans_offset ];                         \
998                                                                                \
  /* four spans per pass, repeat while more than four were remaining */        \
999  2:                                                                            \
1000   setup_spans_set_x4_alternate_##alternate_active(alternate, up);              \
1001   subs height, height, #4;                                                     \
1002   bhi 2b;                                                                      \
1003                                                                                \
1004  1:                                                                            \
1005
1006
/*
 * Common exit for the setup_spans_* functions: pops r4-r11 and loads pc from
 * the stack, which returns to the caller.
 * NOTE(review): presumably mirrors a push of r4-r11 and lr in
 * setup_spans_prologue, which is defined elsewhere - confirm.
 */
1007 #define setup_spans_epilogue()                                                 \
1008   ldmia sp!, { r4 - r11, pc }                                                  \
1009
1010
/*
 * setup_spans_up_up(minor, major): complete function body for a triangle
 * that is walked upwards over its whole height (y_a is the largest y, y_c
 * the smallest, y_b in between).
 *
 *   minor, major  left / right tokens, passed to setup_spans_up as
 *                 (major, minor, minor, yes).
 *
 * Vector lane 0 runs from x_a to x_c and lane 1 from x_a to x_b.  The
 * alternate edge runs from x_b to x_c over height_minor_b lines.
 * NOTE(review): height_major is presumably another name for the register
 * written as `height` just above its use - confirm against the register
 * definitions.
 */
1011 #define setup_spans_up_up(minor, major)                                        \
1012   setup_spans_prologue();                                                      \
1013   sub height_minor_a, y_a, y_b;                                                \
1014   sub height_minor_b, y_b, y_c;                                                \
1015   sub height, y_a, y_c;                                                        \
1016                                                                                \
1017   vdup.u32 x_starts, x_a;                                                      \
1018   vmov.u32 x_ends, x_c, x_b;                                                   \
1019                                                                                \
1020   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1021   setup_spans_up(major, minor, minor, yes);                                    \
1022   setup_spans_epilogue()                                                       \
1023
/* Upward three-edge walk with minor = left and major = right.  The whole
 * body, including the return, comes from setup_spans_up_up. */
1024 function(setup_spans_up_left)
1025   setup_spans_up_up(left, right)
1026
/* Upward three-edge walk with minor = right and major = left.  The whole
 * body, including the return, comes from setup_spans_up_up. */
1027 function(setup_spans_up_right)
1028   setup_spans_up_up(right, left)
1029
/* Literal pool placement.  The function above ends by loading pc, so
 * execution does not run into the pool. */
1030 .pool
1031
/*
 * setup_spans_down_down(minor, major): complete function body for a
 * triangle that is walked downwards over its whole height (y_a is the
 * smallest y, y_c the largest, y_b in between).
 *
 *   minor, major  left / right tokens, passed to setup_spans_down as
 *                 (major, minor, minor, yes).
 *
 * Vector lane 0 runs from x_a to x_c and lane 1 from x_a to x_b.  The
 * alternate edge runs from x_b to x_c over height_minor_b lines.
 * NOTE(review): height_major is presumably another name for the register
 * written as `height` just above its use - confirm against the register
 * definitions.
 */
1032 #define setup_spans_down_down(minor, major)                                    \
1033   setup_spans_prologue();                                                      \
1034   sub height_minor_a, y_b, y_a;                                                \
1035   sub height_minor_b, y_c, y_b;                                                \
1036   sub height, y_c, y_a;                                                        \
1037                                                                                \
1038   vdup.u32 x_starts, x_a;                                                      \
1039   vmov.u32 x_ends, x_c, x_b;                                                   \
1040                                                                                \
1041   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
1042   setup_spans_down(major, minor, minor, yes);                                  \
1043   setup_spans_epilogue()                                                       \
1044
/* Downward three-edge walk with minor = left and major = right.  The whole
 * body, including the return, comes from setup_spans_down_down. */
1045 function(setup_spans_down_left)
1046   setup_spans_down_down(left, right)
1047
/* Downward three-edge walk with minor = right and major = left.  The whole
 * body, including the return, comes from setup_spans_down_down. */
1048 function(setup_spans_down_right)
1049   setup_spans_down_down(right, left)
1050
1051
/*
 * setup_spans_up_flat: shared tail of setup_spans_up_a / setup_spans_up_b.
 * Expects x_starts and x_ends to be loaded already, uses a single height
 * (y_a - y_c) for both edges, walks upwards without an alternate edge and
 * returns.
 */
1052 #define setup_spans_up_flat()                                                  \
1053   sub height, y_a, y_c;                                                        \
1054                                                                                \
1055   compute_edge_delta_x2();                                                     \
1056   setup_spans_up(left, right, none, no);                                       \
1057   setup_spans_epilogue()                                                       \
1058
/* Two-edge upward walk in which the edges start at two different x values
 * (x_a in lane 0, x_b in lane 1) and both end at x_c. */
1059 function(setup_spans_up_a)
1060   setup_spans_prologue()
1061
1062   vmov.u32 x_starts, x_a, x_b
1063   vdup.u32 x_ends, x_c
1064
1065   setup_spans_up_flat()
1066
/* Two-edge upward walk in which both edges start at x_a and end at two
 * different x values (x_b in lane 0, x_c in lane 1). */
1067 function(setup_spans_up_b)
1068   setup_spans_prologue()
1069
1070   vdup.u32 x_starts, x_a
1071   vmov.u32 x_ends, x_b, x_c
1072
1073   setup_spans_up_flat()
1074
/*
 * setup_spans_down_flat: shared tail of setup_spans_down_a /
 * setup_spans_down_b.  Expects x_starts and x_ends to be loaded already,
 * uses a single height (y_c - y_a) for both edges, walks downwards without
 * an alternate edge and returns.
 */
1075 #define setup_spans_down_flat()                                                \
1076   sub height, y_c, y_a;                                                        \
1077                                                                                \
1078   compute_edge_delta_x2();                                                     \
1079   setup_spans_down(left, right, none, no);                                     \
1080   setup_spans_epilogue()                                                       \
1081
/* Two-edge downward walk in which the edges start at two different x values
 * (x_a in lane 0, x_b in lane 1) and both end at x_c. */
1082 function(setup_spans_down_a)
1083   setup_spans_prologue()
1084
1085   vmov.u32 x_starts, x_a, x_b
1086   vdup.u32 x_ends, x_c
1087
1088   setup_spans_down_flat()
1089
/* Two-edge downward walk in which both edges start at x_a and end at two
 * different x values (x_b in lane 0, x_c in lane 1). */
1090 function(setup_spans_down_b)
1091   setup_spans_prologue()
1092
1093   vdup.u32 x_starts, x_a
1094   vmov.u32 x_ends, x_b, x_c
1095
1096   setup_spans_down_flat()
1097
1098
/*
 * Extra register names used only by setup_spans_up_down.
 *
 * middle_y keeps the original y_a while y_a itself is modified by the first
 * half of the function.  The *_b names hold a second set of edge data
 * (positions, steps and shifts) that is prepared up front and swapped in for
 * the second half.
 * NOTE(review): d26 / d27 are taken to be the two halves of q13, and
 * d22 / d23 the two halves of q11, per the usual NEON register overlap.
 * edges_dx_dy_and_shifts (q1) is assumed to cover edges_dx_dy and
 * edge_shifts - confirm against their definitions.
 */
1099 #define middle_y                                          r9
1100
1101 #define edges_xy_b                                        q11
1102 #define edges_dx_dy_b                                     d26
1103 #define edge_shifts_b                                     d27
1104 #define edges_dx_dy_and_shifts_b                          q13
1105 #define height_increment                                  d20
1106
1107 #define edges_dx_dy_and_shifts                            q1
1108
1109 #define edges_xy_b_left                                   d22
1110 #define edges_xy_b_right                                  d23
1111
/*
 * Replace the active edge data with the saved second set: two whole-register
 * moves, one for the 64-bit positions and one for steps plus shifts.
 */
1112 #define setup_spans_up_down_load_edge_set_b()                                  \
1113   vmov edges_xy, edges_xy_b;                                                   \
1114   vmov edges_dx_dy_and_shifts, edges_dx_dy_and_shifts_b                        \
1115
1116
/*
 * setup_spans_up_down: span setup for a triangle whose starting vertex
 * (x_a, y_a) lies between the other two in y.  The spans are produced in two
 * passes that both start at y_a: first upwards towards y_b
 * (height_minor_a lines), then downwards towards y_c (height_minor_b lines).
 *
 * Edge set A (first pass): vector lanes from x_a to x_b and from x_c to x_b.
 * Edge set B (second pass): built before the first pass from the alternate
 * edge (x_a to x_c) and from the negated step of the vector edges, and
 * swapped in with setup_spans_up_down_load_edge_set_b.
 *
 * Local labels: 0 = end of a clip section (used twice), 2 = span loop (used
 * twice), 3 = first pass empty, 4 = start of second pass, 1 = exit.
 *
 * NOTE(review): when the first pass is empty (label 3) num_spans is not
 * written, yet the second pass reads and adds to it.  That is only correct
 * if num_spans is zero on entry - confirm in the caller / prologue.
 */
1117 function(setup_spans_up_down)
1118   setup_spans_prologue()
1119
1120   // s32 middle_y = y_a;
1121   sub height_minor_a, y_a, y_b
1122   sub height_minor_b, y_c, y_a
1123   sub height_major, y_c, y_b
1124
1125   vmov.u32 x_starts, x_a, x_c
1126   vdup.u32 x_ends, x_b
1127
1128   compute_edge_delta_x3(x_a, height_minor_a, height_major)
1129
  /* advance lane 1 only (the edge starting at x_c) by height_minor_b steps,
   * lane 0 is multiplied by zero */
1130   mov temp, #0
1131   vmov.u32 height_increment, temp, height_minor_b
1132   vmlal.s32 edges_xy, edges_dx_dy, height_increment
1133
  /* edge set B positions: alternate edge on the left, lane 1 on the right */
1134   vmov edges_xy_b_left, edge_alt_low, edge_alt_high
1135   vmov edges_xy_b_right, edges_xy_right
1136
  /* edge set B shifts: copy of set A with lane 0 taken from the alternate */
1137   vmov edge_shifts_b, edge_shifts
1138   vmov.u32 edge_shifts_b[0], edge_shift_alt
1139
  /* edge set B steps: negated set A with lane 0 taken from the alternate */
1140   vneg.s32 edges_dx_dy_b, edges_dx_dy
1141   vmov.u32 edges_dx_dy_b[0], edge_dx_dy_alt
1142
1143   mov middle_y, y_a
1144   
  /* ---- first pass: upwards, starting at y_a - 1 ---- */
1145   setup_spans_load_b()
1146   sub y_a, y_a, #1
1147
  /* if viewport_start_y > y_b: height_minor_a -= start_y - y_b */
1148   ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1149   subs temp, temp, y_b
1150   subgt height_minor_a, height_minor_a, temp
1151
  /* clip = y_a - viewport_end_y, only acted upon when positive */
1152   ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1153   subs clip, y_a, temp
1154   ble 0f
1155
1156   sub height_minor_a, height_minor_a, clip
1157   sub y_a, y_a, clip
1158   setup_spans_clip(decrement, no)
1159
1160  0:                                                                
1161   cmp height_minor_a, #0
1162   ble 3f
1163
  /* pack y_a, y_a - 1, y_a - 2, y_a - 3 as 16-bit halves into y_x4 */
1164   orr temp, y_a, y_a, lsl #16
1165   sub temp, temp, #(1 << 16)
1166   sub y_a, temp, #2
1167   sub y_a, y_a, #(2 << 16)
1168   vmov.u32 y_x4, temp, y_a
1169
1170   vaddw.s32 edges_xy, edges_xy, edges_dx_dy
1171
1172   strh height_minor_a, [ psx_gpu, #psx_gpu_num_spans_offset ]
1173
  /* set A is consumed into the left/right walkers here, after which the
   * edge registers are free to receive set B for the second pass */
1174   setup_spans_adjust_edges_alternate_no(left, right); 
1175   setup_spans_adjust_interpolants_up()
1176   setup_spans_up_down_load_edge_set_b()
1177
1178   setup_spans_prologue_b()
1179
1180
1181  2: 
1182   setup_spans_set_x4_alternate_no(none, up)
1183   subs height_minor_a, height_minor_a, #4
1184   bhi 2b
1185
  /* height_minor_a is now zero or negative: move the three output pointers
   * back by that many entries (8, 16 and 4 bytes each) so the second pass
   * overwrites any surplus spans written by the last group of four */
1186   add span_edge_data, span_edge_data, height_minor_a, lsl #3
1187   add span_uvrg_offset, span_uvrg_offset, height_minor_a, lsl #4
1188   add span_b_offset, span_b_offset, height_minor_a, lsl #2
1189
  /* ---- second pass: downwards, starting again at the original y_a ---- */
1190  4:
1191   add temp, psx_gpu, #psx_gpu_uvrg_offset
1192   vld1.32 { uvrg }, [ temp ]
1193   mov y_a, middle_y
1194   
1195   setup_spans_load_b()
1196
  /* if y_c > viewport_end_y: height_minor_b = height_minor_b - excess + 1 */
1197   ldrh temp, [ psx_gpu, #psx_gpu_viewport_end_y_offset ]
1198   subs y_c, y_c, temp
1199   subgt height_minor_b, height_minor_b, y_c
1200   addgt height_minor_b, height_minor_b, #1
1201
  /* clip = viewport_start_y - y_a, only acted upon when positive */
1202   ldrh temp, [ psx_gpu, #psx_gpu_viewport_start_y_offset ]
1203   subs clip, temp, y_a
1204   ble 0f
1205
1206   sub height_minor_b, height_minor_b, clip
1207   add y_a, y_a, clip
1208   setup_spans_clip(increment, no)
1209
1210  0:
1211   cmp height_minor_b, #0
1212   ble 1f
1213
  /* pack y_a, y_a + 1, y_a + 2, y_a + 3 as 16-bit halves into y_x4 */
1214   orr temp, y_a, y_a, lsl #16
1215   add temp, temp, #(1 << 16) 
1216   add y_a, temp, #2
1217   add y_a, y_a, #(2 << 16)
1218   vmov.u32 y_x4, temp, y_a
1219
1220   setup_spans_adjust_edges_alternate_no(left, right)
1221
  /* num_spans += height_minor_b */
1222   ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1223   add temp, temp, height_minor_b
1224   strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
1225
1226  2:                                                     
1227   setup_spans_set_x4_alternate_no(none, down)
1228   subs height_minor_b, height_minor_b, #4
1229   bhi 2b
1230
1231  1:
1232   setup_spans_epilogue()
1233
  /* first pass produced nothing: still switch to edge set B and run the
   * second-stage prologue, then join the second pass */
1234  3:
1235   setup_spans_up_down_load_edge_set_b()
1236   setup_spans_prologue_b()
1237   bal 4b
1238
1239 .pool
1240 /* Drop the aliases used above that are given new registers below. */
1241 #undef span_uvrg_offset
1242 #undef span_edge_data
1243 #undef span_b_offset
1244 #undef left_x
1245 #undef b
1246 /* Core register roles for the setup_blocks_* functions. */
1247 #define psx_gpu                                  r0
1248 #define num_spans                                r1
1249 #define span_uvrg_offset                         r2
1250 #define span_edge_data                           r3
1251 #define span_b_offset                            r4
1252 #define b_dx                                     r5
1253 #define span_num_blocks                          r6
1254 #define y                                        r7
1255 #define left_x                                   r8
1256 #define b                                        r9
1257 #define dither_offset_ptr                        r10
1258 #define block_ptr_a                              r11
1259 #define fb_ptr                                   r12
1260 #define num_blocks                               r14
1261 /* Further names for r2, r3, r8 and r10, which are already assigned above. */
1262 #define uvrg_dx_ptr                              r2
1263 #define texture_mask_ptr                         r3
1264 #define dither_shift                             r8
1265 #define dither_row                               r10
1266 /* Further names for r7-r10. */
1267 #define c_32                                     r7
1268 #define b_dx4                                    r8
1269 #define b_dx8                                    r9
1270 #define block_ptr_b                              r10
1271 /* Yet more names for r10 and r8. */
1272 #define block_span_ptr                           r10
1273 #define right_mask                               r8
1274 /* Color names for r2-r5. */
1275 #define color                                    r2
1276 #define color_r                                  r3
1277 #define color_g                                  r4
1278 #define color_b                                  r5
1279 /* uvrg is defined again, as q9, further down. */
1280 #undef uvrg
1281 /* One q register per component; the builders operate on them as .u32. */
1282 #define u_block                                  q0
1283 #define v_block                                  q1
1284 #define r_block                                  q2
1285 #define g_block                                  q3
1286 #define b_block                                  q4
1287 /* d10-d13 are the halves of q5/q6 (uvrg_dx4/uvrg_dx8); then d14 and d15. */
1288 #define uv_dx4                                   d10
1289 #define rg_dx4                                   d11
1290 #define uv_dx8                                   d12
1291 #define rg_dx8                                   d13
1292 #define b_whole_8                                d14
1293 #define fb_mask_ptrs                             d15
1294 /* uv_dx8 and rg_dx8 are repeated here with the same values as above. */
1295 #define uvrg_dx4                                 q5
1296 #define uvrg_dx8                                 q6
1297 #define uv_dx8                                   d12
1298 #define rg_dx8                                   d13
1299 /* q8-q12, followed by their d-register halves. */
1300 #define u_whole                                  q8
1301 #define v_whole                                  q9
1302 #define r_whole                                  q10
1303 #define g_whole                                  q11
1304 #define b_whole                                  q12
1305
1306 #define u_whole_low                              d16
1307 #define u_whole_high                             d17
1308 #define v_whole_low                              d18
1309 #define v_whole_high                             d19
1310 #define r_whole_low                              d20
1311 #define r_whole_high                             d21
1312 #define g_whole_low                              d22
1313 #define g_whole_high                             d23
1314 #define b_whole_low                              d24
1315 #define b_whole_high                             d25
1316 /* dx4 and dx8 share q13. */
1317 #define dx4                                      q13
1318 #define dx8                                      q13
1319 /* d26/d27 are the halves of q13; d24/d25 are the halves of b_whole (q12). */
1320 #define u_whole_8                                d26
1321 #define v_whole_8                                d27
1322 #define u_whole_8b                               d24
1323 #define r_whole_8                                d24
1324 #define g_whole_8                                d25
1325
1326 #define uv_whole_8                               q13
1327 #define uv_whole_8b                              q14
1328 /* dither_offsets shares q14 with uv_whole_8b. */
1329 #define dither_offsets                           q14
1330 #define texture_mask                             q15
1331 #define texture_mask_u                           d30
1332 #define texture_mask_v                           d31
1333
1334 #define dither_offsets_short                     d28
1335 /* These three share q8-q10 with u_whole, v_whole and r_whole. */
1336 #define v_left_x                                 q8
1337 #define uvrg                                     q9
1338 #define block_span                               q10
1339 /* The two halves of uvrg (q9). */
1340 #define uv                                       d18
1341 #define rg                                       d19
1342 /* test_mask and draw_mask share q0/q1 with u_block and v_block. */
1343 #define draw_mask                                q1
1344 #define draw_mask_edge                           q13
1345 #define test_mask                                q0
1346 /* uvrg_dx shares q3 with g_block. */
1347 #define uvrg_dx                                  q3
1348 /* colors shares q2 with r_block. */
1349 #define colors                                   q2
1350 /* Per byte: exchange the high nibble of u with the low nibble of v. */
1351 #define setup_blocks_texture_swizzled()                                        \
1352   vand.u8 u_whole_8b, u_whole_8, texture_mask_u;                               \
1353   vsli.u8 u_whole_8, v_whole_8, #4;                                            \
1354   vsri.u8 v_whole_8, u_whole_8b, #4                                            \
1355 /* u = (v << 4) | (u & 0x0f); v = (v & 0xf0) | ((old u & mask_u) >> 4). */
1356 #define setup_blocks_texture_unswizzled()                                      \
1357 /* Empty body: u and v are left unchanged. */
1358 /* Builds one 64-byte block per 8 pixels for each shaded, textured span. */
1359 #define setup_blocks_shaded_textured_builder(swizzling)                        \
1360 .align 3;                                                                      \
1361   /* In: r0 = psx_gpu. Appends blocks at psx_gpu_blocks_offset.              */\
1362 function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
1363   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
1364   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1365   /* uvrg_dx: per-pixel steps of u, v, r and g along x.                      */\
1366   vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ];                                 \
1367   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1368   /* No spans: return before anything is pushed.                             */\
1369   cmp num_spans, #0;                                                           \
1370   bxeq lr;                                                                     \
1371   /* NOTE(review): q4-q7 are written below but not saved here.               */\
1372   stmdb sp!, { r4 - r11, r14 };                                                \
1373   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1374   /* uvrg_dx4 / uvrg_dx8: gradient steps for 4 and 8 pixels.                 */\
1375   ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ];                                 \
1376   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1377   /* Replicate the two texture mask bytes into every lane of d30/d31.        */\
1378   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ];   \
1379   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1380                                                                                \
1381   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1382   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1383                                                                                \
1384   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
1385   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1386   /* block_ptr_a = blocks base + num_blocks * 64.                            */\
1387   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1388   /* 0: once per span.                                                       */\
1389  0:                                                                            \
1390   vmov.u8 fb_mask_ptrs, #0;                                                    \
1391   /* Lane 0 stays 0 except for the last block of a span (see 5:).            */\
1392   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
1393   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1394                                                                                \
1395   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
1396   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ];                           \
1397   /* Spans without blocks skip straight to 1:.                               */\
1398   cmp span_num_blocks, #0;                                                     \
1399   beq 1f;                                                                      \
1400                                                                                \
1401   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
1402   add num_blocks, span_num_blocks, num_blocks;                                 \
1403   /* More than MAX_BLOCKS pending: flush the buffer first (2:).              */\
1404   cmp num_blocks, #MAX_BLOCKS;                                                 \
1405   bgt 2f;                                                                      \
1406                                                                                \
1407  3:                                                                            \
1408   ldr b, [ span_b_offset ];                                                    \
1409   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1410   /* fb_ptr += y * 2048 + left_x * 2; the dither row index is y & 3.         */\
1411   vdup.u32 v_left_x, left_x;                                                   \
1412   and y, y, #0x3;                                                              \
1413                                                                                \
1414   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
1415   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1416   /* b += b_dx * left_x; uvrg gets the same treatment below.                 */\
1417   mla b, b_dx, left_x, b;                                                      \
1418   and dither_shift, left_x, #0x03;                                             \
1419   /* uvrg_dx shares q3 with g_block, so rebuild it from uvrg_dx4.            */\
1420   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
1421   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1422   /* dither_shift = (left_x & 3) * 8, a rotate count in bits.                */\
1423   mov dither_shift, dither_shift, lsl #3;                                      \
1424   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1425                                                                                \
1426   mov c_32, #32;                                                               \
1427   subs span_num_blocks, span_num_blocks, #1;                                   \
1428   /* The flags of the subs above are consumed by the beq 5f below.           */\
1429   mov dither_row, dither_row, ror dither_shift;                                \
1430   mov b_dx4, b_dx, lsl #2;                                                     \
1431                                                                                \
1432   vdup.u32 dither_offsets_short, dither_row;                                   \
1433   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1434                                                                                \
1435   vdup.u32 b_block, b;                                                         \
1436   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1437   /* Dither bytes widened to signed 16 bits and shifted left by 4.           */\
1438   vdup.u32 u_block, uv[0];                                                     \
1439   mov b_dx8, b_dx, lsl #3;                                                     \
1440                                                                                \
1441   vdup.u32 v_block, uv[1];                                                     \
1442   vdup.u32 r_block, rg[0];                                                     \
1443   vdup.u32 g_block, rg[1];                                                     \
1444   /* Add the u, v, r, g and b block span vectors, 16 bytes apart.            */\
1445   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1446                                                                                \
1447   vadd.u32 u_block, u_block, block_span;                                       \
1448   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1449                                                                                \
1450   vadd.u32 v_block, v_block, block_span;                                       \
1451   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1452                                                                                \
1453   vadd.u32 r_block, r_block, block_span;                                       \
1454   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1455                                                                                \
1456   vadd.u32 g_block, g_block, block_span;                                       \
1457   vld1.u32 { block_span }, [ block_span_ptr, :128 ];                           \
1458                                                                                \
1459   vadd.u32 b_block, b_block, block_span;                                       \
1460   add block_ptr_b, block_ptr_a, #16;                                           \
1461   /* Lanes 0-3: top 16 bits of each value; lanes 4-7: after + dx4.           */\
1462   vshrn.u32 u_whole_low, u_block, #16;                                         \
1463   vshrn.u32 v_whole_low, v_block, #16;                                         \
1464   vshrn.u32 r_whole_low, r_block, #16;                                         \
1465   vshrn.u32 g_whole_low, g_block, #16;                                         \
1466                                                                                \
1467   vdup.u32 dx4, uv_dx4[0];                                                     \
1468   vshrn.u32 b_whole_low, b_block, #16;                                         \
1469                                                                                \
1470   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1471   vdup.u32 dx4, uv_dx4[1];                                                     \
1472                                                                                \
1473   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1474   vdup.u32 dx4, rg_dx4[0];                                                     \
1475                                                                                \
1476   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1477   vdup.u32 dx4, rg_dx4[1];                                                     \
1478                                                                                \
1479   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1480   vdup.u32 dx4, b_dx4;                                                         \
1481                                                                                \
1482   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1483   vdup.u32 dx8, uv_dx8[0];                                                     \
1484   /* Move the 32-bit accumulators on by 8 pixels (+ dx8).                    */\
1485   vadd.u32 u_block, u_block, dx8;                                              \
1486   vdup.u32 dx8, uv_dx8[1];                                                     \
1487                                                                                \
1488   vadd.u32 v_block, v_block, dx8;                                              \
1489   vdup.u32 dx8, rg_dx8[0];                                                     \
1490                                                                                \
1491   vadd.u32 r_block, r_block, dx8;                                              \
1492   vdup.u32 dx8, rg_dx8[1];                                                     \
1493                                                                                \
1494   vadd.u32 g_block, g_block, dx8;                                              \
1495   vdup.u32 dx8, b_dx8;                                                         \
1496                                                                                \
1497   vadd.u32 b_block, b_block, dx8;                                              \
1498   vmovn.u16 u_whole_8, u_whole;                                                \
1499   /* vmovn keeps the low byte of every 16-bit lane.                          */\
1500   vmovn.u16 v_whole_8, v_whole;                                                \
1501                                                                                \
1502   vmovn.u16 b_whole_8, b_whole;                                                \
1503   pld [ fb_ptr ];                                                              \
1504   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1505   /* Mask u and v with the texture mask, then swizzle if requested.          */\
1506   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1507   setup_blocks_texture_##swizzling();                                          \
1508                                                                                \
1509   vmovn.u16 r_whole_8, r_whole;                                                \
1510   beq 5f;                                                                      \
1511   /* 4: store the finished block while preparing the next one.               */\
1512  4:                                                                            \
1513   vmovn.u16 g_whole_8, g_whole;                                                \
1514   vshrn.u32 u_whole_low, u_block, #16;                                         \
1515   /* Layout: +0 u/v pairs, +16 r|g, +32 b|mask|fb_ptr, +48 dither.           */\
1516   vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32;               \
1517   vshrn.u32 v_whole_low, v_block, #16;                                         \
1518                                                                                \
1519   vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32;              \
1520   vshrn.u32 r_whole_low, r_block, #16;                                         \
1521                                                                                \
1522   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1523   vshrn.u32 g_whole_low, g_block, #16;                                         \
1524                                                                                \
1525   vdup.u32 dx4, uv_dx4[0];                                                     \
1526   vshrn.u32 b_whole_low, b_block, #16;                                         \
1527                                                                                \
1528   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1529   vdup.u32 dx4, uv_dx4[1];                                                     \
1530                                                                                \
1531   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1532   vdup.u32 dx4, rg_dx4[0];                                                     \
1533                                                                                \
1534   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
1535   vdup.u32 dx4, rg_dx4[1];                                                     \
1536                                                                                \
1537   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
1538   vdup.u32 dx4, b_dx4;                                                         \
1539                                                                                \
1540   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
1541   vdup.u32 dx8, uv_dx8[0];                                                     \
1542                                                                                \
1543   vadd.u32 u_block, u_block, dx8;                                              \
1544   vdup.u32 dx8, uv_dx8[1];                                                     \
1545                                                                                \
1546   vadd.u32 v_block, v_block, dx8;                                              \
1547   vdup.u32 dx8, rg_dx8[0];                                                     \
1548                                                                                \
1549   vadd.u32 r_block, r_block, dx8;                                              \
1550   vdup.u32 dx8, rg_dx8[1];                                                     \
1551                                                                                \
1552   vadd.u32 g_block, g_block, dx8;                                              \
1553   vdup.u32 dx8, b_dx8;                                                         \
1554                                                                                \
1555   vadd.u32 b_block, b_block, dx8;                                              \
1556   vmovn.u16 u_whole_8, u_whole;                                                \
1557   /* fb_ptr moves on by 8 pixels (16 bytes) for the next block.              */\
1558   add fb_ptr, fb_ptr, #16;                                                     \
1559   vmovn.u16 v_whole_8, v_whole;                                                \
1560                                                                                \
1561   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1562   vmovn.u16 b_whole_8, b_whole;                                                \
1563                                                                                \
1564   pld [ fb_ptr ];                                                              \
1565                                                                                \
1566   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1567   subs span_num_blocks, span_num_blocks, #1;                                   \
1568                                                                                \
1569   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1570   setup_blocks_texture_##swizzling();                                          \
1571                                                                                \
1572   vmovn.u16 r_whole_8, r_whole;                                                \
1573   bne 4b;                                                                      \
1574   /* 5: last block of the span, limited by right_mask.                       */\
1575  5:                                                                            \
1576   vmovn.u16 g_whole_8, g_whole;                                                \
1577   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
1578                                                                                \
1579   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
1580   vdup.u8 draw_mask, right_mask;                                               \
1581   /* Lanes become all ones where right_mask & test_mask is non-zero.         */\
1582   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1583   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1584   vzip.u8 u_whole_8, v_whole_8;                                                \
1585   /* Zero the interleaved u/v pair of every lane flagged above.              */\
1586   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1587   vst1.u32 { r_whole_8, g_whole_8 }, [ block_ptr_b, :128 ], c_32;              \
1588   vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32;                        \
1589   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1590   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1591   /* 1: step to the uvrg, b and edge data entries of the next span.          */\
1592  1:                                                                            \
1593   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1594   add span_b_offset, span_b_offset, #4;                                        \
1595                                                                                \
1596   add span_edge_data, span_edge_data, #8;                                      \
1597   subs num_spans, num_spans, #1;                                               \
1598                                                                                \
1599   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1600   bne 0b;                                                                      \
1601                                                                                \
1602   ldmia sp!, { r4 - r11, pc };                                                 \
1603   /* 2: buffer would overflow; flush it and restart at its base.             */\
1604  2:                                                                            \
1605   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1606   vpush { texture_mask };                                                      \
1607   vpush { uvrg_dx4 };                                                          \
1608                                                                                \
1609   stmdb sp!, { r0 - r3, r12, r14 };                                            \
1610   bl flush_render_block_buffer;                                                \
1611   ldmia sp!, { r0 - r3, r12, r14 };                                            \
1612                                                                                \
1613   vpop { uvrg_dx4 };                                                           \
1614   vpop { texture_mask };                                                       \
1615   /* uvrg_dx8 is recomputed as uvrg_dx4 + uvrg_dx4, not restored.            */\
1616   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1617   vmov.u8 fb_mask_ptrs, #0;                                                    \
1618   /* Only the blocks of the current span are counted from here on.           */\
1619   mov num_blocks, span_num_blocks;                                             \
1620   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1621   bal 3b                                                                       \
1622
1623 /* Expand the builder once for each texture swizzling variant. */
1624 setup_blocks_shaded_textured_builder(swizzled)
1625 setup_blocks_shaded_textured_builder(unswizzled)
1626
1627
1628 #define setup_blocks_unshaded_textured_builder(swizzling)                      \
1629 .align 3;                                                                      \
1630                                                                                \
1631 function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
1632   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
1633   add uvrg_dx_ptr, psx_gpu, #psx_gpu_uvrg_dx_offset;                           \
1634                                                                                \
1635   vld1.u32 { uvrg_dx }, [ uvrg_dx_ptr, :128 ];                                 \
1636   add texture_mask_ptr, psx_gpu, #psx_gpu_texture_mask_width_offset;           \
1637                                                                                \
1638   cmp num_spans, #0;                                                           \
1639   bxeq lr;                                                                     \
1640                                                                                \
1641   stmdb sp!, { r4 - r11, r14 };                                                \
1642   vshl.u32 uvrg_dx4, uvrg_dx, #2;                                              \
1643                                                                                \
1644   vshl.u32 uvrg_dx8, uvrg_dx, #3;                                              \
1645                                                                                \
1646   vld2.u8 { texture_mask_u[], texture_mask_v[] }, [ texture_mask_ptr, :16 ];   \
1647   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
1648                                                                                \
1649   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1650   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
1651                                                                                \
1652   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1653                                                                                \
1654   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
1655                                                                                \
1656  0:                                                                            \
1657   vmov.u8 fb_mask_ptrs, #0;                                                    \
1658                                                                                \
1659   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
1660   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
1661                                                                                \
1662   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
1663   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ];                           \
1664                                                                                \
1665   cmp span_num_blocks, #0;                                                     \
1666   beq 1f;                                                                      \
1667                                                                                \
1668   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
1669   add num_blocks, span_num_blocks, num_blocks;                                 \
1670                                                                                \
1671   cmp num_blocks, #MAX_BLOCKS;                                                 \
1672   bgt 2f;                                                                      \
1673                                                                                \
1674  3:                                                                            \
1675   add fb_ptr, fb_ptr, y, lsl #11;                                              \
1676                                                                                \
1677   vdup.u32 v_left_x, left_x;                                                   \
1678   and y, y, #0x3;                                                              \
1679                                                                                \
1680   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
1681   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
1682                                                                                \
1683   and dither_shift, left_x, #0x03;                                             \
1684                                                                                \
1685   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
1686   vshr.u32 uvrg_dx, uvrg_dx4, #2;                                              \
1687                                                                                \
1688   mov dither_shift, dither_shift, lsl #3;                                      \
1689   vmla.u32 uvrg, uvrg_dx, v_left_x;                                            \
1690                                                                                \
1691   mov c_32, #32;                                                               \
1692   subs span_num_blocks, span_num_blocks, #1;                                   \
1693                                                                                \
1694   mov dither_row, dither_row, ror dither_shift;                                \
1695                                                                                \
1696   vdup.u32 dither_offsets_short, dither_row;                                   \
1697   add block_span_ptr, psx_gpu, #psx_gpu_u_block_span_offset;                   \
1698                                                                                \
1699   vshll.s8 dither_offsets, dither_offsets_short, #4;                           \
1700                                                                                \
1701   vdup.u32 u_block, uv[0];                                                     \
1702                                                                                \
1703   vdup.u32 v_block, uv[1];                                                     \
1704   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1705                                                                                \
1706   vadd.u32 u_block, u_block, block_span;                                       \
1707   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
1708                                                                                \
1709   vadd.u32 v_block, v_block, block_span;                                       \
1710   add block_ptr_b, block_ptr_a, #16;                                           \
1711                                                                                \
1712   vshrn.u32 u_whole_low, u_block, #16;                                         \
1713   vshrn.u32 v_whole_low, v_block, #16;                                         \
1714                                                                                \
1715   vdup.u32 dx4, uv_dx4[0];                                                     \
1716                                                                                \
1717   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1718   vdup.u32 dx4, uv_dx4[1];                                                     \
1719                                                                                \
1720   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1721   vdup.u32 dx8, uv_dx8[0];                                                     \
1722                                                                                \
1723   vadd.u32 u_block, u_block, dx8;                                              \
1724   vdup.u32 dx8, uv_dx8[1];                                                     \
1725                                                                                \
1726   vadd.u32 v_block, v_block, dx8;                                              \
1727   vmovn.u16 u_whole_8, u_whole;                                                \
1728                                                                                \
1729   vmovn.u16 v_whole_8, v_whole;                                                \
1730                                                                                \
1731   pld [ fb_ptr ];                                                              \
1732   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1733                                                                                \
1734   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1735   setup_blocks_texture_##swizzling();                                          \
1736                                                                                \
1737   beq 5f;                                                                      \
1738                                                                                \
1739  4:                                                                            \
1740   vshrn.u32 u_whole_low, u_block, #16;                                         \
1741                                                                                \
1742   vst2.u8 { u_whole_8, v_whole_8 }, [ block_ptr_a, :128 ], c_32;               \
1743   vshrn.u32 v_whole_low, v_block, #16;                                         \
1744                                                                                \
1745   add block_ptr_b, block_ptr_b, #32;                                           \
1746   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1747                                                                                \
1748   vdup.u32 dx4, uv_dx4[0];                                                     \
1749   vaddhn.u32 u_whole_high, u_block, dx4;                                       \
1750   vdup.u32 dx4, uv_dx4[1];                                                     \
1751                                                                                \
1752   vaddhn.u32 v_whole_high, v_block, dx4;                                       \
1753   vdup.u32 dx8, uv_dx8[0];                                                     \
1754                                                                                \
1755   vadd.u32 u_block, u_block, dx8;                                              \
1756   vdup.u32 dx8, uv_dx8[1];                                                     \
1757                                                                                \
1758   vadd.u32 v_block, v_block, dx8;                                              \
1759   vmovn.u16 u_whole_8, u_whole;                                                \
1760                                                                                \
1761   add fb_ptr, fb_ptr, #16;                                                     \
1762   vmovn.u16 v_whole_8, v_whole;                                                \
1763                                                                                \
1764   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1765   pld [ fb_ptr ];                                                              \
1766                                                                                \
1767   vmov.u32 fb_mask_ptrs[1], fb_ptr;                                            \
1768   subs span_num_blocks, span_num_blocks, #1;                                   \
1769                                                                                \
1770   vand.u8 uv_whole_8, uv_whole_8, texture_mask;                                \
1771   setup_blocks_texture_##swizzling();                                          \
1772                                                                                \
1773   bne 4b;                                                                      \
1774                                                                                \
1775  5:                                                                            \
1776   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
1777                                                                                \
1778   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
1779   vdup.u8 draw_mask, right_mask;                                               \
1780                                                                                \
1781   vmov.u32 fb_mask_ptrs[0], right_mask;                                        \
1782   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
1783   vzip.u8 u_whole_8, v_whole_8;                                                \
1784                                                                                \
1785   vbic.u16 uv_whole_8, uv_whole_8, draw_mask;                                  \
1786   add block_ptr_b, block_ptr_b, #32;                                           \
1787   vst1.u32 { uv_whole_8 }, [ block_ptr_a, :128 ], c_32;                        \
1788   vst1.u32 { dither_offsets }, [ block_ptr_b, :128 ], c_32;                    \
1789   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32;           \
1790                                                                                \
1791  1:                                                                            \
1792   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
1793   add span_edge_data, span_edge_data, #8;                                      \
1794   subs num_spans, num_spans, #1;                                               \
1795                                                                                \
1796   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
1797   bne 0b;                                                                      \
1798                                                                                \
1799   ldmia sp!, { r4 - r11, pc };                                                 \
1800                                                                                \
1801  2:                                                                            \
1802   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
1803   vpush { texture_mask };                                                      \
1804   vpush { uvrg_dx4 };                                                          \
1805                                                                                \
1806   stmdb sp!, { r0 - r3, r12, r14 };                                            \
1807   bl flush_render_block_buffer;                                                \
1808   ldmia sp!, { r0 - r3, r12, r14 };                                            \
1809                                                                                \
1810   vpop { uvrg_dx4 };                                                           \
1811   vpop { texture_mask };                                                       \
1812                                                                                \
1813   vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4;                                       \
1814   vmov.u8 fb_mask_ptrs, #0;                                                    \
1815                                                                                \
1816   mov num_blocks, span_num_blocks;                                             \
1817   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
1818   bal 3b                                                                       \
1819
1820   /* Expand the builder macro above twice; the argument is token-pasted into setup_blocks_texture_##swizzling() inside the macro body. */
1821 setup_blocks_unshaded_textured_builder(swizzled)
1822 setup_blocks_unshaded_textured_builder(unswizzled)
1823
1824
1825 .align 3
1826   /* For every span, appends one 64-byte entry per block to the block array inside psx_gpu, all carrying the flat triangle color; calls flush_render_block_buffer when a span would push num_blocks past MAX_BLOCKS. */
1827 function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
1828   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1829   veor.u32 draw_mask, draw_mask, draw_mask   /* all-zero mask, stored for every block except the last of a span */
1830
1831   cmp num_spans, #0
1832   bxeq lr   /* no spans: return before anything has been pushed */
1833
1834   stmdb sp!, { r4 - r11, r14 }
1835   vld1.u32 { test_mask }, [ psx_gpu, :128 ]   /* read from the start of psx_gpu (psx_gpu_test_mask_offset is 0) */
1836   /* Pack the color: upper 5 bits of each of the three low bytes -> r | g << 5 | b << 10. */
1837   ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1838
1839   ubfx color_r, color, #3, #5   /* bits 3-7 */
1840   ubfx color_g, color, #11, #5   /* bits 11-15 */
1841   ubfx color_b, color, #19, #5   /* bits 19-23 */
1842
1843   orr color, color_r, color_b, lsl #10
1844   orr color, color, color_g, lsl #5
1845
1846   vdup.u16 colors, color   /* the packed color replicated into every 16-bit lane */
1847
1848   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
1849   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1850   /* block_ptr_a = block array base + num_blocks * 64, i.e. the first entry past those already counted. */
1851   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1852   add block_ptr_a, block_ptr_a, num_blocks, lsl #6
1853
1854  0:   /* per-span loop */
1855   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1856   ldrh y, [ span_edge_data, #edge_data_y_offset ]
1857
1858   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
1859
1860   cmp span_num_blocks, #0
1861   beq 1f   /* span has no blocks: move on to the next edge entry */
1862
1863   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1864   add num_blocks, span_num_blocks, num_blocks
1865
1866   cmp num_blocks, #MAX_BLOCKS
1867   bgt 2f   /* this span would exceed MAX_BLOCKS entries: flush first */
1868
1869  3:   /* also re-entered from the flush path at 2: */
1870   add fb_ptr, fb_ptr, y, lsl #11   /* + y * 2048 bytes */
1871   and y, y, #0x3   /* NOTE(review): the masked y is not read again anywhere in this function */
1872
1873   add fb_ptr, fb_ptr, left_x, lsl #1   /* + left_x * 2 bytes */
1874   mov c_32, #32   /* post-increment stride used by the entry stores below */
1875
1876   subs span_num_blocks, span_num_blocks, #1   /* Z is consumed by the beq below; nothing in between writes the flags */
1877
1878   add block_ptr_b, block_ptr_a, #16
1879   pld [ fb_ptr ]
1880
1881   vmov.u32 fb_mask_ptrs[1], fb_ptr
1882   beq 5f   /* one-block span: only the edge block is emitted */
1883
1884  4:   /* every block of the span except the last */
1885   vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_32   /* entry + 0: zero mask */
1886   vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32   /* entry + 16: colors */
1887   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32   /* entry + 32; block_ptr_a now points at the next entry */
1888   /* NOTE(review): b_whole_8 and lane 0 of fb_mask_ptrs are never written under those names in this function - presumably ignored by whatever consumes these entries; confirm. */
1889   add fb_ptr, fb_ptr, #16   /* 16 bytes of VRAM per block */
1890   add block_ptr_b, block_ptr_b, #32   /* together with the post-increment above: + 64, next entry + 16 */
1891
1892   pld [ fb_ptr ]
1893
1894   vmov.u32 fb_mask_ptrs[1], fb_ptr
1895   subs span_num_blocks, span_num_blocks, #1
1896
1897   bne 4b
1898
1899  5:   /* last block of the span: its mask is derived from the edge data */
1900   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1901
1902   vdup.u8 draw_mask_edge, right_mask
1903   vtst.u16 draw_mask_edge, draw_mask_edge, test_mask   /* each 16-bit lane becomes all ones where (lane & test_mask) != 0, else zero */
1904
1905   vst1.u32 { colors }, [ block_ptr_b, :128 ], c_32
1906   vst1.u32 { draw_mask_edge }, [ block_ptr_a, :128 ], c_32
1907   add block_ptr_b, block_ptr_b, #32
1908   vst1.u32 { b_whole_8, fb_mask_ptrs }, [ block_ptr_a, :128 ], c_32
1909
1910  1:
1911   add span_edge_data, span_edge_data, #8   /* edge entries are stepped 8 bytes apart */
1912   subs num_spans, num_spans, #1
1913
1914   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]   /* write the running count back; strh does not disturb the flags from subs */
1915   bne 0b
1916
1917   ldmia sp!, { r4 - r11, pc }
1918   /* Flush path: save what must survive the call, flush, then restart at the beginning of the block array. */
1919  2:
1920   vpush { colors }
1921   /* NOTE(review): 9 words pushed at entry + the vpush + 6 words here leave sp 4 bytes off an 8-byte boundary at the bl if sp was 8-byte aligned on entry - confirm the callee tolerates that. */
1922   stmdb sp!, { r0 - r3, r12, r14 }
1923   bl flush_render_block_buffer
1924   ldmia sp!, { r0 - r3, r12, r14 }
1925
1926   vpop { colors }
1927   /* colors was saved across the call; test_mask and draw_mask are rebuilt instead of saved. */
1928   vld1.u32 { test_mask }, [ psx_gpu, :128 ]
1929   veor.u32 draw_mask, draw_mask, draw_mask
1930   /* Count only this span's blocks from here on and write them from the start of the block array. */
1931   mov num_blocks, span_num_blocks
1932   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset
1933   bal 3b
1934
1935
1936 #define mask_msb_scalar                                   r14   /* r14 is lr; the function below loads into it only after pushing r14 */
1937
1938 #define msb_mask                                          q15
1939
1940 #define pixels_low                                        d16
1941   /* d30 and d31 are the low and high halves of q15 (msb_mask). */
1942 #define msb_mask_low                                      d30
1943 #define msb_mask_high                                     d31
1944
1945
1946 .align 3
1947   /* Fills every span with the flat triangle color by storing directly through psx_gpu's VRAM pointer; it does not touch the block array or num_blocks. */
1948 function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
1949   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ]
1950
1951   cmp num_spans, #0
1952   bxeq lr   /* no spans: return before anything has been pushed */
1953
1954   stmdb sp!, { r4 - r11, r14 }
1955   /* Pack the color: upper 5 bits of each of the three low bytes -> r | g << 5 | b << 10, then OR in mask_msb. */
1956   ldr color, [ psx_gpu, #psx_gpu_triangle_color_offset ]
1957
1958   ubfx color_r, color, #3, #5
1959   ubfx color_g, color, #11, #5
1960
1961   ldrh mask_msb_scalar, [ psx_gpu, #psx_gpu_mask_msb_offset ]   /* mask_msb_scalar is r14, free now that lr is on the stack */
1962   ubfx color_b, color, #19, #5
1963
1964   orr color, color_r, color_b, lsl #10
1965   orr color, color, color_g, lsl #5
1966   orr color, color, mask_msb_scalar
1967
1968   vdup.u16 colors, color   /* vector copy of the pixel, used for whole-block stores */
1969
1970   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
1971   orr color, color, lsl #16   /* same pixel in both halfwords, so one str writes two pixels */
1972
1973
1974  0:   /* per-span loop */
1975   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
1976   ldrh y, [ span_edge_data, #edge_data_y_offset ]
1977
1978   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
1979
1980   cmp span_num_blocks, #0
1981   beq 1f   /* span has no blocks: move on to the next edge entry */
1982
1983   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ]
1984
1985   add fb_ptr, fb_ptr, y, lsl #11   /* + y * 2048 bytes */
1986   subs span_num_blocks, span_num_blocks, #1   /* Z is consumed by the beq below; the add in between leaves flags alone */
1987
1988   add fb_ptr, fb_ptr, left_x, lsl #1   /* + left_x * 2 bytes */
1989   beq 3f   /* one-block span: go straight to the masked last block */
1990
1991  2:   /* every block of the span except the last: one whole-vector store each */
1992   vst1.u32 { colors }, [ fb_ptr ]!   /* writeback advances fb_ptr past the bytes just stored */
1993   subs span_num_blocks, span_num_blocks, #1
1994
1995   bne 2b
1996
1997  3:   /* last block of the span, limited by right_mask */
1998   ldrb right_mask, [ span_edge_data, #edge_data_right_mask_offset ]
1999
2000   cmp right_mask, #0x0
2001   beq 5f   /* mask of 0: the whole block is written at 5: */
2002   /* Low 4 mask bits clear: write two words (4 pixels) and shift them out; moveq leaves the flags intact for the second streq. */
2003   tst right_mask, #0xF
2004   streq color, [ fb_ptr ], #4
2005   moveq right_mask, right_mask, lsr #4
2006   streq color, [ fb_ptr ], #4
2007   /* Next 2 mask bits clear: one more word (2 pixels). */
2008   tst right_mask, #0x3
2009   streq color, [ fb_ptr ], #4
2010   moveq right_mask, right_mask, lsr #2
2011   /* Next mask bit clear: one more halfword (1 pixel). NOTE(review): this chain only writes while the low bits are clear, so it presumably relies on right_mask having its set bits contiguous from bit 7 downward - confirm against the code that builds the edge data. */
2012   tst right_mask, #0x1
2013   streqh color, [ fb_ptr ]
2014
2015  1:
2016   add span_edge_data, span_edge_data, #8   /* edge entries are stepped 8 bytes apart */
2017   subs num_spans, num_spans, #1
2018   bne 0b
2019
2020   ldmia sp!, { r4 - r11, pc }
2021   /* Unmasked last block: a single whole-vector store without writeback (fb_ptr is reloaded for the next span). */
2022  5:
2023   vst1.u32 { colors }, [ fb_ptr ]
2024   bal 1b
2025
2026
2027 #undef c_64
2028   /* c_64 becomes r7; the shaded builder below loads it with 64 (mov c_64, #64). */
2029 #define c_64                                              r7
2030 #define rg_dx_ptr                                         r2
2031
2032   /* Drop any earlier definitions of the aliases below; every name undefined here is redefined further down for the shaded untextured builders. */
2033 #undef r_block
2034 #undef g_block
2035 #undef b_block
2036 #undef r_whole
2037 #undef g_whole
2038 #undef b_whole
2039 #undef r_whole_low
2040 #undef r_whole_high
2041 #undef g_whole_low
2042 #undef g_whole_high
2043 #undef b_whole_low
2044 #undef b_whole_high
2045 #undef r_whole_8
2046 #undef g_whole_8
2047 #undef b_whole_8
2048 #undef dither_offsets
2049 #undef rg_dx4
2050 #undef rg_dx8
2051 #undef dx4
2052 #undef dx8
2053 #undef v_left_x
2054 #undef uvrg
2055 #undef block_span
2056 #undef rg
2057 #undef draw_mask
2058 #undef test_mask
2059   /* 32-bit per-channel accumulators (the builder below fills them with vdup.u32 and steps them with vadd.u32). */
2060 #define r_block                                           q0
2061 #define g_block                                           q1
2062 #define b_block                                           q2
2063   /* Narrowed channel values; the builder writes them half a register at a time through the *_low / *_high names. */
2064 #define r_whole                                           q3
2065 #define g_whole                                           q4
2066 #define b_whole                                           q5
2067   /* d6:d7 = q3, d8:d9 = q4, d10:d11 = q5. */
2068 #define r_whole_low                                       d6
2069 #define r_whole_high                                      d7
2070 #define g_whole_low                                       d8
2071 #define g_whole_high                                      d9
2072 #define b_whole_low                                       d10
2073 #define b_whole_high                                      d11
2074   /* gb_whole_8 (q6) is g_whole_8 (d12) and b_whole_8 (d13) addressed as one register. */
2075 #define gb_whole_8                                        q6
2076
2077 #define g_whole_8                                         d12
2078 #define b_whole_8                                         d13
2079
2080 #define r_whole_8                                         d14
2081
2082 #define pixels                                            q8
2083   /* rg_dx4 and rg_dx8 are the two halves of q9. */
2084 #define rg_dx4                                            d18
2085 #define rg_dx8                                            d19
2086   /* dx4 and dx8 both name q10 (as does test_mask further down), so each must be reloaded before use. */
2087 #define dx4                                               q10
2088 #define dx8                                               q10
2089   /* Span-setup temporaries that overlay the *_whole registers: v_left_x = d6 (r_whole_low), uvrg = q4 (g_whole), block_span = q5 (b_whole). */
2090 #define v_left_x                                          d6
2091 #define uvrg                                              q4
2092 #define block_span                                        q5
2093   /* rg = d9, the upper half of uvrg (q4). */
2094 #define rg                                                d9
2095   /* Constant registers; the builder below fills them per byte with 1, 128, 4 and 0x7 respectively. */
2096 #define d64_1                                             d22
2097 #define d64_128                                           d23
2098
2099 #define d128_4                                            q12
2100 #define d128_0x7                                          q13
2101   /* d64_4 = d24, the low half of d128_4 (q12). */
2102 #define d64_4                                             d24
2103
2104 #define dither_offsets                                    q14
2105 #define draw_mask                                         q15
2106   /* d28 is the low half of dither_offsets (q14). */
2107 #define dither_offsets_low                                d28
2108   /* rg_dx = d0 overlays the low half of r_block (q0). */
2109 #define rg_dx                                             d0
2110 #define test_mask                                         q10
2111   /* Dither hooks: the _dithered pair does an unsigned saturating add of dither_offsets (a) and an unsigned saturating subtract of d64_4 / d128_4 (b) on the 8-bit r and g/b registers; the _undithered pair expands to nothing. Presumably selected through the builder's dithering argument - the call sites are outside this view. */
2112   /* The builder below fills d128_4 with 4 and adds it to dither_offsets beforehand, so a followed by b nets out to the unbiased offset except where either step saturates. */
2113 #define setup_blocks_shaded_untextured_dither_a_dithered()                     \
2114   vqadd.u8 r_whole_8, r_whole_8, dither_offsets_low;                           \
2115   vqadd.u8 gb_whole_8, gb_whole_8, dither_offsets;                             \
2116
2117 #define setup_blocks_shaded_untextured_dither_b_dithered()                     \
2118   vqsub.u8 r_whole_8, r_whole_8, d64_4;                                        \
2119   vqsub.u8 gb_whole_8, gb_whole_8, d128_4                                      \
2120
2121 #define setup_blocks_shaded_untextured_dither_a_undithered()                   \
2122
2123 #define setup_blocks_shaded_untextured_dither_b_undithered()                   \
2124
2125
2126 #define setup_blocks_shaded_untextured_indirect_builder(dithering)             \
2127 .align 3;                                                                      \
2128                                                                                \
2129 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
2130   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
2131   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2132                                                                                \
2133   vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ];                                      \
2134                                                                                \
2135   cmp num_spans, #0;                                                           \
2136   bxeq lr;                                                                     \
2137                                                                                \
2138   stmdb sp!, { r4 - r11, r14 };                                                \
2139   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2140                                                                                \
2141   ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ];                                 \
2142   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2143                                                                                \
2144   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2145                                                                                \
2146   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
2147   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2148                                                                                \
2149   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2150   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2151                                                                                \
2152   add block_ptr_a, block_ptr_a, num_blocks, lsl #6;                            \
2153   vmov.u8 d64_1, #1;                                                           \
2154                                                                                \
2155   vmov.u8 d128_4, #4;                                                          \
2156   vmov.u8 d64_128, #128;                                                       \
2157                                                                                \
2158   vmov.u8 d128_0x7, #0x7;                                                      \
2159                                                                                \
2160  0:                                                                            \
2161   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
2162   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2163                                                                                \
2164   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
2165   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ];                           \
2166                                                                                \
2167   cmp span_num_blocks, #0;                                                     \
2168   beq 1f;                                                                      \
2169                                                                                \
2170   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
2171   add num_blocks, span_num_blocks, num_blocks;                                 \
2172                                                                                \
2173   cmp num_blocks, #MAX_BLOCKS;                                                 \
2174   bgt 2f;                                                                      \
2175                                                                                \
2176  3:                                                                            \
2177   ldr b, [ span_b_offset ];                                                    \
2178   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2179                                                                                \
2180   vdup.u32 v_left_x, left_x;                                                   \
2181   and y, y, #0x3;                                                              \
2182                                                                                \
2183   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
2184   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2185                                                                                \
2186   mla b, b_dx, left_x, b;                                                      \
2187   and dither_shift, left_x, #0x03;                                             \
2188                                                                                \
2189   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
2190   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2191                                                                                \
2192   mov dither_shift, dither_shift, lsl #3;                                      \
2193   vmla.u32 rg, rg_dx, v_left_x;                                                \
2194                                                                                \
2195   mov c_64, #64;                                                               \
2196   subs span_num_blocks, span_num_blocks, #1;                                   \
2197                                                                                \
2198   mov dither_row, dither_row, ror dither_shift;                                \
2199   mov b_dx4, b_dx, lsl #2;                                                     \
2200                                                                                \
2201   vdup.u32 dither_offsets, dither_row;                                         \
2202   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2203                                                                                \
2204   vdup.u32 b_block, b;                                                         \
2205   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2206                                                                                \
2207   mov b_dx8, b_dx, lsl #3;                                                     \
2208   vdup.u32 r_block, rg[0];                                                     \
2209   vdup.u32 g_block, rg[1];                                                     \
2210                                                                                \
2211   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
2212                                                                                \
2213   vadd.u32 r_block, r_block, block_span;                                       \
2214   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
2215                                                                                \
2216   vadd.u32 g_block, g_block, block_span;                                       \
2217   vld1.u32 { block_span }, [ block_span_ptr, :128 ];                           \
2218                                                                                \
2219   vadd.u32 b_block, b_block, block_span;                                       \
2220   add block_ptr_b, block_ptr_a, #16;                                           \
2221                                                                                \
2222   vshrn.u32 r_whole_low, r_block, #16;                                         \
2223   vshrn.u32 g_whole_low, g_block, #16;                                         \
2224   vshrn.u32 b_whole_low, b_block, #16;                                         \
2225   vdup.u32 dx4, rg_dx4[0];                                                     \
2226                                                                                \
2227   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2228   vdup.u32 dx4, rg_dx4[1];                                                     \
2229                                                                                \
2230   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2231   vdup.u32 dx4, b_dx4;                                                         \
2232                                                                                \
2233   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2234   vdup.u32 dx8, rg_dx8[0];                                                     \
2235                                                                                \
2236   vadd.u32 r_block, r_block, dx8;                                              \
2237   vdup.u32 dx8, rg_dx8[1];                                                     \
2238                                                                                \
2239   vadd.u32 g_block, g_block, dx8;                                              \
2240   vdup.u32 dx8, b_dx8;                                                         \
2241                                                                                \
2242   vadd.u32 b_block, b_block, dx8;                                              \
2243                                                                                \
2244   vmovn.u16 r_whole_8, r_whole;                                                \
2245   vmovn.u16 g_whole_8, g_whole;                                                \
2246   vmovn.u16 b_whole_8, b_whole;                                                \
2247                                                                                \
2248   beq 5f;                                                                      \
2249   veor.u32 draw_mask, draw_mask, draw_mask;                                    \
2250                                                                                \
2251  4:                                                                            \
2252   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2253   vshrn.u32 r_whole_low, r_block, #16;                                         \
2254                                                                                \
2255   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2256   vshrn.u32 g_whole_low, g_block, #16;                                         \
2257                                                                                \
2258   vshrn.u32 b_whole_low, b_block, #16;                                         \
2259   str fb_ptr, [ block_ptr_a, #44 ];                                            \
2260                                                                                \
2261   vdup.u32 dx4, rg_dx4[0];                                                     \
2262   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2263   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2264                                                                                \
2265   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2266   vdup.u32 dx4, rg_dx4[1];                                                     \
2267                                                                                \
2268   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2269   vdup.u32 dx4, b_dx4;                                                         \
2270                                                                                \
2271   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2272   vdup.u32 dx8, rg_dx8[0];                                                     \
2273                                                                                \
2274   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2275   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2276   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2277                                                                                \
2278   vadd.u32 r_block, r_block, dx8;                                              \
2279   vdup.u32 dx8, rg_dx8[1];                                                     \
2280                                                                                \
2281   vadd.u32 g_block, g_block, dx8;                                              \
2282   vdup.u32 dx8, b_dx8;                                                         \
2283                                                                                \
2284   vadd.u32 b_block, b_block, dx8;                                              \
2285   add fb_ptr, fb_ptr, #16;                                                     \
2286                                                                                \
2287   vmovn.u16 r_whole_8, r_whole;                                                \
2288   vmovn.u16 g_whole_8, g_whole;                                                \
2289   vmovn.u16 b_whole_8, b_whole;                                                \
2290                                                                                \
2291   vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64;                         \
2292   vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64;                            \
2293                                                                                \
2294   pld [ fb_ptr ];                                                              \
2295                                                                                \
2296   subs span_num_blocks, span_num_blocks, #1;                                   \
2297   bne 4b;                                                                      \
2298                                                                                \
2299  5:                                                                            \
2300   str fb_ptr, [ block_ptr_a, #44 ];                                            \
2301   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2302                                                                                \
2303   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
2304   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2305                                                                                \
2306   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2307   vdup.u8 draw_mask, right_mask;                                               \
2308                                                                                \
2309   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2310   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
2311                                                                                \
2312   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
2313                                                                                \
2314   vmull.u8 pixels, r_whole_8, d64_1;                                           \
2315   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2316   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2317                                                                                \
2318   vst1.u32 { draw_mask }, [ block_ptr_a, :128 ], c_64;                         \
2319   vst1.u32 { pixels }, [ block_ptr_b, :128 ], c_64;                            \
2320                                                                                \
2321  1:                                                                            \
2322   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2323   add span_b_offset, span_b_offset, #4;                                        \
2324                                                                                \
2325   add span_edge_data, span_edge_data, #8;                                      \
2326   subs num_spans, num_spans, #1;                                               \
2327                                                                                \
2328   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
2329   bne 0b;                                                                      \
2330                                                                                \
2331   ldmia sp!, { r4 - r11, pc };                                                 \
2332                                                                                \
2333  2:                                                                            \
2334   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
2335   vpush { rg_dx4 };                                                            \
2336                                                                                \
2337   stmdb sp!, { r0 - r3, r12, r14 };                                            \
2338   bl flush_render_block_buffer;                                                \
2339   ldmia sp!, { r0 - r3, r12, r14 };                                            \
2340                                                                                \
2341   vpop { rg_dx4 };                                                             \
2342                                                                                \
2343   vmov.u8 d64_1, #1;                                                           \
2344   vmov.u8 d128_4, #4;                                                          \
2345   vmov.u8 d64_128, #128;                                                       \
2346   vmov.u8 d128_0x7, #0x7;                                                      \
2347                                                                                \
2348   vadd.u32 rg_dx8, rg_dx4, rg_dx4;                                             \
2349                                                                                \
2350   mov num_blocks, span_num_blocks;                                             \
2351   add block_ptr_a, psx_gpu, #psx_gpu_blocks_offset;                            \
2352   bal 3b                                                                       \
2353
2354
/* Expand the indirect builder macro once per dithering variant. */
2355 setup_blocks_shaded_untextured_indirect_builder(undithered)
2356 setup_blocks_shaded_untextured_indirect_builder(dithered)
2357
2358
/*
 * Register aliases for the direct builder that follows.
 *  - draw_mask is dropped and rebound to q0.
 *  - mask_msb_ptr is r14 (lr), so it is only usable after lr has been saved;
 *    the direct builder pushes r14 before its first use of this alias.
 *  - pixels_low / pixels_high name d16 / d17; the direct builder stores from
 *    them lane by lane when it writes a partial block.
 */
2359 #undef draw_mask
2360
2361 #define mask_msb_ptr                                      r14
2362
2363 #define draw_mask                                         q0
2364 #define pixels_low                                        d16
2365 #define pixels_high                                       d17
2366
2367
2368
/*
 * setup_blocks_shaded_untextured_direct_builder(dithering)
 *
 * Expands to the function
 * setup_blocks_shaded_untextured_<dithering>_unswizzled_direct.
 *
 * psx_gpu is the base register for every field access. The function returns
 * at once when the halfword at psx_gpu_num_spans_offset is zero. Otherwise it
 * walks the spans, and for every span whose block count is non-zero it
 *   - points fb_ptr at vram_ptr + y * 2048 + left_x * 2,
 *   - seeds r/g/b_block from the values read through span_uvrg_offset and
 *     span_b_offset, stepped by left_x and offset by the r/g/b block_span
 *     vectors,
 *   - packs pixels as msb_mask + r * d64_1 + g * d64_4 + b * d64_128 and
 *     stores them straight to fb_ptr,
 *   - stores only the first N halfwords of the final block, where N is the
 *     trailing-zero count of the span's right_mask.
 *
 * The dithering argument only selects which
 * setup_blocks_shaded_untextured_dither_a_ / _b_ helper macros get pasted in;
 * those helpers are defined outside this block.
 *
 * NOTE(review): the tail jump table has entries for N = 1..8 only and nothing
 * here checks that right_mask yields a count in that range - confirm against
 * the code that fills span_edge_data.
 */
2369 #define setup_blocks_shaded_untextured_direct_builder(dithering)               \
2370 .align 3;                                                                      \
2371   /* Load the span count and the upper 64 bits of uvrg_dx (into rg_dx). */     \
2372 function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
2373   ldrh num_spans, [ psx_gpu, #psx_gpu_num_spans_offset ];                      \
2374   add rg_dx_ptr, psx_gpu, #(psx_gpu_uvrg_dx_offset + 8);                       \
2375                                                                                \
2376   vld1.u32 { rg_dx }, [ rg_dx_ptr, :64 ];                                      \
2377   /* No spans: return before anything is pushed on the stack. */               \
2378   cmp num_spans, #0;                                                           \
2379   bxeq lr;                                                                     \
2380   /* Save r4-r11 and lr; the epilogue pops the saved lr into pc. */            \
2381   stmdb sp!, { r4 - r11, r14 };                                                \
2382   vshl.u32 rg_dx4, rg_dx, #2;                                                  \
2383                                                                                \
2384   ldr b_dx, [ psx_gpu, #psx_gpu_b_dx_offset ];                                 \
2385   vshl.u32 rg_dx8, rg_dx, #3;                                                  \
2386   /* Cursors into the per-span data inside psx_gpu, advanced at label 1. */    \
2387   add span_uvrg_offset, psx_gpu, #psx_gpu_span_uvrg_offset_offset;             \
2388   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset;                 \
2389                                                                                \
2390   add span_b_offset, psx_gpu, #psx_gpu_span_b_offset_offset;                   \
2391   vmov.u8 d64_1, #1;                                                           \
2392                                                                                \
2393   vmov.u8 d128_4, #4;                                                          \
2394   vmov.u8 d64_128, #128;                                                       \
2395                                                                                \
2396   vmov.u8 d128_0x7, #0x7;                                                      \
2397   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
2398   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
2399   /* 0: start of the per-span loop. */                                         \
2400  0:                                                                            \
2401   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ];      \
2402   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
2403                                                                                \
2404   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
2405   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ];                           \
2406   /* A span with zero blocks only advances the cursors. */                     \
2407   cmp span_num_blocks, #0;                                                     \
2408   beq 1f;                                                                      \
2409   /* fb_ptr = vram_ptr + y * 2048, then + left_x * 2 further down. */          \
2410   ldrh left_x, [ span_edge_data, #edge_data_left_x_offset ];                   \
2411   add fb_ptr, fb_ptr, y, lsl #11;                                              \
2412                                                                                \
2413   ldr b, [ span_b_offset ];                                                    \
2414   vdup.u32 v_left_x, left_x;                                                   \
2415   and y, y, #0x3;                                                              \
2416   /* dither_row = 32-bit entry (y & 3) of the dither table. */                 \
2417   ldr dither_row, [ dither_offset_ptr, y, lsl #2 ];                            \
2418   add fb_ptr, fb_ptr, left_x, lsl #1;                                          \
2419   /* b += b_dx * left_x; the vmla below applies the same step to rg. */        \
2420   mla b, b_dx, left_x, b;                                                      \
2421   and dither_shift, left_x, #0x03;                                             \
2422   /* Recompute rg_dx as rg_dx4 >> 2 (logical) before the vmla. */              \
2423   vld1.u32 { uvrg }, [ span_uvrg_offset, :128 ];                               \
2424   vshr.u32 rg_dx, rg_dx4, #2;                                                  \
2425   /* dither_shift = (left_x & 3) * 8, used as a rotate count in bits. */       \
2426   mov dither_shift, dither_shift, lsl #3;                                      \
2427   vmla.u32 rg, rg_dx, v_left_x;                                                \
2428   /* Z from this subs feeds "beq 3f"; no flag-setting op intervenes. */        \
2429   subs span_num_blocks, span_num_blocks, #1;                                   \
2430                                                                                \
2431   mov dither_row, dither_row, ror dither_shift;                                \
2432   mov b_dx4, b_dx, lsl #2;                                                     \
2433   /* Broadcast the rotated dither word, then add 4 to each byte. */            \
2434   vdup.u32 dither_offsets, dither_row;                                         \
2435   add block_span_ptr, psx_gpu, #psx_gpu_r_block_span_offset;                   \
2436                                                                                \
2437   vdup.u32 b_block, b;                                                         \
2438   vadd.u8 dither_offsets, dither_offsets, d128_4;                              \
2439                                                                                \
2440   mov b_dx8, b_dx, lsl #3;                                                     \
2441   vdup.u32 r_block, rg[0];                                                     \
2442   vdup.u32 g_block, rg[1];                                                     \
2443   /* Add the three consecutive block_span vectors to r, g and b_block. */      \
2444   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
2445                                                                                \
2446   vadd.u32 r_block, r_block, block_span;                                       \
2447   vld1.u32 { block_span }, [ block_span_ptr, :128 ]!;                          \
2448                                                                                \
2449   vadd.u32 g_block, g_block, block_span;                                       \
2450   vld1.u32 { block_span }, [ block_span_ptr, :128 ];                           \
2451                                                                                \
2452   vadd.u32 b_block, b_block, block_span;                                       \
2453   add block_ptr_b, block_ptr_a, #16;                                           \
2454   /* x_whole_low = x >> 16; x_whole_high = high half of (x + dx4). */          \
2455   vshrn.u32 r_whole_low, r_block, #16;                                         \
2456   vshrn.u32 g_whole_low, g_block, #16;                                         \
2457   vshrn.u32 b_whole_low, b_block, #16;                                         \
2458   vdup.u32 dx4, rg_dx4[0];                                                     \
2459                                                                                \
2460   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2461   vdup.u32 dx4, rg_dx4[1];                                                     \
2462                                                                                \
2463   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2464   vdup.u32 dx4, b_dx4;                                                         \
2465                                                                                \
2466   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2467   vdup.u32 dx8, rg_dx8[0];                                                     \
2468   /* Advance r, g and b_block by dx8 for the following block. */               \
2469   vadd.u32 r_block, r_block, dx8;                                              \
2470   vdup.u32 dx8, rg_dx8[1];                                                     \
2471                                                                                \
2472   vadd.u32 g_block, g_block, dx8;                                              \
2473   vdup.u32 dx8, b_dx8;                                                         \
2474                                                                                \
2475   vadd.u32 b_block, b_block, dx8;                                              \
2476   /* Narrow each 16-bit whole part to its low byte. */                         \
2477   vmovn.u16 r_whole_8, r_whole;                                                \
2478   vmovn.u16 g_whole_8, g_whole;                                                \
2479   vmovn.u16 b_whole_8, b_whole;                                                \
2480   /* Only one block in this span: go straight to the tail code. */             \
2481   beq 3f;                                                                      \
2482   /* 2: one full block (16 bytes of pixels) per iteration. */                  \
2483  2:                                                                            \
2484   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2485   vshrn.u32 r_whole_low, r_block, #16;                                         \
2486                                                                                \
2487   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2488   vshrn.u32 g_whole_low, g_block, #16;                                         \
2489                                                                                \
2490   vshrn.u32 b_whole_low, b_block, #16;                                         \
2491   /* r >>= 3 per byte; clear the low 3 bits of every gb byte. */               \
2492   vdup.u32 dx4, rg_dx4[0];                                                     \
2493   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2494   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2495                                                                                \
2496   vaddhn.u32 r_whole_high, r_block, dx4;                                       \
2497   vdup.u32 dx4, rg_dx4[1];                                                     \
2498                                                                                \
2499   vmov pixels, msb_mask;                                                       \
2500   vaddhn.u32 g_whole_high, g_block, dx4;                                       \
2501   vdup.u32 dx4, b_dx4;                                                         \
2502                                                                                \
2503   vaddhn.u32 b_whole_high, b_block, dx4;                                       \
2504   vdup.u32 dx8, rg_dx8[0];                                                     \
2505   /* pixels = msb_mask + r * d64_1 + g * d64_4 + b * d64_128. */               \
2506   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2507   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2508   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2509                                                                                \
2510   vadd.u32 r_block, r_block, dx8;                                              \
2511   vdup.u32 dx8, rg_dx8[1];                                                     \
2512                                                                                \
2513   vadd.u32 g_block, g_block, dx8;                                              \
2514   vdup.u32 dx8, b_dx8;                                                         \
2515                                                                                \
2516   vadd.u32 b_block, b_block, dx8;                                              \
2517                                                                                \
2518   vmovn.u16 r_whole_8, r_whole;                                                \
2519   vmovn.u16 g_whole_8, g_whole;                                                \
2520   vmovn.u16 b_whole_8, b_whole;                                                \
2521   /* Store the finished block and post-increment fb_ptr. */                    \
2522   vst1.u32 { pixels }, [ fb_ptr ]!;                                            \
2523   subs span_num_blocks, span_num_blocks, #1;                                   \
2524   bne 2b;                                                                      \
2525   /* 3: last block of the span; stored through the jump table below. */        \
2526  3:                                                                            \
2527   setup_blocks_shaded_untextured_dither_a_##dithering();                       \
2528                                                                                \
2529   ldrh right_mask, [ span_edge_data, #edge_data_right_mask_offset ];           \
2530   setup_blocks_shaded_untextured_dither_b_##dithering();                       \
2531   /* rbit + clz below: right_mask becomes its trailing-zero count. */          \
2532   vshr.u8 r_whole_8, r_whole_8, #3;                                            \
2533   rbit right_mask, right_mask;                                                 \
2534   vmov pixels, msb_mask;                                                       \
2535   vbic.u8 gb_whole_8, gb_whole_8, d128_0x7;                                    \
2536   clz right_mask, right_mask;                                                  \
2537                                                                                \
2538   vmlal.u8 pixels, r_whole_8, d64_1;                                           \
2539   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
2540   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
2541   /* ARM state: pc = ldr + 8 (2nd nop), so index 1 -> 4f ... 8 -> 11f. */      \
2542   ldr pc, [ pc, right_mask, lsl #2 ];                                          \
2543   nop;                                                                         \
2544   nop;                                                                         \
2545   .word 4f;                                                                    \
2546   .word 5f;                                                                    \
2547   .word 6f;                                                                    \
2548   .word 7f;                                                                    \
2549   .word 8f;                                                                    \
2550   .word 9f;                                                                    \
2551   .word 10f;                                                                   \
2552   .word 11f;                                                                   \
2553   /* Labels 4..11 store the first 1..8 halfwords of pixels. */                 \
2554  4:                                                                            \
2555   vst1.u16 { pixels_low[0] }, [ fb_ptr ];                                      \
2556   bal 1f;                                                                      \
2557                                                                                \
2558  5:                                                                            \
2559   vst1.u32 { pixels_low[0] }, [ fb_ptr ];                                      \
2560   bal 1f;                                                                      \
2561                                                                                \
2562  6:                                                                            \
2563   vst1.u32 { pixels_low[0] }, [ fb_ptr ]!;                                     \
2564   vst1.u16 { pixels_low[2] }, [ fb_ptr ];                                      \
2565   bal 1f;                                                                      \
2566                                                                                \
2567  7:                                                                            \
2568   vst1.u32 { pixels_low }, [ fb_ptr ];                                         \
2569   bal 1f;                                                                      \
2570                                                                                \
2571  8:                                                                            \
2572   vst1.u32 { pixels_low }, [ fb_ptr ]!;                                        \
2573   vst1.u16 { pixels_high[0] }, [ fb_ptr ];                                     \
2574   bal 1f;                                                                      \
2575                                                                                \
2576  9:                                                                            \
2577   vst1.u32 { pixels_low }, [ fb_ptr ]!;                                        \
2578   vst1.u32 { pixels_high[0] }, [ fb_ptr ]!;                                    \
2579   bal 1f;                                                                      \
2580                                                                                \
2581  10:                                                                           \
2582   vst1.u32 { pixels_low }, [ fb_ptr ]!;                                        \
2583   vst1.u32 { pixels_high[0] }, [ fb_ptr ]!;                                    \
2584   vst1.u16 { pixels_high[2] }, [ fb_ptr ];                                     \
2585   bal 1f;                                                                      \
2586                                                                                \
2587  11:                                                                           \
2588   vst1.u32 { pixels }, [ fb_ptr ];                                             \
2589   bal 1f;                                                                      \
2590   /* 1: step the per-span cursors; loop while num_spans is non-zero. */        \
2591  1:                                                                            \
2592   add span_uvrg_offset, span_uvrg_offset, #16;                                 \
2593   add span_b_offset, span_b_offset, #4;                                        \
2594                                                                                \
2595   add span_edge_data, span_edge_data, #8;                                      \
2596   subs num_spans, num_spans, #1;                                               \
2597                                                                                \
2598   bne 0b;                                                                      \
2599                                                                                \
2600   ldmia sp!, { r4 - r11, pc }                                                  \
2601
/* Expand the direct builder macro once per dithering variant. */
2602 setup_blocks_shaded_untextured_direct_builder(undithered)
2603 setup_blocks_shaded_untextured_direct_builder(dithered)
2604
2605
/*
 * Drop aliases left over from the code above. psx_gpu, num_blocks and c_64
 * are given new definitions below; triangle is not redefined in this table.
 */
2606 #undef psx_gpu
2607 #undef num_blocks
2608 #undef triangle
2609 #undef c_64
2610
/*
 * Core register aliases. Several names deliberately share a register:
 *   uv_1 / uv_3 / uv_5 / uv_7 reuse r3 / r4 / r5 / r6, i.e. the registers of
 *   uv_01 / uv_23 / uv_45 / uv_67; uv_0 / uv_2 / uv_4 / uv_6 get r7 - r10.
 */
2611 #define psx_gpu                                  r0
2612 #define block_ptr                                r1
2613 #define num_blocks                               r2
2614 #define uv_01                                    r3
2615 #define uv_23                                    r4
2616 #define uv_45                                    r5
2617 #define uv_67                                    r6
2618 #define uv_0                                     r7
2619 #define uv_1                                     r3
2620 #define uv_2                                     r8
2621 #define uv_3                                     r4
2622 #define uv_4                                     r9
2623 #define uv_5                                     r5
2624 #define uv_6                                     r10
2625 #define uv_7                                     r6
2626 #define texture_ptr                              r11
2627
/* pixel_N is bound to the same register as uv_N. */
2628 #define pixel_0                                  r7
2629 #define pixel_1                                  r3
2630 #define pixel_2                                  r8
2631 #define pixel_3                                  r4
2632 #define pixel_4                                  r9
2633 #define pixel_5                                  r5
2634 #define pixel_6                                  r10
2635 #define pixel_7                                  r6
2636
/* Note the order: pixels_b is r9 and pixels_c is r8. */
2637 #define pixels_a                                 r7
2638 #define pixels_b                                 r9
2639 #define pixels_c                                 r8
2640 #define pixels_d                                 r10
2641
/* c_64 shares r0 with psx_gpu. */
2642 #define c_64                                     r0
2643
/* current_texture_mask / dirty_textures_mask share r5 / r6 with uv_45 / uv_67. */
2644 #define clut_ptr                                 r12
2645 #define current_texture_mask                     r5
2646 #define dirty_textures_mask                      r6
2647
/* NEON register aliases. */
2648 #define texels                                   d0
2649
2650 #define clut_low_a                               d2
2651 #define clut_low_b                               d3
2652 #define clut_high_a                              d4
2653 #define clut_high_b                              d5
2654
/* q1 overlays d2:d3 (clut_low_a/b) and q2 overlays d4:d5 (clut_high_a/b). */
2655 #define clut_a                                   q1
2656 #define clut_b                                   q2
2657
2658 #define texels_low                               d6
2659 #define texels_high                              d7
2660
/*
 * texture_blocks_untextured
 *
 * Texturing stage for untextured primitives: there is nothing to fetch, so
 * the routine returns to the caller immediately without touching any
 * register or memory.
 */
2661 .align 3
2662
2663 function(texture_blocks_untextured)
2664   bx lr
2665
2666
/*
 * texture_blocks_4bpp
 *
 * In:  r0 = psx_gpu
 *
 * For every queued block (psx_gpu num_blocks entries starting at
 * psx_gpu_blocks_offset, 64 bytes apart) this replaces the eight 16-bit
 * texture offsets stored at the start of the block with eight 16-bit colours:
 * each offset is added to the texture page pointer, one byte is loaded from
 * there and used as an index into the 16-entry CLUT.
 *
 * The CLUT (16 halfwords) is loaded once and de-interleaved with vuzp.u8 so
 * that clut_a holds the low bytes and clut_b the high bytes of the entries;
 * two vtbl lookups then translate all eight indices at once and vst2.u8
 * re-interleaves low/high bytes into halfwords on the store.
 *
 * If any bit of current_texture_mask is set in the 4bpp dirty mask,
 * update_texture_4bpp_cache is called first (r0 still holds psx_gpu at that
 * point because c_64, which aliases r0, is only written afterwards).
 *
 * The loop is a do/while: at least one block is always processed.
 */
2667 .align 3
2668
2669 function(texture_blocks_4bpp)
2670   stmdb sp!, { r3 - r11, r14 }
2671   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2672
2673   ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2674   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2675
2676   ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
2677   vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ]
2678
2679   ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
2680   vuzp.u8 clut_a, clut_b
2681
2682   ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
2683   tst dirty_textures_mask, current_texture_mask
2684
2685   bne 1f
2686   mov c_64, #64
2687
2688 0:
  /* Fetch the eight packed 16-bit texel offsets of this block. */
2689   ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2690
  /* uxtah: texture_ptr + zero-extended low / high halfword of each pair. */
2691   uxtah uv_0, texture_ptr, uv_01
2692   uxtah uv_1, texture_ptr, uv_01, ror #16
2693
2694   uxtah uv_2, texture_ptr, uv_23
2695   uxtah uv_3, texture_ptr, uv_23, ror #16
2696
2697   uxtah uv_4, texture_ptr, uv_45
2698   ldrb pixel_0, [ uv_0 ]
2699
2700   uxtah uv_5, texture_ptr, uv_45, ror #16
2701   ldrb pixel_1, [ uv_1 ]
2702
2703   uxtah uv_6, texture_ptr, uv_67
2704   ldrb pixel_2, [ uv_2 ]
2705
2706   uxtah uv_7, texture_ptr, uv_67, ror #16
2707   ldrb pixel_3, [ uv_3 ]
2708
2709   ldrb pixel_4, [ uv_4 ]
  /* Flags set here are consumed by the bne at the bottom of the loop; no
   * instruction in between updates them. */
2710   subs num_blocks, num_blocks, #1
2711
2712   ldrb pixel_5, [ uv_5 ]
2713   orr pixels_a, pixel_0, pixel_1, lsl #8
2714
2715   ldrb pixel_6, [ uv_6 ]
2716   orr pixels_b, pixel_4, pixel_5, lsl #8
2717
2718   ldrb pixel_7, [ uv_7 ]
2719   orr pixels_a, pixels_a, pixel_2, lsl #16
2720
2721   orr pixels_b, pixels_b, pixel_6, lsl #16
2722   orr pixels_a, pixels_a, pixel_3, lsl #24
2723
2724   orr pixels_b, pixels_b, pixel_7, lsl #24
  /* texels = eight CLUT indices, one per byte. */
2725   vmov.u32 texels, pixels_a, pixels_b
2726
2727   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
2728   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
2729
  /* Interleave low/high bytes back into halfwords; advance to next block. */
2730   vst2.u8 { texels_low, texels_high }, [ block_ptr, :128 ], c_64
2731   bne 0b
2732
2733   ldmia sp!, { r3 - r11, pc }
2734
2735 1:
  /* Slow path: refresh the 4bpp texture cache before texturing.
   * r1/r2 (block_ptr, num_blocks) are caller-saved core registers, and the
   * de-interleaved CLUT lives in q1/q2 (d2-d5), which the AAPCS also treats
   * as caller-saved, so both must survive the call. 8 + 32 bytes are pushed
   * on top of the 40-byte entry frame, keeping sp 8-byte aligned at the
   * call. */
2736   stmdb sp!, { r1 - r2 }  
  vpush { clut_a - clut_b }
2737   bl update_texture_4bpp_cache
2738
  vpop { clut_a - clut_b }
2739   mov c_64, #64
2740   ldmia sp!, { r1 - r2 }
2741   bal 0b
2742
2743
/*
 * texture_blocks_8bpp
 *
 * In:  r0 = psx_gpu
 *
 * For every queued block (psx_gpu num_blocks entries starting at
 * psx_gpu_blocks_offset, 64 bytes apart) this replaces the eight 16-bit
 * texture offsets stored at the start of the block with eight 16-bit colours:
 * each offset is added to the texture page pointer, one byte is loaded from
 * there, doubled to form a halfword offset, and used to load the colour from
 * the table at clut_ptr.
 *
 * If any bit of current_texture_mask is set in the 8bpp dirty mask,
 * update_texture_8bpp_cache is called first (r0 still holds psx_gpu there).
 *
 * The loop is a do/while: at least one block is always processed.
 */
2744 .align 3
2745
2746 function(texture_blocks_8bpp)
2747   stmdb sp!, { r3 - r11, r14 }
2748   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2749
2750   ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2751   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2752
2753   ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
2754   ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
2755
2756   ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ]
2757   tst dirty_textures_mask, current_texture_mask
2758
2759   bne 1f
2760   nop
2761
2762 0:
  /* Fetch the eight packed 16-bit texel offsets of this block. */
2763   ldm block_ptr, { uv_01, uv_23, uv_45, uv_67 }
2764
  /* uxtah: texture_ptr + zero-extended low / high halfword of each pair. */
2765   uxtah uv_0, texture_ptr, uv_01
2766   uxtah uv_1, texture_ptr, uv_01, ror #16
2767
2768   uxtah uv_2, texture_ptr, uv_23
2769   uxtah uv_3, texture_ptr, uv_23, ror #16
2770
2771   uxtah uv_4, texture_ptr, uv_45
2772   ldrb pixel_0, [ uv_0 ]
2773
2774   uxtah uv_5, texture_ptr, uv_45, ror #16
2775   ldrb pixel_1, [ uv_1 ]
2776
2777   uxtah uv_6, texture_ptr, uv_67
2778   ldrb pixel_2, [ uv_2 ]
2779
2780   uxtah uv_7, texture_ptr, uv_67, ror #16
2781   ldrb pixel_3, [ uv_3 ]
2782
  /* Each index is doubled (x + x) to address 16-bit CLUT entries. */
2783   ldrb pixel_4, [ uv_4 ]
2784   add pixel_0, pixel_0, pixel_0
2785
2786   ldrb pixel_5, [ uv_5 ]
2787   add pixel_1, pixel_1, pixel_1
2788
2789   ldrb pixel_6, [ uv_6 ]
2790   add pixel_2, pixel_2, pixel_2
2791
2792   ldrb pixel_7, [ uv_7 ]
2793   add pixel_3, pixel_3, pixel_3
2794
2795   ldrh pixel_0, [ clut_ptr, pixel_0 ]
2796   add pixel_4, pixel_4, pixel_4
2797
2798   ldrh pixel_1, [ clut_ptr, pixel_1 ]
2799   add pixel_5, pixel_5, pixel_5
2800
2801   ldrh pixel_2, [ clut_ptr, pixel_2 ]
2802   add pixel_6, pixel_6, pixel_6
2803
2804   ldrh pixel_3, [ clut_ptr, pixel_3 ]
2805   add pixel_7, pixel_7, pixel_7
2806
2807   ldrh pixel_4, [ clut_ptr, pixel_4 ]
2808   orr pixels_a, pixel_0, pixel_1, lsl #16
2809
2810   ldrh pixel_5, [ clut_ptr, pixel_5 ]
2811   orr pixels_c, pixel_2, pixel_3, lsl #16
2812
2813   ldrh pixel_6, [ clut_ptr, pixel_6 ]
  /* Flags set here are consumed by the bne at the bottom of the loop. */
2814   subs num_blocks, num_blocks, #1
2815
2816   ldrh pixel_7, [ clut_ptr, pixel_7 ]
2817   orr pixels_b, pixel_4, pixel_5, lsl #16
2818
2819   orr pixels_d, pixel_6, pixel_7, lsl #16
  /* stm stores in ascending register number (r7, r8, r9, r10), which is
   * pixels 0-1, 2-3, 4-5, 6-7 in memory order. */
2820   stm block_ptr, { pixels_a, pixels_c, pixels_b, pixels_d } 
2821
2822   add block_ptr, block_ptr, #64
2823   bne 0b
2824
2825   ldmia sp!, { r3 - r11, pc }
2826
2827 1:
  /* Slow path: refresh the 8bpp texture cache before texturing.
   * r1, r2 and r12 (block_ptr, num_blocks, clut_ptr) are caller-saved and
   * needed by the loop. r3 is dead here (the loop reloads it first thing)
   * and is pushed only as padding: the entry frame is 40 bytes, so an even
   * number of words must be pushed to keep sp 8-byte aligned at the call,
   * as the AAPCS requires at public interfaces. */
2828   stmdb sp!, { r1 - r3, r12 }
2829
2830   bl update_texture_8bpp_cache
2831
2832   ldmia sp!, { r1 - r3, r12 }
2833   bal 0b
2834
2835
/*
 * Drop the 4bpp/8bpp aliases and install the register assignment used by
 * texture_blocks_16bpp.
 *
 * The registers are used in a rolling window: for each pair of texels the
 * uv_N/u_N register is reused for the final offset and later for the pixel,
 * while v_N borrows the register that the *next* pair's uv will be loaded
 * into. v_7 borrows r0, so psx_gpu is not available once v_7 is written.
 */
2836 #undef uv_0
2837 #undef uv_1
2838 #undef uv_2
2839 #undef uv_3
2840 #undef uv_4
2841 #undef uv_5
2842 #undef uv_6
2843 #undef uv_7
2844
2845 #undef pixel_0
2846 #undef pixel_1
2847 #undef pixel_2
2848 #undef pixel_3
2849 #undef pixel_4
2850 #undef pixel_5
2851 #undef pixel_6
2852 #undef pixel_7
2853
2854 #undef texture_ptr
2855
2856 #undef pixels_a
2857 #undef pixels_b
2858 #undef pixels_c
2859 #undef pixels_d
2860
2861 #define psx_gpu                                  r0
2862 #define block_ptr                                r1
2863 #define num_blocks                               r2
2864
2865 #define uv_0                                     r3
2866 #define uv_1                                     r4
2867 #define u_0                                      r3
2868 #define u_1                                      r4
2869 #define v_0                                      r5
2870 #define v_1                                      r6
2871
2872 #define uv_2                                     r5
2873 #define uv_3                                     r6
2874 #define u_2                                      r5
2875 #define u_3                                      r6
2876 #define v_2                                      r7
2877 #define v_3                                      r8
2878
2879 #define uv_4                                     r7
2880 #define uv_5                                     r8
2881 #define u_4                                      r7
2882 #define u_5                                      r8
2883 #define v_4                                      r9
2884 #define v_5                                      r10
2885
2886 #define uv_6                                     r9
2887 #define uv_7                                     r10
2888 #define u_6                                      r9
2889 #define u_7                                      r10
2890 #define v_6                                      r11
2891 #define v_7                                      r0
2892
2893 #define pixel_0                                  r3
2894 #define pixel_1                                  r4
2895 #define pixel_2                                  r5
2896 #define pixel_3                                  r6
2897 #define pixel_4                                  r7
2898 #define pixel_5                                  r8
2899 #define pixel_6                                  r9
2900 #define pixel_7                                  r10
2901
/* Packed output words, in ascending register order (r3, r5, r7, r9). */
2902 #define pixels_a                                 r3
2903 #define pixels_b                                 r5
2904 #define pixels_c                                 r7
2905 #define pixels_d                                 r9
2906
2907 #define texture_ptr                              r12
2908
2909
/*
 * texture_blocks_16bpp
 *
 * In:  r0 = psx_gpu
 *
 * For every queued block (psx_gpu num_blocks entries starting at
 * psx_gpu_blocks_offset, 64 bytes apart) this replaces the eight packed
 * 16-bit coordinates at the start of the block with the eight 16-bit texels
 * they address. Each coordinate is split into its low byte (u) and high byte
 * (v) and turned into the byte offset
 *     ((uv & 0xFF) + ((uv & 0xFF00) << 2)) * 2  =  2 * u + 2048 * v
 * relative to the texture page pointer.
 *
 * The work for the four coordinate pairs is interleaved, and registers are
 * recycled as soon as their previous value is dead (see the alias table).
 * There is no dirty-mask check or cache update on this path.
 *
 * The loop is a do/while: at least one block is always processed.
 */
2910 .align 3
2911
2912 function(texture_blocks_16bpp)
2913   stmdb sp!, { r3 - r11, r14 }
2914   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
2915
2916   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
2917   ldr texture_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
2918
2919 0:
2920   ldrh uv_0, [ block_ptr ]
  /* Flags set here are consumed by the bne at the bottom of the loop; none
   * of the instructions in between update them. */
2921   subs num_blocks, num_blocks, #1
2922
2923   ldrh uv_1, [ block_ptr, #2 ]
2924
  /* Texels 0/1: isolate v (bits 8-15) and u (bits 0-7). */
2925   and v_0, uv_0, #0xFF00
2926   and v_1, uv_1, #0xFF00
2927
2928   and u_0, uv_0, #0xFF
2929   and u_1, uv_1, #0xFF
2930
  /* The next ldrh overwrites the register v_0 lived in, after its last use. */
2931   add uv_0, u_0, v_0, lsl #2
2932   ldrh uv_2, [ block_ptr, #4 ]
2933
2934   add uv_1, u_1, v_1, lsl #2
2935   ldrh uv_3, [ block_ptr, #6 ]
2936
  /* Double to convert the texel index into a byte offset. */
2937   add uv_0, uv_0, uv_0
2938   add uv_1, uv_1, uv_1
2939
  /* Texels 2/3. */
2940   and v_2, uv_2, #0xFF00
2941   and v_3, uv_3, #0xFF00
2942
2943   and u_2, uv_2, #0xFF
2944   and u_3, uv_3, #0xFF
2945
2946   add uv_2, u_2, v_2, lsl #2
2947   ldrh uv_4, [ block_ptr, #8 ]
2948
2949   add uv_3, u_3, v_3, lsl #2
2950   ldrh uv_5, [ block_ptr, #10 ]
2951
2952   add uv_2, uv_2, uv_2
2953   add uv_3, uv_3, uv_3
2954
  /* Texels 4/5. */
2955   and v_4, uv_4, #0xFF00
2956   and v_5, uv_5, #0xFF00
2957
2958   and u_4, uv_4, #0xFF
2959   and u_5, uv_5, #0xFF
2960
2961   add uv_4, u_4, v_4, lsl #2
2962   ldrh uv_6, [ block_ptr, #12 ]
2963
2964   add uv_5, u_5, v_5, lsl #2
2965   ldrh uv_7, [ block_ptr, #14 ]
2966
  /* Texel fetches for 0-5 are interleaved with the address math for 6/7. */
2967   add uv_4, uv_4, uv_4
2968   ldrh pixel_0, [ texture_ptr, uv_0 ]
2969
2970   add uv_5, uv_5, uv_5
2971   ldrh pixel_1, [ texture_ptr, uv_1 ]
2972
2973   and v_6, uv_6, #0xFF00
2974   ldrh pixel_2, [ texture_ptr, uv_2 ]
2975
  /* v_7 is r0: psx_gpu is overwritten from here on. */
2976   and v_7, uv_7, #0xFF00
2977   ldrh pixel_3, [ texture_ptr, uv_3 ]
2978
2979   and u_6, uv_6, #0xFF
2980   ldrh pixel_4, [ texture_ptr, uv_4 ]
2981
2982   and u_7, uv_7, #0xFF
2983   ldrh pixel_5, [ texture_ptr, uv_5 ]
2984
2985   add uv_6, u_6, v_6, lsl #2
2986   add uv_7, u_7, v_7, lsl #2
2987
2988   add uv_6, uv_6, uv_6
2989   add uv_7, uv_7, uv_7
2990
  /* Pack pairs of 16-bit texels into 32-bit words. */
2991   orr pixels_a, pixel_0, pixel_1, lsl #16
2992   orr pixels_b, pixel_2, pixel_3, lsl #16
2993
2994   ldrh pixel_6, [ texture_ptr, uv_6 ]
2995   orr pixels_c, pixel_4, pixel_5, lsl #16
2996
2997   ldrh pixel_7, [ texture_ptr, uv_7 ]
2998   orr pixels_d, pixel_6, pixel_7, lsl #16
2999
  /* Overwrite the coordinates with the fetched texels; next block is 64
   * bytes further on. */
3000   stm block_ptr, { pixels_a, pixels_b, pixels_c, pixels_d }
3001   add block_ptr, block_ptr, #64
3002
3003   bne 0b
3004
3005   ldmia sp!, { r3 - r11, pc }
3006
3007
/*
 * Register aliases for the shade_blocks_*_textured_modulated_* routines.
 * The #undef list clears names that are redefined below (several of them are
 * presumably left over from sections earlier in the file that are not
 * visible here - TODO confirm).
 *
 * Core-register sharing to be aware of:
 *  - r2: color_ptr, colors_scalar, mask_msb_ptr and c_32, used one after
 *    another in the prologue.
 *  - r3: colors_scalar_compare, then block_ptr_store.
 *  - r0: psx_gpu, then block_ptr_load_a (psx_gpu is lost from then on).
 *  - r14: fb_ptr, so lr must be saved before fb_ptr is written.
 */
3008 #undef num_blocks
3009
3010 #undef test_mask
3011 #undef texels
3012 #undef pixels_b
3013 #undef pixels
3014 #undef d64_1
3015 #undef d64_4
3016 #undef d64_128
3017 #undef draw_mask
3018 #undef msb_mask
3019 #undef msb_mask_low
3020 #undef msb_mask_high
3021 #undef fb_pixels
3022
3023 #undef c_32
3024 #undef fb_ptr
3025 #undef mask_msb_ptr
3026
3027 #define psx_gpu                                  r0
3028 #define num_blocks                               r1
3029 #define color_ptr                                r2
3030 #define colors_scalar                            r2
3031 #define colors_scalar_compare                    r3
3032 #define mask_msb_ptr                             r2
3033
3034 #define block_ptr_load_a                         r0
3035 #define block_ptr_store                          r3
3036 #define block_ptr_load_b                         r12
3037 #define c_32                                     r2
3038
3039 #define c_48                                     r4
3040 #define fb_ptr                                   r14
3041 #define draw_mask_bits_scalar                    r5
3042
/* Quad-register view. texels and zero_mask share q4; pixels_r and fb_pixels
 * share q8; pixels_g and pixels_gb_low share q9. */
3043 #define d128_0x07                                q0
3044 #define d128_0x1F                                q1
3045 #define d128_0x8000                              q2
3046 #define test_mask                                q3
3047 #define texels                                   q4
3048 #define colors_rg                                q5
3049 #define colors_b_dm_bits                         q6
3050 #define texels_rg                                q7
3051 #define pixels_r                                 q8
3052 #define pixels_g                                 q9
3053 #define pixels_b                                 q10
3054 #define pixels                                   q11
3055 #define zero_mask                                q4
3056 #define draw_mask                                q12
3057 #define msb_mask                                 q13
3058
3059 #define fb_pixels                                q8
3060
3061 #define pixels_gb_low                            q9
3062
/* Double-register view of the quads above: q5 = d10:d11, q6 = d12:d13,
 * q7 = d14:d15, q8 = d16:d17, q9 = d18:d19, q13 = d26:d27. Note that
 * pixels_g_low/pixels_b_low are the two halves of q9 (pixels_gb_low), i.e.
 * the narrowed results overwrite the wide pixels_g accumulator. */
3063 #define colors_r                                 d10
3064 #define colors_g                                 d11
3065 #define colors_b                                 d12
3066 #define draw_mask_bits                           d13
3067 #define texels_r                                 d14
3068 #define texels_g                                 d15
3069 #define pixels_r_low                             d16
3070 #define pixels_g_low                             d18
3071 #define pixels_b_low                             d19
3072 #define msb_mask_low                             d26
3073 #define msb_mask_high                            d27
3074
3075 #define d64_1                                    d28
3076 #define d64_4                                    d29
3077 #define d64_128                                  d30
3078 #define texels_b                                 d31
3079
/* Target prologue, indirect: results are written back into the block array,
 * so set up the store pointer at the first block and the constant 48 used
 * as the post-increment after each pixel store. */
3080 #define shade_blocks_textured_modulated_prologue_indirect()                    \
3081   mov c_48, #48;                                                               \
3082   add block_ptr_store, psx_gpu, #psx_gpu_blocks_offset                         \
3083
/* Target prologue, direct: replicate the 16-bit value at
 * psx_gpu_mask_msb_offset into every lane of msb_mask. */
3084 #define shade_blocks_textured_modulated_prologue_direct()                      \
3085   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3086   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]          \
3087
3088
/* Shading prologue, shaded: nothing to do (colours are loaded per block). */
3089 #define shade_blocks_textured_modulated_prologue_shaded(dithering, target)     \
3090   
/* Undithered only: if the triangle colour word equals 0x00808080, branch
 * straight to shade_blocks_textured_unmodulated_<target>. With 0x80 in a
 * channel the modulation below computes (texel * 0x80) >> 7, i.e. the texel
 * itself, so the multiply can be skipped. This runs before the builder
 * pushes anything, so the branch behaves as a tail call. */
3091 #define shade_blocks_textured_false_modulation_check_undithered(target)        \
3092   ldr colors_scalar, [ psx_gpu, #psx_gpu_triangle_color_offset ];              \
3093   movw colors_scalar_compare, #0x8080;                                         \
3094                                                                                \
3095   movt colors_scalar_compare, #0x80;                                           \
3096   cmp colors_scalar, colors_scalar_compare;                                    \
3097   beq shade_blocks_textured_unmodulated_##target                               \
3098
/* Dithered: no shortcut, the modulated path is always taken. */
3099 #define shade_blocks_textured_false_modulation_check_dithered(target)          \
3100
/* Shading prologue, unshaded: run the shortcut check above, then load the
 * single triangle colour word and broadcast its bytes 0, 1 and 2 across
 * colors_r, colors_g and colors_b. colors_r is rewritten last because it is
 * also the source of the other two broadcasts. */
3101 #define shade_blocks_textured_modulated_prologue_unshaded(dithering, target)   \
3102   shade_blocks_textured_false_modulation_check_##dithering(target);            \
3103   add color_ptr, psx_gpu, #psx_gpu_triangle_color_offset;                      \
3104   vld1.u32 { colors_r[] }, [ color_ptr, :32 ];                                 \
3105   vdup.u8 colors_g, colors_r[1];                                               \
3106   vdup.u8 colors_b, colors_r[2];                                               \
3107   vdup.u8 colors_r, colors_r[0]                                                \
3108
3109
/* Dithered: preload a channel accumulator with the 8 x 16-bit values found
 * at block_ptr_load_b (the same 16 bytes are used for r, g and b), so the
 * multiply-accumulate below adds the product on top of them. */
3110 #define shade_blocks_textured_modulated_load_dithered(target)                  \
3111   vld1.u32 { target }, [ block_ptr_load_b, :128 ]                              \
3112
/* Dithered, last channel: same load, then advance block_ptr_load_b by 32. */
3113 #define shade_blocks_textured_modulated_load_last_dithered(target)             \
3114   vld1.u32 { target }, [ block_ptr_load_b, :128 ], c_32                        \
3115
/* Undithered: no accumulator preload. */
3116 #define shade_blocks_textured_modulated_load_undithered(target)                \
3117
/* Undithered, last channel: only keep block_ptr_load_b in step (+32). */
3118 #define shade_blocks_textured_modulated_load_last_undithered(target)           \
3119   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3120
/* Dithered: pixels_<channel> += texels_<channel> * colors_<channel>
 * (widening 8-bit -> 16-bit multiply-accumulate). */
3121 #define shade_blocks_textured_modulate_dithered(channel)                       \
3122   vmlal.u8 pixels_##channel, texels_##channel, colors_##channel                \
3123
/* Undithered: pixels_<channel> = texels_<channel> * colors_<channel>
 * (widening 8-bit -> 16-bit multiply). */
3124 #define shade_blocks_textured_modulate_undithered(channel)                     \
3125   vmull.u8 pixels_##channel, texels_##channel, colors_##channel                \
3126
3127
/* Indirect: write the expanded draw mask to the block array and advance the
 * store pointer by 16 (the offset argument is unused). */
3128 #define shade_blocks_textured_modulated_store_draw_mask_indirect(offset)       \
3129   vst1.u32 { draw_mask }, [ block_ptr_store, :128 ]!                           \
3130
/* Direct: fetch the framebuffer pointer stored in the block being finished
 * (addressed relative to block_ptr_load_b; the caller supplies the offset
 * that matches where that pointer currently stands), read the existing
 * framebuffer pixels, and copy them over every lane of pixels whose
 * draw_mask bits are set, so masked lanes are left unchanged by the store. */
3131 #define shade_blocks_textured_modulated_store_draw_mask_direct(offset)         \
3132   ldr fb_ptr, [ block_ptr_load_b, #(offset - 64) ];                            \
3133   vld1.u32 { fb_pixels }, [ fb_ptr ];                                          \
3134   vbit.u16 pixels, fb_pixels, draw_mask                                        \
3135
/* Indirect: write the pixels after the draw mask and skip 48 bytes further,
 * which together with the 16-byte mask store advances 64 bytes per block. */
3136 #define shade_blocks_textured_modulated_store_pixels_indirect()                \
3137   vst1.u32 { pixels }, [ block_ptr_store, :128 ], c_48                         \
3138
/* Direct: write the merged pixels to the framebuffer. */
3139 #define shade_blocks_textured_modulated_store_pixels_direct()                  \
3140   vst1.u32 { pixels }, [ fb_ptr ]                                              \
3141
3142
/* Shaded: load the per-pixel r and g colour bytes of this block and advance
 * block_ptr_load_b by 32. */
3143 #define shade_blocks_textured_modulated_load_rg_shaded()                       \
3144   vld1.u32 { colors_r, colors_g }, [ block_ptr_load_b, :128 ], c_32            \
3145
/* Unshaded: colours are constant; only keep block_ptr_load_b in step (+32). */
3146 #define shade_blocks_textured_modulated_load_rg_unshaded()                     \
3147   add block_ptr_load_b, block_ptr_load_b, #32                                  \
3148
/* Shaded: load the per-pixel b colour bytes together with the 8 bytes that
 * follow them (which contain the draw mask bits) and advance
 * block_ptr_load_a by 32. */
3149 #define shade_blocks_textured_modulated_load_bdm_shaded()                      \
3150   vld1.u32 { colors_b, draw_mask_bits }, [ block_ptr_load_a, :128 ], c_32      \
3151
/* Unshaded: fetch only the draw mask bits, from the same address the shaded
 * variant reads them from (8 bytes past block_ptr_load_a), then advance
 * block_ptr_load_a by 32. */
3152 #define shade_blocks_textured_modulated_load_bdm_unshaded()                    \
3153   ldr draw_mask_bits_scalar, [ block_ptr_load_a, #8 ];                         \
3154   add block_ptr_load_a, block_ptr_load_a, #32                                  \
3155
/* Replicate the low 16 draw mask bits into all eight lanes of draw_mask;
 * the builder then tests each lane against its own bit in test_mask. */
3156 #define shade_blocks_textured_modulated_expand_draw_mask_shaded()              \
3157   vdup.u16 draw_mask, draw_mask_bits[0]                                        \
3158
3159 #define shade_blocks_textured_modulated_expand_draw_mask_unshaded()            \
3160   vdup.u16 draw_mask, draw_mask_bits_scalar                                    \
3161
3162
/* Indirect: the pixels are stored without msb_mask applied. */
3163 #define shade_blocks_textured_modulated_apply_msb_mask_indirect()              \
3164
/* Direct: OR msb_mask into every pixel before it reaches the framebuffer. */
3165 #define shade_blocks_textured_modulated_apply_msb_mask_direct()                \
3166   vorr.u16 pixels, pixels, msb_mask                                            \
3167
3168
/*
 * shade_blocks_textured_modulated_builder(shading, dithering, target)
 *
 * Emits the function
 *   shade_blocks_<shading>_textured_modulated_<dithering>_<target>
 * with  shading   = shaded | unshaded   (per-pixel or constant colour)
 *       dithering = dithered | undithered
 *       target    = direct | indirect   (framebuffer or block array)
 *
 * In:  r0 = psx_gpu. The first 16 bytes of psx_gpu are loaded as test_mask;
 *      the block count is read from psx_gpu_num_blocks_offset.
 *
 * Block layout as accessed here (64 bytes per block, derived from the
 * pointer arithmetic in this macro and its helpers):
 *   +0   eight 16-bit texels
 *   +16  r and g colour bytes            (shaded only)
 *   +32  b colour bytes                  (shaded only)
 *   +40  draw mask bits
 *   +44  framebuffer pointer             (direct only)
 *   +48  eight 16-bit accumulator seeds  (dithered only)
 * The indirect target writes the expanded draw mask to +0 and the finished
 * pixels to +16 of the same block.
 *
 * Per block:
 *   1. Split each texel into 5-bit r, g, b (bits 0-4, 5-9, 10-14).
 *   2. Multiply each channel by its colour byte (adding onto the seed when
 *      dithered), then narrow with an unsigned-saturating shift right by 4.
 *   3. Reduce r to 5 bits (>> 3) and clear the low 3 bits of g and b, then
 *      rebuild the pixel as (texel & 0x8000) + r + g * 4 + b * 128, which
 *      places g at bits 5-9 and b at bits 10-14.
 *   4. draw_mask lanes are set where the block's mask bit for that lane is
 *      set or where the texel is 0; the target helpers use it to keep such
 *      pixels from being drawn.
 *
 * The code is software-pipelined: the section before label 0 computes the
 * channels of the first block, each pass through the loop at 0 packs and
 * stores the previous block while computing the next one, and the code at
 * label 1 packs and stores the final block. At least one block is always
 * processed.
 *
 * The shading prologue runs before r4, r5 and lr are pushed, so its
 * shortcut branch to the unmodulated routine leaves the stack untouched.
 * lr has to be saved because fb_ptr aliases r14.
 */
3169 #define shade_blocks_textured_modulated_builder(shading, dithering, target)    \
3170 .align 3;                                                                      \
3171                                                                                \
3172 function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
3173   shade_blocks_textured_modulated_prologue_##shading(dithering, target);       \
3174   stmdb sp!, { r4 - r5, lr };                                                  \
3175   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
3176                                                                                \
3177   vld1.u32 { test_mask }, [ psx_gpu, :128 ];                                   \
3178                                                                                \
3179   shade_blocks_textured_modulated_prologue_##target();                         \
3180                                                                                \
3181   add block_ptr_load_a, psx_gpu, #psx_gpu_blocks_offset;                       \
3182   mov c_32, #32;                                                               \
3183                                                                                \
3184   add block_ptr_load_b, block_ptr_load_a, #16;                                 \
3185   vmov.u8 d64_1, #1;                                                           \
3186   vmov.u8 d64_4, #4;                                                           \
3187   vmov.u8 d64_128, #128;                                                       \
3188                                                                                \
3189   vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32;                       \
3190   vmov.u8 d128_0x07, #0x07;                                                    \
3191                                                                                \
3192   shade_blocks_textured_modulated_load_rg_##shading();                         \
3193   vmov.u8 d128_0x1F, #0x1F;                                                    \
3194                                                                                \
3195   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3196   vmov.u16 d128_0x8000, #0x8000;                                               \
3197                                                                                \
3198   vmovn.u16 texels_r, texels;                                                  \
3199   vshrn.u16 texels_g, texels, #5;                                              \
3200                                                                                \
3201   vshrn.u16 texels_b, texels, #7;                                              \
3202   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3203                                                                                \
3204   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3205   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3206                                                                                \
3207   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3208   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3209                                                                                \
3210   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3211   vshr.u8 texels_b, texels_b, #3;                                              \
3212                                                                                \
3213   shade_blocks_textured_modulate_##dithering(r);                               \
3214   shade_blocks_textured_modulate_##dithering(g);                               \
3215   shade_blocks_textured_modulate_##dithering(b);                               \
3216                                                                                \
3217   vand.u16 pixels, texels, d128_0x8000;                                        \
3218   vceq.u16 zero_mask, texels, #0;                                              \
3219                                                                                \
3220   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3221   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3222   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3223                                                                                \
3224   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3225   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3226   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3227   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3228                                                                                \
3229   subs num_blocks, num_blocks, #1;                                             \
3230   beq 1f;                                                                      \
3231                                                                                \
3232  .align 3;                                                                     \
3233                                                                                \
3234  0:                                                                            \
3235   vld1.u32 { texels }, [ block_ptr_load_a, :128 ], c_32;                       \
3236   shade_blocks_textured_modulated_load_rg_##shading();                         \
3237   vshrn.u16 texels_g, texels, #5;                                              \
3238                                                                                \
3239   shade_blocks_textured_modulated_load_bdm_##shading();                        \
3240   vshrn.u16 texels_b, texels, #7;                                              \
3241                                                                                \
3242   vmovn.u16 texels_r, texels;                                                  \
3243   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3244                                                                                \
3245   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3246   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3247   shade_blocks_textured_modulated_store_draw_mask_##target(-4);                \
3248                                                                                \
3249   shade_blocks_textured_modulated_load_##dithering(pixels_r);                  \
3250   shade_blocks_textured_modulated_expand_draw_mask_##shading();                \
3251                                                                                \
3252   shade_blocks_textured_modulated_load_##dithering(pixels_g);                  \
3253   vand.u8 texels_rg, texels_rg, d128_0x1F;                                     \
3254                                                                                \
3255   shade_blocks_textured_modulated_load_last_##dithering(pixels_b);             \
3256   vtst.u16 draw_mask, draw_mask, test_mask;                                    \
3257                                                                                \
3258   shade_blocks_textured_modulated_store_pixels_##target();                     \
3259   vshr.u8 texels_b, texels_b, #3;                                              \
3260                                                                                \
3261   shade_blocks_textured_modulate_##dithering(r);                               \
3262   shade_blocks_textured_modulate_##dithering(g);                               \
3263   shade_blocks_textured_modulate_##dithering(b);                               \
3264                                                                                \
3265   vand.u16 pixels, texels, d128_0x8000;                                        \
3266   vceq.u16 zero_mask, texels, #0;                                              \
3267                                                                                \
3268   subs num_blocks, num_blocks, #1;                                             \
3269                                                                                \
3270   vqshrun.s16 pixels_r_low, pixels_r, #4;                                      \
3271   vqshrun.s16 pixels_g_low, pixels_g, #4;                                      \
3272   vqshrun.s16 pixels_b_low, pixels_b, #4;                                      \
3273                                                                                \
3274   shade_blocks_textured_modulated_apply_msb_mask_##target();                   \
3275   vorr.u16 draw_mask, draw_mask, zero_mask;                                    \
3276   vshr.u8 pixels_r_low, pixels_r_low, #3;                                      \
3277   vbic.u8 pixels_gb_low, pixels_gb_low, d128_0x07;                             \
3278                                                                                \
3279   bne 0b;                                                                      \
3280                                                                                \
3281  1:                                                                            \
3282   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
3283   vmlal.u8 pixels, pixels_g_low, d64_4;                                        \
3284   vmlal.u8 pixels, pixels_b_low, d64_128;                                      \
3285                                                                                \
3286   shade_blocks_textured_modulated_store_draw_mask_##target(28);                \
3287   shade_blocks_textured_modulated_store_pixels_##target();                     \
3288                                                                                \
3289   ldmia sp!, { r4 - r5, pc }                                                   \
3290
3291
/* Emit all eight shading x dithering x target combinations of the modulated
 * textured shader: first the four that write to the framebuffer (direct),
 * then the four that write back into the block array (indirect). */
3292 shade_blocks_textured_modulated_builder(shaded, dithered, direct);
3293 shade_blocks_textured_modulated_builder(shaded, undithered, direct);
3294 shade_blocks_textured_modulated_builder(unshaded, dithered, direct);
3295 shade_blocks_textured_modulated_builder(unshaded, undithered, direct);
3296
3297 shade_blocks_textured_modulated_builder(shaded, dithered, indirect);
3298 shade_blocks_textured_modulated_builder(shaded, undithered, indirect);
3299 shade_blocks_textured_modulated_builder(unshaded, dithered, indirect);
3300 shade_blocks_textured_modulated_builder(unshaded, undithered, indirect);
3301
3302
/*
 * Drop the modulated-shader aliases that are about to be reassigned and
 * install the register assignment for the unmodulated textured shaders that
 * follow (beginning with shade_blocks_textured_unmodulated_indirect).
 *
 * NOTE(review): color_r/color_g/color_b are not defined anywhere in the
 * visible part of the file (the modulated section uses colors_r/g/b);
 * #undef of an undefined name is a no-op, so these three lines look like
 * leftovers - confirm against the rest of the file.
 */
3303 #undef c_64
3304 #undef fb_ptr
3305 #undef color_ptr
3306
3307 #undef color_r
3308 #undef color_g
3309 #undef color_b
3310
3311 #undef test_mask
3312 #undef pixels
3313 #undef draw_mask
3314 #undef zero_mask
3315 #undef fb_pixels
3316 #undef msb_mask
3317 #undef msb_mask_low
3318 #undef msb_mask_high
3319
3320 #define psx_gpu                                  r0
3321 #define num_blocks                               r1
3322 #define mask_msb_ptr                             r2
3323 #define color_ptr                                r3
3324
/* Shared core registers: r0 = psx_gpu / block_ptr_load,
 * r2 = mask_msb_ptr / c_64, r3 = color_ptr / draw_mask_store_ptr / fb_ptr,
 * r12 = draw_mask_bits_ptr / draw_mask_ptr,
 * r14 = pixel_store_ptr / fb_ptr_next (so lr must be saved before use). */
3325 #define block_ptr_load                           r0
3326 #define draw_mask_store_ptr                      r3
3327 #define draw_mask_bits_ptr                       r12
3328 #define draw_mask_ptr                            r12
3329 #define pixel_store_ptr                          r14
3330
3331 #define fb_ptr_cmp                               r4
3332
3333 #define fb_ptr                                   r3
3334 #define fb_ptr_next                              r14
3335
3336 #define c_64                                     r2
3337
3338 #define test_mask                                q0
3339 #define pixels                                   q1
3340 #define draw_mask                                q2
3341 #define zero_mask                                q3
3342 #define draw_mask_combined                       q4
3343 #define fb_pixels                                q5
3344 #define fb_pixels_next                           q6
3345 #define msb_mask                                 q7
3346
/* Halves of draw_mask (q2 = d4:d5) and msb_mask (q7 = d14:d15), needed for
 * the load-and-replicate forms of vld1. */
3347 #define draw_mask_low                            d4
3348 #define draw_mask_high                           d5
3349 #define msb_mask_low                             d14
3350 #define msb_mask_high                            d15
3351
3352 .align 3
/*
 * shade_blocks_textured_unmodulated_indirect
 *
 * In:  r0 = psx_gpu.
 *
 * Walks the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset).  For every block it
 *   - loads eight 16-bit texels from block + 0 and copies them unchanged to
 *     block + 16;
 *   - broadcasts the 16-bit value at block + 40 to all lanes and VTSTs it
 *     against the test_mask vector read from the start of psx_gpu;
 *   - ORs that result with a per-lane "texel == 0" mask and stores the
 *     combined mask over block + 0.
 * The loop is software pipelined: the first block is loaded before the loop
 * and the last combined mask is stored after it, so num_blocks must be at
 * least 1 (a count of 0 is not handled).
 *
 * r14 is needed as a sixth pointer register, so the return address is pushed
 * on the stack.  It must be pushed with writeback: the ARM procedure call
 * standard gives no protection to memory below sp, so parking the return
 * address at [sp, #-4] without moving sp could be overwritten asynchronously
 * (e.g. by a signal frame).  r4 is pushed alongside only to keep sp 8-byte
 * aligned, mirroring the sibling routines.
 *
 * Clobbers: r0-r3, r12, flags, q0-q4.
 * NOTE(review): q4 overlaps d8/d9, which AAPCS treats as callee-saved.
 */
3353 function(shade_blocks_textured_unmodulated_indirect)
3354   stmdb sp!, { r4, r14 }
3355   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3356
3357   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3358   add pixel_store_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3359
3360   vld1.u32 { test_mask }, [ psx_gpu, :128 ]
3361   add draw_mask_store_ptr, psx_gpu, #psx_gpu_blocks_offset
3362
3363   mov c_64, #64
/* r0 stops being psx_gpu from here on: it becomes the texel read pointer. */
3364   add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3365
/* Prime the pipeline with block 0. */
3366   vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3367   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3368    [ draw_mask_bits_ptr, :16 ], c_64
3369   vceq.u16 zero_mask, pixels, #0
3370
3371   vtst.u16 draw_mask, draw_mask, test_mask
3372   vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64
3373
3374   subs num_blocks, num_blocks, #1
3375   beq 1f
3376
/* Steady state: finish block N (combine + store mask) while loading N + 1. */
3377  0:
3378   vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3379   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3380
3381   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3382    [ draw_mask_bits_ptr, :16 ], c_64
3383   vceq.u16 zero_mask, pixels, #0
3384
3385   vtst.u16 draw_mask, draw_mask, test_mask
3386   vst1.u32 { pixels }, [ pixel_store_ptr, :128 ], c_64
3387
3388   vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
3389   subs num_blocks, num_blocks, #1
3390
3391   bne 0b
3392
/* Drain: store the combined mask of the final block and return. */
3393  1:
3394   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3395   vst1.u32 { draw_mask_combined }, [ draw_mask_store_ptr, :128 ], c_64
3396
3397   ldmia sp!, { r4, pc }
3398
3399
3400 .align 3
3401
/*
 * shade_blocks_textured_unmodulated_direct
 *
 * In:  r0 = psx_gpu.
 *
 * Writes the queued blocks' texels straight to the framebuffer.  For each of
 * the num_blocks blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset):
 *   - eight 16-bit texels are read from block + 0;
 *   - the 16-bit value at block + 40 is broadcast and VTSTed against the
 *     test_mask vector at the start of psx_gpu, then ORed with a per-lane
 *     "texel == 0" mask;
 *   - the texels, ORed with the broadcast 16-bit value found at
 *     psx_gpu + psx_gpu_mask_msb_offset, replace the eight framebuffer pixels
 *     addressed by the pointer at block + 44 in every lane whose combined
 *     mask is clear (VBIF).
 *
 * The loop is software pipelined (block N + 1 is loaded while block N is
 * written), so num_blocks must be at least 1.  When the next framebuffer
 * pointer lies within 14 bytes either side of the current one the two 16-byte
 * windows overlap; the code then branches to label 4 to store the current
 * pixels before loading the next ones.
 *
 * The final block is handled at label 1.  It must receive the msb_mask OR
 * there as well, because the OR inside the loop only covers blocks that are
 * followed by another one.
 *
 * Clobbers: r0-r3, r12, r14, flags, q0-q7 (r4 is saved and restored).
 * NOTE(review): q4-q7 overlap d8-d15, which AAPCS treats as callee-saved.
 */
3402 function(shade_blocks_textured_unmodulated_direct)
3403   stmdb sp!, { r4, r14 }
3404   add draw_mask_bits_ptr, psx_gpu, #(psx_gpu_blocks_offset + 40)
3405
3406   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3407   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3408
3409   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
/* r2 is reused: mask_msb_ptr is dead, it now holds the block stride. */
3410   mov c_64, #64
3411
3412   vld1.u32 { test_mask }, [ psx_gpu, :128 ]
3413   add block_ptr_load, psx_gpu, #psx_gpu_blocks_offset
3414
/* Prime the pipeline with block 0. */
3415   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3416    [ draw_mask_bits_ptr, :16 ], c_64
3417   ldr fb_ptr_next, [ block_ptr_load, #44 ]
3418
3419   vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3420   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3421   vceq.u16 zero_mask, pixels, #0
3422   vtst.u16 draw_mask, draw_mask, test_mask
3423
3424   subs num_blocks, num_blocks, #1
3425   beq 1f
3426
/* Steady state: merge and store block N while fetching block N + 1. */
3427  0:
3428   mov fb_ptr, fb_ptr_next
3429   ldr fb_ptr_next, [ block_ptr_load, #44 ]
3430
3431   vorr.u16 pixels, pixels, msb_mask
3432
3433   vorr.u16 draw_mask_combined, draw_mask, zero_mask
3434   vmov fb_pixels, fb_pixels_next
3435
3436   vld1.u16 { draw_mask_low[], draw_mask_high[] },                              \
3437    [ draw_mask_bits_ptr, :16 ], c_64
3438   vbif.u16 fb_pixels, pixels, draw_mask_combined
3439
3440   vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
3441
/* Unsigned (next - current + 14) <= 28  <=>  |next - current| <= 14 bytes. */
3442   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3443   add fb_ptr_cmp, fb_ptr_cmp, #14
3444   cmp fb_ptr_cmp, #28
3445   bls 4f
3446
/* No overlap: the next load may be issued ahead of the current store. */
3447   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3448   vceq.u16 zero_mask, pixels, #0
3449
3450   vst1.u16 { fb_pixels }, [ fb_ptr ]
3451   vtst.u16 draw_mask, draw_mask, test_mask
3452
3453  3:
3454   subs num_blocks, num_blocks, #1
3455   bne 0b
3456
/* Drain: merge and store the final block. */
3457  1:
3458   vorr.u16 draw_mask_combined, draw_mask, zero_mask
/* Apply msb_mask to the last block too, exactly as the loop does. */
  vorr.u16 pixels, pixels, msb_mask
3459   vbif.u16 fb_pixels_next, pixels, draw_mask_combined
3460
3461   vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3462
3463   ldmia sp!, { r4, pc }
3464
/* Overlap: store first so the following load observes the written pixels. */
3465  4:
3466   vst1.u16 { fb_pixels }, [ fb_ptr ]
3467   vceq.u16 zero_mask, pixels, #0
3468
3469   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3470   vtst.u16 draw_mask, draw_mask, test_mask
3471
3472   bal 3b
3473
3474
/* No-op: returns to the caller immediately without touching any state. */
3475 function(shade_blocks_unshaded_untextured_indirect)
3476   bx lr
3477
3478 .align 3
3479
/*
 * shade_blocks_unshaded_untextured_direct
 *
 * In:  r0 = psx_gpu.
 *
 * Fills the framebuffer pixels of every queued block with one colour vector.
 * The eight 16-bit colour values are read once, from offset 16 of the first
 * block, and ORed with the broadcast 16-bit value at
 * psx_gpu + psx_gpu_mask_msb_offset.  For each of the num_blocks blocks
 * (64 bytes apart) the 128-bit draw mask at block + 0 selects, via VBIF,
 * which of the eight pixels addressed by the pointer at block + 44 are
 * replaced (mask bit clear) and which are kept (mask bit set).
 *
 * The loop is software pipelined, so num_blocks must be at least 1.  When
 * consecutive framebuffer pointers are within 14 bytes of each other the
 * code branches to label 4, which stores the current pixels before loading
 * the next ones.
 *
 * Clobbers: r0-r3, r12, r14, flags, q1, q2, q5-q7 (r4 is saved/restored).
 * NOTE(review): q5-q7 overlap d10-d15, which AAPCS treats as callee-saved.
 */
3480 function(shade_blocks_unshaded_untextured_direct)
3481   stmdb sp!, { r4, r14 }
3482   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
3483
3484   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
3485   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
3486
3487   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
3488   add color_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
3489
/* r0 becomes the cursor over the per-block framebuffer pointers. */
3490   add block_ptr_load, psx_gpu, #(psx_gpu_blocks_offset + 44)
3491   vld1.u16 { pixels }, [ color_ptr, :128 ]
3492
3493   mov c_64, #64
3494   vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
3495
3496   vorr.u16 pixels, pixels, msb_mask
3497   subs num_blocks, num_blocks, #1
3498
/* Neither of the next two loads alters the flags set by the subs above. */
3499   ldr fb_ptr_next, [ block_ptr_load ], #64
3500
3501   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3502   beq 1f
3503
/* Steady state: merge and store block N while fetching block N + 1. */
3504  0:
3505   vmov fb_pixels, fb_pixels_next
3506   mov fb_ptr, fb_ptr_next
3507   ldr fb_ptr_next, [ block_ptr_load ], #64
3508
3509   vbif.u16 fb_pixels, pixels, draw_mask
3510   vld1.u16 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
3511
/* Unsigned (next - current + 14) <= 28  <=>  |next - current| <= 14 bytes. */
3512   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
3513   add fb_ptr_cmp, fb_ptr_cmp, #14
3514   cmp fb_ptr_cmp, #28
3515   bls 4f
3516
3517   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3518   vst1.u16 { fb_pixels }, [ fb_ptr ]
3519
3520  3:
3521   subs num_blocks, num_blocks, #1
3522   bne 0b
3523
/* Drain: merge and store the final block. */
3524  1:
3525   vbif.u16 fb_pixels_next, pixels, draw_mask
3526   vst1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3527
3528   ldmia sp!, { r4, pc }
3529
/* Overlap: store first so the following load observes the written pixels. */
3530  4:
3531   vst1.u16 { fb_pixels }, [ fb_ptr ]
3532   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ]
3533   bal 3b
3534
3535
/* Release the shader aliases that the blend routines bind differently. */
3536 #undef draw_mask_ptr
3537 #undef c_64
3538 #undef fb_ptr
3539 #undef fb_ptr_next
3540 #undef fb_ptr_cmp
3541
/*
 * Core register aliases for the blend_blocks_* builders below.
 * r0 is shared by psx_gpu and draw_mask_ptr, r2 by msb_mask_ptr and c_64.
 * NOTE(review): the builders visible below address the mask MSB through
 * mask_msb_ptr (still defined as r2 by the previous section), not through
 * msb_mask_ptr; both names resolve to r2.
 */
3542 #define psx_gpu                                  r0
3543 #define num_blocks                               r1
3544 #define msb_mask_ptr                             r2
3545 #define pixel_ptr                                r3
3546 #define draw_mask_ptr                            r0
3547 #define c_64                                     r2
3548 #define fb_ptr                                   r12
3549 #define fb_ptr_next                              r14
3550 #define fb_ptr_cmp                               r4
3551
3552 #undef msb_mask
3553 #undef draw_mask
3554 #undef pixels
3555 #undef fb_pixels
3556 #undef d128_0x8000
3557 #undef msb_mask_low
3558 #undef msb_mask_high
3559 #undef draw_mask_next
3560 #undef pixels_g
3561 #undef blend_pixels
3562 #undef fb_pixels_next
3563
/* Quad-register aliases; this first group uses q0-q14, one name each. */
3564 #define msb_mask                                 q0
3565 #define draw_mask                                q1
3566 #define pixels                                   q2
3567 #define fb_pixels                                q3
3568 #define blend_pixels                             q4
3569 #define pixels_no_msb                            q5
3570 #define blend_mask                               q6
3571 #define fb_pixels_no_msb                         q7
3572 #define d128_0x8000                              q8
3573 #define d128_0x0421                              q9
3574 #define fb_pixels_next                           q10
3575 #define blend_pixels_next                        q11
3576 #define pixels_next                              q12
3577 #define draw_mask_next                           q13
3578 #define write_mask                               q14
3579
/*
 * Second group: additional names layered over registers already named above
 * (q5, q7-q13) plus q15.  Two names on one register cannot be live at the
 * same time; note in particular that fb_pixels_g and fb_pixels_masked are
 * both q11, and pixels_mg, pixels_g and pixels_fourth are all q7.
 */
3580 #define pixels_rb                                q5
3581 #define pixels_mg                                q7
3582 #define pixels_g                                 q7
3583 #define d128_0x7C1F                              q8
3584 #define d128_0x03E0                              q9
3585 #define fb_pixels_rb                             q10
3586 #define fb_pixels_g                              q11
3587 #define fb_pixels_masked                         q11
3588 #define d128_0x83E0                              q15
3589 #define pixels_fourth                            q7
3590 #define d128_0x1C07                              q12
3591 #define d128_0x00E0                              q13
3592 #define d128_0x80E0                              q13
3593
/* Double-register halves of msb_mask (q0). */
3594 #define msb_mask_low                             d0
3595 #define msb_mask_high                            d1
3596
/*
 * Per-variant fragments pasted into blend_blocks_average_builder.  The name
 * suffix (textured/untextured, on/off) is chosen by the builder arguments.
 */
/* blend_mask = all-ones in every lane where source has bit 15 set. */
3597 #define blend_blocks_average_set_blend_mask_textured(source)                   \
3598   vclt.s16 blend_mask, source, #0                                              \
3599
/* Set bit 15 in every lane of blend_pixels. */
3600 #define blend_blocks_average_set_stp_bit_textured()                            \
3601   vorr.u16 blend_pixels, #0x8000                                               \
3602
/* Where blend_mask is clear, replace the blended value with source (VBIF). */
3603 #define blend_blocks_average_combine_textured(source)                          \
3604   vbif.u16 blend_pixels, source, blend_mask                                    \
3605   
/* The three untextured counterparts expand to nothing. */
3606 #define blend_blocks_average_set_blend_mask_untextured(source)                 \
3607
3608 #define blend_blocks_average_set_stp_bit_untextured()                          \
3609
3610 #define blend_blocks_average_combine_untextured(source)                        \
3611
/* write_mask = all-ones in every lane where fb_pixels_next has bit 15 set. */
3612 #define blend_blocks_average_mask_set_on()                                     \
3613   vclt.s16 write_mask, fb_pixels_next, #0                                      \
3614
/* draw_mask = draw_mask_next with the write_mask lanes added. */
3615 #define blend_blocks_average_mask_copy_on()                                    \
3616   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
3617
/* Same merge, but in place on draw_mask_next (used for the final block). */
3618 #define blend_blocks_average_mask_copy_b_on()                                  \
3619   vorr.u16 draw_mask_next, draw_mask_next, write_mask                          \
3620
/* "off" variants: no write_mask is computed or merged. */
3621 #define blend_blocks_average_mask_set_off()                                    \
3622
/* draw_mask is a plain copy of draw_mask_next. */
3623 #define blend_blocks_average_mask_copy_off()                                   \
3624   vmov draw_mask, draw_mask_next                                               \
3625
3626 #define blend_blocks_average_mask_copy_b_off()                                 \
3627
/*
 * blend_blocks_average_builder(texturing, mask_evaluate)
 *
 * Emits blend_blocks_<texturing>_average_<mask_evaluate>, taking psx_gpu in
 * r0.  For each of the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) the routine reads the 128-bit draw mask at
 * block + 0, eight 16-bit source pixels at block + 16 and the framebuffer
 * pointer at block + 44, averages source and framebuffer pixels, ORs in the
 * broadcast 16-bit value found at psx_gpu + psx_gpu_mask_msb_offset and
 * writes the result to every lane whose draw mask is clear.
 *
 * Averaging: with bit 15 removed from both operands the code computes
 *   vhadd(fb, src - ((src ^ fb) & 0x0421))
 * 0x0421 has bits 0, 5 and 10 set - the lowest bit of each 5-bit field - so
 * subtracting the differing low bits first keeps the halving add from
 * shifting one field's low bit into its neighbour.
 *
 * texturing = textured:   lanes whose source pixel has bit 15 clear receive
 *                         the source pixel unblended; blended lanes get bit
 *                         15 set.  untextured: every lane is blended.
 * mask_evaluate = on:     lanes whose framebuffer pixel has bit 15 set are
 *                         added to the draw mask, so they are left
 *                         untouched.  off: the draw mask is used as loaded.
 *
 * The loop is software pipelined, so num_blocks must be at least 1.  When
 * consecutive framebuffer pointers are within 14 bytes of each other the
 * code takes the path at label 2, which stores the current block before
 * loading the next one.  That path reloads fb_pixels_next, so it has to
 * recompute write_mask from the fresh data as well; otherwise the next block
 * would be masked with the previous block's framebuffer contents.
 *
 * Clobbers: r0-r3, r12, r14, flags, q0-q14 (r4 is saved and restored).
 * NOTE(review): q4-q7 overlap d8-d15, which AAPCS treats as callee-saved.
 */
3628 #define blend_blocks_average_builder(texturing, mask_evaluate)                 \
3629 .align 3;                                                                      \
3630                                                                                \
3631 function(blend_blocks_##texturing##_average_##mask_evaluate)                   \
3632   stmdb sp!, { r4, r14 };                                                      \
3633   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3634   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
3635                                                                                \
3636   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3637   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
3638                                                                                \
  /* r0 (psx_gpu) and r2 (mask_msb_ptr) are repurposed from here on. */        \
3639   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3640   mov c_64, #64;                                                               \
3641                                                                                \
  /* Prime the pipeline with block 0; 0x0421 is built as 0x0400 | 0x0021. */   \
3642   vmov.u16 d128_0x8000, #0x8000;                                               \
3643   vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64;                  \
3644   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3645                                                                                \
3646   vmov.u16 d128_0x0421, #0x0400;                                               \
3647   vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64;                         \
3648                                                                                \
3649   vorr.u16 d128_0x0421, #0x0021;                                               \
3650   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ];                                \
3651                                                                                \
3652   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3653   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3654   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3655   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3656   blend_blocks_average_mask_set_##mask_evaluate();                             \
3657   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3658                                                                                \
3659   subs num_blocks, num_blocks, #1;                                             \
3660   beq 1f;                                                                      \
3661                                                                                \
  /* Steady state: finish block N while fetching and preparing block N + 1. */ \
3662  0:                                                                            \
3663   mov fb_ptr, fb_ptr_next;                                                     \
3664   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3665                                                                                \
3666   vmov pixels, pixels_next;                                                    \
3667   vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64;                         \
3668                                                                                \
3669   vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
3670                                                                                \
3671   blend_blocks_average_mask_copy_##mask_evaluate();                            \
3672   vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64;                  \
3673                                                                                \
3674   blend_blocks_average_set_blend_mask_##texturing(pixels);                     \
3675   blend_blocks_average_set_stp_bit_##texturing();                              \
3676   vmov fb_pixels, fb_pixels_next;                                              \
3677   blend_blocks_average_combine_##texturing(pixels);                            \
3678                                                                                \
  /* Unsigned (next - current + 14) <= 28  <=>  |next - current| <= 14. */     \
3679   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3680   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3681   cmp fb_ptr_cmp, #28;                                                         \
3682   bls 2f;                                                                      \
3683                                                                                \
  /* No overlap: block N + 1 may be loaded before block N is stored. */        \
3684   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ];                                \
3685   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3686                                                                                \
3687   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3688   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3689                                                                                \
3690   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3691   vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
3692                                                                                \
3693   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3694   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
3695   blend_blocks_average_mask_set_##mask_evaluate();                             \
3696   vst1.u16 { fb_pixels }, [ fb_ptr ];                                          \
3697                                                                                \
3698  3:                                                                            \
3699   subs num_blocks, num_blocks, #1;                                             \
3700   bne 0b;                                                                      \
3701                                                                                \
  /* Drain: blend, mask and store the final block. */                          \
3702  1:                                                                            \
3703   blend_blocks_average_mask_copy_b_##mask_evaluate();                          \
3704   vhadd.u16 blend_pixels, fb_pixels_no_msb, blend_pixels_next;                 \
3705                                                                                \
3706   blend_blocks_average_set_blend_mask_##texturing(pixels_next);                \
3707   blend_blocks_average_set_stp_bit_##texturing();                              \
3708   blend_blocks_average_combine_##texturing(pixels_next);                       \
3709                                                                                \
3710   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3711   vbif.u16 fb_pixels_next, blend_pixels, draw_mask_next;                       \
3712   vst1.u16 { fb_pixels_next }, [ fb_ptr_next ];                                \
3713                                                                                \
3714   ldmia sp!, { r4, pc };                                                       \
3715                                                                                \
  /* Overlap: store block N first, then load and prepare block N + 1. */       \
3716  2:                                                                            \
3717   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3718   vbif.u16 fb_pixels, blend_pixels, draw_mask;                                 \
3719   vst1.u16 { fb_pixels }, [ fb_ptr ];                                          \
3720                                                                                \
3721   vld1.u16 { fb_pixels_next }, [ fb_ptr_next ];                                \
3722   veor.u16 blend_pixels_next, pixels_next, fb_pixels_next;                     \
3723   vbic.u16 pixels_no_msb, pixels_next, d128_0x8000;                            \
3724   vand.u16 blend_pixels_next, blend_pixels_next, d128_0x0421;                  \
3725   vsub.u16 blend_pixels_next, pixels_no_msb, blend_pixels_next;                \
  /* write_mask must follow the framebuffer pixels that were just reloaded. */ \
  blend_blocks_average_mask_set_##mask_evaluate();                             \
3726   vbic.u16 fb_pixels_no_msb, fb_pixels_next, d128_0x8000;                      \
3727                                                                                \
3728   bal 3b                                                                       \
3729
/*
 * Emit the four averaging blenders:
 * blend_blocks_{textured,untextured}_average_{off,on}.
 */
3730 blend_blocks_average_builder(textured, off)
3731 blend_blocks_average_builder(untextured, off)
3732 blend_blocks_average_builder(textured, on)
3733 blend_blocks_average_builder(untextured, on)
3734
3735
/*
 * Per-variant fragments pasted into the blend_blocks_add_* builders; the
 * on/off suffix is chosen by the builder's mask_evaluate argument.
 */
/* write_mask = all-ones in every lane where fb_pixels has bit 15 set. */
3736 #define blend_blocks_add_mask_set_on()                                         \
3737   vclt.s16 write_mask, fb_pixels, #0                                           \
3738
/* Add the write_mask lanes to draw_mask, in place. */
3739 #define blend_blocks_add_mask_copy_on()                                        \
3740   vorr.u16 draw_mask, draw_mask, write_mask                                    \
3741
/* "off" variants expand to nothing: draw_mask is used exactly as loaded. */
3742 #define blend_blocks_add_mask_set_off()                                        \
3743
3744 #define blend_blocks_add_mask_copy_off()                                       \
3745
3746
/*
 * blend_blocks_add_textured_builder(mask_evaluate)
 *
 * Emits blend_blocks_textured_add_<mask_evaluate>, taking psx_gpu in r0.
 * For each of the num_blocks queued blocks (64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset) the routine reads the 128-bit draw mask at
 * block + 0, eight 16-bit source pixels at block + 16 and the framebuffer
 * pointer at block + 44, then adds source and framebuffer pixels field by
 * field with saturation and writes the result to every lane whose draw mask
 * is clear.
 *
 *   - Lanes whose source pixel has bit 15 clear get a zeroed framebuffer
 *     operand (fb_pixels & blend_mask), so the source passes through
 *     unchanged.
 *   - The fields at bits 0-4 and 10-14 (mask 0x7C1F) are added together and
 *     saturated with a byte-wise minimum, which works because they sit in
 *     different bytes of the halfword.
 *   - The field at bits 5-9 is added together with bit 15 of the source ORed
 *     with the broadcast 16-bit value at psx_gpu + psx_gpu_mask_msb_offset
 *     (mask 0x83E0) and clamped with a halfword minimum against 0x83E0.
 *   - mask_evaluate = on additionally protects lanes whose framebuffer pixel
 *     has bit 15 set; off uses the draw mask as loaded.
 *
 * fb_pixels_masked and fb_pixels_g are the same register, so fb_pixels_rb
 * must be extracted before fb_pixels_g overwrites the masked value.
 *
 * The loop is software pipelined, so num_blocks must be at least 1.  When
 * consecutive framebuffer pointers are within 14 bytes of each other the
 * code takes the path at label 2, which stores the current block before
 * loading the next one.
 *
 * Clobbers: r0-r3, r12, r14, flags and NEON registers q0-q11, q14 and q15
 * (r4 is saved and restored).
 * NOTE(review): q4-q7 overlap d8-d15, which AAPCS treats as callee-saved.
 */
3747 #define blend_blocks_add_textured_builder(mask_evaluate)                       \
3748 .align 3;                                                                      \
3749                                                                                \
3750 function(blend_blocks_textured_add_##mask_evaluate)                            \
3751   stmdb sp!, { r4, r14 };                                                      \
3752   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3753   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
3754                                                                                \
3755   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3756   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
3757                                                                                \
  /* r0 (psx_gpu) and r2 (mask_msb_ptr) are repurposed from here on. */        \
3758   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3759   mov c_64, #64;                                                               \
3760                                                                                \
  /* Build 0x7C1F, 0x03E0 and 0x83E0 (= 0x8000 | 0x03E0) in every lane. */     \
3761   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3762   vmov.u16 d128_0x03E0, #0x0300;                                               \
3763   vmov.u16 d128_0x83E0, #0x8000;                                               \
3764   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3765   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3766   vorr.u16 d128_0x83E0, d128_0x83E0, d128_0x03E0;                              \
3767                                                                                \
  /* Prime the pipeline: load block 0 and compute its saturated sums. */       \
3768   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
3769   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3770   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
3771   vclt.s16 blend_mask, pixels, #0;                                             \
3772   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3773   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3774   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3775                                                                                \
3776   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3777   vorr.u16 pixels, pixels, msb_mask;                                           \
3778   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3779   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
3780   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3781   vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
3782   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3783   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
3784   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3785   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
3786                                                                                \
3787   subs num_blocks, num_blocks, #1;                                             \
3788   beq 1f;                                                                      \
3789                                                                                \
  /* Steady state: finish block N while fetching and preparing block N + 1. */ \
3790  0:                                                                            \
3791   mov fb_ptr, fb_ptr_next;                                                     \
3792                                                                                \
3793   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3794                                                                                \
3795   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
3796   vclt.s16 blend_mask, pixels, #0;                                             \
3797                                                                                \
3798   vorr.u16 pixels, pixels, msb_mask;                                           \
3799   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3800   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
3801                                                                                \
  /* VBIT: keep the old framebuffer pixel wherever draw_mask is set. */        \
3802   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3803   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
3804                                                                                \
  /* Unsigned (next - current + 14) <= 28  <=>  |next - current| <= 14. */     \
3805   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3806   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3807   cmp fb_ptr_cmp, #28;                                                         \
3808   bls 2f;                                                                      \
3809                                                                                \
  /* No overlap: block N + 1 may be loaded before block N is stored. */        \
3810   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3811   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3812   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3813   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3814   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3815   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3816   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
3817                                                                                \
  /* Common tail of both paths: saturated sums for block N + 1. */             \
3818  3:                                                                            \
3819   vand.u16 fb_pixels_g, fb_pixels_masked, d128_0x03E0;                         \
3820   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3821   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_mg;                                \
3822   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3823   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x83E0;                              \
3824                                                                                \
3825   subs num_blocks, num_blocks, #1;                                             \
3826   bne 0b;                                                                      \
3827                                                                                \
  /* Drain: combine, mask and store the final block. */                        \
3828  1:                                                                            \
3829   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3830   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3831   vst1.u16 { blend_pixels }, [ fb_ptr_next ];                                  \
3832                                                                                \
3833   ldmia sp!, { r4, pc };                                                       \
3834                                                                                \
  /* Overlap: store block N first, then load and prepare block N + 1. */       \
3835  2:                                                                            \
3836   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
3837   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3838                                                                                \
3839   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3840   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3841   vand.u16 fb_pixels_masked, fb_pixels, blend_mask;                            \
3842   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3843   vand.u16 fb_pixels_rb, fb_pixels_masked, d128_0x7C1F;                        \
3844   bal 3b                                                                       \
3845
3846
3847 #define blend_blocks_add_untextured_builder(mask_evaluate)                     \
3848 .align 3;                                                                      \
3849                                                                                \
3850 function(blend_blocks_untextured_add_##mask_evaluate)                          \
3851   stmdb sp!, { r4, r14 };                                                      \
3852   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3853   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
3854                                                                                \
3855   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
3856   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
3857                                                                                \
3858   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3859   mov c_64, #64;                                                               \
3860                                                                                \
3861   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3862   vmov.u16 d128_0x03E0, #0x0300;                                               \
3863   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3864   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3865                                                                                \
3866   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
3867   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3868   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
3869   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3870   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3871   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3872                                                                                \
3873   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3874   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3875   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3876   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3877   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3878   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3879   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3880   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3881                                                                                \
3882   subs num_blocks, num_blocks, #1;                                             \
3883   beq 1f;                                                                      \
3884                                                                                \
3885  0:                                                                            \
3886   mov fb_ptr, fb_ptr_next;                                                     \
3887                                                                                \
3888   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
3889                                                                                \
3890   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
3891                                                                                \
3892   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3893   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3894   vand.u16 pixels_g, pixels, d128_0x03E0;                                      \
3895                                                                                \
3896   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3897   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
3898                                                                                \
3899   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
3900   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
3901   cmp fb_ptr_cmp, #28;                                                         \
3902   bls 2f;                                                                      \
3903                                                                                \
3904   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3905   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3906   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3907   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3908   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3909   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
3910                                                                                \
3911  3:                                                                            \
3912   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
3913   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
3914   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
3915   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
3916   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
3917                                                                                \
3918   subs num_blocks, num_blocks, #1;                                             \
3919   bne 0b;                                                                      \
3920                                                                                \
3921  1:                                                                            \
3922   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
3923   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
3924   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
3925   vst1.u16 { blend_pixels }, [ fb_ptr_next ];                                  \
3926                                                                                \
3927   ldmia sp!, { r4, pc };                                                       \
3928                                                                                \
3929  2:                                                                            \
3930   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
3931   vand.u16 pixels_rb, pixels, d128_0x7C1F;                                     \
3932                                                                                \
3933   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
3934   blend_blocks_add_mask_set_##mask_evaluate();                                 \
3935   blend_blocks_add_mask_copy_##mask_evaluate();                                \
3936   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
3937   bal 3b                                                                       \
3938
3939
/*
 * Instantiate the textured and untextured add builders, once with the
 * argument off and once with the argument on.
 */
3940 blend_blocks_add_textured_builder(off)
3941 blend_blocks_add_textured_builder(on)
3942 blend_blocks_add_untextured_builder(off)
3943 blend_blocks_add_untextured_builder(on)
3944
/*
 * Per-texturing hooks that blend_blocks_subtract_builder pastes in through
 * ##texturing. Every untextured variant except set_stb expands to nothing.
 */

/* blend_mask = all-ones in each 16-bit lane of pixels_next with bit 15 set. */
3945 #define blend_blocks_subtract_set_blend_mask_textured()                        \
3946   vclt.s16 blend_mask, pixels_next, #0                                         \
3947
/* Where blend_mask is clear, replace blend_pixels with pixels. */
3948 #define blend_blocks_subtract_combine_textured()                               \
3949   vbif.u16 blend_pixels, pixels, blend_mask                                    \
3950
/* Set bit 15 in every 16-bit lane of blend_pixels. */
3951 #define blend_blocks_subtract_set_stb_textured()                               \
3952   vorr.u16 blend_pixels, #0x8000                                               \
3953
/* pixels = pixels_next | msb_mask. */
3954 #define blend_blocks_subtract_msb_mask_textured()                              \
3955   vorr.u16 pixels, pixels_next, msb_mask                                       \
3956
/* Untextured: no blend mask is computed. */
3957 #define blend_blocks_subtract_set_blend_mask_untextured()                      \
3958
/* Untextured: nothing to merge back in. */
3959 #define blend_blocks_subtract_combine_untextured()                             \
3960
/* Untextured: blend_pixels |= msb_mask. */
3961 #define blend_blocks_subtract_set_stb_untextured()                             \
3962   vorr.u16 blend_pixels, blend_pixels, msb_mask                                \
3963
/* Untextured: pixels is not formed. */
3964 #define blend_blocks_subtract_msb_mask_untextured()                            \
3965
3966
/*
 * Per-mask_evaluate hooks that blend_blocks_subtract_builder pastes in
 * through ##mask_evaluate. The builder later uses draw_mask to restore the
 * original framebuffer value in every lane where it is set, so the "on"
 * variants also keep framebuffer pixels whose bit 15 is already set.
 */

/* write_mask = all-ones in each 16-bit lane of fb_pixels with bit 15 set. */
3967 #define blend_blocks_subtract_mask_set_on()                                    \
3968   vclt.s16 write_mask, fb_pixels, #0                                           \
3969
/* draw_mask = draw_mask_next | write_mask. */
3970 #define blend_blocks_subtract_mask_copy_on()                                   \
3971   vorr.u16 draw_mask, draw_mask_next, write_mask                               \
3972
/* Off: the framebuffer contents do not contribute to the mask. */
3973 #define blend_blocks_subtract_mask_set_off()                                   \
3974
/* Off: draw_mask = draw_mask_next. */
3975 #define blend_blocks_subtract_mask_copy_off()                                  \
3976   vmov draw_mask, draw_mask_next                                               \
3977
3978
/*
 * blend_blocks_subtract_builder(texturing, mask_evaluate)
 *
 * Emits the function blend_blocks_<texturing>_subtract_<mask_evaluate>.
 * It walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset. For each record it subtracts the block's
 * pixels from the framebuffer vector at the record's framebuffer pointer,
 * saturating each colour field at zero, and stores the result back.
 *
 * Record fields as addressed here: draw mask at +0, pixels at +16,
 * framebuffer pointer at +44 (pixel_ptr + 28).
 *
 * The loop is software-pipelined: the difference for a block is computed
 * one step before it is combined and stored, so statement order matters.
 *
 * NOTE(review): the block count is decremented before it is first tested,
 * so this appears to assume num_blocks >= 1 - confirm against callers.
 */
3979 #define blend_blocks_subtract_builder(texturing, mask_evaluate)                \
3980 .align 3;                                                                      \
3981                                                                                \
3982 function(blend_blocks_##texturing##_subtract_##mask_evaluate)                  \
3983   stmdb sp!, { r4, r14 };                                                      \
3984   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
3985   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
3986                                                                                \
3987   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
       /* Replicate the 16-bit mask_msb value into every lane of msb_mask. */      \
3988   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
3989                                                                                \
3990   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
3991   mov c_64, #64;                                                               \
3992                                                                                \
       /* Build the 0x7C1F and 0x03E0 field masks in every 16-bit lane. */         \
3993   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
3994   vmov.u16 d128_0x03E0, #0x0300;                                               \
3995   vorr.u16 d128_0x7C1F, #0x001F;                                               \
3996   vorr.u16 d128_0x03E0, #0x00E0;                                               \
3997                                                                                \
       /* Prologue: load block 0 and compute its saturated difference. */          \
3998   vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64;                  \
3999   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4000   vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64;                         \
4001   blend_blocks_subtract_set_blend_mask_##texturing();                          \
4002   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4003   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4004   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4005                                                                                \
4006   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4007   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4008   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
       /* The two 0x7C1F fields sit in separate bytes, so a byte-wise */           \
       /* saturating subtract clamps each of them on its own. */                   \
4009   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4010   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4011                                                                                \
4012   subs num_blocks, num_blocks, #1;                                             \
4013   beq 1f;                                                                      \
4014                                                                                \
       /* Main loop: finish the pending block while fetching the next one. */      \
4015  0:                                                                            \
4016   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4017   mov fb_ptr, fb_ptr_next;                                                     \
4018   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4019                                                                                \
4020   vld1.u32 { draw_mask_next }, [ draw_mask_ptr, :128 ], c_64;                  \
       /* Uses the pending block's pixels_next before it is reloaded below. */     \
4021   blend_blocks_subtract_msb_mask_##texturing();                                \
4022                                                                                \
4023   vld1.u32 { pixels_next }, [ pixel_ptr, :128 ], c_64;                         \
4024   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4025   vand.u16 pixels_rb, pixels_next, d128_0x7C1F;                                \
4026   blend_blocks_subtract_set_stb_##texturing();                                 \
4027   vand.u16 pixels_g, pixels_next, d128_0x03E0;                                 \
4028   blend_blocks_subtract_combine_##texturing();                                 \
       /* blend_mask is recomputed here for the block just fetched. */             \
4029   blend_blocks_subtract_set_blend_mask_##texturing();                          \
       /* Lanes set in draw_mask keep the original framebuffer value. */           \
4030   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4031                                                                                \
       /* If the next destination starts within 14 bytes of this one, the */       \
       /* two 16-byte spans overlap: take path 2, which stores first. */           \
4032   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4033   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4034   cmp fb_ptr_cmp, #28;                                                         \
4035   bls 2f;                                                                      \
4036                                                                                \
       /* No overlap: the next load is issued ahead of the pending store. */       \
4037   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4038   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4039   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4040   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4041   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4042   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4043   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4044                                                                                \
4045  3:                                                                            \
4046   subs num_blocks, num_blocks, #1;                                             \
4047   bne 0b;                                                                      \
4048                                                                                \
       /* Tail: combine and store the final block. */                              \
4049  1:                                                                            \
4050   blend_blocks_subtract_mask_copy_##mask_evaluate();                           \
4051                                                                                \
4052   blend_blocks_subtract_msb_mask_##texturing();                                \
4053   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4054   blend_blocks_subtract_set_stb_##texturing();                                 \
4055   blend_blocks_subtract_combine_##texturing();                                 \
4056   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4057   vst1.u16 { blend_pixels }, [ fb_ptr_next ];                                  \
4058                                                                                \
       /* Restore r4 and return by popping the saved r14 into pc. */               \
4059   ldmia sp!, { r4, pc };                                                       \
4060                                                                                \
       /* Overlap path: store first so the reload observes the new pixels. */      \
4061  2:                                                                            \
4062   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4063   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4064   blend_blocks_subtract_mask_set_##mask_evaluate();                            \
4065   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4066   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4067   vqsub.u8 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4068   vqsub.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                \
4069   bal 3b                                                                       \
4070
4071
/*
 * Instantiate the four subtract routines:
 * blend_blocks_{textured,untextured}_subtract_{off,on}.
 */
4072 blend_blocks_subtract_builder(textured, off)
4073 blend_blocks_subtract_builder(textured, on)
4074 blend_blocks_subtract_builder(untextured, off)
4075 blend_blocks_subtract_builder(untextured, on)
4076
4077
/*
 * blend_blocks_add_fourth_textured_builder(mask_evaluate)
 *
 * Emits the function blend_blocks_textured_add_fourth_<mask_evaluate>.
 * It walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset. For each record it adds one quarter of
 * every colour field of the block's pixels to the framebuffer vector at the
 * record's framebuffer pointer, clamps each field to its maximum, and
 * stores the result back.
 *
 * Only lanes whose pixel has bit 15 set are blended; the other lanes take
 * the block's pixel unchanged. msb_mask is then ORed into every lane, and
 * lanes set in draw_mask keep the original framebuffer value.
 *
 * Record fields as addressed here: draw mask at +0, pixels at +16,
 * framebuffer pointer at +44 (pixel_ptr + 28).
 *
 * The blend_blocks_add_mask_set_/copy_ hooks are defined elsewhere in this
 * file; presumably they mirror the subtract mask hooks - TODO confirm.
 *
 * NOTE(review): the block count is decremented before it is first tested,
 * so this appears to assume num_blocks >= 1 - confirm against callers.
 */
4078 #define blend_blocks_add_fourth_textured_builder(mask_evaluate)                \
4079 .align 3;                                                                      \
4080                                                                                \
4081 function(blend_blocks_textured_add_fourth_##mask_evaluate)                     \
4082   stmdb sp!, { r4, r14 };                                                      \
4083   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4084   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
4085                                                                                \
4086   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
       /* Replicate the 16-bit mask_msb value into every lane of msb_mask. */      \
4087   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
4088                                                                                \
4089   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4090   mov c_64, #64;                                                               \
4091                                                                                \
       /* Field masks: 0x7C1F / 0x03E0 select the full fields, 0x1C07 / */         \
       /* 0x00E0 select the same fields after the shift right by two. */           \
4092   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4093   vmov.u16 d128_0x03E0, #0x0300;                                               \
4094   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4095   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4096   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4097   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4098   vorr.u16 d128_0x1C07, #0x0007;                                               \
4099                                                                                \
       /* Prologue: load block 0 and compute its clamped sum. */                   \
4100   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
4101   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4102   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
       /* blend_mask = lanes whose pixel has bit 15 set. */                        \
4103   vclt.s16 blend_mask, pixels, #0;                                             \
4104   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4105   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4106   vshr.s16 pixels_fourth, pixels, #2;                                          \
4107   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4108                                                                                \
4109   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4110   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4111   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4112   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4113   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4114   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
       /* The two 0x7C1F fields sit in separate bytes, so a byte-wise */           \
       /* minimum clamps each of them on its own. */                               \
4115   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4116   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4117                                                                                \
4118   subs num_blocks, num_blocks, #1;                                             \
4119   beq 1f;                                                                      \
4120                                                                                \
       /* Main loop: finish the pending block while fetching the next one. */      \
4121  0:                                                                            \
4122   mov fb_ptr, fb_ptr_next;                                                     \
4123   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4124                                                                                \
4125   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
       /* Unblended lanes take the pending block's pixel, before reload. */        \
4126   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4127                                                                                \
4128   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
4129   vclt.s16 blend_mask, pixels, #0;                                             \
4130   vshr.s16 pixels_fourth, pixels, #2;                                          \
4131   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4132   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4133                                                                                \
       /* Lanes set in draw_mask keep the original framebuffer value. */           \
4134   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4135   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
4136                                                                                \
       /* If the next destination starts within 14 bytes of this one, the */       \
       /* two 16-byte spans overlap: take path 2, which stores first. */           \
4137   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4138   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4139   cmp fb_ptr_cmp, #28;                                                         \
4140   bls 2f;                                                                      \
4141                                                                                \
       /* No overlap: the next load is issued ahead of the pending store. */       \
4142   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4143   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4144   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4145   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4146   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4147   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4148                                                                                \
4149  3:                                                                            \
4150   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4151   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4152   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4153   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4154   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4155                                                                                \
4156   subs num_blocks, num_blocks, #1;                                             \
4157   bne 0b;                                                                      \
4158                                                                                \
       /* Tail: combine and store the final block. Same order as the loop */       \
       /* body: merge the unblended lanes first, then OR in msb_mask, so */        \
       /* unblended pixels of the last block also receive the mask bit. */         \
4159  1:                                                                            \
4160   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4161   vbif.u16 blend_pixels, pixels, blend_mask;                                   \
4162   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4163   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4164   vst1.u16 { blend_pixels }, [ fb_ptr_next ];                                  \
4165                                                                                \
       /* Restore r4 and return by popping the saved r14 into pc. */               \
4166   ldmia sp!, { r4, pc };                                                       \
4167                                                                                \
       /* Overlap path: store first so the reload observes the new pixels. */      \
4168  2:                                                                            \
4169   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4170   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4171                                                                                \
4172   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4173   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4174   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4175   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4176   bal 3b                                                                       \
4177
4178
4179
/*
 * blend_blocks_add_fourth_untextured_builder(mask_evaluate)
 *
 * Emits the function blend_blocks_untextured_add_fourth_<mask_evaluate>.
 * It walks num_blocks block records, 64 bytes apart, starting at
 * psx_gpu + psx_gpu_blocks_offset. For each record it adds one quarter of
 * every colour field of the block's pixels to the framebuffer vector at the
 * record's framebuffer pointer, clamps each field to its maximum, ORs in
 * msb_mask, and stores the result back. Lanes set in draw_mask keep the
 * original framebuffer value.
 *
 * Record fields as addressed here: draw mask at +0, pixels at +16,
 * framebuffer pointer at +44 (pixel_ptr + 28).
 *
 * The blend_blocks_add_mask_set_/copy_ hooks are defined elsewhere in this
 * file; presumably they mirror the subtract mask hooks - TODO confirm.
 *
 * NOTE(review): the block count is decremented before it is first tested,
 * so this appears to assume num_blocks >= 1 - confirm against callers.
 */
4180 #define blend_blocks_add_fourth_untextured_builder(mask_evaluate)              \
4181 .align 3;                                                                      \
4182                                                                                \
4183 function(blend_blocks_untextured_add_fourth_##mask_evaluate)                   \
4184   stmdb sp!, { r4, r14 };                                                      \
4185   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset;                         \
4186   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
4187                                                                                \
4188   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16);                       \
       /* Replicate the 16-bit mask_msb value into every lane of msb_mask. */      \
4189   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ];         \
4190                                                                                \
4191   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset;                          \
4192   mov c_64, #64;                                                               \
4193                                                                                \
       /* Field masks: 0x7C1F / 0x03E0 select the full fields, 0x1C07 / */         \
       /* 0x00E0 select the same fields after the shift right by two. */           \
4194   vmov.u16 d128_0x7C1F, #0x7C00;                                               \
4195   vmov.u16 d128_0x03E0, #0x0300;                                               \
4196   vmov.u16 d128_0x1C07, #0x1C00;                                               \
4197   vmov.u16 d128_0x00E0, #0x00E0;                                               \
4198   vorr.u16 d128_0x7C1F, #0x001F;                                               \
4199   vorr.u16 d128_0x03E0, #0x00E0;                                               \
4200   vorr.u16 d128_0x1C07, #0x0007;                                               \
4201                                                                                \
       /* Prologue: load block 0 and compute its clamped sum. */                   \
4202   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
4203   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4204   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
4205   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4206   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4207   vshr.s16 pixels_fourth, pixels, #2;                                          \
4208   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4209                                                                                \
4210   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4211   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4212   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4213   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4214   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4215   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
       /* The two 0x7C1F fields sit in separate bytes, so a byte-wise */           \
       /* minimum clamps each of them on its own. */                               \
4216   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4217   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4218                                                                                \
4219   subs num_blocks, num_blocks, #1;                                             \
4220   beq 1f;                                                                      \
4221                                                                                \
       /* Main loop: finish the pending block while fetching the next one. */      \
4222  0:                                                                            \
4223   mov fb_ptr, fb_ptr_next;                                                     \
4224   ldr fb_ptr_next, [ pixel_ptr, #28 ];                                         \
4225                                                                                \
4226   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64;                              \
4227                                                                                \
4228   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4229   vshr.s16 pixels_fourth, pixels, #2;                                          \
4230   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4231   vand.u16 pixels_rb, pixels_fourth, d128_0x1C07;                              \
4232                                                                                \
       /* Lanes set in draw_mask keep the original framebuffer value. */           \
4233   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4234   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
4235                                                                                \
       /* If the next destination starts within 14 bytes of this one, the */       \
       /* two 16-byte spans overlap: take path 2, which stores first. */           \
4236   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
4237   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
4238   cmp fb_ptr_cmp, #28;                                                         \
4239   bls 2f;                                                                      \
4240                                                                                \
       /* No overlap: the next load is issued ahead of the pending store. */       \
4241   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4242   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4243   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4244   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4245   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4246   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4247                                                                                \
4248  3:                                                                            \
4249   vand.u16 fb_pixels_g, fb_pixels, d128_0x03E0;                                \
4250   vadd.u16 fb_pixels_rb, fb_pixels_rb, pixels_rb;                              \
4251   vadd.u16 fb_pixels_g, fb_pixels_g, pixels_g;                                 \
4252   vmin.u8 fb_pixels_rb, fb_pixels_rb, d128_0x7C1F;                             \
4253   vmin.u16 fb_pixels_g, fb_pixels_g, d128_0x03E0;                              \
4254                                                                                \
4255   subs num_blocks, num_blocks, #1;                                             \
4256   bne 0b;                                                                      \
4257                                                                                \
       /* Tail: combine and store the final block. */                              \
4258  1:                                                                            \
4259   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
4260   vorr.u16 blend_pixels, blend_pixels, msb_mask;                               \
4261   vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
4262   vst1.u16 { blend_pixels }, [ fb_ptr_next ];                                  \
4263                                                                                \
       /* Restore r4 and return by popping the saved r14 into pc. */               \
4264   ldmia sp!, { r4, pc };                                                       \
4265                                                                                \
       /* Overlap path: store first so the reload observes the new pixels. */      \
4266  2:                                                                            \
4267   vst1.u16 { blend_pixels }, [ fb_ptr ];                                       \
4268   vand.u16 pixels_g, pixels_fourth, d128_0x00E0;                               \
4269                                                                                \
4270   vld1.u16 { fb_pixels }, [ fb_ptr_next ];                                     \
4271   blend_blocks_add_mask_set_##mask_evaluate();                                 \
4272   blend_blocks_add_mask_copy_##mask_evaluate();                                \
4273   vand.u16 fb_pixels_rb, fb_pixels, d128_0x7C1F;                               \
4274   bal 3b                                                                       \
4275
4276
/*
 * Instantiate the four add-fourth routines:
 * blend_blocks_{textured,untextured}_add_fourth_{off,on}.
 */
4277 blend_blocks_add_fourth_textured_builder(off)
4278 blend_blocks_add_fourth_textured_builder(on)
4279 blend_blocks_add_fourth_untextured_builder(off)
4280 blend_blocks_add_fourth_untextured_builder(on)
4281
4282 // TODO: Optimize this more. Need a scene that actually uses it for
4283 // confirmation.
4284
// blend_blocks_textured_unblended_on
//
// For each of num_blocks block records (64 bytes apart, starting at
// psx_gpu + psx_gpu_blocks_offset), writes the block's pixels to the
// framebuffer vector at the record's framebuffer pointer. A lane is left
// untouched when it is set in the block's draw mask or when the existing
// framebuffer halfword has bit 15 set.
//
// Each block is stored before the next block's framebuffer pixels are
// loaded, so no overlap check is needed here.
//
// NOTE(review): msb_mask is loaded below but is not referenced again by
// name in this routine - confirm whether the load is still needed.
// NOTE(review): the block count is decremented before it is first tested,
// so this appears to assume num_blocks >= 1 - confirm against callers.
4285 .align 3
4286
4287 function(blend_blocks_textured_unblended_on)         
4288   stmdb sp!, { r4, r14 }
4289   add mask_msb_ptr, psx_gpu, #psx_gpu_mask_msb_offset
4290   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
4291
4292   add pixel_ptr, psx_gpu, #(psx_gpu_blocks_offset + 16)
4293   vld1.u16 { msb_mask_low[], msb_mask_high[] }, [ mask_msb_ptr, :16 ]
4294
4295   add draw_mask_ptr, psx_gpu, #psx_gpu_blocks_offset
4296   mov c_64, #64
4297
       // Prologue: fetch block 0 (framebuffer pointer at pixel_ptr + 28).
4298   ldr fb_ptr, [ pixel_ptr, #28 ]
4299   vld1.u16 { fb_pixels }, [ fb_ptr ]
4300   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
       // write_mask = lanes whose framebuffer halfword has bit 15 set.
4301   vclt.s16 write_mask, fb_pixels, #0
4302   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64
4303
4304   subs num_blocks, num_blocks, #1
4305   beq 1f
4306
4307  0:
       // Insert pixels into fb_pixels wherever the combined mask is clear.
4308   vorr.u16 draw_mask, draw_mask, write_mask
4309   vbif.u16 fb_pixels, pixels, draw_mask
4310   vst1.u16 { fb_pixels }, [ fb_ptr ]
4311
       // Fetch the next block.
4312   ldr fb_ptr, [ pixel_ptr, #28 ]
4313   vld1.u16 { fb_pixels }, [ fb_ptr ]
4314   vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64
4315   vclt.s16 write_mask, fb_pixels, #0
4316   vld1.u32 { pixels }, [ pixel_ptr, :128 ], c_64
4317
4318   subs num_blocks, num_blocks, #1
4319   bne 0b
4320  
       // Tail: write the final block.
4321  1:
4322   vorr.u16 draw_mask, draw_mask, write_mask
4323   vbif.u16 fb_pixels, pixels, draw_mask
4324   vst1.u16 { fb_pixels }, [ fb_ptr ]
4325
       // Restore r4 and return by popping the saved r14 into pc.
4326   ldmia sp!, { r4, pc }
4327
4328
// blend_blocks_textured_unblended_off
// No-op variant: returns to the caller without reading or writing anything.
4329 function(blend_blocks_textured_unblended_off)
4330   bx lr
4331
4332
// warmup
//
// In:  r0  number of iterations; zero returns immediately
//      r1  start address; the load carries a :128 alignment qualifier
// Each iteration loads into u_whole_8/v_whole_8 from [r1] and then advances
// r1 by 64 bytes. The loaded values are overwritten by the next iteration
// and never stored, so only the memory reads themselves matter -
// presumably to pull the data into the cache; confirm with the callers.
// Clobbers r0, r1, r3, u_whole_8, v_whole_8 and the flags.
4333 function(warmup)
4334   mov r3, #64                                  // post-increment step in bytes
4335   cmp r0, #0
4336   bxeq lr
4337
4338  0:
4339   vld1.u32 { u_whole_8, v_whole_8 }, [ r1, :128 ], r3
4340
4341   subs r0, r0, #1
4342   bne 0b
4343
4344   bx lr
4345
// Register aliases for render_block_fill_body. color and pitch both map to
// r1: the routine copies color into colors_a before it writes pitch.
4346 #undef vram_ptr
4347 #undef color
4348 #undef width
4349 #undef height
4350 #undef pitch
4351
4352 #define vram_ptr                                          r0
4353 #define color                                             r1
4354 #define width                                             r2
4355 #define height                                            r3
4356
4357 #define pitch                                             r1
4358
4359 #define num_width                                         r12
4360
4361 #undef colors_a
4362 #undef colors_b
4363
4364 #define colors_a                                          q0
4365 #define colors_b                                          q1
4366
4367 .align 3
4368
// render_block_fill_body
//
// Fills a rectangle of 16-bit pixels with a single value.
//
// In:  r0 vram_ptr  address of the top-left pixel; every store carries a :256
//                   alignment qualifier, so it has to be 32-byte aligned
//      r1 color     halfword replicated into every pixel
//      r2 width     in pixels; each store covers 16 pixels and the row loop
//                   only ends when the remaining count reaches exactly zero,
//                   so it has to be a non-zero multiple of 16
//      r3 height    in rows; decremented before it is tested, so at least 1
// Rows are 2048 bytes apart. Returns with bx lr.
// Clobbers r0, r1, r3, r12, q0, q1 and the flags.
4369 function(render_block_fill_body)
4370   vdup.u16 colors_a, color                     // color is dead after this (r1 = pitch)
4371   mov pitch, #2048
4372
4373   vmov colors_b, colors_a
4374   sub pitch, pitch, width, lsl #1              // bytes from row end to next row start
4375
4376   mov num_width, width
4377
4378  0:
4379   vst1.u32 { colors_a, colors_b }, [ vram_ptr, :256 ]!   // 16 pixels per store
4380
4381   subs num_width, num_width, #16
4382   bne 0b
4383
4384   add vram_ptr, vram_ptr, pitch
4385   mov num_width, width
4386
4387   subs height, height, #1
4388   bne 0b                                       // same label: start the next row
4389
4390   bx lr
4391  
4392
// Register aliases for the setup_sprite_* code below. Core registers are
// reused heavily: several names map to the same register (for example r5 is
// width, block, right_block_mask and clut_ptr), so two names that share a
// register cannot hold live values at the same time.
4393 #undef x
4394 #undef y
4395 #undef width
4396 #undef height
4397 #undef fb_ptr
4398 #undef texture_mask
4399 #undef num_blocks
4400 #undef temp
4401 #undef dirty_textures_mask
4402 #undef clut_ptr
4403 #undef current_texture_mask
4404
4405 #define psx_gpu                                           r0
4406 #define x                                                 r1
4407 #define y                                                 r2
4408 #define u                                                 r3
4409 #define v                                                 r4
4410 #define width                                             r5
4411 #define height                                            r6
4412 #define offset_u                                          r8
4413 #define offset_v                                          r9
4414 #define offset_u_right                                    r10
4415 #define width_rounded                                     r11
4416 #define height_rounded                                    r12
4417
4418 #define texture_offset_base                               r1
4419 #define tile_width                                        r2
4420 #define tile_height                                       r3
4421 #define num_blocks                                        r4
4422 #define block                                             r5
4423 #define sub_tile_height                                   r6
4424 #define fb_ptr                                            r7
4425 #define texture_mask                                      r8
4426 #define column_data                                       r9
4427 #define texture_offset                                    r10
4428 #define tiles_remaining                                   r11
4429 #define fb_ptr_advance_column                             r12
4430 #define texture_block_ptr                                 r14
4431
4432 #define texture_page_ptr                                  r3
4433 #define left_block_mask                                   r4
4434 #define right_block_mask                                  r5
4435 #define texture_mask_rev                                  r10
4436 #define control_mask                                      r11
4437
4438 #define dirty_textures_mask                               r4
4439 #define clut_ptr                                          r5
4440 #define current_texture_mask                              r6
4441
4442
4443 #undef texels
4444 #undef clut_low_a
4445 #undef clut_low_b
4446 #undef clut_high_a
4447 #undef clut_high_b
4448 #undef clut_a
4449 #undef clut_b
4450 #undef texels_low
4451 #undef texels_high
4452
// NEON aliases. The quad names overlay the double names: q1 is d2/d3
// (draw_mask_fb_ptr_left/right), q2 is d4/d5 (clut_low_a/b) and q3 is d6/d7
// (clut_high_a/b).
4453 #define texels                                            d0
4454 #define draw_masks_fb_ptrs                                q1
4455
4456 #define draw_mask_fb_ptr_left                             d2
4457 #define draw_mask_fb_ptr_right                            d3
4458
4459 #define clut_low_a                                        d4
4460 #define clut_low_b                                        d5
4461 #define clut_high_a                                       d6
4462 #define clut_high_b                                       d7
4463
4464 #define block_masks                                       d8
4465 #define block_masks_shifted                               d9
4466
4467 #define clut_a                                            q2
4468 #define clut_b                                            q3
4469
4470 #define texels_low                                        d10
4471 #define texels_high                                       d11
4472
4473
// setup_sprite_flush_blocks_single
//
// Reached with blgt from setup_sprite_tile_add_blocks(single) once the
// running block count has gone past MAX_BLOCKS. Calls
// flush_render_block_buffer (defined elsewhere; presumably it renders and
// empties the queued blocks) and then restarts the queue: block points at the
// first entry of the psx_gpu block array again and num_blocks is set to the
// sub_tile_height blocks that are about to be emitted.
//
// q1-q4 (draw masks/fb pointers, CLUT tables, block masks) and r0-r3, r12,
// r14 are preserved across the call. Returns with bx lr.
//
// NOTE(review): setup_sprite_<mode> enters with a nine-register push (36
// bytes) and this helper adds 64 + 24 bytes, so sp looks to be only 4-byte
// aligned at the bl - confirm whether flush_render_block_buffer relies on
// the 8-byte stack alignment the AAPCS requires at public interfaces.
4474 setup_sprite_flush_blocks_single:
4475   vpush { q1 - q4 }
4476
4477   stmdb sp!, { r0 - r3, r12, r14 }
4478   bl flush_render_block_buffer
4479   ldmia sp!, { r0 - r3, r12, r14 }
4480
4481   vpop { q1 - q4 }
4482
4483   add block, psx_gpu, #psx_gpu_blocks_offset   // rewind to the first block
4484
4485   mov num_blocks, sub_tile_height              // count after the pending tile
4486   bx lr
4487
4488
// setup_sprite_flush_blocks_double
//
// Same as setup_sprite_flush_blocks_single, but for tiles that emit two
// blocks per row: after the call to flush_render_block_buffer the block
// pointer is rewound to the first entry of the psx_gpu block array and
// num_blocks is set to sub_tile_height * 2.
//
// q1-q4 and r0-r3, r12, r14 are preserved across the call. Returns with
// bx lr.
//
// NOTE(review): setup_sprite_<mode> enters with a nine-register push (36
// bytes) and this helper adds 64 + 24 bytes, so sp looks to be only 4-byte
// aligned at the bl - confirm whether flush_render_block_buffer relies on
// the 8-byte stack alignment the AAPCS requires at public interfaces.
4489 setup_sprite_flush_blocks_double:
4490   vpush { q1 - q4 }
4491
4492   stmdb sp!, { r0 - r3, r12, r14 }
4493   bl flush_render_block_buffer
4494   ldmia sp!, { r0 - r3, r12, r14 }
4495
4496   vpop { q1 - q4 }
4497
4498   add block, psx_gpu, #psx_gpu_blocks_offset   // rewind to the first block
4499
4500   mov num_blocks, sub_tile_height, lsl #1      // two blocks per pending row
4501   bx lr
4502
4503
// setup_sprite_update_texture_4bpp_cache
// Calls update_texture_4bpp_cache with r0-r3 unchanged and restores r0-r3
// afterwards; returns by popping the saved return address into pc.
4504 setup_sprite_update_texture_4bpp_cache:
4505   stmdb sp!, { r0 - r3, r14 }
4506   bl update_texture_4bpp_cache
4507   ldmia sp!, { r0 - r3, pc }
4508
4509
// setup_sprite_update_texture_8bpp_cache
// Calls update_texture_8bpp_cache with r0-r3 unchanged and restores r0-r3
// afterwards; returns by popping the saved return address into pc.
4510 setup_sprite_update_texture_8bpp_cache:
4511   stmdb sp!, { r0 - r3, r14 }
4512   bl update_texture_8bpp_cache
4513   ldmia sp!, { r0 - r3, pc }
4514
4515
// setup_sprite_tiled_initialize_4bpp()
// Loads the 4bpp dirty-texture mask, the CLUT pointer and the current texture
// mask from psx_gpu. 32 bytes of CLUT are read into clut_a/clut_b and split
// with vuzp.u8, which leaves the even-numbered bytes in clut_a (the
// clut_low_* lookup table) and the odd-numbered bytes in clut_b (the
// clut_high_* lookup table). If the current texture mask overlaps the dirty
// mask, setup_sprite_update_texture_4bpp_cache is called; the flags set by
// tst are still intact at the blne because vuzp does not modify them.
4516 #define setup_sprite_tiled_initialize_4bpp()                                   \
4517   ldr dirty_textures_mask,                                                     \
4518    [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ];                      \
4519   ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ];                         \
4520                                                                                \
4521   ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4522   vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ];                             \
4523                                                                                \
4524   tst current_texture_mask, dirty_textures_mask;                               \
4525   vuzp.u8 clut_a, clut_b;                                                      \
4526                                                                                \
4527   blne setup_sprite_update_texture_4bpp_cache                                  \
4528
// setup_sprite_tiled_initialize_8bpp()
// Loads the 8bpp dirty-texture mask and the current texture mask from psx_gpu
// and calls setup_sprite_update_texture_8bpp_cache when the two masks have a
// bit in common. No CLUT is loaded in this mode.
4529 #define setup_sprite_tiled_initialize_8bpp()                                   \
4530   ldr dirty_textures_mask,                                                     \
4531    [ psx_gpu, #psx_gpu_dirty_textures_8bpp_mask_offset ];                      \
4532   ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]; \
4533                                                                                \
4534   tst current_texture_mask, dirty_textures_mask;                               \
4535   blne setup_sprite_update_texture_8bpp_cache                                  \
4536
4537
// Block accounting helpers.
//
// setup_sprite_tile_setup_block_no(side, offset, texture_mode)
//   Expands to nothing.
// setup_sprite_block_count_single() / setup_sprite_block_count_double()
//   Operand text for the number of blocks a tile adds: sub_tile_height, or
//   sub_tile_height shifted left by one (two blocks per row).
// setup_sprite_tile_add_blocks(type)
//   Adds that amount to num_blocks and, when the new total is greater than
//   MAX_BLOCKS (signed compare), calls setup_sprite_flush_blocks_<type>.
4538 #define setup_sprite_tile_setup_block_no(side, offset, texture_mode)           \
4539
4540 #define setup_sprite_block_count_single()                                      \
4541   sub_tile_height                                                              \
4542
4543 #define setup_sprite_block_count_double()                                      \
4544   sub_tile_height, lsl #1                                                      \
4545
4546 #define setup_sprite_tile_add_blocks(type)                                     \
4547   add num_blocks, num_blocks, setup_sprite_block_count_##type();               \
4548   cmp num_blocks, #MAX_BLOCKS;                                                 \
4549                                                                                \
4550   blgt setup_sprite_flush_blocks_##type                                        \
4551
4552
// setup_sprite_tile_full_4bpp(edge)
//
// Emits both 8-pixel halves of a tile for sub_tile_height rows, i.e. two
// blocks per row. The edge argument is not used by this macro.
//
// Accounts for 2 * sub_tile_height blocks first (flushing if that exceeds
// MAX_BLOCKS), then loops on local label 4. Per row and per half:
//   - 8 bytes of texels are loaded from
//     texture_page_ptr + (offset & texture_mask), where offset is
//     texture_offset for the left half and texture_offset + 8 for the right;
//   - each byte is looked up in the clut_low_* and clut_high_* tables and
//     the two results are stored interleaved at block + 0 (16 bytes);
//   - draw_mask_fb_ptr_left/right, with its upper 32-bit lane set to fb_ptr,
//     is stored at block + 40;
//   - block advances by 64 bytes in total.
// The right half uses fb_ptr + 16. After each row texture_offset has grown
// by 0x10 and fb_ptr by 2048. After the loop texture_offset is advanced by a
// further 0xF00 and num_blocks is written back to psx_gpu.
// Consumes sub_tile_height (zero on exit); clobbers texture_block_ptr,
// texels, texels_low, texels_high and the flags.
4553 #define setup_sprite_tile_full_4bpp(edge)                                      \
4554   setup_sprite_tile_add_blocks(double);                                        \
4555                                                                                \
4556  4:                                                                            \
4557   and texture_block_ptr, texture_offset, texture_mask;                         \
4558   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4559                                                                                \
4560   pld [ fb_ptr ];                                                              \
4561   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4562   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4563                                                                                \
4564   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4565   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4566                                                                                \
4567   vst2.u8 { texels_low, texels_high }, [ block, :128 ];                        \
4568   add texture_block_ptr, texture_offset, #8;                                   \
4569                                                                                \
4570   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4571   add block, block, #40;                                                       \
4572                                                                                \
4573   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4574   add fb_ptr, fb_ptr, #16;                                                     \
4575                                                                                \
4576   vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ];                          \
4577   add block, block, #24;                                                       \
4578                                                                                \
4579   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4580   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4581                                                                                \
4582   pld [ fb_ptr ];                                                              \
4583   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4584   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4585                                                                                \
4586   vst2.u8 { texels_low, texels_high }, [ block, :128 ];                        \
4587   add block, block, #40;                                                       \
4588                                                                                \
4589   add texture_offset, texture_offset, #0x10;                                   \
4590   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4591                                                                                \
4592   vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ];                         \
4593   add block, block, #24;                                                       \
4594                                                                                \
4595   subs sub_tile_height, sub_tile_height, #1;                                   \
4596   bne 4b;                                                                      \
4597                                                                                \
4598   add texture_offset, texture_offset, #0xF00;                                  \
4599   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
4600
4601   
// setup_sprite_tile_half_4bpp(edge)
//
// Emits one 8-pixel half of a tile for sub_tile_height rows, i.e. one block
// per row. edge (left or right) selects which draw_mask_fb_ptr_<edge>
// register supplies the draw mask and carries fb_ptr.
//
// Accounts for sub_tile_height blocks first (flushing if that exceeds
// MAX_BLOCKS), then loops on local label 4. Per row:
//   - 8 bytes of texels are loaded from
//     texture_page_ptr + (texture_offset & texture_mask);
//   - each byte is looked up in the clut_low_* and clut_high_* tables and
//     the two results are stored interleaved at block + 0 (16 bytes);
//   - draw_mask_fb_ptr_<edge>, with its upper 32-bit lane set to fb_ptr, is
//     stored at block + 40;
//   - block advances by 64 bytes, texture_offset by 0x10, fb_ptr by 2048.
// After the loop texture_offset is advanced by a further 0xF00 and
// num_blocks is written back to psx_gpu.
// Consumes sub_tile_height (zero on exit); clobbers texture_block_ptr,
// texels, texels_low, texels_high and the flags.
//
// The loop used to add texture_page_ptr to texture_block_ptr a second time
// after the texel store. Nothing read that value before texture_block_ptr
// was recomputed at label 4, so the instruction has been dropped.
4602 #define setup_sprite_tile_half_4bpp(edge)                                      \
4603   setup_sprite_tile_add_blocks(single);                                        \
4604                                                                                \
4605  4:                                                                            \
4606   and texture_block_ptr, texture_offset, texture_mask;                         \
4607   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4608                                                                                \
4609   pld [ fb_ptr ];                                                              \
4610   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4611   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4612                                                                                \
4613   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
4614   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
4615                                                                                \
4616   vst2.u8 { texels_low, texels_high }, [ block, :128 ];                        \
4617   add block, block, #40;                                                       \
4618                                                                                \
4620   vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ];                        \
4621                                                                                \
4622   add block, block, #24;                                                       \
4623   add texture_offset, texture_offset, #0x10;                                   \
4624                                                                                \
4625   add fb_ptr, fb_ptr, #2048;                                                   \
4626   subs sub_tile_height, sub_tile_height, #1;                                   \
4627                                                                                \
4628   bne 4b;                                                                      \
4629                                                                                \
4630   add texture_offset, texture_offset, #0xF00;                                  \
4631   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
4632  
4633  
// setup_sprite_tile_full_8bpp(edge)
//
// 8bpp counterpart of the full-tile 4bpp macro: emits both 8-pixel halves of
// a tile for sub_tile_height rows, two blocks per row. The edge argument is
// not used by this macro.
//
// No CLUT lookup happens here: the 8 texel bytes loaded from
// texture_page_ptr + (offset & texture_mask) are stored unchanged at
// block + 16, and draw_mask_fb_ptr_left/right (upper 32-bit lane = fb_ptr) is
// stored at block + 40. To reach those fields with short steps, block is
// biased by +16 before the loop and the bias is removed after it; inside the
// loop block alternates +24 / +40, i.e. 64 bytes per block.
// The right half uses texture_offset + 8 and fb_ptr + 16. After each row
// texture_offset has grown by 0x10 and fb_ptr by 2048. After the loop
// texture_offset is advanced by a further 0xF00 and num_blocks is written
// back to psx_gpu.
// Consumes sub_tile_height (zero on exit); clobbers texture_block_ptr, texels
// and the flags.
4634 #define setup_sprite_tile_full_8bpp(edge)                                      \
4635   setup_sprite_tile_add_blocks(double);                                        \
4636   add block, block, #16;                                                       \
4637                                                                                \
4638  4:                                                                            \
4639   and texture_block_ptr, texture_offset, texture_mask;                         \
4640   vmov.u32 draw_mask_fb_ptr_left[1], fb_ptr;                                   \
4641                                                                                \
4642   pld [ fb_ptr ];                                                              \
4643   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4644   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4645                                                                                \
4646   add texture_block_ptr, texture_offset, #8;                                   \
4647   vst1.u32 { texels }, [ block, :64 ];                                         \
4648                                                                                \
4649   and texture_block_ptr, texture_block_ptr, texture_mask;                      \
4650   add block, block, #24;                                                       \
4651                                                                                \
4652   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4653                                                                                \
4654   add fb_ptr, fb_ptr, #16;                                                     \
4655   vst1.u32 { draw_mask_fb_ptr_left }, [ block, :64 ];                          \
4656                                                                                \
4657   add block, block, #40;                                                       \
4658   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4659   pld [ fb_ptr ];                                                              \
4660                                                                                \
4661   vmov.u32 draw_mask_fb_ptr_right[1], fb_ptr;                                  \
4662   vst1.u32 { texels }, [ block, :64 ];                                         \
4663   add block, block, #24;                                                       \
4664                                                                                \
4665   add texture_offset, texture_offset, #0x10;                                   \
4666   add fb_ptr, fb_ptr, #(2048 - 16);                                            \
4667                                                                                \
4668   vst1.u32 { draw_mask_fb_ptr_right }, [ block, :64 ];                         \
4669   add block, block, #40;                                                       \
4670                                                                                \
4671   subs sub_tile_height, sub_tile_height, #1;                                   \
4672   bne 4b;                                                                      \
4673                                                                                \
4674   sub block, block, #16;                                                       \
4675   add texture_offset, texture_offset, #0xF00;                                  \
4676   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
4677
4678   
// setup_sprite_tile_half_8bpp(edge)
//
// 8bpp counterpart of the half-tile 4bpp macro: emits one 8-pixel half of a
// tile for sub_tile_height rows, one block per row. edge (left or right)
// selects which draw_mask_fb_ptr_<edge> register supplies the draw mask and
// carries fb_ptr.
//
// Per row the 8 texel bytes loaded from
// texture_page_ptr + (texture_offset & texture_mask) are stored unchanged at
// block + 16 and draw_mask_fb_ptr_<edge> (upper 32-bit lane = fb_ptr) at
// block + 40. block is biased by +16 before the loop and the bias is removed
// after it; inside the loop it advances +24 then +40, i.e. 64 bytes per
// block. texture_offset grows by 0x10 and fb_ptr by 2048 per row. After the
// loop texture_offset is advanced by a further 0xF00 and num_blocks is
// written back to psx_gpu.
// Consumes sub_tile_height (zero on exit); clobbers texture_block_ptr, texels
// and the flags.
4679 #define setup_sprite_tile_half_8bpp(edge)                                      \
4680   setup_sprite_tile_add_blocks(single);                                        \
4681   add block, block, #16;                                                       \
4682                                                                                \
4683  4:                                                                            \
4684   and texture_block_ptr, texture_offset, texture_mask;                         \
4685   vmov.u32 draw_mask_fb_ptr_##edge[1], fb_ptr;                                 \
4686   pld [ fb_ptr ];                                                              \
4687                                                                                \
4688   add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
4689   vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
4690                                                                                \
4691   vst1.u32 { texels }, [ block, :64 ];                                         \
4692   add block, block, #24;                                                       \
4693                                                                                \
4694   vst1.u32 { draw_mask_fb_ptr_##edge }, [ block, :64 ];                        \
4695   add block, block, #40;                                                       \
4696                                                                                \
4697   add texture_offset, texture_offset, #0x10;                                   \
4698   add fb_ptr, fb_ptr, #2048;                                                   \
4699                                                                                \
4700   subs sub_tile_height, sub_tile_height, #1;                                   \
4701   bne 4b;                                                                      \
4702                                                                                \
4703   sub block, block, #16;                                                       \
4704   add texture_offset, texture_offset, #0xF00;                                  \
4705   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
4706
4707  
// Per-column adjustments applied before (pre) and after (post) a column of
// tiles is emitted. The _half(edge) forms dispatch on edge by token pasting.
//
//   pre,  half right: texture_offset = texture_offset_base + 8 and
//                     fb_ptr += 16, i.e. start at the right half of the tile
//   pre,  half left:  texture_offset = texture_offset_base
//   pre,  full:       texture_offset = texture_offset_base
//   post, half right: fb_ptr -= 16, undoing the pre adjustment
//   post, half left:  nothing
//   post, full:       nothing
4708 #define setup_sprite_tile_column_edge_pre_adjust_half_right()                  \
4709   add texture_offset, texture_offset_base, #8;                                 \
4710   add fb_ptr, fb_ptr, #16                                                      \
4711
4712 #define setup_sprite_tile_column_edge_pre_adjust_half_left()                   \
4713   mov texture_offset, texture_offset_base                                      \
4714
4715 #define setup_sprite_tile_column_edge_pre_adjust_half(edge)                    \
4716   setup_sprite_tile_column_edge_pre_adjust_half_##edge()                       \
4717
4718 #define setup_sprite_tile_column_edge_pre_adjust_full(edge)                    \
4719   mov texture_offset, texture_offset_base                                      \
4720
4721 #define setup_sprite_tile_column_edge_post_adjust_half_right()                 \
4722   sub fb_ptr, fb_ptr, #16                                                      \
4723
4724 #define setup_sprite_tile_column_edge_post_adjust_half_left()                  \
4725
4726 #define setup_sprite_tile_column_edge_post_adjust_half(edge)                   \
4727   setup_sprite_tile_column_edge_post_adjust_half_##edge()                      \
4728
4729 #define setup_sprite_tile_column_edge_post_adjust_full(edge)                   \
4730
4731
// Emit one column of tiles. edge_mode (full or half) and texture_mode (4bpp
// or 8bpp) select the tile macro; edge is forwarded to it and to the edge
// adjust macros.
//
// setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode)
//   The column is a single tile: column_data holds its row count.
//
// setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode)
//   The column spans several tiles. column_data is unpacked as
//     bits  0-7   rows in the first tile
//     bits  8-15  rows in the last tile
//     bits 16+    number of tiles that follow the first one
//   The first tile is emitted, then full 16-row tiles in the loop at label 3
//   while more than one tile remains, then the last tile at label 2.
//   Uses local labels 2 and 3; the expanded tile macros use label 4.
4732 #define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode)  \
4733   mov sub_tile_height, column_data;                                            \
4734   setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge);                  \
4735   setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
4736   setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge)                  \
4737
4738 #define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode)   \
4739   and sub_tile_height, column_data, #0xFF;                                     \
4740   mov tiles_remaining, column_data, lsr #16;                                   \
4741   setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge);                  \
4742   setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
4743                                                                                \
4744   subs tiles_remaining, tiles_remaining, #1;                                   \
4745   beq 2f;                                                                      \
4746                                                                                \
4747  3:                                                                            \
4748   mov sub_tile_height, #16;                                                    \
4749   setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
4750   subs tiles_remaining, tiles_remaining, #1;                                   \
4751   bne 3b;                                                                      \
4752                                                                                \
4753  2:                                                                            \
4754   uxtb sub_tile_height, column_data, ror #8;                                   \
4755   setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
4756   setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge)                  \
4757
4758
// Build column_data for the column height macros and load texture_page_ptr
// from psx_gpu.
//
// setup_sprite_column_data_single()
//   column_data = height.
//
// setup_sprite_column_data_multi()
//   column_data = (16 - offset_v)
//               | (((height_rounded & 0xF) + 1) << 8)
//               | ((tile_height - 1) << 16)
//   column_data shares r9 with offset_v and texture_page_ptr shares r3 with
//   tile_height, so offset_v is consumed by the rsb and tile_height is read
//   for the last time before the ldr overwrites r3. height_rounded and
//   tile_height are modified in place.
4759 #define setup_sprite_column_data_single()                                      \
4760   mov column_data, height;                                                     \
4761   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]          \
4762
4763 #define setup_sprite_column_data_multi()                                       \
4764   and height_rounded, height_rounded, #0xF;                                    \
4765   rsb column_data, offset_v, #16;                                              \
4766                                                                                \
4767   add height_rounded, height_rounded, #1;                                      \
4768   sub tile_height, tile_height, #1;                                            \
4769                                                                                \
4770   orr column_data, column_data, tile_height, lsl #16;                          \
4771   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ];         \
4772                                                                                \
4773   orr column_data, column_data, height_rounded, lsl #8                         \
4774
// setup_sprite_tile_column_width_single(texture_mode, multi_height,
//  edge_mode, edge)
//
// Defines the entry point
//   setup_sprite_<texture_mode>_single_<multi_height>_<edge_mode>_<edge>
// for sprites that are one tile column wide.
//
// The two 32-bit lanes of block_masks are ORed together (vext.32 swaps the
// lanes, vorr merges them) so that a single column honours both mask words;
// byte 0 of the result is replicated into draw_mask_fb_ptr_left and byte 1
// into draw_mask_fb_ptr_right. One column is then emitted and the routine
// returns by restoring r4-r11 and loading pc from the stack.
4775 #define setup_sprite_tile_column_width_single(texture_mode, multi_height,      \
4776  edge_mode, edge)                                                              \
4777  setup_sprite_##texture_mode##_single_##multi_height##_##edge_mode##_##edge:   \
4778   setup_sprite_column_data_##multi_height();                                   \
4779   vext.32 block_masks_shifted, block_masks, block_masks, #1;                   \
4780   vorr.u32 block_masks, block_masks, block_masks_shifted;                      \
4781   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4782   vdup.u8 draw_mask_fb_ptr_right, block_masks[1];                              \
4783                                                                                \
4784   setup_sprite_tile_column_height_##multi_height(edge_mode, edge,              \
4785    texture_mode);                                                              \
4786   ldmia sp!, { r4 - r11, pc }                                                  \
4787
// setup_sprite_tiled_advance_column()
// Steps texture_offset_base to the next tile column by adding 0x100. If bits
// 8-11 are zero after the add, 0x1000 (0x100 + 0xF00) is subtracted again.
// Modifies the flags.
4788 #define setup_sprite_tiled_advance_column()                                    \
4789   add texture_offset_base, texture_offset_base, #0x100;                        \
4790   tst texture_offset_base, #0xF00;                                             \
4791   subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00)             \
4792
// setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,
//  right_mode)
//
// Defines the entry point
//   setup_sprite_<tm>_multi_<multi_height>_<left_mode>_<right_mode>
// for sprites that are at least two tile columns wide.
//
// fb_ptr_advance_column = 32 - (height << 11): added to fb_ptr after a
// column, it moves back up by height rows of 2048 bytes and 32 bytes to the
// right.
//
// Columns are emitted in three stages:
//   first column  draw masks from bytes 0 and 1 of block_masks, tile mode
//                 left_mode with edge "right"
//   inner columns tile_width - 2 of them (skipped when that is zero), always
//                 full tiles with all draw mask bytes cleared; the vmov that
//                 clears draw_masks_fb_ptrs sits between subs and beq and
//                 leaves the flags untouched
//   last column   draw masks from bytes 4 and 5 of block_masks, tile mode
//                 right_mode with edge "left"
// texture_offset_base is advanced before every column after the first. The
// routine returns by restoring r4-r11 and loading pc from the stack.
// Uses local labels 0 and 1; a tile_width below 2 is not handled.
4793 #define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,      \
4794  right_mode)                                                                   \
4795  setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode:        \
4796   setup_sprite_column_data_##multi_height();                                   \
4797   mov fb_ptr_advance_column, #32;                                              \
4798                                                                                \
4799   sub fb_ptr_advance_column, height, lsl #11;                                  \
4800   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
4801                                                                                \
4802   vdup.u8 draw_mask_fb_ptr_right, block_masks[1];                              \
4803   setup_sprite_tile_column_height_##multi_height(left_mode, right, tm);        \
4804                                                                                \
4805   subs tile_width, tile_width, #2;                                             \
4806   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4807                                                                                \
4808   vmov.u8 draw_masks_fb_ptrs, #0;                                              \
4809   beq 1f;                                                                      \
4810                                                                                \
4811  0:                                                                            \
4812   setup_sprite_tiled_advance_column();                                         \
4813   setup_sprite_tile_column_height_##multi_height(full, none, tm);              \
4814   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
4815   subs tile_width, tile_width, #1;                                             \
4816   bne 0b;                                                                      \
4817                                                                                \
4818  1:                                                                            \
4819   vdup.u8 draw_mask_fb_ptr_left, block_masks[4];                               \
4820   vdup.u8 draw_mask_fb_ptr_right, block_masks[5];                              \
4821                                                                                \
4822   setup_sprite_tiled_advance_column();                                         \
4823   setup_sprite_tile_column_height_##multi_height(right_mode, left, tm);        \
4824   ldmia sp!, { r4 - r11, pc }                                                  \
4825
4826
// setup_sprite_tiled_builder(texture_mode)
//
// Expands to the entry point setup_sprite_<texture_mode> plus fourteen
// expansions of the setup_sprite_tile_column_width_{multi,single} macros
// (defined outside this excerpt; presumably they emit the routines named in
// the jump table at the end of the entry point - confirm against those macros).
//
// The entry point works on 16-texel tiles:
//   offset_u/offset_v     = u & 0xF, v & 0xF
//   fb_ptr                = vram_ptr + y * 2048 + (x - offset_u) * 2
//   tile_width/height     = (offset + size + 15) >> 4
//   left_block_mask       = ~(0xFFFF << offset_u)
//   right_block_mask      = 0xFFFE << ((offset_u + width + 15) & 0xF)
//   block                 = psx_gpu + psx_gpu_blocks_offset + num_blocks * 64
// It then builds a 4-bit control_mask and jumps through a table indexed by it.
//
// Arguments of the generated function:
4827 // r0: psx_gpu
4828 // r1: x
4829 // r2: y
4830 // r3: u
4831 // [ sp ]: v
4832 // [ sp + 4 ]: width
4833 // [ sp + 8 ]: height
4834 // [ sp + 12 ]: color (unused)
4835
4836 #define setup_sprite_tiled_builder(texture_mode)                               \
4837                                                                                \
4838 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, full);       \
4839 setup_sprite_tile_column_width_single(texture_mode, multi,  full, none);       \
4840 setup_sprite_tile_column_width_multi(texture_mode,  single, full, full);       \
4841 setup_sprite_tile_column_width_single(texture_mode, single, full, none);       \
4842 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, full);       \
4843 setup_sprite_tile_column_width_single(texture_mode, multi,  half, right);      \
4844 setup_sprite_tile_column_width_multi(texture_mode,  single, half, full);       \
4845 setup_sprite_tile_column_width_single(texture_mode, single, half, right);      \
4846 setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, half);       \
4847 setup_sprite_tile_column_width_single(texture_mode, multi,  half, left);       \
4848 setup_sprite_tile_column_width_multi(texture_mode,  single, full, half);       \
4849 setup_sprite_tile_column_width_single(texture_mode, single, half, left);       \
4850 setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, half);       \
4851 setup_sprite_tile_column_width_multi(texture_mode,  single, half, half);       \
4852                                                                                \
4853 .align 4;                                                                      \
4854                                                                                \
4855 function(setup_sprite_##texture_mode)                                          \
4856   stmdb sp!, { r4 - r11, r14 };                                                \
4857   setup_sprite_tiled_initialize_##texture_mode();                              \
4858                                                                                \
  /* nine words were pushed above, so the stack arguments start at sp + 36   */\
4859   ldr v, [ sp, #36 ];                                                          \
4860   and offset_u, u, #0xF;                                                       \
4861                                                                                \
4862   ldr width, [ sp, #40 ];                                                      \
4863   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ];                           \
4864                                                                                \
4865   ldr height, [ sp, #44 ];                                                     \
4866   add fb_ptr, fb_ptr, y, lsl #11;                                              \
4867                                                                                \
4868   add fb_ptr, fb_ptr, x, lsl #1;                                               \
4869   and offset_v, v, #0xF;                                                       \
4870                                                                                \
  /* step fb_ptr back to the start of the 16-texel tile containing u         */\
4871   sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
4872   add width_rounded, offset_u, width;                                          \
4873                                                                                \
4874   add height_rounded, offset_v, height;                                        \
4875   add width_rounded, width_rounded, #15;                                       \
4876                                                                                \
4877   add height_rounded, height_rounded, #15;                                     \
4878   mov tile_width, width_rounded, lsr #4;                                       \
4879                                                                                \
4880   /* texture_offset_base = VH-VL-00-00                                       */\
4881   mov texture_offset_base, v, lsl #8;                                          \
4882   and offset_u_right, width_rounded, #0xF;                                     \
4883                                                                                \
4884   /* texture_offset_base = VH-UH-UL-00                                       */\
4885   bfi texture_offset_base, u, #4, #8;                                          \
4886   movw right_block_mask, #0xFFFE;                                              \
4887                                                                                \
4888   /* texture_offset_base = VH-UH-VL-00                                       */\
4889   bfi texture_offset_base, v, #4, #4;                                          \
4890   movw left_block_mask, #0xFFFF;                                               \
4891                                                                                \
4892   mov tile_height, height_rounded, lsr #4;                                     \
4893   mvn left_block_mask, left_block_mask, lsl offset_u;                          \
4894                                                                                \
4895   /* texture_mask = HH-HL-WH-WL                                              */\
4896   ldrh texture_mask, [ psx_gpu, #psx_gpu_texture_mask_width_offset ];          \
4897   mov right_block_mask, right_block_mask, lsl offset_u_right;                  \
4898                                                                                \
4899   /* texture_mask_rev = WH-WL-HH-HL                                          */\
4900   rev16 texture_mask_rev, texture_mask;                                        \
  /* keep the full-width masks in NEON: low word left, high word right       */\
4901   vmov block_masks, left_block_mask, right_block_mask;                         \
4902                                                                                \
4903   /* texture_mask = HH-HL-HL-WL                                              */\
4904   bfi texture_mask, texture_mask_rev, #4, #4;                                  \
4905   /* texture_mask_rev = 00-00-00-WH                                          */\
4906   mov texture_mask_rev, texture_mask_rev, lsr #12;                             \
4907                                                                                \
4908   /* texture_mask = HH-WH-HL-WL                                              */\
4909   bfi texture_mask, texture_mask_rev, #8, #4;                                  \
4910   and left_block_mask, left_block_mask, #0xFF;                                 \
4911                                                                                \
  /* control_mask bits, set by the orreq instructions below:                 */\
  /*   0x1  tile_width == 1                                                  */\
  /*   0x2  tile_height == 1                                                 */\
  /*   0x4  bits 0-7 of left_block_mask are all set                          */\
  /*   0x8  bits 8-15 of right_block_mask are all set                        */\
4912   mov control_mask, #0;                                                        \
4913   cmp left_block_mask, #0xFF;                                                  \
4914                                                                                \
  /* uxtb with ror #8 extracts bits 8-15; it leaves the cmp flags intact     */\
4915   uxtb right_block_mask, right_block_mask, ror #8;                             \
4916   orreq control_mask, control_mask, #0x4;                                      \
4917                                                                                \
4918   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
4919   cmp right_block_mask, #0xFF;                                                 \
4920                                                                                \
4921   orreq control_mask, control_mask, #0x8;                                      \
4922   cmp tile_width, #1;                                                          \
4923                                                                                \
4924   add block, psx_gpu, #psx_gpu_blocks_offset;                                  \
4925   orreq control_mask, control_mask, #0x1;                                      \
4926                                                                                \
4927   cmp tile_height, #1;                                                         \
  /* block = first free 64-byte slot after the num_blocks already queued     */\
4928   add block, block, num_blocks, lsl #6;                                        \
4929                                                                                \
4930   orreq control_mask, control_mask, #0x2;                                      \
  /* table dispatch: in ARM state pc reads as this instruction + 8, which    */\
  /* is the first .word after the nop, so control_mask indexes the table     */\
4931   ldr pc, [ pc, control_mask, lsl #2 ];                                        \
4932   nop;                                                                         \
4933                                                                                \
  /* entries 0..14; entry 13 is null and there is no entry 15                */\
  /* NOTE(review): presumably those two combinations cannot occur - confirm  */\
4934  .word setup_sprite_##texture_mode##_multi_multi_full_full;                    \
4935  .word setup_sprite_##texture_mode##_single_multi_full_none;                   \
4936  .word setup_sprite_##texture_mode##_multi_single_full_full;                   \
4937  .word setup_sprite_##texture_mode##_single_single_full_none;                  \
4938  .word setup_sprite_##texture_mode##_multi_multi_half_full;                    \
4939  .word setup_sprite_##texture_mode##_single_multi_half_right;                  \
4940  .word setup_sprite_##texture_mode##_multi_single_half_full;                   \
4941  .word setup_sprite_##texture_mode##_single_single_half_right;                 \
4942  .word setup_sprite_##texture_mode##_multi_multi_full_half;                    \
4943  .word setup_sprite_##texture_mode##_single_multi_half_left;                   \
4944  .word setup_sprite_##texture_mode##_multi_single_full_half;                   \
4945  .word setup_sprite_##texture_mode##_single_single_half_left;                  \
4946  .word setup_sprite_##texture_mode##_multi_multi_half_half;                    \
4947  .word 0x00000000;                                                             \
4948  .word setup_sprite_##texture_mode##_multi_single_half_half                    \
4949
4950
// Instantiate the builder once per paletted texture mode; this defines
// setup_sprite_4bpp and setup_sprite_8bpp.
4951 setup_sprite_tiled_builder(4bpp);
4952 setup_sprite_tiled_builder(8bpp);
4953
4954
// Register aliases for texture_sprite_blocks_8bpp.
// psx_gpu and block_ptr are both r0: once block_ptr has been computed the
// psx_gpu pointer is no longer available.
// texels_01..texels_67 reuse the registers of texel_0/2/4/6 (r6-r9).
4955 #undef block_ptr
4956 #undef num_blocks
4957 #undef clut_ptr
4958
4959 #define psx_gpu                                           r0
4960 #define block_ptr                                         r0
4961 #define num_blocks                                        r1
4962 #define clut_ptr                                          r2
4963 #define texel_shift_mask                                  r3
4964 #define block_pixels_a                                    r4
4965 #define block_pixels_b                                    r5
4966 #define texel_0                                           r6
4967 #define texel_2                                           r7
4968 #define texel_4                                           r8
4969 #define texel_6                                           r9
4970 #define texel_1                                           r10
4971 #define texel_3                                           r11
4972 #define texel_5                                           r12
4973 #define texel_7                                           r14
4974 #define texels_01                                         r6
4975 #define texels_23                                         r7
4976 #define texels_45                                         r8
4977 #define texels_67                                         r9
4978
// texture_sprite_blocks_8bpp
//
// In: r0 = psx_gpu
//
// Walks the num_blocks blocks that start at psx_gpu + psx_gpu_blocks_offset
// (64 bytes apart). For each block the eight bytes at offsets 16..23 are used
// as indices into the halfword table at clut_ptr, and the eight looked-up
// 16-bit values are written to offsets 0..15 of the same block.
//
// The loop is bottom-tested, so a num_blocks of zero is not handled here.
// The first word of the following block is loaded inside the loop body, so
// the last iteration also reads offset 16 of the block after the last one
// processed.
// Uses r12 and r14 as scratch; r4 - r11 and the return address are saved.
4979 function(texture_sprite_blocks_8bpp)
4980   stmdb sp!, { r4 - r11, r14 }
// 0xFF << 1: masks out "index * 2", i.e. the byte offset of a CLUT halfword
4981   movw texel_shift_mask, #(0xFF << 1)
4982
4983   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
4984   ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ]
4985
// from here on r0 is block_ptr, not psx_gpu
4986   add block_ptr, psx_gpu, #psx_gpu_blocks_offset
4987   ldr block_pixels_a, [ block_ptr, #16 ]
4988
4989  0:
// Shifts of lsl #1, lsr #7, lsr #15 and lsr #23 place bytes 0..3 of the
// source word at bit 1, so the AND yields byte * 2 for each of them.
4990   and texel_0, texel_shift_mask, block_pixels_a, lsl #1
4991   ldr block_pixels_b, [ block_ptr, #20 ]
4992
4993   and texel_1, texel_shift_mask, block_pixels_a, lsr #7
4994   ldrh texel_0, [ clut_ptr, texel_0 ]
4995
4996   and texel_2, texel_shift_mask, block_pixels_a, lsr #15
4997   ldrh texel_1, [ clut_ptr, texel_1 ]
4998
4999   and texel_3, texel_shift_mask, block_pixels_a, lsr #23
// block_pixels_a is fully consumed: fetch the next block's first four indices
5000   ldr block_pixels_a, [ block_ptr, #(64 + 16) ]
5001
5002   ldrh texel_2, [ clut_ptr, texel_2 ]
5003   and texel_4, texel_shift_mask, block_pixels_b, lsl #1
5004
5005   ldrh texel_3, [ clut_ptr, texel_3 ]
5006   and texel_5, texel_shift_mask, block_pixels_b, lsr #7
5007
5008   ldrh texel_4, [ clut_ptr, texel_4 ]
5009   and texel_6, texel_shift_mask, block_pixels_b, lsr #15
5010
5011   ldrh texel_5, [ clut_ptr, texel_5 ]
5012   and texel_7, texel_shift_mask, block_pixels_b, lsr #23
5013
// pack pairs of 16-bit results into words: even texel low, odd texel high
5014   ldrh texel_6, [ clut_ptr, texel_6 ]
5015   orr texels_01, texel_0, texel_1, lsl #16
5016
5017   ldrh texel_7, [ clut_ptr, texel_7 ]
5018   orr texels_23, texel_2, texel_3, lsl #16
5019
5020   orr texels_45, texel_4, texel_5, lsl #16
5021   str texels_01, [ block_ptr, #0 ]
5022
5023   orr texels_67, texel_6, texel_7, lsl #16
5024   str texels_23, [ block_ptr, #4 ]
5025
// the flags set here are consumed by the bne below; str/add leave them alone
5026   subs num_blocks, num_blocks, #1
5027   str texels_45, [ block_ptr, #8 ]
5028
5029   str texels_67, [ block_ptr, #12 ]
5030   add block_ptr, block_ptr, #64
5031
5032   bne 0b
5033
5034   ldmia sp!, { r4 - r11, pc }
5035
5036
// Register aliases used by setup_sprite_16bpp and its two flush helpers
// below. Several names deliberately share one register, so a name is only
// valid while the value it was last written under is still live:
//   r1: x, texture_offset_base
//   r2: y, texture_mask, texture_mask_width
//   r3: u, texture_page_ptr, texture_mask_height
//   r4: v, num_blocks, left_mask_bits
//   r5: width, block, right_mask_bits
//   r8: left_offset, texture_offset
//   r9: width_rounded, blocks_remaining
5037 #undef width_rounded
5038 #undef texture_mask
5039 #undef num_blocks
5040 #undef texture_offset
5041
5042 #define psx_gpu                                           r0
5043 #define x                                                 r1
5044 #define y                                                 r2
5045 #define u                                                 r3
5046 #define v                                                 r4
5047 #define width                                             r5
5048 #define height                                            r6
5049 #define left_offset                                       r8
5050 #define width_rounded                                     r9
5051 #define right_width                                       r10
5052 #define block_width                                       r11
5053
5054 #define texture_offset_base                               r1
5055 #define texture_mask                                      r2
5056 #define texture_page_ptr                                  r3
5057 #define num_blocks                                        r4
5058 #define block                                             r5
5059 #define fb_ptr                                            r7
5060 #define texture_offset                                    r8
5061 #define blocks_remaining                                  r9
5062 #define fb_ptr_pitch                                      r12
5063 #define texture_block_ptr                                 r14
5064
5065 #define texture_mask_width                                r2
5066 #define texture_mask_height                               r3
5067 #define left_mask_bits                                    r4
5068 #define right_mask_bits                                   r5
5069
5070
// NEON aliases: block_masks_shifted is d1 and draw_mask_fb_ptr is d2, while
// texels is q2 (d4/d5), so texels does not overlap d0 - d2.
5071 #undef block_masks
5072 #undef block_masks_shifted
5073 #undef texels
5074
5075 #define block_masks                                       d0
5076 #define block_masks_shifted                               d1
5077 #define draw_mask_fb_ptr                                  d2
5078 #define texels                                            q2
5079
5080
// setup_sprites_16bpp_flush_single
//
// Local helper reached with "blgt" from the one-block-per-row loop of
// setup_sprite_16bpp when the block count would exceed MAX_BLOCKS.
// Calls flush_render_block_buffer (defined outside this excerpt), then
// points block back at the first slot of the block array and restarts
// num_blocks at 1 for the block the caller is about to write.
//
// Preserved across the call: d0 - d2 (block masks / draw mask), r0 - r3,
// r12 and the return address in r14.
//
// Stack alignment: setup_sprite_16bpp has nine words (36 bytes) pushed when
// it branches here and the vpush adds 24 bytes, so pushing six core
// registers left sp at 4 mod 8 for the call, where the AAPCS requires 8-byte
// alignment. r4 is added to the list purely as padding (seven words); it is
// num_blocks, which is overwritten right after the call anyway.
// This relies on setup_sprite_16bpp being the only caller.
5081 setup_sprites_16bpp_flush_single:
5082   vpush { d0 - d2 }
5083
5084   stmdb sp!, { r0 - r4, r12, r14 }
5085   bl flush_render_block_buffer
5086   ldmia sp!, { r0 - r4, r12, r14 }
5087
5088   vpop { d0 - d2 }
5089
5090   add block, psx_gpu, #psx_gpu_blocks_offset
5091   mov num_blocks, #1
5092
5093   bx lr
5094
// setup_sprites_16bpp_flush_row
//
// Local helper reached with "blgt" from the multi-block row loop of
// setup_sprite_16bpp when adding one row of blocks would exceed MAX_BLOCKS.
// Calls flush_render_block_buffer (defined outside this excerpt), then
// points block back at the first slot of the block array and restarts
// num_blocks at block_width, the size of the row the caller is about to
// write. block_width lives in r11, which the callee must preserve.
//
// Preserved across the call: d0 - d2 (block masks / draw mask), r0 - r3,
// r12 and the return address in r14.
//
// Stack alignment: setup_sprite_16bpp has nine words (36 bytes) pushed when
// it branches here and the vpush adds 24 bytes, so pushing six core
// registers left sp at 4 mod 8 for the call, where the AAPCS requires 8-byte
// alignment. r4 is added to the list purely as padding (seven words); it is
// num_blocks, which is overwritten right after the call anyway.
// This relies on setup_sprite_16bpp being the only caller.
5095 setup_sprites_16bpp_flush_row:
5096   vpush { d0 - d2 }
5097
5098   stmdb sp!, { r0 - r4, r12, r14 }
5099   bl flush_render_block_buffer
5100   ldmia sp!, { r0 - r4, r12, r14 }
5101
5102   vpop { d0 - d2 }
5103
5104   add block, psx_gpu, #psx_gpu_blocks_offset
5105   mov num_blocks, block_width
5106
5107   bx lr
5108
// setup_sprite_16bpp
//
// In: r0 = psx_gpu, r1 = x, r2 = y, r3 = u,
//     [ sp ] = v, [ sp + 4 ] = width, [ sp + 8 ] = height
//     (read at sp + 36/40/44 after the nine-register push).
//
// Queues one 64-byte block per 8 texels of every sprite row, starting at
// psx_gpu + psx_gpu_blocks_offset + num_blocks * 64. For each block it
// writes:
//   +0  .. +15  eight 16-bit texels copied from the texture page
//   +40 .. +43  draw mask byte replicated into the word
//   +44 .. +47  framebuffer pointer for the block
//
// Derived values:
//   left_offset         = u & 7
//   fb_ptr              = vram_ptr + y * 2048 + (x - left_offset) * 2
//   width_rounded       = width + 7 + left_offset
//   block_width         = width_rounded >> 3      (blocks per row)
//   right_width         = width_rounded & 7
//   left_mask_bits      = ~(0xFF << left_offset)
//   right_mask_bits     = 0xFE << right_width
//   texture_offset_base = (u * 2 + v * 2048) & ~0xF
//   texture_mask        = texture_mask_width * 2 + texture_mask_height * 2048
//   fb_ptr_pitch        = 2048 + 16 - block_width * 16
//
// Both row loops are bottom-tested on height; a zero height is not checked
// for here.
5109 function(setup_sprite_16bpp)
5110   stmdb sp!, { r4 - r11, r14 }
5111   ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
5112
5113   ldr v, [ sp, #36 ]
5114   add fb_ptr, fb_ptr, y, lsl #11
5115
5116   ldr width, [ sp, #40 ]
5117   add fb_ptr, fb_ptr, x, lsl #1
5118
5119   ldr height, [ sp, #44 ]
5120   and left_offset, u, #0x7
5121
5122   add texture_offset_base, u, u
5123   add width_rounded, width, #7
5124
// v (r4) is consumed here; r4 becomes left_mask_bits on the next line
5125   add texture_offset_base, v, lsl #11
5126   mov left_mask_bits, #0xFF
5127   
5128   ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
5129   add width_rounded, width_rounded, left_offset
5130
5131   ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
5132   sub fb_ptr, fb_ptr, left_offset, lsl #1
5133
5134   add texture_mask, texture_mask_width, texture_mask_width
5135   mov right_mask_bits, #0xFE
5136
5137   and right_width, width_rounded, #0x7
5138   mvn left_mask_bits, left_mask_bits, lsl left_offset
5139
5140   add texture_mask, texture_mask_height, lsl #11
5141   mov block_width, width_rounded, lsr #3
5142
5143   mov right_mask_bits, right_mask_bits, lsl right_width
5144   movw fb_ptr_pitch, #(2048 + 16)
5145
5146   sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4
// block_masks: low word = left mask, high word = right mask
5147   vmov block_masks, left_mask_bits, right_mask_bits
5148
// r4/r5 are free again and become num_blocks/block
5149   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5150   add block, psx_gpu, #psx_gpu_blocks_offset
5151
5152   bic texture_offset_base, texture_offset_base, #0xF
// flags from this cmp are consumed by "bne 0f" below; ldr/add keep them
5153   cmp block_width, #1
5154
5155   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
5156   add block, block, num_blocks, lsl #6
5157
5158   bne 0f
5159
// One block per row: that block is both the left and the right edge, so the
// two masks are ORed (vext.32 #1 swaps the words) and byte 0 is replicated.
5160   vext.32 block_masks_shifted, block_masks, block_masks, #1
5161   vorr.u32 block_masks, block_masks, block_masks_shifted
5162   vdup.u8 draw_mask_fb_ptr, block_masks[0]
5163
5164  1:
5165   add num_blocks, num_blocks, #1
5166   cmp num_blocks, #MAX_BLOCKS
5167   blgt setup_sprites_16bpp_flush_single
5168
// texture_mask wraps the offset inside the texture window
5169   and texture_block_ptr, texture_offset_base, texture_mask
// flags from this subs are consumed by "bne 1b" at the end of the loop
5170   subs height, height, #1
5171
5172   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5173   vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5174
5175   vst1.u32 { texels }, [ block, :128 ]
5176   add block, block, #40
5177
5178   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5179   pld [ fb_ptr ]
5180
5181   vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5182
// 40 + 24 = 64: advance to the next block; move one line down in both the
// texture (2048 bytes) and the framebuffer (2048 bytes)
5183   add block, block, #24
5184   add texture_offset_base, texture_offset_base, #2048
5185   add fb_ptr, fb_ptr, #2048
5186   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5187   bne 1b
5188
5189   ldmia sp!, { r4 - r11, pc }
5190
// Two or more blocks per row. Per row: first block with the left mask,
// block_width - 2 middle blocks with a zero mask, last block with the right
// mask.
5191  0:
5192   add num_blocks, num_blocks, block_width
5193   mov texture_offset, texture_offset_base
5194
5195   cmp num_blocks, #MAX_BLOCKS
5196   blgt setup_sprites_16bpp_flush_row
5197
5198   add texture_offset_base, texture_offset_base, #2048
5199   and texture_block_ptr, texture_offset, texture_mask
5200
5201   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5202   vld1.u32 { texels }, [ texture_block_ptr, :128 ]  
5203
5204   vst1.u32 { texels }, [ block, :128 ]
5205   add block, block, #40
5206
// first block of the row: byte 0 of block_masks is the left mask
5207   vdup.u8 draw_mask_fb_ptr, block_masks[0]
5208   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5209   pld [ fb_ptr ]
5210
5211   vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
// number of middle blocks; zero means go straight to the last block (beq 2f)
5212   subs blocks_remaining, block_width, #2
5213
5214   add texture_offset, texture_offset, #16
5215   add fb_ptr, fb_ptr, #16
5216
// middle blocks use an all-zero mask
5217   vmov.u8 draw_mask_fb_ptr, #0
5218
5219   add block, block, #24
5220   beq 2f
5221
5222  1:
5223   and texture_block_ptr, texture_offset, texture_mask
5224   subs blocks_remaining, blocks_remaining, #1
5225
5226   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5227   vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5228
5229   vst1.u32 { texels }, [ block, :128 ]
5230   add block, block, #40
5231
5232   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5233   pld [ fb_ptr ]
5234
5235   vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5236   
5237   add texture_offset, texture_offset, #16
5238   add fb_ptr, fb_ptr, #16
5239
5240   add block, block, #24
5241   bne 1b
5242
// last block of the row: byte 4 of block_masks is the low byte of the right
// mask
5243  2:
5244   and texture_block_ptr, texture_offset, texture_mask
5245   add texture_block_ptr, texture_page_ptr, texture_block_ptr
5246
5247   vld1.u32 { texels }, [ texture_block_ptr, :128 ]
5248   vdup.u8 draw_mask_fb_ptr, block_masks[4]
5249
5250   vst1.u32 { texels }, [ block, :128 ]
5251   add block, block, #40
5252
5253   vmov.u32 draw_mask_fb_ptr[1], fb_ptr
5254   vst1.u32 { draw_mask_fb_ptr }, [ block, :64 ]
5255   
5256   add block, block, #24
5257   subs height, height, #1
5258
// fb_ptr advanced 16 bytes for every block but the last, so adding
// 2048 + 16 - block_width * 16 lands on the start of the next line
5259   add fb_ptr, fb_ptr, fb_ptr_pitch
5260   strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
5261
5262   bne 0b
5263
5264   ldmia sp!, { r4 - r11, pc }
5265
5266
// Register aliases for update_texture_4bpp_cache.
// texel_block_expanded_ab shares q2 with ..._b and texel_block_expanded_cd
// shares q3 with ..._c, so the vorr instructions combine in place.
// texel_block_expanded_d is q4 (d8/d9), which is callee-saved under the
// AAPCS; the function therefore has to save and restore it.
5267 #undef texture_page_ptr
5268 #undef vram_ptr
5269 #undef dirty_textures_mask
5270 #undef current_texture_mask
5271
5272 #define psx_gpu                                           r0
5273 #define current_texture_page                              r1
5274 #define texture_page_ptr                                  r2
5275 #define vram_ptr_a                                        r3
5276 #define current_texture_page_x                            r12
5277 #define current_texture_page_y                            r4
5278 #define dirty_textures_mask                               r5
5279 #define tile_y                                            r6
5280 #define tile_x                                            r7
5281 #define sub_y                                             r8
5282 #define current_texture_mask                              r9
5283 #define c_4096                                            r10
5284 #define vram_ptr_b                                        r11
5285
5286 #define texel_block_a                                     d0
5287 #define texel_block_b                                     d1
5288 #define texel_block_expanded_a                            q1
5289 #define texel_block_expanded_b                            q2
5290 #define texel_block_expanded_ab                           q2
5291 #define texel_block_expanded_c                            q3
5292 #define texel_block_expanded_d                            q4
5293 #define texel_block_expanded_cd                           q3
5294
// update_texture_4bpp_cache
//
// In: r0 = psx_gpu
//
// Expands the current 4bpp texture page from VRAM into the buffer at
// psx_gpu_texture_page_base_offset, one byte per texel (each source byte
// becomes two bytes: low nibble first, then high nibble).
//
// Source: vram_ptr + page_y * 2^19 + page_x * 128, where page_x is the low
// nibble and page_y the remaining bits of the current texture page byte.
// VRAM lines are 2048 bytes apart. The page is walked as 16 x 16 tiles, each
// tile 8 source bytes wide and 16 lines high; vram_ptr_a/vram_ptr_b read the
// even/odd lines of a tile and both step by 4096 (two lines).
// Destination: written sequentially, 32 bytes per line pair, 256 bytes per
// tile, tiles in left-to-right, top-to-bottom order.
//
// Also clears the bits of current_texture_mask in the dirty 4bpp texture
// mask and stores the result back to psx_gpu.
//
// Preserves q0 - q4. q4 is included because it is used as scratch and
// d8/d9 are callee-saved; the previous q0 - q3 list left it clobbered.
5295 function(update_texture_4bpp_cache)
5296   stmdb sp!, { r4 - r11, r14 }
5297   vpush { q0 - q4 }
5298
5299   ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
5300
5301   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
5302   ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
5303
5304   and current_texture_page_x, current_texture_page, #0xF
5305   ldr current_texture_mask, [ psx_gpu, #psx_gpu_current_texture_mask_offset ]
5306
5307   mov current_texture_page_y, current_texture_page, lsr #4
5308   ldr dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
5309
5310   add vram_ptr_a, vram_ptr_a, current_texture_page_y, lsl #19
5311   mov tile_y, #16
5312
5313   add vram_ptr_a, vram_ptr_a, current_texture_page_x, lsl #7  
// this page is being refreshed, so it is no longer dirty
5314   bic dirty_textures_mask, current_texture_mask
5315   
5316   mov tile_x, #16
5317   str dirty_textures_mask, [ psx_gpu, #psx_gpu_dirty_textures_4bpp_mask_offset ]
5318
5319   mov sub_y, #8
5320   movw c_4096, #4096
5321
// vram_ptr_b reads the line below vram_ptr_a
5322   add vram_ptr_b, vram_ptr_a, #2048
5323
// One label serves all three nested loops (line pairs, tiles, tile rows);
// each level reloads the inner counters before branching back.
5324  0:
5325   vld1.u32 { texel_block_a }, [ vram_ptr_a, :64 ], c_4096
5326   vld1.u32 { texel_block_b }, [ vram_ptr_b, :64 ], c_4096
5327
// Widen every byte to 16 bits twice: as is, and shifted left by 4. After
// clearing bits 4-7 the first copy holds the low nibble in bits 0-3 and the
// second the high nibble in bits 8-11; ORing them gives two bytes per
// source byte.
5328   vmovl.u8 texel_block_expanded_a, texel_block_a
5329   vshll.u8 texel_block_expanded_b, texel_block_a, #4
5330   vmovl.u8 texel_block_expanded_c, texel_block_b
5331   vshll.u8 texel_block_expanded_d, texel_block_b, #4
5332
5333   vbic.u16 texel_block_expanded_a, #0x00F0
5334   vbic.u16 texel_block_expanded_b, #0x00F0
5335   vbic.u16 texel_block_expanded_c, #0x00F0
5336   vbic.u16 texel_block_expanded_d, #0x00F0
5337
5338   vorr.u16 texel_block_expanded_ab, texel_block_expanded_a,                    \
5339    texel_block_expanded_b
5340   vorr.u16 texel_block_expanded_cd, texel_block_expanded_c,                    \
5341    texel_block_expanded_d
5342
5343   vst1.u32 { texel_block_expanded_ab, texel_block_expanded_cd },               \
5344    [ texture_page_ptr, :256 ]!
5345
5346   subs sub_y, sub_y, #1
5347   bne 0b
5348
// Tile finished: the pointers moved down 8 * 4096 = 16 lines. Step 8 bytes
// right and back up to the top line of the next tile.
5349   mov sub_y, #8
5350   add vram_ptr_a, vram_ptr_a, #8
5351   add vram_ptr_b, vram_ptr_b, #8
5352
5353   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
5354   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
5355
5356   subs tile_x, tile_x, #1
5357   bne 0b
5358
// Tile row finished: move down 16 lines and back left by 16 tiles * 8 bytes.
5359   mov tile_x, #16
5360   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
5361   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
5362
5363   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
5364   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
5365
5366   subs tile_y, tile_y, #1
5367   bne 0b
5368
5369   vpop { q0 - q4 }
5370   ldmia sp!, { r4 - r11, pc }
5371
5372
// Register aliases for update_texture_8bpp_cache_slice.
// texture_page (r1) is not loaded by the function, so it is an incoming
// argument; current_texture_page (r5) is read from psx_gpu.
5373 #undef current_texture_page
5374
5375 #define psx_gpu                                           r0
5376 #define texture_page                                      r1
5377 #define texture_page_ptr                                  r2
5378 #define vram_ptr_a                                        r3
5379 #define texture_page_x                                    r12
5380 #define texture_page_y                                    r4
5381 #define current_texture_page                              r5
5382 #define tile_y                                            r6
5383 #define tile_x                                            r7
5384 #define sub_y                                             r8
5385 #define c_4096                                            r10
5386 #define vram_ptr_b                                        r11
5387
5388
5389 #undef texels_a
5390 #undef texels_b
5391
5392 #define texels_a                                          q0
5393 #define texels_b                                          q1
5394 #define texels_c                                          q2
5395 #define texels_d                                          q3
5396
5397
// update_texture_8bpp_cache_slice
//
// In: r0 = psx_gpu, r1 = texture_page
//
// Copies one 128-byte-wide, 256-line slice of VRAM into the buffer at
// psx_gpu_texture_page_base_offset, rearranged into 16 x 16 byte tiles.
//
// Source: vram_ptr + page_y * 2^19 + page_x * 128, with page_x the low
// nibble of texture_page and page_y the bits above it. VRAM lines are 2048
// bytes apart; vram_ptr_a/vram_ptr_b read alternating lines and both step
// by 4096 (two lines).
// Destination: 256 bytes per tile, 8 tiles (2048 bytes) per tile row,
// followed by a 2048-byte gap that this call does not write. When
// texture_page and the current texture page differ in bit 0 the slice
// starts 2048 bytes into the buffer, i.e. it fills the gaps instead.
// NOTE(review): presumably two calls with adjacent pages fill the whole
// buffer - confirm against the caller.
//
// Saves and restores q0 - q3 in addition to r4 - r11.
5398 function(update_texture_8bpp_cache_slice)
5399   stmdb sp!, { r4 - r11, r14 }
5400   vpush { q0 - q3 }
5401
5402   ldrb current_texture_page, [ psx_gpu, #psx_gpu_current_texture_page_offset ]
5403   ldr vram_ptr_a, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
5404
5405   ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_base_offset ]
5406   mov tile_y, #16
5407
5408   and texture_page_x, texture_page, #0xF
5409   mov texture_page_y, texture_page, lsr #4
5410
5411   add vram_ptr_a, vram_ptr_a, texture_page_x, lsl #7  
5412   mov tile_x, #8
5413
5414   add vram_ptr_a, vram_ptr_a, texture_page_y, lsl #19
5415   eor current_texture_page, current_texture_page, texture_page
5416
// flags from this ands select the addne below; the mov in between keeps them
5417   ands current_texture_page, current_texture_page, #0x1
5418   mov sub_y, #4
5419
5420   addne texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
5421   movw c_4096, #4096
5422
// vram_ptr_b reads the line below vram_ptr_a
5423   add vram_ptr_b, vram_ptr_a, #2048
5424
// One label serves all three nested loops (groups of four lines, tiles, tile
// rows); each level reloads the inner counters before branching back.
5425  0:
// four consecutive lines of 16 bytes: a, b, c, d = lines 0, 1, 2, 3
5426   vld1.u32 { texels_a }, [ vram_ptr_a, :128 ], c_4096
5427   vld1.u32 { texels_b }, [ vram_ptr_b, :128 ], c_4096
5428   vld1.u32 { texels_c }, [ vram_ptr_a, :128 ], c_4096
5429   vld1.u32 { texels_d }, [ vram_ptr_b, :128 ], c_4096
5430
5431   vst1.u32 { texels_a, texels_b }, [ texture_page_ptr, :256 ]!
5432   vst1.u32 { texels_c, texels_d }, [ texture_page_ptr, :256 ]!
5433
5434   subs sub_y, sub_y, #1
5435   bne 0b
5436
// Tile finished: the pointers moved down 4 * 2 * 4096 = 16 lines. Step 16
// bytes right and back up to the top line of the next tile.
5437   mov sub_y, #4
5438
5439   add vram_ptr_a, vram_ptr_a, #16
5440   add vram_ptr_b, vram_ptr_b, #16
5441
5442   sub vram_ptr_a, vram_ptr_a, #(16 * 2048)
5443   sub vram_ptr_b, vram_ptr_b, #(16 * 2048)
5444
5445   subs tile_x, tile_x, #1
5446   bne 0b
5447
// Tile row finished: move down 16 lines and back left by 8 tiles * 16 bytes.
5448   mov tile_x, #8
5449
5450   add vram_ptr_a, vram_ptr_a, #(16 * 2048)
5451   add vram_ptr_b, vram_ptr_b, #(16 * 2048)
5452
5453   sub vram_ptr_a, vram_ptr_a, #(8 * 16)
5454   sub vram_ptr_b, vram_ptr_b, #(8 * 16)
5455
5456   subs tile_y, tile_y, #1
// skip the 8 tiles of this tile row that this call leaves untouched
5457   add texture_page_ptr, texture_page_ptr, #(8 * 16 * 16)
5458
5459   bne 0b
5460
5461   vpop { q0 - q3 }
5462   ldmia sp!, { r4 - r11, pc }
5463