rice: optimize IsTriangleVisible
[mupen64plus-pandora.git] / source / gles2rice / src / RenderBase_neon.S
1 /*
2  * (C) Gražvydas "notaz" Ignotas, 2014
3  *
4  * This work is licensed under the terms of GNU GPL version 2 or later.
5  * See the COPYING file in the top-level directory.
6  */
7
8 #include "arm_features.h"
9 #include "RenderBase_neon.h"
10
11 .syntax unified
12 .text
13 .align 3
14
15 /*
16  * ProcessVertexData register map:
17  * (pv_neon handles two vertices per loop pass, tagged [0] and [1] below)
18  *  q | d | c code
19  * ...      
20  * 12  24   gRSPworldProject _11,_12,_13,_14
21  *     25   
22  * 13  26   gRSPworldProject _21,_22,_23,_24
23  *     27   
24  * 14  28   gRSPworldProject _31,_32,_33,_34
25  *     29   
26  * 15  30   gRSPworldProject _41,_42,_43,_44
27  *     31   
28  * sp+NN below = offset at entry; the code adds 0x64 (9 core regs + q4-q7 pushed)
29  * r4 vtx[], 16 bytes:
30  * short y, x, flag, z, tv, tu;
31  * / uint8 a, b, g, r;
32  * \ char  a, z, y, x;
33  *
34  *  outputs:
35  * r0        - XVECTOR4 *g_vtxTransformed
36  * r1        - XVECTOR4 *g_vecProjected
37  * r2        - uint32   *g_dwVtxDifColor
38  * r3        - VECTOR2  *g_fVtxTxtCoords
39  *     sp+00 - float    *g_fFogCoord (never read by this function)
40  * r6  sp+04 - uint32   *g_clipFlag2
41  *  inputs:
42  * r11 sp+08 - uint32      dwNum
43  * r10 sp+0c - int         neon_flags
44  * r4  sp+10 - FiddledVtx  vtx[], (r4 [0], r5 [1])
45  * r7  sp+14 - Light      *gRSPlights
46  *     sp+18 - float      *fRSPAmbientLightRGBA
47  *     sp+1c - XMATRIX    *gRSPworldProject
48  *     sp+20 - XMATRIX    *gRSPmodelViewTop
49  *     sp+24 - uint32      gRSPnumLights
50  *     sp+28 - float       gRSPfFogMin (never read by this function)
51  *     sp+2c - uint32      primitiveColor (low word of the 64-bit load, vtx [0])
52  *     sp+30 - uint32      primitiveColor (high word of the same load, vtx [1])
53  */
54 FUNCTION(pv_neon):
55     ldr         r12, [sp, #0x10]      @ vtx (read before sp is moved)
56     pld         [r12]
57
58     push        {r4-r11,lr}           @ 9 regs = 0x24 bytes
59     vpush       {q4-q7}               @ 0x40 bytes, stack args now at sp+0x64+NN
60
61     mov         r4, r12               @ vtx
62     ldr         r12, [sp, #0x64+0x1c]
63     vld1.32     {q12,q13}, [r12, :128]! @ load gRSPworldProject
64     vld1.32     {q14,q15}, [r12, :128] @ second half of the matrix
65     ldr         r6, [sp, #0x64+0x04]  @ g_clipFlag2
66     add         r5, r4, #16           @ vtx + 1
67     ldr         r11, [sp, #0x64+0x08] @ dwNum
68     ldr         r10, [sp, #0x64+0x0c] @ neon_flags
69
70 0:                                    @ main loop: vtx [0] via r4, vtx [1] via r5
71     vld1.16     d12, [r4]!            @ vtx[0] .z .flag .x .y (reg)
72     vmovl.s16   q6, d12
73     vld1.16     d14, [r5]!            @ vtx[1] .z .flag .x .y
74     vmovl.s16   q7, d14
75     vcvt.f32.s32 q6, q6               @ q6 = vtx_raw0
76     vcvt.f32.s32 q7, q7               @ q7 = vtx_raw1
77     vdup.32     q0, d12[1]            @ vtx_raw0.x (dup)
78     vdup.32     q1, d12[0]            @ vtx_raw0.y (dup)
79     vdup.32     q2, d13[1]            @ vtx_raw0.z (dup)
80     vdup.32     q3, d14[1]            @ vtx_raw1.x (dup)
81     vdup.32     q4, d14[0]            @ vtx_raw1.y (dup)
82     vdup.32     q5, d15[1]            @ vtx_raw1.z (dup)
83     /* note: order of operations matters greatly,
84      * may cause like 20 fraction bits to differ! */
85     vmul.f32    q0, q0, q12
86     vmul.f32    q3, q3, q12
87     vmla.f32    q0, q1, q13
88     vmla.f32    q3, q4, q13
89     vmul.f32    q2, q2, q14           @ yes, mul+add is
90     vmul.f32    q5, q5, q14           @ faster than mla
91     vadd.f32    q0, q2
92     vadd.f32    q3, q5
93     vadd.f32    q0, q15               @ q0 = g_vtxTransformed[i]
94     vadd.f32    q3, q15               @ q3 = g_vtxTransformed[i + 1]
95
96                                       vld1.16     d16[1], [r4]! @ [0].v
97     vmov        d2, d1                @ d2 = [0] .w .z
98                                       vld1.16     d16[0], [r4]! @ [0].u
99     vsri.64     d2, d7, #32           @ d2 = [0] .w, [1] .w (low lane)
100                                       vld1.16     d18[1], [r5]! @ [1].v
101 #if 1
102     vrecpe.f32  d4, d2                @ inv [0][1] .w
103                                       vld1.16     d18[0], [r5]! @ [1].u
104     vrecps.f32  d5, d2, d4            @ step
105                                       vmovl.s16   q8, d16
106     /* write g_vtxTransformed */      vst1.32     {q0}, [r0, :128]!
107                                       vmovl.s16   q9, d18
108     /* ... [1] */                     vst1.32     {q3}, [r0, :128]!
109                                       vcvt.f32.s32 d16, d16
110                                       vcvt.f32.s32 d18, d18
111     vmul.f32    d4, d5, d4            @ better inv
112                                       bic         r9, r5, #63 @ r9 = r5 rounded down to 64
113                                       pld         [r9, #64] @ prefetch the following 64 bytes
114     vrecps.f32  d5, d2, d4            @ step
115     /* wrt u,v to g_fVtxTxtCoords */  vst1.32     {d16}, [r3]!
116     /* ... [1] */                     vst1.32     {d18}, [r3]!
117                                       vmov.f32    d20, #1.0
118                                       vmov.f32    d21, #-1.0
119     vmul.f32    d4, d5, d4            @ better inv [0][1] .w
120  #if 0
121     vrecps.f32  d5, d2, d4            @ step
122     vmul.f32    d4, d5, d4            @ better inv
123  #endif
124 #else
125     mov         r12, #0x3f800000      @ 1.0f
126     vmov.f32    s6, r12
127     vdiv.f32    s8, s6, s4
128     vdiv.f32    s9, s6, s5
129  #error incomplete
130 #endif
131
132                                       mov         r8, #X_CLIP_MAX
133                                       mov         r9, #Y_CLIP_MAX
134                                       vmov        d22, r8, r9 @ d22 = Y_CLIP_MAX, X_CLIP_MAX
135     vmul.f32    q0, q0, d4[1]         @ .x .y .z .w *= [0] .w
136     vmul.f32    q1, q3, d4[0]         @ q1 = [1] scaled by inv [1] .w
137     vshr.u64    d5, d4, #32           @ [0] .w
138                                       mov         r8, #X_CLIP_MIN
139                                       mov         r9, #Y_CLIP_MIN
140                                       vmov        d23, r8, r9 @ d23 = Y_CLIP_MIN, X_CLIP_MIN
141     vsli.64     d3, d4, #32           @ insert [1] .w
142     vsli.64     d1, d5, #32           @ insert [0] inv .w
143                                       vsli.u64    d5, d4, #32 @ [0] [1] .w
144                                       vcgt.f32    d6, d0, d20 @ .xy > 1.0?
145                                       vcgt.f32    d7, d21, d0 @ -1.0 > .xy?
146                                       vcgt.f32    d4, d5, #0  @ .w > 0?
147     vst1.32     {q0,q1}, [r1]!        @ wrt g_vecProjected
148                                       vcgt.f32    d8, d2, d20 @ same tests for [1]
149                                       vcgt.f32    d9, d21, d2
150     vld1.32     d0[0], [r4]!          @ mem: [0] .azyx
151                                       vand        q3, q11 @ [0]: masks -> CLIP_MAX (d6), CLIP_MIN (d7) bits
152                                       vand        q4, q11 @ [1]: same with d8, d9
153                                       vorr        d6, d6, d7
154                                       vorr        d7, d8, d9
155     vld1.32     d0[1], [r5]!          @ mem: [1] .azyx
156                                       vpadd.u32   d6, d7 @ add x and y lanes: d6 = [1] [0] flags
157     vrev32.8    d0, d0                @ make 0xaazzyyxx [1][0]
158     vsli.u64    d1, d3, #32           @ d1 = [1] [0] .z
159     vmovl.s8    q4, d0
160                                       vand        d6, d4 @ flags kept only where .w > 0
161     vmovl.s16   q1, d8
162     vmovl.s16   q2, d9
163                                       vst1.32     {d6}, [r6]! @ g_clipFlag2
164
165     tst         r10, #PV_NEON_ENABLE_LIGHT
166     beq         pv_neon_no_light
167 @ pv_neon_light:
168     @ live NEON registers:
169     @ d1    = [1][0] .z (must preserve)
170     @ q1 = [0] .azyx, q2 = [1] .azyx (s32 lanes)
171     @ q12+  = gRSPworldProject
172     ldr         r12, [sp, #0x64+0x20] @ gRSPmodelViewTop
173     vcvt.f32.s32 q1, q1
174     vcvt.f32.s32 q2, q2
175     vld1.32     {q8,q9}, [r12, :128]! @ load gRSPmodelViewTop
176     vld1.32     {q10},   [r12, :128]  @ third 16 bytes only, the fourth is not loaded
177
178     vdup.32     q5, d4[0]             @ [1] .x (dup)
179     vdup.32     q6, d4[1]             @ [1] .y (dup)
180     vdup.32     q7, d5[0]             @ [1] .z (dup)
181     vdup.32     q2, d2[0]             @ [0] .x (dup)
182     vdup.32     q3, d2[1]             @ [0] .y (dup)
183     vdup.32     q4, d3[0]             @ [0] .z (dup)
184     vmul.f32    q2, q2, q8
185     vmul.f32    q5, q5, q8
186     vmla.f32    q2, q3, q9
187     vmla.f32    q5, q6, q9
188     vmul.f32    q4, q4, q10
189     vmul.f32    q7, q7, q10
190     vadd.f32    q4, q2                @ q4 = temp[0] .xyz0
191     vadd.f32    q5, q7                @ q5 = temp[1] .xyz0
192     vmul.f32    q2, q4, q4            @ temp .xyz0 ^2
193     vmul.f32    q3, q5, q5
194     vpadd.f32   d2, d4, d5            @ [0] pairwise sums
195     vpadd.f32   d3, d6, d7            @ [1] pairwise sums
196     movw        r8, #0x0000ffff
197     movt        r8, #0x7f7f           @ max normal float, ~3.4e+38
198     vdup.32     d4, r8
199     vpadd.f32   d2, d2, d3            @ d2 = [1][0] x^2 + y^2 + z^2
200     vcgt.f32    d5, d2, #0
201     vbif        d2, d4, d5            @ if (d2 == 0) d2 = MAXFLOAT
202
203     vrsqrte.f32 d3, d2                @ ~ 1/sqrt(d2), d2 = [1][0] .sqrsum
204     vmul.f32    d4, d3, d2
205     ldr         r9, [sp, #0x64+0x18]  @ &fRSPAmbientLightRGBA
206     ldr         r7, [sp, #0x64+0x14]  @ gRSPlights
207     ldr         r8, [sp, #0x64+0x24]  @ gRSPnumLights
208     vrsqrts.f32 d4, d3, d4            @ step
209                                       vld1.32     {q6}, [r9] @ [0] rgb starts at ambient
210                                       vld1.32     {q7}, [r9] @ [1] rgb starts at ambient
211     vmul.f32    d3, d3, d4            @ 1/sqrt(d2)
212 #if 0 /* not necessary? */
213     vmul.f32    d4, d3, d2
214     vrsqrts.f32 d4, d3, d4            @ step
215     vmul.f32    d3, d3, d4            @ 1/sqrt(d2)
216 #endif
217     vmul.f32    q2, q4, d3[0]         @ q2 = normal[0] .xyz
218     vmul.f32    q3, q5, d3[1]         @ q3 = normal[1] .xyz
219
220 1:                                    @ per-light loop, body runs before the count is tested
221     vld1.32     {q8}, [r7]            @ first 16 bytes of gRSPlights[l]
222     vmul.f32    q4, q8, q2            @ gRSPlights[l] * normal
223     vmul.f32    q5, q8, q3
224     vpadd.f32   d8, d8, d9
225     vpadd.f32   d10, d10, d11
226     vpadd.f32   d8, d8, d10           @ d8 = [1][0] fCosT
227     vcgt.f32    d9, d8, #0            @ if (!(fCosT > 0))
228     vand        d8, d9                @   fCosT = 0
229     add         r9, r7, #OFFSETOF_Light_fr
230     vld1.32     {q8}, [r9]            @ .fr .fg .fb
231     vdup.32     q5, d8[1]             @ [1] fCosT (dup)
232     vdup.32     q4, d8[0]             @
233     vmla.f32    q7, q8, q5            @ .rgb += frgb * fCosT
234     vmla.f32    q6, q8, q4
235     add         r7, #SIZEOF_Light
236     subs        r8, #1
237     bgt         1b
238
239     movt        r8, #0x437f           @ float 255, low 16 bits of r8 are left from the loop counter
240     vdup.32     q8, r8
241     vcgt.f32    q4, q6, q8            @ if (.rgb > 255)
242     vcgt.f32    q5, q7, q8
243     vbit        q6, q8, q4            @   .rgb = 255
244     vbit        q7, q8, q5
245     vcvt.u32.f32 q6, q6
246     vcvt.u32.f32 q7, q7
247     ldrb        r8, [r4, #-4]         @ .a from vtx
248     ldrb        r9, [r5, #-4]
249     vext.32     q4, q6, q6, #3        @ reg: .abgr -> .bgra
250     vext.32     q5, q7, q7, #3
251     vmov.32     d8[0], r8             @ use .a from input
252     vmov.32     d10[0], r9
253     vmovn.u32   d8, q4                @ narrow 32 -> 16 bits
254     vmovn.u32   d10, q5
255     vmovn.u16   d0, q4                @ narrow 16 -> 8 bits, color in low word
256     vmovn.u16   d2, q5
257     vsli.u64    d0, d2, #32           @ d0 = [1] [0] packed colors
258     vrev32.8    d0, d0                @ 0xbbggrraa -> 0xaarrggbb
259     b           pv_neon_fog_alpha
260
261 pv_neon_no_light:
262     tst         r10, #PV_NEON_ENABLE_SHADE
263     vldr        d0, [sp, #0x64+0x2c]  @ primitiveColor [0] [1]
264     beq         pv_neon_fog_alpha     @ flags still from tst, vldr leaves them alone
265     @ easier to do with ARM
266     ldr         r8, [r4, #-4]
267     ldr         r9, [r5, #-4]
268     ror         r8, #8                @ mem: .argb -> .rgba
269     ror         r9, #8                @ reg: 0xbbggrraa -> ..
270     vmov        d0, r8, r9
271
272 pv_neon_fog_alpha:
273     tst         r10, #PV_NEON_FOG_ALPHA
274     beq         pv_neon_next
275     vmov.f32    d20, #1.0             @ reloaded: q10 is overwritten on the light path
276     vcgt.f32    d2, d1, d20           @ [0] [1] .z > 1.0?
277     vcgt.f32    d3, d1, #0            @ > 0?
278     movw        r8, #0
279     movt        r8, #0x4f7f           @ r8 = (float)(255<<24)
280     vbit        d1, d20, d2           @ make 1.0 if needed
281     vand        d1, d3                @ make 0 if not > 0
282     vdup.32     d4, r8
283     vmul.f32    d1, d1, d4
284     vcvt.u32.f32 d1, d1
285     vmov.u32    d5, #0xff000000
286     vbit        d0, d1, d5            @ top byte of each color taken from d1
287
288 pv_neon_next:
289     subs        r11, #2               @ two vertices done
290     vst1.32     {d0}, [r2]!           @ g_dwVtxDifColor
291     add         r4, #16               @ r4, r5 each advanced 16 above, +16 = two vertices
292     add         r5, #16
293     bgt         0b
294     nop
295
296     vpop        {q4-q7}
297     pop         {r4-r11,pc}
298     .size       pv_neon, .-pv_neon
299  
300
@ multiply_subtract2(float *d, const float *m1, const float *m2, const float *s)
@
@ For the two floats at each pointer: d[i] = m1[i] * m2[i] - s[i], i = 0, 1.
@ in:   r0 = d, r1 = m1, r2 = m2, r3 = s
@ uses: d0-d2 as scratch
FUNCTION(multiply_subtract2):
    vld1.32     {d0}, [r1]            @ d0 = m1[1], m1[0]
    vld1.32     {d1}, [r2]            @ d1 = m2[1], m2[0]
    vld1.32     {d2}, [r3]            @ d2 = s[1],  s[0]
    vmul.f32    d0, d0, d1            @ product, rounded before the subtract
    vsub.f32    d0, d0, d2
    vst1.32     {d0}, [r0]            @ write both results to d[]
    bx          lr
    .size       multiply_subtract2, .-multiply_subtract2
311
312
@ tv_direction(const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2)
@
@ Each argument is read as four consecutive floats; e0 and e1 are the first
@ two of them and .w is the fourth. With
@   V1 = v2.e0 - v0.e0    V2 = v2.e1 - v0.e1
@   W1 = v2.e0 - v1.e0    W2 = v2.e1 - v1.e1
@ the function evaluates
@   (v0.w * v2.w * v1.w) * (V1 * W2 - V2 * W1)
@ in:   r0 = v0, r1 = v1, r2 = v2
@ out:  r0 = bit pattern of the single precision result
@       (the same value is left in d0[0])
@ uses: q0-q3 as scratch
@ Register comments list the high lane first, as elsewhere in this file.
FUNCTION(tv_direction):
    vld1.32     {q0}, [r0]     @ q0 = v0 (d0 = e1,e0  d1 = .w,e2)
    vld1.32     {q2}, [r2]     @ q2 = v2
    vld1.32     {q1}, [r1]     @ q1 = v1
    vsub.f32    d6, d4, d0     @ d6 = V2,V1
    vsub.f32    d7, d4, d2     @ d7 = W2,W1
    vmul.f32    d1, d5         @ d1 = v0.w * v2.w
    vrev64.32   d7, d7         @ d7 = W1,W2
    vmul.f32    d6, d7         @ d6 = V2*W1,V1*W2
    vmul.f32    d1, d3         @ d1 *= v1.w
    vshr.u64    d7, d6, #32    @ d7[0] = V2*W1, d7[1] = 0
    vsub.f32    d6, d7         @ d6[0] = V1*W2 - V2*W1
    vshr.u64    d1, d1, #32    @ d1[0] = product of the three .w
    vmul.f32    d0, d1, d6     @ d0[0] = final value
    vmov.32     r0, d0[0]
    bx          lr
    .size       tv_direction, .-tv_direction
330
331
332 @ vim:filetype=armasm:expandtab