RICE: Copy of Notaz optim to GLES1.1 version
[mupen64plus-pandora.git] / source / rice_gles / src / RenderBase_neon.S
1 /*
2  * (C) Gražvydas "notaz" Ignotas, 2014
3  *
4  * This work is licensed under the terms of GNU GPL version 2 or later.
5  * See the COPYING file in the top-level directory.
6  */
7
8 #include "arm_features.h"
9 #include "RenderBase_neon.h"
10
11 .syntax unified
12 .text
13 .align 3
14
15 /*
16  * ProcessVertexData register map:
17  *
18  *  q | d | c code
19  * ...      
20  * 12  24   gRSPworldProject _11,_12,_13,_14
21  *     25   
22  * 13  26   gRSPworldProject _21,_22,_23,_24
23  *     27   
24  * 14  28   gRSPworldProject _31,_32,_33,_34
25  *     29   
26  * 15  30   gRSPworldProject _41,_42,_43,_44
27  *     31   
28  *
29  * r4 vtx[], 16 bytes:
30  * short y, x, flag, z, tv, tu;
31  * / uint8 a, b, g, r;
32  * \ char  a, z, y, x;
33  *
34  *  outputs:
35  * r0        - XVECTOR4 *g_vtxTransformed
36  * r1        - XVECTOR4 *g_vecProjected
37  * r2        - uint32   *g_dwVtxDifColor
38  * r3        - VECTOR2  *g_fVtxTxtCoords
39  *     sp+00 - float    *g_fFogCoord
40  * r6  sp+04 - uint32   *g_clipFlag2
41  *  inputs:
42  * r11 sp+08 - uint32      dwNum
43  * r10 sp+0c - int         neon_flags
44  * r4  sp+10 - FiddledVtx  vtx[], (r4 [0], r5 [1])
45  * r7  sp+14 - Light      *gRSPlights
46  *     sp+18 - float      *fRSPAmbientLightRGBA
47  *     sp+1c - XMATRIX    *gRSPworldProject
48  *     sp+20 - XMATRIX    *gRSPmodelViewTop
49  *     sp+24 - uint32      gRSPnumLights
50  *     sp+28 - float       gRSPfFogMin
51  *     sp+2c - uint32      primitiveColor (used for vtx [0])
52  *     sp+30 - uint32      primitiveColor (used for vtx [1]; loaded together with sp+2c as one 64-bit pair)
53  */
54 FUNCTION(pv_neon):
55     ldr         r12, [sp, #0x10]
56     pld         [r12]
57
58     push        {r4-r11,lr}
59     vpush       {q4-q7}
60
61     mov         r4, r12               @ vtx
62     ldr         r12, [sp, #0x64+0x1c]
63     vld1.32     {q12,q13}, [r12, :128]! @ load gRSPworldProject
64     vld1.32     {q14,q15}, [r12, :128]
65     ldr         r6, [sp, #0x64+0x04]  @ g_clipFlag2
66     add         r5, r4, #16           @ vtx + 1
67     ldr         r11, [sp, #0x64+0x08] @ dwNum
68     ldr         r10, [sp, #0x64+0x0c] @ neon_flags
69
70 0:
71     vld1.16     d12, [r4]!            @ vtx[0] .z .flag .x .y (reg)
72     vmovl.s16   q6, d12
73     vld1.16     d14, [r5]!            @ vtx[1] .z .flag .x .y
74     vmovl.s16   q7, d14
75     vcvt.f32.s32 q6, q6               @ q6 = vtx_raw0
76     vcvt.f32.s32 q7, q7               @ q7 = vtx_raw1
77     vdup.32     q0, d12[1]            @ vtx_raw0.x (dup)
78     vdup.32     q1, d12[0]            @ vtx_raw0.y (dup)
79     vdup.32     q2, d13[1]            @ vtx_raw0.z (dup)
80     vdup.32     q3, d14[1]            @ vtx_raw1.x (dup)
81     vdup.32     q4, d14[0]            @ vtx_raw1.y (dup)
82     vdup.32     q5, d15[1]            @ vtx_raw1.z (dup)
83     /* note: order of operations matters greatly,
84      * may cause like 20 fraction bits to differ! */
85     vmul.f32    q0, q0, q12
86     vmul.f32    q3, q3, q12
87     vmla.f32    q0, q1, q13
88     vmla.f32    q3, q4, q13
89     vmul.f32    q2, q2, q14           @ yes, mul+add is
90     vmul.f32    q5, q5, q14           @ faster than mla
91     vadd.f32    q0, q2
92     vadd.f32    q3, q5
93     vadd.f32    q0, q15               @ q0 = g_vtxTransformed[i]
94     vadd.f32    q3, q15               @ q3 = g_vtxTransformed[i + 1]
95
96                                       vld1.16     d16[1], [r4]! @ [0].v
97     vmov        d2, d1
98                                       vld1.16     d16[0], [r4]! @ [0].u
99     vsri.64     d2, d7, #32
100                                       vld1.16     d18[1], [r5]! @ [0].v
101 #if 1
102     vrecpe.f32  d4, d2                @ inv [0][1] .w
103                                       vld1.16     d18[0], [r5]! @ [0].u
104     vrecps.f32  d5, d2, d4            @ step
105                                       vmovl.s16   q8, d16
106     /* g_vtxTransformed[0] */         vst1.32     {q0}, [r0, :128]!
107                                       vmovl.s16   q9, d18
108                                       vcvt.f32.s32 d16, d16
109                                       vcvt.f32.s32 d18, d18
110     vmul.f32    d4, d5, d4            @ better inv
111                                       bic         r9, r5, #63
112                                       pld         [r9, #64]
113     vrecps.f32  d5, d2, d4            @ step
114                                       cmp         r11, #1
115     /* u,v g_fVtxTxtCoords[0] */      vst1.32     {d16}, [r3]!
116                                       beq         99f
117     /* g_vtxTransformed[1] */         vst1.32     {q3}, [r0, :128]!
118     /* ... [1] */                     vst1.32     {d18}, [r3]!
119                                       99:
120                                       vmov.f32    d20, #1.0
121                                       vmov.f32    d21, #-1.0
122     vmul.f32    d4, d5, d4            @ better inv [0][1] .w
123  #if 0
124     vrecps.f32  d5, d2, d4            @ step
125     vmul.f32    d4, d5, d4            @ better inv
126  #endif
127 #else
128     mov         r12, #0x3f800000      @ 1.0f
129     vmov.f32    s6, r12
130     vdiv.f32    s8, s6, s4
131     vdiv.f32    s9, s6, s5
132  #error incomplete
133 #endif
134
135                                       mov         r8, #X_CLIP_MAX
136                                       mov         r9, #Y_CLIP_MAX
137                                       vmov        d22, r8, r9
138     vmul.f32    q0, q0, d4[1]         @ .x .y .z .w *= [0] .w
139     vmul.f32    q1, q3, d4[0]
140     vshr.u64    d5, d4, #32           @ [0] .w
141                                       mov         r8, #X_CLIP_MIN
142                                       mov         r9, #Y_CLIP_MIN
143                                       vmov        d23, r8, r9
144     vsli.64     d3, d4, #32           @ insert [1] .w
145     vsli.64     d1, d5, #32
146                                       vsli.u64    d5, d4, #32 @ [0] [1] .w
147                                       vcgt.f32    d6, d0, d20 @ .xy > 1.0?
148                                       vcgt.f32    d7, d21, d0
149                                       vcgt.f32    d4, d5, #0  @ .w > 0?
150     vst1.32     {q0}, [r1]!           @ g_vecProjected[0]
151                                       vcgt.f32    d8, d2, d20
152                                       vcgt.f32    d9, d21, d2
153     vld1.32     d0[0], [r4]!          @ mem: [0] .azyx
154                                       vand        q3, q11
155                                       vand        q4, q11
156     cmp         r11, #1
157     beq         99f
158     vst1.32     {q1}, [r1]!           @ g_vecProjected[1]
159 99:
160                                       vorr        d6, d6, d7
161                                       vorr        d7, d8, d9
162     vld1.32     d0[1], [r5]!          @ mem: [1] .azyx
163                                       vpadd.u32   d6, d7
164     vrev32.8    d0, d0                @ make 0xaazzyyxx [1][0]
165     vsli.u64    d1, d3, #32           @ d3 = [1] [0] .z
166     vmovl.s8    q4, d0
167                                       vand        d6, d4
168     vmovl.s16   q1, d8
169     vmovl.s16   q2, d9
170                                       vst1.32     {d6}, [r6]! @ g_clipFlag2
171
172     tst         r10, #PV_NEON_ENABLE_LIGHT
173     beq         pv_neon_no_light
174 @ pv_neon_light:
175     @ live NEON registers:
176     @ d1    = [1][0] .z (must preserve)
177     @ q1,q2 = azyx [1][0]
178     @ q12+  = gRSPworldProject
179     ldr         r12, [sp, #0x64+0x20]
180     vcvt.f32.s32 q1, q1
181     vcvt.f32.s32 q2, q2
182     vld1.32     {q8,q9}, [r12, :128]! @ load gRSPmodelViewTop
183     vld1.32     {q10},   [r12, :128]
184
185     vdup.32     q5, d4[0]             @ [1] .x (dup)
186     vdup.32     q6, d4[1]             @ [1] .y (dup)
187     vdup.32     q7, d5[0]             @ [1] .z (dup)
188     vdup.32     q2, d2[0]             @ [0] .x (dup)
189     vdup.32     q3, d2[1]             @ [0] .y (dup)
190     vdup.32     q4, d3[0]             @ [0] .z (dup)
191     vmul.f32    q2, q2, q8
192     vmul.f32    q5, q5, q8
193     vmla.f32    q2, q3, q9
194     vmla.f32    q5, q6, q9
195     vmul.f32    q4, q4, q10
196     vmul.f32    q7, q7, q10
197     vadd.f32    q4, q2                @ q4 = temp[0] .xyz0
198     vadd.f32    q5, q7                @ q5 = temp[1] .xyz0
199     vmul.f32    q2, q4, q4            @ temp .xyz0 ^2
200     vmul.f32    q3, q5, q5
201     vpadd.f32   d2, d4, d5
202     vpadd.f32   d3, d6, d7
203     movw        r8, #0x0000ffff
204     movt        r8, #0x7f7f           @ max normal float, ~3.4e+38
205     vdup.32     d4, r8
206     vpadd.f32   d2, d2, d3            @ d2 = [1][0] x^2 + y^2 + z^2
207     vcgt.f32    d5, d2, #0
208     vbif        d2, d4, d5            @ if (d2 == 0) d2 = MAXFLOAT
209
210     vrsqrte.f32 d3, d2                @ ~ 1/sqrt(d2), d2 = [1][0] .sqrsum
211     vmul.f32    d4, d3, d2
212     ldr         r9, [sp, #0x64+0x18]  @ &fRSPAmbientLightRGBA
213     ldr         r7, [sp, #0x64+0x14]  @ gRSPlights
214     ldr         r8, [sp, #0x64+0x24]  @ gRSPnumLights
215     vrsqrts.f32 d4, d3, d4            @ step
216                                       vld1.32     {q6}, [r9] @ rgb
217                                       vld1.32     {q7}, [r9] @ rgb
218     vmul.f32    d3, d3, d4            @ 1/sqrt(d2)
219 #if 0 /* not necessary? */
220     vmul.f32    d4, d3, d2
221     vrsqrts.f32 d4, d3, d4            @ step
222     vmul.f32    d3, d3, d4            @ 1/sqrt(d2)
223 #endif
224     vmul.f32    q2, q4, d3[0]         @ q2 = normal[0] .xyz
225     vmul.f32    q3, q5, d3[1]         @ q3 = normal[1] .xyz
226
227 1:
228     vld1.32     {q8}, [r7]
229     vmul.f32    q4, q8, q2            @ gRSPlights[l] * normal
230     vmul.f32    q5, q8, q3
231     vpadd.f32   d8, d8, d9
232     vpadd.f32   d10, d10, d11
233     vpadd.f32   d8, d8, d10           @ d8 = [1][0] fCosT
234     vcgt.f32    d9, d8, #0            @ if (!(fCosT > 0))
235     vand        d8, d9                @   fCosT = 0
236     add         r9, r7, #OFFSETOF_Light_fr
237     vld1.32     {q8}, [r9]            @ .fr .fg .fb
238     vdup.32     q5, d8[1]             @ [1] fCosT (dup)
239     vdup.32     q4, d8[0]             @
240     vmla.f32    q7, q8, q5            @ .rgb += frgb * fCosT
241     vmla.f32    q6, q8, q4
242     add         r7, #SIZEOF_Light
243     subs        r8, #1
244     bgt         1b
245
246     movt        r8, #0x437f           @ float 255
247     vdup.32     q8, r8
248     vcgt.f32    q4, q6, q8            @ if (.rgb > 255)
249     vcgt.f32    q5, q7, q8
250     vbit        q6, q8, q4            @   .rgb = 255
251     vbit        q7, q8, q5
252     vcvt.u32.f32 q6, q6
253     vcvt.u32.f32 q7, q7
254     ldrb        r8, [r4, #-4]         @ .a from vtx
255     ldrb        r9, [r5, #-4]
256     vext.32     q4, q6, q6, #3        @ reg: .abgr -> .bgra
257     vext.32     q5, q7, q7, #3
258     vmov.32     d8[0], r8             @ use .a from input
259     vmov.32     d10[0], r9
260     vmovn.u32   d8, q4
261     vmovn.u32   d10, q5
262     vmovn.u16   d0, q4
263     vmovn.u16   d2, q5
264     vsli.u64    d0, d2, #32
265     vrev32.8    d0, d0                @ 0xbbggrraa -> 0xaarrggbb
266     b           pv_neon_fog_alpha
267
268 pv_neon_no_light:
269     tst         r10, #PV_NEON_ENABLE_SHADE
270     vldr        d0, [sp, #0x64+0x2c]  @ primitiveColor [0] [1]
271     beq         pv_neon_fog_alpha
272     @ easier to do with ARM
273     ldr         r8, [r4, #-4]
274     ldr         r9, [r5, #-4]
275     ror         r8, #8                @ mem: .argb -> .rgba
276     ror         r9, #8                @ reg: 0xbbggrraa -> ..
277     vmov        d0, r8, r9
278
279 pv_neon_fog_alpha:
280     tst         r10, #PV_NEON_FOG_ALPHA
281     beq         pv_neon_next
282     vmov.f32    d20, #1.0
283     vcgt.f32    d2, d1, d20           @ [0] [1] .z > 1.0?
284     vcgt.f32    d3, d1, #0            @ > 0?
285     movw        r8, #0
286     movt        r8, #0x4f7f           @ r8 = (float)(255<<24)
287     vbit        d1, d20, d2           @ make 1.0 if needed
288     vand        d1, d3
289     vdup.32     d4, r8
290     vmul.f32    d1, d1, d4
291     vcvt.u32.f32 d1, d1
292     vmov.u32    d5, #0xff000000
293     vbit        d0, d1, d5
294
295 pv_neon_next:
296     subs        r11, #2
297     vst1.32     {d0}, [r2]!           @ g_dwVtxDifColor
298     add         r4, #16
299     add         r5, #16
300     bgt         0b
301     nop
302
303     vpop        {q4-q7}
304     pop         {r4-r11,pc}
305     .size       pv_neon, .-pv_neon
306  
307
/*
 * multiply_subtract2
 *   (float *d, const float *m1, const float *m2, const float *s)
 *
 * Works on the two consecutive floats at each pointer:
 *   d[i] = m1[i] * m2[i] - s[i]    for i = 0, 1
 *
 * In:    r0 = d, r1 = m1, r2 = m2, r3 = s
 * Clobb: d0-d2
 *
 * The product is rounded before the subtraction (separate vmul and vsub).
 */
FUNCTION(multiply_subtract2):
    vld1.32     {d0}, [r1]            @ d0 = m1[0], m1[1]
    vld1.32     {d1}, [r2]            @ d1 = m2[0], m2[1]
    vld1.32     {d2}, [r3]            @ d2 = s[0],  s[1]
    vmul.f32    d0, d0, d1            @ d0 = m1 * m2
    vsub.f32    d0, d0, d2            @ d0 = m1 * m2 - s
    vst1.32     {d0}, [r0]            @ write both results to d
    bx          lr
    .size       multiply_subtract2, .-multiply_subtract2
318
319
/*
 * tv_direction
 *   (const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2)
 *
 * With V = v2.xy - v0.xy and W = v2.xy - v1.xy, computes
 *   (V1 * W2 - V2 * W1) * (v0.w * v2.w * v1.w)
 * where V1/W1 are the x differences and V2/W2 the y differences.
 *
 * In:    r0 = v0, r1 = v1, r2 = v2; 16 bytes are read from each
 *        (assumed to be four floats in x, y, z, w order)
 * Out:   r0 = result as raw float bits; the same value is also left in
 *        the low lane of d0
 * Clobb: q0-q3
 */
FUNCTION(tv_direction):
    vld1.32     {q0}, [r0]            @ q0 = v0: d0 = x,y  d1 = z,w
    vld1.32     {q2}, [r2]            @ q2 = v2: d4 = x,y  d5 = z,w
    vld1.32     {q1}, [r1]            @ q1 = v1: d2 = x,y  d3 = z,w
    vsub.f32    d6, d4, d0            @ d6 = V2,V1
    vsub.f32    d7, d4, d2            @ d7 = W2,W1
    vmul.f32    d1, d1, d5            @ d1 = v0.zw * v2.zw
    vrev64.32   d7, d7                @ swap lanes: d7 = W1,W2
    vmul.f32    d6, d6, d7            @ d6 = V2*W1,V1*W2
    vmul.f32    d1, d1, d3            @ d1 *= v1.zw
    vshr.u64    d7, d6, #32           @ move V2*W1 down to lane 0
    vsub.f32    d6, d6, d7            @ d6[0] = V1*W2 - V2*W1
    vshr.u64    d1, d1, #32           @ move the w product down to lane 0
    vmul.f32    d0, d1, d6            @ d0[0] = w product * cross term
    vmov.32     r0, d0[0]
    bx          lr
    .size       tv_direction, .-tv_direction
337
338
339 @ vim:filetype=armasm:expandtab