RICE: Copy of Notaz optim to GLES1.1 version
[mupen64plus-pandora.git] / source / rice_gles / src / RenderBase_neon.S
CommitLineData
48d77f73 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2014
3 *
4 * This work is licensed under the terms of GNU GPL version 2 or later.
5 * See the COPYING file in the top-level directory.
6 */
7
8#include "arm_features.h"
9#include "RenderBase_neon.h"
10
11.syntax unified
12.text
13.align 3
14
/*
 * pv_neon: transform, project, clip-flag and colour a run of vertices,
 * two vertices ([0] via r4, [1] via r5) per loop iteration.
 *
 * ProcessVertexData register map:
 *
 *  q | d | c code
 * ...
 * 12  24   gRSPworldProject _11,_12,_13,_14
 *     25
 * 13  26   gRSPworldProject _21,_22,_23,_24
 *     27
 * 14  28   gRSPworldProject _31,_32,_33,_34
 *     29
 * 15  30   gRSPworldProject _41,_42,_43,_44
 *     31
 *
 * r4 vtx[], 16 bytes:
 * short y, x, flag, z, tv, tu;
 * / uint8 a, b, g, r;
 * \ char  a, z, y, x;
 *
 * outputs:
 * r0        - XVECTOR4 *g_vtxTransformed
 * r1        - XVECTOR4 *g_vecProjected
 * r2        - uint32   *g_dwVtxDifColor
 * r3        - VECTOR2  *g_fVtxTxtCoords
 *     sp+00 - float    *g_fFogCoord          (not accessed here)
 * r6  sp+04 - uint32   *g_clipFlag2
 * inputs:
 * r11 sp+08 - uint32 dwNum
 * r10 sp+0c - int neon_flags
 * r4  sp+10 - FiddledVtx vtx[], (r4 [0], r5 [1])
 * r7  sp+14 - Light *gRSPlights
 *     sp+18 - float *fRSPAmbientLightRGBA
 *     sp+1c - XMATRIX *gRSPworldProject
 *     sp+20 - XMATRIX *gRSPmodelViewTop
 *     sp+24 - uint32 gRSPnumLights
 *     sp+28 - float gRSPfFogMin              (not accessed here)
 *     sp+2c - uint32 primitiveColor
 *     sp+30 - uint32 primitiveColor
 *
 * The sp offsets above are relative to sp on entry.  After the prologue
 * (push of 9 core registers = 0x24 bytes, vpush of q4-q7 = 0x40 bytes)
 * the same slots are addressed as [sp, #0x64 + offset].
 *
 * primitiveColor occupies two consecutive slots so that one 64-bit vldr
 * fills both colour lanes of d0.
 *
 * Odd dwNum: when only one vertex remains, the [1] stores to
 * g_vtxTransformed, g_fVtxTxtCoords and g_vecProjected are skipped, but
 * vtx[i + 1] is still read and g_clipFlag2 / g_dwVtxDifColor are always
 * written as a pair.
 *
 * r8, r9 and r12 are scratch.
 */
FUNCTION(pv_neon):
    ldr         r12, [sp, #0x10]
    pld         [r12]

    push        {r4-r11,lr}
    vpush       {q4-q7}

    mov         r4, r12               @ vtx
    ldr         r12, [sp, #0x64+0x1c]
    vld1.32     {q12,q13}, [r12, :128]! @ load gRSPworldProject
    vld1.32     {q14,q15}, [r12, :128]
    ldr         r6, [sp, #0x64+0x04]  @ g_clipFlag2
    add         r5, r4, #16           @ vtx + 1
    ldr         r11, [sp, #0x64+0x08] @ dwNum
    ldr         r10, [sp, #0x64+0x0c] @ neon_flags

0:
    vld1.16     d12, [r4]!            @ vtx[0] .z .flag .x .y (reg)
    vmovl.s16   q6, d12
    vld1.16     d14, [r5]!            @ vtx[1] .z .flag .x .y
    vmovl.s16   q7, d14
    vcvt.f32.s32 q6, q6               @ q6 = vtx_raw0
    vcvt.f32.s32 q7, q7               @ q7 = vtx_raw1
    vdup.32     q0, d12[1]            @ vtx_raw0.x (dup)
    vdup.32     q1, d12[0]            @ vtx_raw0.y (dup)
    vdup.32     q2, d13[1]            @ vtx_raw0.z (dup)
    vdup.32     q3, d14[1]            @ vtx_raw1.x (dup)
    vdup.32     q4, d14[0]            @ vtx_raw1.y (dup)
    vdup.32     q5, d15[1]            @ vtx_raw1.z (dup)
    /* note: order of operations matters greatly,
     * may cause like 20 fraction bits to differ! */
    vmul.f32    q0, q0, q12
    vmul.f32    q3, q3, q12
    vmla.f32    q0, q1, q13
    vmla.f32    q3, q4, q13
    vmul.f32    q2, q2, q14           @ yes, mul+add is
    vmul.f32    q5, q5, q14           @ faster than mla
    vadd.f32    q0, q2
    vadd.f32    q3, q5
    vadd.f32    q0, q15               @ q0 = g_vtxTransformed[i]
    vadd.f32    q3, q15               @ q3 = g_vtxTransformed[i + 1]

    vld1.16     d16[1], [r4]!         @ [0].v
    vmov        d2, d1                @ d2 = [0] .z .w
    vld1.16     d16[0], [r4]!         @ [0].u
    vsri.64     d2, d7, #32           @ d2 = { [1].w, [0].w } (lane 0, lane 1)
    vld1.16     d18[1], [r5]!         @ [1].v
#if 1
    vrecpe.f32  d4, d2                @ inv [0][1] .w
    vld1.16     d18[0], [r5]!         @ [1].u
    vrecps.f32  d5, d2, d4            @ step
    vmovl.s16   q8, d16
    vst1.32     {q0}, [r0, :128]!     @ g_vtxTransformed[0]
    vmovl.s16   q9, d18
    vcvt.f32.s32 d16, d16
    vcvt.f32.s32 d18, d18
    vmul.f32    d4, d5, d4            @ better inv
    bic         r9, r5, #63
    pld         [r9, #64]             @ prefetch the next 64-byte line of vtx
    vrecps.f32  d5, d2, d4            @ step
    cmp         r11, #1               @ only one vertex left?
    vst1.32     {d16}, [r3]!          @ u,v g_fVtxTxtCoords[0]
    beq         99f                   @ yes: skip the [1] stores
    vst1.32     {q3}, [r0, :128]!     @ g_vtxTransformed[1]
    vst1.32     {d18}, [r3]!          @ u,v g_fVtxTxtCoords[1]
99:
    vmov.f32    d20, #1.0
    vmov.f32    d21, #-1.0
    vmul.f32    d4, d5, d4            @ better inv [0][1] .w
 #if 0
    vrecps.f32  d5, d2, d4            @ step
    vmul.f32    d4, d5, d4            @ better inv
 #endif
#else
    mov         r12, #0x3f800000      @ 1.0f
    vmov.f32    s6, r12
    vdiv.f32    s8, s6, s4
    vdiv.f32    s9, s6, s5
 #error incomplete
#endif

    mov         r8, #X_CLIP_MAX
    mov         r9, #Y_CLIP_MAX
    vmov        d22, r8, r9
    vmul.f32    q0, q0, d4[1]         @ [0] .x .y .z .w *= 1/[0].w
    vmul.f32    q1, q3, d4[0]         @ [1] .x .y .z .w *= 1/[1].w
    vshr.u64    d5, d4, #32           @ d5[0] = 1/[0].w
    mov         r8, #X_CLIP_MIN
    mov         r9, #Y_CLIP_MIN
    vmov        d23, r8, r9
    vsli.64     d3, d4, #32           @ projected [1] .w = 1/[1].w
    vsli.64     d1, d5, #32           @ projected [0] .w = 1/[0].w
    vsli.u64    d5, d4, #32           @ d5 = { 1/[0].w, 1/[1].w }
    vcgt.f32    d6, d0, d20           @ .xy > 1.0?
    vcgt.f32    d7, d21, d0           @ .xy < -1.0?
    vcgt.f32    d4, d5, #0            @ .w > 0?
    vst1.32     {q0}, [r1]!           @ g_vecProjected[0]
    vcgt.f32    d8, d2, d20
    vcgt.f32    d9, d21, d2
    vld1.32     d0[0], [r4]!          @ mem: [0] .azyx
    vand        q3, q11               @ turn compare masks into *_CLIP_* bits
    vand        q4, q11
    cmp         r11, #1
    beq         99f
    vst1.32     {q1}, [r1]!           @ g_vecProjected[1]
99:
    vorr        d6, d6, d7
    vorr        d7, d8, d9
    vld1.32     d0[1], [r5]!          @ mem: [1] .azyx
    vpadd.u32   d6, d7                @ d6 = { [0] flags, [1] flags }
    vrev32.8    d0, d0                @ make 0xaazzyyxx [1][0]
    vsli.u64    d1, d3, #32           @ d1 = [1] [0] projected .z
    vmovl.s8    q4, d0
    vand        d6, d4                @ keep flags only where .w > 0
    vmovl.s16   q1, d8
    vmovl.s16   q2, d9
    vst1.32     {d6}, [r6]!           @ g_clipFlag2

    tst         r10, #PV_NEON_ENABLE_LIGHT
    beq         pv_neon_no_light
@ pv_neon_light:
    @ live NEON registers:
    @ d1    = [1][0] .z (must preserve)
    @ q1,q2 = azyx [1][0]
    @ q12+  = gRSPworldProject
    ldr         r12, [sp, #0x64+0x20]
    vcvt.f32.s32 q1, q1
    vcvt.f32.s32 q2, q2
    vld1.32     {q8,q9}, [r12, :128]! @ load gRSPmodelViewTop
    vld1.32     {q10}, [r12, :128]

    vdup.32     q5, d4[0]             @ [1] .x (dup)
    vdup.32     q6, d4[1]             @ [1] .y (dup)
    vdup.32     q7, d5[0]             @ [1] .z (dup)
    vdup.32     q2, d2[0]             @ [0] .x (dup)
    vdup.32     q3, d2[1]             @ [0] .y (dup)
    vdup.32     q4, d3[0]             @ [0] .z (dup)
    vmul.f32    q2, q2, q8
    vmul.f32    q5, q5, q8
    vmla.f32    q2, q3, q9
    vmla.f32    q5, q6, q9
    vmul.f32    q4, q4, q10
    vmul.f32    q7, q7, q10
    vadd.f32    q4, q2                @ q4 = temp[0] .xyz0
    vadd.f32    q5, q7                @ q5 = temp[1] .xyz0
    vmul.f32    q2, q4, q4            @ temp .xyz0 ^2
    vmul.f32    q3, q5, q5
    vpadd.f32   d2, d4, d5
    vpadd.f32   d3, d6, d7
    movw        r8, #0x0000ffff
    movt        r8, #0x7f7f           @ max normal float, ~3.4e+38
    vdup.32     d4, r8
    vpadd.f32   d2, d2, d3            @ d2 = [1][0] x^2 + y^2 + z^2
    vcgt.f32    d5, d2, #0
    vbif        d2, d4, d5            @ if (!(d2 > 0)) d2 = MAXFLOAT

    vrsqrte.f32 d3, d2                @ ~ 1/sqrt(d2), d2 = [1][0] .sqrsum
    vmul.f32    d4, d3, d2
    ldr         r9, [sp, #0x64+0x18]  @ &fRSPAmbientLightRGBA
    ldr         r7, [sp, #0x64+0x14]  @ gRSPlights
    ldr         r8, [sp, #0x64+0x24]  @ gRSPnumLights
    vrsqrts.f32 d4, d3, d4            @ step
    vld1.32     {q6}, [r9]            @ [0] rgb = ambient
    vld1.32     {q7}, [r9]            @ [1] rgb = ambient
    vmul.f32    d3, d3, d4            @ 1/sqrt(d2)
#if 0 /* not necessary? */
    vmul.f32    d4, d3, d2
    vrsqrts.f32 d4, d3, d4            @ step
    vmul.f32    d3, d3, d4            @ 1/sqrt(d2)
#endif
    vmul.f32    q2, q4, d3[0]         @ q2 = normal[0] .xyz
    vmul.f32    q3, q5, d3[1]         @ q3 = normal[1] .xyz

    @ The loop below is a do-while; without this guard a light count of
    @ zero would still read and accumulate gRSPlights[0].
    cmp         r8, #0
    ble         2f                    @ no lights: ambient only
1:
    vld1.32     {q8}, [r7]
    vmul.f32    q4, q8, q2            @ gRSPlights[l] * normal
    vmul.f32    q5, q8, q3
    vpadd.f32   d8, d8, d9
    vpadd.f32   d10, d10, d11
    vpadd.f32   d8, d8, d10           @ d8 = [1][0] fCosT
    vcgt.f32    d9, d8, #0            @ if (!(fCosT > 0))
    vand        d8, d9                @   fCosT = 0
    add         r9, r7, #OFFSETOF_Light_fr
    vld1.32     {q8}, [r9]            @ .fr .fg .fb
    vdup.32     q5, d8[1]             @ [1] fCosT (dup)
    vdup.32     q4, d8[0]             @ [0] fCosT (dup)
    vmla.f32    q7, q8, q5            @ .rgb += frgb * fCosT
    vmla.f32    q6, q8, q4
    add         r7, #SIZEOF_Light
    subs        r8, #1
    bgt         1b
2:
    @ build 255.0f explicitly rather than relying on the loop counter
    @ having left the low halfword of r8 at zero
    movw        r8, #0
    movt        r8, #0x437f           @ float 255
    vdup.32     q8, r8
    vcgt.f32    q4, q6, q8            @ if (.rgb > 255)
    vcgt.f32    q5, q7, q8
    vbit        q6, q8, q4            @   .rgb = 255
    vbit        q7, q8, q5
    vcvt.u32.f32 q6, q6
    vcvt.u32.f32 q7, q7
    ldrb        r8, [r4, #-4]         @ .a from vtx
    ldrb        r9, [r5, #-4]
    vext.32     q4, q6, q6, #3        @ reg: .abgr -> .bgra
    vext.32     q5, q7, q7, #3
    vmov.32     d8[0], r8             @ use .a from input
    vmov.32     d10[0], r9
    vmovn.u32   d8, q4
    vmovn.u32   d10, q5
    vmovn.u16   d0, q4
    vmovn.u16   d2, q5
    vsli.u64    d0, d2, #32           @ d0 = { [0] colour, [1] colour }
    vrev32.8    d0, d0                @ 0xbbggrraa -> 0xaarrggbb
    b           pv_neon_fog_alpha

pv_neon_no_light:
    tst         r10, #PV_NEON_ENABLE_SHADE
    vldr        d0, [sp, #0x64+0x2c]  @ primitiveColor [0] [1] (flags untouched)
    beq         pv_neon_fog_alpha
    @ easier to do with ARM
    ldr         r8, [r4, #-4]
    ldr         r9, [r5, #-4]
    ror         r8, #8                @ mem: .argb -> .rgba
    ror         r9, #8                @ reg: 0xbbggrraa -> ..
    vmov        d0, r8, r9

pv_neon_fog_alpha:
    tst         r10, #PV_NEON_FOG_ALPHA
    beq         pv_neon_next
    vmov.f32    d20, #1.0
    vcgt.f32    d2, d1, d20           @ [0] [1] .z > 1.0?
    vcgt.f32    d3, d1, #0            @ > 0?
    movw        r8, #0
    movt        r8, #0x4f7f           @ r8 = (float)(255<<24)
    vbit        d1, d20, d2           @ make 1.0 if needed
    vand        d1, d3                @ make 0 if not > 0
    vdup.32     d4, r8
    vmul.f32    d1, d1, d4
    vcvt.u32.f32 d1, d1
    vmov.u32    d5, #0xff000000
    vbit        d0, d1, d5            @ replace the alpha byte only

pv_neon_next:
    subs        r11, #2
    vst1.32     {d0}, [r2]!           @ g_dwVtxDifColor
    add         r4, #16               @ r4/r5 already moved 16 bytes each;
    add         r5, #16               @ skip the other vertex of the pair
    bgt         0b
    nop

    vpop        {q4-q7}
    pop         {r4-r11,pc}
    .size       pv_neon, .-pv_neon
306
307
@ void multiply_subtract2(float *d, const float *m1, const float *m2,
@                         const float *s)
@
@ Two-lane helper:  d[i] = m1[i] * m2[i] - s[i]   for i = 0, 1
@ The product is rounded before the subtraction (separate vmul / vsub).
@   r0 = d, r1 = m1, r2 = m2, r3 = s
FUNCTION(multiply_subtract2):
    vld1.32     {d0}, [r1]            @ d0 = m1[0], m1[1]
    vld1.32     {d1}, [r2]            @ d1 = m2[0], m2[1]
    vld1.32     {d2}, [r3]            @ d2 = s[0],  s[1]
    vmul.f32    d0, d0, d1            @ d0 = m1 * m2
    vsub.f32    d0, d0, d2            @ d0 = m1 * m2 - s
    vst1.32     {d0}, [r0]            @ d[0], d[1] = result
    bx          lr
    .size       multiply_subtract2, .-multiply_subtract2
318
319
@ (const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2)
@
@ With  V1 = v2.x - v0.x,  V2 = v2.y - v0.y,
@       W1 = v2.x - v1.x,  W2 = v2.y - v1.y
@ computes  (V1*W2 - V2*W1) * (v0.w * v2.w * v1.w).
@
@ in:  r0 = v0, r1 = v1, r2 = v2 (four floats each: x, y, z, w)
@ out: the float result's bit pattern in r0; the same value is also left
@      in lane 0 of d0.
@ Lane pairs in the comments below are written high lane first.
FUNCTION(tv_direction):
    vld1.32     {q0}, [r0]            @ d0 = v0 .y .x   d1 = v0 .w .z
    vld1.32     {q2}, [r2]            @ d4 = v2 .y .x   d5 = v2 .w .z
    vld1.32     {q1}, [r1]            @ d2 = v1 .y .x   d3 = v1 .w .z
    vsub.f32    d6, d4, d0            @ d6 = V2,V1
    vsub.f32    d7, d4, d2            @ d7 = W2,W1
    vmul.f32    d1, d5                @ d1 = v0.w * v2.w
    vrev64.32   d7, d7                @ d7 = W1,W2
    vmul.f32    d6, d7                @ d6 = V2*W1,V1*W2
    vmul.f32    d1, d3                @ d1 *= v1.w
    vshr.u64    d7, d6, #32           @ d7[0] = V2*W1, d7[1] = 0
    vsub.f32    d6, d7                @ d6[0] = V1*W2 - V2*W1
    vshr.u64    d1, d1, #32           @ d1[0] = v0.w * v2.w * v1.w
    vmul.f32    d0, d1, d6
    vmov.32     r0, d0[0]
    bx          lr
    .size       tv_direction, .-tv_direction
337
338
339@ vim:filetype=armasm:expandtab