rice: optimize IsTriangleVisible
[mupen64plus-pandora.git] / source / gles2rice / src / RenderBase_neon.S
CommitLineData
5c6423ae 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2014
3 *
4 * This work is licensed under the terms of GNU GPL version 2 or later.
5 * See the COPYING file in the top-level directory.
6 */
7
8#include "arm_features.h"
61b9f2df 9#include "RenderBase_neon.h"
5c6423ae 10
11.syntax unified
12.text
13.align 3
14
5c6423ae 15/*
16 * ProcessVertexData register map:
17 *
18 * q | d | c code
5c6423ae 19 * ...
61b9f2df 20 * 12 24 gRSPworldProject _11,_12,_13,_14
5c6423ae 21 * 25
61b9f2df 22 * 13 26 gRSPworldProject _21,_22,_23,_24
5c6423ae 23 * 27
61b9f2df 24 * 14 28 gRSPworldProject _31,_32,_33,_34
5c6423ae 25 * 29
61b9f2df 26 * 15 30 gRSPworldProject _41,_42,_43,_44
5c6423ae 27 * 31
28 *
29 * r4 vtx[], 16 bytes:
30 * short y, x, flag, z, tv, tu;
31 * / uint8 a, b, g, r;
32 * \ char a, z, y, x;
33 *
34 * outputs:
35 * r0 - XVECTOR4 *g_vtxTransformed
36 * r1 - XVECTOR4 *g_vecProjected
37 * r2 - uint32 *g_dwVtxDifColor
38 * r3 - VECTOR2 *g_fVtxTxtCoords
39 * sp+00 - float *g_fFogCoord
61b9f2df 40 * r6 sp+04 - uint32 *g_clipFlag2
5c6423ae 41 * inputs:
42 * r11 sp+08 - uint32 dwNum
61b9f2df 43 * r10 sp+0c - int neon_flags
44 * r4 sp+10 - FiddledVtx vtx[], (r4 [0], r5 [1])
45 * r7 sp+14 - Light *gRSPlights
5c6423ae 46 * sp+18 - float *fRSPAmbientLightRGBA
61b9f2df 47 * sp+1c - XMATRIX *gRSPworldProject
5c6423ae 48 * sp+20 - XMATRIX *gRSPmodelViewTop
49 * sp+24 - uint32 gRSPnumLights
50 * sp+28 - float gRSPfFogMin
61b9f2df 51 * sp+2c - uint32 primitiveColor (loaded as the color for vertex [0])
52 * sp+30 - uint32 primitiveColor (loaded as the color for vertex [1])
5c6423ae 53 */
61b9f2df 54FUNCTION(pv_neon):
@ Transforms vertices two per loop iteration: each raw position is multiplied
@ by gRSPworldProject, scaled by a refined reciprocal estimate of w, and the
@ results are written to g_vtxTransformed, g_fVtxTxtCoords, g_vecProjected,
@ g_clipFlag2 and g_dwVtxDifColor. neon_flags (r10) selects the lighting,
@ shade and fog-alpha paths.
@ The first load below runs before the prologue, so it uses the raw entry sp.
@ After push {r4-r11,lr} (36 bytes) and vpush {q4-q7} (64 bytes) every stack
@ argument is addressed as [sp, #0x64 + offset].
5c6423ae 55 ldr r12, [sp, #0x10]
56 pld [r12]
57
58 push {r4-r11,lr}
59 vpush {q4-q7}
60
61 mov r4, r12 @ vtx
62 ldr r12, [sp, #0x64+0x1c]
5c6423ae 63 vld1.32 {q12,q13}, [r12, :128]! @ load gRSPworldProject
64 vld1.32 {q14,q15}, [r12, :128]
61b9f2df 65 ldr r6, [sp, #0x64+0x04] @ g_clipFlag2
66 add r5, r4, #16 @ vtx + 1
67 ldr r11, [sp, #0x64+0x08] @ dwNum
68 ldr r10, [sp, #0x64+0x0c] @ neon_flags
5c6423ae 69
@ main loop: r4 walks the even vertex ([0]), r5 the odd vertex ([1])
700:
61b9f2df 71 vld1.16 d12, [r4]! @ vtx[0] .z .flag .x .y (reg)
72 vmovl.s16 q6, d12
73 vld1.16 d14, [r5]! @ vtx[1] .z .flag .x .y
74 vmovl.s16 q7, d14
75 vcvt.f32.s32 q6, q6 @ q6 = vtx_raw0
76 vcvt.f32.s32 q7, q7 @ q7 = vtx_raw1
77 vdup.32 q0, d12[1] @ vtx_raw0.x (dup)
78 vdup.32 q1, d12[0] @ vtx_raw0.y (dup)
79 vdup.32 q2, d13[1] @ vtx_raw0.z (dup)
80 vdup.32 q3, d14[1] @ vtx_raw1.x (dup)
81 vdup.32 q4, d14[0] @ vtx_raw1.y (dup)
82 vdup.32 q5, d15[1] @ vtx_raw1.z (dup)
83 /* note: order of operations matters greatly,
84 * may cause like 20 fraction bits to differ! */
85 vmul.f32 q0, q0, q12
86 vmul.f32 q3, q3, q12
87 vmla.f32 q0, q1, q13
88 vmla.f32 q3, q4, q13
89 vmul.f32 q2, q2, q14 @ yes, mul+add is
90 vmul.f32 q5, q5, q14 @ faster than mla
91 vadd.f32 q0, q2
92 vadd.f32 q3, q5
93 vadd.f32 q0, q15 @ q0 = g_vtxTransformed[i]
94 vadd.f32 q3, q15 @ q3 = g_vtxTransformed[i + 1]
5c6423ae 95
@ texture coordinate loads are interleaved with building d2 = { [1].w, [0].w }
61b9f2df 96 vld1.16 d16[1], [r4]! @ [0].v
97 vmov d2, d1
98 vld1.16 d16[0], [r4]! @ [0].u
99 vsri.64 d2, d7, #32
100 vld1.16 d18[1], [r5]! @ [1].v
5c6423ae 101#if 1
@ reciprocal of both w values: one estimate plus two refinement steps
61b9f2df 102 vrecpe.f32 d4, d2 @ inv [0][1] .w
103 vld1.16 d18[0], [r5]! @ [1].u
104 vrecps.f32 d5, d2, d4 @ step
105 vmovl.s16 q8, d16
106 /* write g_vtxTransformed */ vst1.32 {q0}, [r0, :128]!
107 vmovl.s16 q9, d18
108 /* ... [1] */ vst1.32 {q3}, [r0, :128]!
109 vcvt.f32.s32 d16, d16
110 vcvt.f32.s32 d18, d18
111 vmul.f32 d4, d5, d4 @ better inv
112 bic r9, r5, #63
113 pld [r9, #64]
114 vrecps.f32 d5, d2, d4 @ step
115 /* wrt u,v to g_fVtxTxtCoords */ vst1.32 {d16}, [r3]!
116 /* ... [1] */ vst1.32 {d18}, [r3]!
117 vmov.f32 d20, #1.0
118 vmov.f32 d21, #-1.0
119 vmul.f32 d4, d5, d4 @ better inv [0][1] .w
5c6423ae 120 #if 0
61b9f2df 121 vrecps.f32 d5, d2, d4 @ step
122 vmul.f32 d4, d5, d4 @ better inv
5c6423ae 123 #endif
124#else
61b9f2df 125 mov r12, #0x3f800000 @ 1.0f
126 vmov.f32 s6, r12
127 vdiv.f32 s8, s6, s4
128 vdiv.f32 s9, s6, s5
129 #error incomplete
5c6423ae 130#endif
131
@ q11 = { X_CLIP_MAX, Y_CLIP_MAX, X_CLIP_MIN, Y_CLIP_MIN }, used below to
@ turn the all-ones compare results into clip flag bits
61b9f2df 132 mov r8, #X_CLIP_MAX
133 mov r9, #Y_CLIP_MAX
134 vmov d22, r8, r9
135 vmul.f32 q0, q0, d4[1] @ .x .y .z .w *= [0] .w
136 vmul.f32 q1, q3, d4[0]
137 vshr.u64 d5, d4, #32 @ [0] .w
138 mov r8, #X_CLIP_MIN
139 mov r9, #Y_CLIP_MIN
140 vmov d23, r8, r9
141 vsli.64 d3, d4, #32 @ insert [1] .w
142 vsli.64 d1, d5, #32
143 vsli.u64 d5, d4, #32 @ [0] [1] .w
144 vcgt.f32 d6, d0, d20 @ .xy > 1.0?
145 vcgt.f32 d7, d21, d0
146 vcgt.f32 d4, d5, #0 @ .w > 0?
147 vst1.32 {q0,q1}, [r1]! @ wrt g_vecProjected
148 vcgt.f32 d8, d2, d20
149 vcgt.f32 d9, d21, d2
150 vld1.32 d0[0], [r4]! @ mem: [0] .azyx
151 vand q3, q11
152 vand q4, q11
153 vorr d6, d6, d7
154 vorr d7, d8, d9
155 vld1.32 d0[1], [r5]! @ mem: [1] .azyx
156 vpadd.u32 d6, d7
157 vrev32.8 d0, d0 @ make 0xaazzyyxx [1][0]
158 vsli.u64 d1, d3, #32 @ d1 = [1] [0] .z
159 vmovl.s8 q4, d0
160 vand d6, d4
161 vmovl.s16 q1, d8
162 vmovl.s16 q2, d9
163 vst1.32 {d6}, [r6]! @ g_clipFlag2
164
165 tst r10, #PV_NEON_ENABLE_LIGHT
166 beq pv_neon_no_light
167@ pv_neon_light:
168 @ live NEON registers:
169 @ d1 = [1][0] .z (must preserve)
170 @ q1,q2 = azyx [1][0]
171 @ q12+ = gRSPworldProject
172 ldr r12, [sp, #0x64+0x20]
173 vcvt.f32.s32 q1, q1
174 vcvt.f32.s32 q2, q2
175 vld1.32 {q8,q9}, [r12, :128]! @ load gRSPmodelViewTop
176 vld1.32 {q10}, [r12, :128]
177
178 vdup.32 q5, d4[0] @ [1] .x (dup)
179 vdup.32 q6, d4[1] @ [1] .y (dup)
180 vdup.32 q7, d5[0] @ [1] .z (dup)
181 vdup.32 q2, d2[0] @ [0] .x (dup)
182 vdup.32 q3, d2[1] @ [0] .y (dup)
183 vdup.32 q4, d3[0] @ [0] .z (dup)
184 vmul.f32 q2, q2, q8
185 vmul.f32 q5, q5, q8
186 vmla.f32 q2, q3, q9
187 vmla.f32 q5, q6, q9
188 vmul.f32 q4, q4, q10
189 vmul.f32 q7, q7, q10
190 vadd.f32 q4, q2 @ q4 = temp[0] .xyz0
191 vadd.f32 q5, q7 @ q5 = temp[1] .xyz0
192 vmul.f32 q2, q4, q4 @ temp .xyz0 ^2
193 vmul.f32 q3, q5, q5
194 vpadd.f32 d2, d4, d5
195 vpadd.f32 d3, d6, d7
196 movw r8, #0x0000ffff
197 movt r8, #0x7f7f @ max normal float, ~3.4e+38
198 vdup.32 d4, r8
199 vpadd.f32 d2, d2, d3 @ d2 = [1][0] x^2 + y^2 + z^2
200 vcgt.f32 d5, d2, #0
201 vbif d2, d4, d5 @ if (d2 == 0) d2 = MAXFLOAT
202
@ reciprocal square root: one estimate plus one refinement step
203 vrsqrte.f32 d3, d2 @ ~ 1/sqrt(d2), d2 = [1][0] .sqrsum
204 vmul.f32 d4, d3, d2
205 ldr r9, [sp, #0x64+0x18] @ &fRSPAmbientLightRGBA
206 ldr r7, [sp, #0x64+0x14] @ gRSPlights
207 ldr r8, [sp, #0x64+0x24] @ gRSPnumLights
208 vrsqrts.f32 d4, d3, d4 @ step
209 vld1.32 {q6}, [r9] @ rgb
210 vld1.32 {q7}, [r9] @ rgb
211 vmul.f32 d3, d3, d4 @ 1/sqrt(d2)
212#if 0 /* not necessary? */
213 vmul.f32 d4, d3, d2
214 vrsqrts.f32 d4, d3, d4 @ step
215 vmul.f32 d3, d3, d4 @ 1/sqrt(d2)
216#endif
217 vmul.f32 q2, q4, d3[0] @ q2 = normal[0] .xyz
218 vmul.f32 q3, q5, d3[1] @ q3 = normal[1] .xyz
219
@ per-light accumulation: q6 collects vertex [0], q7 vertex [1], both seeded
@ with the ambient color; r8 counts lights down and is tested after the body
2201:
221 vld1.32 {q8}, [r7]
222 vmul.f32 q4, q8, q2 @ gRSPlights[l] * normal
223 vmul.f32 q5, q8, q3
224 vpadd.f32 d8, d8, d9
225 vpadd.f32 d10, d10, d11
226 vpadd.f32 d8, d8, d10 @ d8 = [1][0] fCosT
227 vcgt.f32 d9, d8, #0 @ if (!(fCosT > 0))
228 vand d8, d9 @ fCosT = 0
229 add r9, r7, #OFFSETOF_Light_fr
230 vld1.32 {q8}, [r9] @ .fr .fg .fb
231 vdup.32 q5, d8[1] @ [1] fCosT (dup)
232 vdup.32 q4, d8[0] @
233 vmla.f32 q7, q8, q5 @ .rgb += frgb * fCosT
234 vmla.f32 q6, q8, q4
235 add r7, #SIZEOF_Light
236 subs r8, #1
237 bgt 1b
238
@ movt replaces only the top halfword, so the constant below depends on the
@ light counter in r8 having reached exactly 0.
@ NOTE(review): with gRSPnumLights == 0 the loop above still runs once and
@ leaves r8 = -1, which would give 0x437fffff here - confirm that callers
@ never enable lighting with zero lights.
239 movt r8, #0x437f @ float 255
240 vdup.32 q8, r8
241 vcgt.f32 q4, q6, q8 @ if (.rgb > 255)
242 vcgt.f32 q5, q7, q8
243 vbit q6, q8, q4 @ .rgb = 255
244 vbit q7, q8, q5
245 vcvt.u32.f32 q6, q6
246 vcvt.u32.f32 q7, q7
247 ldrb r8, [r4, #-4] @ .a from vtx
248 ldrb r9, [r5, #-4]
249 vext.32 q4, q6, q6, #3 @ reg: .abgr -> .bgra
250 vext.32 q5, q7, q7, #3
251 vmov.32 d8[0], r8 @ use .a from input
252 vmov.32 d10[0], r9
253 vmovn.u32 d8, q4
254 vmovn.u32 d10, q5
255 vmovn.u16 d0, q4
256 vmovn.u16 d2, q5
257 vsli.u64 d0, d2, #32
258 vrev32.8 d0, d0 @ 0xbbggrraa -> 0xaarrggbb
259 b pv_neon_fog_alpha
260
261pv_neon_no_light:
@ d0 is preloaded with the two primitiveColor words; vldr leaves the flags
@ from tst untouched, so the beq below still tests PV_NEON_ENABLE_SHADE
262 tst r10, #PV_NEON_ENABLE_SHADE
263 vldr d0, [sp, #0x64+0x2c] @ primitiveColor [0] [1]
264 beq pv_neon_fog_alpha
265 @ easier to do with ARM
266 ldr r8, [r4, #-4]
267 ldr r9, [r5, #-4]
268 ror r8, #8 @ mem: .argb -> .rgba
269 ror r9, #8 @ reg: 0xbbggrraa -> ..
270 vmov d0, r8, r9
271
272pv_neon_fog_alpha:
@ when enabled, the top byte of each color in d0 is replaced with .z limited
@ to the 0..1 range and scaled by (float)(255 << 24)
273 tst r10, #PV_NEON_FOG_ALPHA
274 beq pv_neon_next
275 vmov.f32 d20, #1.0
276 vcgt.f32 d2, d1, d20 @ [0] [1] .z > 1.0?
277 vcgt.f32 d3, d1, #0 @ > 0?
278 movw r8, #0
279 movt r8, #0x4f7f @ r8 = (float)(255<<24)
280 vbit d1, d20, d2 @ make 1.0 if needed
281 vand d1, d3
282 vdup.32 d4, r8
283 vmul.f32 d1, d1, d4
284 vcvt.u32.f32 d1, d1
285 vmov.u32 d5, #0xff000000
286 vbit d0, d1, d5
287
288pv_neon_next:
@ r4 and r5 have each already advanced 16 bytes across their own vertex; the
@ extra 16 below skips the vertex that was handled through the other pointer.
@ NOTE(review): the count drops by 2 per pass, so an odd dwNum still reads
@ and writes one extra vertex - presumably the buffers allow for that; confirm.
289 subs r11, #2
290 vst1.32 {d0}, [r2]! @ g_dwVtxDifColor
291 add r4, #16
292 add r5, #16
293 bgt 0b
294 nop
5c6423ae 295
296 vpop {q4-q7}
297 pop {r4-r11,pc}
61b9f2df 298 .size pv_neon, .-pv_neon
5c6423ae 299
300
@ (float *d, const float *m1, const float *m2, const float *s)
@
@ Two-lane multiply-subtract: d[i] = m1[i] * m2[i] - s[i] for i = 0, 1.
@   in:   r0 = d, r1 = m1, r2 = m2, r3 = s
@   uses: d0-d3
FUNCTION(multiply_subtract2):
    vld1.32     {d3}, [r3]              @ d3 = s[0], s[1]
    vld1.32     {d2}, [r2]              @ d2 = m2[0], m2[1]
    vld1.32     {d1}, [r1]              @ d1 = m1[0], m1[1]
    vmul.f32    d0, d1, d2              @ d0 = m1 * m2
    vsub.f32    d0, d0, d3              @ d0 -= s
    vst1.32     {d0}, [r0]              @ store both results to d
    bx          lr
    .size       multiply_subtract2, .-multiply_subtract2
311
312
@ (const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2)
@
@ Returns in r0 the raw bits of the float
@   ((v0.w * v2.w) * v1.w) * (V1 * W2 - V2 * W1)
@ where V = v2.xy - v0.xy and W = v2.xy - v1.xy (index 1 = x, 2 = y).
@ Assumes XVECTOR4 is four consecutive floats x, y, z, w - TODO confirm
@ against the C declaration.
@   in:   r0 = v0, r1 = v1, r2 = v2
@   uses: d0-d7
FUNCTION(tv_direction):
    vld1.32     {q0}, [r0]              @ q0 = v0
    vld1.32     {q2}, [r2]              @ q2 = v2
    vld1.32     {q1}, [r1]              @ q1 = v1
    vsub.f32    d6, d4, d0              @ d6 = V2,V1
    vsub.f32    d7, d4, d2              @ d7 = W2,W1
    vmul.f32    d1, d5                  @ d1 = v0.w * v2.w
    vrev64.32   d7, d7                  @ d7 = W1,W2
    vmul.f32    d6, d7                  @ d6 = V2*W1,V1*W2
    vmul.f32    d1, d3                  @ d1 *= v1.w
    vshr.u64    d7, d6, #32             @ move V2*W1 down to lane 0
    vsub.f32    d6, d7                  @ d6[0] = V1*W2 - V2*W1
    vshr.u64    d1, d1, #32             @ move the w product down to lane 0
    vmul.f32    d0, d1, d6
    vmov.32     r0, d0[0]               @ result is returned in a core register
    bx          lr
    .size       tv_direction, .-tv_direction
330
331
61b9f2df 332@ vim:filetype=armasm:expandtab