rice: neon: fix last vertex overwrite
[mupen64plus-pandora.git] / source / gles2rice / src / RenderBase_neon.S
CommitLineData
5c6423ae 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2014
3 *
4 * This work is licensed under the terms of GNU GPL version 2 or later.
5 * See the COPYING file in the top-level directory.
6 */
7
8#include "arm_features.h"
61b9f2df 9#include "RenderBase_neon.h"
5c6423ae 10
11.syntax unified
12.text
13.align 3
14
5c6423ae 15/*
16 * ProcessVertexData register map:
17 *
18 * q | d | c code
5c6423ae 19 * ...
61b9f2df 20 * 12 24 gRSPworldProject _11,_12,_13,_14
5c6423ae 21 * 25
61b9f2df 22 * 13 26 gRSPworldProject _21,_22,_23,_24
5c6423ae 23 * 27
61b9f2df 24 * 14 28 gRSPworldProject _31,_32,_33,_34
5c6423ae 25 * 29
61b9f2df 26 * 15 30 gRSPworldProject _41,_42,_43,_44
5c6423ae 27 * 31
28 *
29 * r4 vtx[], 16 bytes:
30 * short y, x, flag, z, tv, tu;
31 * / uint8 a, b, g, r;
32 * \ char a, z, y, x;
33 *
34 * outputs:
35 * r0 - XVECTOR4 *g_vtxTransformed
36 * r1 - XVECTOR4 *g_vecProjected
37 * r2 - uint32 *g_dwVtxDifColor
38 * r3 - VECTOR2 *g_fVtxTxtCoords
39 * sp+00 - float *g_fFogCoord
61b9f2df 40 * r6 sp+04 - uint32 *g_clipFlag2
5c6423ae 41 * inputs:
42 * r11 sp+08 - uint32 dwNum
61b9f2df 43 * r10 sp+0c - int neon_flags
44 * r4 sp+10 - FiddledVtx vtx[], (r4 [0], r5 [1])
45 * r7 sp+14 - Light *gRSPlights
5c6423ae 46 * sp+18 - float *fRSPAmbientLightRGBA
61b9f2df 47 * sp+1c - XMATRIX *gRSPworldProject
5c6423ae 48 * sp+20 - XMATRIX *gRSPmodelViewTop
49 * sp+24 - uint32 gRSPnumLights
50 * sp+28 - float gRSPfFogMin
61b9f2df 51 * sp+2c - uint32 primitiveColor (used for vtx [0])
52 * sp+30 - uint32 primitiveColor (used for vtx [1]; both words are loaded as one d register)
5c6423ae 53 */
/*
 * pv_neon: process the vertex array two vertices per loop pass.
 *
 * Each pass reads vtx[i] through r4 and vtx[i+1] through r5, then:
 *  - multiplies .x .y .z by gRSPworldProject (q12-q15) and stores the
 *    result to g_vtxTransformed (r0)
 *  - scales it by a reciprocal estimate of .w and stores it to
 *    g_vecProjected (r1), with the reciprocal placed in the .w lane
 *  - converts tu/tv to float and stores them to g_fVtxTxtCoords (r3)
 *  - builds clip flags and stores two words to g_clipFlag2 (r6)
 *  - picks a color (lighting, vertex color or primitiveColor, depending
 *    on neon_flags in r10), optionally replaces its top byte with a
 *    value derived from projected .z, and stores two words to
 *    g_dwVtxDifColor (r2)
 *
 * When only one vertex is left (r11 == 1) the g_vtxTransformed,
 * g_fVtxTxtCoords and g_vecProjected stores for vtx[i+1] are skipped;
 * the g_clipFlag2 and g_dwVtxDifColor stores still write two entries.
 *
 * The sp+00 (g_fFogCoord) and sp+28 (gRSPfFogMin) arguments are not
 * read anywhere in this routine.
 * X_CLIP_*, Y_CLIP_*, PV_NEON_*, OFFSETOF_Light_fr and SIZEOF_Light are
 * not defined in this file (presumably RenderBase_neon.h - confirm).
 */
61b9f2df 54FUNCTION(pv_neon):
5c6423ae 55 ldr r12, [sp, #0x10] @ vtx, fetched before the pushes below move sp
56 pld [r12]
57
58 push {r4-r11,lr}
59 vpush {q4-q7}
60
/* 9 core registers (0x24 bytes) plus q4-q7 (0x40 bytes) are now pushed,
 * so stack argument N is found at sp + 0x64 + N from here on */
61 mov r4, r12 @ vtx
62 ldr r12, [sp, #0x64+0x1c] @ gRSPworldProject
5c6423ae 63 vld1.32 {q12,q13}, [r12, :128]! @ load gRSPworldProject
64 vld1.32 {q14,q15}, [r12, :128]
61b9f2df 65 ldr r6, [sp, #0x64+0x04] @ g_clipFlag2
66 add r5, r4, #16 @ vtx + 1
67 ldr r11, [sp, #0x64+0x08] @ dwNum
68 ldr r10, [sp, #0x64+0x0c] @ neon_flags
5c6423ae 69
/* main loop: r4 walks vtx[i], r5 walks vtx[i + 1] */
700:
61b9f2df 71 vld1.16 d12, [r4]! @ vtx[0] .z .flag .x .y (reg)
72 vmovl.s16 q6, d12
73 vld1.16 d14, [r5]! @ vtx[1] .z .flag .x .y
74 vmovl.s16 q7, d14
75 vcvt.f32.s32 q6, q6 @ q6 = vtx_raw0
76 vcvt.f32.s32 q7, q7 @ q7 = vtx_raw1
77 vdup.32 q0, d12[1] @ vtx_raw0.x (dup)
78 vdup.32 q1, d12[0] @ vtx_raw0.y (dup)
79 vdup.32 q2, d13[1] @ vtx_raw0.z (dup)
80 vdup.32 q3, d14[1] @ vtx_raw1.x (dup)
81 vdup.32 q4, d14[0] @ vtx_raw1.y (dup)
82 vdup.32 q5, d15[1] @ vtx_raw1.z (dup)
83 /* note: order of operations matters greatly,
84 * may cause like 20 fraction bits to differ! */
85 vmul.f32 q0, q0, q12
86 vmul.f32 q3, q3, q12
87 vmla.f32 q0, q1, q13
88 vmla.f32 q3, q4, q13
89 vmul.f32 q2, q2, q14 @ yes, mul+add is
90 vmul.f32 q5, q5, q14 @ faster than mla
91 vadd.f32 q0, q2
92 vadd.f32 q3, q5
93 vadd.f32 q0, q15 @ q0 = g_vtxTransformed[i]
94 vadd.f32 q3, q15 @ q3 = g_vtxTransformed[i + 1]
5c6423ae 95
/* texture coords are loaded and converted while a reciprocal of both
 * .w values is refined in d4 (estimate + two vrecps/vmul steps) */
61b9f2df 96 vld1.16 d16[1], [r4]! @ [0].v
97 vmov d2, d1 @ d2 = [0] .w .z (reg)
98 vld1.16 d16[0], [r4]! @ [0].u
99 vsri.64 d2, d7, #32 @ low lane = [1] .w, high lane keeps [0] .w
100 vld1.16 d18[1], [r5]! @ [1].v
5c6423ae 101#if 1
61b9f2df 102 vrecpe.f32 d4, d2 @ inv [0][1] .w
103 vld1.16 d18[0], [r5]! @ [1].u
104 vrecps.f32 d5, d2, d4 @ step
105 vmovl.s16 q8, d16
42669f3e 106 /* g_vtxTransformed[0] */ vst1.32 {q0}, [r0, :128]!
61b9f2df 107 vmovl.s16 q9, d18
61b9f2df 108 vcvt.f32.s32 d16, d16
109 vcvt.f32.s32 d18, d18
110 vmul.f32 d4, d5, d4 @ better inv
111 bic r9, r5, #63 @ round r5 down to a 64 byte boundary
112 pld [r9, #64] @ prefetch the 64 byte block after it
113 vrecps.f32 d5, d2, d4 @ step
42669f3e 114 cmp r11, #1 @ is vtx[0] the last vertex?
115 /* u,v g_fVtxTxtCoords[0] */ vst1.32 {d16}, [r3]!
116 beq 99f @ yes: skip the vtx[1] stores
117 /* g_vtxTransformed[1] */ vst1.32 {q3}, [r0, :128]!
61b9f2df 118 /* ... [1] */ vst1.32 {d18}, [r3]!
42669f3e 119 99:
61b9f2df 120 vmov.f32 d20, #1.0
121 vmov.f32 d21, #-1.0
122 vmul.f32 d4, d5, d4 @ better inv [0][1] .w
5c6423ae 123 #if 0
61b9f2df 124 vrecps.f32 d5, d2, d4 @ step
125 vmul.f32 d4, d5, d4 @ better inv
5c6423ae 126 #endif
127#else
61b9f2df 128 mov r12, #0x3f800000 @ 1.0f
129 vmov.f32 s6, r12
130 vdiv.f32 s8, s6, s4
131 vdiv.f32 s9, s6, s5
132 #error incomplete
5c6423ae 133#endif
134
/* projection and clip flags: q11 (d22, d23) holds the flag bits that
 * are and-ed with the compare masks below */
61b9f2df 135 mov r8, #X_CLIP_MAX
136 mov r9, #Y_CLIP_MAX
137 vmov d22, r8, r9 @ d22 = X_CLIP_MAX, Y_CLIP_MAX
138 vmul.f32 q0, q0, d4[1] @ .x .y .z .w *= [0] .w
139 vmul.f32 q1, q3, d4[0] @ same for [1], result in q1
140 vshr.u64 d5, d4, #32 @ [0] .w
141 mov r8, #X_CLIP_MIN
142 mov r9, #Y_CLIP_MIN
143 vmov d23, r8, r9 @ d23 = X_CLIP_MIN, Y_CLIP_MIN
144 vsli.64 d3, d4, #32 @ insert [1] .w
145 vsli.64 d1, d5, #32 @ same for [0], value taken from d5
146 vsli.u64 d5, d4, #32 @ [0] [1] .w
147 vcgt.f32 d6, d0, d20 @ .xy > 1.0?
148 vcgt.f32 d7, d21, d0 @ .xy < -1.0?
149 vcgt.f32 d4, d5, #0 @ .w > 0?
42669f3e 150 vst1.32 {q0}, [r1]! @ g_vecProjected[0]
61b9f2df 151 vcgt.f32 d8, d2, d20 @ same two compares for [1]
152 vcgt.f32 d9, d21, d2
153 vld1.32 d0[0], [r4]! @ mem: [0] .azyx
154 vand q3, q11 @ compare masks -> clip flag bits ([0])
155 vand q4, q11 @ same for [1]
42669f3e 156 cmp r11, #1 @ last vertex again: skip the [1] store
157 beq 99f
158 vst1.32 {q1}, [r1]! @ g_vecProjected[1]
15999:
61b9f2df 160 vorr d6, d6, d7 @ merge max and min flags ([0])
161 vorr d7, d8, d9 @ merge max and min flags ([1])
162 vld1.32 d0[1], [r5]! @ mem: [1] .azyx
163 vpadd.u32 d6, d7 @ d6 = x+y flags of [0], x+y flags of [1]
164 vrev32.8 d0, d0 @ make 0xaazzyyxx [1][0]
165 vsli.u64 d1, d3, #32 @ d1 = [1] [0] .z
166 vmovl.s8 q4, d0
167 vand d6, d4 @ keep flags only where the .w compare passed
168 vmovl.s16 q1, d8 @ q1 = [0] bytes widened to s32
169 vmovl.s16 q2, d9 @ q2 = [1] bytes widened to s32
170 vst1.32 {d6}, [r6]! @ g_clipFlag2
171
172 tst r10, #PV_NEON_ENABLE_LIGHT
173 beq pv_neon_no_light
174@ pv_neon_light:
175 @ live NEON registers:
176 @ d1 = [1][0] .z (must preserve)
177 @ q1,q2 = azyx [1][0]
178 @ q12+ = gRSPworldProject
179 ldr r12, [sp, #0x64+0x20] @ gRSPmodelViewTop
180 vcvt.f32.s32 q1, q1
181 vcvt.f32.s32 q2, q2
182 vld1.32 {q8,q9}, [r12, :128]! @ load gRSPmodelViewTop
183 vld1.32 {q10}, [r12, :128] @ only three rows are loaded
184
185 vdup.32 q5, d4[0] @ [1] .x (dup)
186 vdup.32 q6, d4[1] @ [1] .y (dup)
187 vdup.32 q7, d5[0] @ [1] .z (dup)
188 vdup.32 q2, d2[0] @ [0] .x (dup)
189 vdup.32 q3, d2[1] @ [0] .y (dup)
190 vdup.32 q4, d3[0] @ [0] .z (dup)
191 vmul.f32 q2, q2, q8
192 vmul.f32 q5, q5, q8
193 vmla.f32 q2, q3, q9
194 vmla.f32 q5, q6, q9
195 vmul.f32 q4, q4, q10
196 vmul.f32 q7, q7, q10
197 vadd.f32 q4, q2 @ q4 = temp[0] .xyz0
198 vadd.f32 q5, q7 @ q5 = temp[1] .xyz0
199 vmul.f32 q2, q4, q4 @ temp .xyz0 ^2
200 vmul.f32 q3, q5, q5
201 vpadd.f32 d2, d4, d5
202 vpadd.f32 d3, d6, d7
203 movw r8, #0x0000ffff
204 movt r8, #0x7f7f @ max normal float, ~3.4e+38
205 vdup.32 d4, r8
206 vpadd.f32 d2, d2, d3 @ d2 = [1][0] x^2 + y^2 + z^2
207 vcgt.f32 d5, d2, #0
208 vbif d2, d4, d5 @ if (d2 == 0) d2 = MAXFLOAT
209
210 vrsqrte.f32 d3, d2 @ ~ 1/sqrt(d2), d2 = [1][0] .sqrsum
211 vmul.f32 d4, d3, d2
212 ldr r9, [sp, #0x64+0x18] @ &fRSPAmbientLightRGBA
213 ldr r7, [sp, #0x64+0x14] @ gRSPlights
214 ldr r8, [sp, #0x64+0x24] @ gRSPnumLights
215 vrsqrts.f32 d4, d3, d4 @ step
216 vld1.32 {q6}, [r9] @ rgb
217 vld1.32 {q7}, [r9] @ rgb
218 vmul.f32 d3, d3, d4 @ 1/sqrt(d2)
219#if 0 /* not necessary? */
220 vmul.f32 d4, d3, d2
221 vrsqrts.f32 d4, d3, d4 @ step
222 vmul.f32 d3, d3, d4 @ 1/sqrt(d2)
223#endif
224 vmul.f32 q2, q4, d3[0] @ q2 = normal[0] .xyz
225 vmul.f32 q3, q5, d3[1] @ q3 = normal[1] .xyz
226
/* per-light loop: r7 = current light, r8 = lights left.
 * q6 accumulates the color of [0], q7 the color of [1].
 * The body runs before the count is tested, so it executes at least once. */
2271:
228 vld1.32 {q8}, [r7]
229 vmul.f32 q4, q8, q2 @ gRSPlights[l] * normal
230 vmul.f32 q5, q8, q3
231 vpadd.f32 d8, d8, d9
232 vpadd.f32 d10, d10, d11
233 vpadd.f32 d8, d8, d10 @ d8 = [1][0] fCosT
234 vcgt.f32 d9, d8, #0 @ if (!(fCosT > 0))
235 vand d8, d9 @ fCosT = 0
236 add r9, r7, #OFFSETOF_Light_fr
237 vld1.32 {q8}, [r9] @ .fr .fg .fb
238 vdup.32 q5, d8[1] @ [1] fCosT (dup)
239 vdup.32 q4, d8[0] @
240 vmla.f32 q7, q8, q5 @ .rgb += frgb * fCosT
241 vmla.f32 q6, q8, q4
242 add r7, #SIZEOF_Light
243 subs r8, #1
244 bgt 1b
245
/* NOTE(review): movt writes only the top half of r8; the low half is
 * whatever the loop counter left there (0 when the loop ran to zero) */
246 movt r8, #0x437f @ float 255
247 vdup.32 q8, r8
248 vcgt.f32 q4, q6, q8 @ if (.rgb > 255)
249 vcgt.f32 q5, q7, q8
250 vbit q6, q8, q4 @ .rgb = 255
251 vbit q7, q8, q5
252 vcvt.u32.f32 q6, q6
253 vcvt.u32.f32 q7, q7
254 ldrb r8, [r4, #-4] @ .a from vtx
255 ldrb r9, [r5, #-4]
256 vext.32 q4, q6, q6, #3 @ reg: .abgr -> .bgra
257 vext.32 q5, q7, q7, #3
258 vmov.32 d8[0], r8 @ use .a from input
259 vmov.32 d10[0], r9
260 vmovn.u32 d8, q4 @ narrow 32 -> 16 bit lanes
261 vmovn.u32 d10, q5
262 vmovn.u16 d0, q4 @ narrow 16 -> 8 bit lanes
263 vmovn.u16 d2, q5
264 vsli.u64 d0, d2, #32 @ d0 = [1][0] packed color
265 vrev32.8 d0, d0 @ 0xbbggrraa -> 0xaarrggbb
266 b pv_neon_fog_alpha
267
268pv_neon_no_light:
269 tst r10, #PV_NEON_ENABLE_SHADE
270 vldr d0, [sp, #0x64+0x2c] @ primitiveColor [0] [1]
271 beq pv_neon_fog_alpha @ vldr leaves the flags alone, so this tests the tst above
272 @ easier to do with ARM
273 ldr r8, [r4, #-4] @ last word of vtx[0] (r4 is already 16 bytes past its start)
274 ldr r9, [r5, #-4] @ last word of vtx[1]
275 ror r8, #8 @ mem: .argb -> .rgba
276 ror r9, #8 @ reg: 0xbbggrraa -> ..
277 vmov d0, r8, r9
278
/* optional: clamp projected .z to 0..1, scale it to the top byte and
 * insert it into both colors in d0 */
279pv_neon_fog_alpha:
280 tst r10, #PV_NEON_FOG_ALPHA
281 beq pv_neon_next
282 vmov.f32 d20, #1.0 @ set again: q10 is overwritten on the light path
283 vcgt.f32 d2, d1, d20 @ [0] [1] .z > 1.0?
284 vcgt.f32 d3, d1, #0 @ > 0?
285 movw r8, #0
286 movt r8, #0x4f7f @ r8 = (float)(255<<24)
287 vbit d1, d20, d2 @ make 1.0 if needed
288 vand d1, d3 @ make 0 unless > 0
289 vdup.32 d4, r8
290 vmul.f32 d1, d1, d4
291 vcvt.u32.f32 d1, d1
292 vmov.u32 d5, #0xff000000
293 vbit d0, d1, d5 @ replace only the top byte of each color
294
295pv_neon_next:
296 subs r11, #2 @ two vertices done
297 vst1.32 {d0}, [r2]! @ g_dwVtxDifColor
298 add r4, #16 @ r4 and r5 each advanced 16 bytes above;
299 add r5, #16 @ skip the vertex the other pointer handled
300 bgt 0b
301 nop
5c6423ae 302
303 vpop {q4-q7}
304 pop {r4-r11,pc}
61b9f2df 305 .size pv_neon, .-pv_neon
5c6423ae 306
307
@ multiply_subtract2(float *d, const float *m1, const float *m2, const float *s)
@
@ Two-lane single precision operation:
@     d[i] = m1[i] * m2[i] - s[i]      for i = 0, 1
@
@ In:    r0 = d, r1 = m1, r2 = m2, r3 = s
@ Out:   two floats stored at [r0]
@ Clobb: d0-d2
FUNCTION(multiply_subtract2):
        vld1.32         {d0}, [r1]              @ d0 = m1[0], m1[1]
        vld1.32         {d1}, [r2]              @ d1 = m2[0], m2[1]
        vld1.32         {d2}, [r3]              @ d2 = s[0],  s[1]
        vmul.f32        d0, d0, d1              @ d0 = m1 * m2
        vsub.f32        d0, d0, d2              @ d0 = m1 * m2 - s
        vst1.32         {d0}, [r0]              @ write both results to d
        bx              lr
        .size           multiply_subtract2, .-multiply_subtract2
318
319
@ tv_direction(const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2)
@
@ With V = v2.xy - v0.xy and W = v2.xy - v1.xy, computes
@     (V.x * W.y - V.y * W.x) * (v0.w * v2.w * v1.w)
@ as a single precision float.
@
@ In:    r0 = v0, r1 = v1, r2 = v2 (each read as four consecutive floats)
@ Out:   r0 = result bits (the same value is also left in d0[0])
@ Clobb: q0-q3
FUNCTION(tv_direction):
        vld1.32         {q0}, [r0]              @ q0 = v0: d0 = .x .y, d1 = .z .w
        vld1.32         {q2}, [r2]              @ q2 = v2: d4 = .x .y, d5 = .z .w
        vld1.32         {q1}, [r1]              @ q1 = v1: d2 = .x .y, d3 = .z .w
        vsub.f32        d6, d4, d0              @ d6 = V2,V1
        vsub.f32        d7, d4, d2              @ d7 = W2,W1
        vmul.f32        d1, d5                  @ d1 = v0.w * v2.w
        vrev64.32       d7, d7                  @ d7 = W1,W2
        vmul.f32        d6, d7                  @ d6 = V2*W1,V1*W2
        vmul.f32        d1, d3                  @ d1 *= v1.w
        vshr.u64        d7, d6, #32             @ d7[0] = V2*W1
        vsub.f32        d6, d7                  @ d6[0] = V1*W2 - V2*W1
        vshr.u64        d1, d1, #32             @ d1[0] = product of the three .w
        vmul.f32        d0, d1, d6              @ d0[0] = result
        vmov.32         r0, d0[0]
        bx              lr
        .size           tv_direction, .-tv_direction    @ as for the other functions in this file
337
338
61b9f2df 339@ vim:filetype=armasm:expandtab