2 * (C) Gražvydas "notaz" Ignotas, 2014
4 * This work is licensed under the terms of GNU GPL version 2 or later.
5 * See the COPYING file in the top-level directory.
8 #include "arm_features.h"
9 #include "RenderBase_neon.h"
16 * ProcessVertexData register map:
20 * 12 24 gRSPworldProject _11,_12,_13,_14
22 * 13 26 gRSPworldProject _21,_22,_23,_24
24 * 14 28 gRSPworldProject _31,_32,_33,_34
26 * 15 30 gRSPworldProject _41,_42,_43,_44
30 * short y, x, flag, z, tv, tu;
35 * r0 - XVECTOR4 *g_vtxTransformed
36 * r1 - XVECTOR4 *g_vecProjected
37 * r2 - uint32 *g_dwVtxDifColor
38 * r3 - VECTOR2 *g_fVtxTxtCoords
39 * sp+00 - float *g_fFogCoord
40 * r6 sp+04 - uint32 *g_clipFlag2
42 * r11 sp+08 - uint32 dwNum
43 * r10 sp+0c - int neon_flags
44 * r4 sp+10 - FiddledVtx vtx[], (r4 [0], r5 [1])
45 * r7 sp+14 - Light *gRSPlights
46 * sp+18 - float *fRSPAmbientLightRGBA
47 * sp+1c - XMATRIX *gRSPworldProject
48 * sp+20 - XMATRIX *gRSPmodelViewTop
49 * sp+24 - uint32 gRSPnumLights
50 * sp+28 - float gRSPfFogMin
51 * sp+2c - uint32 primitiveColor
52 * sp+30 - uint32 primitiveColor (second copy; read together with sp+2c by one 64-bit vldr)
@ Body of pv_neon (ProcessVertexData, see the register map above).
@ Vertices are handled in pairs: [0] is read through r4, [1] through r5.
@ NOTE(review): the 0x64 bias on every stack-argument offset is presumably
@ the number of bytes pushed by the prologue (not visible here) - confirm.
62 ldr r12, [sp, #0x64+0x1c] @ gRSPworldProject
63 vld1.32 {q12,q13}, [r12, :128]! @ load gRSPworldProject
64 vld1.32 {q14,q15}, [r12, :128] @ rows _31.. and _41.. (see register map)
65 ldr r6, [sp, #0x64+0x04] @ g_clipFlag2
66 add r5, r4, #16 @ vtx + 1
67 ldr r11, [sp, #0x64+0x08] @ dwNum
68 ldr r10, [sp, #0x64+0x0c] @ neon_flags
71 vld1.16 d12, [r4]! @ vtx[0] .z .flag .x .y (reg)
73 vld1.16 d14, [r5]! @ vtx[1] .z .flag .x .y
75 vcvt.f32.s32 q6, q6 @ q6 = vtx_raw0
76 vcvt.f32.s32 q7, q7 @ q7 = vtx_raw1
77 vdup.32 q0, d12[1] @ vtx_raw0.x (dup)
78 vdup.32 q1, d12[0] @ vtx_raw0.y (dup)
79 vdup.32 q2, d13[1] @ vtx_raw0.z (dup)
80 vdup.32 q3, d14[1] @ vtx_raw1.x (dup)
81 vdup.32 q4, d14[0] @ vtx_raw1.y (dup)
82 vdup.32 q5, d15[1] @ vtx_raw1.z (dup)
83 /* note: order of operations matters greatly,
84 * may cause like 20 fraction bits to differ! */
89 vmul.f32 q2, q2, q14 @ yes, mul+add is
90 vmul.f32 q5, q5, q14 @ faster than mla
93 vadd.f32 q0, q15 @ q0 = g_vtxTransformed[i]
94 vadd.f32 q3, q15 @ q3 = g_vtxTransformed[i + 1]
96 vld1.16 d16[1], [r4]! @ [0].v
98 vld1.16 d16[0], [r4]! @ [0].u
100 vld1.16 d18[1], [r5]! @ [1].v (r5 walks vtx[1])
102 vrecpe.f32 d4, d2 @ inv [0][1] .w
103 vld1.16 d18[0], [r5]! @ [1].u
104 vrecps.f32 d5, d2, d4 @ step
106 /* g_vtxTransformed[0] */ vst1.32 {q0}, [r0, :128]!
108 vcvt.f32.s32 d16, d16 @ [0] u,v: s32 -> f32
109 vcvt.f32.s32 d18, d18 @ [1] u,v: s32 -> f32
110 vmul.f32 d4, d5, d4 @ better inv
113 vrecps.f32 d5, d2, d4 @ step
115 /* u,v g_fVtxTxtCoords[0] */ vst1.32 {d16}, [r3]!
117 /* g_vtxTransformed[1] */ vst1.32 {q3}, [r0, :128]!
118 /* ... [1] */ vst1.32 {d18}, [r3]!
122 vmul.f32 d4, d5, d4 @ better inv [0][1] .w
124 vrecps.f32 d5, d2, d4 @ step
125 vmul.f32 d4, d5, d4 @ better inv
128 mov r12, #0x3f800000 @ 1.0f
138 vmul.f32 q0, q0, d4[1] @ .x .y .z .w *= [0] .w
139 vmul.f32 q1, q3, d4[0] @ q1 = [1] .x .y .z .w * [1] .w
140 vshr.u64 d5, d4, #32 @ [0] .w
144 vsli.64 d3, d4, #32 @ insert [1] .w
146 vsli.u64 d5, d4, #32 @ [0] [1] .w
147 vcgt.f32 d6, d0, d20 @ .xy > 1.0?
149 vcgt.f32 d4, d5, #0 @ .w > 0?
150 vst1.32 {q0}, [r1]! @ g_vecProjected[0]
153 vld1.32 d0[0], [r4]! @ mem: [0] .azyx
158 vst1.32 {q1}, [r1]! @ g_vecProjected[1]
162 vld1.32 d0[1], [r5]! @ mem: [1] .azyx
164 vrev32.8 d0, d0 @ make 0xaazzyyxx [1][0]
165 vsli.u64 d1, d3, #32 @ d1 = [1] [0] .z (upper lane inserted from d3)
170 vst1.32 {d6}, [r6]! @ g_clipFlag2
172 tst r10, #PV_NEON_ENABLE_LIGHT @ test lighting bit in neon_flags
175 @ live NEON registers:
176 @ d1 = [1][0] .z (must preserve)
177 @ q1,q2 = azyx [1][0]
178 @ q12+ = gRSPworldProject
179 ldr r12, [sp, #0x64+0x20] @ gRSPmodelViewTop
182 vld1.32 {q8,q9}, [r12, :128]! @ load gRSPmodelViewTop
183 vld1.32 {q10}, [r12, :128] @ third row; only q8-q10 are loaded here
185 vdup.32 q5, d4[0] @ [1] .x (dup)
186 vdup.32 q6, d4[1] @ [1] .y (dup)
187 vdup.32 q7, d5[0] @ [1] .z (dup)
188 vdup.32 q2, d2[0] @ [0] .x (dup)
189 vdup.32 q3, d2[1] @ [0] .y (dup)
190 vdup.32 q4, d3[0] @ [0] .z (dup)
197 vadd.f32 q4, q2 @ q4 = temp[0] .xyz0
198 vadd.f32 q5, q7 @ q5 = temp[1] .xyz0
199 vmul.f32 q2, q4, q4 @ temp .xyz0 ^2
204 movt r8, #0x7f7f @ max normal float, ~3.4e+38
206 vpadd.f32 d2, d2, d3 @ d2 = [1][0] x^2 + y^2 + z^2
208 vbif d2, d4, d5 @ if (d2 == 0) d2 = MAXFLOAT
210 vrsqrte.f32 d3, d2 @ ~ 1/sqrt(d2), d2 = [1][0] .sqrsum
212 ldr r9, [sp, #0x64+0x18] @ &fRSPAmbientLightRGBA
213 ldr r7, [sp, #0x64+0x14] @ gRSPlights
214 ldr r8, [sp, #0x64+0x24] @ gRSPnumLights
215 vrsqrts.f32 d4, d3, d4 @ step
216 vld1.32 {q6}, [r9] @ rgb
217 vld1.32 {q7}, [r9] @ rgb (second copy; q7 accumulates for [1] below)
218 vmul.f32 d3, d3, d4 @ 1/sqrt(d2)
219 #if 0 /* not necessary? */
221 vrsqrts.f32 d4, d3, d4 @ step
222 vmul.f32 d3, d3, d4 @ 1/sqrt(d2)
224 vmul.f32 q2, q4, d3[0] @ q2 = normal[0] .xyz
225 vmul.f32 q3, q5, d3[1] @ q3 = normal[1] .xyz
229 vmul.f32 q4, q8, q2 @ gRSPlights[l] * normal
232 vpadd.f32 d10, d10, d11
233 vpadd.f32 d8, d8, d10 @ d8 = [1][0] fCosT
234 vcgt.f32 d9, d8, #0 @ if (!(fCosT > 0))
235 vand d8, d9 @ fCosT = 0
236 add r9, r7, #OFFSETOF_Light_fr @ r9 = &light->fr (r7 = current light)
237 vld1.32 {q8}, [r9] @ .fr .fg .fb
238 vdup.32 q5, d8[1] @ [1] fCosT (dup)
240 vmla.f32 q7, q8, q5 @ .rgb += frgb * fCosT
242 add r7, #SIZEOF_Light @ advance r7 to the next light
246 movt r8, #0x437f @ float 255
248 vcgt.f32 q4, q6, q8 @ if (.rgb > 255)
250 vbit q6, q8, q4 @ .rgb = 255
254 ldrb r8, [r4, #-4] @ .a from vtx
256 vext.32 q4, q6, q6, #3 @ reg: .abgr -> .bgra
257 vext.32 q5, q7, q7, #3
258 vmov.32 d8[0], r8 @ use .a from input
265 vrev32.8 d0, d0 @ 0xbbggrraa -> 0xaarrggbb
269 tst r10, #PV_NEON_ENABLE_SHADE @ test shade bit in neon_flags
270 vldr d0, [sp, #0x64+0x2c] @ primitiveColor [0] [1]
271 beq pv_neon_fog_alpha
272 @ easier to do with ARM
275 ror r8, #8 @ mem: .argb -> .rgba
276 ror r9, #8 @ reg: 0xbbggrraa -> ..
280 tst r10, #PV_NEON_FOG_ALPHA @ test fog-alpha bit in neon_flags
283 vcgt.f32 d2, d1, d20 @ [0] [1] .z > 1.0?
284 vcgt.f32 d3, d1, #0 @ > 0?
286 movt r8, #0x4f7f @ r8 = (float)(255<<24)
287 vbit d1, d20, d2 @ make 1.0 if needed
292 vmov.u32 d5, #0xff000000 @ presumably the alpha-byte mask for 0xaarrggbb - confirm
297 vst1.32 {d0}, [r2]! @ g_dwVtxDifColor
305 .size pv_neon, .-pv_neon
308 @ (float *d, const float *m1, const float *m2, const float *s)
@ NOTE(review): the body of this routine is not visible in this view; only
@ the prototype comment above and the symbol-size directive below are.
@ Judging by the name alone it presumably writes a multiply-subtract of
@ m1, m2 and s to d - confirm against the full source.
309 FUNCTION(multiply_subtract2):
317 .size multiply_subtract2, .-multiply_subtract2 @ symbol size = bytes from the label to here
320 @ (const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2)
@ Register use implied by the comments below: q0 (d0,d1) = *v0,
@ q1 (d2,d3) = *v1, q2 (d4,d5) = *v2.
@ NOTE(review): the loads, the lane shuffles between these steps and the
@ return sequence are not visible in this view.
321 FUNCTION(tv_direction):
325 vsub.f32 d6, d4, d0 @ d6 = V2,V1 (v2.xy - v0.xy)
326 vsub.f32 d7, d4, d2 @ d7 = W2,W1 (v2.xy - v1.xy)
327 vmul.f32 d1, d5 @ d1 = v0.w * v2.w
329 vmul.f32 d6, d7 @ d6 = V2*W1,V1*W2
330 vmul.f32 d1, d3 @ d1 *= v1.w
332 vsub.f32 d6, d7 @ d6[0] = V1*W2 - V2*W1
339 @ vim:filetype=armasm:expandtab