--- /dev/null
+/*
+ * (C) Gražvydas "notaz" Ignotas, 2014
+ *
+ * This work is licensed under the terms of GNU GPL version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "arm_features.h"
+
+.syntax unified
+.text
+.align 3
+
+/*
+ * do_mac_flags: compares \rr1 against 1, updating the condition flags.
+ * \rr2 and \rr3 are accepted but not referenced by the body, and no
+ * invocation of the macro is visible in this part of the file.
+ * NOTE(review): looks like a placeholder for flag computation - confirm.
+ */
+.macro do_mac_flags rr1 rr2 rr3
+ cmp \rr1, #1
+.endm
+
+
+/*
+ * ProcessVertexData register map:
+ *
+ * q | d | c code
+ * 0 0
+ * 1
+ * 1 2
+ * 3
+ * 2 4
+ * 5
+ * 3 6
+ * 7
+ * 4 8
+ * 9
+ * 5 10
+ * 11
+ * 6 12
+ * 13
+ * 7 14 g_vecProjected
+ * 15
+ * 8 16
+ * 17
+ * ...
+ * 12 24 gRSPworldProject _11,_21,_31,_41
+ * 25
+ * 13 26 gRSPworldProject _12,_22,_32,_42
+ * 27
+ * 14 28 gRSPworldProject _13,_23,_33,_43
+ * 29
+ * 15 30 gRSPworldProject _14,_24,_34,_44
+ * 31
+ *
+ * r4 vtx[], 16 bytes:
+ * short y, x, flag, z, tv, tu;
+ * / uint8 a, b, g, r;
+ * \ char a, z, y, x;
+ *
+ * outputs:
+ * r0 - XVECTOR4 *g_vtxTransformed
+ * r1 - XVECTOR4 *g_vecProjected
+ * r2 - uint32 *g_dwVtxDifColor
+ * r3 - VECTOR2 *g_fVtxTxtCoords
+ * sp+00 - float *g_fFogCoord
+ * sp+04 - uint32 *g_clipFlag2
+ * inputs:
+ * r11 sp+08 - uint32 dwNum
+ * sp+0c - int neon_flags
+ * r4 sp+10 - FiddledVtx vtx[]
+ * sp+14 - Light *gRSPlights
+ * sp+18 - float *fRSPAmbientLightRGBA
+ * sp+1c - XMATRIX *gRSPworldProjectTransported
+ * sp+20 - XMATRIX *gRSPmodelViewTop
+ * sp+24 - uint32 gRSPnumLights
+ * sp+28 - float gRSPfFogMin
+ */
+FUNCTION(pv_neon): @
+    /*
+     * For each of the dwNum input vertices:
+     *   - convert the s16 position to float and multiply { x, y, z, 1.0f }
+     *     by the 4x4 matrix held in q12-q15,
+     *   - store the result to g_vtxTransformed (r0),
+     *   - store { x/w, y/w, z/w, 1/w } to g_vecProjected (r1),
+     *   - store float { u, v } to g_fVtxTxtCoords (r3).
+     * r0, r1 and r3 are post-incremented, r4 walks vtx[] in 16-byte steps,
+     * r11 counts the remaining vertices and r12 holds the constant 1.0f.
+     *
+     * NOTE(review): g_dwVtxDifColor (r2), g_fFogCoord and g_clipFlag2 are
+     * listed as outputs in the header above, but nothing here writes them.
+     */
+    ldr         r12, [sp, #0x10]        @ vtx, fetched before sp is moved
+    pld         [r12]
+
+    push        {r4-r11,lr}             @ 9 words   = 0x24 bytes
+    vpush       {q4-q7}                 @ 4 q regs  = 0x40 bytes -> args at sp+0x64
+
+    mov         r4, r12                 @ vtx
+    ldr         r12, [sp, #0x64+0x1c]
+    ldr         r11, [sp, #0x64+0x08]   @ dwNum
+    vld1.32     {q12,q13}, [r12, :128]! @ load gRSPworldProject
+    vld1.32     {q14,q15}, [r12, :128]
+    mov         r12, #0x3f800000        @ 1.0f, loop invariant
+    cmp         r11, #0
+    beq         1f                      @ dwNum == 0: nothing to process
+
+0:
+    vld1.16     d4[1], [r4]!            @ y
+    vld1.16     d4[0], [r4]!            @ x
+    vld1.16     d4[3], [r4]!            @ flag
+    vld1.16     d4[2], [r4]!            @ z
+    vld1.16     d5[1], [r4]!            @ v
+    vld1.16     d5[0], [r4]!            @ u
+    add         r4, r4, #4              @ skip the last 4 bytes of the 16-byte vtx
+    vmovl.s16   q0, d4
+    vmovl.s16   q1, d5
+    vcvt.f32.s32 q2, q0                 @ q2 = vtx_raw
+    vcvt.f32.s32 q3, q1                 @ d6 = float u, v
+    vmov.32     d5[1], r12              @ q2 = { x, y, z, 1.0f } (flag lane replaced)
+    vmul.f32    q4, q2, q12
+    vmul.f32    q5, q2, q13
+    vmul.f32    q6, q2, q14
+    vmul.f32    q7, q2, q15
+    /* wrt u,v to g_fVtxTxtCoords */ vst1.32 {d6}, [r3]!
+    vpadd.f32   d7, d14, d15            @ pairwise adds reduce each product
+    vpadd.f32   d6, d12, d13            @ vector to a single dot-product sum
+    vpadd.f32   d5, d10, d11
+    vpadd.f32   d4, d8, d9
+    vpadd.f32   d1, d6, d7              @ g_vtxTransformed .z .w
+    vpadd.f32   d0, d4, d5              @ g_vtxTransformed .x .y
+
+#if 1
+    /* reciprocal estimate refined by two vrecps/vmul steps */
+    vrecpe.f32  d2, d1                  @ inv .z(unused) .w
+    vrecps.f32  d3, d1, d2              @ step
+    /* wrt g_vtxTransformed */ vst1.32 {q0}, [r0]!
+    vmul.f32    d2, d3, d2              @ better inv
+    vrecps.f32  d3, d1, d2              @ step
+    vmul.f32    d2, d3, d2              @ better inv
+ #if 0
+    vrecps.f32  d3, d1, d2              @ step
+    vmul.f32    d2, d3, d2              @ better inv
+ #endif
+#else
+    vmov.f32    s5, r12
+    /* wrt g_vtxTransformed */ vst1.32 {q0}, [r0]!
+    vdiv.f32    s5, s5, s3
+#endif
+
+    vmul.f32    q7, q0, d2[1]           @ q7 = g_vtxTransformed * (1/w)
+    vshr.u64    d2, #32                 @ d2 = { 1/w, 0 }
+    vsli.64     d15, d2, #32            @ d15 = { z/w, 1/w }
+    /* wrt g_vecProjected */ vst1.32 {q7}, [r1]!
+
+    subs        r11, r11, #1            @ dwNum is unsigned: count down to zero
+    bne         0b
+
+1:
+    vpop        {q4-q7}
+    pop         {r4-r11,pc}
+    .size       pv_neon, .-pv_neon
+
+
+@ vim:filetype=armasm