+@ note: a non-standard calling convention is used; inputs are passed in the registers listed below
+@ r0 = CP2 (d,c) (must be preserved)
+@ r1 = op
+@ r4,r5 = VXYZ(v) packed
+@ r6 = &MX11(mx)
+@ r7 = &CV1(cv)
+.global gteMVMVA_part_neon
+gteMVMVA_part_neon:
+ uxth r5, r5
+ vmov.32 d8[0], r4
+ vmov.32 d8[1], r5 @ VXYZ(v)
+ vldmia r6, {d0-d2} @ MXxy/gteR* [16*9]
+ vldmia r7, {d4-d5} @ CVx/gteTR*
+
+ vmov.i32 q15, #0
+ vext.16 d2, d1, d2, #2 @ xxx3 -> x321
+ vext.16 d1, d0, d1, #3 @ xx32 -> x321
+ vshll.s32 q3, d5, #12 @ gteTRZ/CV3
+ vshll.s32 q2, d4, #12 @ gteTR|XY/CV12
+
+ vmull.s16 q8, d0, d8
+ vmull.s16 q9, d1, d8
+ vmull.s16 q10, d2, d8
+ vpadd.s32 d16, d16, d17
+ vpadd.s32 d17, d18, d19
+ vpadd.s32 d18, d20, d21
+ vpadal.s32 q2, q8
+ vpadal.s32 q3, q9
+ tst r1, #1<<19
+ beq 0f
+ vshr.s64 q2, q2, #12
+ vshr.s64 q3, q3, #12
+0:
+ vqmovn.s64 d8, q2 @ gteMAC|12
+ vqmovn.s64 d9, q3 @ gteMAC3
+
+ tst r1, #1<<10
+ add r3, r0, #4*25
+ vqmovn.s32 d10, q4 @ gteIR|123
+ vst1.32 d8, [r3]!
+ vst1.32 d9[0], [r3] @ wb gteMAC|123