+@ MVMVA core: MAC = (CV << 12) + MX * V, optionally >> 12, per row.
+@ note: non-standard calling convention used
+@ in:  r0 = CP2 (d,c) (must preserve)
+@      r1 = needs_shift12 (bit 0 set -> products shifted right by 12)
+@      r4,r5 = VXYZ(v) packed (three s16 halfwords in two words)
+@      r6 = &MX11(mx)  (3x3 s16 matrix, halfword-packed)
+@      r7 = &CV1(cv)   (3-word translation vector)
+@ out: gteMAC1..gteMAC3 stored at CP2 data regs 25-27, gteFLAG at reg 63
+@ clobbers: r1-r3, r7 (r8-r11 are saved/restored here)
+.macro mvma_op do_flags
+ push {r8-r11}
+
+.if \do_flags
+ ands r3, r1, #1 @ gteFLAG, shift_need
+.else
+ tst r1, #1
+.endif
+ ldmia r7, {r7-r9} @ CV123
+ ldmia r6!,{r10-r12} @ MX1*,MX2*
+ asr r1, r7, #20 @ r1:r7 = CV1 << 12 as s64
+ lsl r7, #12 @ expand to 64bit
+ smlalbb r7, r1, r10, r4 @ MX11 * vx
+ smlaltt r7, r1, r10, r4 @ MX12 * vy
+ smlalbb r7, r1, r11, r5 @ MX13 * vz
+ lsrne r7, #12 @ if shift_need: drop 12 fraction bits...
+ orrne r7, r1, lsl #20 @ ...repack 64->32 bits; gteMAC1
+.if \do_flags
+ asrne r1, #20 @ restore s64 high word for the overflow test
+ adds r2, r7, #0x80000000 @ gt/mi set iff r1:MAC does not fit in s32
+ adcs r1, #0
+ orrgt r3, #(1<<30) @ MAC1 positive overflow
+ orrmi r3, #(1<<31)|(1<<27) @ error bit + MAC1 negative overflow
+ tst r3, #1 @ repeat shift test (flags clobbered above)
+.endif
+ asr r1, r8, #20 @ r1:r8 = CV2 << 12 as s64
+ lsl r8, #12 @ expand to 64bit
+ smlaltb r8, r1, r11, r4 @ MX21 * vx
+ smlalbt r8, r1, r12, r4 @ MX22 * vy
+ smlaltb r8, r1, r12, r5 @ MX23 * vz
+ lsrne r8, #12
+ orrne r8, r1, lsl #20 @ gteMAC2
+.if \do_flags
+ asrne r1, #20 @ restore s64 high word for the overflow test
+ adds r2, r8, #0x80000000
+ adcs r1, #0
+ orrgt r3, #(1<<29) @ MAC2 positive overflow
+ orrmi r3, #(1<<31)|(1<<26) @ error bit + MAC2 negative overflow
+ tst r3, #1 @ repeat shift test
+.endif
+ ldmia r6!,{r10-r11} @ MX3*
+ asr r1, r9, #20 @ r1:r9 = CV3 << 12 as s64
+ lsl r9, #12 @ expand to 64bit
+ smlalbb r9, r1, r10, r4 @ MX31 * vx
+ smlaltt r9, r1, r10, r4 @ MX32 * vy
+ smlalbb r9, r1, r11, r5 @ MX33 * vz
+ lsrne r9, #12
+ orrne r9, r1, lsl #20 @ gteMAC3
+.if \do_flags
+ asrne r1, #20 @ restore s64 high word for the overflow test
+ adds r2, r9, #0x80000000
+ adcs r1, #0
+ orrgt r3, #(1<<28) @ MAC3 positive overflow
+ orrmi r3, #(1<<31)|(1<<25) @ error bit + MAC3 negative overflow
+ bic r3, #1 @ clear scratch shift_need bit before storing
+.else
+ mov r3, #0 @ no flag tracking in this variant
+.endif
+ str r3, [r0, #4*(32+31)] @ gteFLAG
+ add r1, r0, #4*25 @ &gteMAC1
+ stmia r1, {r7-r9} @ store gteMAC1..gteMAC3
+
+ pop {r8-r11}
+ bx lr
+.endm
+
+.global gteMVMVA_part_arm
+@ MVMVA core with gteFLAG overflow tracking (mvma_op, do_flags=1)
+gteMVMVA_part_arm:
+ mvma_op 1
+ .size gteMVMVA_part_arm, .-gteMVMVA_part_arm
+
+.global gteMVMVA_part_nf_arm
+@ "no flags" MVMVA core: same math, gteFLAG simply written as 0
+gteMVMVA_part_nf_arm:
+ mvma_op 0
+ .size gteMVMVA_part_nf_arm, .-gteMVMVA_part_nf_arm
+
+@ common version of MVMVA with cv3 (== 0) and shift12,
+@ can't overflow so no gteMAC flags needed
+@ note: non-standard calling convention used
+@ r0 = CP2 (d,c) (must preserve)
+@ r4,r5 = VXYZ(v) packed (three s16 halfwords in two words)
+@ r6 = &MX11(mx) (3x3 s16 matrix, halfword-packed)
+@ out: gteMAC1..gteMAC3 stored at CP2 data regs 25-27, gteFLAG cleared
+@ clobbers: r1-r3, r6, r7 (r8,r9 saved/restored)
+@ trick: each row sum is halved twice (asr #1) then asr #11 -> net >>12
+.global gteMVMVA_part_cv3sh12_arm
+gteMVMVA_part_cv3sh12_arm:
+ push {r8-r9}
+ ldmia r6!,{r7-r9} @ MX1*,MX2*
+ smulbb r1, r7, r4 @ MX11 * vx
+ smultt r2, r7, r4 @ MX12 * vy
+ smulbb r3, r8, r5 @ MX13 * vz
+ qadd r1, r1, r2 @ saturating add guards against s32 wrap
+ asr r3, #1 @ prevent oflow, lose a bit
+ add r1, r3, r1, asr #1 @ (sum of products) >> 1
+ asr r7, r1, #11 @ >>11 more = >>12 total; gteMAC1
+ smultb r1, r8, r4 @ MX21 * vx
+ smulbt r2, r9, r4 @ MX22 * vy
+ smultb r3, r9, r5 @ MX23 * vz
+ qadd r1, r1, r2
+ asr r3, #1
+ add r1, r3, r1, asr #1
+ asr r8, r1, #11 @ gteMAC2
+ ldmia r6, {r6,r9} @ MX3*
+ smulbb r1, r6, r4 @ MX31 * vx
+ smultt r2, r6, r4 @ MX32 * vy
+ smulbb r3, r9, r5 @ MX33 * vz
+ qadd r1, r1, r2
+ asr r3, #1
+ add r1, r3, r1, asr #1
+ asr r9, r1, #11 @ gteMAC3
+ add r1, r0, #4*25 @ &gteMAC1
+ mov r2, #0
+ stmia r1, {r7-r9} @ store gteMAC1..gteMAC3
+ str r2, [r0, #4*(32+31)] @ gteFLAG = 0 (no overflow possible)
+ pop {r8-r9}
+ bx lr
+ .size gteMVMVA_part_cv3sh12_arm, .-gteMVMVA_part_cv3sh12_arm
+
+