/*
 * (C) Gražvydas "notaz" Ignotas, 2011
*
- * This work is licensed under the terms of any of these licenses
- * (at your option):
- * - GNU GPL, version 2 or later.
- * - GNU LGPL, version 2.1 or later.
+ * This work is licensed under the terms of GNU GPL version 2 or later.
* See the COPYING file in the top-level directory.
*/
.text
.align 2
+@ XXX: gteMAC calc shouldn't be saturating, but it is here
+
@ approximate gteMAC|123 flags
@ in: rr 123 as gteMAC|123
@ trash: nothing
vmovl.s32 q3, d6 @ || gteDQ|AB [64]
vrecps.f32 q12, q10, q11 @ step
vcvt.f32.u32 d13, d13 @ | gteH (float for div)
+ vmov.f32 q8, #0.5 @ |||
vmul.f32 q11, q12, q11 @ better inv
add r3, r0, #4*16
vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
vdup.32 q13, d13[0] @ |
-@ vrecps.f32 q12, q10, q11 @ step
-@ vmul.f32 q11, q12, q11 @ better inv
+@ vrecps.f32 q12, q10, q11 @ step
+@ vmul.f32 q11, q12, q11 @ better inv
vmul.f32 q10, q13, q11 @ result
.else
+ vmov.f32 q8, #0.5 @ |||
vmovl.s32 q2, d4 @ || gteOF|XY [64]
vmovl.s32 q3, d6 @ || gteDQ|AB [64]
vcvt.f32.u32 d13, d13 @ | gteH (float for div)
orrne lr, #(1<<31)
orrne lr, #(1<<18) @ fSZ (limD)
-@ vadd.f32 q10, q @ adjust for vcvt rounding mode
+ vadd.f32 q10, q8 @ adjust for vcvt rounding mode
vcvt.u32.f32 q8, q10
vmovl.s16 q9, d1 @ expand gteIR|12 v=0
vmovl.s16 q10, d3 @ expand gteIR|12 v=1
-.global gteMVMVA_neon @ r0=CP2 (d,c), op
-gteMVMVA_neon:
- push {r4-r5,lr}
-
- add r12, r0, #4*32
-
- ubfx r2, r1, #15, #2 @ v
-
- vmov.i32 q0, #0 @ d0,d1
- vmov.i32 q1, #0 @ d2,d3
- vmov.i32 q2, #0 @ d4,d5
- cmp r2, #3
- addeq r4, r0, #4*9
- addne r3, r0, r2, lsl #3
- ldmeqia r4, {r3-r5}
- ldmneia r3, {r4,r5}
- pkhbteq r4, r3, r4, lsl #16
+@ note: non-std calling convention used
+@ r0 = CP2 (d,c) (must preserve)
+@ r1 = op
+@ r4,r5 = VXYZ(v) packed
+@ r6 = &MX11(mx)
+@ r7 = &CV1(cv)
+.global gteMVMVA_part_neon
+gteMVMVA_part_neon:
uxth r5, r5
vmov.32 d8[0], r4
vmov.32 d8[1], r5 @ VXYZ(v)
- ubfx r3, r1, #17, #2 @ mx
- ubfx r2, r1, #13, #2 @ cv
- cmp r3, #3
- beq 0f @ very rare case
- add r3, r12, r3, lsl #5
- vldmia r3, {d0-d2} @ MXxy/gteR* [16*9]
-0:
- cmp r2, #3
- add r3, r12, r2, lsl #5
- beq 0f
- add r3, #4*5
- vldmia r3, {d4-d5} @ CVx/gteTR*
+ vldmia r6, {d0-d2} @ MXxy/gteR* [16*9]
+ vldmia r7, {d4-d5} @ CVx/gteTR*
-0:
vmov.i32 q15, #0
vext.16 d2, d1, d2, #2 @ xxx3 -> x321
vext.16 d1, d0, d1, #3 @ xx32 -> x321
add r3, r0, #4*9
vst1.32 d18, [r3]!
vst1.32 d19[0], [r3]
+ bx lr
+ .size gteMVMVA_part_neon, .-gteMVMVA_part_neon
- tst r1, #1<<10 @ lm
- mov r2, #0
+
+@ get flags after gteMVMVA_part_neon operation
+.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm
+gteMACtoIR_flags_neon:
+ push {r4,r5,lr}
+ tst r1, r1 @ lm
mov lr, #0 @ gteFLAG
+ mov r2, #0
mov r12, #15
moveq r2, #0x8000 @ adj
moveq r12, #16 @ shift
orrne lr, #(1<<22) @ IR3/limB3
str lr, [r0, #4*(32+31)] @ gteFLAG
- pop {r4-r5,pc}
- .size gteMVMVA_neon, .-gteMVMVA_neon
-
-
-
-@ the name is misnormer, this doesn't use NEON but oh well..
-.global gteNCLIP_neon @ r0=CP2 (d,c),
-gteNCLIP_neon:
- push {r4-r6,lr}
-
- add r1, r0, #4*12
- ldmia r1, {r1-r3}
- mov r4, r1, asr #16
- mov r5, r2, asr #16
- mov r6, r3, asr #16
- sub r12, r4, r5 @ 3: gteSY0 - gteSY1
- sub r5, r5, r6 @ 1: gteSY1 - gteSY2
- sxth r1, r1
- smull r1, r5, r1, r5 @ RdLo, RdHi
- sub r6, r4 @ 2: gteSY2 - gteSY0
- sxth r2, r2
- smlal r1, r5, r2, r6
- mov lr, #0 @ gteFLAG
- sxth r3, r3
- smlal r1, r5, r3, r12
- mov r6, #1<<31
- orr r6, #1<<15
- movs r2, r1, lsl #1
- adc r5, r5
- cmp r5, #0
- movtgt lr, #((1<<31)|(1<<16))>>16
- mvngt r1, #1<<31 @ maxint
- cmn r5, #1
- movmi r1, #1<<31 @ minint
- orrmi lr, r6
- str r1, [r0, #4*24]
- str lr, [r0, #4*(32+31)] @ gteFLAG
+ pop {r4,r5,pc}
+ .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon
- pop {r4-r6,pc}
- .size gteNCLIP_neon, .-gteNCLIP_neon
@ vim:filetype=armasm