2 * (C) Gražvydas "notaz" Ignotas, 2011
4 * This work is licensed under the terms of GNU GPL version 2 or later.
5 * See the COPYING file in the top-level directory.
8 /* .equiv HAVE_ARMV7, 1 */
22 @ prepare work reg for ssatx
23 @ in: wr reg, bit to saturate to
24 .macro ssatx_prep wr bit
@ Non-ARMv7 path (see the HAVE_ARMV7 switch near the top of the file):
@ precompute the signed-saturation bound (1 << (bit-1)) into \wr so the
@ ssatx macro below can clamp without the ARMv7 ssat instruction.
@ NOTE(review): the .endm and any ARMv7 branch of this macro fall outside
@ this excerpt.
26 mov \wr, #(1<<(\bit-1))
@ ssatx: signed-saturate \rd to \bit bits, using the bound prepared in \wr
@ by ssatx_prep. Body not visible in this excerpt.
30 .macro ssatx rd wr bit
53 @ unsigned divide rd = rm / rs
@ Shift-and-subtract style division helper (ARM32 has no udiv before
@ ARMv7-M/ARMv7VE); only the divisor normalization step is visible here.
@ NOTE(review): \rd appears to hold the shift count at this point — the
@ surrounding (hidden) lines establish that; confirm against full source.
59 lsl \rs, \rs, \rd @ shift up divisor
71 @ calculate RTPS/RTPT MAC values
72 @ in: r0 context, r8,r9 VXYZ
@ Shared helper for the RTPS/RTPT entry points below.
@ Visible contract:
@   r1  walks the rotation-matrix halfwords (gteR11..gteR33, packed two
@       per word, consumed via ldmia! + smulbb/smultt/smultb/smulbt)
@   r8  = packed gteVX0|gteVY0, r9 = packed gteVZ0 (low halfword)
@   r10 += MAC1 sum >> 11, r11 += MAC2 sum >> 11  (1 bit of precision is
@       deliberately dropped via the asr #1 pre-shift to avoid overflow)
@   r12 accumulates gteMAC3 from a full 64-bit sum (r1:r3) for accuracy,
@       since MAC3/SZ3 later feeds the perspective divide
@ NOTE(review): the instructions at the gaps (e.g. between 82/84, 89/93,
@ 98/100) are not visible here; register liveness claims above are only
@ what the shown lines demonstrate.
77 add r2, r0, #4*(32+5) @ gteTRX
78 ldmia r1!,{r5-r7} @ gteR1*,gteR2*
80 smulbb r2, r5, r8 @ gteR11 * gteVX0
81 smultt r3, r5, r8 @ gteR12 * gteVY0
82 smulbb r4, r6, r9 @ gteR13 * gteVZ0
84 asr r4, r4, #1 @ prevent oflow, lose a bit
85 add r3, r4, r2, asr #1
86 add r10,r10,r3, asr #11 @ gteMAC1
87 smultb r2, r6, r8 @ gteR21 * gteVX0
88 smulbt r3, r7, r8 @ gteR22 * gteVY0
89 smultb r4, r7, r9 @ gteR23 * gteVZ0
90 ldmia r1!,{r5-r6} @ gteR3*
93 add r3, r4, r2, asr #1
94 add r11,r11,r3, asr #11 @ gteMAC2
95 @ be more accurate for gteMAC3, since it's also a divider
96 smulbb r2, r5, r8 @ gteR31 * gteVX0
97 smultt r3, r5, r8 @ gteR32 * gteVY0
98 smulbb r4, r6, r9 @ gteR33 * gteVZ0
100 asr r3, r4, #31 @ expand to 64bit
102 adc r3, r2, asr #31 @ 64bit sum in r3,r1
103 add r12,r12,r3, lsl #20
104 add r12,r12,r1, lsr #12 @ gteMAC3
@-----------------------------------------------------------------------
@ gteRTPS_nf_arm: RTPS — perspective-transform a single vertex (V0).
@ In:   r0 = CP2 register context (data regs at r0, control regs at
@       r0 + 4*32, as the #4*(32+n) offsets below show)
@ Does: MAC1..3 -> gteMAC1..3, saturate to IR1..3, push the SZ FIFO,
@       perspective-divide by SZ3 (quotient q in r9), produce SX2/SY2,
@       and gteMAC0/gteIR0 from gteDQA*q + gteDQB.
@ NOTE(review): "_nf" presumably = "no flags" variant (gteFLAG not fully
@ computed) — confirm against the C dispatcher. Many lines of the body
@ (the divide itself, limit handling) fall in gaps of this excerpt.
@-----------------------------------------------------------------------
108 .global gteRTPS_nf_arm @ r0=CP2 (d,c),
112 ldmia r0, {r8,r9} @ VXYZ(0)
114 add r1, r0, #4*25 @ gteMAC1
115 add r2, r0, #4*17 @ gteSZ1
116 stmia r1, {r10-r12} @ gteMAC123 save
118 add r1, r0, #4*16 @ gteSZ0
119 add r2, r0, #4*9 @ gteIR1
121 usat16_ lr, r12 @ limD
125 stmia r1, {r3-r5,lr} @ gteSZ* (FIFO shifted, new SZ3 in lr)
126 ldr r3, [r0,#4*(32+26)] @ gteH
127 stmia r2, {r10,r11,r12} @ gteIR123 save
128 cmp r3, lr, lsl #1 @ gteH < gteSZ3*2 ? (divide overflow check)
143 ldrd r6, [r0,#4*(32+24)] @ gteOFXY
145 add r1, r0, #4*12 @ gteSXY0
148 /* quotient */ subhs r9, #1
150 smlal r6, r2, r10, r9 @ gteOFX + gteIR1 * q (64-bit)
151 stmia r1!,{r3,r4} @ shift gteSXY
153 smlal r7, r3, r11, r9 @ gteOFY + gteIR2 * q (64-bit)
155 /* gteDQA, gteDQB */ ldrd r10,[r0, #4*(32+27)]
156 orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16
159 /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11
160 orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16
161 ssatx r6, r2, 11 @ gteSX2
162 ssatx r7, r2, 11 @ gteSY2
165 str r4, [r0,#4*24] @ gteMAC0
168 cmp r4, #0x1000 @ limH
170 str r4, [r0,#4*8] @ gteIR0
173 .size gteRTPS_nf_arm, .-gteRTPS_nf_arm
@-----------------------------------------------------------------------
@ gteRTPT_nf_arm: RTPT — perspective-transform three vertices (V0..V2).
@ In:   r0 = CP2 register context (same layout as gteRTPS_nf_arm above).
@ Same per-vertex math as RTPS, looped; lr appears to index the current
@ vertex (r1 = r0 + lr*2 to fetch VXYZ(v), and fSZ(v) stored at [r1,lr]).
@ DQ (depth cue) is computed once from the last quotient: gteMAC0 =
@ gteDQB + gteDQA * q, clamped at #0x1000 for gteIR0.
@ NOTE(review): loop setup/branch-back and limit/flag handling are in
@ gaps of this excerpt — do not assume their exact form.
@-----------------------------------------------------------------------
176 .global gteRTPT_nf_arm @ r0=CP2 (d,c),
178 ldr r1, [r0, #4*19] @ gteSZ3
180 str r1, [r0, #4*16] @ gteSZ0 (old SZ3 becomes SZ0 for the new run)
184 add r1, r0, lr, lsl #1
185 ldrd r8, [r1] @ VXYZ(v)
189 usat16_ r2, r12 @ limD
190 add r1, r0, #4*25 @ gteMAC1
191 ldr r3, [r0,#4*(32+26)] @ gteH
192 stmia r1, {r10-r12} @ gteMAC123 save
197 str r2, [r1, lr] @ fSZ(v)
198 cmp r3, r2, lsl #1 @ gteH < gteSZ3*2 ? (divide overflow check)
216 ldrd r6, [r0,#4*(32+24)] @ gteOFXY
217 /* quotient */ subhs r9, #1
219 smlal r6, r2, r10, r9 @ gteOFX + gteIR1 * q (64-bit)
221 smlal r7, r3, r11, r9 @ gteOFY + gteIR2 * q (64-bit)
223 orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16
226 orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16
227 ssatx r6, r2, 11 @ gteSX(v)
228 ssatx r7, r2, 11 @ gteSY(v)
235 ldrd r4, [r0, #4*(32+27)] @ gteDQA, gteDQB
236 add r1, r0, #4*9 @ gteIR1
237 mla r3, r4, r9, r5 @ gteDQB + gteDQA * q
238 stmia r1, {r10,r11,r12} @ gteIR123 save
240 str r3, [r0,#4*24] @ gteMAC0
243 cmp r3, #0x1000 @ limH
245 str r3, [r0,#4*8] @ gteIR0
248 .size gteRTPT_nf_arm, .-gteRTPT_nf_arm
@-----------------------------------------------------------------------
@ gteNCLIP_arm: NCLIP — winding/normal-clip test on the screen triangle.
@ In:   r0 = CP2 register context.
@ Computes the 2x signed area cross product of (SXY0, SXY1, SXY2) as a
@ 64-bit value via smull/smlal, saturates it to 32-bit maxint/minint
@ (setting the overflow bits in gteFLAG via lr when it does not fit),
@ and stores the flag word.
@ NOTE(review): the SXY loads into r3..r6/r1 and the saturation range
@ test preceding the movtgt/mvngt/movmi lines are in gaps of this
@ excerpt; register roles below are inferred only from visible lines.
@-----------------------------------------------------------------------
251 .global gteNCLIP_arm @ r0=CP2 (d,c),
260 sub r12, r4, r5 @ 3: gteSY0 - gteSY1
261 sub r5, r5, r6 @ 1: gteSY1 - gteSY2
263 smull r1, r5, r1, r5 @ RdLo, RdHi
264 sub r6, r4 @ 2: gteSY2 - gteSY0
269 smlal r1, r5, r3, r12
276 movtgt lr, #((1<<31)|(1<<16))>>16 @ FLAG: MAC0 overflow (positive)
281 mvngt r1, #1<<31 @ maxint
283 movmi r1, #1<<31 @ minint
286 str lr, [r0, #4*(32+31)] @ gteFLAG
289 .size gteNCLIP_arm, .-gteNCLIP_arm
292 @ vim:filetype=armasm