2 * (C) Gražvydas "notaz" Ignotas, 2011
4 * This work is licensed under the terms of GNU GPL version 2 or later.
5 * See the COPYING file in the top-level directory.
8 /* .equiv HAVE_ARMV7, 1 */
22 @ prepare work reg for ssatx
23 @ in: wr reg, bit to saturate to
@ Loads \wr with 2^(bit-1), the positive signed-saturation bound.
@ NOTE(review): presumably this is the non-ARMv7 fallback path (ARMv7
@ has a hardware ssat and would not need a prepared bound reg) — the
@ HAVE_ARMV7 guard is hinted at the top of the file; confirm in full source.
@ (.endm and any alternative path are not visible in this excerpt.)
24 .macro ssatx_prep wr bit
26 mov \wr, #(1<<(\bit-1))
@ Saturate \rd to the signed range -2^(bit-1) .. 2^(bit-1)-1, using the
@ bound prepared in \wr by ssatx_prep (macro body not visible here).
30 .macro ssatx rd wr bit
41 @ prepare work reg for ssatx0 (sat to 0..2^(bit-1))
42 @ in: wr reg, bit to saturate to
@ Same bound as ssatx_prep; ssatx0 clamps the low side to 0 instead of
@ the negative bound (used for unsigned-style GTE limiters).
43 .macro ssatx0_prep wr bit
44 mov \wr, #(1<<(\bit-1))
@ Saturate \rd to 0 .. 2^(bit-1), using \wr from ssatx0_prep
@ (macro body not visible in this excerpt).
47 .macro ssatx0 rd wr bit
66 @ unsigned divide rd = rm / rs
@ NOTE(review): looks like a shift-and-subtract software divide (ARMv5
@ has no integer divide instruction); only the divisor-normalization
@ step is visible below — the subtract/shift loop is not in this excerpt.
72 lsl \rs, \rs, \rd @ shift up divisor
84 @ calculate RTPS/RTPT MAC values
85 @ in: r0 context, r8,r9 VXYZ
@ out (inferred from stores below): r10,r11 = gteMAC1,gteMAC2 (approx,
@ computed with a 1-bit precision loss to avoid 32-bit overflow);
@ r12:r1 carry a more exact gteMAC3 since MAC3/SZ3 feeds the divider.
@ r8 packs VX|VY and r9 packs VZ as 16-bit halves (smulbb/smultt usage).
90 add r2, r0, #4*(32+5) @ gteTRX
91 ldmia r1!,{r5-r7} @ gteR1*,gteR2*
93 smulbb r2, r5, r8 @ gteR11 * gteVX0
94 smultt r3, r5, r8 @ gteR12 * gteVY0
95 smulbb r4, r6, r9 @ gteR13 * gteVZ0
@ halve both partial products before summing so the sum of three
@ 32-bit products cannot overflow; net shift below is asr #12 total
97 asr r4, r4, #1 @ prevent oflow, lose a bit
98 add r3, r4, r2, asr #1
99 add r10,r10,r3, asr #11 @ gteMAC1
100 smultb r2, r6, r8 @ gteR21 * gteVX0
101 smulbt r3, r7, r8 @ gteR22 * gteVY0
102 smultb r4, r7, r9 @ gteR23 * gteVZ0
103 ldmia r1!,{r5-r6} @ gteR3*
106 add r3, r4, r2, asr #1
107 add r11,r11,r3, asr #11 @ gteMAC2
108 @ be more accurate for gteMAC3, since it's also a divider
@ (MAC3 becomes SZ3, the perspective-divide denominator, so the 1-bit
@ shortcut above is not acceptable here: widen to 64 bits instead)
109 smulbb r2, r5, r8 @ gteR31 * gteVX0
110 smultt r3, r5, r8 @ gteR32 * gteVY0
111 smulbb r4, r6, r9 @ gteR33 * gteVZ0
113 asr r3, r4, #31 @ expand to 64bit
115 adc r3, r2, asr #31 @ 64bit sum in r3,r1
@ recombine the 64-bit sum shifted right by 12: hi<<20 | lo>>12
116 add r12,r12,r3, lsl #20
117 add r12,r12,r1, lsr #12 @ gteMAC3
@ gteRTPS (Rotate/Translate/Perspective, Single vertex), no-flags variant:
@ transforms VXYZ0 by the rotation matrix + translation, saturates to
@ IR1-3/SZ3, performs the perspective divide and produces SXY2/IR0.
@ "nf" = does not compute the full gteFLAG overflow bits.
@ NOTE(review): this excerpt is sampled — the function label, the MAC
@ computation (likely the rtpx_mac macro above) and the divide sequence
@ between the visible lines are missing; comments below describe only
@ the visible instructions.
121 .global gteRTPS_nf_arm @ r0=CP2 (d,c),
125 ldmia r0, {r8,r9} @ VXYZ(0)
127 add r1, r0, #4*25 @ gteMAC1
128 add r2, r0, #4*17 @ gteSZ1
129 stmia r1, {r10-r12} @ gteMAC123 save
131 add r1, r0, #4*16 @ gteSZ0
132 add r2, r0, #4*9 @ gteIR1
@ usat16_ is presumably a local macro wrapping usat/fallback — clamp
@ MAC3 to 0..0xffff to form SZ3 (the divide denominator)
134 usat16_ lr, r12 @ limD
@ shift the SZ FIFO: old SZ1-3 (r3-r5) move down, new SZ3 = lr
138 stmia r1, {r3-r5,lr} @ gteSZ*
139 ldr r3, [r0,#4*(32+26)] @ gteH
140 stmia r2, {r10,r11,r12} @ gteIR123 save
@ overflow check for the hardware divide: quotient fits iff H < SZ3*2
141 cmp r3, lr, lsl #1 @ gteH < gteSZ3*2 ?
156 ldrd r6, [r0,#4*(32+24)] @ gteOFXY
158 add r1, r0, #4*12 @ gteSXY0
161 /* quotient */ subhs r9, #1
@ 64-bit accumulate: OFX (r6:r2) += IR1 * q
163 smlal r6, r2, r10, r9
@ shift the SXY FIFO down while r1 walks toward SXY2
164 stmia r1!,{r3,r4} @ shift gteSXY
166 smlal r7, r3, r11, r9
168 /* gteDQA, gteDQB */ ldrd r10,[r0, #4*(32+27)]
@ combine hi:lo halves of the 64-bit result, net effect >> 16
169 orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16
172 /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11
173 orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16
@ screen coords are saturated to signed 11 bits (-0x400..0x3ff is the
@ GTE range; ssatx uses bit=11 semantics as defined above)
174 ssatx r6, r2, 11 @ gteSX2
175 ssatx r7, r2, 11 @ gteSY2
178 str r4, [r0,#4*24] @ gteMAC0
@ IR0 = MAC0 clamped to 0..0x1000 (depth-cue interpolation factor)
181 cmp r4, #0x1000 @ limH
183 str r4, [r0,#4*8] @ gteIR0
186 .size gteRTPS_nf_arm, .-gteRTPS_nf_arm
@ gteRTPT (Rotate/Translate/Perspective, Triple): same transform as RTPS
@ applied to vertices V0..V2; only the last vertex produces IR0/MAC0.
@ "nf" = no-flags variant. This excerpt is sampled: the loop label,
@ per-vertex setup and the divide are between the visible lines.
189 .global gteRTPT_nf_arm @ r0=CP2 (d,c),
@ rotate the SZ FIFO before the loop: old SZ3 becomes SZ0
191 ldr r1, [r0, #4*19] @ gteSZ3
193 str r1, [r0, #4*16] @ gteSZ0
@ lr presumably holds the vertex index scaled for addressing (each
@ VXYZ is 2 words, hence lsl #1) — confirm against full source
197 add r1, r0, lr, lsl #1
198 ldrd r8, [r1] @ VXYZ(v)
@ clamp MAC3 to 0..0xffff to form this vertex's SZ value
202 usat16_ r2, r12 @ limD
203 add r1, r0, #4*25 @ gteMAC1
204 ldr r3, [r0,#4*(32+26)] @ gteH
205 stmia r1, {r10-r12} @ gteMAC123 save
210 str r2, [r1, lr] @ fSZ(v)
@ divide-overflow test, same as RTPS: H < SZ*2 means quotient fits
211 cmp r3, r2, lsl #1 @ gteH < gteSZ3*2 ?
228 ldrd r6, [r0,#4*(32+24)] @ gteOFXY
229 /* quotient */ subhs r9, #1
@ 64-bit accumulate OFX += IR1*q, OFY += IR2*q (hi:lo in r2/r3)
231 smlal r6, r2, r10, r9
233 smlal r7, r3, r11, r9
235 orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16
238 orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16
@ saturate screen coords to the GTE's signed 11-bit range
239 ssatx r6, r2, 11 @ gteSX(v)
240 ssatx r7, r2, 11 @ gteSY(v)
@ after the last vertex: depth cueing from the final quotient
247 ldrd r4, [r0, #4*(32+27)] @ gteDQA, gteDQB
248 add r1, r0, #4*9 @ gteIR1
249 mla r3, r4, r9, r5 @ gteDQB + gteDQA * q
250 stmia r1, {r10,r11,r12} @ gteIR123 save
252 str r3, [r0,#4*24] @ gteMAC0
@ IR0 = MAC0 clamped to 0..0x1000
255 cmp r3, #0x1000 @ limH
257 str r3, [r0,#4*8] @ gteIR0
260 .size gteRTPT_nf_arm, .-gteRTPT_nf_arm
263 @ note: not std calling convention used
264 @ r0 = CP2 (d,c) (must preserve)
266 @ r4,r5 = VXYZ(v) packed
@ MVMVA core: MAC_i = (CV_i<<12 + sum(MX_ij * v_j)) >> shift, computed
@ as 64-bit sums (lo in r7/r8/r9, hi in r1). \do_flags selects whether
@ per-row overflow bits are accumulated into gteFLAG (r3).
@ NOTE(review): excerpt is sampled — conditional-shift branches and the
@ \do_flags .if guards between the visible lines are missing.
269 .macro mvma_op do_flags
@ bit0 of r1 doubles as the "shift by 12" selector tested repeatedly below
273 ands r3, r1, #1 @ gteFLAG, shift_need
277 ldmia r7, {r7-r9} @ CV123
278 ldmia r6!,{r10-r12} @ MX1*,MX2*
@ --- row 1: CV1<<12 + MX11*vx + MX12*vy + MX13*vz ---
280 lsl r7, #12 @ expand to 64bit
281 smlalbb r7, r1, r10, r4 @ MX11 * vx
282 smlaltt r7, r1, r10, r4 @ MX12 * vy
283 smlalbb r7, r1, r11, r5 @ MX13 * vz
285 orrne r7, r1, lsl #20 @ gteMAC0
@ overflow detection: bias by 0x80000000 so a 32-bit-representable
@ result leaves specific carry/sign patterns to test
288 adds r2, r7, #0x80000000
291 orrmi r3, #(1<<31)|(1<<27)
292 tst r3, #1 @ repeat shift test
@ --- row 2 ---
295 lsl r8, #12 @ expand to 64bit
296 smlaltb r8, r1, r11, r4 @ MX21 * vx
297 smlalbt r8, r1, r12, r4 @ MX22 * vy
298 smlaltb r8, r1, r12, r5 @ MX23 * vz
300 orrne r8, r1, lsl #20 @ gteMAC1
303 adds r2, r8, #0x80000000
306 orrmi r3, #(1<<31)|(1<<26)
307 tst r3, #1 @ repeat shift test
@ --- row 3 ---
309 ldmia r6!,{r10-r11} @ MX3*
311 lsl r9, #12 @ expand to 64bit
312 smlalbb r9, r1, r10, r4 @ MX31 * vx
313 smlaltt r9, r1, r10, r4 @ MX32 * vy
314 smlalbb r9, r1, r11, r5 @ MX33 * vz
316 orrne r9, r1, lsl #20 @ gteMAC2
319 adds r2, r9, #0x80000000
322 orrmi r3, #(1<<31)|(1<<25)
327 str r3, [r0, #4*(32+31)] @ gteFLAG
@ Flag-computing MVMVA entry point — presumably instantiates
@ "mvma_op 1" between these lines (body not visible in this excerpt).
335 .global gteMVMVA_part_arm
338 .size gteMVMVA_part_arm, .-gteMVMVA_part_arm
@ No-flags MVMVA entry point — presumably instantiates "mvma_op 0"
@ (body not visible in this excerpt).
340 .global gteMVMVA_part_nf_arm
341 gteMVMVA_part_nf_arm:
343 .size gteMVMVA_part_nf_arm, .-gteMVMVA_part_nf_arm
345 @ common version of MVMVA with cv3 (== 0) and shift12,
346 @ can't overflow so no gteMAC flags needed
347 @ note: not std calling convention used
348 @ r0 = CP2 (d,c) (must preserve)
349 @ r4,r5 = VXYZ(v) packed
@ Fast path: zero translation vector, fixed >>12, so each row is just
@ three 16x16 multiplies summed with the same halve-then-sum overflow
@ trick used in the RTPS MAC macro (1 bit of precision traded away).
@ NOTE(review): excerpt is sampled — the per-row >>11 stores into
@ MAC1-3 between the visible lines are missing.
351 .global gteMVMVA_part_cv3sh12_arm
352 gteMVMVA_part_cv3sh12_arm:
354 ldmia r6!,{r7-r9} @ MX1*,MX2*
@ --- row 1 ---
355 smulbb r1, r7, r4 @ MX11 * vx
356 smultt r2, r7, r4 @ MX12 * vy
357 smulbb r3, r8, r5 @ MX13 * vz
359 asr r3, #1 @ prevent oflow, lose a bit
360 add r1, r3, r1, asr #1
@ --- row 2 ---
362 smultb r1, r8, r4 @ MX21 * vx
363 smulbt r2, r9, r4 @ MX22 * vy
364 smultb r3, r9, r5 @ MX23 * vz
367 add r1, r3, r1, asr #1
@ --- row 3 ---
369 ldmia r6, {r6,r9} @ MX3*
370 smulbb r1, r6, r4 @ MX31 * vx
371 smultt r2, r6, r4 @ MX32 * vy
372 smulbb r3, r9, r5 @ MX33 * vz
375 add r1, r3, r1, asr #1
@ no overflow is possible on this path, so FLAG is simply cleared/stored
380 str r2, [r0, #4*(32+31)] @ gteFLAG
383 .size gteMVMVA_part_cv3sh12_arm, .-gteMVMVA_part_cv3sh12_arm
@ gteNCLIP: MAC0 = SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1),
@ the signed winding/area test of the screen triangle, computed in
@ 64 bits with overflow reported in gteFLAG and the stored value
@ saturated to +/-maxint. Excerpt is sampled: the SXY loads and the
@ first smlal term are between the visible lines.
386 .global gteNCLIP_arm @ r0=CP2 (d,c),
395 sub r12, r4, r5 @ 3: gteSY0 - gteSY1
396 sub r5, r5, r6 @ 1: gteSY1 - gteSY2
398 smull r1, r5, r1, r5 @ RdLo, RdHi
399 sub r6, r4 @ 2: gteSY2 - gteSY0
@ accumulate the third product into the 64-bit total r1:r5
404 smlal r1, r5, r3, r12
@ positive overflow: set FLAG bit31 | bit16 (constant built via movt)
411 movtgt lr, #((1<<31)|(1<<16))>>16
416 mvngt r1, #1<<31 @ maxint
418 movmi r1, #1<<31 @ minint
421 str lr, [r0, #4*(32+31)] @ gteFLAG
424 .size gteNCLIP_arm, .-gteNCLIP_arm
@ MAC1-3 -> IR1-3 saturation (shared body for the gteMACtoIR_* entry
@ points below): clamps each MAC to the IR range and records
@ saturation in gteFLAG. NOTE(review): heavily sampled — the clamp
@ comparisons and the lm (lower-bound 0 vs -0x8000) selection between
@ the visible lines are missing; comments describe visible lines only.
428 ldr r2, [r0, #4*25] @ gteMAC1
430 ldr r12,[r0, #4*(32+31)] @ gteFLAG
@ IR1 saturated: FLAG bit31 (checks) + bit24 (IR1 limiter)
433 orrge r12, #(1<<31)|(1<<24)
442 ldrd r2, [r0, #4*26] @ gteMAC23
443 orrlt r12, #(1<<31)|(1<<24)
468 strd r2, [r0, #4*10] @ gteIR23
469 str r12,[r0, #4*(32+31)] @ gteFLAG
@ Four variants of the MAC->IR limiter: lm0/lm1 select the lower clamp
@ bound (presumably lm0 = -0x8000, lm1 = 0, per GTE lm flag semantics —
@ confirm in full source); _nf variants skip gteFLAG updates.
@ Bodies are between these lines and not visible in this excerpt.
473 .global gteMACtoIR_lm0 @ r0=CP2 (d,c)
476 .size gteMACtoIR_lm0, .-gteMACtoIR_lm0
478 .global gteMACtoIR_lm1 @ r0=CP2 (d,c)
481 .size gteMACtoIR_lm1, .-gteMACtoIR_lm1
484 .global gteMACtoIR_lm0_nf @ r0=CP2 (d,c)
495 .size gteMACtoIR_lm0_nf, .-gteMACtoIR_lm0_nf
498 .global gteMACtoIR_lm1_nf @ r0=CP2 (d,c)
509 .size gteMACtoIR_lm1_nf, .-gteMACtoIR_lm1_nf
@ MVMVA dispatcher: decodes the instruction fields from r1 (v = vector
@ select, mx = matrix select, cv = translation select), builds pointers
@ to the chosen operands, then tail-calls the appropriate part
@ implementation. Excerpt is sampled: the v==3 (IR123) packing path and
@ the sf/lm decoding between the visible lines are missing.
513 .global gteMVMVA_test
@ bits 15-16: source vector index (3 = use IR1-3 instead of V0-2)
517 and r2, r1, #0x18000 @ v
518 cmp r2, #0x18000 @ v == 3?
520 addne r3, r0, r2, lsr #12
@ pack X|Y into r4 halfwords to match the smul[bt] operand layout
525 orreq r4, r3, r4, lsl #16 @ r4,r5 = VXYZ(v)
@ bits 17-18: matrix select (rotation/light/color)
528 and r3, r1, #0x60000 @ mx
530 add r6, r12, r3, lsl #5
@ bits 13-14: translation vector select (TR/BK/FC/none)
533 and r2, r1, #0x06000 @ cv
535 add r7, r12, r2, lsl #5
@ fast path when cv==3 (zero vector) and shift==12
543 bne gteMVMVA_part_cv3sh12_arm
@ NEON fallbacks (presumably behind a HAVE_NEON guard — confirm)
556 bl gteMVMVA_part_neon
559 bl gteMACtoIR_flags_neon
568 @ vim:filetype=armasm