2 * (C) Gražvydas "notaz" Ignotas, 2011
4 * This work is licensed under the terms of GNU GPL version 2 or later.
5 * See the COPYING file in the top-level directory.
8 #include "arm_features.h"
22 @ prepare work reg for ssatx
23 @ in: wr reg, bit to saturate to
@ loads \wr with 2^(bit-1), one past the largest positive \bit-bit value
24 .macro ssatx_prep wr bit
26 mov \wr, #(1<<(\bit-1))
@ saturate \rd to the signed \bit-bit range, using \wr prepared above
@ NOTE(review): presumably a pre-ARMv6 substitute for the SSAT insn -- confirm
30 .macro ssatx rd wr bit
41 @ prepare work reg for ssatx0 (sat to 0..2^(bit-1))
42 @ in: wr reg, bit to saturate to
@ same preparation value as ssatx_prep: \wr = 2^(bit-1)
43 .macro ssatx0_prep wr bit
44 mov \wr, #(1<<(\bit-1))
@ clamp \rd to 0..2^(bit-1) (lower bound 0, per the header comment above)
47 .macro ssatx0 rd wr bit
@ normalization step of a software divide: shift divisor so its MSB is fixed
@ NOTE(review): belongs to a divide macro whose remaining lines are not shown
71 lsl \rs, \rs, \rd @ shift up divisor
@ one Newton-Raphson refinement of the fixed-point reciprocal \rcp of \den:
@ effectively rcp += rcp * (0 - 4*hi(den*rcp)) in the high half, i.e.
@ rcp = rcp*(2 - den*rcp) scaled -- \zero must hold 0, \t1/\t2 are scratch
82 .macro newton_step rcp den zero t1 t2
83 umull \t2, \t1, \den, \rcp @ \t2 is dummy
84 sub \t1, \zero, \t1, lsl #2
85 smlal \t2, \rcp, \t1, \rcp
@ unsigned divide by Newton-Raphson reciprocal iteration:
@ normalize \rs, start from a constant estimate, refine 4x, then
@ multiply the reciprocal by the dividend and undo the normalization.
@ \t1 holds the normalization shift count (stated below to be 1..15)
88 .macro udiv_newton rd rm rs t1 t2 t3 t4
92 lsl \rs, \t1 @ normalize for the algo
93 mov \rm, #0x4d000000 @ initial estimate ~1.2
95 newton_step \rm, \rs, \t2, \t3, \t4
96 newton_step \rm, \rs, \t2, \t3, \t4
97 newton_step \rm, \rs, \t2, \t3, \t4
98 newton_step \rm, \rs, \t2, \t3, \t4
@ \rd = hi32(dividend * reciprocal), then shift down by (30 - shift)
100 umull \t4, \rd, \rm, \rd
101 rsb \t2, \t1, #30 @ here t1 is 1..15
102 mov \rd, \rd, lsr \t2
105 @ unsigned divide rd = rm / rs; 16.16 result
@ thin front-end over udiv_newton; \t1-\t4 are clobbered scratch regs
109 .macro udiv rd rm rs t1 t2 t3 t4
111 udiv_newton \rd, \rm, \rs, \t1, \t2, \t3, \t4
114 @ calculate RTPS/RTPT MAC values
115 @ in: r0 context, r8,r9 VXYZ
116 @ out: r10-r12 MAC123
@ NOTE(review): r1 appears to point at the rotation matrix (gteR1*..) and
@ r10-r12 to hold the translation terms on entry -- setup lines not visible
120 add r2, r0, #4*(32+5) @ gteTRX
121 ldmia r1!,{r5-r7} @ gteR1*,gteR2*
@ MAC1/MAC2: products are halved (asr #1) before summing so the 32-bit
@ sum cannot overflow, then the >>11 completes the overall >>12 scaling
123 smulbb r2, r5, r8 @ gteR11 * gteVX0
124 smultt r3, r5, r8 @ gteR12 * gteVY0
125 smulbb r4, r6, r9 @ gteR13 * gteVZ0
127 asr r4, r4, #1 @ prevent oflow, lose a bit
128 add r3, r4, r2, asr #1
129 add r10,r10,r3, asr #11 @ gteMAC1
130 smultb r2, r6, r8 @ gteR21 * gteVX0
131 smulbt r3, r7, r8 @ gteR22 * gteVY0
132 smultb r4, r7, r9 @ gteR23 * gteVZ0
133 ldmia r1!,{r5-r6} @ gteR3*
136 add r3, r4, r2, asr #1
137 add r11,r11,r3, asr #11 @ gteMAC2
138 @ be more accurate for gteMAC3, since it's also a divider
@ MAC3 uses a full 64-bit sum (sign-extend + adc) instead of the halved
@ products, so no precision bit is lost before the final >>12
139 smulbb r2, r5, r8 @ gteR31 * gteVX0
140 smultt r3, r5, r8 @ gteR32 * gteVY0
141 smulbb r4, r6, r9 @ gteR33 * gteVZ0
143 asr r3, r4, #31 @ expand to 64bit
145 adc r3, r2, asr #31 @ 64bit sum in r3,r1
146 add r12,r12,r3, lsl #20
147 add r12,r12,r1, lsr #12 @ gteMAC3
@ RTPS: perspective-transform a single vertex VXYZ(0).
@ Computes MAC1-3, saturates to IR1-3 and SZ3, divides gteH by SZ3,
@ then projects to screen coords SX2/SY2 and depth-cues via DQA/DQB.
@ NOTE(review): "_nf" presumably means the no-flag-computation variant;
@ several intermediate lines of this function are not visible here.
151 FUNCTION(gteRTPS_nf_arm): @ r0=CP2 (d,c),
154 ldmia r0, {r8,r9} @ VXYZ(0)
156 add r1, r0, #4*25 @ gteMAC1
157 add r2, r0, #4*17 @ gteSZ1
158 stmia r1, {r10-r12} @ gteMAC123 save
160 add r1, r0, #4*16 @ gteSZ0
161 add r2, r0, #4*9 @ gteIR1
163 usat16_ lr, r12 @ limD
@ push the SZ fifo: old SZ1-3 plus the new saturated depth become SZ0-3
167 stmia r1, {r3-r5,lr} @ gteSZ*
168 ldr r3, [r0,#4*(32+26)] @ gteH
169 stmia r2, {r10,r11,r12} @ gteIR123 save
170 cmp r3, lr, lsl #1 @ gteH < gteSZ3*2 ?
@ q = gteH / gteSZ3 in 16.16 (Newton-Raphson udiv above)
174 udiv r9, r3, lr, r1, r2, r6, r7
184 ldrd r6, r7, [r0, #4*(32+24)] @ gteOFXY
186 add r1, r0, #4*12 @ gteSXY0
189 /* quotient */ subhs r9, #1
191 smlal r6, r2, r10, r9
192 stmia r1!,{r3,r4} @ shift gteSXY
194 smlal r7, r3, r11, r9
196 /* gteDQA, gteDQB */ ldrd r10,r11, [r0, #4*(32+27)]
197 orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16
200 /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11
201 orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16
@ clamp screen coords to signed 11 bits
202 ssatx r6, r2, 11 @ gteSX2
203 ssatx r7, r2, 11 @ gteSY2
206 str r4, [r0,#4*24] @ gteMAC0
@ IR0 = MAC0 clamped at 0x1000 (depth-cue interpolation factor)
209 cmp r4, #0x1000 @ limH
211 str r4, [r0,#4*8] @ gteIR0
214 .size gteRTPS_nf_arm, .-gteRTPS_nf_arm
@ RTPT: perspective-transform three vertices; same math as gteRTPS_nf_arm
@ but iterated over VXYZ(v), with lr apparently used as the per-vertex
@ offset/index (loop control lines are not visible in this view).
217 FUNCTION(gteRTPT_nf_arm): @ r0=CP2 (d,c),
@ pre-rotate the SZ fifo: SZ3 becomes SZ0 before new depths are pushed
218 ldr r1, [r0, #4*19] @ gteSZ3
220 str r1, [r0, #4*16] @ gteSZ0
224 add r1, r0, lr, lsl #1
225 ldrd r8, r9, [r1] @ VXYZ(v)
229 usat16_ r2, r12 @ limD
230 add r1, r0, #4*25 @ gteMAC1
231 ldr r3, [r0,#4*(32+26)] @ gteH
232 stmia r1, {r10-r12} @ gteMAC123 save
237 str r2, [r1, lr] @ fSZ(v)
238 cmp r3, r2, lsl #1 @ gteH < gteSZ3*2 ?
@ q = gteH / fSZ(v) in 16.16
242 udiv r9, r3, r2, r1, r4, r6, r7
254 ldrd r6, r7, [r0,#4*(32+24)] @ gteOFXY
255 /* quotient */ subhs r9, #1
257 smlal r6, r2, r10, r9
259 smlal r7, r3, r11, r9
261 orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16
264 orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16
@ clamp screen coords to signed 11 bits
265 ssatx r6, r2, 11 @ gteSX(v)
266 ssatx r7, r2, 11 @ gteSY(v)
@ depth cueing from the last vertex only: MAC0 = DQB + DQA * q
273 ldrd r4, r5, [r0, #4*(32+27)] @ gteDQA, gteDQB
274 add r1, r0, #4*9 @ gteIR1
275 mla r3, r4, r9, r5 @ gteDQB + gteDQA * q
276 stmia r1, {r10,r11,r12} @ gteIR123 save
278 str r3, [r0,#4*24] @ gteMAC0
281 cmp r3, #0x1000 @ limH
283 str r3, [r0,#4*8] @ gteIR0
286 .size gteRTPT_nf_arm, .-gteRTPT_nf_arm
289 @ note: not std calling convention used
290 @ r0 = CP2 (d,c) (must preserve)
292 @ r4,r5 = VXYZ(v) packed
@ MVMVA core: 64-bit matrix(MX) * vector + CV accumulate for three rows.
@ \do_flags selects whether per-row overflow bits are gathered into r3
@ and stored to gteFLAG (the sign-check adds/orrmi sequences below).
295 .macro mvma_op do_flags
299 ands r3, r1, #1 @ gteFLAG, shift_need
303 ldmia r7, {r7-r9} @ CV123
304 ldmia r6!,{r10-r12} @ MX1*,MX2*
@ row 1: widen CV1 to a 64-bit accumulator, then MAC in three products
306 lsl r7, #12 @ expand to 64bit
307 smlalbb r7, r1, r10, r4 @ MX11 * vx
308 smlaltt r7, r1, r10, r4 @ MX12 * vy
309 smlalbb r7, r1, r11, r5 @ MX13 * vz
311 orrne r7, r1, lsl #20 @ gteMAC0
@ overflow check: bias by 0x80000000 and test the sign of the result
314 adds r2, r7, #0x80000000
317 orrmi r3, #(1<<31)|(1<<27)
318 tst r3, #1 @ repeat shift test
@ row 2 (same pattern, different flag bit)
321 lsl r8, #12 @ expand to 64bit
322 smlaltb r8, r1, r11, r4 @ MX21 * vx
323 smlalbt r8, r1, r12, r4 @ MX22 * vy
324 smlaltb r8, r1, r12, r5 @ MX23 * vz
326 orrne r8, r1, lsl #20 @ gteMAC1
329 adds r2, r8, #0x80000000
332 orrmi r3, #(1<<31)|(1<<26)
333 tst r3, #1 @ repeat shift test
@ row 3
335 ldmia r6!,{r10-r11} @ MX3*
337 lsl r9, #12 @ expand to 64bit
338 smlalbb r9, r1, r10, r4 @ MX31 * vx
339 smlaltt r9, r1, r10, r4 @ MX32 * vy
340 smlalbb r9, r1, r11, r5 @ MX33 * vz
342 orrne r9, r1, lsl #20 @ gteMAC2
345 adds r2, r9, #0x80000000
348 orrmi r3, #(1<<31)|(1<<25)
353 str r3, [r0, #4*(32+31)] @ gteFLAG
@ entry points presumably expanding mvma_op with and without flag
@ computation ("_nf") -- the bodies are not visible in this view
361 FUNCTION(gteMVMVA_part_arm):
363 .size gteMVMVA_part_arm, .-gteMVMVA_part_arm
365 FUNCTION(gteMVMVA_part_nf_arm):
367 .size gteMVMVA_part_nf_arm, .-gteMVMVA_part_nf_arm
369 @ common version of MVMVA with cv3 (== 0) and shift12,
370 @ can't overflow so no gteMAC flags needed
371 @ note: not std calling convention used
372 @ r0 = CP2 (d,c) (must preserve)
373 @ r4,r5 = VXYZ(v) packed
@ fast path: 32-bit halved-product sums (same trick as the RTPS MAC code)
@ instead of 64-bit accumulation, since CV==0 and >>12 cannot overflow
375 FUNCTION(gteMVMVA_part_cv3sh12_arm):
377 ldmia r6!,{r7-r9} @ MX1*,MX2*
378 smulbb r1, r7, r4 @ MX11 * vx
379 smultt r2, r7, r4 @ MX12 * vy
380 smulbb r3, r8, r5 @ MX13 * vz
382 asr r3, #1 @ prevent oflow, lose a bit
383 add r1, r3, r1, asr #1
385 smultb r1, r8, r4 @ MX21 * vx
386 smulbt r2, r9, r4 @ MX22 * vy
387 smultb r3, r9, r5 @ MX23 * vz
390 add r1, r3, r1, asr #1
392 ldmia r6, {r6,r9} @ MX3*
393 smulbb r1, r6, r4 @ MX31 * vx
394 smultt r2, r6, r4 @ MX32 * vy
395 smulbb r3, r9, r5 @ MX33 * vz
398 add r1, r3, r1, asr #1
@ NOTE(review): r2 here appears to hold the (cleared) flag value -- confirm
403 str r2, [r0, #4*(32+31)] @ gteFLAG
408 #endif /* HAVE_ARMV5 */
@ NCLIP: signed area / winding of the screen triangle SXY0-2, presumably
@ MAC0 = SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1), accumulated in
@ 64 bits (smull/smlal) so the overflow flag can be set exactly.
@ Halfword loads: [4*n] = SXn, [4*n+2] = SYn of the packed SXY fifo.
410 FUNCTION(gteNCLIP_arm): @ r0=CP2 (d,c),
412 ldrsh r4, [r0, #4*12+2]
413 ldrsh r5, [r0, #4*13+2]
414 ldrsh r6, [r0, #4*14+2]
415 ldrsh lr, [r0, #4*12]
416 ldrsh r2, [r0, #4*13]
417 sub r12, r4, r5 @ 3: gteSY0 - gteSY1
418 sub r5, r5, r6 @ 1: gteSY1 - gteSY2
419 smull r1, r5, lr, r5 @ RdLo, RdHi
420 sub r6, r4 @ 2: gteSY2 - gteSY0
421 ldrsh r3, [r0, #4*14]
424 smlal r1, r5, r3, r12
@ on 32-bit overflow of the sum, set the MAC0-overflow bits in gteFLAG
431 movtgt lr, #((1<<31)|(1<<16))>>16
439 str lr, [r0, #4*(32+31)] @ gteFLAG
442 .size gteNCLIP_arm, .-gteNCLIP_arm
@ saturate gteMAC1-3 into gteIR1-3, accumulating saturation bits into
@ gteFLAG (bit 24 group for IR1 per the orr masks below).
@ NOTE(review): fragment of a larger macro/function -- the clamp
@ instructions between these lines are not visible in this view
446 ldr r2, [r0, #4*25] @ gteMAC1
448 ldr r12,[r0, #4*(32+31)] @ gteFLAG
451 orrge r12, #(1<<31)|(1<<24)
461 ldrd r2, r3, [r0, #4*26] @ gteMAC23
466 orrlt r12, #(1<<31)|(1<<24)
492 strd r2, r3, [r0, #4*10] @ gteIR23
497 str r12,[r0, #4*(32+31)] @ gteFLAG
@ MAC->IR saturation entry points; bodies not visible in this view.
@ lm0/lm1 presumably select the IR lower clamp mode and "_nf" the
@ no-flag-computation variant -- confirm against the macro above
501 FUNCTION(gteMACtoIR_lm0): @ r0=CP2 (d,c)
503 .size gteMACtoIR_lm0, .-gteMACtoIR_lm0
505 FUNCTION(gteMACtoIR_lm1): @ r0=CP2 (d,c)
507 .size gteMACtoIR_lm1, .-gteMACtoIR_lm1
510 FUNCTION(gteMACtoIR_lm0_nf): @ r0=CP2 (d,c)
520 .size gteMACtoIR_lm0_nf, .-gteMACtoIR_lm0_nf
523 FUNCTION(gteMACtoIR_lm1_nf): @ r0=CP2 (d,c)
533 .size gteMACtoIR_lm1_nf, .-gteMACtoIR_lm1_nf
@ decode the MVMVA opcode fields from r1 (v = vector select, mx = matrix
@ select, cv = translation-vector select), set up the source pointers,
@ and dispatch: the cv3/shift12 fast path via a tail branch, otherwise
@ the NEON implementation (bl calls below).
537 FUNCTION(gteMVMVA_test):
540 and r2, r1, #0x18000 @ v
541 cmp r2, #0x18000 @ v == 3?
543 addne r3, r0, r2, lsr #12
@ v==3 case: VXYZ repacked from IR regs into r4/r5
548 orreq r4, r3, r4, lsl #16 @ r4,r5 = VXYZ(v)
551 and r3, r1, #0x60000 @ mx
553 add r6, r12, r3, lsl #5
556 and r2, r1, #0x06000 @ cv
558 add r7, r12, r2, lsl #5
@ tail-branch: cv3sh12 path returns directly to our caller
566 bne gteMVMVA_part_cv3sh12_arm
579 bl gteMVMVA_part_neon
582 bl gteMACtoIR_flags_neon
591 @ vim:filetype=armasm