libpcsxcore/gte_arm.S
/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "arm_features.h"

.text
.align 2

.macro sgnxt16 rd rs
#ifdef HAVE_ARMV7
    sxth    \rd, \rs
#else
    lsl     \rd, \rs, #16
    asr     \rd, \rd, #16
#endif
.endm
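@ note: sgnxt16 sign-extends the low 16 bits of \rs into \rd (sxth on ARMv7+,
@ a shift-left/arithmetic-shift-right pair on older cores).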

@ prepare work reg for ssatx
@ in: wr reg, bit to saturate to
.macro ssatx_prep wr bit
#ifndef HAVE_ARMV7
    mov     \wr, #(1<<(\bit-1))
#endif
.endm

.macro ssatx rd wr bit
#ifdef HAVE_ARMV7
    ssat    \rd, #\bit, \rd
#else
    cmp     \rd, \wr
    subge   \rd, \wr, #1
    cmn     \rd, \wr
    rsblt   \rd, \wr, #0
#endif
.endm
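@ note: the pre-ARMv7 ssatx path clamps \rd to the signed \bit-bit range by
@ hand: values >= 2^(bit-1) become 2^(bit-1)-1 (subge), values < -2^(bit-1)
@ become -2^(bit-1) (cmn + rsblt), matching what the ssat instruction does.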

@ prepare work reg for ssatx0 (sat to 0..2^(bit-1)-1)
@ in: wr reg, bit to saturate to
.macro ssatx0_prep wr bit
    mov     \wr, #(1<<(\bit-1))
.endm

.macro ssatx0 rd wr bit
    cmp     \rd, \wr
    subge   \rd, \wr, #1
    cmn     \rd, #0
    movlt   \rd, #0
.endm

.macro usat16_ rd rs
#ifdef HAVE_ARMV7
    usat    \rd, #16, \rs
#else
    subs    \rd, \rs, #0
    movlt   \rd, #0
    cmp     \rd, #0x10000
    movge   \rd, #0x0ff00
    orrge   \rd, #0x000ff
#endif
.endm
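@ note: the non-ARMv7 path clamps to 0..0xffff; the upper bound is built as
@ 0xff00|0x00ff in two instructions because 0xffff is not encodable as a
@ single ARM immediate.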

#ifdef HAVE_ARMV5

.macro udiv_ rd rm rs
    lsl     \rm, #16
    clz     \rd, \rs
    lsl     \rs, \rs, \rd        @ shift up divisor
    orr     \rd, \rd, #1<<31
    lsr     \rd, \rd, \rd
0:
    cmp     \rm, \rs
    subcs   \rm, \rs
    adcs    \rd, \rd, \rd
    lsr     \rs, #1
    bcc     0b
.endm
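@ note: udiv_ is a plain restoring shift-and-subtract divider: the divisor is
@ normalized with clz, then one quotient bit is produced per iteration until
@ the marker bit placed in the quotient register shifts out (ending the
@ bcc 0b loop), giving a 16.16 fixed-point result. It is kept as a reference;
@ the udiv wrapper below uses the Newton-Raphson variant instead.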

.macro newton_step rcp den zero t1 t2
    umull   \t2, \t1, \den, \rcp @ \t2 is dummy
    sub     \t1, \zero, \t1, lsl #2
    smlal   \t2, \rcp, \t1, \rcp
.endm

.macro udiv_newton rd rm rs t1 t2 t3 t4
    lsl     \rd, \rm, #16
    clz     \t1, \rs
    mov     \t2, #0
    lsl     \rs, \t1             @ normalize for the algo
    mov     \rm, #0x4d000000     @ initial estimate ~1.2

    newton_step \rm, \rs, \t2, \t3, \t4
    newton_step \rm, \rs, \t2, \t3, \t4
    newton_step \rm, \rs, \t2, \t3, \t4
    newton_step \rm, \rs, \t2, \t3, \t4

    umull   \t4, \rd, \rm, \rd
    rsb     \t2, \t1, #30        @ here t1 is 1..15
    mov     \rd, \rd, lsr \t2
.endm
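@ note: udiv_newton refines a reciprocal estimate of the normalized divisor
@ with Newton-Raphson iteration, r' = r*(2 - d*r): each newton_step takes the
@ high word of d*r, negates and rescales it to form the (2 - d*r) term in
@ fixed point, and folds it back into the estimate with smlal. Each iteration
@ roughly doubles the number of correct bits, so four steps from the ~1.2
@ initial guess suffice; the final umull and denormalizing shift produce the
@ 16.16 quotient.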

@ unsigned divide rd = rm / rs; 16.16 result
@ no div by 0 check
@ in: rm, rs
@ trash: rm rs t*
.macro udiv rd rm rs t1 t2 t3 t4
    @udiv_  \rd, \rm, \rs
    udiv_newton \rd, \rm, \rs, \t1, \t2, \t3, \t4
.endm

@ calculate RTPS/RTPT MAC values
@ in: r0 context, r8,r9 VXYZ
@ out: r10-r12 MAC123
@ trash: r1-r7
.macro do_rtpx_mac
    add     r1, r0, #4*32
    add     r2, r0, #4*(32+5)    @ gteTRX
    ldmia   r1!,{r5-r7}          @ gteR1*,gteR2*
    ldmia   r2, {r10-r12}
    smulbb  r2, r5, r8           @ gteR11 * gteVX0
    smultt  r3, r5, r8           @ gteR12 * gteVY0
    smulbb  r4, r6, r9           @ gteR13 * gteVZ0
    qadd    r2, r2, r3
    asr     r4, r4, #1           @ prevent oflow, lose a bit
    add     r3, r4, r2, asr #1
    add     r10,r10,r3, asr #11  @ gteMAC1
    smultb  r2, r6, r8           @ gteR21 * gteVX0
    smulbt  r3, r7, r8           @ gteR22 * gteVY0
    smultb  r4, r7, r9           @ gteR23 * gteVZ0
    ldmia   r1!,{r5-r6}          @ gteR3*
    qadd    r2, r2, r3
    asr     r4, r4, #1
    add     r3, r4, r2, asr #1
    add     r11,r11,r3, asr #11  @ gteMAC2
    @ be more accurate for gteMAC3, since it's also a divider
    smulbb  r2, r5, r8           @ gteR31 * gteVX0
    smultt  r3, r5, r8           @ gteR32 * gteVY0
    smulbb  r4, r6, r9           @ gteR33 * gteVZ0
    qadd    r2, r2, r3
    asr     r3, r4, #31          @ expand to 64bit
    adds    r1, r2, r4
    adc     r3, r2, asr #31      @ 64bit sum in r3,r1
    add     r12,r12,r3, lsl #20
    add     r12,r12,r1, lsr #12  @ gteMAC3
.endm
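@ note: for MAC1/MAC2 the three products are summed as ((a+b)>>1 + c>>1)>>11,
@ i.e. one low bit of precision is traded away so the 32-bit sum cannot
@ overflow before the effective >>12 shift; MAC3 instead builds a full 64-bit
@ sum (adds/adc) and shifts that, since its result also feeds the perspective
@ divide and needs the extra precision.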
149
150
5c6457c3 151FUNCTION(gteRTPS_nf_arm): @ r0=CP2 (d,c),
0c2ca3ba 152 push {r4-r11,lr}
153
154 ldmia r0, {r8,r9} @ VXYZ(0)
155 do_rtpx_mac
156 add r1, r0, #4*25 @ gteMAC1
157 add r2, r0, #4*17 @ gteSZ1
158 stmia r1, {r10-r12} @ gteMAC123 save
159 ldmia r2, {r3-r5}
160 add r1, r0, #4*16 @ gteSZ0
161 add r2, r0, #4*9 @ gteIR1
162 ssatx_prep r6, 16
163 usat16_ lr, r12 @ limD
164 ssatx r10,r6, 16
165 ssatx r11,r6, 16
166 ssatx r12,r6, 16
167 stmia r1, {r3-r5,lr} @ gteSZ*
168 ldr r3, [r0,#4*(32+26)] @ gteH
169 stmia r2, {r10,r11,r12} @ gteIR123 save
170 cmp r3, lr, lsl #1 @ gteH < gteSZ3*2 ?
171 mov r9, #1<<30
172 bhs 1f
173.if 1
7c621bf0 174 udiv r9, r3, lr, r1, r2, r6, r7
0c2ca3ba 175.else
176 push {r0, r12}
177 mov r0, r3
178 mov r1, lr
179 bl DIVIDE
180 mov r9, r0
181 pop {r0, r12}
182.endif
1831:
a53073ec 184 ldrd r6, r7, [r0, #4*(32+24)] @ gteOFXY
0c2ca3ba 185 cmp r9, #0x20000
186 add r1, r0, #4*12 @ gteSXY0
187 movhs r9, #0x20000
188 ldmia r1, {r2-r4}
189 /* quotient */ subhs r9, #1
76720f7f 190 mov r2, r6, asr #31
0c2ca3ba 191 smlal r6, r2, r10, r9
192 stmia r1!,{r3,r4} @ shift gteSXY
76720f7f 193 mov r3, r7, asr #31
0c2ca3ba 194 smlal r7, r3, r11, r9
195 lsr r6, #16
a53073ec 196 /* gteDQA, gteDQB */ ldrd r10,r11, [r0, #4*(32+27)]
0c2ca3ba 197 orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16
198 ssatx_prep r2, 11
199 lsr r7, #16
200 /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11
201 orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16
202 ssatx r6, r2, 11 @ gteSX2
203 ssatx r7, r2, 11 @ gteSY2
204 strh r6, [r1]
205 strh r7, [r1, #2]
206 str r4, [r0,#4*24] @ gteMAC0
207 asrs r4, #12
208 movmi r4, #0
209 cmp r4, #0x1000 @ limH
210 movgt r4, #0x1000
211 str r4, [r0,#4*8] @ gteIR0
212
213 pop {r4-r11,pc}
214 .size gteRTPS_nf_arm, .-gteRTPS_nf_arm
215
216
FUNCTION(gteRTPT_nf_arm): @ r0=CP2 (d,c),
    ldr     r1, [r0, #4*19]      @ gteSZ3
    push    {r4-r11,lr}
    str     r1, [r0, #4*16]      @ gteSZ0
    mov     lr, #0

rtpt_arm_loop:
    add     r1, r0, lr, lsl #1
    ldrd    r8, r9, [r1]         @ VXYZ(v)
    do_rtpx_mac

    ssatx_prep r6, 16
    usat16_ r2, r12              @ limD
    add     r1, r0, #4*25        @ gteMAC1
    ldr     r3, [r0,#4*(32+26)]  @ gteH
    stmia   r1, {r10-r12}        @ gteMAC123 save
    add     r1, r0, #4*17
    ssatx   r10,r6, 16
    ssatx   r11,r6, 16
    ssatx   r12,r6, 16
    str     r2, [r1, lr]         @ fSZ(v)
    cmp     r3, r2, lsl #1       @ gteH < gteSZ3*2 ?
    mov     r9, #1<<30
    bhs     1f
.if 1
    udiv    r9, r3, r2, r1, r4, r6, r7
.else
    push    {r0, r12, lr}
    mov     r0, r3
    mov     r1, r2
    bl      DIVIDE
    mov     r9, r0
    pop     {r0, r12, lr}
.endif
1:  cmp     r9, #0x20000
    add     r1, r0, #4*12
    movhs   r9, #0x20000
    ldrd    r6, r7, [r0,#4*(32+24)] @ gteOFXY
    /* quotient */ subhs r9, #1
    mov     r2, r6, asr #31
    smlal   r6, r2, r10, r9
    mov     r3, r7, asr #31
    smlal   r7, r3, r11, r9
    lsr     r6, #16
    orr     r6, r2, lsl #16      @ (gteOFX + gteIR1 * q) >> 16
    ssatx_prep r2, 11
    lsr     r7, #16
    orr     r7, r3, lsl #16      @ (gteOFY + gteIR2 * q) >> 16
    ssatx   r6, r2, 11           @ gteSX(v)
    ssatx   r7, r2, 11           @ gteSY(v)
    strh    r6, [r1, lr]!
    add     lr, #4
    strh    r7, [r1, #2]
    cmp     lr, #12
    blt     rtpt_arm_loop

    ldrd    r4, r5, [r0, #4*(32+27)] @ gteDQA, gteDQB
    add     r1, r0, #4*9         @ gteIR1
    mla     r3, r4, r9, r5       @ gteDQB + gteDQA * q
    stmia   r1, {r10,r11,r12}    @ gteIR123 save

    str     r3, [r0,#4*24]       @ gteMAC0
    asrs    r3, #12
    movmi   r3, #0
    cmp     r3, #0x1000          @ limH
    movgt   r3, #0x1000
    str     r3, [r0,#4*8]        @ gteIR0

    pop     {r4-r11,pc}
    .size   gteRTPT_nf_arm, .-gteRTPT_nf_arm

@ note: not std calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r1 = needs_shift12
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
.macro mvma_op do_flags
    push    {r8-r11}

.if \do_flags
    ands    r3, r1, #1           @ gteFLAG, shift_need
.else
    tst     r1, #1
.endif
    ldmia   r7, {r7-r9}          @ CV123
    ldmia   r6!,{r10-r12}        @ MX1*,MX2*
    asr     r1, r7, #20
    lsl     r7, #12              @ expand to 64bit
    smlalbb r7, r1, r10, r4      @ MX11 * vx
    smlaltt r7, r1, r10, r4      @ MX12 * vy
    smlalbb r7, r1, r11, r5      @ MX13 * vz
    lsrne   r7, #12
    orrne   r7, r1, lsl #20      @ gteMAC0
.if \do_flags
    asrne   r1, #20
    adds    r2, r7, #0x80000000
    adcs    r1, #0
    orrgt   r3, #(1<<30)
    orrmi   r3, #(1<<31)|(1<<27)
    tst     r3, #1               @ repeat shift test
.endif
    asr     r1, r8, #20
    lsl     r8, #12              @ expand to 64bit
    smlaltb r8, r1, r11, r4      @ MX21 * vx
    smlalbt r8, r1, r12, r4      @ MX22 * vy
    smlaltb r8, r1, r12, r5      @ MX23 * vz
    lsrne   r8, #12
    orrne   r8, r1, lsl #20      @ gteMAC1
.if \do_flags
    asrne   r1, #20
    adds    r2, r8, #0x80000000
    adcs    r1, #0
    orrgt   r3, #(1<<29)
    orrmi   r3, #(1<<31)|(1<<26)
    tst     r3, #1               @ repeat shift test
.endif
    ldmia   r6!,{r10-r11}        @ MX3*
    asr     r1, r9, #20
    lsl     r9, #12              @ expand to 64bit
    smlalbb r9, r1, r10, r4      @ MX31 * vx
    smlaltt r9, r1, r10, r4      @ MX32 * vy
    smlalbb r9, r1, r11, r5      @ MX33 * vz
    lsrne   r9, #12
    orrne   r9, r1, lsl #20      @ gteMAC2
.if \do_flags
    asrne   r1, #20
    adds    r2, r9, #0x80000000
    adcs    r1, #0
    orrgt   r3, #(1<<28)
    orrmi   r3, #(1<<31)|(1<<25)
    bic     r3, #1
.else
    mov     r3, #0
.endif
    str     r3, [r0, #4*(32+31)] @ gteFLAG
    add     r1, r0, #4*25
    stmia   r1, {r7-r9}

    pop     {r8-r11}
    bx      lr
.endm
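@ note: in the do_flags path the bits set above follow the GTE FLAG layout:
@ bits 30/29/28 mark MAC1/2/3 positive overflow, bits 27/26/25 the negative
@ overflow, and bit 31 is the summary error bit. The adds #0x80000000 / adcs
@ pair checks whether the 64-bit accumulator still fits in a signed 32-bit
@ value; the gt/mi conditions then pick the positive/negative overflow case.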

FUNCTION(gteMVMVA_part_arm):
    mvma_op 1
    .size   gteMVMVA_part_arm, .-gteMVMVA_part_arm

FUNCTION(gteMVMVA_part_nf_arm):
    mvma_op 0
    .size   gteMVMVA_part_nf_arm, .-gteMVMVA_part_nf_arm

@ common version of MVMVA with cv3 (== 0) and shift12,
@ can't overflow so no gteMAC flags needed
@ note: not std calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
FUNCTION(gteMVMVA_part_cv3sh12_arm):
    push    {r8-r9}
    ldmia   r6!,{r7-r9}          @ MX1*,MX2*
    smulbb  r1, r7, r4           @ MX11 * vx
    smultt  r2, r7, r4           @ MX12 * vy
    smulbb  r3, r8, r5           @ MX13 * vz
    qadd    r1, r1, r2
    asr     r3, #1               @ prevent oflow, lose a bit
    add     r1, r3, r1, asr #1
    asr     r7, r1, #11
    smultb  r1, r8, r4           @ MX21 * vx
    smulbt  r2, r9, r4           @ MX22 * vy
    smultb  r3, r9, r5           @ MX23 * vz
    qadd    r1, r1, r2
    asr     r3, #1
    add     r1, r3, r1, asr #1
    asr     r8, r1, #11
    ldmia   r6, {r6,r9}          @ MX3*
    smulbb  r1, r6, r4           @ MX31 * vx
    smultt  r2, r6, r4           @ MX32 * vy
    smulbb  r3, r9, r5           @ MX33 * vz
    qadd    r1, r1, r2
    asr     r3, #1
    add     r1, r3, r1, asr #1
    asr     r9, r1, #11
    add     r1, r0, #4*25
    mov     r2, #0
    stmia   r1, {r7-r9}
    str     r2, [r0, #4*(32+31)] @ gteFLAG
    pop     {r8-r9}
    bx      lr
    .size   gteMVMVA_part_cv3sh12_arm, .-gteMVMVA_part_cv3sh12_arm

#endif /* HAVE_ARMV5 */

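@ NCLIP: computes gteMAC0 = SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1)
@ (twice the signed area of the screen triangle) and sets FLAG bit 16 or
@ bit 15, plus the bit-31 summary, when the 64-bit sum of products does not
@ fit in a signed 32-bit value.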
FUNCTION(gteNCLIP_arm): @ r0=CP2 (d,c),
    push    {r4-r6,lr}
    ldrsh   r4, [r0, #4*12+2]
    ldrsh   r5, [r0, #4*13+2]
    ldrsh   r6, [r0, #4*14+2]
    ldrsh   lr, [r0, #4*12]
    ldrsh   r2, [r0, #4*13]
    sub     r12, r4, r5          @ 3: gteSY0 - gteSY1
    sub     r5, r5, r6           @ 1: gteSY1 - gteSY2
    smull   r1, r5, lr, r5       @ RdLo, RdHi
    sub     r6, r4               @ 2: gteSY2 - gteSY0
    ldrsh   r3, [r0, #4*14]
    smlal   r1, r5, r2, r6
    mov     lr, #0               @ gteFLAG
    smlal   r1, r5, r3, r12
    mov     r6, #1<<31
    orr     r6, #1<<15
    movs    r2, r1, lsl #1
    adc     r5, r5
    cmp     r5, #0
#ifdef HAVE_ARMV7
    movtgt  lr, #((1<<31)|(1<<16))>>16
#else
    movgt   lr, #(1<<31)
    orrgt   lr, #(1<<16)
#endif
    cmn     r5, #1
    orrmi   lr, r6
    str     r1, [r0, #4*24]
    str     lr, [r0, #4*(32+31)] @ gteFLAG

    pop     {r4-r6,pc}
    .size   gteNCLIP_arm, .-gteNCLIP_arm

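@ note: gteMACtoIR saturates gteMAC1..3 into gteIR1..3 with the usual IR
@ limits (0..0x7fff when lm=1, -0x8000..0x7fff when lm=0) and records any
@ saturation in gteFLAG bits 24/23/22 (IR1/2/3), adding the bit-31 summary
@ for IR1 and IR2.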
.macro gteMACtoIR lm
    ldr     r2, [r0, #4*25]      @ gteMAC1
    mov     r1, #1<<15
    ldr     r12,[r0, #4*(32+31)] @ gteFLAG
    cmp     r2, r1
    subge   r2, r1, #1
    orrge   r12, #(1<<31)|(1<<24)
.if \lm
    cmp     r2, #0
    movlt   r2, #0
.else
    cmn     r2, r1
    rsblt   r2, r1, #0
.endif
    str     r2, [r0, #4*9]
#ifdef HAVE_ARMV5
    ldrd    r2, r3, [r0, #4*26]  @ gteMAC23
#else
    ldr     r2, [r0, #4*26]
    ldr     r3, [r0, #4*27]
#endif
    orrlt   r12, #(1<<31)|(1<<24)
    cmp     r2, r1
    subge   r2, r1, #1
    orrge   r12, #1<<23
    orrge   r12, #1<<31
.if \lm
    cmp     r2, #0
    movlt   r2, #0
.else
    cmn     r2, r1
    rsblt   r2, r1, #0
.endif
    orrlt   r12, #1<<23
    orrlt   r12, #1<<31
    cmp     r3, r1
    subge   r3, r1, #1
    orrge   r12, #1<<22
.if \lm
    cmp     r3, #0
    movlt   r3, #0
.else
    cmn     r3, r1
    rsblt   r3, r1, #0
.endif
    orrlt   r12, #1<<22
#ifdef HAVE_ARMV5
    strd    r2, r3, [r0, #4*10]  @ gteIR23
#else
    str     r2, [r0, #4*10]
    str     r3, [r0, #4*11]
#endif
    str     r12,[r0, #4*(32+31)] @ gteFLAG
    bx      lr
.endm

FUNCTION(gteMACtoIR_lm0): @ r0=CP2 (d,c)
    gteMACtoIR 0
    .size   gteMACtoIR_lm0, .-gteMACtoIR_lm0

FUNCTION(gteMACtoIR_lm1): @ r0=CP2 (d,c)
    gteMACtoIR 1
    .size   gteMACtoIR_lm1, .-gteMACtoIR_lm1

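@ note: the _nf ("no flags") variants below only saturate gteMAC1..3 into
@ gteIR1..3 and leave gteFLAG untouched, for call sites that do not need the
@ flag emulation.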
FUNCTION(gteMACtoIR_lm0_nf): @ r0=CP2 (d,c)
    add     r12, r0, #4*25
    ldmia   r12, {r1-r3}
    ssatx_prep r12, 16
    ssatx   r1, r12, 16
    ssatx   r2, r12, 16
    ssatx   r3, r12, 16
    add     r12, r0, #4*9
    stmia   r12, {r1-r3}
    bx      lr
    .size   gteMACtoIR_lm0_nf, .-gteMACtoIR_lm0_nf


FUNCTION(gteMACtoIR_lm1_nf): @ r0=CP2 (d,c)
    add     r12, r0, #4*25
    ldmia   r12, {r1-r3}
    ssatx0_prep r12, 16
    ssatx0  r1, r12, 16
    ssatx0  r2, r12, 16
    ssatx0  r3, r12, 16
    add     r12, r0, #4*9
    stmia   r12, {r1-r3}
    bx      lr
    .size   gteMACtoIR_lm1_nf, .-gteMACtoIR_lm1_nf

.if 0
FUNCTION(gteMVMVA_test):
    push    {r4-r7,lr}
    push    {r1}
    and     r2, r1, #0x18000     @ v
    cmp     r2, #0x18000         @ v == 3?
    addeq   r4, r0, #4*9
    addne   r3, r0, r2, lsr #12
    ldmeqia r4, {r3-r5}
    ldmneia r3, {r4,r5}
    lsleq   r3, #16
    lsreq   r3, #16
    orreq   r4, r3, r4, lsl #16  @ r4,r5 = VXYZ(v)
    @and    r5, #0xffff
    add     r12, r0, #4*32
    and     r3, r1, #0x60000     @ mx
    lsr     r3, #17
    add     r6, r12, r3, lsl #5
    cmp     r3, #3
    adreq   r6, zeroes
    and     r2, r1, #0x06000     @ cv
    lsr     r2, #13
    add     r7, r12, r2, lsl #5
    add     r7, #4*5
    cmp     r2, #3
    adreq   r7, zeroes
.if 1
    adr     lr, 1f
    bne     0f
    tst     r1, #1<<19
    bne     gteMVMVA_part_cv3sh12_arm
0:
    and     r1, #1<<19
    lsr     r1, #19
    b       gteMVMVA_part_arm
1:
    pop     {r1}
    tst     r1, #1<<10
    adr     lr, 0f
    beq     gteMACtoIR_lm0
    bne     gteMACtoIR_lm1
0:
.else
    bl      gteMVMVA_part_neon
    pop     {r1}
    and     r1, #1<<10
    bl      gteMACtoIR_flags_neon
.endif
    pop     {r4-r7,pc}

zeroes:
    .word   0,0,0,0,0
.endif


@ vim:filetype=armasm