/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */

/* .equiv HAVE_ARMV7, 1 */

.text
.align 2

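@ sign-extend the low 16 bits of rd in place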
.macro sgnxt16 rd
.if HAVE_ARMV7
    sxth \rd, \rd
.else
    lsl \rd, \rd, #16
    asr \rd, \rd, #16
.endif
.endm

@ prepare work reg for ssatx
@ in: wr reg, bit to saturate to
.macro ssatx_prep wr bit
.if !HAVE_ARMV7
    mov \wr, #(1<<(\bit-1))
.endif
.endm

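@ signed saturation of rd to 'bit' bits, i.e. to -2^(bit-1) .. 2^(bit-1)-1
@ in: rd value, wr prepared by ssatx_prep, bit to saturate to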
.macro ssatx rd wr bit
.if HAVE_ARMV7
    ssat \rd, #\bit, \rd
.else
    cmp \rd, \wr
    subge \rd, \wr, #1
    cmn \rd, \wr
    rsblt \rd, \wr, #0
.endif
.endm

@ prepare work reg for ssatx0 (sat to 0..2^(bit-1)-1)
@ in: wr reg, bit to saturate to
.macro ssatx0_prep wr bit
    mov \wr, #(1<<(\bit-1))
.endm

.macro ssatx0 rd wr bit
    cmp \rd, \wr
    subge \rd, \wr, #1
    cmn \rd, #0
    movlt \rd, #0
.endm

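@ unsigned saturation of rs to 0..0xffff, result in rd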
.macro usat16_ rd rs
.if HAVE_ARMV7
    usat \rd, #16, \rs
.else
    subs \rd, \rs, #0
    movlt \rd, #0
    cmp \rd, #0x10000
    movge \rd, #0x0ff00
    orrge \rd, #0x000ff
.endif
.endm

@ unsigned divide rd = rm / rs
@ no div by 0 check
@ in: rm, rs
@ trash: rm rs
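@ method: the divisor is shifted up until its top bit is set and rd is
@ seeded with a single marker bit at the matching position; each loop
@ iteration is one compare/conditional-subtract step that shifts the next
@ quotient bit in through carry, and the loop ends once the marker bit
@ is shifted out of rd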
.macro udiv rd rm rs
    clz \rd, \rs
    lsl \rs, \rs, \rd      @ shift up divisor
    orr \rd, \rd, #1<<31
    lsr \rd, \rd, \rd
0:
    cmp \rm, \rs
    subcs \rm, \rs
    adcs \rd, \rd, \rd
    lsr \rs, #1
    bcc 0b
.endm


@ calculate RTPS/RTPT MAC values
@ in: r0 context, r8,r9 VXYZ
@ out: r10-r12 MAC123
@ trash: r1-r7
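@ computes MACn = gteTRn + (gteRn1*VX + gteRn2*VY + gteRn3*VZ) >> 12;
@ MAC1/MAC2 halve the partial sums (qadd + asr #1, then asr #11) so the
@ 32-bit adds can't overflow, at the cost of one low bit; MAC3 is summed
@ in 64 bits since it also becomes the divisor (via gteSZ3)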
.macro do_rtpx_mac
    add r1, r0, #4*32
    add r2, r0, #4*(32+5)  @ gteTRX
    ldmia r1!,{r5-r7}      @ gteR1*,gteR2*
    ldmia r2, {r10-r12}
    smulbb r2, r5, r8      @ gteR11 * gteVX0
    smultt r3, r5, r8      @ gteR12 * gteVY0
    smulbb r4, r6, r9      @ gteR13 * gteVZ0
    qadd r2, r2, r3
    asr r4, r4, #1         @ prevent oflow, lose a bit
    add r3, r4, r2, asr #1
    add r10,r10,r3, asr #11 @ gteMAC1
    smultb r2, r6, r8      @ gteR21 * gteVX0
    smulbt r3, r7, r8      @ gteR22 * gteVY0
    smultb r4, r7, r9      @ gteR23 * gteVZ0
    ldmia r1!,{r5-r6}      @ gteR3*
    qadd r2, r2, r3
    asr r4, r4, #1
    add r3, r4, r2, asr #1
    add r11,r11,r3, asr #11 @ gteMAC2
    @ be more accurate for gteMAC3, since it's also used as the divisor
    smulbb r2, r5, r8      @ gteR31 * gteVX0
    smultt r3, r5, r8      @ gteR32 * gteVY0
    smulbb r4, r6, r9      @ gteR33 * gteVZ0
    qadd r2, r2, r3
    asr r3, r4, #31        @ expand to 64bit
    adds r1, r2, r4
    adc r3, r2, asr #31    @ 64bit sum in r3,r1
    add r12,r12,r3, lsl #20
    add r12,r12,r1, lsr #12 @ gteMAC3
.endm


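@ RTPS, no-flag variant (gteFLAG is not updated here):
@ MAC1-3 come from do_rtpx_mac, IR1-3 = MAC1-3 saturated to 16 bits signed,
@ the SZ fifo is shifted and SZ3 = MAC3 saturated to 0..0xffff (limD),
@ q = (gteH << 16) / gteSZ3 clamped to 0x1ffff,
@ SX2 = (gteOFX + gteIR1 * q) >> 16 saturated to 11 bits signed, SY2 likewise,
@ MAC0 = gteDQB + gteDQA * q, IR0 = MAC0 >> 12 clamped to 0..0x1000 (limH)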
.global gteRTPS_nf_arm @ r0=CP2 (d,c),
gteRTPS_nf_arm:
    push {r4-r11,lr}

    ldmia r0, {r8,r9}      @ VXYZ(0)
    do_rtpx_mac
    add r1, r0, #4*25      @ gteMAC1
    add r2, r0, #4*17      @ gteSZ1
    stmia r1, {r10-r12}    @ gteMAC123 save
    ldmia r2, {r3-r5}
    add r1, r0, #4*16      @ gteSZ0
    add r2, r0, #4*9       @ gteIR1
    ssatx_prep r6, 16
    usat16_ lr, r12        @ limD
    ssatx r10,r6, 16
    ssatx r11,r6, 16
    ssatx r12,r6, 16
    stmia r1, {r3-r5,lr}   @ gteSZ*
    ldr r3, [r0,#4*(32+26)] @ gteH
    stmia r2, {r10,r11,r12} @ gteIR123 save
    cmp r3, lr, lsl #1     @ gteH < gteSZ3*2 ?
    mov r9, #1<<30
    bhs 1f
.if 1
    lsl r3, #16
    udiv r9, r3, lr
.else
    push {r0, r12}
    mov r0, r3
    mov r1, lr
    bl DIVIDE
    mov r9, r0
    pop {r0, r12}
.endif
1:
    ldrd r6, [r0,#4*(32+24)] @ gteOFXY
    cmp r9, #0x20000
    add r1, r0, #4*12      @ gteSXY0
    movhs r9, #0x20000
    ldmia r1, {r2-r4}
    /* quotient */ subhs r9, #1
    mov r2, #0
    smlal r6, r2, r10, r9
    stmia r1!,{r3,r4}      @ shift gteSXY
    mov r3, #0
    smlal r7, r3, r11, r9
    lsr r6, #16
    /* gteDQA, gteDQB */ ldrd r10,[r0, #4*(32+27)]
    orr r6, r2, lsl #16    @ (gteOFX + gteIR1 * q) >> 16
    ssatx_prep r2, 11
    lsr r7, #16
    /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11
    orr r7, r3, lsl #16    @ (gteOFY + gteIR2 * q) >> 16
    ssatx r6, r2, 11       @ gteSX2
    ssatx r7, r2, 11       @ gteSY2
    strh r6, [r1]
    strh r7, [r1, #2]
    str r4, [r0,#4*24]     @ gteMAC0
    asrs r4, #12
    movmi r4, #0
    cmp r4, #0x1000        @ limH
    movgt r4, #0x1000
    str r4, [r0,#4*8]      @ gteIR0

    pop {r4-r11,pc}
    .size gteRTPS_nf_arm, .-gteRTPS_nf_arm


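@ RTPT, no-flag variant: the same math as RTPS applied to V0-V2 in a loop;
@ lr is the loop offset (0,4,8) used to index the VXYZ, fSZ and gteSXY slots,
@ gteIR1-3 and gteMAC0/gteIR0 are left with the values of the last vertex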
.global gteRTPT_nf_arm @ r0=CP2 (d,c),
gteRTPT_nf_arm:
    ldr r1, [r0, #4*19]    @ gteSZ3
    push {r4-r11,lr}
    str r1, [r0, #4*16]    @ gteSZ0
    mov lr, #0

rtpt_arm_loop:
    add r1, r0, lr, lsl #1
    ldrd r8, [r1]          @ VXYZ(v)
    do_rtpx_mac

    ssatx_prep r6, 16
    usat16_ r2, r12        @ limD
    add r1, r0, #4*25      @ gteMAC1
    ldr r3, [r0,#4*(32+26)] @ gteH
    stmia r1, {r10-r12}    @ gteMAC123 save
    add r1, r0, #4*17
    ssatx r10,r6, 16
    ssatx r11,r6, 16
    ssatx r12,r6, 16
    str r2, [r1, lr]       @ fSZ(v)
    cmp r3, r2, lsl #1     @ gteH < gteSZ3*2 ?
    mov r9, #1<<30
    bhs 1f
.if 1
    lsl r3, #16
    udiv r9, r3, r2
.else
    push {r0, r12, lr}
    mov r0, r3
    mov r1, r2
    bl DIVIDE
    mov r9, r0
    pop {r0, r12, lr}
.endif
1:  cmp r9, #0x20000
    add r1, r0, #4*12
    movhs r9, #0x20000
    ldrd r6, [r0,#4*(32+24)] @ gteOFXY
    /* quotient */ subhs r9, #1
    mov r2, #0
    smlal r6, r2, r10, r9
    mov r3, #0
    smlal r7, r3, r11, r9
    lsr r6, #16
    orr r6, r2, lsl #16    @ (gteOFX + gteIR1 * q) >> 16
    ssatx_prep r2, 11
    lsr r7, #16
    orr r7, r3, lsl #16    @ (gteOFY + gteIR2 * q) >> 16
    ssatx r6, r2, 11       @ gteSX(v)
    ssatx r7, r2, 11       @ gteSY(v)
    strh r6, [r1, lr]!
    add lr, #4
    strh r7, [r1, #2]
    cmp lr, #12
    blt rtpt_arm_loop

    ldrd r4, [r0, #4*(32+27)] @ gteDQA, gteDQB
    add r1, r0, #4*9       @ gteIR1
    mla r3, r4, r9, r5     @ gteDQB + gteDQA * q
    stmia r1, {r10,r11,r12} @ gteIR123 save

    str r3, [r0,#4*24]     @ gteMAC0
    asrs r3, #12
    movmi r3, #0
    cmp r3, #0x1000        @ limH
    movgt r3, #0x1000
    str r3, [r0,#4*8]      @ gteIR0

    pop {r4-r11,pc}
    .size gteRTPT_nf_arm, .-gteRTPT_nf_arm


@ note: non-standard calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r1 = needs_shift12
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
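@ computes MACn = (CVn << 12) + MXn1*vx + MXn2*vy + MXn3*vz in a 64-bit
@ accumulator, shifted right by 12 when needs_shift12 is set; with
@ do_flags the positive/negative overflow bits for MAC1-3 (gteFLAG bits
@ 30-28 / 27-25, plus error bit 31) are derived from the wide sums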
.macro mvma_op do_flags
    push {r8-r11}

.if \do_flags
    ands r3, r1, #1        @ gteFLAG, shift_need
.else
    tst r1, #1
.endif
    ldmia r7, {r7-r9}      @ CV123
    ldmia r6!,{r10-r12}    @ MX1*,MX2*
    asr r1, r7, #20
    lsl r7, #12            @ expand to 64bit
    smlalbb r7, r1, r10, r4 @ MX11 * vx
    smlaltt r7, r1, r10, r4 @ MX12 * vy
    smlalbb r7, r1, r11, r5 @ MX13 * vz
    lsrne r7, #12
    orrne r7, r1, lsl #20  @ gteMAC1
.if \do_flags
    asrne r1, #20
    adds r2, r7, #0x80000000
    adcs r1, #0
    orrgt r3, #(1<<30)
    orrmi r3, #(1<<31)|(1<<27)
    tst r3, #1             @ repeat shift test
.endif
    asr r1, r8, #20
    lsl r8, #12            @ expand to 64bit
    smlaltb r8, r1, r11, r4 @ MX21 * vx
    smlalbt r8, r1, r12, r4 @ MX22 * vy
    smlaltb r8, r1, r12, r5 @ MX23 * vz
    lsrne r8, #12
    orrne r8, r1, lsl #20  @ gteMAC2
.if \do_flags
    asrne r1, #20
    adds r2, r8, #0x80000000
    adcs r1, #0
    orrgt r3, #(1<<29)
    orrmi r3, #(1<<31)|(1<<26)
    tst r3, #1             @ repeat shift test
.endif
    ldmia r6!,{r10-r11}    @ MX3*
    asr r1, r9, #20
    lsl r9, #12            @ expand to 64bit
    smlalbb r9, r1, r10, r4 @ MX31 * vx
    smlaltt r9, r1, r10, r4 @ MX32 * vy
    smlalbb r9, r1, r11, r5 @ MX33 * vz
    lsrne r9, #12
    orrne r9, r1, lsl #20  @ gteMAC3
.if \do_flags
    asrne r1, #20
    adds r2, r9, #0x80000000
    adcs r1, #0
    orrgt r3, #(1<<28)
    orrmi r3, #(1<<31)|(1<<25)
    bic r3, #1
.else
    mov r3, #0
.endif
    str r3, [r0, #4*(32+31)] @ gteFLAG
    add r1, r0, #4*25
    stmia r1, {r7-r9}

    pop {r8-r11}
    bx lr
.endm

.global gteMVMVA_part_arm
gteMVMVA_part_arm:
    mvma_op 1
    .size gteMVMVA_part_arm, .-gteMVMVA_part_arm

.global gteMVMVA_part_nf_arm
gteMVMVA_part_nf_arm:
    mvma_op 0
    .size gteMVMVA_part_nf_arm, .-gteMVMVA_part_nf_arm

@ common version of MVMVA with cv3 (== 0) and shift12;
@ it can't overflow, so no gteMAC flags are needed
@ note: non-standard calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
.global gteMVMVA_part_cv3sh12_arm
gteMVMVA_part_cv3sh12_arm:
    push {r8-r9}
    ldmia r6!,{r7-r9}      @ MX1*,MX2*
    smulbb r1, r7, r4      @ MX11 * vx
    smultt r2, r7, r4      @ MX12 * vy
    smulbb r3, r8, r5      @ MX13 * vz
    qadd r1, r1, r2
    asr r3, #1             @ prevent oflow, lose a bit
    add r1, r3, r1, asr #1
    asr r7, r1, #11
    smultb r1, r8, r4      @ MX21 * vx
    smulbt r2, r9, r4      @ MX22 * vy
    smultb r3, r9, r5      @ MX23 * vz
    qadd r1, r1, r2
    asr r3, #1
    add r1, r3, r1, asr #1
    asr r8, r1, #11
    ldmia r6, {r6,r9}      @ MX3*
    smulbb r1, r6, r4      @ MX31 * vx
    smultt r2, r6, r4      @ MX32 * vy
    smulbb r3, r9, r5      @ MX33 * vz
    qadd r1, r1, r2
    asr r3, #1
    add r1, r3, r1, asr #1
    asr r9, r1, #11
    add r1, r0, #4*25
    mov r2, #0
    stmia r1, {r7-r9}
    str r2, [r0, #4*(32+31)] @ gteFLAG
    pop {r8-r9}
    bx lr
    .size gteMVMVA_part_cv3sh12_arm, .-gteMVMVA_part_cv3sh12_arm


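@ NCLIP: MAC0 = SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1), accumulated
@ in 64 bits; if the sum doesn't fit in 32 bits signed, the result is
@ clamped and the MAC0 overflow bits (16 or 15, plus 31) are set in gteFLAG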
.global gteNCLIP_arm @ r0=CP2 (d,c),
gteNCLIP_arm:
    push {r4-r6,lr}

    add r1, r0, #4*12
    ldmia r1, {r1-r3}
    mov r4, r1, asr #16
    mov r5, r2, asr #16
    mov r6, r3, asr #16
    sub r12, r4, r5        @ 3: gteSY0 - gteSY1
    sub r5, r5, r6         @ 1: gteSY1 - gteSY2
    sgnxt16 r1
    smull r1, r5, r1, r5   @ RdLo, RdHi
    sub r6, r4             @ 2: gteSY2 - gteSY0
    sgnxt16 r2
    smlal r1, r5, r2, r6
    mov lr, #0             @ gteFLAG
    sgnxt16 r3
    smlal r1, r5, r3, r12
    mov r6, #1<<31
    orr r6, #1<<15
    movs r2, r1, lsl #1
    adc r5, r5
    cmp r5, #0
.if HAVE_ARMV7
    movtgt lr, #((1<<31)|(1<<16))>>16
.else
    movgt lr, #(1<<31)
    orrgt lr, #(1<<16)
.endif
    mvngt r1, #1<<31       @ maxint
    cmn r5, #1
    movmi r1, #1<<31       @ minint
    orrmi lr, r6
    str r1, [r0, #4*24]
    str lr, [r0, #4*(32+31)] @ gteFLAG

    pop {r4-r6,pc}
    .size gteNCLIP_arm, .-gteNCLIP_arm


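@ saturate gteMAC1-3 into gteIR1-3
@ lm=1: clamp to 0..0x7fff, lm=0: clamp to -0x8000..0x7fff
@ sets the IR1-3 saturation bits in gteFLAG (error bit 31 only for IR1/IR2)
@ in: r0 = CP2 (d,c)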
.macro gteMACtoIR lm
    ldr r2, [r0, #4*25]    @ gteMAC1
    mov r1, #1<<15
    ldr r12,[r0, #4*(32+31)] @ gteFLAG
    cmp r2, r1
    subge r2, r1, #1
    orrge r12, #(1<<31)|(1<<24)
.if \lm
    cmp r2, #0
    movlt r2, #0
.else
    cmn r2, r1
    rsblt r2, r1, #0
.endif
    str r2, [r0, #4*9]
    ldrd r2, [r0, #4*26]   @ gteMAC23
    orrlt r12, #(1<<31)|(1<<24)
    cmp r2, r1
    subge r2, r1, #1
    orrge r12, #1<<23
    orrge r12, #1<<31
.if \lm
    cmp r2, #0
    movlt r2, #0
.else
    cmn r2, r1
    rsblt r2, r1, #0
.endif
    orrlt r12, #1<<23
    orrlt r12, #1<<31
    cmp r3, r1
    subge r3, r1, #1
    orrge r12, #1<<22
.if \lm
    cmp r3, #0
    movlt r3, #0
.else
    cmn r3, r1
    rsblt r3, r1, #0
.endif
    orrlt r12, #1<<22
    strd r2, [r0, #4*10]   @ gteIR23
    str r12,[r0, #4*(32+31)] @ gteFLAG
    bx lr
.endm

.global gteMACtoIR_lm0 @ r0=CP2 (d,c)
gteMACtoIR_lm0:
    gteMACtoIR 0
    .size gteMACtoIR_lm0, .-gteMACtoIR_lm0

.global gteMACtoIR_lm1 @ r0=CP2 (d,c)
gteMACtoIR_lm1:
    gteMACtoIR 1
    .size gteMACtoIR_lm1, .-gteMACtoIR_lm1


.global gteMACtoIR_lm0_nf @ r0=CP2 (d,c)
gteMACtoIR_lm0_nf:
    add r12, r0, #4*25
    ldmia r12, {r1-r3}
    ssatx_prep r12, 16
    ssatx r1, r12, 16
    ssatx r2, r12, 16
    ssatx r3, r12, 16
    add r12, r0, #4*9
    stmia r12, {r1-r3}
    bx lr
    .size gteMACtoIR_lm0_nf, .-gteMACtoIR_lm0_nf


.global gteMACtoIR_lm1_nf @ r0=CP2 (d,c)
gteMACtoIR_lm1_nf:
    add r12, r0, #4*25
    ldmia r12, {r1-r3}
    ssatx0_prep r12, 16
    ssatx0 r1, r12, 16
    ssatx0 r2, r12, 16
    ssatx0 r3, r12, 16
    add r12, r0, #4*9
    stmia r12, {r1-r3}
    bx lr
    .size gteMACtoIR_lm1_nf, .-gteMACtoIR_lm1_nf


.if 0
.global gteMVMVA_test
gteMVMVA_test:
    push {r4-r7,lr}
    push {r1}
    and r2, r1, #0x18000   @ v
    cmp r2, #0x18000       @ v == 3?
    addeq r4, r0, #4*9
    addne r3, r0, r2, lsr #12
    ldmeqia r4, {r3-r5}
    ldmneia r3, {r4,r5}
    lsleq r3, #16
    lsreq r3, #16
    orreq r4, r3, r4, lsl #16 @ r4,r5 = VXYZ(v)
    @and r5, #0xffff
    add r12, r0, #4*32
    and r3, r1, #0x60000   @ mx
    lsr r3, #17
    add r6, r12, r3, lsl #5
    cmp r3, #3
    adreq r6, zeroes
    and r2, r1, #0x06000   @ cv
    lsr r2, #13
    add r7, r12, r2, lsl #5
    add r7, #4*5
    cmp r2, #3
    adreq r7, zeroes
.if 1
    adr lr, 1f
    bne 0f
    tst r1, #1<<19
    bne gteMVMVA_part_cv3sh12_arm
0:
    and r1, #1<<19
    lsr r1, #19
    b gteMVMVA_part_arm
1:
    pop {r1}
    tst r1, #1<<10
    adr lr, 0f
    beq gteMACtoIR_lm0
    bne gteMACtoIR_lm1
0:
.else
    bl gteMVMVA_part_neon
    pop {r1}
    and r1, #1<<10
    bl gteMACtoIR_flags_neon
.endif
    pop {r4-r7,pc}

zeroes:
    .word 0,0,0,0,0
.endif


@ vim:filetype=armasm