@ pcsx_rearmed: libpcsxcore/gte_arm.s
/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */

/* .equiv HAVE_ARMV7, 1 */

.text
.align 2

.macro sgnxt16 rd rs
.if HAVE_ARMV7
    sxth    \rd, \rs
.else
    lsl     \rd, \rs, #16
    asr     \rd, \rd, #16
.endif
.endm

@ prepare work reg for ssatx
@ in: wr reg, bit to saturate to
.macro ssatx_prep wr bit
.if !HAVE_ARMV7
    mov     \wr, #(1<<(\bit-1))
.endif
.endm

.macro ssatx rd wr bit
.if HAVE_ARMV7
    ssat    \rd, #\bit, \rd
.else
    cmp     \rd, \wr
    subge   \rd, \wr, #1
    cmn     \rd, \wr
    rsblt   \rd, \wr, #0
.endif
.endm
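
@ Reference semantics of ssatx, as a hedged C sketch (not from the original
@ source): clamp to the signed range of \bit bits, like the ARMv7 SSAT
@ instruction that the fast path uses.
@   int32_t ssatx_ref(int32_t v, int bit) {
@       int32_t lim = 1 << (bit - 1);   /* e.g. 0x8000 for bit=16 */
@       if (v >= lim)  return lim - 1;  /* upper clamp:  2^(bit-1)-1 */
@       if (v < -lim)  return -lim;     /* lower clamp: -2^(bit-1)   */
@       return v;
@   }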

@ prepare work reg for ssatx0 (sat to 0..2^(bit-1)-1)
@ in: wr reg, bit to saturate to
.macro ssatx0_prep wr bit
    mov     \wr, #(1<<(\bit-1))
.endm

.macro ssatx0 rd wr bit
    cmp     \rd, \wr
    subge   \rd, \wr, #1
    cmn     \rd, #0
    movlt   \rd, #0
.endm
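
@ ssatx0 reference semantics (a sketch, not from the source): clamp to
@ 0..2^(bit-1)-1, the non-negative variant used by gteMACtoIR_lm1_nf below.
@   int32_t ssatx0_ref(int32_t v, int bit) {
@       int32_t lim = 1 << (bit - 1);
@       if (v >= lim) v = lim - 1;      /* upper clamp */
@       if (v < 0)    v = 0;            /* lower clamp */
@       return v;
@   }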

.macro usat16_ rd rs
.if HAVE_ARMV7
    usat    \rd, #16, \rs
.else
    subs    \rd, \rs, #0
    movlt   \rd, #0
    cmp     \rd, #0x10000
    movge   \rd, #0x0ff00
    orrge   \rd, #0x000ff
.endif
.endm
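
@ usat16_ reference semantics (a sketch): unsigned saturate to 0..0xffff,
@ like ARMv7 USAT with a width of 16.
@   uint32_t usat16_ref(int32_t v) {
@       if (v < 0)        return 0;
@       if (v >= 0x10000) return 0xffff;
@       return (uint32_t)v;
@   }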

.macro udiv_ rd rm rs
    lsl     \rm, #16
    clz     \rd, \rs
    lsl     \rs, \rs, \rd       @ shift up divisor
    orr     \rd, \rd, #1<<31
    lsr     \rd, \rd, \rd
0:
    cmp     \rm, \rs
    subcs   \rm, \rs
    adcs    \rd, \rd, \rd
    lsr     \rs, #1
    bcc     0b
.endm
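
@ udiv_ reference semantics, a hedged C sketch (illustration only):
@   uint32_t udiv_ref(uint32_t rm, uint32_t rs) {   /* 16.16 quotient */
@       return (uint32_t)(((uint64_t)rm << 16) / rs);
@   }
@ Mechanically it is shift-subtract restoring division: clz normalizes the
@ divisor so the loop only runs for significant quotient bits, and the
@ sentinel bit seeded from 1<<31 eventually shifts out of \rd into carry,
@ which is what terminates the loop.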

.macro newton_step rcp den zero t1 t2
    umull   \t2, \t1, \den, \rcp @ \t2 is dummy
    sub     \t1, \zero, \t1, lsl #2
    smlal   \t2, \rcp, \t1, \rcp
.endm
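
@ newton_step is one Newton-Raphson iteration for a fixed-point reciprocal:
@ x' = x*(2 - d*x), roughly doubling the number of correct bits per pass.
@ The same recurrence in floating point, as a sketch (illustration only):
@   double newton_step_ref(double d, double x) {
@       return x * (2.0 - d * x);       /* converges to 1/d */
@   }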

.macro udiv_newton rd rm rs t1 t2 t3 t4
    lsl     \rd, \rm, #16
    clz     \t1, \rs
    mov     \t2, #0
    lsl     \rs, \t1            @ normalize for the algo
    mov     \rm, #0x4d000000    @ initial estimate ~1.2

    newton_step \rm, \rs, \t2, \t3, \t4
    newton_step \rm, \rs, \t2, \t3, \t4
    newton_step \rm, \rs, \t2, \t3, \t4
    newton_step \rm, \rs, \t2, \t3, \t4

    umull   \t4, \rd, \rm, \rd
    rsb     \t2, \t1, #30       @ here t1 is 1..15
    mov     \rd, \rd, lsr \t2
.endm

@ unsigned divide rd = rm / rs; 16.16 result
@ no div by 0 check
@ in: rm, rs
@ trash: rm rs t*
.macro udiv rd rm rs t1 t2 t3 t4
    @udiv_  \rd, \rm, \rs
    udiv_newton \rd, \rm, \rs, \t1, \t2, \t3, \t4
.endm
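
@ How the RTPS/RTPT code below uses udiv, as a hedged sketch mirroring the
@ checks around the call sites (gte_divide_ref is a made-up name):
@   uint32_t gte_divide_ref(uint32_t H, uint32_t SZ3) {
@       if (H >= SZ3 * 2)                    /* overflow: saturated q */
@           return 0x1ffff;
@       uint32_t q = ((uint32_t)H << 16) / SZ3;
@       return q > 0x1ffff ? 0x1ffff : q;    /* clamp 16.16 quotient */
@   }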

@ calculate RTPS/RTPT MAC values
@ in: r0 context, r8,r9 VXYZ
@ out: r10-r12 MAC123
@ trash: r1-r7
.macro do_rtpx_mac
    add     r1, r0, #4*32
    add     r2, r0, #4*(32+5)   @ gteTRX
    ldmia   r1!,{r5-r7}         @ gteR1*,gteR2*
    ldmia   r2, {r10-r12}
    smulbb  r2, r5, r8          @ gteR11 * gteVX0
    smultt  r3, r5, r8          @ gteR12 * gteVY0
    smulbb  r4, r6, r9          @ gteR13 * gteVZ0
    qadd    r2, r2, r3
    asr     r4, r4, #1          @ prevent oflow, lose a bit
    add     r3, r4, r2, asr #1
    add     r10,r10,r3, asr #11 @ gteMAC1
    smultb  r2, r6, r8          @ gteR21 * gteVX0
    smulbt  r3, r7, r8          @ gteR22 * gteVY0
    smultb  r4, r7, r9          @ gteR23 * gteVZ0
    ldmia   r1!,{r5-r6}         @ gteR3*
    qadd    r2, r2, r3
    asr     r4, r4, #1
    add     r3, r4, r2, asr #1
    add     r11,r11,r3, asr #11 @ gteMAC2
    @ be more accurate for gteMAC3, since it's also used as the divisor
    smulbb  r2, r5, r8          @ gteR31 * gteVX0
    smultt  r3, r5, r8          @ gteR32 * gteVY0
    smulbb  r4, r6, r9          @ gteR33 * gteVZ0
    qadd    r2, r2, r3
    asr     r3, r4, #31         @ expand to 64bit
    adds    r1, r2, r4
    adc     r3, r2, asr #31     @ 64bit sum in r3,r1
    add     r12,r12,r3, lsl #20
    add     r12,r12,r1, lsr #12 @ gteMAC3
.endm
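
@ do_rtpx_mac in C terms, per row i (hedged sketch, not from the source):
@   MACi = TRi + ((Ri1*VX + Ri2*VY + Ri3*VZ) >> 12)
@ Rows 1-2 halve the products (qadd + asr #1) for overflow headroom at the
@ cost of one low bit; row 3 keeps a full 64-bit sum because MAC3 becomes
@ SZ3, the divisor of the projection.
@   int32_t rtp_mac_row_ref(int32_t tr, int16_t r1, int16_t r2, int16_t r3,
@                           int16_t vx, int16_t vy, int16_t vz) {
@       int64_t s = (int64_t)r1*vx + (int64_t)r2*vy + (int64_t)r3*vz;
@       return tr + (int32_t)(s >> 12);
@   }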


.global gteRTPS_nf_arm @ r0=CP2 (d,c),
gteRTPS_nf_arm:
    push    {r4-r11,lr}

    ldmia   r0, {r8,r9}         @ VXYZ(0)
    do_rtpx_mac
    add     r1, r0, #4*25       @ gteMAC1
    add     r2, r0, #4*17       @ gteSZ1
    stmia   r1, {r10-r12}       @ gteMAC123 save
    ldmia   r2, {r3-r5}
    add     r1, r0, #4*16       @ gteSZ0
    add     r2, r0, #4*9        @ gteIR1
    ssatx_prep r6, 16
    usat16_ lr, r12             @ limD
    ssatx   r10,r6, 16
    ssatx   r11,r6, 16
    ssatx   r12,r6, 16
    stmia   r1, {r3-r5,lr}      @ gteSZ*
    ldr     r3, [r0,#4*(32+26)] @ gteH
    stmia   r2, {r10,r11,r12}   @ gteIR123 save
    cmp     r3, lr, lsl #1      @ gteH < gteSZ3*2 ?
    mov     r9, #1<<30
    bhs     1f
.if 1
    udiv    r9, r3, lr, r1, r2, r6, r7
.else
    push    {r0, r12}
    mov     r0, r3
    mov     r1, lr
    bl      DIVIDE
    mov     r9, r0
    pop     {r0, r12}
.endif
1:
    ldrd    r6, [r0,#4*(32+24)] @ gteOFXY
    cmp     r9, #0x20000
    add     r1, r0, #4*12       @ gteSXY0
    movhs   r9, #0x20000
    ldmia   r1, {r2-r4}
    /* quotient */ subhs r9, #1
    mov     r2, #0
    smlal   r6, r2, r10, r9
    stmia   r1!,{r3,r4}         @ shift gteSXY
    mov     r3, #0
    smlal   r7, r3, r11, r9
    lsr     r6, #16
    /* gteDQA, gteDQB */ ldrd r10,[r0, #4*(32+27)]
    orr     r6, r2, lsl #16     @ (gteOFX + gteIR1 * q) >> 16
    ssatx_prep r2, 11
    lsr     r7, #16
    /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11
    orr     r7, r3, lsl #16     @ (gteOFY + gteIR2 * q) >> 16
    ssatx   r6, r2, 11          @ gteSX2
    ssatx   r7, r2, 11          @ gteSY2
    strh    r6, [r1]
    strh    r7, [r1, #2]
    str     r4, [r0,#4*24]      @ gteMAC0
    asrs    r4, #12
    movmi   r4, #0
    cmp     r4, #0x1000         @ limH
    movgt   r4, #0x1000
    str     r4, [r0,#4*8]       @ gteIR0

    pop     {r4-r11,pc}
    .size   gteRTPS_nf_arm, .-gteRTPS_nf_arm
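
@ gteRTPS flow in brief (hedged summary of the code above, C-like pseudocode):
@   SZ3  = usat16(MAC3);
@   q    = (H < SZ3*2) ? min(H*0x10000/SZ3, 0x1ffff) : 0x1ffff;
@   SX2  = ssat11((OFX + IR1*q) >> 16);
@   SY2  = ssat11((OFY + IR2*q) >> 16);
@   MAC0 = DQB + DQA*q;
@   IR0  = clamp(MAC0 >> 12, 0, 0x1000);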


.global gteRTPT_nf_arm @ r0=CP2 (d,c),
gteRTPT_nf_arm:
    ldr     r1, [r0, #4*19]     @ gteSZ3
    push    {r4-r11,lr}
    str     r1, [r0, #4*16]     @ gteSZ0
    mov     lr, #0

rtpt_arm_loop:
    add     r1, r0, lr, lsl #1
    ldrd    r8, [r1]            @ VXYZ(v)
    do_rtpx_mac

    ssatx_prep r6, 16
    usat16_ r2, r12             @ limD
    add     r1, r0, #4*25       @ gteMAC1
    ldr     r3, [r0,#4*(32+26)] @ gteH
    stmia   r1, {r10-r12}       @ gteMAC123 save
    add     r1, r0, #4*17
    ssatx   r10,r6, 16
    ssatx   r11,r6, 16
    ssatx   r12,r6, 16
    str     r2, [r1, lr]        @ fSZ(v)
    cmp     r3, r2, lsl #1      @ gteH < gteSZ3*2 ?
    mov     r9, #1<<30
    bhs     1f
.if 1
    udiv    r9, r3, r2, r1, r4, r6, r7
.else
    push    {r0, r12, lr}
    mov     r0, r3
    mov     r1, r2
    bl      DIVIDE
    mov     r9, r0
    pop     {r0, r12, lr}
.endif
1:  cmp     r9, #0x20000
    add     r1, r0, #4*12
    movhs   r9, #0x20000
    ldrd    r6, [r0,#4*(32+24)] @ gteOFXY
    /* quotient */ subhs r9, #1
    mov     r2, #0
    smlal   r6, r2, r10, r9
    mov     r3, #0
    smlal   r7, r3, r11, r9
    lsr     r6, #16
    orr     r6, r2, lsl #16     @ (gteOFX + gteIR1 * q) >> 16
    ssatx_prep r2, 11
    lsr     r7, #16
    orr     r7, r3, lsl #16     @ (gteOFY + gteIR2 * q) >> 16
    ssatx   r6, r2, 11          @ gteSX(v)
    ssatx   r7, r2, 11          @ gteSY(v)
    strh    r6, [r1, lr]!
    add     lr, #4
    strh    r7, [r1, #2]
    cmp     lr, #12
    blt     rtpt_arm_loop

    ldrd    r4, [r0, #4*(32+27)] @ gteDQA, gteDQB
    add     r1, r0, #4*9        @ gteIR1
    mla     r3, r4, r9, r5      @ gteDQB + gteDQA * q
    stmia   r1, {r10,r11,r12}   @ gteIR123 save

    str     r3, [r0,#4*24]      @ gteMAC0
    asrs    r3, #12
    movmi   r3, #0
    cmp     r3, #0x1000         @ limH
    movgt   r3, #0x1000
    str     r3, [r0,#4*8]       @ gteIR0

    pop     {r4-r11,pc}
    .size   gteRTPT_nf_arm, .-gteRTPT_nf_arm


@ note: non-standard calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r1 = needs_shift12
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
.macro mvma_op do_flags
    push    {r8-r11}

.if \do_flags
    ands    r3, r1, #1          @ gteFLAG, shift_need
.else
    tst     r1, #1
.endif
    ldmia   r7, {r7-r9}         @ CV123
    ldmia   r6!,{r10-r12}       @ MX1*,MX2*
    asr     r1, r7, #20
    lsl     r7, #12             @ expand to 64bit
    smlalbb r7, r1, r10, r4     @ MX11 * vx
    smlaltt r7, r1, r10, r4     @ MX12 * vy
    smlalbb r7, r1, r11, r5     @ MX13 * vz
    lsrne   r7, #12
    orrne   r7, r1, lsl #20     @ gteMAC1
.if \do_flags
    asrne   r1, #20
    adds    r2, r7, #0x80000000
    adcs    r1, #0
    orrgt   r3, #(1<<30)
    orrmi   r3, #(1<<31)|(1<<27)
    tst     r3, #1              @ repeat shift test
.endif
    asr     r1, r8, #20
    lsl     r8, #12             @ expand to 64bit
    smlaltb r8, r1, r11, r4     @ MX21 * vx
    smlalbt r8, r1, r12, r4     @ MX22 * vy
    smlaltb r8, r1, r12, r5     @ MX23 * vz
    lsrne   r8, #12
    orrne   r8, r1, lsl #20     @ gteMAC2
.if \do_flags
    asrne   r1, #20
    adds    r2, r8, #0x80000000
    adcs    r1, #0
    orrgt   r3, #(1<<29)
    orrmi   r3, #(1<<31)|(1<<26)
    tst     r3, #1              @ repeat shift test
.endif
    ldmia   r6!,{r10-r11}       @ MX3*
    asr     r1, r9, #20
    lsl     r9, #12             @ expand to 64bit
    smlalbb r9, r1, r10, r4     @ MX31 * vx
    smlaltt r9, r1, r10, r4     @ MX32 * vy
    smlalbb r9, r1, r11, r5     @ MX33 * vz
    lsrne   r9, #12
    orrne   r9, r1, lsl #20     @ gteMAC3
.if \do_flags
    asrne   r1, #20
    adds    r2, r9, #0x80000000
    adcs    r1, #0
    orrgt   r3, #(1<<28)
    orrmi   r3, #(1<<31)|(1<<25)
    bic     r3, #1
.else
    mov     r3, #0
.endif
    str     r3, [r0, #4*(32+31)] @ gteFLAG
    add     r1, r0, #4*25
    stmia   r1, {r7-r9}

    pop     {r8-r11}
    bx      lr
.endm
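
@ mvma_op in C terms, per row i (hedged sketch, not from the source):
@   int64_t mvma_row_ref(int32_t cv, int16_t m1, int16_t m2, int16_t m3,
@                        int16_t vx, int16_t vy, int16_t vz, int shift12) {
@       int64_t s = ((int64_t)cv << 12)
@                 + (int64_t)m1*vx + (int64_t)m2*vy + (int64_t)m3*vz;
@       return shift12 ? s >> 12 : s;   /* low 32 bits -> gteMACi */
@   }
@ With do_flags, each row also checks the (possibly shifted) result against
@ the signed 32-bit range and sets the MACi overflow bits in gteFLAG
@ (positive: bit 30/29/28; negative: bit 27/26/25 together with bit 31).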

.global gteMVMVA_part_arm
gteMVMVA_part_arm:
    mvma_op 1
    .size   gteMVMVA_part_arm, .-gteMVMVA_part_arm

.global gteMVMVA_part_nf_arm
gteMVMVA_part_nf_arm:
    mvma_op 0
    .size   gteMVMVA_part_nf_arm, .-gteMVMVA_part_nf_arm

@ common version of MVMVA with cv3 (== 0) and shift12,
@ can't overflow so no gteMAC flags needed
@ note: non-standard calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
.global gteMVMVA_part_cv3sh12_arm
gteMVMVA_part_cv3sh12_arm:
    push    {r8-r9}
    ldmia   r6!,{r7-r9}         @ MX1*,MX2*
    smulbb  r1, r7, r4          @ MX11 * vx
    smultt  r2, r7, r4          @ MX12 * vy
    smulbb  r3, r8, r5          @ MX13 * vz
    qadd    r1, r1, r2
    asr     r3, #1              @ prevent oflow, lose a bit
    add     r1, r3, r1, asr #1
    asr     r7, r1, #11
    smultb  r1, r8, r4          @ MX21 * vx
    smulbt  r2, r9, r4          @ MX22 * vy
    smultb  r3, r9, r5          @ MX23 * vz
    qadd    r1, r1, r2
    asr     r3, #1
    add     r1, r3, r1, asr #1
    asr     r8, r1, #11
    ldmia   r6, {r6,r9}         @ MX3*
    smulbb  r1, r6, r4          @ MX31 * vx
    smultt  r2, r6, r4          @ MX32 * vy
    smulbb  r3, r9, r5          @ MX33 * vz
    qadd    r1, r1, r2
    asr     r3, #1
    add     r1, r3, r1, asr #1
    asr     r9, r1, #11
    add     r1, r0, #4*25
    mov     r2, #0
    stmia   r1, {r7-r9}
    str     r2, [r0, #4*(32+31)] @ gteFLAG
    pop     {r8-r9}
    bx      lr
    .size   gteMVMVA_part_cv3sh12_arm, .-gteMVMVA_part_cv3sh12_arm


.global gteNCLIP_arm @ r0=CP2 (d,c),
gteNCLIP_arm:
    push    {r4-r6,lr}
    ldrsh   r4, [r0, #4*12+2]
    ldrsh   r5, [r0, #4*13+2]
    ldrsh   r6, [r0, #4*14+2]
    ldrsh   lr, [r0, #4*12]
    ldrsh   r2, [r0, #4*13]
    sub     r12, r4, r5         @ 3: gteSY0 - gteSY1
    sub     r5, r5, r6          @ 1: gteSY1 - gteSY2
    smull   r1, r5, lr, r5      @ RdLo, RdHi
    sub     r6, r4              @ 2: gteSY2 - gteSY0
    ldrsh   r3, [r0, #4*14]
    smlal   r1, r5, r2, r6
    mov     lr, #0              @ gteFLAG
    smlal   r1, r5, r3, r12
    mov     r6, #1<<31
    orr     r6, #1<<15
    movs    r2, r1, lsl #1
    adc     r5, r5
    cmp     r5, #0
.if HAVE_ARMV7
    movtgt  lr, #((1<<31)|(1<<16))>>16
.else
    movgt   lr, #(1<<31)
    orrgt   lr, #(1<<16)
.endif
    cmn     r5, #1
    orrmi   lr, r6
    str     r1, [r0, #4*24]
    str     lr, [r0, #4*(32+31)] @ gteFLAG

    pop     {r4-r6,pc}
    .size   gteNCLIP_arm, .-gteNCLIP_arm
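
@ gteNCLIP computes the doubled signed area (winding) of the screen
@ triangle; a hedged C sketch of the formula:
@   int64_t nclip_ref(int16_t sx0, int16_t sy0, int16_t sx1, int16_t sy1,
@                     int16_t sx2, int16_t sy2) {
@       return (int64_t)sx0*(sy1 - sy2) + (int64_t)sx1*(sy2 - sy0)
@            + (int64_t)sx2*(sy0 - sy1);
@   }
@ The low 32 bits go to gteMAC0; if the 64-bit value does not fit in a
@ signed 32-bit word, FLAG bit 16 or 15 is set (with bit 31 as above).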


.macro gteMACtoIR lm
    ldr     r2, [r0, #4*25]     @ gteMAC1
    mov     r1, #1<<15
    ldr     r12,[r0, #4*(32+31)] @ gteFLAG
    cmp     r2, r1
    subge   r2, r1, #1
    orrge   r12, #(1<<31)|(1<<24)
.if \lm
    cmp     r2, #0
    movlt   r2, #0
.else
    cmn     r2, r1
    rsblt   r2, r1, #0
.endif
    str     r2, [r0, #4*9]
    ldrd    r2, [r0, #4*26]     @ gteMAC23
    orrlt   r12, #(1<<31)|(1<<24)
    cmp     r2, r1
    subge   r2, r1, #1
    orrge   r12, #1<<23
    orrge   r12, #1<<31
.if \lm
    cmp     r2, #0
    movlt   r2, #0
.else
    cmn     r2, r1
    rsblt   r2, r1, #0
.endif
    orrlt   r12, #1<<23
    orrlt   r12, #1<<31
    cmp     r3, r1
    subge   r3, r1, #1
    orrge   r12, #1<<22
.if \lm
    cmp     r3, #0
    movlt   r3, #0
.else
    cmn     r3, r1
    rsblt   r3, r1, #0
.endif
    orrlt   r12, #1<<22
    strd    r2, [r0, #4*10]     @ gteIR23
    str     r12,[r0, #4*(32+31)] @ gteFLAG
    bx      lr
.endm
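
@ gteMACtoIR in C terms (hedged sketch): clamp each MACn into IRn with
@ limit -0x8000..0x7fff, or 0..0x7fff when lm is set; saturation raises
@ FLAG bit 24/23/22 for IR1/2/3, with master bit 31 for IR1/IR2 only,
@ as in the macro above.
@   int32_t mac_to_ir_ref(int32_t mac, int lm, uint32_t *flag, int n) {
@       uint32_t f = (n == 3) ? (1u << 22)
@                             : (1u << (25 - n)) | (1u << 31);
@       int32_t lo = lm ? 0 : -0x8000;
@       if (mac > 0x7fff) { *flag |= f; return 0x7fff; }
@       if (mac < lo)     { *flag |= f; return lo; }
@       return mac;
@   }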

.global gteMACtoIR_lm0 @ r0=CP2 (d,c)
gteMACtoIR_lm0:
    gteMACtoIR 0
    .size   gteMACtoIR_lm0, .-gteMACtoIR_lm0

.global gteMACtoIR_lm1 @ r0=CP2 (d,c)
gteMACtoIR_lm1:
    gteMACtoIR 1
    .size   gteMACtoIR_lm1, .-gteMACtoIR_lm1


.global gteMACtoIR_lm0_nf @ r0=CP2 (d,c)
gteMACtoIR_lm0_nf:
    add     r12, r0, #4*25
    ldmia   r12, {r1-r3}
    ssatx_prep r12, 16
    ssatx   r1, r12, 16
    ssatx   r2, r12, 16
    ssatx   r3, r12, 16
    add     r12, r0, #4*9
    stmia   r12, {r1-r3}
    bx      lr
    .size   gteMACtoIR_lm0_nf, .-gteMACtoIR_lm0_nf


.global gteMACtoIR_lm1_nf @ r0=CP2 (d,c)
gteMACtoIR_lm1_nf:
    add     r12, r0, #4*25
    ldmia   r12, {r1-r3}
    ssatx0_prep r12, 16
    ssatx0  r1, r12, 16
    ssatx0  r2, r12, 16
    ssatx0  r3, r12, 16
    add     r12, r0, #4*9
    stmia   r12, {r1-r3}
    bx      lr
    .size   gteMACtoIR_lm1_nf, .-gteMACtoIR_lm1_nf


.if 0
.global gteMVMVA_test
gteMVMVA_test:
    push    {r4-r7,lr}
    push    {r1}
    and     r2, r1, #0x18000    @ v
    cmp     r2, #0x18000        @ v == 3?
    addeq   r4, r0, #4*9
    addne   r3, r0, r2, lsr #12
    ldmeqia r4, {r3-r5}
    ldmneia r3, {r4,r5}
    lsleq   r3, #16
    lsreq   r3, #16
    orreq   r4, r3, r4, lsl #16 @ r4,r5 = VXYZ(v)
    @and    r5, #0xffff
    add     r12, r0, #4*32
    and     r3, r1, #0x60000    @ mx
    lsr     r3, #17
    add     r6, r12, r3, lsl #5
    cmp     r3, #3
    adreq   r6, zeroes
    and     r2, r1, #0x06000    @ cv
    lsr     r2, #13
    add     r7, r12, r2, lsl #5
    add     r7, #4*5
    cmp     r2, #3
    adreq   r7, zeroes
.if 1
    adr     lr, 1f
    bne     0f
    tst     r1, #1<<19
    bne     gteMVMVA_part_cv3sh12_arm
0:
    and     r1, #1<<19
    lsr     r1, #19
    b       gteMVMVA_part_arm
1:
    pop     {r1}
    tst     r1, #1<<10
    adr     lr, 0f
    beq     gteMACtoIR_lm0
    bne     gteMACtoIR_lm1
0:
.else
    bl      gteMVMVA_part_neon
    pop     {r1}
    and     r1, #1<<10
    bl      gteMACtoIR_flags_neon
.endif
    pop     {r4-r7,pc}

zeroes:
    .word   0,0,0,0,0
.endif


@ vim:filetype=armasm