drc: try even more to not compile code as 64bit
[pcsx_rearmed.git] / libpcsxcore / gte_neon.s
CommitLineData
8cfbda97 1/*
 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
4 * This work is licensed under the terms of any of these licenses
5 * (at your option):
6 * - GNU GPL, version 2 or later.
7 * - GNU LGPL, version 2.1 or later.
8 * See the COPYING file in the top-level directory.
9 */
10
11
12.bss
13.align 6 @ cacheline
14
15scratch:
16.rept 8*8*2/4
17 .word 0
18.endr
19
20.text
21.align 2
22
5d8e3bf8 23@ approximate gteMAC|123 flags
24@ in: rr 123 as gteMAC|123
25@ trash: nothing
26.macro do_mac_flags rr1 rr2 rr3
27 cmp \rr1, #1
28 orrvs lr, #(1<<31)|(1<<27)
29 cmp \rr2, #1
30 orrvs lr, #(1<<31)|(1<<26)
31 cmp \rr3, #1
32 orrvs lr, #(1<<31)|(1<<25)
33 cmn \rr1, #1 @ same as adds ...
34 orrvs lr, #(1<<30)
35 cmn \rr2, #1
36 orrvs lr, #(1<<29)
37 cmn \rr3, #1
38 orrvs lr, #(1<<28)
39.endm
40
41@ approximate 3x gteMACn flags
42@ in: rr 123 as 3 instances gteMACn, *flags
43@ trash: nothing
44.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
45 cmp \rr1, #1
46 cmpvc \rr2, #1
47 cmpvc \rr3, #1
48 orrvs lr, #\nflags
49 cmn \rr1, #1 @ adds ...
50 cmnvc \rr2, #1
51 cmnvc \rr3, #1
52 orrvs lr, #\pflags
53.endm
54
17ed0d69 55@ get gteIR|123 flags from gteMAC|123
56@ in: rr 123 as gteMAC|123
5d8e3bf8 57@ trash: r2,r3
58.macro do_irs_flags rr1 rr2 rr3
17ed0d69 59 add r2, \rr1, #0x8000
60 add r3, \rr2, #0x8000
61 lsrs r2, #16
62 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
63 lsrs r3, #16
64 add r2, \rr3, #0x8000
65 orrne lr, #(1<<31)
66 orrne lr, #(1<<23) @ IR2/limB2
67 lsrs r2, #16
68 orrne lr, #(1<<22) @ IR3/limB3
8cfbda97 69.endm
70
71
72/*
5d8e3bf8 73 * RTPS/RTPT register map:
74 *
8cfbda97 75 * q | d | c code / phase 1 phase 2 scratch
76 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
77 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
78 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
79 * 3 * gteIR1-3 = gteIR1-3 /
80 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
81 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
82 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
83 * 7 0 gteDQB [s64] max gteMAC|12
84 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
85 * 9 * / gteMAC3 max gteIR|123
86 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
87 * 11 0 quotient 3
88 * 6 12 gteH (adj. for cmp)
89 * 13 gteH (float for div)
90 * ... <scratch>
91 * 15 30 0
92 * 31 0
93 */
8cfbda97 94
5d8e3bf8 95@ load gteR*, gteTR* and gteH (see map above), clear q15
96@ in: r0 - context
97@ trash: r3
98.macro rtpx_preload
8cfbda97 99 add r3, r0, #4*32
100 vldmia r3, {d0-d2} @ gteR* [16*9]
4cc3050c 101 vmov.i32 q15, #0
8cfbda97 102 add r3, r0, #4*(32+5)
103 vldmia r3, {d4-d5} @ gteTR*
4cc3050c 104 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
105 vext.16 d1, d0, d1, #3 @ xx32 -> x321
8cfbda97 106 add r3, r0, #4*(32+26)
107 vld1.32 d11[0], [r3] @ gteH
5d8e3bf8 108 vshll.s32 q3, d5, #12 @ gteTRZ
109 vshll.s32 q2, d4, #12 @ gteTR|XY
8cfbda97 110 vmovl.s16 q6, d11 @ gteH
5d8e3bf8 111.endm
8cfbda97 112
5d8e3bf8 113@ do RTP* gteMAC* calculation
114@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
115@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
116@ trash: d16-d21
117.macro rtpx_mac
8cfbda97 118 vmull.s16 q8, d0, d8
119 vmull.s16 q9, d1, d8
120 vmull.s16 q10, d2, d8
121 vpaddl.s32 q8, q8
122 vpaddl.s32 q9, q9
123 vpaddl.s32 q10, q10
124 vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
125 vadd.s64 d18, d19 @ d8[3]==0, so won't affect
126 vadd.s64 d20, d21 @ QC
127 vadd.s64 d16, d4
128 vadd.s64 d18, d5
129 vadd.s64 d20, d6
130 vqshrn.s64 d8, q8, #12 @ gteMAC1
131 vqshrn.s64 d18, q9, #12 @ gteMAC2
132 vqshrn.s64 d9, q10, #12 @ gteMAC3
133 vsli.u64 d8, d18, #32 @ gteMAC|12
134 vmov.32 d9[1], r12
5d8e3bf8 135 vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
136.endm
137
138.global gteRTPS_neon @ r0=CP2 (d,c),
139gteRTPS_neon:
17ed0d69 140 push {r4-r6,lr}
5d8e3bf8 141
142@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
143 movw r1, #:lower16:scratch
144 movt r1, #:upper16:scratch
145 mov r12, #0
146
4cc3050c 147 vldmia r0, {d8} @ VXYZ(0)
5d8e3bf8 148 rtpx_preload
149
4cc3050c 150@ rtpx_mac @ slower here, faster in RTPT?
5d8e3bf8 151 vmov.16 d8[3], r12 @ kill unused upper vector
4cc3050c 152 vmull.s16 q8, d0, d8
153 vmull.s16 q9, d1, d8
154 vmull.s16 q10, d2, d8
155 vpadd.s32 d16, d16, d17
156 vpadd.s32 d17, d18, d19
157 vpadd.s32 d18, d20, d21
158 vpadal.s32 q2, q8
159 vpadal.s32 q3, q9 @ d6, d18 is slow?
160 vqshrn.s64 d8, q2, #12 @ gteMAC|12
161 vqshrn.s64 d9, q3, #12 @ gteMAC3
5d8e3bf8 162
163 add r3, r0, #4*25
164 vst1.32 d8, [r3]!
165 vst1.32 d9[0], [r3] @ wb gteMAC|123
4cc3050c 166 vqmovn.s32 d10, q4 @ gteIR|123
5d8e3bf8 167
168 add r3, r0, #4*17 @ gteSZ*
169 vldmia r3, {q7} @ d14,d15 gteSZ|123x
170 vmov.i32 d28, #0xffff @ 0xffff[32]
171 vmax.s32 d11, d9, d31
172 vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp)
173 vmov.i32 d26, #1
174 vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3
4cc3050c 175 vmovl.s16 q9, d10 @ || expand gteIR|123
5d8e3bf8 176 vshl.u32 d13, d12, #16 @ | preparing gteH
177 add r3, r0, #4*9
178 vst1.32 d18, [r3]!
179 vst1.32 d19[0], [r3]
180
181 vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7
182 vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3?
183
184 add r3, r0, #4*(32+24)
185 vld1.32 d4, [r3] @ || gteOF|XY
186 add r3, r0, #4*(32+27)
187 vld1.32 d6, [r3] @ || gteDQ|AB
188
189 vand d11, d16
190 vmovl.s32 q2, d4 @ || gteOF|XY [64]
191 vmax.u32 d11, d26 @ make divisor 1 if not
192 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
193 add r3, r0, #4*16 @ | gteSZ*
194 vstmia r3, {q7} @ | d14,d15 gteSZ|123x
195
196 vcvt.f32.u32 d13, d13 @ gteH (float for div)
197 vcvt.f32.u32 d11, d11 @ divisor
198
199 @ divide.. it's not worth messing with reciprocals here
200 @ just for 1 value, let's just use VFP divider here
201 vdiv.f32 s22, s26, s22
202
203 vcvt.u32.f32 d11, d11 @ quotient
204
205 @ while NEON's busy we calculate some flags on ARM
206 add r3, r0, #4*25
207 mov lr, #0 @ gteFLAG
208 ldmia r3, {r4-r6} @ gteMAC|123
209
4cc3050c 210 vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE)
5d8e3bf8 211 vqshl.u32 d11, #15
212
213 do_mac_flags r4, r5, r6
214
215 vshr.u32 d11, #15 @ quotient (limE)
5d8e3bf8 216
5d8e3bf8 217 do_irs_flags r4, r5, r6
218
219 vmlal.s32 q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient
220 add r3, r0, #4*13
221 vld1.32 d16, [r3] @ || load fS|XY12, new 01
222 vqmovn.s64 d18, q2 @ saturate to 32
223 vmull.s32 q10, d6, d11[0]@ | d20 = gteDQA * quotient
224 vqshl.s32 d19, d18, #5 @ 11bit precision
225
226 ldr r4, [r1] @ quotient
5d8e3bf8 227 movs r3, r6, lsr #16
228 orrne lr, #(1<<31)
229 orrne lr, #(1<<18) @ fSZ (limD)
230
17ed0d69 231 vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG
232
5d8e3bf8 233 vshr.s32 d18, d19, #16+5@ can't vqshrn because of insn
234 vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient
235 vmovn.s32 d18, q9 @ fS|XY2 [s16]
236
237 vqshrn.s64 d20, q10, #12 @ | gteMAC0
238 add r3, r0, #4*12
239 vst1.32 d16, [r3]! @ writeback fS|XY01
240 vst1.32 d18[0], [r3] @ ...2
241 add r3, r0, #4*24
242 vst1.32 d20[0], [r3] @ gteMAC0
243
17ed0d69 244 movs r4, r4, lsr #17
245 orrne lr, #(1<<31)
246 orrne lr, #(1<<17) @ limE
247
5d8e3bf8 248 vmax.s32 d21, d20, d31
249 vmov.i32 d22, #0x1000
250 vmin.s32 d21, d22
251 add r3, r0, #4*8
252 vst1.16 d21[0], [r3] @ gteIR0
253
17ed0d69 254 ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat
255 add r2, r4, #0x400<<16
256 add r3, r5, #0x400<<16
257 lsrs r2, #16+11
258 orrne lr, #(1<<14) @ limG1
259 orrne lr, #(1<<31)
260 lsrs r3, #16+11
261 orrne lr, #(1<<13) @ limG2
5d8e3bf8 262 orrne lr, #(1<<31)
5d8e3bf8 263 adds r2, r4, #1
264 addvcs r3, r5, #1
265 orrvs lr, #(1<<16) @ F
266 orrvs lr, #(1<<31)
267 subs r2, r4, #1
268 subvcs r3, r5, #1
269 orrvs lr, #(1<<31)
270
271 ldr r4, [r0, #4*24] @ gteMAC0
272 orrvs lr, #(1<<15)
273
274 adds r3, r4, #1
275 orrvs lr, #(1<<16) @ F
276 orrvs lr, #(1<<31)
277 subs r2, r4, #1
278 orrvs lr, #(1<<15) @ F
279 orrvs lr, #(1<<31)
280 cmp r4, #0x1000
281 orrhi lr, #(1<<12) @ limH
282
283 str lr, [r0, #4*(32+31)] @ gteFLAG
284
17ed0d69 285 pop {r4-r6,pc}
5d8e3bf8 286 .size gteRTPS_neon, .-gteRTPS_neon
287
288
289
290.global gteRTPT_neon @ r0=CP2 (d,c),
291gteRTPT_neon:
292 push {r4-r11,lr}
293
294 movw r1, #:lower16:scratch
295 movt r1, #:upper16:scratch
296 mov r12, #0
297
298 rtpx_preload
299
300 vmov.i32 d22, #0x7fffffff
301 vmov.i32 d23, #0x80000000
302 mov r3, #3 @ counter
303 mov r2, r0 @ VXYZ(0)
3040:
305 vldmia r2!, {d8} @ VXYZ(v)
306 vmov.16 d8[3], r12 @ kill unused upper vector
307
308 rtpx_mac
8cfbda97 309 vmin.s32 d22, d8 @ min gteMAC|12
310 vmax.s32 d23, d8 @ max gteMAC|12
311 subs r3, #1
4cc3050c 312 vst1.32 {d9,d10}, [r1, :128]!
8cfbda97 313 bgt 0b
314
4cc3050c 315 vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags
8cfbda97 316
317 @ - phase2 -
318 sub r1, r1, #8*2*4
319 vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY
320
321 vmov d20, d0 @ gteMAC3 v=0
322 vmin.s16 d24, d1, d3 @ | find min IR
5d8e3bf8 323 vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp)
8cfbda97 324 vmax.s16 d25, d1, d3 @ | .. also max, for flag gen
325 vsli.u64 d20, d2, #32 @ gteMAC3 v=1
326 vmov d21, d9 @ ... v=2
327
328 vmov.i32 q14, #0xffff @ 0xffff[32]
329 vmax.s32 q10, q15
330 vmov.i32 q13, #1
331 vdup.32 q11, d22[0] @ gteH/2
5d8e3bf8 332 vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v)
8cfbda97 333 vmin.s16 d24, d10 @ | find min/max IR
334 vmax.s16 d25, d10 @ |
335
5d8e3bf8 336 add r3, r0, #4*19 @ ||
337 vld1.32 d14[0], [r3] @ || gteSZ3
338
8cfbda97 339 vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)?
340 add r3, r0, #4*17
341 vst1.32 d20, [r3]! @ | writeback fSZ(v)
342 vand q11, q10, q11
343 vst1.32 d21[0], [r3] @ |
344 vmax.u32 q10, q11, q13 @ make divisor 1 if not
345 add r3, r1, #8*8
346 vstmia r3, {q12} @ min/max IR for flags
347 vcvt.f32.u32 q10, q10
348 vshl.u32 d13, d12, #16 @ | preparing gteH
349
350 @ while NEON's busy we calculate some flags on ARM
351 add r2, r1, #8*2*3
352 mov lr, #0 @ gteFLAG
353 ldmia r2, {r4-r7} @ min/max gteMAC|12
354 subs r2, r4, #1
355 orrvs lr, #(1<<31)|(1<<27)
356 subs r3, r5, #1
357 orrvs lr, #(1<<31)|(1<<26)
358 adds r2, r6, #1
359 orrvs lr, #(1<<30)
360 adds r3, r7, #1
361 orrvs lr, #(1<<29)
362 ldr r4, [r1, #0] @ gteMAC3 v=0
363 ldr r5, [r1, #8*2] @ ... v=1
364 ldr r6, [r1, #8*4] @ ... v=2
365
366 add r3, r0, #4*(32+24)
367 vld1.32 d4, [r3] @ || gteOF|XY
368 add r3, r0, #4*(32+27)
5d8e3bf8 369 vld1.32 d6, [r3] @ || gteDQ|AB
8cfbda97 370
371 @ divide
372.if 1
373 vrecpe.f32 q11, q10 @ inv
5d8e3bf8 374 vmovl.s32 q2, d4 @ || gteOF|XY [64]
375 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
8cfbda97 376 vrecps.f32 q12, q10, q11 @ step
377 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
378 vmul.f32 q11, q12, q11 @ better inv
5d8e3bf8 379 add r3, r0, #4*16
380 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
8cfbda97 381 vdup.32 q13, d13[0] @ |
382@ vrecps.f32 q12, q10, q11 @ step
383@ vmul.f32 q11, q12, q11 @ better inv
384 vmul.f32 q10, q13, q11 @ result
385.else
5d8e3bf8 386 vmovl.s32 q2, d4 @ || gteOF|XY [64]
387 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
8cfbda97 388 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
389 vdup.32 q13, d13[0] @ |
5d8e3bf8 390 add r3, r0, #4*16
391 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
8cfbda97 392
393 vpush {q0}
394 vmov q0, q10 @ to test against C code
395 vdiv.f32 s0, s26, s0
396 vdiv.f32 s1, s26, s1
397 vdiv.f32 s2, s26, s2
398 vmov q10, q0
399 vpop {q0}
400.endif
401
5d8e3bf8 402 do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<27) @ MAC3
8cfbda97 403 orr r7, r4, r5
404 add r4, r1, #8*8
405 orr r3, r7, r6
406 ldmia r4, {r7,r8,r10,r11} @ min/max IR
407
408 movs r3, r3, lsr #16
409 orrne lr, #(1<<31)
410 orrne lr, #(1<<18) @ fSZ (limD)
411
412@ vadd.f32 q10, q @ adjust for vcvt rounding mode
413 vcvt.u32.f32 q8, q10
414 vmovl.s16 q9, d1 @ expand gteIR|12 v=0
415 vmovl.s16 q10, d3 @ expand gteIR|12 v=1
416 add r6, r1, #8*10
417 vstmia r6, {q8} @ wb quotients for flags (pre-limE)
418 vqshl.u32 q8, #15
419 vmovl.s16 q11, d10 @ expand gteIR|12 v=2
420 vshr.u32 q8, #15 @ quotients (limE)
421 vdup.32 d24, d16[0]
422 vdup.32 d25, d16[1]
423 vdup.32 d26, d17[0] @ quotient (dup)
424
5d8e3bf8 425 @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
426 mov r4, #0x10000
427 cmp r7, #1<<16
428 cmnvc r10, #1<<16
8cfbda97 429 orrvs lr, #(1<<31)
430 orrvs lr, #(1<<23) @ IR2/limB2
5d8e3bf8 431 rsbs r2, r4, r7, lsl #16
432 cmnvc r4, r10, lsl #16
8cfbda97 433 orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1
5d8e3bf8 434 rsbs r2, r4, r8, lsl #16
435 cmnvc r4, r11, lsl #16
8cfbda97 436 orrvs lr, #(1<<22) @ IR3/limB3
437
438 vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0
439 vmull.s32 q10, d20, d25 @ ... v=1
440 vmull.s32 q11, d22, d26 @ ... v=2
441 vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient
442 vadd.s64 q10, q2 @ ... v=1
443 vadd.s64 q11, q2 @ ... v=2
444 vqmovn.s64 d18, q9 @ saturate to 32 v=0
445 vqmovn.s64 d19, q10 @ ... v=1
446 vqmovn.s64 d20, q11 @ ... v=2
447 vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32]
448 vmax.s32 d15, d18, d19 @ || for flags
449 vmin.s32 d14, d20
450 vmax.s32 d15, d20
451 vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1
452 vqshl.s32 d24, d20, #5 @ ... v=2
453 vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2
5d8e3bf8 454 vpmin.s32 d16, d14, d31 @ || also find min/max in pair
455 vpmax.s32 d17, d15, d31 @ ||
8cfbda97 456 vshr.s32 q11, #16+5 @ can't vqshrn because of insn
457 vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :(
8cfbda97 458 vsli.u64 d16, d17, #32 @ || pack in-pair min/max
459 vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient
460 vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1
461 vmovn.s32 d13, q12 @ 3
462 vstmia r1, {d14-d16} @ || other cacheline than quotients
463 add r3, r0, #4*12
464 vst1.32 d12, [r3]! @ writeback fS|XY v=0,1
465 vst1.32 d13[0], [r3]
466
467 vqshrn.s64 d26, q13, #12 @ | gteMAC0
468 vmovl.u16 q5, d10 @ expand gteIR|123 v=2
469
470 vmov.i32 d13, #0x1000
471 vmax.s32 d12, d26, d30
472
473 add r3, r0, #4*24
474 vst1.32 d26[0], [r3]! @ gteMAC0
475 vst1.32 d8, [r3]! @ gteMAC123 (last iteration)
476 vst1.32 d9[0], [r3]
477
478 vmin.s32 d12, d13 @ | gteIR0
479
8cfbda97 480 ldmia r6, {r4-r6} @ quotients
481 orr r4, r5
482 orr r4, r6
5d8e3bf8 483 add r3, r0, #4*8
8cfbda97 484 movs r4, r4, lsr #17
8cfbda97 485
8cfbda97 486 vst1.32 d12[0], [r3]! @ gteIR0
487 vst1.32 d10, [r3]! @ gteIR12
488 vst1.32 d11[0], [r3] @ ..3
489
17ed0d69 490 @ ~23 cycles
5d8e3bf8 491 orrne lr, #(1<<31) @ limE
492 orrne lr, #(1<<17) @ limE
8cfbda97 493 ldmia r1, {r4-r9}
17ed0d69 494 add r2, r4, #0x400<<16 @ min fSX
495 add r3, r6, #0x400<<16 @ max fSX
496 lsrs r2, #16+11
497 lsreqs r3, #16+11
498 orrne lr, #(1<<31) @ limG1
499 orrne lr, #(1<<14)
500 add r2, r5, #0x400<<16 @ min fSY
501 add r3, r7, #0x400<<16 @ max fSY
502 lsrs r2, #16+11
503 lsreqs r3, #16+11
504 orrne lr, #(1<<31) @ limG2
505 orrne lr, #(1<<13)
8cfbda97 506 adds r2, r9, #1
17ed0d69 507 orrvs lr, #(1<<16) @ F (31 already done by above)
8cfbda97 508 subs r3, r8, #1
8cfbda97 509
510 ldr r4, [r0, #4*24] @ gteMAC0
511 orrvs lr, #(1<<15)
512
513 adds r3, r4, #1
514 orrvs lr, #(1<<16)
515 orrvs lr, #(1<<31) @ F
516 subs r2, r4, #1
517 orrvs lr, #(1<<15)
518 orrvs lr, #(1<<31) @ F
519 cmp r4, #0x1000
5d8e3bf8 520 orrhi lr, #(1<<12) @ limH
8cfbda97 521
522 str lr, [r0, #4*(32+31)] @ gteFLAG
523
524 pop {r4-r11,pc}
5d8e3bf8 525 .size gteRTPT_neon, .-gteRTPT_neon
526
527
528
17ed0d69 529.global gteMVMVA_neon @ r0=CP2 (d,c), op
530gteMVMVA_neon:
531 push {r4-r5,lr}
532
533 add r12, r0, #4*32
534
535 ubfx r2, r1, #15, #2 @ v
536
537 vmov.i32 q0, #0 @ d0,d1
538 vmov.i32 q1, #0 @ d2,d3
539 vmov.i32 q2, #0 @ d4,d5
540 cmp r2, #3
541 addeq r4, r0, #4*9
542 addne r3, r0, r2, lsl #3
543 ldmeqia r4, {r3-r5}
544 ldmneia r3, {r4,r5}
545 pkhbteq r4, r3, r4, lsl #16
546 uxth r5, r5
547 vmov.32 d8[0], r4
548 vmov.32 d8[1], r5 @ VXYZ(v)
549 ubfx r3, r1, #17, #2 @ mx
550 ubfx r2, r1, #13, #2 @ cv
551 cmp r3, #3
552 beq 0f @ very rare case
553 add r3, r12, r3, lsl #5
554 vldmia r3, {d0-d2} @ MXxy/gteR* [16*9]
5550:
556 cmp r2, #3
557 add r3, r12, r2, lsl #5
558 beq 0f
559 add r3, #4*5
560 vldmia r3, {d4-d5} @ CVx/gteTR*
561
5620:
563 vmov.i32 q15, #0
564 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
565 vext.16 d1, d0, d1, #3 @ xx32 -> x321
566 vshll.s32 q3, d5, #12 @ gteTRZ/CV3
567 vshll.s32 q2, d4, #12 @ gteTR|XY/CV12
568
569 vmull.s16 q8, d0, d8
570 vmull.s16 q9, d1, d8
571 vmull.s16 q10, d2, d8
572 vpadd.s32 d16, d16, d17
573 vpadd.s32 d17, d18, d19
574 vpadd.s32 d18, d20, d21
575 vpadal.s32 q2, q8
576 vpadal.s32 q3, q9
577 tst r1, #1<<19
578 beq 0f
579 vshr.s64 q2, q2, #12
580 vshr.s64 q3, q3, #12
5810:
582 vqmovn.s64 d8, q2 @ gteMAC|12
583 vqmovn.s64 d9, q3 @ gteMAC3
584
585 tst r1, #1<<10
586 add r3, r0, #4*25
587 vqmovn.s32 d10, q4 @ gteIR|123
588 vst1.32 d8, [r3]!
589 vst1.32 d9[0], [r3] @ wb gteMAC|123
590
591 beq 0f
592 vmax.s16 d10, d31
5930:
594 vmovl.s16 q9, d10 @ expand gteIR|123
595 add r3, r0, #4*9
596 vst1.32 d18, [r3]!
597 vst1.32 d19[0], [r3]
598
599 tst r1, #1<<10 @ lm
600 mov r2, #0
601 mov lr, #0 @ gteFLAG
602 mov r12, #15
603 moveq r2, #0x8000 @ adj
604 moveq r12, #16 @ shift
605
606 add r3, r0, #4*25
607 ldmia r3, {r3-r5} @ gteMAC|123
608
609 do_mac_flags r3, r4, r5
610
611 add r3, r2
612 add r4, r2
613 add r5, r2
614 asrs r3, r12
615 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
616 asrs r4, r12
617 orrne lr, #(1<<31)
618 orrne lr, #(1<<23) @ IR2/limB2
619 asrs r5, r12
620 orrne lr, #(1<<22) @ IR3/limB3
621 str lr, [r0, #4*(32+31)] @ gteFLAG
622
623 pop {r4-r5,pc}
624 .size gteMVMVA_neon, .-gteMVMVA_neon
625
626
627
5d8e3bf8 628@ the name is misnormer, this doesn't use NEON but oh well..
629.global gteNCLIP_neon @ r0=CP2 (d,c),
630gteNCLIP_neon:
631 push {r4-r6,lr}
632
633 add r1, r0, #4*12
634 ldmia r1, {r1-r3}
635 mov r4, r1, asr #16
636 mov r5, r2, asr #16
637 mov r6, r3, asr #16
638 sub r12, r4, r5 @ 3: gteSY0 - gteSY1
639 sub r5, r5, r6 @ 1: gteSY1 - gteSY2
640 sxth r1, r1
641 smull r1, r5, r1, r5 @ RdLo, RdHi
642 sub r6, r4 @ 2: gteSY2 - gteSY0
643 sxth r2, r2
644 smlal r1, r5, r2, r6
645 mov lr, #0 @ gteFLAG
646 sxth r3, r3
647 smlal r1, r5, r3, r12
648 mov r6, #1<<31
649 orr r6, #1<<15
650 movs r2, r1, lsl #1
651 adc r5, r5
652 cmp r5, #0
653 movtgt lr, #((1<<31)|(1<<16))>>16
654 mvngt r1, #1<<31 @ maxint
655 cmn r5, #1
656 movmi r1, #1<<31 @ minint
657 orrmi lr, r6
658 str r1, [r0, #4*24]
659 str lr, [r0, #4*(32+31)] @ gteFLAG
660
661 pop {r4-r6,pc}
662 .size gteNCLIP_neon, .-gteNCLIP_neon
663
8cfbda97 664
665@ vim:filetype=armasm