@ Source: pcsx_rearmed.git / libpcsxcore / gte_neon.s
@ (repository head when this copy was exported: "gpu_unai: fix build on older toolchains")
/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */


@ Scratch buffer shared by gteRTPS_neon and gteRTPT_neon for passing
@ intermediate values from NEON to the ARM core (flag generation).
@ 8*8*2 = 128 zero-initialised bytes, aligned to 64 bytes.
.bss
.align 6 @ cacheline

scratch:
    .space      8*8*2

.text
.align 2

@ XXX: gteMAC calc shouldn't be saturating, but it is here

@ approximate gteMAC|123 flags
@ in: rr 123 as gteMAC|123, lr - gteFLAG accumulator
@ out: lr with the overflow bits or'ed in
@ trash: nothing (besides the condition flags)
@
@ "cmp rr, #1" sets V only when rr is 0x80000000 and "cmn rr, #1" only
@ when rr is 0x7fffffff, so a MAC value sitting at either end of the
@ s32 range is reported as overflowed.
.macro do_mac_flags rr1 rr2 rr3
    cmp         \rr1, #1
    orrvs       lr, #(1<<31)|(1<<27)
    cmp         \rr2, #1
    orrvs       lr, #(1<<31)|(1<<26)
    cmp         \rr3, #1
    orrvs       lr, #(1<<31)|(1<<25)
    cmn         \rr1, #1       @ same as adds ...
    orrvs       lr, #(1<<30)
    cmn         \rr2, #1
    orrvs       lr, #(1<<29)
    cmn         \rr3, #1
    orrvs       lr, #(1<<28)
.endm

@ approximate 3x gteMACn flags
@ in: rr 123 as 3 instances gteMACn, *flags, lr - gteFLAG accumulator
@     nflags - bits to set if any instance is 0x80000000
@     pflags - bits to set if any instance is 0x7fffffff
@ out: lr with the requested bits or'ed in
@ trash: nothing (besides the condition flags)
@
@ The conditional compares form a chain: once V is set by one register
@ the remaining compares are skipped, so V survives until the orrvs.
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    cmp         \rr1, #1
    cmpvc       \rr2, #1
    cmpvc       \rr3, #1
    orrvs       lr, #\nflags
    cmn         \rr1, #1       @ adds ...
    cmnvc       \rr2, #1
    cmnvc       \rr3, #1
    orrvs       lr, #\pflags
.endm

@ get gteIR|123 flags from gteMAC|123
@ in: rr 123 as gteMAC|123, lr - gteFLAG accumulator
@ out: lr with the limB bits or'ed in
@ trash: r2,r3 (and the condition flags)
@
@ (mac + 0x8000) >> 16 is non-zero exactly when mac is outside
@ -0x8000..0x7fff, i.e. when it does not fit the s16 gteIR register.
@ The plain "add" does not touch the flags, so it can be interleaved
@ between a "lsrs" and the "orrne" that consumes its result.
.macro do_irs_flags rr1 rr2 rr3
    add         r2, \rr1, #0x8000
    add         r3, \rr2, #0x8000
    lsrs        r2, #16
    orrne       lr, #(1<<31)|(1<<24) @ IR1/limB1
    lsrs        r3, #16
    add         r2, \rr3, #0x8000
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)   @ IR2/limB2
    lsrs        r2, #16
    orrne       lr, #(1<<22)   @ IR3/limB3
.endm


/*
 * RTPS/RTPT register map:
 *
 *  q | d | c code / phase 1   phase 2          scratch
 *  0   0   gteR1* [s16]       gteMAC3      =   gteMAC3  \ v=0 *
 *      1   gteR2*             gteIR1-3     =   gteIR1-3 /     *
 *  1   2   gteR3*             gteMAC3      =   gteMAC3  \ v=1
 *      3   *                  gteIR1-3     =   gteIR1-3 /
 *  2   4   gteTRX<<12 [s64]   gteOFX [s64]     gteMAC3  \ v=2
 *      5   gteTRY<<12         gteOFY [s64]     gteIR1-3 /
 *  3   6   gteTRZ<<12         gteDQA [s64]     min gteMAC|12 v=012
 *      7   0                  gteDQB [s64]     max gteMAC|12
 *  4   8   VXYZ(v)  /    gteMAC1,2 [s32]       min gteIR|123
 *      9   *        /    gteMAC3               max gteIR|123
 *  5  10   gteIR1-3 [s16]     gteIR1-3 v=2     quotients 12
 *     11   0                                   quotient 3
 *  6  12   gteH (adj. for cmp)
 *     13   gteH (float for div)
 * ...      <scratch>
 * 15  30   0
 *     31   0
 */

@ load gteR*, gteTR* and gteH (see map above), clear q15
@ in: r0 - context
@ out: d0-d2 - gteR* rows rearranged to one row per d register,
@      q2,q3 - gteTR* widened to s64 and shifted left by 12,
@      q6    - gteH widened from s16, q15 - 0
@ trash: r3 (d11 is also overwritten by the gteH load)
.macro rtpx_preload
    add         r3, r0, #4*32
    vldmia      r3, {d0-d2}    @ gteR* [16*9]
    vmov.i32    q15, #0
    add         r3, r0, #4*(32+5)
    vldmia      r3, {d4-d5}    @ gteTR*
    vext.16     d2, d1, d2, #2 @ xxx3 -> x321
    vext.16     d1, d0, d1, #3 @ xx32 -> x321
    add         r3, r0, #4*(32+26)
    vld1.32     d11[0], [r3]   @ gteH
    vshll.s32   q3, d5, #12    @ gteTRZ
    vshll.s32   q2, d4, #12    @ gteTR|XY
    vmovl.s16   q6, d11        @ gteH
.endm

@ do RTP* gteMAC* calculation
@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
@ trash: d16-d21
@
@ Each matrix row (d0-d2) is multiplied lane-wise with the vertex, the
@ four products are summed into one s64, the translation (d4-d6) is
@ added and the result is narrowed with saturation after a >>12.
.macro rtpx_mac
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpaddl.s32  q8, q8
    vpaddl.s32  q9, q9
    vpaddl.s32  q10, q10
    vadd.s64    d16, d17       @ d16=d0.16[2]*d8.16[2], as
    vadd.s64    d18, d19       @ d8[3]==0, so won't affect
    vadd.s64    d20, d21       @ QC
    vadd.s64    d16, d4
    vadd.s64    d18, d5
    vadd.s64    d20, d6
    vqshrn.s64  d8, q8, #12    @ gteMAC1
    vqshrn.s64  d18, q9, #12   @ gteMAC2
    vqshrn.s64  d9, q10, #12   @ gteMAC3
    vsli.u64    d8, d18, #32   @ gteMAC|12
    vmov.32     d9[1], r12     @ clear the unused upper lane
    vqmovn.s32  d10, q4        @ gteIR|123; losing 2 cycles?
.endm

@ void gteRTPS_neon(void *cp2_regs)
@
@ GTE RTPS for a single vertex, VXYZ(0).
@ in:  r0 = CP2 register file: 32 data words followed by 32 control words
@ out (all written to the register file at r0):
@      gteMAC1-3, gteIR1-3, gteSZ fifo (shifted, new fSZ3 appended),
@      fS|XY fifo (shifted, new entry appended), gteMAC0, gteIR0, gteFLAG
@ uses: the file-local 'scratch' buffer to hand NEON results to the ARM
@       core for flag generation
@ ABI: standard AAPCS; r4-r6 and d8-d15 are saved and restored here.
.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push        {r4-r6,lr}
    vpush       {d8-d15}       @ q4-q7 are callee-saved and used below

@   fmrx        r4, fpscr      @ vmrs? at least 40 cycle hit
    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    vldmia      r0, {d8}       @ VXYZ(0)
    rtpx_preload

@   rtpx_mac                   @ slower here, faster in RTPT?
    vmov.16     d8[3], r12     @ kill unused upper vector
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9         @ d6, d18 is slow?
    vqshrn.s64  d8, q2, #12    @ gteMAC|12
    vqshrn.s64  d9, q3, #12    @ gteMAC3

    add         r3, r0, #4*25
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]    @ wb gteMAC|123
    vqmovn.s32  d10, q4        @ gteIR|123

    add         r3, r0, #4*17  @ gteSZ*
    vldmia      r3, {q7}       @ d14,d15 gteSZ|123x
    vmov.i32    d28, #0xffff   @ 0xffff[32]
    vmax.s32    d11, d9, d31
    vshr.s32    d16, d12, #1   @ | gteH/2 (adjust for cmp)
    vmov.i32    d26, #1
    vmin.u32    d11, d28       @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16   q9, d10        @ || expand gteIR|123
    vshl.u32    d13, d12, #16  @ | preparing gteH
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]

    vsli.u64    d15, d11, #32  @ new gteSZ|0123 in q7
    vclt.u32    d16, d16, d11  @ gteH/2 < fSZ3?

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]       @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]       @ || gteDQ|AB

    vand        d11, d16
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmax.u32    d11, d26       @ make divisor 1 if not
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    add         r3, r0, #4*16  @ | gteSZ*
    vstmia      r3, {q7}       @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13      @ gteH (float for div)
    vcvt.f32.u32 d11, d11      @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32    s22, s26, s22

    vcvt.u32.f32 d11, d11      @ quotient

    @ while NEON's busy we calculate some flags on ARM
    add         r3, r0, #4*25
    mov         lr, #0         @ gteFLAG
    ldmia       r3, {r4-r6}    @ gteMAC|123

    vst1.32     d11, [r1, :64] @ wb quotient for flags (pre-limE)
    vqshl.u32   d11, #15

    do_mac_flags r4, r5, r6

    vshr.u32    d11, #15       @ quotient (limE)

    do_irs_flags r4, r5, r6

    vmlal.s32   q2, d18, d11[0] @ gteOF|XY + gteIR|12 * quotient
    add         r3, r0, #4*13
    vld1.32     d16, [r3]      @ || load fS|XY12, new 01
    vqmovn.s64  d18, q2        @ saturate to 32
    vmull.s32   q10, d6, d11[0] @ | d20 = gteDQA * quotient
    vqshl.s32   d19, d18, #5   @ 11bit precision

    ldr         r4, [r1]       @ quotient
    movs        r3, r6, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)   @ fSZ (limD)

    vst1.32     d18, [r1, :64] @ writeback fS|XY2 before limG

    vshr.s32    d18, d19, #16+5 @ can't vqshrn because of insn
    vadd.s64    d20, d7        @ | gteDQB + gteDQA * quotient
    vmovn.s32   d18, q9        @ fS|XY2 [s16]

    vqmovn.s64  d20, q10       @ | gteMAC0
    add         r3, r0, #4*12
    vst1.32     d16, [r3]!     @ writeback fS|XY01
    vst1.32     d18[0], [r3]   @ ...2
    add         r3, r0, #4*24
    vshr.s32    d21, d20, #12
    vst1.32     d20[0], [r3]   @ gteMAC0

    movs        r4, r4, lsr #17
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<17)   @ limE

    vmax.s32    d21, d31
    vmov.i32    d22, #0x1000
    vmin.s32    d21, d22
    add         r3, r0, #4*8
    vst1.16     d21[0], [r3]   @ gteIR0

    ldmia       r1, {r4,r5}    @ fS|XY2 before limG, after 11bit sat
    add         r2, r4, #0x400<<16
    add         r3, r5, #0x400<<16
    lsrs        r2, #16+11
    orrne       lr, #(1<<14)   @ limG1
    orrne       lr, #(1<<31)
    lsrs        r3, #16+11
    orrne       lr, #(1<<13)   @ limG2
    orrne       lr, #(1<<31)
    adds        r2, r4, #1
    addvcs      r3, r5, #1
    orrvs       lr, #(1<<16)   @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    subvcs      r3, r5, #1
    orrvs       lr, #(1<<31)

    ldr         r4, [r0, #4*24] @ gteMAC0
    orrvs       lr, #(1<<15)

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)   @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)   @ F
    orrvs       lr, #(1<<31)
    @ gteIR0 above is gteMAC0 >> 12 clamped to 0..0x1000, so the
    @ saturation test must look at the shifted value; negative values
    @ compare as "higher" in the unsigned test below
    mov         r3, r4, asr #12
    cmp         r3, #0x1000
    orrhi       lr, #(1<<12)   @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    vpop        {d8-d15}
    pop         {r4-r6,pc}
    .size       gteRTPS_neon, .-gteRTPS_neon



@ void gteRTPT_neon(void *cp2_regs)
@
@ GTE RTPT: the RTPS calculation for the three vertices VXYZ(0..2).
@ in:  r0 = CP2 register file: 32 data words followed by 32 control words
@ out (all written to the register file at r0):
@      gteSZ0 = old gteSZ3, gteSZ1-3 = fSZ(0..2), fS|XY(0..2),
@      gteMAC1-3 and gteIR1-3 of the last vertex, gteMAC0, gteIR0, gteFLAG
@ uses: the file-local 'scratch' buffer (per-vertex gteMAC3/gteIR, min/max
@       values and quotients) to hand NEON results to the ARM core
@ ABI: standard AAPCS; r4-r11 and d8-d15 are saved and restored here.
.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push        {r4-r11,lr}
    vpush       {d8-d15}       @ q4-q7 are callee-saved and used below

    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    rtpx_preload

    vmov.i32    d22, #0x7fffffff
    vmov.i32    d23, #0x80000000
    mov         r3, #3         @ counter
    mov         r2, r0         @ VXYZ(0)
0:
    vldmia      r2!, {d8}      @ VXYZ(v)
    vmov.16     d8[3], r12     @ kill unused upper vector

    rtpx_mac
    vmin.s32    d22, d8        @ min gteMAC|12
    vmax.s32    d23, d8        @ max gteMAC|12
    subs        r3, #1
    vst1.32     {d9,d10}, [r1, :128]!
    bgt         0b

    vst1.32     {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub         r1, r1, #8*2*4
    vldmia      r1, {d0-d3}    @ note: d4,d5 is for gteOF|XY

    vmov        d20, d0        @ gteMAC3 v=0
    vmin.s16    d24, d1, d3    @ | find min IR
    vshr.s32    d22, d12, #1   @ || gteH/2 (adjust for cmp)
    vmax.s16    d25, d1, d3    @ | .. also max, for flag gen
    vsli.u64    d20, d2, #32   @ gteMAC3 v=1
    vmov        d21, d9        @ ... v=2

    vmov.i32    q14, #0xffff   @ 0xffff[32]
    vmax.s32    q10, q15
    vmov.i32    q13, #1
    vdup.32     q11, d22[0]    @ gteH/2
    vmin.u32    q10, q14       @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16    d24, d10       @ | find min/max IR
    vmax.s16    d25, d10       @ |

    add         r3, r0, #4*19  @ ||
    vld1.32     d14[0], [r3]   @ || gteSZ3

    vclt.u32    q11, q11, q10  @ gteH/2 < fSZ(v)?
    add         r3, r0, #4*17
    vst1.32     d20, [r3]!     @ | writeback fSZ(v)
    vand        q11, q10, q11
    vst1.32     d21[0], [r3]   @ |
    vmax.u32    q10, q11, q13  @ make divisor 1 if not
    add         r3, r1, #8*8
    vstmia      r3, {q12}      @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32    d13, d12, #16  @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add         r2, r1, #8*2*3
    mov         lr, #0         @ gteFLAG
    ldmia       r2, {r4-r7}    @ min/max gteMAC|12
    subs        r2, r4, #1
    orrvs       lr, #(1<<31)|(1<<27)
    subs        r3, r5, #1
    orrvs       lr, #(1<<31)|(1<<26)
    adds        r2, r6, #1
    orrvs       lr, #(1<<30)
    adds        r3, r7, #1
    orrvs       lr, #(1<<29)
    ldr         r4, [r1, #0]   @ gteMAC3 v=0
    ldr         r5, [r1, #8*2] @ ... v=1
    ldr         r6, [r1, #8*4] @ ... v=2

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]       @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]       @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32  q11, q10       @ inv
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    vrecps.f32  q12, q10, q11  @ step
    vcvt.f32.u32 d13, d13      @ | gteH (float for div)
    vmov.f32    q8, #0.5       @ |||
    vmul.f32    q11, q12, q11  @ better inv
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]   @ gteSZ0 = gteSZ3
    vdup.32     q13, d13[0]    @ |
@   vrecps.f32  q12, q10, q11  @ step
@   vmul.f32    q11, q12, q11  @ better inv
    vmul.f32    q10, q13, q11  @ result
.else
    vmov.f32    q8, #0.5       @ |||
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13      @ | gteH (float for div)
    vdup.32     q13, d13[0]    @ |
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]   @ gteSZ0 = gteSZ3

    vpush       {q0}
    vmov        q0, q10        @ to test against C code
    vdiv.f32    s0, s26, s0
    vdiv.f32    s1, s26, s1
    vdiv.f32    s2, s26, s2
    vmov        q10, q0
    vpop        {q0}
.endif

    @ bit 25 = gteMAC3 negative overflow, bit 28 = gteMAC3 positive
    @ overflow, the same bits do_mac_flags uses for its third register
    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
    orr         r7, r4, r5
    add         r4, r1, #8*8
    orr         r3, r7, r6
    ldmia       r4, {r7,r8,r10,r11} @ min/max IR

    movs        r3, r3, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)   @ fSZ (limD)

    vadd.f32    q10, q8        @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16   q9, d1         @ expand gteIR|12 v=0
    vmovl.s16   q10, d3        @ expand gteIR|12 v=1
    add         r6, r1, #8*10
    vstmia      r6, {q8}       @ wb quotients for flags (pre-limE)
    vqshl.u32   q8, #15
    vmovl.s16   q11, d10       @ expand gteIR|12 v=2
    vshr.u32    q8, #15        @ quotients (limE)
    vdup.32     d24, d16[0]
    vdup.32     d25, d16[1]
    vdup.32     d26, d17[0]    @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    mov         r4, #0x10000
    cmp         r7, #1<<16
    cmnvc       r10, #1<<16
    orrvs       lr, #(1<<31)
    orrvs       lr, #(1<<23)   @ IR2/limB2
    rsbs        r2, r4, r7, lsl #16
    cmnvc       r4, r10, lsl #16
    orrvs       lr, #(1<<31)|(1<<24) @ IR1/limB1
    rsbs        r2, r4, r8, lsl #16
    cmnvc       r4, r11, lsl #16
    orrvs       lr, #(1<<22)   @ IR3/limB3

    vmull.s32   q9, d18, d24   @ gteIR|12 * quotient v=0
    vmull.s32   q10, d20, d25  @ ... v=1
    vmull.s32   q11, d22, d26  @ ... v=2
    vadd.s64    q9, q2         @ gteOF|XY + gteIR|12 * quotient
    vadd.s64    q10, q2        @ ... v=1
    vadd.s64    q11, q2        @ ... v=2
    vqmovn.s64  d18, q9        @ saturate to 32 v=0
    vqmovn.s64  d19, q10       @ ... v=1
    vqmovn.s64  d20, q11       @ ... v=2
    vmin.s32    d14, d18, d19  @ || find min/max fS|XY(v) [32]
    vmax.s32    d15, d18, d19  @ || for flags
    vmin.s32    d14, d20
    vmax.s32    d15, d20
    vqshl.s32   q11, q9, #5    @ 11bit precision, v=0,1
    vqshl.s32   d24, d20, #5   @ ... v=2
    vmull.s32   q13, d6, d17   @ | gteDQA * quotient v=2
    vpmin.s32   d16, d14, d31  @ || also find min/max in pair
    vpmax.s32   d17, d15, d31  @ ||
    vshr.s32    q11, #16+5     @ can't vqshrn because of insn
    vshr.s32    d24, #16+5     @ encoding doesn't allow 21 :(
    vsli.u64    d16, d17, #32  @ || pack in-pair min/max
    vadd.s64    d26, d7        @ | gteDQB + gteDQA * quotient
    vmovn.s32   d12, q11       @ fS|XY(v) [s16] v=0,1
    vmovn.s32   d13, q12       @ 3
    vstmia      r1, {d14-d16}  @ || other cacheline than quotients
    add         r3, r0, #4*12
    vst1.32     d12, [r3]!     @ writeback fS|XY v=0,1
    vst1.32     d13[0], [r3]

    vqmovn.s64  d26, q13       @ | gteMAC0
    @ NOTE(review): zero-extends here while gteRTPS_neon sign-extends
    @ gteIR|123 before the 32-bit store - confirm which one is intended
    vmovl.u16   q5, d10        @ expand gteIR|123 v=2

    vmov.i32    d13, #0x1000
    vshr.s32    d12, d26, #12

    add         r3, r0, #4*24
    vst1.32     d26[0], [r3]!  @ gteMAC0
    vmax.s32    d12, d30
    vst1.32     d8, [r3]!      @ gteMAC123 (last iteration)
    vst1.32     d9[0], [r3]

    vmin.s32    d12, d13       @ | gteIR0

    ldmia       r6, {r4-r6}    @ quotients
    orr         r4, r5
    orr         r4, r6
    add         r3, r0, #4*8
    movs        r4, r4, lsr #17

    vst1.32     d12[0], [r3]!  @ gteIR0
    vst1.32     d10, [r3]!     @ gteIR12
    vst1.32     d11[0], [r3]   @ ..3

    @ ~23 cycles
    orrne       lr, #(1<<31)   @ limE
    orrne       lr, #(1<<17)   @ limE
    ldmia       r1, {r4-r9}
    add         r2, r4, #0x400<<16 @ min fSX
    add         r3, r6, #0x400<<16 @ max fSX
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)   @ limG1
    orrne       lr, #(1<<14)
    add         r2, r5, #0x400<<16 @ min fSY
    add         r3, r7, #0x400<<16 @ max fSY
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)   @ limG2
    orrne       lr, #(1<<13)
    adds        r2, r9, #1
    orrvs       lr, #(1<<16)   @ F (31 already done by above)
    subs        r3, r8, #1

    ldr         r4, [r0, #4*24] @ gteMAC0
    orrvs       lr, #(1<<15)

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)
    orrvs       lr, #(1<<31)   @ F
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)
    orrvs       lr, #(1<<31)   @ F
    @ gteIR0 above is gteMAC0 >> 12 clamped to 0..0x1000, so the
    @ saturation test must look at the shifted value; negative values
    @ compare as "higher" in the unsigned test below
    mov         r3, r4, asr #12
    cmp         r3, #0x1000
    orrhi       lr, #(1<<12)   @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    vpop        {d8-d15}
    pop         {r4-r11,pc}
    .size       gteRTPT_neon, .-gteRTPT_neon



@ Matrix * vector + translation part of MVMVA: computes gteMAC|123 and
@ gteIR|123 and writes both back to the register file. gteFLAG is not
@ written here; see gteMACtoIR_flags_neon.
@
@ note: non-std calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r1 = op
@      bit 19 set: the 64-bit sums are shifted right by 12
@      bit 10 set: gteIR|123 is clamped to be non-negative
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
@ trash: r3, r5 (upper half cleared), condition flags,
@        d0-d2, q2-q5 (d4-d10), q8-q10, q15
@ NOTE(review): d8-d10 are callee-saved under AAPCS and are not saved
@ here - confirm that every caller tolerates this.
.global gteMVMVA_part_neon
gteMVMVA_part_neon:
    uxth        r5, r5         @ zero the unused 4th vector lane
    vmov.32     d8[0], r4
    vmov.32     d8[1], r5      @ VXYZ(v)
    vldmia      r6, {d0-d2}    @ MXxy/gteR* [16*9]
    vldmia      r7, {d4-d5}    @ CVx/gteTR*

    vmov.i32    q15, #0
    vext.16     d2, d1, d2, #2 @ xxx3 -> x321
    vext.16     d1, d0, d1, #3 @ xx32 -> x321
    vshll.s32   q3, d5, #12    @ gteTRZ/CV3
    vshll.s32   q2, d4, #12    @ gteTR|XY/CV12

    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9
    tst         r1, #1<<19
    beq         0f
    vshr.s64    q2, q2, #12
    vshr.s64    q3, q3, #12
0:
    vqmovn.s64  d8, q2         @ gteMAC|12
    vqmovn.s64  d9, q3         @ gteMAC3

    tst         r1, #1<<10     @ consumed by the beq below; nothing in
                               @ between changes the ARM flags
    add         r3, r0, #4*25
    vqmovn.s32  d10, q4        @ gteIR|123
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]    @ wb gteMAC|123

    beq         0f
    vmax.s16    d10, d31
0:
    vmovl.s16   q9, d10        @ expand gteIR|123
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]
    bx          lr
    .size       gteMVMVA_part_neon, .-gteMVMVA_part_neon


@ get flags after gteMVMVA_part_neon operation
@
@ Rebuilds gteFLAG from the gteMAC|123 values stored in the register
@ file and overwrites gteFLAG with the result.
@ in:  r0 = CP2 (d,c)
@      r1 = lm; zero selects the -0x8000..0x7fff gteIR range,
@           non-zero the 0..0x7fff range
@ trash: r2, r3, r12 (r4, r5 are saved and restored)
.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm
gteMACtoIR_flags_neon:
    push        {r4,r5,lr}
    tst         r1, r1         @ lm
    mov         lr, #0         @ gteFLAG
    mov         r2, #0
    mov         r12, #15
    moveq       r2, #0x8000    @ adj
    moveq       r12, #16       @ shift

    add         r3, r0, #4*25
    ldmia       r3, {r3-r5}    @ gteMAC|123

    do_mac_flags r3, r4, r5

    @ (mac + adj) >> shift is non-zero when mac is out of the IR range
    add         r3, r2
    add         r4, r2
    add         r5, r2
    asrs        r3, r12
    orrne       lr, #(1<<31)|(1<<24) @ IR1/limB1
    asrs        r4, r12
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)   @ IR2/limB2
    asrs        r5, r12
    orrne       lr, #(1<<22)   @ IR3/limB3
    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4,r5,pc}
    .size       gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon



@ vim:filetype=armasm