get rid of old memhandlers
[pcsx_rearmed.git] / libpcsxcore / gte_neon.s
... / ...
CommitLineData
1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
4 * This work is licensed under the terms of GNU GPL version 2 or later.
5 * See the COPYING file in the top-level directory.
6 */
7
8
.bss
.align 6                            @ 2^6 = 64 byte boundary (cacheline)

@ Zero-initialised work area, 8*8*2 = 128 bytes.
@ gteRTPS_neon / gteRTPT_neon use it to hand intermediate values
@ from the NEON side over to the ARM side for flag generation.
scratch:
    .space      8*8*2

.text
.align 2
19
20@ XXX: gteMAC calc shouldn't be saturating, but it is here
21
@ Approximate the gteMAC1..3 overflow bits of gteFLAG.
@ The test only fires when a value sits exactly on an s32 limit,
@ i.e. when the (saturating) MAC calculation has clamped it.
@ in:    rr1, rr2, rr3 - registers holding gteMAC1, gteMAC2, gteMAC3
@ inout: lr            - gteFLAG accumulator, bits are only ORed in
@ trash: nothing (condition flags are overwritten)
.macro do_mac_flags rr1 rr2 rr3
    @ x - 1 sets V only for x == 0x80000000 (negative limit)
    cmp         \rr1, #1
    orrvs       lr, lr, #(1<<31)|(1<<27)
    cmp         \rr2, #1
    orrvs       lr, lr, #(1<<31)|(1<<26)
    cmp         \rr3, #1
    orrvs       lr, lr, #(1<<31)|(1<<25)
    @ x + 1 sets V only for x == 0x7fffffff (positive limit);
    @ cmn gives the same flags as "adds" without needing a destination
    cmn         \rr1, #1
    orrvs       lr, lr, #(1<<30)
    cmn         \rr2, #1
    orrvs       lr, lr, #(1<<29)
    cmn         \rr3, #1
    orrvs       lr, lr, #(1<<28)
.endm
39
@ Approximate the overflow flags for three instances of one gteMACn.
@ nflags is ORed into lr if any of the three values equals 0x80000000,
@ pflags is ORed into lr if any of the three values equals 0x7fffffff.
@ in:    rr1, rr2, rr3 - three instances of the same gteMACn
@        nflags, pflags - immediate flag masks (negative / positive limit)
@ inout: lr            - gteFLAG accumulator
@ trash: nothing (condition flags are overwritten)
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    @ chain of "x - 1": the next compare only runs while V is still clear,
    @ so V ends up set as soon as one value hits the negative limit
    cmp         \rr1, #1
    cmpvc       \rr2, #1
    cmpvc       \rr3, #1
    orrvs       lr, lr, #\nflags
    @ same chain with "x + 1" for the positive limit
    cmn         \rr1, #1
    cmnvc       \rr2, #1
    cmnvc       \rr3, #1
    orrvs       lr, lr, #\pflags
.endm
53
@ Derive the gteIR1..3 saturation flags from gteMAC1..3.
@ A value fits into s16 exactly when (x + 0x8000) >> 16 == 0, so a
@ non-zero result of that shift means the matching IR was clamped.
@ in:    rr1, rr2, rr3 - registers holding gteMAC1, gteMAC2, gteMAC3
@ inout: lr            - gteFLAG accumulator
@ trash: r2, r3 (and the condition flags)
.macro do_irs_flags rr1 rr2 rr3
    add         r2, \rr1, #0x8000
    add         r3, \rr2, #0x8000
    movs        r2, r2, lsr #16
    orrne       lr, lr, #(1<<31)|(1<<24)    @ IR1/limB1
    movs        r3, r3, lsr #16
    add         r2, \rr3, #0x8000           @ add leaves Z from r3 intact
    orrne       lr, lr, #(1<<31)            @ two ORRs: bits 31 and 23 do
    orrne       lr, lr, #(1<<23)            @ not fit one immediate; IR2/limB2
    movs        r2, r2, lsr #16
    orrne       lr, lr, #(1<<22)            @ IR3/limB3
.endm
69
70
71/*
72 * RTPS/RTPT register map:
73 *
74 * q | d | c code / phase 1 phase 2 scratch
75 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
76 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
77 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
78 * 3 * gteIR1-3 = gteIR1-3 /
79 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
80 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
81 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
82 * 7 0 gteDQB [s64] max gteMAC|12
83 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
84 * 9 * / gteMAC3 max gteIR|123
85 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
86 * 11 0 quotient 3
87 * 6 12 gteH (adj. for cmp)
88 * 13 gteH (float for div)
89 * ... <scratch>
90 * 15 30 0
91 * 31 0
92 */
93
@ Load gteR*, gteTR* and gteH into the layout shown in the register map
@ above and zero q15.
@ in:    r0 - context (CP2 data registers, control registers at +4*32)
@ out:   d0-d2  - the three gteR rows, each realigned to lanes 0..2
@        q2     - gteTRX<<12, gteTRY<<12 as s64
@        d6     - gteTRZ<<12 as s64 (d7 receives the word after gteTRZ)
@        d12[0] - gteH widened from its low 16 bits
@        q15    - 0
@ trash: r3
.macro rtpx_preload
    add         r3, r0, #4*32
    vldmia      r3, {d0-d2}             @ gteR* [16*9]
    vmov.i32    q15, #0
    add         r3, r0, #4*(32+5)
    vldmia      r3, {d4-d5}             @ gteTR*
    @ the 9 packed s16 matrix elements are shifted so that every row
    @ starts at lane 0 of its own d register; d2 must be fixed first
    @ because it still needs the unmodified d1
    vext.16     d2, d1, d2, #2          @ xxx3 -> x321
    vext.16     d1, d0, d1, #3          @ xx32 -> x321
    add         r3, r0, #4*(32+26)
    vld1.32     {d11[0]}, [r3]          @ gteH
    vshll.s32   q3, d5, #12             @ gteTRZ (d5 is consumed before q2
    vshll.s32   q2, d4, #12             @ overwrites it); gteTR|XY
    vmovl.s16   q6, d11                 @ gteH
.endm
111
@ RTP* gteMAC* calculation: MACn = (TRn<<12 + Rn . V) >> 12, saturated
@ to s32, followed by saturation of the result to s16 for gteIR*.
@ in:    d0-d2, d4-d6 - gteR*, gteTR* as set up by rtpx_preload
@        d8           - VXYZ with lane 3 cleared by the caller
@        r12          - 0
@ out:   d8  - gteMAC1, gteMAC2
@        d9  - gteMAC3, 0
@        d10 - gteIR1, gteIR2, gteIR3, 0 [s16]
@ trash: d16-d21
.macro rtpx_mac
    vmull.s16   q8,  d0, d8             @ row 1 products [s32 x4]
    vmull.s16   q9,  d1, d8             @ row 2
    vmull.s16   q10, d2, d8             @ row 3
    vpaddl.s32  q8,  q8                 @ pairwise add, widen to s64
    vpaddl.s32  q9,  q9
    vpaddl.s32  q10, q10
    @ the upper half only carries lane 2 * lane 2, because d8 lane 3 is 0
    vadd.s64    d16, d16, d17
    vadd.s64    d18, d18, d19
    vadd.s64    d20, d20, d21
    vadd.s64    d16, d16, d4            @ + gteTRX<<12
    vadd.s64    d18, d18, d5            @ + gteTRY<<12
    vadd.s64    d20, d20, d6            @ + gteTRZ<<12
    vqshrn.s64  d8,  q8,  #12           @ gteMAC1
    vqshrn.s64  d18, q9,  #12           @ gteMAC2
    vqshrn.s64  d9,  q10, #12           @ gteMAC3
    vsli.u64    d8,  d18, #32           @ gteMAC|12
    vmov.32     d9[1], r12              @ clear the lane above gteMAC3
    vqmovn.s32  d10, q4                 @ gteIR|123; losing 2 cycles?
.endm
136
@ gteRTPS_neon: RTPS for VXYZ(0), using the register map above.
@ in:     r0 - CP2 context (data registers, control registers at +4*32)
@ writes: gteMAC1-3 (+4*25), gteIR1-3 (+4*9), gteSZ0-3 (+4*16),
@         fSXY0-2 (+4*12), gteMAC0 (+4*24), gteIR0 (+4*8, low 16 bits),
@         gteFLAG (+4*(32+31)), and the first 8 bytes of 'scratch'
@ trash:  r1-r3, r12, condition flags, q0-q3, q8-q15
@ keeps:  r4-r6 and d8-d15; the latter are callee-saved under AAPCS,
@         so they are saved/restored here although the code uses them
.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push        {r4-r6,lr}
    vpush       {d8-d15}                @ q4-q7 are used as work registers

@    fmrx        r4, fpscr               @ vmrs? at least 40 cycle hit
    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    vldmia      r0, {d8}                @ VXYZ(0)
    rtpx_preload

@    rtpx_mac                            @ slower here, faster in RTPT?
    vmov.16     d8[3], r12              @ kill unused upper vector
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9                  @ d6, d18 is slow?
    vqshrn.s64  d8, q2, #12             @ gteMAC|12
    vqshrn.s64  d9, q3, #12             @ gteMAC3

    add         r3, r0, #4*25
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]             @ wb gteMAC|123
    vqmovn.s32  d10, q4                 @ gteIR|123

    add         r3, r0, #4*17           @ gteSZ*
    vldmia      r3, {q7}                @ d14,d15 gteSZ|123x
    vmov.i32    d28, #0xffff            @ 0xffff[32]
    vmax.s32    d11, d9, d31
    vshr.s32    d16, d12, #1            @ | gteH/2 (adjust for cmp)
    vmov.i32    d26, #1
    vmin.u32    d11, d28                @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16   q9, d10                 @ || expand gteIR|123
    vshl.u32    d13, d12, #16           @ | preparing gteH
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]

    vsli.u64    d15, d11, #32           @ new gteSZ|0123 in q7
    vclt.u32    d16, d16, d11           @ gteH/2 < fSZ3?

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]                @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]                @ || gteDQ|AB

    vand        d11, d16
    vmovl.s32   q2, d4                  @ || gteOF|XY [64]
    vmax.u32    d11, d26                @ make divisor 1 if not
    vmovl.s32   q3, d6                  @ || gteDQ|AB [64]
    add         r3, r0, #4*16           @ | gteSZ*
    vstmia      r3, {q7}                @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13               @ gteH (float for div)
    vcvt.f32.u32 d11, d11               @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32    s22, s26, s22

    vcvt.u32.f32 d11, d11               @ quotient

    @ while NEON's busy we calculate some flags on ARM
    add         r3, r0, #4*25
    mov         lr, #0                  @ gteFLAG
    ldmia       r3, {r4-r6}             @ gteMAC|123

    vst1.32     d11, [r1, :64]          @ wb quotient for flags (pre-limE)
    vqshl.u32   d11, #15

    do_mac_flags r4, r5, r6

    vshr.u32    d11, #15                @ quotient (limE)

    do_irs_flags r4, r5, r6

    vmlal.s32   q2, d18, d11[0]         @ gteOF|XY + gteIR|12 * quotient
    add         r3, r0, #4*13
    vld1.32     d16, [r3]               @ || load fS|XY12, new 01
    vqmovn.s64  d18, q2                 @ saturate to 32
    vmull.s32   q10, d6, d11[0]         @ | d20 = gteDQA * quotient
    vqshl.s32   d19, d18, #5            @ 11bit precision

    ldr         r4, [r1]                @ quotient
    movs        r3, r6, lsr #16         @ gteMAC3 outside 0..0xffff?
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)            @ fSZ (limD)

    vst1.32     d18, [r1, :64]          @ writeback fS|XY2 before limG

    vshr.s32    d18, d19, #16+5         @ can't vqshrn because of insn
    vadd.s64    d20, d7                 @ | gteDQB + gteDQA * quotient
    vmovn.s32   d18, q9                 @ fS|XY2 [s16]

    vqmovn.s64  d20, q10                @ | gteMAC0
    add         r3, r0, #4*12
    vst1.32     d16, [r3]!              @ writeback fS|XY01
    vst1.32     d18[0], [r3]            @ ...2
    add         r3, r0, #4*24
    vshr.s32    d21, d20, #12
    vst1.32     d20[0], [r3]            @ gteMAC0

    movs        r4, r4, lsr #17         @ pre-limE quotient > 0x1ffff?
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<17)            @ limE

    vmax.s32    d21, d31
    vmov.i32    d22, #0x1000
    vmin.s32    d21, d22                @ clamp to 0..0x1000
    add         r3, r0, #4*8
    vst1.16     d21[0], [r3]            @ gteIR0

    ldmia       r1, {r4,r5}             @ fS|XY2 before limG, after 11bit sat
    add         r2, r4, #0x400<<16
    add         r3, r5, #0x400<<16
    lsrs        r2, #16+11
    orrne       lr, #(1<<14)            @ limG1
    orrne       lr, #(1<<31)
    lsrs        r3, #16+11
    orrne       lr, #(1<<13)            @ limG2
    orrne       lr, #(1<<31)
    adds        r2, r4, #1              @ either value at the s32 maximum?
    addvcs      r3, r5, #1
    orrvs       lr, #(1<<16)            @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1              @ either value at the s32 minimum?
    subvcs      r3, r5, #1
    orrvs       lr, #(1<<31)

    ldr         r4, [r0, #4*24]         @ gteMAC0 (ldr keeps V intact)
    orrvs       lr, #(1<<15)

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)            @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)            @ F
    orrvs       lr, #(1<<31)
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)            @ limH

    str         lr, [r0, #4*(32+31)]    @ gteFLAG

    vpop        {d8-d15}
    pop         {r4-r6,pc}
    .size       gteRTPS_neon, .-gteRTPS_neon
287
288
289
@ gteRTPT_neon: RTPT, i.e. the RTP calculation for VXYZ(0..2), using
@ the register map above. Phase 1 runs rtpx_mac once per vector and
@ parks the per-vector results in 'scratch'; phase 2 does the three
@ divisions, the screen coordinates and the flags.
@ in:     r0 - CP2 context (data registers, control registers at +4*32)
@ writes: gteMAC0-3 (+4*24), gteIR0-3 (+4*8), gteSZ0-3 (+4*16),
@         fSXY0-2 (+4*12), gteFLAG (+4*(32+31)), and 'scratch'
@ trash:  r1-r3, r12, condition flags, q0-q3, q8-q15
@ keeps:  r4-r11 and d8-d15; the latter are callee-saved under AAPCS,
@         so they are saved/restored here although the code uses them
.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push        {r4-r11,lr}
    vpush       {d8-d15}                @ q4-q7 are used as work registers

    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    rtpx_preload

    vmov.i32    d22, #0x7fffffff
    vmov.i32    d23, #0x80000000
    mov         r3, #3                  @ counter
    mov         r2, r0                  @ VXYZ(0)
0:
    vldmia      r2!, {d8}               @ VXYZ(v)
    vmov.16     d8[3], r12              @ kill unused upper vector

    rtpx_mac
    vmin.s32    d22, d8                 @ min gteMAC|12
    vmax.s32    d23, d8                 @ max gteMAC|12
    subs        r3, #1
    vst1.32     {d9,d10}, [r1, :128]!
    bgt         0b

    vst1.32     {d22,d23}, [r1, :128]!  @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub         r1, r1, #8*2*4          @ back to the start of scratch
    vldmia      r1, {d0-d3}             @ note: d4,d5 is for gteOF|XY

    vmov        d20, d0                 @ gteMAC3 v=0
    vmin.s16    d24, d1, d3             @ | find min IR
    vshr.s32    d22, d12, #1            @ || gteH/2 (adjust for cmp)
    vmax.s16    d25, d1, d3             @ | .. also max, for flag gen
    vsli.u64    d20, d2, #32            @ gteMAC3 v=1
    vmov        d21, d9                 @ ... v=2

    vmov.i32    q14, #0xffff            @ 0xffff[32]
    vmax.s32    q10, q15
    vmov.i32    q13, #1
    vdup.32     q11, d22[0]             @ gteH/2
    vmin.u32    q10, q14                @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16    d24, d10                @ | find min/max IR
    vmax.s16    d25, d10                @ |

    add         r3, r0, #4*19           @ ||
    vld1.32     d14[0], [r3]            @ || gteSZ3

    vclt.u32    q11, q11, q10           @ gteH/2 < fSZ(v)?
    add         r3, r0, #4*17
    vst1.32     d20, [r3]!              @ | writeback fSZ(v)
    vand        q11, q10, q11
    vst1.32     d21[0], [r3]            @ |
    vmax.u32    q10, q11, q13           @ make divisor 1 if not
    add         r3, r1, #8*8
    vstmia      r3, {q12}               @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32    d13, d12, #16           @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add         r2, r1, #8*2*3
    mov         lr, #0                  @ gteFLAG
    ldmia       r2, {r4-r7}             @ min/max gteMAC|12
    subs        r2, r4, #1
    orrvs       lr, #(1<<31)|(1<<27)
    subs        r3, r5, #1
    orrvs       lr, #(1<<31)|(1<<26)
    adds        r2, r6, #1
    orrvs       lr, #(1<<30)
    adds        r3, r7, #1
    orrvs       lr, #(1<<29)
    ldr         r4, [r1, #0]            @ gteMAC3 v=0
    ldr         r5, [r1, #8*2]          @ ... v=1
    ldr         r6, [r1, #8*4]          @ ... v=2

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]                @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]                @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32  q11, q10                @ inv
    vmovl.s32   q2, d4                  @ || gteOF|XY [64]
    vmovl.s32   q3, d6                  @ || gteDQ|AB [64]
    vrecps.f32  q12, q10, q11           @ step
    vcvt.f32.u32 d13, d13               @ | gteH (float for div)
    vmul.f32    q11, q12, q11           @ better inv
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]            @ gteSZ0 = gteSZ3
    vdup.32     q13, d13[0]             @ |
@    vrecps.f32  q12, q10, q11           @ step
@    vmul.f32    q11, q12, q11           @ better inv
    vmul.f32    q10, q13, q11           @ result
.else
    vmovl.s32   q2, d4                  @ || gteOF|XY [64]
    vmovl.s32   q3, d6                  @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13               @ | gteH (float for div)
    vdup.32     q13, d13[0]             @ |
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]            @ gteSZ0 = gteSZ3

    vpush       {q0}
    vmov        q0, q10                 @ to test against C code
    vdiv.f32    s0, s26, s0
    vdiv.f32    s1, s26, s1
    vdiv.f32    s2, s26, s2
    vmov        q10, q0
    vpop        {q0}
.endif

    @ positive limit of MAC3 is reported in bit 28, the same bit
    @ do_mac_flags uses for its third register (bit 27 belongs to the
    @ negative limit of MAC1)
    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
    orr         r7, r4, r5
    add         r4, r1, #8*8
    orr         r3, r7, r6
    ldmia       r4, {r7,r8,r10,r11}     @ min/max IR

    movs        r3, r3, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)            @ fSZ (limD)

@    vadd.f32     q10, q                 @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16   q9, d1                  @ expand gteIR|12 v=0
    vmovl.s16   q10, d3                 @ expand gteIR|12 v=1
    add         r6, r1, #8*10
    vstmia      r6, {q8}                @ wb quotients for flags (pre-limE)
    vqshl.u32   q8, #15
    vmovl.s16   q11, d10                @ expand gteIR|12 v=2
    vshr.u32    q8, #15                 @ quotients (limE)
    vdup.32     d24, d16[0]
    vdup.32     d25, d16[1]
    vdup.32     d26, d17[0]             @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    mov         r4, #0x10000
    cmp         r7, #1<<16
    cmnvc       r10, #1<<16
    orrvs       lr, #(1<<31)
    orrvs       lr, #(1<<23)            @ IR2/limB2
    rsbs        r2, r4, r7, lsl #16
    cmnvc       r4, r10, lsl #16
    orrvs       lr, #(1<<31)|(1<<24)    @ IR1/limB1
    rsbs        r2, r4, r8, lsl #16
    cmnvc       r4, r11, lsl #16
    orrvs       lr, #(1<<22)            @ IR3/limB3

    vmull.s32   q9, d18, d24            @ gteIR|12 * quotient v=0
    vmull.s32   q10, d20, d25           @ ... v=1
    vmull.s32   q11, d22, d26           @ ... v=2
    vadd.s64    q9, q2                  @ gteOF|XY + gteIR|12 * quotient
    vadd.s64    q10, q2                 @ ... v=1
    vadd.s64    q11, q2                 @ ... v=2
    vqmovn.s64  d18, q9                 @ saturate to 32 v=0
    vqmovn.s64  d19, q10                @ ... v=1
    vqmovn.s64  d20, q11                @ ... v=2
    vmin.s32    d14, d18, d19           @ || find min/max fS|XY(v) [32]
    vmax.s32    d15, d18, d19           @ || for flags
    vmin.s32    d14, d20
    vmax.s32    d15, d20
    vqshl.s32   q11, q9, #5             @ 11bit precision, v=0,1
    vqshl.s32   d24, d20, #5            @ ... v=2
    vmull.s32   q13, d6, d17            @ | gteDQA * quotient v=2
    vpmin.s32   d16, d14, d31           @ || also find min/max in pair
    vpmax.s32   d17, d15, d31           @ ||
    vshr.s32    q11, #16+5              @ can't vqshrn because of insn
    vshr.s32    d24, #16+5              @ encoding doesn't allow 21 :(
    vsli.u64    d16, d17, #32           @ || pack in-pair min/max
    vadd.s64    d26, d7                 @ | gteDQB + gteDQA * quotient
    vmovn.s32   d12, q11                @ fS|XY(v) [s16] v=0,1
    vmovn.s32   d13, q12                @ 3
    vstmia      r1, {d14-d16}           @ || other cacheline than quotients
    add         r3, r0, #4*12
    vst1.32     d12, [r3]!              @ writeback fS|XY v=0,1
    vst1.32     d13[0], [r3]

    vqmovn.s64  d26, q13                @ | gteMAC0
    @ sign-extend, as gteRTPS_neon and gteMVMVA_neon do, so that a
    @ negative IR is written back as a proper 32-bit value
    vmovl.s16   q5, d10                 @ expand gteIR|123 v=2

    vmov.i32    d13, #0x1000
    vshr.s32    d12, d26, #12

    add         r3, r0, #4*24
    vst1.32     d26[0], [r3]!           @ gteMAC0
    vmax.s32    d12, d30
    vst1.32     d8, [r3]!               @ gteMAC123 (last iteration)
    vst1.32     d9[0], [r3]

    vmin.s32    d12, d13                @ | gteIR0

    ldmia       r6, {r4-r6}             @ quotients
    orr         r4, r5
    orr         r4, r6
    add         r3, r0, #4*8
    movs        r4, r4, lsr #17         @ any pre-limE quotient > 0x1ffff?

    vst1.32     d12[0], [r3]!           @ gteIR0
    vst1.32     d10, [r3]!              @ gteIR12
    vst1.32     d11[0], [r3]            @ ..3

    @ ~23 cycles
    orrne       lr, #(1<<31)            @ limE
    orrne       lr, #(1<<17)            @ limE
    ldmia       r1, {r4-r9}
    add         r2, r4, #0x400<<16      @ min fSX
    add         r3, r6, #0x400<<16      @ max fSX
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)            @ limG1
    orrne       lr, #(1<<14)
    add         r2, r5, #0x400<<16      @ min fSY
    add         r3, r7, #0x400<<16      @ max fSY
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)            @ limG2
    orrne       lr, #(1<<13)
    adds        r2, r9, #1
    orrvs       lr, #(1<<16)            @ F (31 already done by above)
    subs        r3, r8, #1

    ldr         r4, [r0, #4*24]         @ gteMAC0 (ldr keeps V intact)
    orrvs       lr, #(1<<15)

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)
    orrvs       lr, #(1<<31)            @ F
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)
    orrvs       lr, #(1<<31)            @ F
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)            @ limH

    str         lr, [r0, #4*(32+31)]    @ gteFLAG

    vpop        {d8-d15}
    pop         {r4-r11,pc}
    .size       gteRTPT_neon, .-gteRTPT_neon
527
528
529
@ gteMVMVA_neon: matrix * vector + vector, selected by the op word.
@ in:     r0 - CP2 context (data registers, control registers at +4*32)
@         r1 - op; the fields decoded here are
@                bits 17-18  mx  matrix select (3: matrix stays all zero)
@                bits 15-16  v   vector select (3: use gteIR1-3)
@                bits 13-14  cv  add-vector select (3: add nothing)
@                bit  19         shift the 64-bit sums right by 12
@                bit  10     lm  clamp gteIR to >= 0
@ writes: gteMAC1-3 (+4*25), gteIR1-3 (+4*9), gteFLAG (+4*(32+31))
@ trash:  r1-r3, r12, condition flags, q0-q3, q8-q10, q15
@ keeps:  r4-r5 and d8-d10; the latter are callee-saved under AAPCS,
@         so they are saved/restored here although the code uses them
.global gteMVMVA_neon @ r0=CP2 (d,c), op
gteMVMVA_neon:
    push        {r4-r5,lr}
    vpush       {d8-d10}                @ q4, d10 are used as work registers

    add         r12, r0, #4*32          @ control register base

    ubfx        r2, r1, #15, #2         @ v

    vmov.i32    q0, #0                  @ d0,d1
    vmov.i32    q1, #0                  @ d2,d3
    vmov.i32    q2, #0                  @ d4,d5
    cmp         r2, #3
    addeq       r4, r0, #4*9            @ v == 3: gteIR1-3
    addne       r3, r0, r2, lsl #3      @ else:   VXYZ(v), 8 bytes each
    ldmeqia     r4, {r3-r5}
    ldmneia     r3, {r4,r5}
    pkhbteq     r4, r3, r4, lsl #16     @ pack IR1, IR2 like a VXY word
    uxth        r5, r5                  @ clear the unused upper lane
    vmov.32     d8[0], r4
    vmov.32     d8[1], r5               @ VXYZ(v)
    ubfx        r3, r1, #17, #2         @ mx
    ubfx        r2, r1, #13, #2         @ cv
    cmp         r3, #3
    beq         0f                      @ very rare case
    add         r3, r12, r3, lsl #5
    vldmia      r3, {d0-d2}             @ MXxy/gteR* [16*9]
0:
    cmp         r2, #3
    add         r3, r12, r2, lsl #5
    beq         0f
    add         r3, #4*5
    vldmia      r3, {d4-d5}             @ CVx/gteTR*

0:
    vmov.i32    q15, #0
    vext.16     d2, d1, d2, #2          @ xxx3 -> x321
    vext.16     d1, d0, d1, #3          @ xx32 -> x321
    vshll.s32   q3, d5, #12             @ gteTRZ/CV3
    vshll.s32   q2, d4, #12             @ gteTR|XY/CV12

    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9
    tst         r1, #1<<19
    beq         0f
    vshr.s64    q2, q2, #12
    vshr.s64    q3, q3, #12
0:
    vqmovn.s64  d8, q2                  @ gteMAC|12
    vqmovn.s64  d9, q3                  @ gteMAC3

    tst         r1, #1<<10              @ result is consumed by beq below
    add         r3, r0, #4*25
    vqmovn.s32  d10, q4                 @ gteIR|123
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]             @ wb gteMAC|123

    beq         0f
    vmax.s16    d10, d31                @ lm: no negative IR
0:
    vmovl.s16   q9, d10                 @ expand gteIR|123
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]

    tst         r1, #1<<10              @ lm
    mov         r2, #0
    mov         lr, #0                  @ gteFLAG
    mov         r12, #15
    moveq       r2, #0x8000             @ adj
    moveq       r12, #16                @ shift

    add         r3, r0, #4*25
    ldmia       r3, {r3-r5}             @ gteMAC|123

    do_mac_flags r3, r4, r5

    @ lm set:   flag if MAC >> 15 != 0             (outside 0..0x7fff)
    @ lm clear: flag if (MAC + 0x8000) >> 16 != 0  (outside s16)
    add         r3, r2
    add         r4, r2
    add         r5, r2
    asrs        r3, r12
    orrne       lr, #(1<<31)|(1<<24)    @ IR1/limB1
    asrs        r4, r12
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)            @ IR2/limB2
    asrs        r5, r12
    orrne       lr, #(1<<22)            @ IR3/limB3
    str         lr, [r0, #4*(32+31)]    @ gteFLAG

    vpop        {d8-d10}
    pop         {r4-r5,pc}
    .size       gteMVMVA_neon, .-gteMVMVA_neon
626
627
628
629@ vim:filetype=armasm