gte: be friendly to more assemblers
[pcsx_rearmed.git] / libpcsxcore / gte_neon.S
/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */


.syntax unified

.bss
.align 6 @ cacheline

scratch:
.rept 8*8*2/4
    .word 0
.endr

.text
.align 2

.macro ldr_scratch rd
#ifndef __PIC__
    movw \rd, #:lower16:scratch
    movt \rd, #:upper16:scratch
#else
    ldr \rd, =scratch
#endif
.endm
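@ note: both variants above load the absolute address of the scratch buffer;
@ movw/movt does it in two instructions with no memory access, while the
@ "ldr =scratch" form goes through a literal pool. The __PIC__ split is
@ presumably because the absolute movw/movt relocations are not suitable for
@ position-independent builds.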

@ XXX: gteMAC calc shouldn't be saturating, but it is here

@ approximate gteMAC|123 flags
@ in: rr 123 as gteMAC|123
@ trash: nothing
.macro do_mac_flags rr1 rr2 rr3
    cmp \rr1, #1
    orrvs lr, #(1<<31)|(1<<27)
    cmp \rr2, #1
    orrvs lr, #(1<<31)|(1<<26)
    cmp \rr3, #1
    orrvs lr, #(1<<31)|(1<<25)
    cmn \rr1, #1 @ same as adds ...
    orrvs lr, #(1<<30)
    cmn \rr2, #1
    orrvs lr, #(1<<29)
    cmn \rr3, #1
    orrvs lr, #(1<<28)
.endm
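@ note: since the NEON math already saturated gteMAC to 32 bits, overflow is
@ approximated by testing for the saturation boundary values: "cmp rr, #1"
@ sets V only for rr == 0x80000000, "cmn rr, #1" only for rr == 0x7fffffff.
@ Roughly, per value (a sketch, not the exact C emulator code):
@   if (mac1 == INT32_MIN) gteFLAG |= (1<<31)|(1<<27); /* negative overflow */
@   if (mac1 == INT32_MAX) gteFLAG |= (1<<30);         /* positive overflow */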

@ approximate 3x gteMACn flags
@ in: rr 123 as 3 instances gteMACn, *flags
@ trash: nothing
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    cmp \rr1, #1
    cmpvc \rr2, #1
    cmpvc \rr3, #1
    orrvs lr, #\nflags
    cmn \rr1, #1 @ adds ...
    cmnvc \rr2, #1
    cmnvc \rr3, #1
    orrvs lr, #\pflags
.endm

@ get gteIR|123 flags from gteMAC|123
@ in: rr 123 as gteMAC|123
@ trash: r2,r3
.macro do_irs_flags rr1 rr2 rr3
    add r2, \rr1, #0x8000
    add r3, \rr2, #0x8000
    lsrs r2, #16
    orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
    lsrs r3, #16
    add r2, \rr3, #0x8000
    orrne lr, #(1<<31)
    orrne lr, #(1<<23) @ IR2/limB2
    lsrs r2, #16
    orrne lr, #(1<<22) @ IR3/limB3
.endm
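@ note: this checks whether gteMAC|n still fits the signed 16-bit range that
@ gteIR|n is clamped to.  Adding 0x8000 maps -0x8000..0x7fff onto 0..0xffff,
@ so "lsrs #16" leaves a non-zero result (NE) exactly when the value was out
@ of range.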


/*
 * RTPS/RTPT register map:
 *
 *  q | d | c code / phase 1       phase 2            scratch
 *  0   0   gteR1* [s16]           gteMAC3     =      gteMAC3   \ v=0 *
 *      1   gteR2*                 gteIR1-3    =      gteIR1-3  /     *
 *  1   2   gteR3*                 gteMAC3     =      gteMAC3   \ v=1
 *      3   *                      gteIR1-3    =      gteIR1-3  /
 *  2   4   gteTRX<<12 [s64]       gteOFX [s64]       gteMAC3   \ v=2
 *      5   gteTRY<<12             gteOFY [s64]       gteIR1-3  /
 *  3   6   gteTRZ<<12             gteDQA [s64]       min gteMAC|12 v=012
 *      7   0                      gteDQB [s64]       max gteMAC|12
 *  4   8   VXYZ(v)                / gteMAC1,2 [s32]  min gteIR|123
 *      9   *                      / gteMAC3          max gteIR|123
 *  5  10   gteIR1-3 [s16]         gteIR1-3 v=2       quotients 12
 *     11   0                                         quotient 3
 *  6  12   gteH (adj. for cmp)
 *     13   gteH (float for div)
 * ...      <scratch>
 * 15  30   0
 *     31   0
 */

@ load gteR*, gteTR* and gteH (see map above), clear q15
@ in: r0 - context
@ trash: r3
.macro rtpx_preload
    add r3, r0, #4*32
    vldmia r3, {d0-d2} @ gteR* [16*9]
    vmov.i32 q15, #0
    add r3, r0, #4*(32+5)
    vldmia r3, {d4-d5} @ gteTR*
    vext.16 d2, d1, d2, #2 @ xxx3 -> x321
    vext.16 d1, d0, d1, #3 @ xx32 -> x321
    add r3, r0, #4*(32+26)
    vld1.32 d11[0], [r3] @ gteH
    vshll.s32 q3, d5, #12 @ gteTRZ
    vshll.s32 q2, d4, #12 @ gteTR|XY
    vmovl.s16 q6, d11 @ gteH
.endm
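@ note: gteR* is nine consecutive s16 values, so after the vldmia rows 2 and 3
@ start in the middle of d0/d1; the two vext.16 above realign them so d0,d1,d2
@ each begin with a full matrix row.  The leftover 4th lane in each register is
@ harmless because lane 3 of the vector operand (d8[3]) is zeroed before the
@ multiplies.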

@ do RTP* gteMAC* calculation
@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
@ trash: d16-d21
.macro rtpx_mac
    vmull.s16 q8, d0, d8
    vmull.s16 q9, d1, d8
    vmull.s16 q10, d2, d8
    vpaddl.s32 q8, q8
    vpaddl.s32 q9, q9
    vpaddl.s32 q10, q10
    vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
    vadd.s64 d18, d19 @ d8[3]==0, so won't affect
    vadd.s64 d20, d21 @ QC
    vadd.s64 d16, d4
    vadd.s64 d18, d5
    vadd.s64 d20, d6
    vqshrn.s64 d8, q8, #12 @ gteMAC1
    vqshrn.s64 d18, q9, #12 @ gteMAC2
    vqshrn.s64 d9, q10, #12 @ gteMAC3
    vsli.u64 d8, d18, #32 @ gteMAC|12
    vmov.32 d9[1], r12
    vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
.endm
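@ note: in effect this computes, per row n (a rough summary; the result is
@ saturated to s32 by vqshrn rather than kept at full GTE width, see the XXX
@ note above):
@   gteMACn = (gteTRn*0x1000 + Rn1*VX + Rn2*VY + Rn3*VZ) >> 12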

.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push {r4-r6,lr}

@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
    ldr_scratch r1
    mov r12, #0

    vldmia r0, {d8} @ VXYZ(0)
    rtpx_preload

@ rtpx_mac @ slower here, faster in RTPT?
    vmov.16 d8[3], r12 @ kill unused upper vector
    vmull.s16 q8, d0, d8
    vmull.s16 q9, d1, d8
    vmull.s16 q10, d2, d8
    vpadd.s32 d16, d16, d17
    vpadd.s32 d17, d18, d19
    vpadd.s32 d18, d20, d21
    vpadal.s32 q2, q8
    vpadal.s32 q3, q9 @ d6, d18 is slow?
    vqshrn.s64 d8, q2, #12 @ gteMAC|12
    vqshrn.s64 d9, q3, #12 @ gteMAC3

    add r3, r0, #4*25
    vst1.32 d8, [r3]!
    vst1.32 d9[0], [r3] @ wb gteMAC|123
    vqmovn.s32 d10, q4 @ gteIR|123

    add r3, r0, #4*17 @ gteSZ*
    vldmia r3, {q7} @ d14,d15 gteSZ|123x
    vmov.i32 d28, #0xffff @ 0xffff[32]
    vmax.s32 d11, d9, d31
    vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp)
    vmov.i32 d26, #1
    vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16 q9, d10 @ || expand gteIR|123
    vshl.u32 d13, d12, #16 @ | preparing gteH
    add r3, r0, #4*9
    vst1.32 d18, [r3]!
    vst1.32 d19[0], [r3]

    vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7
    vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3?

    add r3, r0, #4*(32+24)
    vld1.32 d4, [r3] @ || gteOF|XY
    add r3, r0, #4*(32+27)
    vld1.32 d6, [r3] @ || gteDQ|AB

    vand d11, d16
    vmovl.s32 q2, d4 @ || gteOF|XY [64]
    vmax.u32 d11, d26 @ make divisor 1 if not
    vmovl.s32 q3, d6 @ || gteDQ|AB [64]
    add r3, r0, #4*16 @ | gteSZ*
    vstmia r3, {q7} @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13 @ gteH (float for div)
    vcvt.f32.u32 d11, d11 @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32 s22, s26, s22

    vmov.f32 d20, #0.5
    vadd.f32 d11, d20
    vcvt.u32.f32 d11, d11 @ quotient
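    @ note: vcvt.u32.f32 truncates towards zero, so 0.5 was added above to get
    @ round-to-nearest for the quotient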

    @ while NEON's busy we calculate some flags on ARM
    add r3, r0, #4*25
    mov lr, #0 @ gteFLAG
    ldmia r3, {r4-r6} @ gteMAC|123

    vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE)
    vqshl.u32 d11, #15

    do_mac_flags r4, r5, r6

    vshr.u32 d11, #15 @ quotient (limE)

    do_irs_flags r4, r5, r6

    vmlal.s32 q2, d18, d11[0] @ gteOF|XY + gteIR|12 * quotient
    add r3, r0, #4*13
    vld1.32 d16, [r3] @ || load fS|XY12, new 01
    vqmovn.s64 d18, q2 @ saturate to 32
    vmull.s32 q10, d6, d11[0] @ | d20 = gteDQA * quotient
    vqshl.s32 d19, d18, #5 @ 11bit precision

    ldr r4, [r1] @ quotient
    movs r3, r6, lsr #16
    orrne lr, #(1<<31)
    orrne lr, #(1<<18) @ fSZ (limD)

    vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG

    vshr.s32 d18, d19, #16+5 @ can't vqshrn because of insn
    vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient
    vmovn.s32 d18, q9 @ fS|XY2 [s16]

    vqmovn.s64 d20, q10 @ | gteMAC0
    add r3, r0, #4*12
    vst1.32 d16, [r3]! @ writeback fS|XY01
    vst1.32 d18[0], [r3] @ ...2
    add r3, r0, #4*24
    vshr.s32 d21, d20, #12
    vst1.32 d20[0], [r3] @ gteMAC0

    movs r4, r4, lsr #17
    orrne lr, #(1<<31)
    orrne lr, #(1<<17) @ limE

    vmax.s32 d21, d31
    vmov.i32 d22, #0x1000
    vmin.s32 d21, d22
    add r3, r0, #4*8
    vst1.16 d21[0], [r3] @ gteIR0

    ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat
    add r2, r4, #0x400<<16
    add r3, r5, #0x400<<16
    lsrs r2, #16+11
    orrne lr, #(1<<14) @ limG1
    orrne lr, #(1<<31)
    lsrs r3, #16+11
    orrne lr, #(1<<13) @ limG2
    orrne lr, #(1<<31)
    adds r2, r4, #1
    addsvc r3, r5, #1
    orrvs lr, #(1<<16) @ F
    orrvs lr, #(1<<31)
    subs r2, r4, #1
    subsvc r3, r5, #1
    orrvs lr, #(1<<31)

    ldr r4, [r0, #4*24] @ gteMAC0
    orrvs lr, #(1<<15)

    adds r3, r4, #1
    orrvs lr, #(1<<16) @ F
    orrvs lr, #(1<<31)
    subs r2, r4, #1
    orrvs lr, #(1<<15) @ F
    orrvs lr, #(1<<31)
    cmp r4, #0x1000
    orrhi lr, #(1<<12) @ limH

    str lr, [r0, #4*(32+31)] @ gteFLAG

    pop {r4-r6,pc}
    .size gteRTPS_neon, .-gteRTPS_neon



.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push {r4-r11,lr}

    ldr_scratch r1
    mov r12, #0

    rtpx_preload

    vmov.i32 d22, #0x7fffffff
    vmov.i32 d23, #0x80000000
    mov r3, #3 @ counter
    mov r2, r0 @ VXYZ(0)
0:
    vldmia r2!, {d8} @ VXYZ(v)
    vmov.16 d8[3], r12 @ kill unused upper vector

    rtpx_mac
    vmin.s32 d22, d8 @ min gteMAC|12
    vmax.s32 d23, d8 @ max gteMAC|12
    subs r3, #1
    vst1.32 {d9,d10}, [r1, :128]!
    bgt 0b

    vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub r1, r1, #8*2*4
    vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY

    vmov d20, d0 @ gteMAC3 v=0
    vmin.s16 d24, d1, d3 @ | find min IR
    vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp)
    vmax.s16 d25, d1, d3 @ | .. also max, for flag gen
    vsli.u64 d20, d2, #32 @ gteMAC3 v=1
    vmov d21, d9 @ ... v=2

    vmov.i32 q14, #0xffff @ 0xffff[32]
    vmax.s32 q10, q15
    vmov.i32 q13, #1
    vdup.32 q11, d22[0] @ gteH/2
    vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16 d24, d10 @ | find min/max IR
    vmax.s16 d25, d10 @ |

    add r3, r0, #4*19 @ ||
    vld1.32 d14[0], [r3] @ || gteSZ3

    vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)?
    add r3, r0, #4*17
    vst1.32 d20, [r3]! @ | writeback fSZ(v)
    vand q11, q10, q11
    vst1.32 d21[0], [r3] @ |
    vmax.u32 q10, q11, q13 @ make divisor 1 if not
    add r3, r1, #8*8
    vstmia r3, {q12} @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32 d13, d12, #16 @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add r2, r1, #8*2*3
    mov lr, #0 @ gteFLAG
    ldmia r2, {r4-r7} @ min/max gteMAC|12
    subs r2, r4, #1
    orrvs lr, #(1<<31)|(1<<27)
    subs r3, r5, #1
    orrvs lr, #(1<<31)|(1<<26)
    adds r2, r6, #1
    orrvs lr, #(1<<30)
    adds r3, r7, #1
    orrvs lr, #(1<<29)
    ldr r4, [r1, #0] @ gteMAC3 v=0
    ldr r5, [r1, #8*2] @ ... v=1
    ldr r6, [r1, #8*4] @ ... v=2

    add r3, r0, #4*(32+24)
    vld1.32 d4, [r3] @ || gteOF|XY
    add r3, r0, #4*(32+27)
    vld1.32 d6, [r3] @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32 q11, q10 @ inv
    vmovl.s32 q2, d4 @ || gteOF|XY [64]
    vmovl.s32 q3, d6 @ || gteDQ|AB [64]
    vrecps.f32 q12, q10, q11 @ step
    vcvt.f32.u32 d13, d13 @ | gteH (float for div)
    vmov.f32 q8, #0.5 @ |||
    vmul.f32 q11, q12, q11 @ better inv
    add r3, r0, #4*16
    vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
    vdup.32 q13, d13[0] @ |
@ vrecps.f32 q12, q10, q11 @ step
@ vmul.f32 q11, q12, q11 @ better inv
    vmul.f32 q10, q13, q11 @ result
.else
    vmov.f32 q8, #0.5 @ |||
    vmovl.s32 q2, d4 @ || gteOF|XY [64]
    vmovl.s32 q3, d6 @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13 @ | gteH (float for div)
    vdup.32 q13, d13[0] @ |
    add r3, r0, #4*16
    vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3

    vpush {q0}
    vmov q0, q10 @ to test against C code
    vdiv.f32 s0, s26, s0
    vdiv.f32 s1, s26, s1
    vdiv.f32 s2, s26, s2
    vmov q10, q0
    vpop {q0}
.endif
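    @ note: the .if 1 path replaces the per-vertex divides with a NEON
    @ reciprocal: vrecpe gives a rough estimate of 1/divisor and vrecps
    @ computes (2 - divisor*est), so "vmul q11, q12, q11" is one Newton-Raphson
    @ refinement step (a second step is left commented out).  The .else path
    @ keeps exact VFP divides for checking against the C implementation.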

    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<27) @ MAC3
    orr r7, r4, r5
    add r4, r1, #8*8
    orr r3, r7, r6
    ldmia r4, {r7,r8,r10,r11} @ min/max IR

    movs r3, r3, lsr #16
    orrne lr, #(1<<31)
    orrne lr, #(1<<18) @ fSZ (limD)

    vadd.f32 q10, q8 @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16 q9, d1 @ expand gteIR|12 v=0
    vmovl.s16 q10, d3 @ expand gteIR|12 v=1
    add r6, r1, #8*10
    vstmia r6, {q8} @ wb quotients for flags (pre-limE)
    vqshl.u32 q8, #15
    vmovl.s16 q11, d10 @ expand gteIR|12 v=2
    vshr.u32 q8, #15 @ quotients (limE)
    vdup.32 d24, d16[0]
    vdup.32 d25, d16[1]
    vdup.32 d26, d17[0] @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    mov r4, #0x10000
    cmp r7, #1<<16
    cmnvc r10, #1<<16
    orrvs lr, #(1<<31)
    orrvs lr, #(1<<23) @ IR2/limB2
    rsbs r2, r4, r7, lsl #16
    cmnvc r4, r10, lsl #16
    orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1
    rsbs r2, r4, r8, lsl #16
    cmnvc r4, r11, lsl #16
    orrvs lr, #(1<<22) @ IR3/limB3

    vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0
    vmull.s32 q10, d20, d25 @ ... v=1
    vmull.s32 q11, d22, d26 @ ... v=2
    vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient
    vadd.s64 q10, q2 @ ... v=1
    vadd.s64 q11, q2 @ ... v=2
    vqmovn.s64 d18, q9 @ saturate to 32 v=0
    vqmovn.s64 d19, q10 @ ... v=1
    vqmovn.s64 d20, q11 @ ... v=2
    vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32]
    vmax.s32 d15, d18, d19 @ || for flags
    vmin.s32 d14, d20
    vmax.s32 d15, d20
    vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1
    vqshl.s32 d24, d20, #5 @ ... v=2
    vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2
    vpmin.s32 d16, d14, d31 @ || also find min/max in pair
    vpmax.s32 d17, d15, d31 @ ||
    vshr.s32 q11, #16+5 @ can't vqshrn because of insn
    vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :(
    vsli.u64 d16, d17, #32 @ || pack in-pair min/max
    vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient
    vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1
    vmovn.s32 d13, q12 @ 3
    vstmia r1, {d14-d16} @ || other cacheline than quotients
    add r3, r0, #4*12
    vst1.32 d12, [r3]! @ writeback fS|XY v=0,1
    vst1.32 d13[0], [r3]

    vqmovn.s64 d26, q13 @ | gteMAC0
    vmovl.u16 q5, d10 @ expand gteIR|123 v=2

    vmov.i32 d13, #0x1000
    vshr.s32 d12, d26, #12

    add r3, r0, #4*24
    vst1.32 d26[0], [r3]! @ gteMAC0
    vmax.s32 d12, d30
    vst1.32 d8, [r3]! @ gteMAC123 (last iteration)
    vst1.32 d9[0], [r3]

    vmin.s32 d12, d13 @ | gteIR0

    ldmia r6, {r4-r6} @ quotients
    orr r4, r5
    orr r4, r6
    add r3, r0, #4*8
    movs r4, r4, lsr #17

    vst1.32 d12[0], [r3]! @ gteIR0
    vst1.32 d10, [r3]! @ gteIR12
    vst1.32 d11[0], [r3] @ ..3

    @ ~23 cycles
    orrne lr, #(1<<31) @ limE
    orrne lr, #(1<<17) @ limE
    ldmia r1, {r4-r9}
    add r2, r4, #0x400<<16 @ min fSX
    add r3, r6, #0x400<<16 @ max fSX
    lsrs r2, #16+11
    lsrseq r3, #16+11
    orrne lr, #(1<<31) @ limG1
    orrne lr, #(1<<14)
    add r2, r5, #0x400<<16 @ min fSY
    add r3, r7, #0x400<<16 @ max fSY
    lsrs r2, #16+11
    lsrseq r3, #16+11
    orrne lr, #(1<<31) @ limG2
    orrne lr, #(1<<13)
    adds r2, r9, #1
    orrvs lr, #(1<<16) @ F (31 already done by above)
    subs r3, r8, #1

    ldr r4, [r0, #4*24] @ gteMAC0
    orrvs lr, #(1<<15)

    adds r3, r4, #1
    orrvs lr, #(1<<16)
    orrvs lr, #(1<<31) @ F
    subs r2, r4, #1
    orrvs lr, #(1<<15)
    orrvs lr, #(1<<31) @ F
    cmp r4, #0x1000
    orrhi lr, #(1<<12) @ limH

    str lr, [r0, #4*(32+31)] @ gteFLAG

    pop {r4-r11,pc}
    .size gteRTPT_neon, .-gteRTPT_neon



@ note: non-std calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r1 = op
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
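@ note: r1 is the raw GTE opcode: bit 19 (sf) selects the >>12 shift of the
@ result and bit 10 (lm) clamps gteIR to 0 from below (both presumably per the
@ GTE command word layout)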
.global gteMVMVA_part_neon
gteMVMVA_part_neon:
    uxth r5, r5
    vmov.32 d8[0], r4
    vmov.32 d8[1], r5 @ VXYZ(v)
    vldmia r6, {d0-d2} @ MXxy/gteR* [16*9]
    vldmia r7, {d4-d5} @ CVx/gteTR*

    vmov.i32 q15, #0
    vext.16 d2, d1, d2, #2 @ xxx3 -> x321
    vext.16 d1, d0, d1, #3 @ xx32 -> x321
    vshll.s32 q3, d5, #12 @ gteTRZ/CV3
    vshll.s32 q2, d4, #12 @ gteTR|XY/CV12

    vmull.s16 q8, d0, d8
    vmull.s16 q9, d1, d8
    vmull.s16 q10, d2, d8
    vpadd.s32 d16, d16, d17
    vpadd.s32 d17, d18, d19
    vpadd.s32 d18, d20, d21
    vpadal.s32 q2, q8
    vpadal.s32 q3, q9
    tst r1, #1<<19
    beq 0f
    vshr.s64 q2, q2, #12
    vshr.s64 q3, q3, #12
0:
    vqmovn.s64 d8, q2 @ gteMAC|12
    vqmovn.s64 d9, q3 @ gteMAC3

    tst r1, #1<<10
    add r3, r0, #4*25
    vqmovn.s32 d10, q4 @ gteIR|123
    vst1.32 d8, [r3]!
    vst1.32 d9[0], [r3] @ wb gteMAC|123

    beq 0f
    vmax.s16 d10, d31
0:
    vmovl.s16 q9, d10 @ expand gteIR|123
    add r3, r0, #4*9
    vst1.32 d18, [r3]!
    vst1.32 d19[0], [r3]
    bx lr
    .size gteMVMVA_part_neon, .-gteMVMVA_part_neon


@ get flags after gteMVMVA_part_neon operation
.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm
gteMACtoIR_flags_neon:
    push {r4,r5,lr}
    tst r1, r1 @ lm
    mov lr, #0 @ gteFLAG
    mov r2, #0
    mov r12, #15
    moveq r2, #0x8000 @ adj
    moveq r12, #16 @ shift

    add r3, r0, #4*25
    ldmia r3, {r3-r5} @ gteMAC|123

    do_mac_flags r3, r4, r5

    add r3, r2
    add r4, r2
    add r5, r2
    asrs r3, r12
    orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
    asrs r4, r12
    orrne lr, #(1<<31)
    orrne lr, #(1<<23) @ IR2/limB2
    asrs r5, r12
    orrne lr, #(1<<22) @ IR3/limB3
    str lr, [r0, #4*(32+31)] @ gteFLAG

    pop {r4,r5,pc}
    .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon



@ vim:filetype=armasm