gte_neon: make rtps division better match rtpt
[pcsx_rearmed.git] / libpcsxcore / gte_neon.s
... / ...
CommitLineData
/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */
7
8
@ Scratch buffer: 8*8*2 = 128 bytes of zero-initialised storage in .bss,
@ aligned to 2^6 = 64 bytes. gteRTPS_neon and gteRTPT_neon use it to hand
@ intermediate NEON results over to the ARM-side flag calculations.
.bss
.align 6                        @ cacheline

scratch:
.rept 8*8*2/4
    .word 0
.endr

.text
.align 2
19
@ XXX: gteMAC calc shouldn't be saturating, but it is here
21
@ approximate gteMAC|123 flags
@
@ "cmp rr, #1" (rr - 1) sets V only when rr == 0x80000000 and
@ "cmn rr, #1" (rr + 1) sets V only when rr == 0x7fffffff, so a flag is
@ raised exactly when a value sits on one of the s32 limits.
@ in: rr 123 as gteMAC|123
@     lr - gteFLAG accumulator (bits are ORed in, never cleared)
@ trash: nothing (only the condition flags change)
.macro do_mac_flags rr1 rr2 rr3
    cmp         \rr1, #1
    orrvs       lr, #(1<<31)|(1<<27)
    cmp         \rr2, #1
    orrvs       lr, #(1<<31)|(1<<26)
    cmp         \rr3, #1
    orrvs       lr, #(1<<31)|(1<<25)
    cmn         \rr1, #1       @ same as adds ...
    orrvs       lr, #(1<<30)
    cmn         \rr2, #1
    orrvs       lr, #(1<<29)
    cmn         \rr3, #1
    orrvs       lr, #(1<<28)
.endm
39
@ approximate 3x gteMACn flags
@
@ Same overflow test as do_mac_flags, but the three registers are three
@ instances of the *same* gteMACn, so a single flag pair is enough:
@ \nflags is ORed into lr if any register equals 0x80000000,
@ \pflags is ORed into lr if any register equals 0x7fffffff.
@ The cmpvc/cmnvc chain stops comparing as soon as V is set, which keeps
@ V live for the following orrvs.
@ in: rr 123 as 3 instances gteMACn, *flags
@     lr - gteFLAG accumulator
@ trash: nothing (only the condition flags change)
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    cmp         \rr1, #1
    cmpvc       \rr2, #1
    cmpvc       \rr3, #1
    orrvs       lr, #\nflags
    cmn         \rr1, #1       @ adds ...
    cmnvc       \rr2, #1
    cmnvc       \rr3, #1
    orrvs       lr, #\pflags
.endm
53
@ get gteIR|123 flags from gteMAC|123
@
@ Each value is biased by 0x8000 and logically shifted right by 16; a
@ non-zero result means the value lies outside -0x8000..0x7fff.
@ Note that the IR3 case sets only bit 22, without bit 31.
@ in: rr 123 as gteMAC|123 (rr2/rr3 must not be r2, rr3 must not be r3,
@     since r2/r3 are overwritten before those inputs are read)
@     lr - gteFLAG accumulator
@ trash: r2,r3
.macro do_irs_flags rr1 rr2 rr3
    add         r2, \rr1, #0x8000
    add         r3, \rr2, #0x8000
    lsrs        r2, #16
    orrne       lr, #(1<<31)|(1<<24) @ IR1/limB1
    lsrs        r3, #16
    add         r2, \rr3, #0x8000 @ plain add: flags from lsrs r3 stay live
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)   @ IR2/limB2
    lsrs        r2, #16
    orrne       lr, #(1<<22)   @ IR3/limB3
.endm
69
70
/*
 * RTPS/RTPT register map:
 *
 *  q | d | c code / phase 1   phase 2          scratch
 *  0   0   gteR1* [s16]       gteMAC3     =    gteMAC3  \ v=0 *
 *      1   gteR2*             gteIR1-3    =    gteIR1-3 /     *
 *  1   2   gteR3*             gteMAC3     =    gteMAC3  \ v=1
 *      3   *                  gteIR1-3    =    gteIR1-3 /
 *  2   4   gteTRX<<12 [s64]   gteOFX [s64]     gteMAC3  \ v=2
 *      5   gteTRY<<12         gteOFY [s64]     gteIR1-3 /
 *  3   6   gteTRZ<<12         gteDQA [s64]     min gteMAC|12 v=012
 *      7   0                  gteDQB [s64]     max gteMAC|12
 *  4   8   VXYZ(v)  /    gteMAC1,2 [s32]       min gteIR|123
 *      9   *        /    gteMAC3               max gteIR|123
 *  5  10   gteIR1-3 [s16]     gteIR1-3 v=2     quotients 12
 *     11   0                                   quotient 3
 *  6  12   gteH (adj. for cmp)
 *     13   gteH (float for div)
 * ...      <scratch>
 * 15  30   0
 *     31   0
 */
93
@ load gteR*, gteTR* and gteH (see map above), clear q15
@
@ The nine s16 matrix elements are read as three d-registers starting at
@ context offset 4*32 and then realigned with vext so that each of d0-d2
@ starts with one matrix row. The translation words (offset 4*(32+5)) are
@ widened to s64 and pre-shifted left by 12. The word at offset 4*(32+26)
@ (gteH) is loaded into d11[0] and its halfwords are sign-extended into q6,
@ so d12[0] holds gteH as a 32-bit value.
@ in: r0 - context
@ out: d0-d2, q2, q3, q6 as described; q15 = 0
@ trash: r3
.macro rtpx_preload
    add         r3, r0, #4*32
    vldmia      r3, {d0-d2}    @ gteR*  [16*9]
    vmov.i32    q15, #0
    add         r3, r0, #4*(32+5)
    vldmia      r3, {d4-d5}    @ gteTR*
    vext.16     d2, d1, d2, #2 @ xxx3 -> x321
    vext.16     d1, d0, d1, #3 @ xx32 -> x321
    add         r3, r0, #4*(32+26)
    vld1.32     d11[0], [r3]   @ gteH
    vshll.s32   q3, d5, #12    @ gteTRZ
    vshll.s32   q2, d4, #12    @ gteTR|XY
    vmovl.s16   q6, d11        @ gteH
.endm
111
@ do RTP* gteMAC* calculation
@
@ Multiplies each matrix row (d0-d2) element-wise with the vertex in d8,
@ sums the products into one s64 per row, adds the pre-shifted translation
@ (d4-d6) and narrows with a saturating >>12 to s32. The IR values are a
@ further saturating narrow of the MAC values to s16.
@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
@     (d8[3] must already be 0 so the 4th product does not contribute)
@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
@ trash: d16-d21
.macro rtpx_mac
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpaddl.s32  q8, q8
    vpaddl.s32  q9, q9
    vpaddl.s32  q10, q10
    vadd.s64    d16, d17       @ d16=d0.16[2]*d8.16[2], as
    vadd.s64    d18, d19       @ d8[3]==0, so won't affect
    vadd.s64    d20, d21       @ QC
    vadd.s64    d16, d4
    vadd.s64    d18, d5
    vadd.s64    d20, d6
    vqshrn.s64  d8, q8, #12    @ gteMAC1
    vqshrn.s64  d18, q9, #12   @ gteMAC2
    vqshrn.s64  d9, q10, #12   @ gteMAC3
    vsli.u64    d8, d18, #32   @ gteMAC|12
    vmov.32     d9[1], r12     @ zero the unused lane next to gteMAC3
    vqmovn.s32  d10, q4        @ gteIR|123; losing 2 cycles?
.endm
136
@ gteRTPS_neon - RTPS (single vertex) using NEON, with the perspective
@ division done by the VFP divider.
@
@ in:  r0 - CP2 context (data registers first, control registers at +4*32)
@ out: written back into the context:
@        +4*25        gteMAC|123
@        +4*9         gteIR|123 (sign-extended to 32 bits)
@        +4*16        gteSZ* (shifted down by one, new fSZ3 appended)
@        +4*12        fS|XY* (shifted down by one, new value appended)
@        +4*24        gteMAC0
@        +4*8         gteIR0 (16-bit store)
@        +4*(32+31)   gteFLAG
@ trash: r1-r3, r12, condition flags, the first 16 bytes of scratch and
@        NEON registers d0-d2, d4-d22, d26, d28, q15.
@        r4-r6 and lr are saved and restored on the stack.
@ NOTE(review): d8-d15 are overwritten without being saved, although AAPCS
@ makes them callee-saved; callers must not keep live values there.
@
@ NEON and ARM instructions are deliberately interleaved so the ARM side
@ computes flags while NEON results are pending. Several conditional ARM
@ instructions consume flags set a few lines earlier, across NEON
@ instructions (which leave the ARM condition flags alone), so the
@ statement order must be preserved.
.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push        {r4-r6,lr}

@    fmrx        r4, fpscr      @ vmrs? at least 40 cycle hit
    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    vldmia      r0, {d8}       @ VXYZ(0)
    rtpx_preload

@    rtpx_mac                  @ slower here, faster in RTPT?
    vmov.16     d8[3], r12     @ kill unused upper vector
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9         @ d6, d18 is slow?
    vqshrn.s64  d8, q2, #12    @ gteMAC|12
    vqshrn.s64  d9, q3, #12    @ gteMAC3

    add         r3, r0, #4*25
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]    @ wb gteMAC|123
    vqmovn.s32  d10, q4        @ gteIR|123

    add         r3, r0, #4*17  @ gteSZ*
    vldmia      r3, {q7}       @ d14,d15 gteSZ|123x
    vmov.i32    d28, #0xffff   @ 0xffff[32]
    vmax.s32    d11, d9, d31
    vshr.s32    d16, d12, #1   @ | gteH/2 (adjust for cmp)
    vmov.i32    d26, #1
    vmin.u32    d11, d28       @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16   q9, d10        @ || expand gteIR|123
    vshl.u32    d13, d12, #16  @ | preparing gteH
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]

    vsli.u64    d15, d11, #32  @ new gteSZ|0123 in q7
    vclt.u32    d16, d16, d11  @ gteH/2 < fSZ3?

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]       @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]       @ || gteDQ|AB

    vand        d11, d16
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmax.u32    d11, d26       @ make divisor 1 if not
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    add         r3, r0, #4*16  @ | gteSZ*
    vstmia      r3, {q7}       @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13      @ gteH (float for div)
    vcvt.f32.u32 d11, d11      @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32    s22, s26, s22  @ d11[0] = d13[0] / d11[0]

    vmov.f32    d20, #0.5
    vadd.f32    d11, d20       @ +0.5 before the float->int conversion
    vcvt.u32.f32 d11, d11      @ quotient

    @ while NEON's busy we calculate some flags on ARM
    add         r3, r0, #4*25
    mov         lr, #0         @ gteFLAG
    ldmia       r3, {r4-r6}    @ gteMAC|123

    vst1.32     d11, [r1, :64] @ wb quotient for flags (pre-limE)
    vqshl.u32   d11, #15

    do_mac_flags r4, r5, r6

    vshr.u32    d11, #15       @ quotient (limE)

    do_irs_flags r4, r5, r6

    vmlal.s32   q2, d18, d11[0] @ gteOF|XY + gteIR|12 * quotient
    add         r3, r0, #4*13
    vld1.32     d16, [r3]      @ || load fS|XY12, new 01
    vqmovn.s64  d18, q2        @ saturate to 32
    vmull.s32   q10, d6, d11[0] @ | d20 = gteDQA * quotient
    vqshl.s32   d19, d18, #5   @ 11bit precision

    ldr         r4, [r1]       @ quotient
    movs        r3, r6, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)   @ fSZ (limD)

    vst1.32     d18, [r1, :64] @ writeback fS|XY2 before limG

    vshr.s32    d18, d19, #16+5 @ can't vqshrn because of insn
    vadd.s64    d20, d7        @ | gteDQB + gteDQA * quotient
    vmovn.s32   d18, q9        @ fS|XY2 [s16]

    vqmovn.s64  d20, q10       @ | gteMAC0
    add         r3, r0, #4*12
    vst1.32     d16, [r3]!     @ writeback fS|XY01
    vst1.32     d18[0], [r3]   @ ...2
    add         r3, r0, #4*24
    vshr.s32    d21, d20, #12
    vst1.32     d20[0], [r3]   @ gteMAC0

    movs        r4, r4, lsr #17
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<17)   @ limE

    vmax.s32    d21, d31
    vmov.i32    d22, #0x1000
    vmin.s32    d21, d22
    add         r3, r0, #4*8
    vst1.16     d21[0], [r3]   @ gteIR0

    ldmia       r1, {r4,r5}    @ fS|XY2 before limG, after 11bit sat
    add         r2, r4, #0x400<<16
    add         r3, r5, #0x400<<16
    lsrs        r2, #16+11
    orrne       lr, #(1<<14)   @ limG1
    orrne       lr, #(1<<31)
    lsrs        r3, #16+11
    orrne       lr, #(1<<13)   @ limG2
    orrne       lr, #(1<<31)
    adds        r2, r4, #1
    addvcs      r3, r5, #1
    orrvs       lr, #(1<<16)   @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    subvcs      r3, r5, #1
    orrvs       lr, #(1<<31)

    ldr         r4, [r0, #4*24] @ gteMAC0
    orrvs       lr, #(1<<15)   @ still the V flag from the subs/subvcs above

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)   @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)   @ F
    orrvs       lr, #(1<<31)
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)   @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r6,pc}
    .size       gteRTPS_neon, .-gteRTPS_neon
289
290
291
@ gteRTPT_neon - RTPT (three vertices) using NEON.
@
@ Phase 1 loops over the three VXYZ(v) entries at the start of the context,
@ runs rtpx_mac on each, stores gteMAC3/gteIR|123 per vertex to scratch and
@ tracks the running min/max of gteMAC|12.
@ Phase 2 reloads those results, does the three perspective divisions at
@ once (reciprocal estimate plus one vrecps refinement step; a plain vdiv
@ variant is kept under ".else" for comparison against the C code),
@ projects to screen coordinates and writes everything back. Flags are
@ derived on the ARM side from the min/max values rather than per vertex.
@
@ in:  r0 - CP2 context (data registers first, control registers at +4*32)
@ out: written back into the context:
@        +4*17        fSZ(v) for v=0..2;  +4*16  gteSZ0 = old gteSZ3
@        +4*12        fS|XY(v) for v=0..2
@        +4*24        gteMAC0, followed by gteMAC|123 of the last vertex
@        +4*8         gteIR0, followed by gteIR|123 of the last vertex
@        +4*(32+31)   gteFLAG
@ trash: r1-r3, r12, condition flags, scratch and NEON registers
@        d0-d10, d12-d28, q15.
@        r4-r11 and lr are saved and restored on the stack.
@ NOTE(review): d8-d15 are overwritten without being saved, although AAPCS
@ makes them callee-saved; callers must not keep live values there.
@
@ Fix: a gteMAC3 value pinned at 0x7fffffff now raises bit 28, the bit
@ do_mac_flags uses for the same condition; it used to raise bit 27, which
@ is the gteMAC1 negative-overflow bit.
@
@ Several conditional ARM instructions consume flags set a few lines
@ earlier, across NEON instructions (which leave the ARM condition flags
@ alone), so the statement order must be preserved.
.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push        {r4-r11,lr}

    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    rtpx_preload

    vmov.i32    d22, #0x7fffffff @ running min, starts at s32 max
    vmov.i32    d23, #0x80000000 @ running max, starts at s32 min
    mov         r3, #3         @ counter
    mov         r2, r0         @ VXYZ(0)
0:
    vldmia      r2!, {d8}      @ VXYZ(v)
    vmov.16     d8[3], r12     @ kill unused upper vector

    rtpx_mac
    vmin.s32    d22, d8        @ min gteMAC|12
    vmax.s32    d23, d8        @ max gteMAC|12
    subs        r3, #1
    vst1.32     {d9,d10}, [r1, :128]!
    bgt         0b

    vst1.32     {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub         r1, r1, #8*2*4 @ rewind the 4 x 16 bytes stored above
    vldmia      r1, {d0-d3}    @ note: d4,d5 is for gteOF|XY

    vmov        d20, d0        @ gteMAC3 v=0
    vmin.s16    d24, d1, d3    @ | find min IR
    vshr.s32    d22, d12, #1   @ || gteH/2 (adjust for cmp)
    vmax.s16    d25, d1, d3    @ | .. also max, for flag gen
    vsli.u64    d20, d2, #32   @ gteMAC3 v=1
    vmov        d21, d9        @ ... v=2

    vmov.i32    q14, #0xffff   @ 0xffff[32]
    vmax.s32    q10, q15
    vmov.i32    q13, #1
    vdup.32     q11, d22[0]    @ gteH/2
    vmin.u32    q10, q14       @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16    d24, d10       @ | find min/max IR
    vmax.s16    d25, d10       @ |

    add         r3, r0, #4*19  @ ||
    vld1.32     d14[0], [r3]   @ || gteSZ3

    vclt.u32    q11, q11, q10  @ gteH/2 < fSZ(v)?
    add         r3, r0, #4*17
    vst1.32     d20, [r3]!     @ | writeback fSZ(v)
    vand        q11, q10, q11
    vst1.32     d21[0], [r3]   @ |
    vmax.u32    q10, q11, q13  @ make divisor 1 if not
    add         r3, r1, #8*8
    vstmia      r3, {q12}      @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32    d13, d12, #16  @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add         r2, r1, #8*2*3
    mov         lr, #0         @ gteFLAG
    ldmia       r2, {r4-r7}    @ min/max gteMAC|12
    subs        r2, r4, #1
    orrvs       lr, #(1<<31)|(1<<27)
    subs        r3, r5, #1
    orrvs       lr, #(1<<31)|(1<<26)
    adds        r2, r6, #1
    orrvs       lr, #(1<<30)
    adds        r3, r7, #1
    orrvs       lr, #(1<<29)
    ldr         r4, [r1, #0]   @ gteMAC3 v=0
    ldr         r5, [r1, #8*2] @ ... v=1
    ldr         r6, [r1, #8*4] @ ... v=2

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]       @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]       @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32  q11, q10       @ inv
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    vrecps.f32  q12, q10, q11  @ step
    vcvt.f32.u32 d13, d13      @ | gteH (float for div)
    vmov.f32    q8, #0.5       @ |||
    vmul.f32    q11, q12, q11  @ better inv
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]   @ gteSZ0 = gteSZ3
    vdup.32     q13, d13[0]    @ |
@    vrecps.f32  q12, q10, q11  @ step
@    vmul.f32    q11, q12, q11  @ better inv
    vmul.f32    q10, q13, q11  @ result
.else
    vmov.f32    q8, #0.5       @ |||
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13      @ | gteH (float for div)
    vdup.32     q13, d13[0]    @ |
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]   @ gteSZ0 = gteSZ3

    vpush       {q0}
    vmov        q0, q10        @ to test against C code
    vdiv.f32    s0, s26, s0
    vdiv.f32    s1, s26, s1
    vdiv.f32    s2, s26, s2
    vmov        q10, q0
    vpop        {q0}
.endif

    @ gteMAC3: negative limit -> bits 31|25, positive limit -> bit 28
    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
    orr         r7, r4, r5
    add         r4, r1, #8*8
    orr         r3, r7, r6
    ldmia       r4, {r7,r8,r10,r11} @ min/max IR

    movs        r3, r3, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)   @ fSZ (limD)

    vadd.f32    q10, q8        @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16   q9, d1         @ expand gteIR|12 v=0
    vmovl.s16   q10, d3        @ expand gteIR|12 v=1
    add         r6, r1, #8*10
    vstmia      r6, {q8}       @ wb quotients for flags (pre-limE)
    vqshl.u32   q8, #15
    vmovl.s16   q11, d10       @ expand gteIR|12 v=2
    vshr.u32    q8, #15        @ quotients (limE)
    vdup.32     d24, d16[0]
    vdup.32     d25, d16[1]
    vdup.32     d26, d17[0]    @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    @ (two s16 values per register; overflow of +-0x10000 in the relevant
    @ half means that half holds 0x8000 / 0x7fff)
    mov         r4, #0x10000
    cmp         r7, #1<<16
    cmnvc       r10, #1<<16
    orrvs       lr, #(1<<31)
    orrvs       lr, #(1<<23)   @ IR2/limB2
    rsbs        r2, r4, r7, lsl #16
    cmnvc       r4, r10, lsl #16
    orrvs       lr, #(1<<31)|(1<<24) @ IR1/limB1
    rsbs        r2, r4, r8, lsl #16
    cmnvc       r4, r11, lsl #16
    orrvs       lr, #(1<<22)   @ IR3/limB3

    vmull.s32   q9, d18, d24   @ gteIR|12 * quotient v=0
    vmull.s32   q10, d20, d25  @ ... v=1
    vmull.s32   q11, d22, d26  @ ... v=2
    vadd.s64    q9, q2         @ gteOF|XY + gteIR|12 * quotient
    vadd.s64    q10, q2        @ ... v=1
    vadd.s64    q11, q2        @ ... v=2
    vqmovn.s64  d18, q9        @ saturate to 32 v=0
    vqmovn.s64  d19, q10       @ ... v=1
    vqmovn.s64  d20, q11       @ ... v=2
    vmin.s32    d14, d18, d19  @ || find min/max fS|XY(v) [32]
    vmax.s32    d15, d18, d19  @ || for flags
    vmin.s32    d14, d20
    vmax.s32    d15, d20
    vqshl.s32   q11, q9, #5    @ 11bit precision, v=0,1
    vqshl.s32   d24, d20, #5   @ ... v=2
    vmull.s32   q13, d6, d17   @ | gteDQA * quotient v=2
    vpmin.s32   d16, d14, d31  @ || also find min/max in pair
    vpmax.s32   d17, d15, d31  @ ||
    vshr.s32    q11, #16+5     @ can't vqshrn because of insn
    vshr.s32    d24, #16+5     @ encoding doesn't allow 21 :(
    vsli.u64    d16, d17, #32  @ || pack in-pair min/max
    vadd.s64    d26, d7        @ | gteDQB + gteDQA * quotient
    vmovn.s32   d12, q11       @ fS|XY(v) [s16] v=0,1
    vmovn.s32   d13, q12       @ 3
    vstmia      r1, {d14-d16}  @ || other cacheline than quotients
    add         r3, r0, #4*12
    vst1.32     d12, [r3]!     @ writeback fS|XY v=0,1
    vst1.32     d13[0], [r3]

    vqmovn.s64  d26, q13       @ | gteMAC0
    vmovl.u16   q5, d10        @ expand gteIR|123 v=2

    vmov.i32    d13, #0x1000
    vshr.s32    d12, d26, #12

    add         r3, r0, #4*24
    vst1.32     d26[0], [r3]!  @ gteMAC0
    vmax.s32    d12, d30
    vst1.32     d8, [r3]!      @ gteMAC123 (last iteration)
    vst1.32     d9[0], [r3]

    vmin.s32    d12, d13       @ | gteIR0

    ldmia       r6, {r4-r6}    @ quotients
    orr         r4, r5
    orr         r4, r6
    add         r3, r0, #4*8
    movs        r4, r4, lsr #17 @ flags consumed by the limE orrne below

    vst1.32     d12[0], [r3]!  @ gteIR0
    vst1.32     d10, [r3]!     @ gteIR12
    vst1.32     d11[0], [r3]   @ ..3

    @ ~23 cycles
    orrne       lr, #(1<<31)   @ limE
    orrne       lr, #(1<<17)   @ limE
    ldmia       r1, {r4-r9}    @ min fS|XY, max fS|XY, in-pair min/max
    add         r2, r4, #0x400<<16 @ min fSX
    add         r3, r6, #0x400<<16 @ max fSX
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)   @ limG1
    orrne       lr, #(1<<14)
    add         r2, r5, #0x400<<16 @ min fSY
    add         r3, r7, #0x400<<16 @ max fSY
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)   @ limG2
    orrne       lr, #(1<<13)
    adds        r2, r9, #1
    orrvs       lr, #(1<<16)   @ F (31 already done by above)
    subs        r3, r8, #1

    ldr         r4, [r0, #4*24] @ gteMAC0
    orrvs       lr, #(1<<15)   @ still the V flag from the subs above

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)
    orrvs       lr, #(1<<31)   @ F
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)
    orrvs       lr, #(1<<31)   @ F
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)   @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r11,pc}
    .size       gteRTPT_neon, .-gteRTPT_neon
531
532
533
@ gteMVMVA_part_neon - matrix * vector + translation part of MVMVA.
@
@ Computes the three gteMAC values from the 3x3 s16 matrix at r6, the
@ vector packed in r4/r5 and the translation at r7, then derives gteIR|123
@ by saturating to s16 and writes both sets back to the context
@ (gteMAC|123 at +4*25, gteIR|123 at +4*9). gteFLAG is not touched here;
@ gteMACtoIR_flags_neon computes it afterwards.
@
@ note: non-std calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r1 = op
@      bit 19 set: the s64 sums are arithmetically shifted right by 12
@      bit 10 set: negative gteIR values are clamped to 0
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
@ trash: r3, r5, condition flags, d0-d2, q2-q5, q8-q10, q15
.global gteMVMVA_part_neon
gteMVMVA_part_neon:
    uxth        r5, r5         @ keep only the low halfword, upper lane = 0
    vmov.32     d8[0], r4
    vmov.32     d8[1], r5      @ VXYZ(v)
    vldmia      r6, {d0-d2}    @ MXxy/gteR*  [16*9]
    vldmia      r7, {d4-d5}    @ CVx/gteTR*

    vmov.i32    q15, #0
    vext.16     d2, d1, d2, #2 @ xxx3 -> x321
    vext.16     d1, d0, d1, #3 @ xx32 -> x321
    vshll.s32   q3, d5, #12    @ gteTRZ/CV3
    vshll.s32   q2, d4, #12    @ gteTR|XY/CV12

    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9
    tst         r1, #1<<19
    beq         0f
    vshr.s64    q2, q2, #12
    vshr.s64    q3, q3, #12
0:
    vqmovn.s64  d8, q2         @ gteMAC|12
    vqmovn.s64  d9, q3         @ gteMAC3

    tst         r1, #1<<10     @ result is consumed by the beq below
    add         r3, r0, #4*25
    vqmovn.s32  d10, q4        @ gteIR|123
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]    @ wb gteMAC|123

    beq         0f
    vmax.s16    d10, d31       @ d31 = 0: clamp negative values to 0
0:
    vmovl.s16   q9, d10        @ expand gteIR|123
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]
    bx          lr
    .size       gteMVMVA_part_neon, .-gteMVMVA_part_neon
585
586
@ get flags after gteMVMVA_part_neon operation
@
@ Reads gteMAC|123 from the context (+4*25), builds gteFLAG from scratch and
@ stores it at +4*(32+31).
@ MAC flags come from do_mac_flags. The IR limit flags depend on lm:
@   lm == 0: flag if (MAC + 0x8000) >> 16 != 0, i.e. MAC outside
@            -0x8000..0x7fff
@   lm != 0: flag if MAC >> 15 != 0 (arithmetic), i.e. MAC outside
@            0..0x7fff
@ in:  r0 - CP2 context, r1 - lm
@ trash: r2, r3, r12, condition flags (r4, r5 and lr are saved/restored)
.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm
gteMACtoIR_flags_neon:
    push        {r4,r5,lr}
    tst         r1, r1         @ lm
    mov         lr, #0         @ gteFLAG
    mov         r2, #0
    mov         r12, #15
    moveq       r2, #0x8000    @ adj
    moveq       r12, #16       @ shift

    add         r3, r0, #4*25
    ldmia       r3, {r3-r5}    @ gteMAC|123

    do_mac_flags r3, r4, r5

    add         r3, r2
    add         r4, r2
    add         r5, r2
    asrs        r3, r12
    orrne       lr, #(1<<31)|(1<<24) @ IR1/limB1
    asrs        r4, r12
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)   @ IR2/limB2
    asrs        r5, r12
    orrne       lr, #(1<<22)   @ IR3/limB3
    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4,r5,pc}
    .size       gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon
617
618
619
620@ vim:filetype=armasm