Buildfix for LLVM
[pcsx_rearmed.git] / libpcsxcore / gte_neon.S
... / ...
CommitLineData
1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
4 * This work is licensed under the terms of GNU GPL version 2 or later.
5 * See the COPYING file in the top-level directory.
6 */
7
8#include "arm_features.h"
9#include "new_dynarec/arm/linkage_offsets.h"
10
11.syntax unified
12.text
13.align 2
14
15@ XXX: gteMAC calc shouldn't be saturating, but it is here
16
17@ approximate gteMAC|123 flags
18@ in: rr 123 as gteMAC|123
19@ trash: nothing (condition flags are overwritten)
@ out: lr - gteFLAG bits ORed in for every value that sits at a 32bit limit
@ cmp rr, #1 (rr - 1) sets V only when rr == 0x80000000 and
@ cmn rr, #1 (rr + 1) sets V only when rr == 0x7fffffff, so "vs" means
@ the value is at the negative / positive end of the s32 range.
20.macro do_mac_flags rr1 rr2 rr3
@ negative limit: bits 27/26/25, each one together with bit 31
21 cmp \rr1, #1
22 orrvs lr, #(1<<31)|(1<<27)
23 cmp \rr2, #1
24 orrvs lr, #(1<<31)|(1<<26)
25 cmp \rr3, #1
26 orrvs lr, #(1<<31)|(1<<25)
@ positive limit: bits 30/29/28, bit 31 is not touched here
27 cmn \rr1, #1 @ same as adds ...
28 orrvs lr, #(1<<30)
29 cmn \rr2, #1
30 orrvs lr, #(1<<29)
31 cmn \rr3, #1
32 orrvs lr, #(1<<28)
33.endm
34
35@ approximate 3x gteMACn flags
36@ in: rr 123 as 3 instances gteMACn, *flags
37@ trash: nothing (condition flags are overwritten)
@ nflags/pflags are immediates ORed into lr when any of the three values
@ is at the negative (0x80000000) / positive (0x7fffffff) s32 limit.
@ The cmpvc/cmnvc chain stops comparing as soon as V gets set, so V
@ survives to the orrvs if at least one of the three values overflowed.
38.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
39 cmp \rr1, #1
40 cmpvc \rr2, #1
41 cmpvc \rr3, #1
42 orrvs lr, #\nflags
43 cmn \rr1, #1 @ adds ...
44 cmnvc \rr2, #1
45 cmnvc \rr3, #1
46 orrvs lr, #\pflags
47.endm
48
49@ get gteIR|123 flags from gteMAC|123
50@ in: rr 123 as gteMAC|123
51@ trash: r2,r3 (and condition flags)
@ out: lr - limB bits ORed in
@ (rr + 0x8000) >> 16 (logical) is nonzero exactly when rr does not fit
@ in -0x8000..0x7fff, i.e. when the s16 gteIR value had to be saturated.
@ Bit 31 is set together with the IR1 and IR2 bits but not with IR3.
52.macro do_irs_flags rr1 rr2 rr3
53 add r2, \rr1, #0x8000
54 add r3, \rr2, #0x8000
55 lsrs r2, #16
56 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
57 lsrs r3, #16
@ the add below does not update flags, so the orrne pair that follows
@ still acts on the result of the rr2 test above
58 add r2, \rr3, #0x8000
59 orrne lr, #(1<<31)
60 orrne lr, #(1<<23) @ IR2/limB2
61 lsrs r2, #16
62 orrne lr, #(1<<22) @ IR3/limB3
63.endm
64
65
66/*
67 * RTPS/RTPT register map:
68 *
69 * q | d | c code / phase 1 phase 2 scratch
70 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
71 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
72 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
73 * 3 * gteIR1-3 = gteIR1-3 /
74 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
75 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
76 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
77 * 7 0 gteDQB [s64] max gteMAC|12
78 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
79 * 9 * / gteMAC3 max gteIR|123
80 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
81 * 11 0 quotient 3
82 * 6 12 gteH (adj. for cmp)
83 * 13 gteH (float for div)
84 * ... <scratch>
85 * 15 30 0
86 * 31 0
87 */
88
89@ load gteR*, gteTR* and gteH (see map above), clear q15
90@ in: r0 - context
91@ trash: r3
@ out: d0-d2 - rotation matrix rows (s16), one row per d register
@      q2,q3 - translation, sign extended to s64 and shifted left by 12
@      q6    - widened copy of d11, whose lane 0 holds the word at
@              r0+4*(32+26) (gteH)
@      q15   - 0
92.macro rtpx_preload
93 add r3, r0, #4*32
94 vldmia r3, {d0-d2} @ gteR* [16*9]
95 vmov.i32 q15, #0
96 add r3, r0, #4*(32+5)
97 vldmia r3, {d4-d5} @ gteTR*
@ the 9 matrix elements are packed back to back; realign rows 2 and 3
@ so that each row starts at lane 0 of its own d register
98 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
99 vext.16 d1, d0, d1, #3 @ xx32 -> x321
100 add r3, r0, #4*(32+26)
101 vld1.32 d11[0], [r3] @ gteH
102 vshll.s32 q3, d5, #12 @ gteTRZ
103 vshll.s32 q2, d4, #12 @ gteTR|XY
104 vmovl.s16 q6, d11 @ gteH
105.endm
106
107@ do RTP* gteMAC* calculation
108@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
109@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
110@ trash: d16-d21
@ Computes (row . VXYZ + translation) >> 12 for each of the 3 matrix rows,
@ saturating to s32 (gteMAC) and then to s16 (gteIR).
@ Lane 3 of d8 must be 0 on entry (see the note at the vadd.s64 lines).
111.macro rtpx_mac
@ 4 s32 products per row
112 vmull.s16 q8, d0, d8
113 vmull.s16 q9, d1, d8
114 vmull.s16 q10, d2, d8
@ sum the 4 products of each row into a single s64
115 vpaddl.s32 q8, q8
116 vpaddl.s32 q9, q9
117 vpaddl.s32 q10, q10
118 vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
119 vadd.s64 d18, d19 @ d8[3]==0, so won't affect
120 vadd.s64 d20, d21 @ QC
@ add the pre-shifted translation (d4, d5, d6)
121 vadd.s64 d16, d4
122 vadd.s64 d18, d5
123 vadd.s64 d20, d6
@ shift back by 12 and narrow with saturation; only lane 0 of each
@ result is meaningful
124 vqshrn.s64 d8, q8, #12 @ gteMAC1
125 vqshrn.s64 d18, q9, #12 @ gteMAC2
126 vqshrn.s64 d9, q10, #12 @ gteMAC3
127 vsli.u64 d8, d18, #32 @ gteMAC|12
@ clear the upper lane of d9 so that q4 = MAC1, MAC2, MAC3, 0
128 vmov.32 d9[1], r12
129 vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
130.endm
131
@ gteRTPS_neon: transform the single vector VXYZ(0) found at [r0].
@ in: r0 - CP2 register context (data regs followed by control regs)
@ Writes back to the context: gteMAC|123, gteIR|123, the gteSZ fifo,
@ the fS|XY fifo, gteMAC0, gteIR0 and gteFLAG.
@ The buffer pointed to by [r0, #LO_cop2_to_scratch_buf] is used as
@ scratch; it is accessed with a :64 alignment hint.
@ r4-r6 and lr are saved on the stack; r1-r3 and r12 are trashed.
@ NOTE(review): d8-d15 are written without being saved; AAPCS treats
@ them as callee-saved - confirm that the callers tolerate this.
132FUNCTION(gteRTPS_neon): @ r0=CP2 (d,c),
133 push {r4-r6,lr}
134
135@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
136 ldr r1, [r0, #LO_cop2_to_scratch_buf]
137 mov r12, #0
138
139 vldmia r0, {d8} @ VXYZ(0)
140 rtpx_preload
141
@ same math as rtpx_mac, open coded with a different instruction mix
142@ rtpx_mac @ slower here, faster in RTPT?
143 vmov.16 d8[3], r12 @ kill unused upper vector
144 vmull.s16 q8, d0, d8
145 vmull.s16 q9, d1, d8
146 vmull.s16 q10, d2, d8
147 vpadd.s32 d16, d16, d17
148 vpadd.s32 d17, d18, d19
149 vpadd.s32 d18, d20, d21
150 vpadal.s32 q2, q8
151 vpadal.s32 q3, q9 @ d6, d18 is slow?
152 vqshrn.s64 d8, q2, #12 @ gteMAC|12
153 vqshrn.s64 d9, q3, #12 @ gteMAC3
154
155 add r3, r0, #4*25
156 vst1.32 d8, [r3]!
157 vst1.32 d9[0], [r3] @ wb gteMAC|123
158 vqmovn.s32 d10, q4 @ gteIR|123
159
@ clamp gteMAC3 to 0..0xffff to get the new SZ value, and build the
@ divisor: the clamped value if gteH/2 is below it, otherwise 1
160 add r3, r0, #4*17 @ gteSZ*
161 vldmia r3, {q7} @ d14,d15 gteSZ|123x
162 vmov.i32 d28, #0xffff @ 0xffff[32]
163 vmax.s32 d11, d9, d31
164 vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp)
165 vmov.i32 d26, #1
166 vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3
167 vmovl.s16 q9, d10 @ || expand gteIR|123
168 vshl.u32 d13, d12, #16 @ | preparing gteH
169 add r3, r0, #4*9
170 vst1.32 d18, [r3]!
171 vst1.32 d19[0], [r3]
172
@ q7 was loaded one word ahead (from SZ1); inserting the new value as
@ its last word and storing it at SZ0 below shifts the fifo by one
173 vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7
174 vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3?
175
176 add r3, r0, #4*(32+24)
177 vld1.32 d4, [r3] @ || gteOF|XY
178 add r3, r0, #4*(32+27)
179 vld1.32 d6, [r3] @ || gteDQ|AB
180
181 vand d11, d16
182 vmovl.s32 q2, d4 @ || gteOF|XY [64]
183 vmax.u32 d11, d26 @ make divisor 1 if not
184 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
185 add r3, r0, #4*16 @ | gteSZ*
186 vstmia r3, {q7} @ | d14,d15 gteSZ|123x
187
188 vcvt.f32.u32 d13, d13 @ gteH (float for div)
189 vcvt.f32.u32 d11, d11 @ divisor
190
191 @ divide.. it's not worth messing with reciprocals here
192 @ just for 1 value, let's just use VFP divider here
@ s22 is lane 0 of d11 (divisor), s26 is lane 0 of d13 (gteH << 16)
193 vdiv.f32 s22, s26, s22
194
@ add 0.5 before the conversion back to integer
195 vmov.f32 d20, #0.5
196 vadd.f32 d11, d20
197 vcvt.u32.f32 d11, d11 @ quotient
198
199 @ while NEON's busy we calculate some flags on ARM
200 add r3, r0, #4*25
201 mov lr, #0 @ gteFLAG
202 ldmia r3, {r4-r6} @ gteMAC|123
203
@ the saturating shift left by 15 followed by the plain shift right
@ (split around do_mac_flags) limits the quotient to 0x1ffff
204 vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE)
205 vqshl.u32 d11, #15
206
207 do_mac_flags r4, r5, r6
208
209 vshr.u32 d11, #15 @ quotient (limE)
210
211 do_irs_flags r4, r5, r6
212
213 vmlal.s32 q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient
214 add r3, r0, #4*13
215 vld1.32 d16, [r3] @ || load fS|XY12, new 01
216 vqmovn.s64 d18, q2 @ saturate to 32
217 vmull.s32 q10, d6, d11[0]@ | d20 = gteDQA * quotient
218 vqshl.s32 d19, d18, #5 @ 11bit precision
219
@ r6 still holds gteMAC3 here: any bit above bit 15 means the SZ value
@ was clamped
220 ldr r4, [r1] @ quotient
221 movs r3, r6, lsr #16
222 orrne lr, #(1<<31)
223 orrne lr, #(1<<18) @ fSZ (limD)
224
225 vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG
226
227 vshr.s32 d18, d19, #16+5@ can't vqshrn because of insn
228 vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient
229 vmovn.s32 d18, q9 @ fS|XY2 [s16]
230
231 vqmovn.s64 d20, q10 @ | gteMAC0
@ d16 was loaded one word ahead (from r0+4*13); storing it at r0+4*12
@ shifts the fS|XY fifo, then the new entry is appended
232 add r3, r0, #4*12
233 vst1.32 d16, [r3]! @ writeback fS|XY01
234 vst1.32 d18[0], [r3] @ ...2
235 add r3, r0, #4*24
236 vshr.s32 d21, d20, #12
237 vst1.32 d20[0], [r3] @ gteMAC0
238
@ r4 = quotient before the 0x1ffff limit was applied
239 movs r4, r4, lsr #17
240 orrne lr, #(1<<31)
241 orrne lr, #(1<<17) @ limE
242
@ gteIR0 = gteMAC0 >> 12 clamped to 0..0x1000
243 vmax.s32 d21, d31
244 vmov.i32 d22, #0x1000
245 vmin.s32 d21, d22
246 add r3, r0, #4*8
247 vst1.16 d21[0], [r3] @ gteIR0
248
@ limG: set if the 32bit value >> 16 does not fit in -0x400..0x3ff
249 ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat
250 add r2, r4, #0x400<<16
251 add r3, r5, #0x400<<16
252 lsrs r2, #16+11
253 orrne lr, #(1<<14) @ limG1
254 orrne lr, #(1<<31)
255 lsrs r3, #16+11
256 orrne lr, #(1<<13) @ limG2
257 orrne lr, #(1<<31)
@ values equal to 0x7fffffff / 0x80000000 overflow on +1 / -1, which
@ shows that the 64->32 narrowing above had to saturate
258 adds r2, r4, #1
259 addsvc r3, r5, #1
260 orrvs lr, #(1<<16) @ F
261 orrvs lr, #(1<<31)
262 subs r2, r4, #1
263 subsvc r3, r5, #1
264 orrvs lr, #(1<<31)
265
@ ldr leaves the flags alone, so the orrvs below still belongs to the
@ subs/subsvc test above
266 ldr r4, [r0, #4*24] @ gteMAC0
267 orrvs lr, #(1<<15)
268
269 adds r3, r4, #1
270 orrvs lr, #(1<<16) @ F
271 orrvs lr, #(1<<31)
272 subs r2, r4, #1
273 orrvs lr, #(1<<15) @ F
274 orrvs lr, #(1<<31)
@ unsigned compare: also true for negative gteMAC0
275 cmp r4, #0x1000
276 orrhi lr, #(1<<12) @ limH
277
278 str lr, [r0, #4*(32+31)] @ gteFLAG
279
280 pop {r4-r6,pc}
281 .size gteRTPS_neon, .-gteRTPS_neon
282
283
284
@ gteRTPT_neon: transform the three vectors VXYZ(0..2) found at [r0].
@ in: r0 - CP2 register context (data regs followed by control regs)
@ Writes back to the context: the gteSZ and fS|XY fifos, gteMAC0,
@ gteIR0, gteMAC|123 and gteIR|123 of the last vector, and gteFLAG.
@ The buffer pointed to by [r0, #LO_cop2_to_scratch_buf] is used as
@ scratch (accessed with :128 alignment hints); byte offsets used:
@   0.. 47  per vector: gteMAC3 (+pad), gteIR|123   (phase 1)
@  48.. 63  min/max gteMAC|12
@  64.. 79  min/max gteIR
@  80.. 95  quotients before limE
@   0.. 23  min/max fS|XY (reuses the start of the buffer in phase 2)
@ r4-r11 and lr are saved on the stack; r1-r3 and r12 are trashed.
@ NOTE(review): d8-d15 are written without being saved; AAPCS treats
@ them as callee-saved - confirm that the callers tolerate this.
285FUNCTION(gteRTPT_neon): @ r0=CP2 (d,c),
286 push {r4-r11,lr}
287
288 ldr r1, [r0, #LO_cop2_to_scratch_buf]
289 mov r12, #0
290
291 rtpx_preload
292
@ running min starts at 0x7fffffff (0x80000000 - 1), running max at 0
293 vmov.i32 d23, #1
294 vmov.i32 d22, #0x80000000
295 vsub.i32 d22, d22, d23
296 vmov.i32 d23, #0
297 mov r3, #3 @ counter
298 mov r2, r0 @ VXYZ(0)
@ - phase 1 - gteMAC/gteIR for each of the 3 vectors
2990:
300 vldmia r2!, {d8} @ VXYZ(v)
301 vmov.16 d8[3], r12 @ kill unused upper vector
302
303 rtpx_mac
304 vmin.s32 d22, d8 @ min gteMAC|12
305 vmax.s32 d23, d8 @ max gteMAC|12
306 subs r3, #1
307 vst1.32 {d9,d10}, [r1, :128]!
308 bgt 0b
309
310 vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags
311
312 @ - phase2 -
@ rewind r1 over the 4 16-byte stores done above
313 sub r1, r1, #8*2*4
314 vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY
315
@ d0/d2 = gteMAC3 of v=0/1, d1/d3 = gteIR of v=0/1; v=2 is still in
@ d9/d10. Gather the three gteMAC3 values in q10.
316 vmov d20, d0 @ gteMAC3 v=0
317 vmin.s16 d24, d1, d3 @ | find min IR
318 vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp)
319 vmax.s16 d25, d1, d3 @ | .. also max, for flag gen
320 vsli.u64 d20, d2, #32 @ gteMAC3 v=1
321 vmov d21, d9 @ ... v=2
322
323 vmov.i32 q14, #0xffff @ 0xffff[32]
324 vmax.s32 q10, q15
325 vmov.i32 q13, #1
326 vdup.32 q11, d22[0] @ gteH/2
327 vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v)
328 vmin.s16 d24, d10 @ | find min/max IR
329 vmax.s16 d25, d10 @ |
330
331 add r3, r0, #4*19 @ ||
332 vld1.32 d14[0], [r3] @ || gteSZ3
333
@ divisor = fSZ(v) if gteH/2 is below it, otherwise 1
334 vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)?
335 add r3, r0, #4*17
336 vst1.32 d20, [r3]! @ | writeback fSZ(v)
337 vand q11, q10, q11
338 vst1.32 d21[0], [r3] @ |
339 vmax.u32 q10, q11, q13 @ make divisor 1 if not
340 add r3, r1, #8*8
341 vstmia r3, {q12} @ min/max IR for flags
342 vcvt.f32.u32 q10, q10
343 vshl.u32 d13, d12, #16 @ | preparing gteH
344
345 @ while NEON's busy we calculate some flags on ARM
346 add r2, r1, #8*2*3
347 mov lr, #0 @ gteFLAG
348 ldmia r2, {r4-r7} @ min/max gteMAC|12
@ r4,r5 = min gteMAC1,2; r6,r7 = max gteMAC1,2. -1/+1 overflows only
@ if the value is 0x80000000 / 0x7fffffff (same idea as do_mac_flags)
349 subs r2, r4, #1
350 orrvs lr, #(1<<31)|(1<<27)
351 subs r3, r5, #1
352 orrvs lr, #(1<<31)|(1<<26)
353 adds r2, r6, #1
354 orrvs lr, #(1<<30)
355 adds r3, r7, #1
356 orrvs lr, #(1<<29)
357 ldr r4, [r1, #0] @ gteMAC3 v=0
358 ldr r5, [r1, #8*2] @ ... v=1
359 ldr r6, [r1, #8*4] @ ... v=2
360
361 add r3, r0, #4*(32+24)
362 vld1.32 d4, [r3] @ || gteOF|XY
363 add r3, r0, #4*(32+27)
364 vld1.32 d6, [r3] @ || gteDQ|AB
365
366 @ divide
@ active path: reciprocal estimate refined by one vrecps/vmul step,
@ then multiplied by gteH << 16 (as float)
367.if 1
368 vrecpe.f32 q11, q10 @ inv
369 vmovl.s32 q2, d4 @ || gteOF|XY [64]
370 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
371 vrecps.f32 q12, q10, q11 @ step
372 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
373 vmov.f32 q8, #0.5 @ |||
374 vmul.f32 q11, q12, q11 @ better inv
375 add r3, r0, #4*16
376 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
377 vdup.32 q13, d13[0] @ |
378@ vrecps.f32 q12, q10, q11 @ step
379@ vmul.f32 q11, q12, q11 @ better inv
380 vmul.f32 q10, q13, q11 @ result
381.else
382 vmov.f32 q8, #0.5 @ |||
383 vmovl.s32 q2, d4 @ || gteOF|XY [64]
384 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
385 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
386 vdup.32 q13, d13[0] @ |
387 add r3, r0, #4*16
388 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
389
390 vpush {q0}
391 vmov q0, q10 @ to test against C code
392 vdiv.f32 s0, s26, s0
393 vdiv.f32 s1, s26, s1
394 vdiv.f32 s2, s26, s2
395 vmov q10, q0
396 vpop {q0}
397.endif
398
@ r4-r6 = gteMAC3 of v=0..2: negative limit -> bits 31+25, positive
@ limit -> bit 28, the same bits do_mac_flags uses for its rr3
399 do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
400 orr r7, r4, r5
401 add r4, r1, #8*8
402 orr r3, r7, r6
403 ldmia r4, {r7,r8,r10,r11} @ min/max IR
404
@ r3 = OR of the three gteMAC3 values: any bit above bit 15 means at
@ least one SZ value was clamped
405 movs r3, r3, lsr #16
406 orrne lr, #(1<<31)
407 orrne lr, #(1<<18) @ fSZ (limD)
408
409 vadd.f32 q10, q8 @ adjust for vcvt rounding mode
410 vcvt.u32.f32 q8, q10
411 vmovl.s16 q9, d1 @ expand gteIR|12 v=0
412 vmovl.s16 q10, d3 @ expand gteIR|12 v=1
413 add r6, r1, #8*10
414 vstmia r6, {q8} @ wb quotients for flags (pre-limE)
@ saturating shift left by 15, then plain shift right: limits each
@ quotient to 0x1ffff
415 vqshl.u32 q8, #15
416 vmovl.s16 q11, d10 @ expand gteIR|12 v=2
417 vshr.u32 q8, #15 @ quotients (limE)
418 vdup.32 d24, d16[0]
419 vdup.32 d25, d16[1]
420 vdup.32 d26, d17[0] @ quotient (dup)
421
422 @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
@ each word packs two s16 values; the compares below are arranged so
@ that V gets set when a packed value is at the s16 limit
423 mov r4, #0x10000
424 cmp r7, #1<<16
425 cmnvc r10, #1<<16
426 orrvs lr, #(1<<31)
427 orrvs lr, #(1<<23) @ IR2/limB2
428 rsbs r2, r4, r7, lsl #16
429 cmnvc r4, r10, lsl #16
430 orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1
431 rsbs r2, r4, r8, lsl #16
432 cmnvc r4, r11, lsl #16
433 orrvs lr, #(1<<22) @ IR3/limB3
434
435 vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0
436 vmull.s32 q10, d20, d25 @ ... v=1
437 vmull.s32 q11, d22, d26 @ ... v=2
438 vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient
439 vadd.s64 q10, q2 @ ... v=1
440 vadd.s64 q11, q2 @ ... v=2
441 vqmovn.s64 d18, q9 @ saturate to 32 v=0
442 vqmovn.s64 d19, q10 @ ... v=1
443 vqmovn.s64 d20, q11 @ ... v=2
444 vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32]
445 vmax.s32 d15, d18, d19 @ || for flags
446 vmin.s32 d14, d20
447 vmax.s32 d15, d20
448 vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1
449 vqshl.s32 d24, d20, #5 @ ... v=2
450 vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2
451 vpmin.s32 d16, d14, d31 @ || also find min/max in pair
452 vpmax.s32 d17, d15, d31 @ ||
453 vshr.s32 q11, #16+5 @ can't vqshrn because of insn
454 vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :(
455 vsli.u64 d16, d17, #32 @ || pack in-pair min/max
456 vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient
457 vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1
458 vmovn.s32 d13, q12 @ 3
459 vstmia r1, {d14-d16} @ || other cacheline than quotients
460 add r3, r0, #4*12
461 vst1.32 d12, [r3]! @ writeback fS|XY v=0,1
462 vst1.32 d13[0], [r3]
463
464 vqmovn.s64 d26, q13 @ | gteMAC0
465 vmovl.u16 q5, d10 @ expand gteIR|123 v=2
466
@ gteIR0 = gteMAC0 >> 12 clamped to 0..0x1000 (d30 is 0, see q15)
467 vmov.i32 d13, #0x1000
468 vshr.s32 d12, d26, #12
469
470 add r3, r0, #4*24
471 vst1.32 d26[0], [r3]! @ gteMAC0
472 vmax.s32 d12, d30
473 vst1.32 d8, [r3]! @ gteMAC123 (last iteration)
474 vst1.32 d9[0], [r3]
475
476 vmin.s32 d12, d13 @ | gteIR0
477
@ OR the three pre-limE quotients; movs sets Z for the limE orrne pair
@ further down (the NEON stores in between do not touch the flags)
478 ldmia r6, {r4-r6} @ quotients
479 orr r4, r5
480 orr r4, r6
481 add r3, r0, #4*8
482 movs r4, r4, lsr #17
483
484 vst1.32 d12[0], [r3]! @ gteIR0
485 vst1.32 d10, [r3]! @ gteIR12
486 vst1.32 d11[0], [r3] @ ..3
487
488 @ ~23 cycles
489 orrne lr, #(1<<31) @ limE
490 orrne lr, #(1<<17) @ limE
@ r4,r5 = min fSX,fSY; r6,r7 = max fSX,fSY; r8,r9 = in-pair min/max
491 ldmia r1, {r4-r9}
@ limG: set if min or max (>> 16) does not fit in -0x400..0x3ff
492 add r2, r4, #0x400<<16 @ min fSX
493 add r3, r6, #0x400<<16 @ max fSX
494 lsrs r2, #16+11
495 lsrseq r3, #16+11
496 orrne lr, #(1<<31) @ limG1
497 orrne lr, #(1<<14)
498 add r2, r5, #0x400<<16 @ min fSY
499 add r3, r7, #0x400<<16 @ max fSY
500 lsrs r2, #16+11
501 lsrseq r3, #16+11
502 orrne lr, #(1<<31) @ limG2
503 orrne lr, #(1<<13)
504 adds r2, r9, #1
505 orrvs lr, #(1<<16) @ F (31 already done by above)
506 subs r3, r8, #1
507
@ ldr leaves the flags alone, so the orrvs below still belongs to the
@ subs above
508 ldr r4, [r0, #4*24] @ gteMAC0
509 orrvs lr, #(1<<15)
510
511 adds r3, r4, #1
512 orrvs lr, #(1<<16)
513 orrvs lr, #(1<<31) @ F
514 subs r2, r4, #1
515 orrvs lr, #(1<<15)
516 orrvs lr, #(1<<31) @ F
@ unsigned compare: also true for negative gteMAC0
517 cmp r4, #0x1000
518 orrhi lr, #(1<<12) @ limH
519
520 str lr, [r0, #4*(32+31)] @ gteFLAG
521
522 pop {r4-r11,pc}
523 .size gteRTPT_neon, .-gteRTPT_neon
524
525
526
527@ note: non-std calling convention used
528@ r0 = CP2 (d,c) (must preserve)
529@ r1 = op
530@ r4,r5 = VXYZ(v) packed
531@ r6 = &MX11(mx)
532@ r7 = &CV1(cv)
@ Computes matrix * vector + (cv << 12), optionally shifted right by 12
@ (bit 19 of op), and writes gteMAC|123 and gteIR|123 to the context.
@ With bit 10 of op set, gteIR values are additionally limited to >= 0.
@ gteFLAG is not written here (see gteMACtoIR_flags_neon).
@ trash: r3, r5, condition flags, d0-d2, q2-q4, d10, q8-q10, q15
533FUNCTION(gteMVMVA_part_neon):
@ zero the upper half of r5 so that lane 3 of d8 ends up 0
534 uxth r5, r5
535 vmov.32 d8[0], r4
536 vmov.32 d8[1], r5 @ VXYZ(v)
537 vldmia r6, {d0-d2} @ MXxy/gteR* [16*9]
538 vldmia r7, {d4-d5} @ CVx/gteTR*
539
540 vmov.i32 q15, #0
541 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
542 vext.16 d1, d0, d1, #3 @ xx32 -> x321
543 vshll.s32 q3, d5, #12 @ gteTRZ/CV3
544 vshll.s32 q2, d4, #12 @ gteTR|XY/CV12
545
546 vmull.s16 q8, d0, d8
547 vmull.s16 q9, d1, d8
548 vmull.s16 q10, d2, d8
549 vpadd.s32 d16, d16, d17
550 vpadd.s32 d17, d18, d19
551 vpadd.s32 d18, d20, d21
552 vpadal.s32 q2, q8
553 vpadal.s32 q3, q9
@ the >> 12 is applied only when bit 19 of op is set
554 tst r1, #1<<19
555 beq 0f
556 vshr.s64 q2, q2, #12
557 vshr.s64 q3, q3, #12
5580:
559 vqmovn.s64 d8, q2 @ gteMAC|12
560 vqmovn.s64 d9, q3 @ gteMAC3
561
@ the flags of this tst are consumed by the beq below; the NEON
@ instructions and the add in between do not change them
562 tst r1, #1<<10
563 add r3, r0, #4*25
564 vqmovn.s32 d10, q4 @ gteIR|123
565 vst1.32 d8, [r3]!
566 vst1.32 d9[0], [r3] @ wb gteMAC|123
567
568 beq 0f
@ bit 10 set: limit gteIR to >= 0 (d31 is 0)
569 vmax.s16 d10, d31
5700:
571 vmovl.s16 q9, d10 @ expand gteIR|123
572 add r3, r0, #4*9
573 vst1.32 d18, [r3]!
574 vst1.32 d19[0], [r3]
575 bx lr
576 .size gteMVMVA_part_neon, .-gteMVMVA_part_neon
577
578
579@ get flags after gteMVMVA_part_neon operation
@ in: r0 - CP2 register context, r1 - lm (zero / nonzero)
@ out: gteFLAG in the context is overwritten with the computed bits
@ Reads gteMAC|123 from the context. The IR limit test depends on lm:
@   lm == 0: (mac + 0x8000) >> 16 != 0, i.e. mac outside -0x8000..0x7fff
@   lm != 0: mac >> 15 != 0, i.e. mac outside 0..0x7fff
@ trash: r2, r3, r12, condition flags (r4, r5 are saved/restored)
580FUNCTION(gteMACtoIR_flags_neon): @ r0=CP2 (d,c), r1=lm
581 push {r4,r5,lr}
@ the mov instructions below do not update flags, so the moveq pair
@ still sees the result of this tst
582 tst r1, r1 @ lm
583 mov lr, #0 @ gteFLAG
584 mov r2, #0
585 mov r12, #15
586 moveq r2, #0x8000 @ adj
587 moveq r12, #16 @ shift
588
589 add r3, r0, #4*25
590 ldmia r3, {r3-r5} @ gteMAC|123
591
592 do_mac_flags r3, r4, r5
593
594 add r3, r2
595 add r4, r2
596 add r5, r2
@ arithmetic shift by r12; a nonzero result means the value is out of
@ range. Bit 31 is set together with the IR1 and IR2 bits but not IR3.
597 asrs r3, r12
598 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
599 asrs r4, r12
600 orrne lr, #(1<<31)
601 orrne lr, #(1<<23) @ IR2/limB2
602 asrs r5, r12
603 orrne lr, #(1<<22) @ IR3/limB3
604 str lr, [r0, #4*(32+31)] @ gteFLAG
605
606 pop {r4,r5,pc}
607 .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon
608
609
610
611@ vim:filetype=armasm