spu: rework synchronization
[pcsx_rearmed.git] / libpcsxcore / gte_neon.S
... / ...
CommitLineData
1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
4 * This work is licensed under the terms of GNU GPL version 2 or later.
5 * See the COPYING file in the top-level directory.
6 */
7
8#include "arm_features.h"
9#include "new_dynarec/linkage_offsets.h"
10
11.syntax unified
12.text
13.align 2
14
15@ XXX: gteMAC calc shouldn't be saturating, but it is here
16
17@ approximate gteMAC|123 flags
18@ in: rr 123 as gteMAC|123
19@ trash: nothing
@ Trick: the gteMAC values were produced by saturating narrows (vqshrn),
@ so a negative overflow shows up as exactly 0x80000000 and a positive
@ one as exactly 0x7fffffff.  "cmp rr, #1" (rr - 1) sets V only for
@ 0x80000000; "cmn rr, #1" (rr + 1) sets V only for 0x7fffffff.
@ Sets the per-MAC gteFLAG bits (and master bit 31 for the cmp cases)
@ in lr accordingly.
20.macro do_mac_flags rr1 rr2 rr3
21 cmp \rr1, #1
22 orrvs lr, #(1<<31)|(1<<27)
23 cmp \rr2, #1
24 orrvs lr, #(1<<31)|(1<<26)
25 cmp \rr3, #1
26 orrvs lr, #(1<<31)|(1<<25)
27 cmn \rr1, #1 @ same as adds ...
28 orrvs lr, #(1<<30)
29 cmn \rr2, #1
30 orrvs lr, #(1<<29)
31 cmn \rr3, #1
32 orrvs lr, #(1<<28)
.endm
34
35@ approximate 3x gteMACn flags
36@ in: rr 123 as 3 instances gteMACn, *flags
37@ trash: nothing
@ Same saturation-overflow trick as do_mac_flags, but one combined check
@ across three instances of the same gteMACn (v=0,1,2): the cmpvc/cmnvc
@ only execute while V is still clear, so after the chain V is set iff
@ ANY of the three values saturated.  \nflags / \pflags are the gteFLAG
@ masks to OR into lr for the negative / positive overflow cases.
38.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
39 cmp \rr1, #1
40 cmpvc \rr2, #1
41 cmpvc \rr3, #1
42 orrvs lr, #\nflags
43 cmn \rr1, #1 @ adds ...
44 cmnvc \rr2, #1
45 cmnvc \rr3, #1
46 orrvs lr, #\pflags
.endm
48
49@ get gteIR|123 flags from gteMAC|123
50@ in: rr 123 as gteMAC|123
51@ trash: r2,r3
@ Range test: (value + 0x8000) >> 16 is nonzero iff value lies outside
@ the s16 range -0x8000..0x7fff, i.e. the corresponding gteIRn would
@ have to be saturated (limB).  Note r2 is reused for rr3 after the
@ rr1 test has consumed it.
52.macro do_irs_flags rr1 rr2 rr3
53 add r2, \rr1, #0x8000
54 add r3, \rr2, #0x8000
55 lsrs r2, #16
56 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
57 lsrs r3, #16
58 add r2, \rr3, #0x8000
59 orrne lr, #(1<<31)
60 orrne lr, #(1<<23) @ IR2/limB2
61 lsrs r2, #16
62 orrne lr, #(1<<22) @ IR3/limB3
.endm
64
65
66/*
67 * RTPS/RTPT register map:
68 *
69 * q | d | c code / phase 1 phase 2 scratch
70 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
71 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
72 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
73 * 3 * gteIR1-3 = gteIR1-3 /
74 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
75 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
76 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
77 * 7 0 gteDQB [s64] max gteMAC|12
78 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
79 * 9 * / gteMAC3 max gteIR|123
80 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
81 * 11 0 quotient 3
82 * 6 12 gteH (adj. for cmp)
83 * 13 gteH (float for div)
84 * ... <scratch>
85 * 15 30 0
86 * 31 0
87 */
88
89@ load gteR*, gteTR* and gteH (see map above), clear q15
90@ in: r0 - context
91@ trash: r3
@ Register destinations follow the RTPS/RTPT register map above: the
@ rotation matrix rows land in d0-d2 re-packed as x321 per row, the
@ translation vector pre-shifted into s64 lanes (q2/q3), gteH in d11/q6.
@ Offsets: +4*32 = control regs base (gteR*), +4*(32+5) = gteTR*,
@ +4*(32+26) = gteH.
92.macro rtpx_preload
93 add r3, r0, #4*32
94 vldmia r3, {d0-d2} @ gteR* [16*9]
95 vmov.i32 q15, #0
96 add r3, r0, #4*(32+5)
97 vldmia r3, {d4-d5} @ gteTR*
98 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
99 vext.16 d1, d0, d1, #3 @ xx32 -> x321
100 add r3, r0, #4*(32+26)
101 vld1.32 d11[0], [r3] @ gteH
102 vshll.s32 q3, d5, #12 @ gteTRZ
103 vshll.s32 q2, d4, #12 @ gteTR|XY
104 vmovl.s16 q6, d11 @ gteH
.endm
106
107@ do RTP* gteMAC* calculation
108@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
109@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
110@ trash: d16-d21
@ Computes MAC = (TR<<12 + R*V) >> 12 with saturation (vqshrn), then
@ narrows to gteIR with saturation (vqmovn).  The three vmull+vpaddl+
@ vadd chains are the three matrix-row dot products widened to s64.
@ NOTE: callers must have cleared d8[3] beforehand (both call sites do
@ "vmov.16 d8[3], r12" with r12 = 0) so the unused 4th lane cannot
@ pollute the sums or the saturation (QC) flag.
111.macro rtpx_mac
112 vmull.s16 q8, d0, d8
113 vmull.s16 q9, d1, d8
114 vmull.s16 q10, d2, d8
115 vpaddl.s32 q8, q8
116 vpaddl.s32 q9, q9
117 vpaddl.s32 q10, q10
118 vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
119 vadd.s64 d18, d19 @ d8[3]==0, so won't affect
120 vadd.s64 d20, d21 @ QC
121 vadd.s64 d16, d4
122 vadd.s64 d18, d5
123 vadd.s64 d20, d6
124 vqshrn.s64 d8, q8, #12 @ gteMAC1
125 vqshrn.s64 d18, q9, #12 @ gteMAC2
126 vqshrn.s64 d9, q10, #12 @ gteMAC3
127 vsli.u64 d8, d18, #32 @ gteMAC|12
128 vmov.32 d9[1], r12
129 vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
.endm
131
@ RTPS: perspective-transform the first vertex VXYZ(0), writing back
@ gteMAC|123, gteIR|123, the gteSZ fifo, fS|XY fifo, gteMAC0, gteIR0
@ and gteFLAG in the CP2 context.
@ in:    r0 = CP2 context (data regs at +0, control regs at +4*32)
@        [r0 + #LO_cop2_to_scratch_buf] -> scratch buffer (r1), used to
@        hand intermediate NEON results to the ARM-side flag calculation
@ trash: r1-r3, r12, lr and most NEON registers
@ NOTE(review): d8-d15 (callee-saved per AAPCS) are clobbered without
@        being saved - presumably guaranteed safe by the dynarec caller;
@        confirm before calling from compiler-generated code.
@ Comment markers "|" and "||" appear to tag instructions belonging to
@ interleaved computation streams scheduled for dual issue.
132FUNCTION(gteRTPS_neon): @ r0=CP2 (d,c),
133 push {r4-r6,lr}
134
135@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
136 ldr r1, [r0, #LO_cop2_to_scratch_buf]
137 mov r12, #0
138
139 vldmia r0, {d8} @ VXYZ(0)
140 rtpx_preload
141
142@ rtpx_mac @ slower here, faster in RTPT?
143 vmov.16 d8[3], r12 @ kill unused upper vector
144 vmull.s16 q8, d0, d8
145 vmull.s16 q9, d1, d8
146 vmull.s16 q10, d2, d8
147 vpadd.s32 d16, d16, d17
148 vpadd.s32 d17, d18, d19
149 vpadd.s32 d18, d20, d21
150 vpadal.s32 q2, q8
151 vpadal.s32 q3, q9 @ d6, d18 is slow?
152 vqshrn.s64 d8, q2, #12 @ gteMAC|12
153 vqshrn.s64 d9, q3, #12 @ gteMAC3
154
155 add r3, r0, #4*25
156 vst1.32 d8, [r3]!
157 vst1.32 d9[0], [r3] @ wb gteMAC|123
158 vqmovn.s32 d10, q4 @ gteIR|123
159
160 add r3, r0, #4*17 @ gteSZ*
161 vldmia r3, {q7} @ d14,d15 gteSZ|123x
162 vmov.i32 d28, #0xffff @ 0xffff[32]
163 vmax.s32 d11, d9, d31
164 vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp)
165 vmov.i32 d26, #1
166 vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3
167 vmovl.s16 q9, d10 @ || expand gteIR|123
168 vshl.u32 d13, d12, #16 @ | preparing gteH
169 add r3, r0, #4*9
170 vst1.32 d18, [r3]!
171 vst1.32 d19[0], [r3]
172
173 vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7
174 vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3?
175
176 add r3, r0, #4*(32+24)
177 vld1.32 d4, [r3] @ || gteOF|XY
178 add r3, r0, #4*(32+27)
179 vld1.32 d6, [r3] @ || gteDQ|AB
180
181 vand d11, d16
182 vmovl.s32 q2, d4 @ || gteOF|XY [64]
183 vmax.u32 d11, d26 @ make divisor 1 if not
184 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
185 add r3, r0, #4*16 @ | gteSZ*
186 vstmia r3, {q7} @ | d14,d15 gteSZ|123x
187
188 vcvt.f32.u32 d13, d13 @ gteH (float for div)
189 vcvt.f32.u32 d11, d11 @ divisor
190
191 @ divide.. it's not worth messing with reciprocals here
192 @ just for 1 value, let's just use VFP divider here
193 vdiv.f32 s22, s26, s22
194
195 vmov.f32 d20, #0.5
196 vadd.f32 d11, d20
197 vcvt.u32.f32 d11, d11 @ quotient
198
199 @ while NEON's busy we calculate some flags on ARM
200 add r3, r0, #4*25
201 mov lr, #0 @ gteFLAG
202 ldmia r3, {r4-r6} @ gteMAC|123
203
204 vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE)
205 vqshl.u32 d11, #15
206
207 do_mac_flags r4, r5, r6
208
209 vshr.u32 d11, #15 @ quotient (limE)
210
211 do_irs_flags r4, r5, r6
212
213 vmlal.s32 q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient
214 add r3, r0, #4*13
215 vld1.32 d16, [r3] @ || load fS|XY12, new 01
216 vqmovn.s64 d18, q2 @ saturate to 32
217 vmull.s32 q10, d6, d11[0]@ | d20 = gteDQA * quotient
218 vqshl.s32 d19, d18, #5 @ 11bit precision
219
220 ldr r4, [r1] @ quotient
221 movs r3, r6, lsr #16
222 orrne lr, #(1<<31)
223 orrne lr, #(1<<18) @ fSZ (limD)
224
225 vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG
226
227 vshr.s32 d18, d19, #16+5@ can't vqshrn because of insn
228 vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient
229 vmovn.s32 d18, q9 @ fS|XY2 [s16]
230
231 vqmovn.s64 d20, q10 @ | gteMAC0
232 add r3, r0, #4*12
233 vst1.32 d16, [r3]! @ writeback fS|XY01
234 vst1.32 d18[0], [r3] @ ...2
235 add r3, r0, #4*24
236 vshr.s32 d21, d20, #12
237 vst1.32 d20[0], [r3] @ gteMAC0
238
239 movs r4, r4, lsr #17
240 orrne lr, #(1<<31)
241 orrne lr, #(1<<17) @ limE
242
243 vmax.s32 d21, d31
244 vmov.i32 d22, #0x1000
245 vmin.s32 d21, d22
246 add r3, r0, #4*8
247 vst1.16 d21[0], [r3] @ gteIR0
248
249 ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat
250 add r2, r4, #0x400<<16
251 add r3, r5, #0x400<<16
252 lsrs r2, #16+11
253 orrne lr, #(1<<14) @ limG1
254 orrne lr, #(1<<31)
255 lsrs r3, #16+11
256 orrne lr, #(1<<13) @ limG2
257 orrne lr, #(1<<31)
258 adds r2, r4, #1
259 addsvc r3, r5, #1
260 orrvs lr, #(1<<16) @ F
261 orrvs lr, #(1<<31)
262 subs r2, r4, #1
263 subsvc r3, r5, #1
264 orrvs lr, #(1<<31)
265
@ ldr does not affect flags: the orrvs after it still consumes V from
@ the subs/subsvc pair above (the load is just scheduled in between).
266 ldr r4, [r0, #4*24] @ gteMAC0
267 orrvs lr, #(1<<15)
268
269 adds r3, r4, #1
270 orrvs lr, #(1<<16) @ F
271 orrvs lr, #(1<<31)
272 subs r2, r4, #1
273 orrvs lr, #(1<<15) @ F
274 orrvs lr, #(1<<31)
275 cmp r4, #0x1000
276 orrhi lr, #(1<<12) @ limH
277
278 str lr, [r0, #4*(32+31)] @ gteFLAG
279
280 pop {r4-r6,pc}
281 .size gteRTPS_neon, .-gteRTPS_neon
282
283
284
@ RTPT: like RTPS but transforms all three vertices VXYZ(0..2).
@ Phase 1 loops three times through rtpx_mac, streaming per-vertex
@ gteMAC3/gteIR results into the scratch buffer while accumulating
@ min/max gteMAC|12 (d22/d23) for flag generation.  Phase 2 reloads
@ them, does all three perspective divides at once with NEON
@ vrecpe/vrecps (reciprocal estimate + one Newton step) instead of the
@ scalar VFP divider, then performs writebacks and flag calculation,
@ interleaving ARM-side flag work while NEON is busy.
@ in:    r0 = CP2 context; scratch buffer from [r0 + #LO_cop2_to_scratch_buf]
@ trash: r1-r3, r12, lr and most NEON registers
@ NOTE(review): as in gteRTPS_neon, callee-saved d8-d15 are clobbered
@        without save - assumed acceptable for the dynarec caller.
285FUNCTION(gteRTPT_neon): @ r0=CP2 (d,c),
286 push {r4-r11,lr}
287
288 ldr r1, [r0, #LO_cop2_to_scratch_buf]
289 mov r12, #0
290
291 rtpx_preload
292
@ d22/d23 start at +INT_MAX/-INT_MIN... i.e. at the extremes so the
@ first vmin/vmax iteration always captures the vertex values.
293 vmov.i32 d22, #0x7fffffff
294 vmov.i32 d23, #0x80000000
295 mov r3, #3 @ counter
296 mov r2, r0 @ VXYZ(0)
2970:
298 vldmia r2!, {d8} @ VXYZ(v)
299 vmov.16 d8[3], r12 @ kill unused upper vector
300
301 rtpx_mac
302 vmin.s32 d22, d8 @ min gteMAC|12
303 vmax.s32 d23, d8 @ max gteMAC|12
304 subs r3, #1
305 vst1.32 {d9,d10}, [r1, :128]!
306 bgt 0b
307
308 vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags
309
310 @ - phase2 -
311 sub r1, r1, #8*2*4
312 vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY
313
314 vmov d20, d0 @ gteMAC3 v=0
315 vmin.s16 d24, d1, d3 @ | find min IR
316 vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp)
317 vmax.s16 d25, d1, d3 @ | .. also max, for flag gen
318 vsli.u64 d20, d2, #32 @ gteMAC3 v=1
319 vmov d21, d9 @ ... v=2
320
321 vmov.i32 q14, #0xffff @ 0xffff[32]
322 vmax.s32 q10, q15
323 vmov.i32 q13, #1
324 vdup.32 q11, d22[0] @ gteH/2
325 vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v)
326 vmin.s16 d24, d10 @ | find min/max IR
327 vmax.s16 d25, d10 @ |
328
329 add r3, r0, #4*19 @ ||
330 vld1.32 d14[0], [r3] @ || gteSZ3
331
332 vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)?
333 add r3, r0, #4*17
334 vst1.32 d20, [r3]! @ | writeback fSZ(v)
335 vand q11, q10, q11
336 vst1.32 d21[0], [r3] @ |
337 vmax.u32 q10, q11, q13 @ make divisor 1 if not
338 add r3, r1, #8*8
339 vstmia r3, {q12} @ min/max IR for flags
340 vcvt.f32.u32 q10, q10
341 vshl.u32 d13, d12, #16 @ | preparing gteH
342
343 @ while NEON's busy we calculate some flags on ARM
344 add r2, r1, #8*2*3
345 mov lr, #0 @ gteFLAG
346 ldmia r2, {r4-r7} @ min/max gteMAC|12
347 subs r2, r4, #1
348 orrvs lr, #(1<<31)|(1<<27)
349 subs r3, r5, #1
350 orrvs lr, #(1<<31)|(1<<26)
351 adds r2, r6, #1
352 orrvs lr, #(1<<30)
353 adds r3, r7, #1
354 orrvs lr, #(1<<29)
355 ldr r4, [r1, #0] @ gteMAC3 v=0
356 ldr r5, [r1, #8*2] @ ... v=1
357 ldr r6, [r1, #8*4] @ ... v=2
358
359 add r3, r0, #4*(32+24)
360 vld1.32 d4, [r3] @ || gteOF|XY
361 add r3, r0, #4*(32+27)
362 vld1.32 d6, [r3] @ || gteDQ|AB
363
364 @ divide
365.if 1
366 vrecpe.f32 q11, q10 @ inv
367 vmovl.s32 q2, d4 @ || gteOF|XY [64]
368 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
369 vrecps.f32 q12, q10, q11 @ step
370 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
371 vmov.f32 q8, #0.5 @ |||
372 vmul.f32 q11, q12, q11 @ better inv
373 add r3, r0, #4*16
374 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
375 vdup.32 q13, d13[0] @ |
376@ vrecps.f32 q12, q10, q11 @ step
377@ vmul.f32 q11, q12, q11 @ better inv
378 vmul.f32 q10, q13, q11 @ result
379.else
@ disabled reference path: exact VFP divides, kept for comparing
@ against the C implementation
380 vmov.f32 q8, #0.5 @ |||
381 vmovl.s32 q2, d4 @ || gteOF|XY [64]
382 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
383 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
384 vdup.32 q13, d13[0] @ |
385 add r3, r0, #4*16
386 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
387
388 vpush {q0}
389 vmov q0, q10 @ to test against C code
390 vdiv.f32 s0, s26, s0
391 vdiv.f32 s1, s26, s1
392 vdiv.f32 s2, s26, s2
393 vmov q10, q0
394 vpop {q0}
395.endif
396
397 do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<27) @ MAC3
398 orr r7, r4, r5
399 add r4, r1, #8*8
400 orr r3, r7, r6
401 ldmia r4, {r7,r8,r10,r11} @ min/max IR
402
403 movs r3, r3, lsr #16
404 orrne lr, #(1<<31)
405 orrne lr, #(1<<18) @ fSZ (limD)
406
407 vadd.f32 q10, q8 @ adjust for vcvt rounding mode
408 vcvt.u32.f32 q8, q10
409 vmovl.s16 q9, d1 @ expand gteIR|12 v=0
410 vmovl.s16 q10, d3 @ expand gteIR|12 v=1
411 add r6, r1, #8*10
412 vstmia r6, {q8} @ wb quotients for flags (pre-limE)
413 vqshl.u32 q8, #15
414 vmovl.s16 q11, d10 @ expand gteIR|12 v=2
415 vshr.u32 q8, #15 @ quotients (limE)
416 vdup.32 d24, d16[0]
417 vdup.32 d25, d16[1]
418 vdup.32 d26, d17[0] @ quotient (dup)
419
420 @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
421 mov r4, #0x10000
422 cmp r7, #1<<16
423 cmnvc r10, #1<<16
424 orrvs lr, #(1<<31)
425 orrvs lr, #(1<<23) @ IR2/limB2
426 rsbs r2, r4, r7, lsl #16
427 cmnvc r4, r10, lsl #16
428 orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1
429 rsbs r2, r4, r8, lsl #16
430 cmnvc r4, r11, lsl #16
431 orrvs lr, #(1<<22) @ IR3/limB3
432
433 vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0
434 vmull.s32 q10, d20, d25 @ ... v=1
435 vmull.s32 q11, d22, d26 @ ... v=2
436 vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient
437 vadd.s64 q10, q2 @ ... v=1
438 vadd.s64 q11, q2 @ ... v=2
439 vqmovn.s64 d18, q9 @ saturate to 32 v=0
440 vqmovn.s64 d19, q10 @ ... v=1
441 vqmovn.s64 d20, q11 @ ... v=2
442 vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32]
443 vmax.s32 d15, d18, d19 @ || for flags
444 vmin.s32 d14, d20
445 vmax.s32 d15, d20
446 vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1
447 vqshl.s32 d24, d20, #5 @ ... v=2
448 vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2
449 vpmin.s32 d16, d14, d31 @ || also find min/max in pair
450 vpmax.s32 d17, d15, d31 @ ||
451 vshr.s32 q11, #16+5 @ can't vqshrn because of insn
452 vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :(
453 vsli.u64 d16, d17, #32 @ || pack in-pair min/max
454 vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient
455 vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1
456 vmovn.s32 d13, q12 @ 3
457 vstmia r1, {d14-d16} @ || other cacheline than quotients
458 add r3, r0, #4*12
459 vst1.32 d12, [r3]! @ writeback fS|XY v=0,1
460 vst1.32 d13[0], [r3]
461
462 vqmovn.s64 d26, q13 @ | gteMAC0
463 vmovl.u16 q5, d10 @ expand gteIR|123 v=2
464
465 vmov.i32 d13, #0x1000
466 vshr.s32 d12, d26, #12
467
468 add r3, r0, #4*24
469 vst1.32 d26[0], [r3]! @ gteMAC0
470 vmax.s32 d12, d30
471 vst1.32 d8, [r3]! @ gteMAC123 (last iteration)
472 vst1.32 d9[0], [r3]
473
474 vmin.s32 d12, d13 @ | gteIR0
475
476 ldmia r6, {r4-r6} @ quotients
477 orr r4, r5
478 orr r4, r6
479 add r3, r0, #4*8
480 movs r4, r4, lsr #17
481
482 vst1.32 d12[0], [r3]! @ gteIR0
483 vst1.32 d10, [r3]! @ gteIR12
484 vst1.32 d11[0], [r3] @ ..3
485
486 @ ~23 cycles
487 orrne lr, #(1<<31) @ limE
488 orrne lr, #(1<<17) @ limE
489 ldmia r1, {r4-r9}
490 add r2, r4, #0x400<<16 @ min fSX
491 add r3, r6, #0x400<<16 @ max fSX
492 lsrs r2, #16+11
493 lsrseq r3, #16+11
494 orrne lr, #(1<<31) @ limG1
495 orrne lr, #(1<<14)
496 add r2, r5, #0x400<<16 @ min fSY
497 add r3, r7, #0x400<<16 @ max fSY
498 lsrs r2, #16+11
499 lsrseq r3, #16+11
500 orrne lr, #(1<<31) @ limG2
501 orrne lr, #(1<<13)
502 adds r2, r9, #1
503 orrvs lr, #(1<<16) @ F (31 already done by above)
504 subs r3, r8, #1
505
@ ldr preserves flags; the orrvs below uses V from the subs above
506 ldr r4, [r0, #4*24] @ gteMAC0
507 orrvs lr, #(1<<15)
508
509 adds r3, r4, #1
510 orrvs lr, #(1<<16)
511 orrvs lr, #(1<<31) @ F
512 subs r2, r4, #1
513 orrvs lr, #(1<<15)
514 orrvs lr, #(1<<31) @ F
515 cmp r4, #0x1000
516 orrhi lr, #(1<<12) @ limH
517
518 str lr, [r0, #4*(32+31)] @ gteFLAG
519
520 pop {r4-r11,pc}
521 .size gteRTPT_neon, .-gteRTPT_neon
522
523
524
525@ note: non-std calling convention used
526@ r0 = CP2 (d,c) (must preserve)
527@ r1 = op
528@ r4,r5 = VXYZ(v) packed
529@ r6 = &MX11(mx)
530@ r7 = &CV1(cv)
@ Computes MAC = (CV<<12 + MX*V) [>>12 if op bit 19 (sf) is set], with
@ saturation, and writes back gteMAC|123 and gteIR|123.  If op bit 10
@ (lm) is set, IR is clamped to >= 0 (vmax against zeroed d31) before
@ the writeback.  Flag generation is left to gteMACtoIR_flags_neon.
@ trash: r3, r5 and NEON regs q0-q5, q8-q10, q15
531FUNCTION(gteMVMVA_part_neon):
532 uxth r5, r5
533 vmov.32 d8[0], r4
534 vmov.32 d8[1], r5 @ VXYZ(v)
535 vldmia r6, {d0-d2} @ MXxy/gteR* [16*9]
536 vldmia r7, {d4-d5} @ CVx/gteTR*
537
538 vmov.i32 q15, #0
539 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
540 vext.16 d1, d0, d1, #3 @ xx32 -> x321
541 vshll.s32 q3, d5, #12 @ gteTRZ/CV3
542 vshll.s32 q2, d4, #12 @ gteTR|XY/CV12
543
544 vmull.s16 q8, d0, d8
545 vmull.s16 q9, d1, d8
546 vmull.s16 q10, d2, d8
547 vpadd.s32 d16, d16, d17
548 vpadd.s32 d17, d18, d19
549 vpadd.s32 d18, d20, d21
550 vpadal.s32 q2, q8
551 vpadal.s32 q3, q9
552 tst r1, #1<<19
553 beq 0f
554 vshr.s64 q2, q2, #12
555 vshr.s64 q3, q3, #12
5560:
557 vqmovn.s64 d8, q2 @ gteMAC|12
558 vqmovn.s64 d9, q3 @ gteMAC3
559
560 tst r1, #1<<10
561 add r3, r0, #4*25
562 vqmovn.s32 d10, q4 @ gteIR|123
563 vst1.32 d8, [r3]!
564 vst1.32 d9[0], [r3] @ wb gteMAC|123
565
566 beq 0f
567 vmax.s16 d10, d31
5680:
569 vmovl.s16 q9, d10 @ expand gteIR|123
570 add r3, r0, #4*9
571 vst1.32 d18, [r3]!
572 vst1.32 d19[0], [r3]
573 bx lr
574 .size gteMVMVA_part_neon, .-gteMVMVA_part_neon
575
576
577@ get flags after gteMVMVA_part_neon operation
@ Reads gteMAC|123 back from the context, derives the MAC overflow and
@ IR saturation (limB) bits and stores gteFLAG.
@ in:    r0 = CP2 context, r1 = lm flag
@        lm == 0: adj = 0x8000, shift = 16 -> "(mac+0x8000)>>16 != 0"
@                 flags IR values outside -0x8000..0x7fff
@        lm != 0: adj = 0,      shift = 15 -> "mac>>15 != 0" (arithmetic)
@                 flags IR values outside 0..0x7fff
@ trash: r2, r3, r12 (r4, r5 preserved via stack; lr holds gteFLAG)
578FUNCTION(gteMACtoIR_flags_neon): @ r0=CP2 (d,c), r1=lm
579 push {r4,r5,lr}
580 tst r1, r1 @ lm
581 mov lr, #0 @ gteFLAG
582 mov r2, #0
583 mov r12, #15
584 moveq r2, #0x8000 @ adj
585 moveq r12, #16 @ shift
586
587 add r3, r0, #4*25
588 ldmia r3, {r3-r5} @ gteMAC|123
589
590 do_mac_flags r3, r4, r5
591
592 add r3, r2
593 add r4, r2
594 add r5, r2
595 asrs r3, r12
596 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
597 asrs r4, r12
598 orrne lr, #(1<<31)
599 orrne lr, #(1<<23) @ IR2/limB2
600 asrs r5, r12
601 orrne lr, #(1<<22) @ IR3/limB3
602 str lr, [r0, #4*(32+31)] @ gteFLAG
603
604 pop {r4,r5,pc}
605 .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon
606
607
608
609@ vim:filetype=armasm