/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */


.bss
.align 6 @ cacheline

scratch:
.rept 8*8*2/4
    .word 0
.endr

.text
.align 2

@ XXX: gteMAC calc shouldn't be saturating, but it is here

@ approximate gteMAC|123 flags
@ in: rr 123 as gteMAC|123
@ trash: nothing
.macro do_mac_flags rr1 rr2 rr3
    cmp \rr1, #1
    orrvs lr, #(1<<31)|(1<<27)
    cmp \rr2, #1
    orrvs lr, #(1<<31)|(1<<26)
    cmp \rr3, #1
    orrvs lr, #(1<<31)|(1<<25)
    cmn \rr1, #1 @ same as adds ...
    orrvs lr, #(1<<30)
    cmn \rr2, #1
    orrvs lr, #(1<<29)
    cmn \rr3, #1
    orrvs lr, #(1<<28)
.endm
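
@ Rough C equivalent of the approximation above (an illustrative sketch, not
@ the canonical GTE rule): the 32-bit MAC values produced by vqshrn/vqmovn are
@ already saturated, so pegging at INT32_MIN/INT32_MAX is what the V-flag
@ tricks detect.
/*
#include <stdint.h>

static uint32_t approx_mac_flags(int32_t mac1, int32_t mac2, int32_t mac3)
{
    uint32_t flags = 0;
    if (mac1 == INT32_MIN) flags |= (1u << 31) | (1u << 27); // cmp #1 overflows
    if (mac2 == INT32_MIN) flags |= (1u << 31) | (1u << 26);
    if (mac3 == INT32_MIN) flags |= (1u << 31) | (1u << 25);
    if (mac1 == INT32_MAX) flags |= 1u << 30;                // cmn #1 overflows
    if (mac2 == INT32_MAX) flags |= 1u << 29;
    if (mac3 == INT32_MAX) flags |= 1u << 28;
    return flags;
}
*/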

@ approximate 3x gteMACn flags
@ in: rr 123 as 3 instances gteMACn, *flags
@ trash: nothing
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    cmp \rr1, #1
    cmpvc \rr2, #1
    cmpvc \rr3, #1
    orrvs lr, #\nflags
    cmn \rr1, #1 @ adds ...
    cmnvc \rr2, #1
    cmnvc \rr3, #1
    orrvs lr, #\pflags
.endm
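
@ Same idea as do_mac_flags, but three instances of one MACn are folded into
@ a single flag pair via the cmpvc/cmnvc chains (once V is set, the remaining
@ conditional compares are skipped and V sticks).  Sketch, same assumptions
@ as above:
/*
#include <stdint.h>

static uint32_t approx_macn_flags_3x(int32_t a, int32_t b, int32_t c,
                                     uint32_t nflags, uint32_t pflags)
{
    uint32_t flags = 0;
    if (a == INT32_MIN || b == INT32_MIN || c == INT32_MIN) flags |= nflags;
    if (a == INT32_MAX || b == INT32_MAX || c == INT32_MAX) flags |= pflags;
    return flags;
}
*/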
53
17ed0d69 54@ get gteIR|123 flags from gteMAC|123
55@ in: rr 123 as gteMAC|123
5d8e3bf8 56@ trash: r2,r3
57.macro do_irs_flags rr1 rr2 rr3
17ed0d69 58 add r2, \rr1, #0x8000
59 add r3, \rr2, #0x8000
60 lsrs r2, #16
61 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
62 lsrs r3, #16
63 add r2, \rr3, #0x8000
64 orrne lr, #(1<<31)
65 orrne lr, #(1<<23) @ IR2/limB2
66 lsrs r2, #16
67 orrne lr, #(1<<22) @ IR3/limB3
8cfbda97 68.endm
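
@ The +0x8000 / lsr #16 pairs above test "outside -0x8000..0x7fff", i.e.
@ whether IRn would have to be clamped.  Illustrative C for one value:
/*
#include <stdint.h>

static int ir_would_saturate(int32_t mac)
{
    return ((uint32_t)mac + 0x8000) >> 16 != 0;   // nonzero => IRn clamped
}
*/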


/*
 * RTPS/RTPT register map:
 *
 * q | d | c code / phase 1     phase 2            scratch
 * 0   0   gteR1* [s16]         gteMAC3     =      gteMAC3  \ v=0 *
 *     1   gteR2*               gteIR1-3    =      gteIR1-3 /     *
 * 1   2   gteR3*               gteMAC3     =      gteMAC3  \ v=1
 *     3   *                    gteIR1-3    =      gteIR1-3 /
 * 2   4   gteTRX<<12 [s64]     gteOFX [s64]       gteMAC3  \ v=2
 *     5   gteTRY<<12           gteOFY [s64]       gteIR1-3 /
 * 3   6   gteTRZ<<12           gteDQA [s64]       min gteMAC|12 v=012
 *     7   0                    gteDQB [s64]       max gteMAC|12
 * 4   8   VXYZ(v)  / gteMAC1,2 [s32]              min gteIR|123
 *     9   *        / gteMAC3                      max gteIR|123
 * 5  10   gteIR1-3 [s16]       gteIR1-3 v=2       quotients 12
 *    11   0                                       quotient 3
 * 6  12   gteH (adj. for cmp)
 *    13   gteH (float for div)
 * ...     <scratch>
 * 15 30   0
 *    31   0
 */

@ load gteR*, gteTR* and gteH (see map above), clear q15
@ in: r0 - context
@ trash: r3
.macro rtpx_preload
    add r3, r0, #4*32
    vldmia r3, {d0-d2} @ gteR* [16*9]
    vmov.i32 q15, #0
    add r3, r0, #4*(32+5)
    vldmia r3, {d4-d5} @ gteTR*
    vext.16 d2, d1, d2, #2 @ xxx3 -> x321
    vext.16 d1, d0, d1, #3 @ xx32 -> x321
    add r3, r0, #4*(32+26)
    vld1.32 d11[0], [r3] @ gteH
    vshll.s32 q3, d5, #12 @ gteTRZ
    vshll.s32 q2, d4, #12 @ gteTR|XY
    vmovl.s16 q6, d11 @ gteH
.endm
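
@ For reference: gteR* is stored packed, two s16 per control word
@ (R11R12, R13R21, R22R23, R31R32, R33xx); the two vext.16 above shuffle
@ that into one matrix row per D register (the 4th lane is junk and is
@ neutralised by zeroing VXYZ lane 3 before the multiplies).  Unpacking
@ sketch in C; names are illustrative, not from the emulator:
/*
#include <stdint.h>

static void unpack_rotation(const uint32_t cr[5], int16_t R[3][3])
{
    R[0][0] = (int16_t)cr[0]; R[0][1] = (int16_t)(cr[0] >> 16); R[0][2] = (int16_t)cr[1];
    R[1][0] = (int16_t)(cr[1] >> 16); R[1][1] = (int16_t)cr[2]; R[1][2] = (int16_t)(cr[2] >> 16);
    R[2][0] = (int16_t)cr[3]; R[2][1] = (int16_t)(cr[3] >> 16); R[2][2] = (int16_t)cr[4];
}
*/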

@ do RTP* gteMAC* calculation
@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
@ trash: d16-d21
.macro rtpx_mac
    vmull.s16 q8, d0, d8
    vmull.s16 q9, d1, d8
    vmull.s16 q10, d2, d8
    vpaddl.s32 q8, q8
    vpaddl.s32 q9, q9
    vpaddl.s32 q10, q10
    vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
    vadd.s64 d18, d19 @ d8[3]==0, so won't affect
    vadd.s64 d20, d21 @ QC
    vadd.s64 d16, d4
    vadd.s64 d18, d5
    vadd.s64 d20, d6
    vqshrn.s64 d8, q8, #12 @ gteMAC1
    vqshrn.s64 d18, q9, #12 @ gteMAC2
    vqshrn.s64 d9, q10, #12 @ gteMAC3
    vsli.u64 d8, d18, #32 @ gteMAC|12
    vmov.32 d9[1], r12
    vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
.endm
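
@ What one RTP step computes per vertex (sf=1 path), as plain C, ignoring
@ flags and IR clamping; a sketch of the math the map above is laid out for,
@ with the caveat that the NEON code saturates where noted:
/*
#include <stdint.h>

static void rtp_mac_sketch(const int16_t R[3][3], const int32_t TR[3],
                           const int16_t V[3], int32_t mac[3])
{
    for (int i = 0; i < 3; i++) {
        int64_t acc = (int64_t)TR[i] << 12;          // gteTR*<<12 from rtpx_preload
        for (int j = 0; j < 3; j++)
            acc += (int64_t)R[i][j] * V[j];          // row dot product
        mac[i] = (int32_t)(acc >> 12);               // vqshrn #12 (saturating there)
    }
}
*/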

.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push {r4-r6,lr}

@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
    movw r1, #:lower16:scratch
    movt r1, #:upper16:scratch
    mov r12, #0

    vldmia r0, {d8} @ VXYZ(0)
    rtpx_preload

@ rtpx_mac @ slower here, faster in RTPT?
    vmov.16 d8[3], r12 @ kill unused upper vector
    vmull.s16 q8, d0, d8
    vmull.s16 q9, d1, d8
    vmull.s16 q10, d2, d8
    vpadd.s32 d16, d16, d17
    vpadd.s32 d17, d18, d19
    vpadd.s32 d18, d20, d21
    vpadal.s32 q2, q8
    vpadal.s32 q3, q9 @ d6, d18 is slow?
    vqshrn.s64 d8, q2, #12 @ gteMAC|12
    vqshrn.s64 d9, q3, #12 @ gteMAC3

    add r3, r0, #4*25
    vst1.32 d8, [r3]!
    vst1.32 d9[0], [r3] @ wb gteMAC|123
    vqmovn.s32 d10, q4 @ gteIR|123

    add r3, r0, #4*17 @ gteSZ*
    vldmia r3, {q7} @ d14,d15 gteSZ|123x
    vmov.i32 d28, #0xffff @ 0xffff[32]
    vmax.s32 d11, d9, d31
    vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp)
    vmov.i32 d26, #1
    vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16 q9, d10 @ || expand gteIR|123
    vshl.u32 d13, d12, #16 @ | preparing gteH
    add r3, r0, #4*9
    vst1.32 d18, [r3]!
    vst1.32 d19[0], [r3]

    vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7
    vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3?

    add r3, r0, #4*(32+24)
    vld1.32 d4, [r3] @ || gteOF|XY
    add r3, r0, #4*(32+27)
    vld1.32 d6, [r3] @ || gteDQ|AB

    vand d11, d16
    vmovl.s32 q2, d4 @ || gteOF|XY [64]
    vmax.u32 d11, d26 @ make divisor 1 if not
    vmovl.s32 q3, d6 @ || gteDQ|AB [64]
    add r3, r0, #4*16 @ | gteSZ*
    vstmia r3, {q7} @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13 @ gteH (float for div)
    vcvt.f32.u32 d11, d11 @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use the VFP divider here
    vdiv.f32 s22, s26, s22

    vcvt.u32.f32 d11, d11 @ quotient

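@ Net effect of the divide above, in integer terms (a sketch; the actual code
@ goes through single-precision floats, which can round differently):
@ quotient ~= (gteH << 16) / fSZ3, clamped to 0x1ffff, with the divisor
@ forced to 1 (and limE raised later) when gteH/2 >= fSZ3.
/*
#include <stdint.h>

static uint32_t rtps_divide_sketch(uint16_t h, uint16_t sz3, uint32_t *flag)
{
    uint32_t div = (h / 2 < sz3) ? sz3 : 1;    // vclt/vand/vmax above
    uint32_t q = ((uint32_t)h << 16) / div;
    if (q > 0x1ffff) {                         // limE, done on ARM below
        *flag |= (1u << 31) | (1u << 17);
        q = 0x1ffff;
    }
    return q;
}
*/
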
    @ while NEON's busy we calculate some flags on ARM
    add r3, r0, #4*25
    mov lr, #0 @ gteFLAG
    ldmia r3, {r4-r6} @ gteMAC|123

    vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE)
    vqshl.u32 d11, #15

    do_mac_flags r4, r5, r6

    vshr.u32 d11, #15 @ quotient (limE)

    do_irs_flags r4, r5, r6

    vmlal.s32 q2, d18, d11[0] @ gteOF|XY + gteIR|12 * quotient
    add r3, r0, #4*13
    vld1.32 d16, [r3] @ || load fS|XY12, new 01
    vqmovn.s64 d18, q2 @ saturate to 32
    vmull.s32 q10, d6, d11[0] @ | d20 = gteDQA * quotient
    vqshl.s32 d19, d18, #5 @ 11bit precision

    ldr r4, [r1] @ quotient
    movs r3, r6, lsr #16
    orrne lr, #(1<<31)
    orrne lr, #(1<<18) @ fSZ (limD)

    vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG

    vshr.s32 d18, d19, #16+5 @ can't vqshrn because of insn encoding
    vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient
    vmovn.s32 d18, q9 @ fS|XY2 [s16]

    vqmovn.s64 d20, q10 @ | gteMAC0
    add r3, r0, #4*12
    vst1.32 d16, [r3]! @ writeback fS|XY01
    vst1.32 d18[0], [r3] @ ...2
    add r3, r0, #4*24
    vshr.s32 d21, d20, #12
    vst1.32 d20[0], [r3] @ gteMAC0

    movs r4, r4, lsr #17
    orrne lr, #(1<<31)
    orrne lr, #(1<<17) @ limE

    vmax.s32 d21, d31
    vmov.i32 d22, #0x1000
    vmin.s32 d21, d22
    add r3, r0, #4*8
    vst1.16 d21[0], [r3] @ gteIR0
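
@ Depth-cueing part just computed, as a hedged C sketch (MAC0 is written back
@ before the >>12 / clamp that produces IR0):
/*
#include <stdint.h>

static int32_t depth_cue_sketch(int32_t dqb, int32_t dqa, uint32_t quotient,
                                int32_t *mac0)
{
    int64_t m = (int64_t)dqb + (int64_t)dqa * quotient;  // vmull + vadd.s64
    *mac0 = (int32_t)m;                   // vqmovn.s64 saturates in the asm
    int32_t ir0 = *mac0 >> 12;            // vshr.s32 #12
    if (ir0 < 0)      ir0 = 0;            // vmax with 0
    if (ir0 > 0x1000) ir0 = 0x1000;       // vmin with 0x1000
    return ir0;
}
*/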

    ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat
    add r2, r4, #0x400<<16
    add r3, r5, #0x400<<16
    lsrs r2, #16+11
    orrne lr, #(1<<14) @ limG1
    orrne lr, #(1<<31)
    lsrs r3, #16+11
    orrne lr, #(1<<13) @ limG2
    orrne lr, #(1<<31)
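
@ limG check pattern used above: the stored fSX2/fSY2 are 16.16 values, and
@ adding 0x400<<16 then shifting right by 27 is nonzero exactly when the
@ integer part falls outside -0x400..0x3ff.  Sketch:
/*
#include <stdint.h>

static int sxy_out_of_range(int32_t sxy_16_16)
{
    return ((uint32_t)sxy_16_16 + (0x400u << 16)) >> (16 + 11) != 0;
}
*/
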
    adds r2, r4, #1
    addvcs r3, r5, #1
    orrvs lr, #(1<<16) @ F
    orrvs lr, #(1<<31)
    subs r2, r4, #1
    subvcs r3, r5, #1
    orrvs lr, #(1<<31)

    ldr r4, [r0, #4*24] @ gteMAC0
    orrvs lr, #(1<<15)

    adds r3, r4, #1
    orrvs lr, #(1<<16) @ F
    orrvs lr, #(1<<31)
    subs r2, r4, #1
    orrvs lr, #(1<<15) @ F
    orrvs lr, #(1<<31)
    cmp r4, #0x1000
    orrhi lr, #(1<<12) @ limH

    str lr, [r0, #4*(32+31)] @ gteFLAG

    pop {r4-r6,pc}
    .size gteRTPS_neon, .-gteRTPS_neon



.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push {r4-r11,lr}

    movw r1, #:lower16:scratch
    movt r1, #:upper16:scratch
    mov r12, #0

    rtpx_preload

    vmov.i32 d22, #0x7fffffff
    vmov.i32 d23, #0x80000000
    mov r3, #3 @ counter
    mov r2, r0 @ VXYZ(0)
0:
    vldmia r2!, {d8} @ VXYZ(v)
    vmov.16 d8[3], r12 @ kill unused upper vector

    rtpx_mac
    vmin.s32 d22, d8 @ min gteMAC|12
    vmax.s32 d23, d8 @ max gteMAC|12
    subs r3, #1
    vst1.32 {d9,d10}, [r1, :128]!
    bgt 0b

    vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub r1, r1, #8*2*4
    vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY

    vmov d20, d0 @ gteMAC3 v=0
    vmin.s16 d24, d1, d3 @ | find min IR
    vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp)
    vmax.s16 d25, d1, d3 @ | .. also max, for flag gen
    vsli.u64 d20, d2, #32 @ gteMAC3 v=1
    vmov d21, d9 @ ... v=2

    vmov.i32 q14, #0xffff @ 0xffff[32]
    vmax.s32 q10, q15
    vmov.i32 q13, #1
    vdup.32 q11, d22[0] @ gteH/2
    vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16 d24, d10 @ | find min/max IR
    vmax.s16 d25, d10 @ |

    add r3, r0, #4*19 @ ||
    vld1.32 d14[0], [r3] @ || gteSZ3

    vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)?
    add r3, r0, #4*17
    vst1.32 d20, [r3]! @ | writeback fSZ(v)
    vand q11, q10, q11
    vst1.32 d21[0], [r3] @ |
    vmax.u32 q10, q11, q13 @ make divisor 1 if not
    add r3, r1, #8*8
    vstmia r3, {q12} @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32 d13, d12, #16 @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add r2, r1, #8*2*3
    mov lr, #0 @ gteFLAG
    ldmia r2, {r4-r7} @ min/max gteMAC|12
    subs r2, r4, #1
    orrvs lr, #(1<<31)|(1<<27)
    subs r3, r5, #1
    orrvs lr, #(1<<31)|(1<<26)
    adds r2, r6, #1
    orrvs lr, #(1<<30)
    adds r3, r7, #1
    orrvs lr, #(1<<29)
    ldr r4, [r1, #0] @ gteMAC3 v=0
    ldr r5, [r1, #8*2] @ ... v=1
    ldr r6, [r1, #8*4] @ ... v=2

    add r3, r0, #4*(32+24)
    vld1.32 d4, [r3] @ || gteOF|XY
    add r3, r0, #4*(32+27)
    vld1.32 d6, [r3] @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32 q11, q10 @ inv
    vmovl.s32 q2, d4 @ || gteOF|XY [64]
    vmovl.s32 q3, d6 @ || gteDQ|AB [64]
    vrecps.f32 q12, q10, q11 @ step
    vcvt.f32.u32 d13, d13 @ | gteH (float for div)
    vmul.f32 q11, q12, q11 @ better inv
    add r3, r0, #4*16
    vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
    vdup.32 q13, d13[0] @ |
@ vrecps.f32 q12, q10, q11 @ step
@ vmul.f32 q11, q12, q11 @ better inv
    vmul.f32 q10, q13, q11 @ result
.else
    vmovl.s32 q2, d4 @ || gteOF|XY [64]
    vmovl.s32 q3, d6 @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13 @ | gteH (float for div)
    vdup.32 q13, d13[0] @ |
    add r3, r0, #4*16
    vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3

    vpush {q0}
    vmov q0, q10 @ to test against C code
    vdiv.f32 s0, s26, s0
    vdiv.f32 s1, s26, s1
    vdiv.f32 s2, s26, s2
    vmov q10, q0
    vpop {q0}
.endif
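
@ The .if 1 path computes the three reciprocals with VRECPE (a rough estimate)
@ refined by one Newton-Raphson step; VRECPS returns (2 - d*x) and the
@ following VMUL applies it.  Scalar sketch of that refinement, with x0
@ standing in for the vrecpe estimate:
/*
static float recip_refine(float d, float x0)
{
    return x0 * (2.0f - d * x0);   // one vrecps + vmul step
}
*/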

    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<27) @ MAC3
    orr r7, r4, r5
    add r4, r1, #8*8
    orr r3, r7, r6
    ldmia r4, {r7,r8,r10,r11} @ min/max IR

    movs r3, r3, lsr #16
    orrne lr, #(1<<31)
    orrne lr, #(1<<18) @ fSZ (limD)

@ vadd.f32 q10, q @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16 q9, d1 @ expand gteIR|12 v=0
    vmovl.s16 q10, d3 @ expand gteIR|12 v=1
    add r6, r1, #8*10
    vstmia r6, {q8} @ wb quotients for flags (pre-limE)
    vqshl.u32 q8, #15
    vmovl.s16 q11, d10 @ expand gteIR|12 v=2
    vshr.u32 q8, #15 @ quotients (limE)
    vdup.32 d24, d16[0]
    vdup.32 d25, d16[1]
    vdup.32 d26, d17[0] @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    mov r4, #0x10000
    cmp r7, #1<<16
    cmnvc r10, #1<<16
    orrvs lr, #(1<<31)
    orrvs lr, #(1<<23) @ IR2/limB2
    rsbs r2, r4, r7, lsl #16
    cmnvc r4, r10, lsl #16
    orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1
    rsbs r2, r4, r8, lsl #16
    cmnvc r4, r11, lsl #16
    orrvs lr, #(1<<22) @ IR3/limB3

    vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0
    vmull.s32 q10, d20, d25 @ ... v=1
    vmull.s32 q11, d22, d26 @ ... v=2
    vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient
    vadd.s64 q10, q2 @ ... v=1
    vadd.s64 q11, q2 @ ... v=2
    vqmovn.s64 d18, q9 @ saturate to 32 v=0
    vqmovn.s64 d19, q10 @ ... v=1
    vqmovn.s64 d20, q11 @ ... v=2
    vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32]
    vmax.s32 d15, d18, d19 @ || for flags
    vmin.s32 d14, d20
    vmax.s32 d15, d20
    vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1
    vqshl.s32 d24, d20, #5 @ ... v=2
    vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2
    vpmin.s32 d16, d14, d31 @ || also find min/max in pair
    vpmax.s32 d17, d15, d31 @ ||
    vshr.s32 q11, #16+5 @ can't vqshrn because of insn
    vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :(
    vsli.u64 d16, d17, #32 @ || pack in-pair min/max
    vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient
    vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1
    vmovn.s32 d13, q12 @ 3
    vstmia r1, {d14-d16} @ || other cacheline than quotients
    add r3, r0, #4*12
    vst1.32 d12, [r3]! @ writeback fS|XY v=0,1
    vst1.32 d13[0], [r3]

    vqmovn.s64 d26, q13 @ | gteMAC0
    vmovl.u16 q5, d10 @ expand gteIR|123 v=2

    vmov.i32 d13, #0x1000
    vshr.s32 d12, d26, #12

    add r3, r0, #4*24
    vst1.32 d26[0], [r3]! @ gteMAC0
    vmax.s32 d12, d30
    vst1.32 d8, [r3]! @ gteMAC123 (last iteration)
    vst1.32 d9[0], [r3]

    vmin.s32 d12, d13 @ | gteIR0

    ldmia r6, {r4-r6} @ quotients
    orr r4, r5
    orr r4, r6
    add r3, r0, #4*8
    movs r4, r4, lsr #17

    vst1.32 d12[0], [r3]! @ gteIR0
    vst1.32 d10, [r3]! @ gteIR12
    vst1.32 d11[0], [r3] @ ..3

    @ ~23 cycles
    orrne lr, #(1<<31) @ limE
    orrne lr, #(1<<17) @ limE
    ldmia r1, {r4-r9}
    add r2, r4, #0x400<<16 @ min fSX
    add r3, r6, #0x400<<16 @ max fSX
    lsrs r2, #16+11
    lsreqs r3, #16+11
    orrne lr, #(1<<31) @ limG1
    orrne lr, #(1<<14)
    add r2, r5, #0x400<<16 @ min fSY
    add r3, r7, #0x400<<16 @ max fSY
    lsrs r2, #16+11
    lsreqs r3, #16+11
    orrne lr, #(1<<31) @ limG2
    orrne lr, #(1<<13)
    adds r2, r9, #1
    orrvs lr, #(1<<16) @ F (31 already done by above)
    subs r3, r8, #1

    ldr r4, [r0, #4*24] @ gteMAC0
    orrvs lr, #(1<<15)

    adds r3, r4, #1
    orrvs lr, #(1<<16)
    orrvs lr, #(1<<31) @ F
    subs r2, r4, #1
    orrvs lr, #(1<<15)
    orrvs lr, #(1<<31) @ F
    cmp r4, #0x1000
    orrhi lr, #(1<<12) @ limH

    str lr, [r0, #4*(32+31)] @ gteFLAG

    pop {r4-r11,pc}
    .size gteRTPT_neon, .-gteRTPT_neon



@ note: non-std calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r1 = op
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
.global gteMVMVA_part_neon
gteMVMVA_part_neon:
    uxth r5, r5
    vmov.32 d8[0], r4
    vmov.32 d8[1], r5 @ VXYZ(v)
    vldmia r6, {d0-d2} @ MXxy/gteR* [16*9]
    vldmia r7, {d4-d5} @ CVx/gteTR*

    vmov.i32 q15, #0
    vext.16 d2, d1, d2, #2 @ xxx3 -> x321
    vext.16 d1, d0, d1, #3 @ xx32 -> x321
    vshll.s32 q3, d5, #12 @ gteTRZ/CV3
    vshll.s32 q2, d4, #12 @ gteTR|XY/CV12

    vmull.s16 q8, d0, d8
    vmull.s16 q9, d1, d8
    vmull.s16 q10, d2, d8
    vpadd.s32 d16, d16, d17
    vpadd.s32 d17, d18, d19
    vpadd.s32 d18, d20, d21
    vpadal.s32 q2, q8
    vpadal.s32 q3, q9
    tst r1, #1<<19
    beq 0f
    vshr.s64 q2, q2, #12
    vshr.s64 q3, q3, #12
0:
    vqmovn.s64 d8, q2 @ gteMAC|12
    vqmovn.s64 d9, q3 @ gteMAC3

    tst r1, #1<<10
    add r3, r0, #4*25
    vqmovn.s32 d10, q4 @ gteIR|123
    vst1.32 d8, [r3]!
    vst1.32 d9[0], [r3] @ wb gteMAC|123

    beq 0f
    vmax.s16 d10, d31
0:
    vmovl.s16 q9, d10 @ expand gteIR|123
    add r3, r0, #4*9
    vst1.32 d18, [r3]!
    vst1.32 d19[0], [r3]
    bx lr
    .size gteMVMVA_part_neon, .-gteMVMVA_part_neon
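
@ The part handled here, in C terms (a sketch; names are illustrative and
@ flag generation is left to gteMACtoIR_flags_neon below):
@   MACi = (CVi*0x1000 + MXi1*Vx + MXi2*Vy + MXi3*Vz) >> (sf*12)
@   IRi  = saturate(MACi), with a lower bound of 0 when lm is set,
@ where sf is op bit 19 and lm is op bit 10.
/*
#include <stdint.h>

static void mvmva_core_sketch(const int16_t MX[3][3], const int32_t CV[3],
                              const int16_t V[3], int sf, int lm,
                              int32_t mac[3], int16_t ir[3])
{
    for (int i = 0; i < 3; i++) {
        int64_t acc = (int64_t)CV[i] << 12;
        for (int j = 0; j < 3; j++)
            acc += (int64_t)MX[i][j] * V[j];
        if (sf)
            acc >>= 12;                      // op bit 19, vshr.s64 path
        mac[i] = (int32_t)acc;               // vqmovn.s64 saturates in the asm
        int32_t lo = lm ? 0 : -0x8000;       // op bit 10: vmax.s16 with 0
        int32_t v = mac[i] < lo ? lo : mac[i];
        ir[i] = v > 0x7fff ? 0x7fff : (int16_t)v;
    }
}
*/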


@ get flags after gteMVMVA_part_neon operation
.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm
gteMACtoIR_flags_neon:
    push {r4,r5,lr}
    tst r1, r1 @ lm
    mov lr, #0 @ gteFLAG
    mov r2, #0
    mov r12, #15
    moveq r2, #0x8000 @ adj
    moveq r12, #16 @ shift

    add r3, r0, #4*25
    ldmia r3, {r3-r5} @ gteMAC|123

    do_mac_flags r3, r4, r5

    add r3, r2
    add r4, r2
    add r5, r2
    asrs r3, r12
    orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
    asrs r4, r12
    orrne lr, #(1<<31)
    orrne lr, #(1<<23) @ IR2/limB2
    asrs r5, r12
    orrne lr, #(1<<22) @ IR3/limB3
    str lr, [r0, #4*(32+31)] @ gteFLAG

    pop {r4,r5,pc}
    .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon
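
@ What the IR-saturation test above amounts to (sketch): with lm clear the
@ IR range is -0x8000..0x7fff (add 0x8000, asr #16); with lm set it is
@ 0..0x7fff (asr #15).
/*
#include <stdint.h>

static int ir_saturates(int32_t mac, int lm)
{
    int64_t v = lm ? mac : (int64_t)mac + 0x8000;
    return (v >> (lm ? 15 : 16)) != 0;
}
*/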



@ vim:filetype=armasm