cdrom: resume cdda on state load
[pcsx_rearmed.git] / libpcsxcore / gte_neon.s
CommitLineData
8cfbda97 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
0c2ca3ba 4 * This work is licensed under the terms of GNU GPL version 2 or later.
8cfbda97 5 * See the COPYING file in the top-level directory.
6 */
7
8
@ Scratch buffer: 8*8*2 bytes (32 zero words), aligned to 2^6 bytes.
@ gteRTPS_neon and gteRTPT_neon point r1 at it and use it to pass
@ NEON results to the ARM-side flag calculations.
9.bss
10.align 6 @ cacheline
11
12scratch:
13.rept 8*8*2/4
14 .word 0
15.endr
16
17.text
18.align 2
19
59774ed0 20@ XXX: gteMAC calc shouldn't be saturating, but it is here
21
5d8e3bf8 22@ approximate gteMAC|123 flags
@ cmp x, #1 sets V only for x == 0x80000000 and cmn x, #1 sets V only for
@ x == 0x7fffffff, so each test fires when the value sits at an s32 limit.
@ The cmp hits OR in bit 31 together with bit 27/26/25; the cmn hits OR in
@ bit 30/29/28 only.
23@ in: rr 123 as gteMAC|123
24@ trash: nothing (flag bits are ORed into lr; CPSR flags are updated)
25.macro do_mac_flags rr1 rr2 rr3
26 cmp \rr1, #1
27 orrvs lr, #(1<<31)|(1<<27)
28 cmp \rr2, #1
29 orrvs lr, #(1<<31)|(1<<26)
30 cmp \rr3, #1
31 orrvs lr, #(1<<31)|(1<<25)
32 cmn \rr1, #1 @ same as adds ...
33 orrvs lr, #(1<<30)
34 cmn \rr2, #1
35 orrvs lr, #(1<<29)
36 cmn \rr3, #1
37 orrvs lr, #(1<<28)
38.endm
39
40@ approximate 3x gteMACn flags
@ The three values are tested as a chain: cmpvc/cmnvc only run while V is
@ still clear, so V ends up set if any of the three values hit the limit.
@ \nflags is ORed into lr when the cmp chain hits (a value == 0x80000000),
@ \pflags when the cmn chain hits (a value == 0x7fffffff).
41@ in: rr 123 as 3 instances gteMACn, *flags
42@ trash: nothing (flag bits are ORed into lr; CPSR flags are updated)
43.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
44 cmp \rr1, #1
45 cmpvc \rr2, #1
46 cmpvc \rr3, #1
47 orrvs lr, #\nflags
48 cmn \rr1, #1 @ adds ...
49 cmnvc \rr2, #1
50 cmnvc \rr3, #1
51 orrvs lr, #\pflags
52.endm
53
17ed0d69 54@ get gteIR|123 flags from gteMAC|123
@ (x + 0x8000) >> 16 is non-zero exactly when x is outside -0x8000..0x7fff,
@ i.e. when x does not fit in s16. The result bits are ORed into lr:
@ bit 24 and bit 23 come with bit 31, bit 22 is set on its own.
55@ in: rr 123 as gteMAC|123
5d8e3bf8 56@ trash: r2,r3
57.macro do_irs_flags rr1 rr2 rr3
17ed0d69 58 add r2, \rr1, #0x8000
59 add r3, \rr2, #0x8000
60 lsrs r2, #16
61 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
62 lsrs r3, #16
@ add does not touch the flags, so the two orrne below still see the
@ result of "lsrs r3" above
63 add r2, \rr3, #0x8000
64 orrne lr, #(1<<31)
65 orrne lr, #(1<<23) @ IR2/limB2
66 lsrs r2, #16
67 orrne lr, #(1<<22) @ IR3/limB3
8cfbda97 68.endm
69
70
71/*
5d8e3bf8 72 * RTPS/RTPT register map:
73 *
8cfbda97 74 * q | d | c code / phase 1 phase 2 scratch
75 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
76 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
77 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
78 * 3 * gteIR1-3 = gteIR1-3 /
79 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
80 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
81 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
82 * 7 0 gteDQB [s64] max gteMAC|12
83 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
84 * 9 * / gteMAC3 max gteIR|123
85 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
86 * 11 0 quotient 3
87 * 6 12 gteH (adj. for cmp)
88 * 13 gteH (float for div)
89 * ... <scratch>
90 * 15 30 0
91 * 31 0
92 */
8cfbda97 93
5d8e3bf8 94@ load gteR*, gteTR* and gteH (see map above), clear q15
@ After this macro: d0-d2 hold the three matrix rows (one row per d reg,
@ re-aligned by the two vext), q2/q3 hold gteTR* shifted left by 12 as
@ 64 bit values, q6 holds the s16 lanes of d11 widened to 32 bit (gteH
@ ends up in d12[0]) and q15 is zero.
95@ in: r0 - context
96@ trash: r3
97.macro rtpx_preload
8cfbda97 98 add r3, r0, #4*32
99 vldmia r3, {d0-d2} @ gteR* [16*9]
4cc3050c 100 vmov.i32 q15, #0
8cfbda97 101 add r3, r0, #4*(32+5)
102 vldmia r3, {d4-d5} @ gteTR*
4cc3050c 103 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
104 vext.16 d1, d0, d1, #3 @ xx32 -> x321
8cfbda97 105 add r3, r0, #4*(32+26)
106 vld1.32 d11[0], [r3] @ gteH
5d8e3bf8 107 vshll.s32 q3, d5, #12 @ gteTRZ
108 vshll.s32 q2, d4, #12 @ gteTR|XY
8cfbda97 109 vmovl.s16 q6, d11 @ gteH
5d8e3bf8 110.endm
8cfbda97 111
5d8e3bf8 112@ do RTP* gteMAC* calculation
@ Per matrix row: multiply the row by the vertex (s16 -> s32 products),
@ sum the products as s64, add the matching translation (d4/d5/d6, already
@ shifted left by 12), then shift right by 12 with saturation to s32.
@ The s32 results are finally narrowed with saturation to s16 for gteIR.
113@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
114@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
115@ trash: d16-d21
116.macro rtpx_mac
8cfbda97 117 vmull.s16 q8, d0, d8
118 vmull.s16 q9, d1, d8
119 vmull.s16 q10, d2, d8
120 vpaddl.s32 q8, q8
121 vpaddl.s32 q9, q9
122 vpaddl.s32 q10, q10
123 vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
124 vadd.s64 d18, d19 @ d8[3]==0, so won't affect
125 vadd.s64 d20, d21 @ QC
126 vadd.s64 d16, d4
127 vadd.s64 d18, d5
128 vadd.s64 d20, d6
129 vqshrn.s64 d8, q8, #12 @ gteMAC1
130 vqshrn.s64 d18, q9, #12 @ gteMAC2
131 vqshrn.s64 d9, q10, #12 @ gteMAC3
132 vsli.u64 d8, d18, #32 @ gteMAC|12
@ clear the upper half of d9 so the 4th lane narrowed into d10 is 0
133 vmov.32 d9[1], r12
5d8e3bf8 134 vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
135.endm
136
@ gteRTPS_neon - RTPS, transform and project a single vertex
@ in: r0 - CP2 register block (data regs first, control regs from +4*32)
@ out: gteMAC0-3, gteIR0-3, gteSZ*, fS|XY* and gteFLAG are written back
@      through r0; nothing is returned in registers
@ trash: r1-r3, r12, CPSR flags (r4-r6 and lr are saved on the stack)
@ NOTE(review): d8-d15 are modified here without being saved; presumably
@ the callers do not depend on them - confirm.
137.global gteRTPS_neon @ r0=CP2 (d,c),
138gteRTPS_neon:
17ed0d69 139 push {r4-r6,lr}
5d8e3bf8 140
141@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
142 movw r1, #:lower16:scratch
143 movt r1, #:upper16:scratch
144 mov r12, #0
145
4cc3050c 146 vldmia r0, {d8} @ VXYZ(0)
5d8e3bf8 147 rtpx_preload
148
4cc3050c 149@ rtpx_mac @ slower here, faster in RTPT?
5d8e3bf8 150 vmov.16 d8[3], r12 @ kill unused upper vector
4cc3050c 151 vmull.s16 q8, d0, d8
152 vmull.s16 q9, d1, d8
153 vmull.s16 q10, d2, d8
154 vpadd.s32 d16, d16, d17
155 vpadd.s32 d17, d18, d19
156 vpadd.s32 d18, d20, d21
157 vpadal.s32 q2, q8
158 vpadal.s32 q3, q9 @ d6, d18 is slow?
159 vqshrn.s64 d8, q2, #12 @ gteMAC|12
160 vqshrn.s64 d9, q3, #12 @ gteMAC3
5d8e3bf8 161
162 add r3, r0, #4*25
163 vst1.32 d8, [r3]!
164 vst1.32 d9[0], [r3] @ wb gteMAC|123
4cc3050c 165 vqmovn.s32 d10, q4 @ gteIR|123
5d8e3bf8 166
167 add r3, r0, #4*17 @ gteSZ*
168 vldmia r3, {q7} @ d14,d15 gteSZ|123x
169 vmov.i32 d28, #0xffff @ 0xffff[32]
170 vmax.s32 d11, d9, d31
171 vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp)
172 vmov.i32 d26, #1
173 vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3
4cc3050c 174 vmovl.s16 q9, d10 @ || expand gteIR|123
5d8e3bf8 175 vshl.u32 d13, d12, #16 @ | preparing gteH
176 add r3, r0, #4*9
177 vst1.32 d18, [r3]!
178 vst1.32 d19[0], [r3]
179
180 vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7
181 vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3?
182
183 add r3, r0, #4*(32+24)
184 vld1.32 d4, [r3] @ || gteOF|XY
185 add r3, r0, #4*(32+27)
186 vld1.32 d6, [r3] @ || gteDQ|AB
187
188 vand d11, d16
189 vmovl.s32 q2, d4 @ || gteOF|XY [64]
190 vmax.u32 d11, d26 @ make divisor 1 if not
191 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
192 add r3, r0, #4*16 @ | gteSZ*
193 vstmia r3, {q7} @ | d14,d15 gteSZ|123x
194
195 vcvt.f32.u32 d13, d13 @ gteH (float for div)
196 vcvt.f32.u32 d11, d11 @ divisor
197
198 @ divide.. it's not worth messing with reciprocals here
199 @ just for 1 value, let's just use VFP divider here
200 vdiv.f32 s22, s26, s22
201
202 vcvt.u32.f32 d11, d11 @ quotient
203
204 @ while NEON's busy we calculate some flags on ARM
205 add r3, r0, #4*25
206 mov lr, #0 @ gteFLAG
207 ldmia r3, {r4-r6} @ gteMAC|123
208
4cc3050c 209 vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE)
5d8e3bf8 210 vqshl.u32 d11, #15
211
212 do_mac_flags r4, r5, r6
213
214 vshr.u32 d11, #15 @ quotient (limE)
5d8e3bf8 215
5d8e3bf8 216 do_irs_flags r4, r5, r6
217
218 vmlal.s32 q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient
219 add r3, r0, #4*13
220 vld1.32 d16, [r3] @ || load fS|XY12, new 01
221 vqmovn.s64 d18, q2 @ saturate to 32
222 vmull.s32 q10, d6, d11[0]@ | d20 = gteDQA * quotient
223 vqshl.s32 d19, d18, #5 @ 11bit precision
224
225 ldr r4, [r1] @ quotient
5d8e3bf8 226 movs r3, r6, lsr #16
227 orrne lr, #(1<<31)
228 orrne lr, #(1<<18) @ fSZ (limD)
229
17ed0d69 230 vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG
231
5d8e3bf8 232 vshr.s32 d18, d19, #16+5@ can't vqshrn because of insn
233 vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient
234 vmovn.s32 d18, q9 @ fS|XY2 [s16]
235
0e828e88 236 vqmovn.s64 d20, q10 @ | gteMAC0
5d8e3bf8 237 add r3, r0, #4*12
238 vst1.32 d16, [r3]! @ writeback fS|XY01
239 vst1.32 d18[0], [r3] @ ...2
240 add r3, r0, #4*24
0e828e88 241 vshr.s32 d21, d20, #12
5d8e3bf8 242 vst1.32 d20[0], [r3] @ gteMAC0
243
17ed0d69 244 movs r4, r4, lsr #17
245 orrne lr, #(1<<31)
246 orrne lr, #(1<<17) @ limE
247
0e828e88 248 vmax.s32 d21, d31
5d8e3bf8 249 vmov.i32 d22, #0x1000
250 vmin.s32 d21, d22
251 add r3, r0, #4*8
252 vst1.16 d21[0], [r3] @ gteIR0
253
17ed0d69 254 ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat
255 add r2, r4, #0x400<<16
256 add r3, r5, #0x400<<16
257 lsrs r2, #16+11
258 orrne lr, #(1<<14) @ limG1
259 orrne lr, #(1<<31)
260 lsrs r3, #16+11
261 orrne lr, #(1<<13) @ limG2
5d8e3bf8 262 orrne lr, #(1<<31)
5d8e3bf8 263 adds r2, r4, #1
264 addvcs r3, r5, #1
265 orrvs lr, #(1<<16) @ F
266 orrvs lr, #(1<<31)
267 subs r2, r4, #1
268 subvcs r3, r5, #1
269 orrvs lr, #(1<<31)
270
271 ldr r4, [r0, #4*24] @ gteMAC0
272 orrvs lr, #(1<<15)
273
274 adds r3, r4, #1
275 orrvs lr, #(1<<16) @ F
276 orrvs lr, #(1<<31)
277 subs r2, r4, #1
278 orrvs lr, #(1<<15) @ F
279 orrvs lr, #(1<<31)
 @ gteIR0 was derived above as gteMAC0 >> 12 clamped to 0..0x1000, so the
 @ limH test has to look at the shifted value too; negative values become
 @ large unsigned numbers and are caught by the same "hi" test.
 mov r4, r4, asr #12
280 cmp r4, #0x1000
281 orrhi lr, #(1<<12) @ limH
282
283 str lr, [r0, #4*(32+31)] @ gteFLAG
284
17ed0d69 285 pop {r4-r6,pc}
5d8e3bf8 286 .size gteRTPS_neon, .-gteRTPS_neon
287
288
289
@ gteRTPT_neon - RTPT, transform and project three vertices
@ in: r0 - CP2 register block (data regs first, control regs from +4*32)
@ out: gteMAC0-3, gteIR0-3, gteSZ*, fS|XY* and gteFLAG are written back
@      through r0; nothing is returned in registers
@ trash: r1-r3, r12, CPSR flags (r4-r11 and lr are saved on the stack)
@ Phase 1 runs rtpx_mac once per vertex and parks gteMAC3/gteIR of each
@ vertex in scratch; phase 2 does the three divisions and projections at
@ once and builds gteFLAG from min/max values over the three vertices.
@ NOTE(review): d8-d15 are modified here without being saved; presumably
@ the callers do not depend on them - confirm.
290.global gteRTPT_neon @ r0=CP2 (d,c),
291gteRTPT_neon:
292 push {r4-r11,lr}
293
294 movw r1, #:lower16:scratch
295 movt r1, #:upper16:scratch
296 mov r12, #0
297
298 rtpx_preload
299
300 vmov.i32 d22, #0x7fffffff
301 vmov.i32 d23, #0x80000000
302 mov r3, #3 @ counter
303 mov r2, r0 @ VXYZ(0)
3040:
305 vldmia r2!, {d8} @ VXYZ(v)
306 vmov.16 d8[3], r12 @ kill unused upper vector
307
308 rtpx_mac
8cfbda97 309 vmin.s32 d22, d8 @ min gteMAC|12
310 vmax.s32 d23, d8 @ max gteMAC|12
311 subs r3, #1
4cc3050c 312 vst1.32 {d9,d10}, [r1, :128]!
8cfbda97 313 bgt 0b
314
4cc3050c 315 vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags
8cfbda97 316
317 @ - phase2 -
318 sub r1, r1, #8*2*4
319 vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY
320
321 vmov d20, d0 @ gteMAC3 v=0
322 vmin.s16 d24, d1, d3 @ | find min IR
5d8e3bf8 323 vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp)
8cfbda97 324 vmax.s16 d25, d1, d3 @ | .. also max, for flag gen
325 vsli.u64 d20, d2, #32 @ gteMAC3 v=1
326 vmov d21, d9 @ ... v=2
327
328 vmov.i32 q14, #0xffff @ 0xffff[32]
329 vmax.s32 q10, q15
330 vmov.i32 q13, #1
331 vdup.32 q11, d22[0] @ gteH/2
5d8e3bf8 332 vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v)
8cfbda97 333 vmin.s16 d24, d10 @ | find min/max IR
334 vmax.s16 d25, d10 @ |
335
5d8e3bf8 336 add r3, r0, #4*19 @ ||
337 vld1.32 d14[0], [r3] @ || gteSZ3
338
8cfbda97 339 vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)?
340 add r3, r0, #4*17
341 vst1.32 d20, [r3]! @ | writeback fSZ(v)
342 vand q11, q10, q11
343 vst1.32 d21[0], [r3] @ |
344 vmax.u32 q10, q11, q13 @ make divisor 1 if not
345 add r3, r1, #8*8
346 vstmia r3, {q12} @ min/max IR for flags
347 vcvt.f32.u32 q10, q10
348 vshl.u32 d13, d12, #16 @ | preparing gteH
349
350 @ while NEON's busy we calculate some flags on ARM
351 add r2, r1, #8*2*3
352 mov lr, #0 @ gteFLAG
353 ldmia r2, {r4-r7} @ min/max gteMAC|12
354 subs r2, r4, #1
355 orrvs lr, #(1<<31)|(1<<27)
356 subs r3, r5, #1
357 orrvs lr, #(1<<31)|(1<<26)
358 adds r2, r6, #1
359 orrvs lr, #(1<<30)
360 adds r3, r7, #1
361 orrvs lr, #(1<<29)
362 ldr r4, [r1, #0] @ gteMAC3 v=0
363 ldr r5, [r1, #8*2] @ ... v=1
364 ldr r6, [r1, #8*4] @ ... v=2
365
366 add r3, r0, #4*(32+24)
367 vld1.32 d4, [r3] @ || gteOF|XY
368 add r3, r0, #4*(32+27)
5d8e3bf8 369 vld1.32 d6, [r3] @ || gteDQ|AB
8cfbda97 370
371 @ divide
372.if 1
373 vrecpe.f32 q11, q10 @ inv
5d8e3bf8 374 vmovl.s32 q2, d4 @ || gteOF|XY [64]
375 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
8cfbda97 376 vrecps.f32 q12, q10, q11 @ step
377 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
4706bbe4 378 vmov.f32 q8, #0.5 @ |||
8cfbda97 379 vmul.f32 q11, q12, q11 @ better inv
5d8e3bf8 380 add r3, r0, #4*16
381 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
8cfbda97 382 vdup.32 q13, d13[0] @ |
4706bbe4 383@ vrecps.f32 q12, q10, q11 @ step
384@ vmul.f32 q11, q12, q11 @ better inv
8cfbda97 385 vmul.f32 q10, q13, q11 @ result
386.else
4706bbe4 387 vmov.f32 q8, #0.5 @ |||
5d8e3bf8 388 vmovl.s32 q2, d4 @ || gteOF|XY [64]
389 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
8cfbda97 390 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
391 vdup.32 q13, d13[0] @ |
5d8e3bf8 392 add r3, r0, #4*16
393 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
8cfbda97 394
395 vpush {q0}
396 vmov q0, q10 @ to test against C code
397 vdiv.f32 s0, s26, s0
398 vdiv.f32 s1, s26, s1
399 vdiv.f32 s2, s26, s2
400 vmov q10, q0
401 vpop {q0}
402.endif
403
 @ gteMAC3 of all three vertices: the cmp chain (value == 0x80000000)
 @ sets bit 25 with bit 31, the cmn chain (value == 0x7fffffff) sets
 @ bit 28 - the same bits do_mac_flags uses for its third operand.
5d8e3bf8 404 do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
8cfbda97 405 orr r7, r4, r5
406 add r4, r1, #8*8
407 orr r3, r7, r6
408 ldmia r4, {r7,r8,r10,r11} @ min/max IR
409
410 movs r3, r3, lsr #16
411 orrne lr, #(1<<31)
412 orrne lr, #(1<<18) @ fSZ (limD)
413
4706bbe4 414 vadd.f32 q10, q8 @ adjust for vcvt rounding mode
8cfbda97 415 vcvt.u32.f32 q8, q10
416 vmovl.s16 q9, d1 @ expand gteIR|12 v=0
417 vmovl.s16 q10, d3 @ expand gteIR|12 v=1
418 add r6, r1, #8*10
419 vstmia r6, {q8} @ wb quotients for flags (pre-limE)
420 vqshl.u32 q8, #15
421 vmovl.s16 q11, d10 @ expand gteIR|12 v=2
422 vshr.u32 q8, #15 @ quotients (limE)
423 vdup.32 d24, d16[0]
424 vdup.32 d25, d16[1]
425 vdup.32 d26, d17[0] @ quotient (dup)
426
5d8e3bf8 427 @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
428 mov r4, #0x10000
429 cmp r7, #1<<16
430 cmnvc r10, #1<<16
8cfbda97 431 orrvs lr, #(1<<31)
432 orrvs lr, #(1<<23) @ IR2/limB2
5d8e3bf8 433 rsbs r2, r4, r7, lsl #16
434 cmnvc r4, r10, lsl #16
8cfbda97 435 orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1
5d8e3bf8 436 rsbs r2, r4, r8, lsl #16
437 cmnvc r4, r11, lsl #16
8cfbda97 438 orrvs lr, #(1<<22) @ IR3/limB3
439
440 vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0
441 vmull.s32 q10, d20, d25 @ ... v=1
442 vmull.s32 q11, d22, d26 @ ... v=2
443 vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient
444 vadd.s64 q10, q2 @ ... v=1
445 vadd.s64 q11, q2 @ ... v=2
446 vqmovn.s64 d18, q9 @ saturate to 32 v=0
447 vqmovn.s64 d19, q10 @ ... v=1
448 vqmovn.s64 d20, q11 @ ... v=2
449 vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32]
450 vmax.s32 d15, d18, d19 @ || for flags
451 vmin.s32 d14, d20
452 vmax.s32 d15, d20
453 vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1
454 vqshl.s32 d24, d20, #5 @ ... v=2
455 vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2
5d8e3bf8 456 vpmin.s32 d16, d14, d31 @ || also find min/max in pair
457 vpmax.s32 d17, d15, d31 @ ||
8cfbda97 458 vshr.s32 q11, #16+5 @ can't vqshrn because of insn
459 vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :(
8cfbda97 460 vsli.u64 d16, d17, #32 @ || pack in-pair min/max
461 vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient
462 vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1
463 vmovn.s32 d13, q12 @ 3
464 vstmia r1, {d14-d16} @ || other cacheline than quotients
465 add r3, r0, #4*12
466 vst1.32 d12, [r3]! @ writeback fS|XY v=0,1
467 vst1.32 d13[0], [r3]
468
0e828e88 469 vqmovn.s64 d26, q13 @ | gteMAC0
8cfbda97 470 vmovl.u16 q5, d10 @ expand gteIR|123 v=2
471
472 vmov.i32 d13, #0x1000
0e828e88 473 vshr.s32 d12, d26, #12
8cfbda97 474
475 add r3, r0, #4*24
476 vst1.32 d26[0], [r3]! @ gteMAC0
0e828e88 477 vmax.s32 d12, d30
8cfbda97 478 vst1.32 d8, [r3]! @ gteMAC123 (last iteration)
479 vst1.32 d9[0], [r3]
480
481 vmin.s32 d12, d13 @ | gteIR0
482
8cfbda97 483 ldmia r6, {r4-r6} @ quotients
484 orr r4, r5
485 orr r4, r6
5d8e3bf8 486 add r3, r0, #4*8
8cfbda97 487 movs r4, r4, lsr #17
8cfbda97 488
8cfbda97 489 vst1.32 d12[0], [r3]! @ gteIR0
490 vst1.32 d10, [r3]! @ gteIR12
491 vst1.32 d11[0], [r3] @ ..3
492
17ed0d69 493 @ ~23 cycles
5d8e3bf8 494 orrne lr, #(1<<31) @ limE
495 orrne lr, #(1<<17) @ limE
8cfbda97 496 ldmia r1, {r4-r9}
17ed0d69 497 add r2, r4, #0x400<<16 @ min fSX
498 add r3, r6, #0x400<<16 @ max fSX
499 lsrs r2, #16+11
500 lsreqs r3, #16+11
501 orrne lr, #(1<<31) @ limG1
502 orrne lr, #(1<<14)
503 add r2, r5, #0x400<<16 @ min fSY
504 add r3, r7, #0x400<<16 @ max fSY
505 lsrs r2, #16+11
506 lsreqs r3, #16+11
507 orrne lr, #(1<<31) @ limG2
508 orrne lr, #(1<<13)
8cfbda97 509 adds r2, r9, #1
17ed0d69 510 orrvs lr, #(1<<16) @ F (31 already done by above)
8cfbda97 511 subs r3, r8, #1
8cfbda97 512
513 ldr r4, [r0, #4*24] @ gteMAC0
514 orrvs lr, #(1<<15)
515
516 adds r3, r4, #1
517 orrvs lr, #(1<<16)
518 orrvs lr, #(1<<31) @ F
519 subs r2, r4, #1
520 orrvs lr, #(1<<15)
521 orrvs lr, #(1<<31) @ F
 @ gteIR0 was derived above as gteMAC0 >> 12 clamped to 0..0x1000, so the
 @ limH test has to look at the shifted value too; negative values become
 @ large unsigned numbers and are caught by the same "hi" test.
 mov r4, r4, asr #12
522 cmp r4, #0x1000
5d8e3bf8 523 orrhi lr, #(1<<12) @ limH
8cfbda97 524
525 str lr, [r0, #4*(32+31)] @ gteFLAG
526
527 pop {r4-r11,pc}
5d8e3bf8 528 .size gteRTPT_neon, .-gteRTPT_neon
529
530
531
054175e9 532@ note: non-std calling convention used
@ Matrix * vector + translation, writing gteMAC|123 (r0 + 4*25) and
@ gteIR|123 (r0 + 4*9). gteFLAG is not touched here.
@ op bit 19 set: the 64 bit sums are shifted right by 12 before narrowing.
@ op bit 10 set: gteIR|123 are clamped to >= 0 (vmax against zero in d31).
533@ r0 = CP2 (d,c) (must preserve)
534@ r1 = op
535@ r4,r5 = VXYZ(v) packed
536@ r6 = &MX11(mx)
537@ r7 = &CV1(cv)
@ trash: r3, r5, CPSR flags, d0-d2, q2-q5, q8-q10, q15
538.global gteMVMVA_part_neon
539gteMVMVA_part_neon:
17ed0d69 540 uxth r5, r5
541 vmov.32 d8[0], r4
542 vmov.32 d8[1], r5 @ VXYZ(v)
054175e9 543 vldmia r6, {d0-d2} @ MXxy/gteR* [16*9]
544 vldmia r7, {d4-d5} @ CVx/gteTR*
17ed0d69 545
17ed0d69 546 vmov.i32 q15, #0
547 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
548 vext.16 d1, d0, d1, #3 @ xx32 -> x321
549 vshll.s32 q3, d5, #12 @ gteTRZ/CV3
550 vshll.s32 q2, d4, #12 @ gteTR|XY/CV12
551
552 vmull.s16 q8, d0, d8
553 vmull.s16 q9, d1, d8
554 vmull.s16 q10, d2, d8
555 vpadd.s32 d16, d16, d17
556 vpadd.s32 d17, d18, d19
557 vpadd.s32 d18, d20, d21
558 vpadal.s32 q2, q8
559 vpadal.s32 q3, q9
560 tst r1, #1<<19
561 beq 0f
562 vshr.s64 q2, q2, #12
563 vshr.s64 q3, q3, #12
5640:
565 vqmovn.s64 d8, q2 @ gteMAC|12
566 vqmovn.s64 d9, q3 @ gteMAC3
567
@ the NEON instructions below do not change the ARM flags, so the beq
@ further down still acts on this tst
568 tst r1, #1<<10
569 add r3, r0, #4*25
570 vqmovn.s32 d10, q4 @ gteIR|123
571 vst1.32 d8, [r3]!
572 vst1.32 d9[0], [r3] @ wb gteMAC|123
573
574 beq 0f
575 vmax.s16 d10, d31
5760:
577 vmovl.s16 q9, d10 @ expand gteIR|123
578 add r3, r0, #4*9
579 vst1.32 d18, [r3]!
580 vst1.32 d19[0], [r3]
054175e9 581 bx lr
582 .size gteMVMVA_part_neon, .-gteMVMVA_part_neon
17ed0d69 583
054175e9 584
585@ get flags after gteMVMVA_part_neon operation
@ Reads gteMAC|123 back from r0 + 4*25, rebuilds gteFLAG from scratch and
@ stores it at r0 + 4*(32+31).
@ lm == 0: each value gets 0x8000 added and is shifted right by 16, so the
@          IR flag is set when the value is outside -0x8000..0x7fff.
@ lm != 0: no adjustment, shift by 15, so the IR flag is set when the value
@          is outside 0..0x7fff.
@ trash: r2, r3, r12, CPSR flags (r4, r5 and lr are saved on the stack)
586.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm
587gteMACtoIR_flags_neon:
588 push {r4,r5,lr}
589 tst r1, r1 @ lm
17ed0d69 590 mov lr, #0 @ gteFLAG
054175e9 591 mov r2, #0
17ed0d69 592 mov r12, #15
593 moveq r2, #0x8000 @ adj
594 moveq r12, #16 @ shift
595
596 add r3, r0, #4*25
597 ldmia r3, {r3-r5} @ gteMAC|123
598
599 do_mac_flags r3, r4, r5
600
601 add r3, r2
602 add r4, r2
603 add r5, r2
604 asrs r3, r12
605 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
606 asrs r4, r12
607 orrne lr, #(1<<31)
608 orrne lr, #(1<<23) @ IR2/limB2
609 asrs r5, r12
610 orrne lr, #(1<<22) @ IR3/limB3
611 str lr, [r0, #4*(32+31)] @ gteFLAG
612
054175e9 613 pop {r4,r5,pc}
614 .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon
17ed0d69 615
616
617
8cfbda97 618@ vim:filetype=armasm