cdrom: implement attenuator/volume properly
[pcsx_rearmed.git] / libpcsxcore / gte_neon.S
CommitLineData
8cfbda97 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
0c2ca3ba 4 * This work is licensed under the terms of GNU GPL version 2 or later.
8cfbda97 5 * See the COPYING file in the top-level directory.
6 */
7
8
@ Scratch buffer shared by the RTP* routines below: 8*8*2 = 128 zeroed
@ bytes. gteRTPT_neon keeps per-vertex intermediates, min/max values and
@ quotients here; gteRTPS_neon only uses the first 8 bytes.
.bss
.align 6                                @ cacheline

scratch:
    .space      8*8*2                   @ same size as 8*8*2/4 zero words

.text
.align 2
19
@ load the address of the scratch buffer
@ in: rd - core register that receives &scratch
@ trash: nothing else
.macro ldr_scratch rd
#ifdef __PIC__
    ldr         \rd, =scratch           @ PIC build: address comes from a literal pool
#else
    movw        \rd, #:lower16:scratch  @ non-PIC: build the absolute address
    movt        \rd, #:upper16:scratch  @ from two 16-bit immediates
#endif
.endm
28
59774ed0 29@ XXX: gteMAC calc shouldn't be saturating, but it is here
30
@ approximate gteMAC|123 flags, ORed into lr (the gteFLAG accumulator)
@ in: rr 123 as gteMAC|123
@ trash: nothing (only the condition flags change)
@ cmp rr, #1 sets V only when rr is 0x80000000 and cmn rr, #1 sets V only
@ when rr is 0x7fffffff, so exactly those two values get flagged.
.macro do_mac_flags rr1 rr2 rr3
    @ gteMAC1
    cmp         \rr1, #1
    orrvs       lr, #(1<<31)|(1<<27)    @ rr1 == 0x80000000
    cmn         \rr1, #1                @ same as adds ...
    orrvs       lr, #(1<<30)            @ rr1 == 0x7fffffff
    @ gteMAC2
    cmp         \rr2, #1
    orrvs       lr, #(1<<31)|(1<<26)    @ rr2 == 0x80000000
    cmn         \rr2, #1
    orrvs       lr, #(1<<29)            @ rr2 == 0x7fffffff
    @ gteMAC3
    cmp         \rr3, #1
    orrvs       lr, #(1<<31)|(1<<25)    @ rr3 == 0x80000000
    cmn         \rr3, #1
    orrvs       lr, #(1<<28)            @ rr3 == 0x7fffffff
.endm
48
@ approximate flags for 3 instances of one gteMACn, ORed into lr
@ in: rr 123 as the 3 gteMACn values,
@     nflags - bits to set when any value is 0x80000000,
@     pflags - bits to set when any value is 0x7fffffff
@ trash: nothing (only the condition flags change)
@ Each chain stops comparing at the first value that sets V (cmpvc/cmnvc),
@ so V survives until the orrvs that follows.
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    cmp         \rr1, #1                @ V set only for 0x80000000
    cmpvc       \rr2, #1
    cmpvc       \rr3, #1
    orrvs       lr, #\nflags
    cmn         \rr1, #1                @ adds ...; V set only for 0x7fffffff
    cmnvc       \rr2, #1
    cmnvc       \rr3, #1
    orrvs       lr, #\pflags
.endm
62
@ get gteIR|123 flags from gteMAC|123, ORed into lr
@ in: rr 123 as gteMAC|123 (must not be r2 or r3)
@ trash: r2,r3
@ (rr + 0x8000) >> 16 is nonzero exactly when rr is outside
@ -0x8000..0x7fff, which is the condition flagged for each value.
.macro do_irs_flags rr1 rr2 rr3
    add         r2, \rr1, #0x8000
    add         r3, \rr2, #0x8000
    lsrs        r2, #16                 @ gteMAC1 out of s16 range?
    orrne       lr, #(1<<31)|(1<<24)    @ IR1/limB1
    lsrs        r3, #16                 @ gteMAC2 out of s16 range?
    add         r2, \rr3, #0x8000       @ add leaves the flags from lsrs alone
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)            @ IR2/limB2
    lsrs        r2, #16                 @ gteMAC3 out of s16 range?
    orrne       lr, #(1<<22)            @ IR3/limB3
.endm
78
79
80/*
5d8e3bf8 81 * RTPS/RTPT register map:
82 *
8cfbda97 83 * q | d | c code / phase 1 phase 2 scratch
84 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
85 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
86 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
87 * 3 * gteIR1-3 = gteIR1-3 /
88 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
89 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
90 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
91 * 7 0 gteDQB [s64] max gteMAC|12
92 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
93 * 9 * / gteMAC3 max gteIR|123
94 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
95 * 11 0 quotient 3
96 * 6 12 gteH (adj. for cmp)
97 * 13 gteH (float for div)
98 * ... <scratch>
99 * 15 30 0
100 * 31 0
101 */
8cfbda97 102
@ load gteR*, gteTR* and gteH into the NEON registers listed in the
@ RTPS/RTPT register map, and clear q15
@ in: r0 - context
@ out: d0-d2 - gteR rows, q2 - gteTR|XY << 12 [s64], q3 - gteTRZ << 12 [s64],
@      q6 - gteH widened to 32 bit, q15 - 0
@ trash: r3 (d11 is used as a temporary for gteH)
.macro rtpx_preload
    add         r3, r0, #4*32
    vldmia      r3, {d0-d2}             @ gteR* [16*9]
    vmov.i32    q15, #0
    add         r3, r0, #4*(32+5)
    vldmia      r3, {d4-d5}             @ gteTR*
    vext.16     d2, d1, d2, #2          @ xxx3 -> x321
    vext.16     d1, d0, d1, #3          @ xx32 -> x321
    add         r3, r0, #4*(32+26)
    vld1.32     d11[0], [r3]            @ gteH
    vshll.s32   q3, d5, #12             @ gteTRZ
    vshll.s32   q2, d4, #12             @ gteTR|XY
    vmovl.s16   q6, d11                 @ gteH
.endm
8cfbda97 120
@ do RTP* gteMAC* calculation for one vertex
@ in: gteR*, gteTR* as in map, d8 - VXYZ (lane 3 must be 0), r12 - 0
@ out: d8,d9 - gteMAC|123 (d9[1] = 0), d10 - gteIR|123
@ trash: d16-d21
.macro rtpx_mac
    @ products of each matrix row with the vertex
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    @ sum the products of each row into one 64-bit value
    vpaddl.s32  q8, q8
    vpaddl.s32  q9, q9
    vpaddl.s32  q10, q10
    vadd.s64    d16, d17                @ d16=d0.16[2]*d8.16[2], as
    vadd.s64    d18, d19                @ d8[3]==0, so won't affect
    vadd.s64    d20, d21                @ QC
    @ add the pre-shifted translation (gteTR* << 12)
    vadd.s64    d16, d4
    vadd.s64    d18, d5
    vadd.s64    d20, d6
    @ shift back by 12 and saturate to 32 bits
    vqshrn.s64  d8, q8, #12             @ gteMAC1
    vqshrn.s64  d18, q9, #12            @ gteMAC2
    vqshrn.s64  d9, q10, #12            @ gteMAC3
    vsli.u64    d8, d18, #32            @ gteMAC|12
    vmov.32     d9[1], r12              @ clear the lane above gteMAC3
    vqmovn.s32  d10, q4                 @ gteIR|123; losing 2 cycles?
.endm
145
@ gteRTPS_neon: RTPS for the single vertex VXYZ(0)
@ in: r0 = CP2 (d,c)
@ out (all written through r0): gteMAC0-3, gteIR0-3, gteSZ*, fS|XY and gteFLAG
@ trash: r1-r3, r12 and NEON registers, d8-d15 among them
@ NOTE(review): d8-d15 are overwritten without being saved; AAPCS treats
@ them as callee-saved, so this presumably relies on callers not keeping
@ live values there - confirm against the callers.
.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push        {r4-r6,lr}

@    fmrx        r4, fpscr      @ vmrs? at least 40 cycle hit
    ldr_scratch r1
    mov         r12, #0

    vldmia      r0, {d8}                @ VXYZ(0)
    rtpx_preload

    @ --- gteMAC|123 (open-coded variant of rtpx_mac) ---
@    rtpx_mac                   @ slower here, faster in RTPT?
    vmov.16     d8[3], r12              @ kill unused upper vector
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9                  @ d6, d18 is slow?
    vqshrn.s64  d8, q2, #12             @ gteMAC|12
    vqshrn.s64  d9, q3, #12             @ gteMAC3

    add         r3, r0, #4*25
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]             @ wb gteMAC|123
    vqmovn.s32  d10, q4                 @ gteIR|123

    @ --- gteSZ fifo, gteIR writeback, divisor preparation ---
    add         r3, r0, #4*17           @ gteSZ*
    vldmia      r3, {q7}                @ d14,d15 gteSZ|123x
    vmov.i32    d28, #0xffff            @ 0xffff[32]
    vmax.s32    d11, d9, d31
    vshr.s32    d16, d12, #1            @ | gteH/2 (adjust for cmp)
    vmov.i32    d26, #1
    vmin.u32    d11, d28                @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16   q9, d10                 @ || expand gteIR|123
    vshl.u32    d13, d12, #16           @ | preparing gteH
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]

    vsli.u64    d15, d11, #32           @ new gteSZ|0123 in q7
    vclt.u32    d16, d16, d11           @ gteH/2 < fSZ3?

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]                @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]                @ || gteDQ|AB

    vand        d11, d16
    vmovl.s32   q2, d4                  @ || gteOF|XY [64]
    vmax.u32    d11, d26                @ make divisor 1 if not
    vmovl.s32   q3, d6                  @ || gteDQ|AB [64]
    add         r3, r0, #4*16           @ | gteSZ*
    vstmia      r3, {q7}                @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13               @ gteH (float for div)
    vcvt.f32.u32 d11, d11               @ divisor

    @ --- quotient ---
    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32    s22, s26, s22

    vmov.f32    d20, #0.5
    vadd.f32    d11, d20
    vcvt.u32.f32 d11, d11               @ quotient

    @ while NEON's busy we calculate some flags on ARM
    add         r3, r0, #4*25
    mov         lr, #0                  @ gteFLAG
    ldmia       r3, {r4-r6}             @ gteMAC|123

    vst1.32     d11, [r1, :64]          @ wb quotient for flags (pre-limE)
    vqshl.u32   d11, #15

    do_mac_flags r4, r5, r6

    vshr.u32    d11, #15                @ quotient (limE)

    do_irs_flags r4, r5, r6

    @ --- screen coordinates and gteMAC0 ---
    vmlal.s32   q2, d18, d11[0]         @ gteOF|XY + gteIR|12 * quotient
    add         r3, r0, #4*13
    vld1.32     d16, [r3]               @ || load fS|XY12, new 01
    vqmovn.s64  d18, q2                 @ saturate to 32
    vmull.s32   q10, d6, d11[0]         @ | d20 = gteDQA * quotient
    vqshl.s32   d19, d18, #5            @ 11bit precision

    ldr         r4, [r1]                @ quotient
    movs        r3, r6, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)            @ fSZ (limD)

    vst1.32     d18, [r1, :64]          @ writeback fS|XY2 before limG

    vshr.s32    d18, d19, #16+5         @ can't vqshrn because of insn
    vadd.s64    d20, d7                 @ | gteDQB + gteDQA * quotient
    vmovn.s32   d18, q9                 @ fS|XY2 [s16]

    vqmovn.s64  d20, q10                @ | gteMAC0
    add         r3, r0, #4*12
    vst1.32     d16, [r3]!              @ writeback fS|XY01
    vst1.32     d18[0], [r3]            @ ...2
    add         r3, r0, #4*24
    vshr.s32    d21, d20, #12
    vst1.32     d20[0], [r3]            @ gteMAC0

    movs        r4, r4, lsr #17         @ pre-limE quotient needs more than 17 bits?
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<17)            @ limE

    @ --- gteIR0: (gteMAC0 >> 12) clamped to 0..0x1000 ---
    vmax.s32    d21, d31
    vmov.i32    d22, #0x1000
    vmin.s32    d21, d22
    add         r3, r0, #4*8
    vst1.16     d21[0], [r3]            @ gteIR0

    @ --- remaining flags on the ARM side ---
    ldmia       r1, {r4,r5}             @ fS|XY2 before limG, after 11bit sat
    add         r2, r4, #0x400<<16
    add         r3, r5, #0x400<<16
    lsrs        r2, #16+11
    orrne       lr, #(1<<14)            @ limG1
    orrne       lr, #(1<<31)
    lsrs        r3, #16+11
    orrne       lr, #(1<<13)            @ limG2
    orrne       lr, #(1<<31)
    adds        r2, r4, #1              @ V only for 0x7fffffff
    addvcs      r3, r5, #1
    orrvs       lr, #(1<<16)            @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1              @ V only for 0x80000000
    subvcs      r3, r5, #1
    orrvs       lr, #(1<<31)

    ldr         r4, [r0, #4*24]         @ gteMAC0
    orrvs       lr, #(1<<15)            @ still testing V from the subs pair above

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)            @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)            @ F
    orrvs       lr, #(1<<31)
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)            @ limH

    str         lr, [r0, #4*(32+31)]    @ gteFLAG

    pop         {r4-r6,pc}
    .size       gteRTPS_neon, .-gteRTPS_neon
297
298
299
@ gteRTPT_neon: RTPT, i.e. the RTP* calculation for the 3 vertices that
@ start at r0 (VXYZ(0), 8 bytes apart)
@ in: r0 = CP2 (d,c)
@ out (all written through r0): gteMAC0-3, gteIR0-3, gteSZ*, fS|XY and gteFLAG
@ trash: r1-r3, r12 and NEON registers, d8-d15 among them
@ NOTE(review): d8-d15 are overwritten without being saved; AAPCS treats
@ them as callee-saved, so this presumably relies on callers not keeping
@ live values there - confirm against the callers.
@
@ scratch usage (byte offsets from the start of scratch):
@   0,16,32: per vertex {gteMAC3, 0 | gteIR|123} from phase 1;
@            0..23 is later reused for min fS|XY, max fS|XY and the
@            packed in-pair min/max
@   48: min/max gteMAC|12    64: min/max gteIR    80: quotients (pre-limE)
.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push        {r4-r11,lr}

    ldr_scratch r1
    mov         r12, #0

    rtpx_preload

    @ - phase1 - gteMAC/gteIR for each vertex, tracking min/max gteMAC|12
    vmov.i32    d22, #0x7fffffff
    vmov.i32    d23, #0x80000000
    mov         r3, #3                  @ counter
    mov         r2, r0                  @ VXYZ(0)
0:
    vldmia      r2!, {d8}               @ VXYZ(v)
    vmov.16     d8[3], r12              @ kill unused upper vector

    rtpx_mac
    vmin.s32    d22, d8                 @ min gteMAC|12
    vmax.s32    d23, d8                 @ max gteMAC|12
    subs        r3, #1
    vst1.32     {d9,d10}, [r1, :128]!
    bgt         0b

    vst1.32     {d22,d23}, [r1, :128]!  @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub         r1, r1, #8*2*4          @ back to the start of scratch
    vldmia      r1, {d0-d3}             @ note: d4,d5 is for gteOF|XY

    vmov        d20, d0                 @ gteMAC3 v=0
    vmin.s16    d24, d1, d3             @ | find min IR
    vshr.s32    d22, d12, #1            @ || gteH/2 (adjust for cmp)
    vmax.s16    d25, d1, d3             @ | .. also max, for flag gen
    vsli.u64    d20, d2, #32            @ gteMAC3 v=1
    vmov        d21, d9                 @ ... v=2

    vmov.i32    q14, #0xffff            @ 0xffff[32]
    vmax.s32    q10, q15
    vmov.i32    q13, #1
    vdup.32     q11, d22[0]             @ gteH/2
    vmin.u32    q10, q14                @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16    d24, d10                @ | find min/max IR
    vmax.s16    d25, d10                @ |

    add         r3, r0, #4*19           @ ||
    vld1.32     d14[0], [r3]            @ || gteSZ3

    vclt.u32    q11, q11, q10           @ gteH/2 < fSZ(v)?
    add         r3, r0, #4*17
    vst1.32     d20, [r3]!              @ | writeback fSZ(v)
    vand        q11, q10, q11
    vst1.32     d21[0], [r3]            @ |
    vmax.u32    q10, q11, q13           @ make divisor 1 if not
    add         r3, r1, #8*8
    vstmia      r3, {q12}               @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32    d13, d12, #16           @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add         r2, r1, #8*2*3
    mov         lr, #0                  @ gteFLAG
    ldmia       r2, {r4-r7}             @ min/max gteMAC|12
    subs        r2, r4, #1              @ min gteMAC1 == 0x80000000?
    orrvs       lr, #(1<<31)|(1<<27)
    subs        r3, r5, #1              @ min gteMAC2 == 0x80000000?
    orrvs       lr, #(1<<31)|(1<<26)
    adds        r2, r6, #1              @ max gteMAC1 == 0x7fffffff?
    orrvs       lr, #(1<<30)
    adds        r3, r7, #1              @ max gteMAC2 == 0x7fffffff?
    orrvs       lr, #(1<<29)
    ldr         r4, [r1, #0]            @ gteMAC3 v=0
    ldr         r5, [r1, #8*2]          @ ... v=1
    ldr         r6, [r1, #8*4]          @ ... v=2

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]                @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]                @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32  q11, q10                @ inv
    vmovl.s32   q2, d4                  @ || gteOF|XY [64]
    vmovl.s32   q3, d6                  @ || gteDQ|AB [64]
    vrecps.f32  q12, q10, q11           @ step
    vcvt.f32.u32 d13, d13               @ | gteH (float for div)
    vmov.f32    q8, #0.5                @ |||
    vmul.f32    q11, q12, q11           @ better inv
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]            @ gteSZ0 = gteSZ3
    vdup.32     q13, d13[0]             @ |
@   vrecps.f32  q12, q10, q11           @ step
@   vmul.f32    q11, q12, q11           @ better inv
    vmul.f32    q10, q13, q11           @ result
.else
    vmov.f32    q8, #0.5                @ |||
    vmovl.s32   q2, d4                  @ || gteOF|XY [64]
    vmovl.s32   q3, d6                  @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13               @ | gteH (float for div)
    vdup.32     q13, d13[0]             @ |
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]            @ gteSZ0 = gteSZ3

    vpush       {q0}
    vmov        q0, q10                 @ to test against C code
    vdiv.f32    s0, s26, s0
    vdiv.f32    s1, s26, s1
    vdiv.f32    s2, s26, s2
    vmov        q10, q0
    vpop        {q0}
.endif

    @ gteMAC3 of the 3 vertices: 0x80000000 reports (1<<31)|(1<<25) and
    @ 0x7fffffff reports (1<<28), the same bits do_mac_flags uses for its
    @ third register. The positive case used to pass (1<<27) here, which
    @ is the bit the code above sets for min gteMAC1.
    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
    orr         r7, r4, r5
    add         r4, r1, #8*8
    orr         r3, r7, r6              @ OR of all 3 gteMAC3 values
    ldmia       r4, {r7,r8,r10,r11}     @ min/max IR

    movs        r3, r3, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)            @ fSZ (limD)

    vadd.f32    q10, q8                 @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16   q9, d1                  @ expand gteIR|12 v=0
    vmovl.s16   q10, d3                 @ expand gteIR|12 v=1
    add         r6, r1, #8*10
    vstmia      r6, {q8}                @ wb quotients for flags (pre-limE)
    vqshl.u32   q8, #15
    vmovl.s16   q11, d10                @ expand gteIR|12 v=2
    vshr.u32    q8, #15                 @ quotients (limE)
    vdup.32     d24, d16[0]
    vdup.32     d25, d16[1]
    vdup.32     d26, d17[0]             @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    mov         r4, #0x10000
    cmp         r7, #1<<16
    cmnvc       r10, #1<<16
    orrvs       lr, #(1<<31)
    orrvs       lr, #(1<<23)            @ IR2/limB2
    rsbs        r2, r4, r7, lsl #16
    cmnvc       r4, r10, lsl #16
    orrvs       lr, #(1<<31)|(1<<24)    @ IR1/limB1
    rsbs        r2, r4, r8, lsl #16
    cmnvc       r4, r11, lsl #16
    orrvs       lr, #(1<<22)            @ IR3/limB3

    vmull.s32   q9, d18, d24            @ gteIR|12 * quotient v=0
    vmull.s32   q10, d20, d25           @ ... v=1
    vmull.s32   q11, d22, d26           @ ... v=2
    vadd.s64    q9, q2                  @ gteOF|XY + gteIR|12 * quotient
    vadd.s64    q10, q2                 @ ... v=1
    vadd.s64    q11, q2                 @ ... v=2
    vqmovn.s64  d18, q9                 @ saturate to 32 v=0
    vqmovn.s64  d19, q10                @ ... v=1
    vqmovn.s64  d20, q11                @ ... v=2
    vmin.s32    d14, d18, d19           @ || find min/max fS|XY(v) [32]
    vmax.s32    d15, d18, d19           @ || for flags
    vmin.s32    d14, d20
    vmax.s32    d15, d20
    vqshl.s32   q11, q9, #5             @ 11bit precision, v=0,1
    vqshl.s32   d24, d20, #5            @ ... v=2
    vmull.s32   q13, d6, d17            @ | gteDQA * quotient v=2
    vpmin.s32   d16, d14, d31           @ || also find min/max in pair
    vpmax.s32   d17, d15, d31           @ ||
    vshr.s32    q11, #16+5              @ can't vqshrn because of insn
    vshr.s32    d24, #16+5              @ encoding doesn't allow 21 :(
    vsli.u64    d16, d17, #32           @ || pack in-pair min/max
    vadd.s64    d26, d7                 @ | gteDQB + gteDQA * quotient
    vmovn.s32   d12, q11                @ fS|XY(v) [s16] v=0,1
    vmovn.s32   d13, q12                @ 3
    vstmia      r1, {d14-d16}           @ || other cacheline than quotients
    add         r3, r0, #4*12
    vst1.32     d12, [r3]!              @ writeback fS|XY v=0,1
    vst1.32     d13[0], [r3]

    vqmovn.s64  d26, q13                @ | gteMAC0
    @ NOTE(review): this zero-extends gteIR, while gteRTPS_neon and
    @ gteMVMVA_part_neon widen it with vmovl.s16 before the same stores -
    @ confirm that readers only use the low 16 bits.
    vmovl.u16   q5, d10                 @ expand gteIR|123 v=2

    vmov.i32    d13, #0x1000
    vshr.s32    d12, d26, #12

    add         r3, r0, #4*24
    vst1.32     d26[0], [r3]!           @ gteMAC0
    vmax.s32    d12, d30
    vst1.32     d8, [r3]!               @ gteMAC123 (last iteration)
    vst1.32     d9[0], [r3]

    vmin.s32    d12, d13                @ | gteIR0

    ldmia       r6, {r4-r6}             @ quotients
    orr         r4, r5
    orr         r4, r6
    add         r3, r0, #4*8
    movs        r4, r4, lsr #17         @ any pre-limE quotient above 17 bits?

    vst1.32     d12[0], [r3]!           @ gteIR0
    vst1.32     d10, [r3]!              @ gteIR12
    vst1.32     d11[0], [r3]            @ ..3

    @ ~23 cycles
    orrne       lr, #(1<<31)            @ limE
    orrne       lr, #(1<<17)            @ limE
    ldmia       r1, {r4-r9}             @ min fS|XY, max fS|XY, in-pair min, in-pair max
    add         r2, r4, #0x400<<16      @ min fSX
    add         r3, r6, #0x400<<16      @ max fSX
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)            @ limG1
    orrne       lr, #(1<<14)
    add         r2, r5, #0x400<<16      @ min fSY
    add         r3, r7, #0x400<<16      @ max fSY
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)            @ limG2
    orrne       lr, #(1<<13)
    adds        r2, r9, #1              @ in-pair max == 0x7fffffff?
    orrvs       lr, #(1<<16)            @ F (31 already done by above)
    subs        r3, r8, #1              @ in-pair min == 0x80000000?

    ldr         r4, [r0, #4*24]         @ gteMAC0
    orrvs       lr, #(1<<15)            @ still testing V from the subs above

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)
    orrvs       lr, #(1<<31)            @ F
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)
    orrvs       lr, #(1<<31)            @ F
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)            @ limH

    str         lr, [r0, #4*(32+31)]    @ gteFLAG

    pop         {r4-r11,pc}
    .size       gteRTPT_neon, .-gteRTPT_neon
538
539
540
@ gteMVMVA_part_neon: matrix * vector + translation part of MVMVA;
@ writes gteMAC|123 and gteIR|123 but not gteFLAG
@ note: non-std calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r1 = op (bit 19 selects the >> 12, bit 10 clamps gteIR at 0)
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
@ trash: r3, r5, d0-d2, q2-q3, d8-d10, q8-q10, q15
.global gteMVMVA_part_neon
gteMVMVA_part_neon:
    uxth        r5, r5                  @ drop the upper half so lane 3 is 0
    vmov.32     d8[0], r4
    vmov.32     d8[1], r5               @ VXYZ(v)
    vldmia      r6, {d0-d2}             @ MXxy/gteR* [16*9]
    vldmia      r7, {d4-d5}             @ CVx/gteTR*

    vmov.i32    q15, #0
    vext.16     d2, d1, d2, #2          @ xxx3 -> x321
    vext.16     d1, d0, d1, #3          @ xx32 -> x321
    vshll.s32   q3, d5, #12             @ gteTRZ/CV3
    vshll.s32   q2, d4, #12             @ gteTR|XY/CV12

    @ row * vector products, accumulated onto the shifted translation
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9
    tst         r1, #1<<19
    beq         0f                      @ bit 19 clear: no shift
    vshr.s64    q2, q2, #12
    vshr.s64    q3, q3, #12
0:
    vqmovn.s64  d8, q2                  @ gteMAC|12
    vqmovn.s64  d9, q3                  @ gteMAC3

    tst         r1, #1<<10              @ tested by the beq below; NEON ops keep the flags
    add         r3, r0, #4*25
    vqmovn.s32  d10, q4                 @ gteIR|123
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]             @ wb gteMAC|123

    beq         0f
    vmax.s16    d10, d31                @ bit 10 set: negative gteIR becomes 0
0:
    vmovl.s16   q9, d10                 @ expand gteIR|123
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]
    bx          lr
    .size       gteMVMVA_part_neon, .-gteMVMVA_part_neon
17ed0d69 592
054175e9 593
@ get flags after gteMVMVA_part_neon operation
@ in: r0 = CP2 (d,c), r1 = lm
@ out: gteFLAG is overwritten with the gteMAC|123 and gteIR|123 flags
@ trash: r2, r3, r12
@ lm == 0: flag values outside -0x8000..0x7fff ((v + 0x8000) asr 16 != 0)
@ lm != 0: flag values outside 0..0x7fff (v asr 15 != 0)
.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm
gteMACtoIR_flags_neon:
    push        {r4,r5,lr}
    tst         r1, r1                  @ lm
    moveq       r2, #0x8000             @ adj
    movne       r2, #0
    moveq       r12, #16                @ shift
    movne       r12, #15
    mov         lr, #0                  @ gteFLAG

    add         r3, r0, #4*25
    ldmia       r3, {r3-r5}             @ gteMAC|123

    do_mac_flags r3, r4, r5

    @ gteIR1
    add         r3, r2
    asrs        r3, r12
    orrne       lr, #(1<<31)|(1<<24)    @ IR1/limB1
    @ gteIR2
    add         r4, r2
    asrs        r4, r12
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)            @ IR2/limB2
    @ gteIR3
    add         r5, r2
    asrs        r5, r12
    orrne       lr, #(1<<22)            @ IR3/limB3
    str         lr, [r0, #4*(32+31)]    @ gteFLAG

    pop         {r4,r5,pc}
    .size       gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon
17ed0d69 624
625
626
8cfbda97 627@ vim:filetype=armasm