psxcounters: avoid update on each hsync
[pcsx_rearmed.git] / libpcsxcore / gte_neon.s
CommitLineData
8cfbda97 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
4 * This work is licensed under the terms of any of these licenses
5 * (at your option):
6 * - GNU GPL, version 2 or later.
7 * - GNU LGPL, version 2.1 or later.
8 * See the COPYING file in the top-level directory.
9 */
10
11
@ Working storage for the NEON routines below: intermediate values are
@ spilled here so the ARM core can read them back for flag calculation.
12.bss
13.align 6 @ cacheline (power-of-two alignment on ARM gas, i.e. 64 bytes - TODO confirm for the toolchain in use)
14
15scratch:
@ 8*8*2/4 = 32 zero-initialised words (128 bytes)
16.rept 8*8*2/4
17 .word 0
18.endr
19
20.text
21.align 2
22
59774ed0 23@ XXX: gteMAC calc shouldn't be saturating, but it is here
24
5d8e3bf8 25@ approximate gteMAC|123 flags
26@ in: rr 123 as gteMAC|123, lr - flag accumulator
@ out: lr - accumulator with the overflow bits ORed in
27@ trash: nothing besides the condition flags (lr is updated, see above)
@
@ "cmp rr, #1" computes rr-1, so V is set only when rr == 0x80000000;
@ "cmn rr, #1" computes rr+1, so V is set only when rr == 0x7fffffff.
@ The test therefore fires only for values sitting exactly at the s32 limits.
28.macro do_mac_flags rr1 rr2 rr3
29 cmp \rr1, #1
30 orrvs lr, #(1<<31)|(1<<27)
31 cmp \rr2, #1
32 orrvs lr, #(1<<31)|(1<<26)
33 cmp \rr3, #1
34 orrvs lr, #(1<<31)|(1<<25)
35 cmn \rr1, #1 @ same as adds ...
36 orrvs lr, #(1<<30)
37 cmn \rr2, #1
38 orrvs lr, #(1<<29)
39 cmn \rr3, #1
40 orrvs lr, #(1<<28)
41.endm
42
43@ approximate 3x gteMACn flags
44@ in: rr 123 as 3 instances gteMACn, *flags, lr - flag accumulator
@ nflags - immediate ORed into lr if any rr equals 0x80000000
@ pflags - immediate ORed into lr if any rr equals 0x7fffffff
@ out: lr - accumulator with the selected bits ORed in
45@ trash: nothing besides the condition flags
@
@ The cmpvc/cmnvc chain stops comparing as soon as V gets set, so V is
@ still set at the orrvs if any one of the three values overflowed.
46.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
47 cmp \rr1, #1
48 cmpvc \rr2, #1
49 cmpvc \rr3, #1
50 orrvs lr, #\nflags
51 cmn \rr1, #1 @ adds ...
52 cmnvc \rr2, #1
53 cmnvc \rr3, #1
54 orrvs lr, #\pflags
55.endm
56
17ed0d69 57@ get gteIR|123 flags from gteMAC|123
58@ in: rr 123 as gteMAC|123, lr - flag accumulator
@ out: lr - accumulator with the IR1/IR2/IR3 limit bits ORed in
5d8e3bf8 59@ trash: r2,r3 (and the condition flags)
@
@ (rr + 0x8000) >> 16 is zero only when rr lies in -0x8000..0x7fff,
@ so a non-zero result means rr does not fit a signed 16-bit value.
60.macro do_irs_flags rr1 rr2 rr3
17ed0d69 61 add r2, \rr1, #0x8000
62 add r3, \rr2, #0x8000
63 lsrs r2, #16
64 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
65 lsrs r3, #16
66 add r2, \rr3, #0x8000 @ plain add: leaves the flags of the lsrs above intact
67 orrne lr, #(1<<31)
68 orrne lr, #(1<<23) @ IR2/limB2
69 lsrs r2, #16
70 orrne lr, #(1<<22) @ IR3/limB3
8cfbda97 71.endm
72
73
74/*
5d8e3bf8 75 * RTPS/RTPT register map:
76 *
8cfbda97 77 * q | d | c code / phase 1 phase 2 scratch
78 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
79 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
80 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
81 * 3 * gteIR1-3 = gteIR1-3 /
82 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
83 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
84 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
85 * 7 0 gteDQB [s64] max gteMAC|12
86 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
87 * 9 * / gteMAC3 max gteIR|123
88 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
89 * 11 0 quotient 3
90 * 6 12 gteH (adj. for cmp)
91 * 13 gteH (float for div)
92 * ... <scratch>
93 * 15 30 0
94 * 31 0
95 */
8cfbda97 96
5d8e3bf8 97@ load gteR*, gteTR* and gteH (see map above), clear q15
98@ in: r0 - context
@ out: d0-d2 - rotation matrix rows, rearranged one row per d register
@ q2,q3 - translation vector, sign-extended to 64 bit and shifted left by 12
@ q6 - gteH halfword sign-extended to 32 bit (taken from d11)
@ q15 - zero
99@ trash: r3 (d11 is also overwritten)
100.macro rtpx_preload
8cfbda97 101 add r3, r0, #4*32
102 vldmia r3, {d0-d2} @ gteR* [16*9]
4cc3050c 103 vmov.i32 q15, #0
8cfbda97 104 add r3, r0, #4*(32+5)
105 vldmia r3, {d4-d5} @ gteTR*
4cc3050c 106 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
107 vext.16 d1, d0, d1, #3 @ xx32 -> x321
8cfbda97 108 add r3, r0, #4*(32+26)
109 vld1.32 d11[0], [r3] @ gteH
5d8e3bf8 110 vshll.s32 q3, d5, #12 @ gteTRZ
111 vshll.s32 q2, d4, #12 @ gteTR|XY
8cfbda97 112 vmovl.s16 q6, d11 @ gteH
5d8e3bf8 113.endm
8cfbda97 114
5d8e3bf8 115@ do RTP* gteMAC* calculation
@ Per matrix row: multiply by the vector, sum the products, add the
@ pre-shifted translation, then shift right by 12 with saturation to s32.
116@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
117@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
118@ trash: d16-d21
119.macro rtpx_mac
8cfbda97 120 vmull.s16 q8, d0, d8
121 vmull.s16 q9, d1, d8
122 vmull.s16 q10, d2, d8
123 vpaddl.s32 q8, q8
124 vpaddl.s32 q9, q9
125 vpaddl.s32 q10, q10
126 vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
127 vadd.s64 d18, d19 @ d8[3]==0, so won't affect
128 vadd.s64 d20, d21 @ QC
129 vadd.s64 d16, d4
130 vadd.s64 d18, d5
131 vadd.s64 d20, d6
132 vqshrn.s64 d8, q8, #12 @ gteMAC1
133 vqshrn.s64 d18, q9, #12 @ gteMAC2
134 vqshrn.s64 d9, q10, #12 @ gteMAC3
135 vsli.u64 d8, d18, #32 @ gteMAC|12
136 vmov.32 d9[1], r12 @ zero the unused upper word next to gteMAC3
5d8e3bf8 137 vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
138.endm
139
@ gteRTPS_neon: transform the single vertex VXYZ(0) and write the results
@ (gteMAC0-3, gteIR0-3, the gteSZ fifo, fS|XY and gteFLAG) back to the context.
@ in: r0 - CP2 context; per the offsets used below the second register
@ bank starts at r0 + 4*32 and gteFLAG sits at r0 + 4*(32+31)
@ out: nothing in registers
@ trash: r1-r3, r12, NEON registers (r4-r6 and lr are saved and restored)
@ NOTE(review): d8-d15 are modified without being saved; AAPCS treats them
@ as callee-saved - confirm the callers tolerate this.
140.global gteRTPS_neon @ r0=CP2 (d,c),
141gteRTPS_neon:
17ed0d69 142 push {r4-r6,lr}
5d8e3bf8 143
144@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
145 movw r1, #:lower16:scratch
146 movt r1, #:upper16:scratch
147 mov r12, #0
148
4cc3050c 149 vldmia r0, {d8} @ VXYZ(0)
5d8e3bf8 150 rtpx_preload
151
4cc3050c 152@ rtpx_mac @ slower here, faster in RTPT?
5d8e3bf8 153 vmov.16 d8[3], r12 @ kill unused upper vector
4cc3050c 154 vmull.s16 q8, d0, d8
155 vmull.s16 q9, d1, d8
156 vmull.s16 q10, d2, d8
157 vpadd.s32 d16, d16, d17
158 vpadd.s32 d17, d18, d19
159 vpadd.s32 d18, d20, d21
160 vpadal.s32 q2, q8
161 vpadal.s32 q3, q9 @ d6, d18 is slow?
162 vqshrn.s64 d8, q2, #12 @ gteMAC|12
163 vqshrn.s64 d9, q3, #12 @ gteMAC3
5d8e3bf8 164
165 add r3, r0, #4*25
166 vst1.32 d8, [r3]!
167 vst1.32 d9[0], [r3] @ wb gteMAC|123
4cc3050c 168 vqmovn.s32 d10, q4 @ gteIR|123
5d8e3bf8 169
170 add r3, r0, #4*17 @ gteSZ*
171 vldmia r3, {q7} @ d14,d15 gteSZ|123x
172 vmov.i32 d28, #0xffff @ 0xffff[32]
173 vmax.s32 d11, d9, d31 @ d31 is zero (q15 cleared by rtpx_preload)
174 vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp)
175 vmov.i32 d26, #1
176 vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3
4cc3050c 177 vmovl.s16 q9, d10 @ || expand gteIR|123
5d8e3bf8 178 vshl.u32 d13, d12, #16 @ | preparing gteH
179 add r3, r0, #4*9
180 vst1.32 d18, [r3]!
181 vst1.32 d19[0], [r3]
182
183 vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7
184 vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3?
185
186 add r3, r0, #4*(32+24)
187 vld1.32 d4, [r3] @ || gteOF|XY
188 add r3, r0, #4*(32+27)
189 vld1.32 d6, [r3] @ || gteDQ|AB
190
191 vand d11, d16
192 vmovl.s32 q2, d4 @ || gteOF|XY [64]
193 vmax.u32 d11, d26 @ make divisor 1 if not
194 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
195 add r3, r0, #4*16 @ | gteSZ*
196 vstmia r3, {q7} @ | d14,d15 gteSZ|123x
197
198 vcvt.f32.u32 d13, d13 @ gteH (float for div)
199 vcvt.f32.u32 d11, d11 @ divisor
200
201 @ divide.. it's not worth messing with reciprocals here
202 @ just for 1 value, let's just use VFP divider here
203 vdiv.f32 s22, s26, s22
204
205 vcvt.u32.f32 d11, d11 @ quotient
206
207 @ while NEON's busy we calculate some flags on ARM
208 add r3, r0, #4*25
209 mov lr, #0 @ gteFLAG
210 ldmia r3, {r4-r6} @ gteMAC|123
211
4cc3050c 212 vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE)
5d8e3bf8 213 vqshl.u32 d11, #15
214
215 do_mac_flags r4, r5, r6
216
217 vshr.u32 d11, #15 @ quotient (limE)
5d8e3bf8 218
5d8e3bf8 219 do_irs_flags r4, r5, r6
220
221 vmlal.s32 q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient
222 add r3, r0, #4*13
223 vld1.32 d16, [r3] @ || load fS|XY12, new 01
224 vqmovn.s64 d18, q2 @ saturate to 32
225 vmull.s32 q10, d6, d11[0]@ | d20 = gteDQA * quotient
226 vqshl.s32 d19, d18, #5 @ 11bit precision
227
228 ldr r4, [r1] @ quotient
5d8e3bf8 229 movs r3, r6, lsr #16
230 orrne lr, #(1<<31)
231 orrne lr, #(1<<18) @ fSZ (limD)
232
17ed0d69 233 vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG
234
5d8e3bf8 235 vshr.s32 d18, d19, #16+5@ can't vqshrn because of insn
236 vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient
237 vmovn.s32 d18, q9 @ fS|XY2 [s16]
238
0e828e88 239 vqmovn.s64 d20, q10 @ | gteMAC0
5d8e3bf8 240 add r3, r0, #4*12
241 vst1.32 d16, [r3]! @ writeback fS|XY01
242 vst1.32 d18[0], [r3] @ ...2
243 add r3, r0, #4*24
0e828e88 244 vshr.s32 d21, d20, #12
5d8e3bf8 245 vst1.32 d20[0], [r3] @ gteMAC0
246
17ed0d69 247 movs r4, r4, lsr #17 @ non-zero if the pre-limE quotient exceeds 0x1ffff
248 orrne lr, #(1<<31)
249 orrne lr, #(1<<17) @ limE
250
0e828e88 251 vmax.s32 d21, d31
5d8e3bf8 252 vmov.i32 d22, #0x1000
253 vmin.s32 d21, d22
254 add r3, r0, #4*8
255 vst1.16 d21[0], [r3] @ gteIR0
256
17ed0d69 257 ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat
@ (x + (0x400<<16)) >> 27 is zero only while x>>16 lies in -0x400..0x3ff
258 add r2, r4, #0x400<<16
259 add r3, r5, #0x400<<16
260 lsrs r2, #16+11
261 orrne lr, #(1<<14) @ limG1
262 orrne lr, #(1<<31)
263 lsrs r3, #16+11
264 orrne lr, #(1<<13) @ limG2
5d8e3bf8 265 orrne lr, #(1<<31)
@ V is set by the +1/-1 below only for values exactly at the s32 limits
5d8e3bf8 266 adds r2, r4, #1
267 addvcs r3, r5, #1
268 orrvs lr, #(1<<16) @ F
269 orrvs lr, #(1<<31)
270 subs r2, r4, #1
271 subvcs r3, r5, #1
272 orrvs lr, #(1<<31)
273
274 ldr r4, [r0, #4*24] @ gteMAC0
275 orrvs lr, #(1<<15) @ still uses V from the subs/subvcs pair above
276
277 adds r3, r4, #1
278 orrvs lr, #(1<<16) @ F
279 orrvs lr, #(1<<31)
280 subs r2, r4, #1
281 orrvs lr, #(1<<15) @ F
282 orrvs lr, #(1<<31)
283 cmp r4, #0x1000
284 orrhi lr, #(1<<12) @ limH
285
286 str lr, [r0, #4*(32+31)] @ gteFLAG
287
17ed0d69 288 pop {r4-r6,pc}
5d8e3bf8 289 .size gteRTPS_neon, .-gteRTPS_neon
290
291
292
@ gteRTPT_neon: transform the three vertices VXYZ(0..2) and write the
@ results (gteMAC0-3, gteIR0-3, the gteSZ fifo, fS|XY and gteFLAG) back to
@ the context.
@ phase 1 runs rtpx_mac once per vertex and spills gteMAC3/gteIR to scratch;
@ phase 2 does the three perspective divisions in parallel and derives the
@ flags from per-component min/max values instead of per-vertex checks.
@ in: r0 - CP2 context; per the offsets used below the second register
@ bank starts at r0 + 4*32 and gteFLAG sits at r0 + 4*(32+31)
@ out: nothing in registers
@ trash: r1-r3, r12, NEON registers (r4-r11 and lr are saved and restored)
@ NOTE(review): d8-d15 are modified without being saved; AAPCS treats them
@ as callee-saved - confirm the callers tolerate this.
293.global gteRTPT_neon @ r0=CP2 (d,c),
294gteRTPT_neon:
295 push {r4-r11,lr}
296
297 movw r1, #:lower16:scratch
298 movt r1, #:upper16:scratch
299 mov r12, #0
300
301 rtpx_preload
302
303 vmov.i32 d22, #0x7fffffff
304 vmov.i32 d23, #0x80000000
305 mov r3, #3 @ counter
306 mov r2, r0 @ VXYZ(0)
3070:
308 vldmia r2!, {d8} @ VXYZ(v)
309 vmov.16 d8[3], r12 @ kill unused upper vector
310
311 rtpx_mac
8cfbda97 312 vmin.s32 d22, d8 @ min gteMAC|12
313 vmax.s32 d23, d8 @ max gteMAC|12
314 subs r3, #1
4cc3050c 315 vst1.32 {d9,d10}, [r1, :128]!
8cfbda97 316 bgt 0b
317
4cc3050c 318 vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags
8cfbda97 319
320 @ - phase2 -
321 sub r1, r1, #8*2*4
322 vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY
323
324 vmov d20, d0 @ gteMAC3 v=0
325 vmin.s16 d24, d1, d3 @ | find min IR
5d8e3bf8 326 vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp)
8cfbda97 327 vmax.s16 d25, d1, d3 @ | .. also max, for flag gen
328 vsli.u64 d20, d2, #32 @ gteMAC3 v=1
329 vmov d21, d9 @ ... v=2
330
331 vmov.i32 q14, #0xffff @ 0xffff[32]
332 vmax.s32 q10, q15
333 vmov.i32 q13, #1
334 vdup.32 q11, d22[0] @ gteH/2
5d8e3bf8 335 vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v)
8cfbda97 336 vmin.s16 d24, d10 @ | find min/max IR
337 vmax.s16 d25, d10 @ |
338
5d8e3bf8 339 add r3, r0, #4*19 @ ||
340 vld1.32 d14[0], [r3] @ || gteSZ3
341
8cfbda97 342 vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)?
343 add r3, r0, #4*17
344 vst1.32 d20, [r3]! @ | writeback fSZ(v)
345 vand q11, q10, q11
346 vst1.32 d21[0], [r3] @ |
347 vmax.u32 q10, q11, q13 @ make divisor 1 if not
348 add r3, r1, #8*8
349 vstmia r3, {q12} @ min/max IR for flags
350 vcvt.f32.u32 q10, q10
351 vshl.u32 d13, d12, #16 @ | preparing gteH
352
353 @ while NEON's busy we calculate some flags on ARM
354 add r2, r1, #8*2*3
355 mov lr, #0 @ gteFLAG
356 ldmia r2, {r4-r7} @ min/max gteMAC|12
357 subs r2, r4, #1
358 orrvs lr, #(1<<31)|(1<<27)
359 subs r3, r5, #1
360 orrvs lr, #(1<<31)|(1<<26)
361 adds r2, r6, #1
362 orrvs lr, #(1<<30)
363 adds r3, r7, #1
364 orrvs lr, #(1<<29)
365 ldr r4, [r1, #0] @ gteMAC3 v=0
366 ldr r5, [r1, #8*2] @ ... v=1
367 ldr r6, [r1, #8*4] @ ... v=2
368
369 add r3, r0, #4*(32+24)
370 vld1.32 d4, [r3] @ || gteOF|XY
371 add r3, r0, #4*(32+27)
5d8e3bf8 372 vld1.32 d6, [r3] @ || gteDQ|AB
8cfbda97 373
374 @ divide
375.if 1
376 vrecpe.f32 q11, q10 @ inv
5d8e3bf8 377 vmovl.s32 q2, d4 @ || gteOF|XY [64]
378 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
8cfbda97 379 vrecps.f32 q12, q10, q11 @ step
380 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
381 vmul.f32 q11, q12, q11 @ better inv
5d8e3bf8 382 add r3, r0, #4*16
383 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
8cfbda97 384 vdup.32 q13, d13[0] @ |
385@ vrecps.f32 q12, q10, q11 @ step
386@ vmul.f32 q11, q12, q11 @ better inv
387 vmul.f32 q10, q13, q11 @ result
388.else
5d8e3bf8 389 vmovl.s32 q2, d4 @ || gteOF|XY [64]
390 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
8cfbda97 391 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
392 vdup.32 q13, d13[0] @ |
5d8e3bf8 393 add r3, r0, #4*16
394 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
8cfbda97 395
396 vpush {q0}
397 vmov q0, q10 @ to test against C code
398 vdiv.f32 s0, s26, s0
399 vdiv.f32 s1, s26, s1
400 vdiv.f32 s2, s26, s2
401 vmov q10, q0
402 vpop {q0}
403.endif
404
@ r4-r6 hold gteMAC3 for v=0,1,2. The negative-overflow bit for MAC3 is 25
@ and the positive-overflow bit is 28, matching do_mac_flags; bit 27 belongs
@ to gteMAC1 negative overflow and must not be raised here.
5d8e3bf8 405 do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
8cfbda97 406 orr r7, r4, r5
407 add r4, r1, #8*8
408 orr r3, r7, r6
409 ldmia r4, {r7,r8,r10,r11} @ min/max IR
410
411 movs r3, r3, lsr #16 @ any of the three gteMAC3 values outside 0..0xffff?
412 orrne lr, #(1<<31)
413 orrne lr, #(1<<18) @ fSZ (limD)
414
415@ vadd.f32 q10, q @ adjust for vcvt rounding mode
416 vcvt.u32.f32 q8, q10
417 vmovl.s16 q9, d1 @ expand gteIR|12 v=0
418 vmovl.s16 q10, d3 @ expand gteIR|12 v=1
419 add r6, r1, #8*10
420 vstmia r6, {q8} @ wb quotients for flags (pre-limE)
421 vqshl.u32 q8, #15
422 vmovl.s16 q11, d10 @ expand gteIR|12 v=2
423 vshr.u32 q8, #15 @ quotients (limE)
424 vdup.32 d24, d16[0]
425 vdup.32 d25, d16[1]
426 vdup.32 d26, d17[0] @ quotient (dup)
427
5d8e3bf8 428 @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
429 mov r4, #0x10000
430 cmp r7, #1<<16
431 cmnvc r10, #1<<16
8cfbda97 432 orrvs lr, #(1<<31)
433 orrvs lr, #(1<<23) @ IR2/limB2
5d8e3bf8 434 rsbs r2, r4, r7, lsl #16
435 cmnvc r4, r10, lsl #16
8cfbda97 436 orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1
5d8e3bf8 437 rsbs r2, r4, r8, lsl #16
438 cmnvc r4, r11, lsl #16
8cfbda97 439 orrvs lr, #(1<<22) @ IR3/limB3
440
441 vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0
442 vmull.s32 q10, d20, d25 @ ... v=1
443 vmull.s32 q11, d22, d26 @ ... v=2
444 vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient
445 vadd.s64 q10, q2 @ ... v=1
446 vadd.s64 q11, q2 @ ... v=2
447 vqmovn.s64 d18, q9 @ saturate to 32 v=0
448 vqmovn.s64 d19, q10 @ ... v=1
449 vqmovn.s64 d20, q11 @ ... v=2
450 vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32]
451 vmax.s32 d15, d18, d19 @ || for flags
452 vmin.s32 d14, d20
453 vmax.s32 d15, d20
454 vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1
455 vqshl.s32 d24, d20, #5 @ ... v=2
456 vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2
5d8e3bf8 457 vpmin.s32 d16, d14, d31 @ || also find min/max in pair
458 vpmax.s32 d17, d15, d31 @ ||
8cfbda97 459 vshr.s32 q11, #16+5 @ can't vqshrn because of insn
460 vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :(
8cfbda97 461 vsli.u64 d16, d17, #32 @ || pack in-pair min/max
462 vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient
463 vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1
464 vmovn.s32 d13, q12 @ 3
465 vstmia r1, {d14-d16} @ || other cacheline than quotients
466 add r3, r0, #4*12
467 vst1.32 d12, [r3]! @ writeback fS|XY v=0,1
468 vst1.32 d13[0], [r3]
469
0e828e88 470 vqmovn.s64 d26, q13 @ | gteMAC0
8cfbda97 471 vmovl.u16 q5, d10 @ expand gteIR|123 v=2
472
473 vmov.i32 d13, #0x1000
0e828e88 474 vshr.s32 d12, d26, #12
8cfbda97 475
476 add r3, r0, #4*24
477 vst1.32 d26[0], [r3]! @ gteMAC0
0e828e88 478 vmax.s32 d12, d30 @ d30 is zero (q15 cleared by rtpx_preload)
8cfbda97 479 vst1.32 d8, [r3]! @ gteMAC123 (last iteration)
480 vst1.32 d9[0], [r3]
481
482 vmin.s32 d12, d13 @ | gteIR0
483
8cfbda97 484 ldmia r6, {r4-r6} @ quotients
485 orr r4, r5
486 orr r4, r6
5d8e3bf8 487 add r3, r0, #4*8
8cfbda97 488 movs r4, r4, lsr #17 @ non-zero if any pre-limE quotient exceeds 0x1ffff
8cfbda97 489
8cfbda97 490 vst1.32 d12[0], [r3]! @ gteIR0
491 vst1.32 d10, [r3]! @ gteIR12
492 vst1.32 d11[0], [r3] @ ..3
493
17ed0d69 494 @ ~23 cycles
5d8e3bf8 495 orrne lr, #(1<<31) @ limE
496 orrne lr, #(1<<17) @ limE
8cfbda97 497 ldmia r1, {r4-r9} @ r4,r5 = min fS|XY, r6,r7 = max fS|XY, r8,r9 = in-pair min, max
17ed0d69 498 add r2, r4, #0x400<<16 @ min fSX
499 add r3, r6, #0x400<<16 @ max fSX
500 lsrs r2, #16+11
501 lsreqs r3, #16+11
502 orrne lr, #(1<<31) @ limG1
503 orrne lr, #(1<<14)
504 add r2, r5, #0x400<<16 @ min fSY
505 add r3, r7, #0x400<<16 @ max fSY
506 lsrs r2, #16+11
507 lsreqs r3, #16+11
508 orrne lr, #(1<<31) @ limG2
509 orrne lr, #(1<<13)
8cfbda97 510 adds r2, r9, #1
17ed0d69 511 orrvs lr, #(1<<16) @ F (31 already done by above)
8cfbda97 512 subs r3, r8, #1
8cfbda97 513
514 ldr r4, [r0, #4*24] @ gteMAC0
515 orrvs lr, #(1<<15) @ still uses V from the subs above
516
517 adds r3, r4, #1
518 orrvs lr, #(1<<16)
519 orrvs lr, #(1<<31) @ F
520 subs r2, r4, #1
521 orrvs lr, #(1<<15)
522 orrvs lr, #(1<<31) @ F
523 cmp r4, #0x1000
5d8e3bf8 524 orrhi lr, #(1<<12) @ limH
8cfbda97 525
526 str lr, [r0, #4*(32+31)] @ gteFLAG
527
528 pop {r4-r11,pc}
5d8e3bf8 529 .size gteRTPT_neon, .-gteRTPT_neon
530
531
532
@ gteMVMVA_neon: matrix * vector + vector, with the operands selected by
@ bit fields of the op word; writes gteMAC|123, gteIR|123 and gteFLAG.
@ in: r0 - CP2 context (second register bank at r0 + 4*32)
@ r1 - op word; fields as decoded below:
@ bits 15-16 v - input vector (3: built from the gteIR words at r0 + 4*9)
@ bits 17-18 mx - matrix select (3: matrix left as all zeroes)
@ bits 13-14 cv - added vector select (3: left as zero)
@ bit 19 - when set, the sums are shifted right by 12
@ bit 10 lm - when set, gteIR is clamped to be non-negative
@ out: nothing in registers
@ trash: r2, r3, r12, NEON registers (r4-r5 and lr are saved and restored)
@ NOTE(review): d8-d10 are modified without being saved; AAPCS treats d8-d15
@ as callee-saved - confirm the callers tolerate this.
17ed0d69 533.global gteMVMVA_neon @ r0=CP2 (d,c), op
534gteMVMVA_neon:
535 push {r4-r5,lr}
536
537 add r12, r0, #4*32
538
539 ubfx r2, r1, #15, #2 @ v
540
541 vmov.i32 q0, #0 @ d0,d1
542 vmov.i32 q1, #0 @ d2,d3
543 vmov.i32 q2, #0 @ d4,d5
544 cmp r2, #3
545 addeq r4, r0, #4*9
546 addne r3, r0, r2, lsl #3
547 ldmeqia r4, {r3-r5}
548 ldmneia r3, {r4,r5}
549 pkhbteq r4, r3, r4, lsl #16 @ v==3: pack the low halfwords of the first two words
550 uxth r5, r5
551 vmov.32 d8[0], r4
552 vmov.32 d8[1], r5 @ VXYZ(v)
553 ubfx r3, r1, #17, #2 @ mx
554 ubfx r2, r1, #13, #2 @ cv
555 cmp r3, #3
556 beq 0f @ very rare case
557 add r3, r12, r3, lsl #5
558 vldmia r3, {d0-d2} @ MXxy/gteR* [16*9]
5590:
560 cmp r2, #3
561 add r3, r12, r2, lsl #5
562 beq 0f
563 add r3, #4*5
564 vldmia r3, {d4-d5} @ CVx/gteTR*
565
5660:
567 vmov.i32 q15, #0
568 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
569 vext.16 d1, d0, d1, #3 @ xx32 -> x321
570 vshll.s32 q3, d5, #12 @ gteTRZ/CV3
571 vshll.s32 q2, d4, #12 @ gteTR|XY/CV12
572
573 vmull.s16 q8, d0, d8
574 vmull.s16 q9, d1, d8
575 vmull.s16 q10, d2, d8
576 vpadd.s32 d16, d16, d17
577 vpadd.s32 d17, d18, d19
578 vpadd.s32 d18, d20, d21
579 vpadal.s32 q2, q8
580 vpadal.s32 q3, q9
581 tst r1, #1<<19
582 beq 0f
583 vshr.s64 q2, q2, #12
584 vshr.s64 q3, q3, #12
5850:
586 vqmovn.s64 d8, q2 @ gteMAC|12
587 vqmovn.s64 d9, q3 @ gteMAC3
588
589 tst r1, #1<<10
590 add r3, r0, #4*25
591 vqmovn.s32 d10, q4 @ gteIR|123
592 vst1.32 d8, [r3]!
593 vst1.32 d9[0], [r3] @ wb gteMAC|123
594
595 beq 0f @ flags still come from the tst above; the instructions in between do not set them
596 vmax.s16 d10, d31 @ lm set: clamp negatives to zero (d31 is zero)
5970:
598 vmovl.s16 q9, d10 @ expand gteIR|123
599 add r3, r0, #4*9
600 vst1.32 d18, [r3]!
601 vst1.32 d19[0], [r3]
602
603 tst r1, #1<<10 @ lm
604 mov r2, #0
605 mov lr, #0 @ gteFLAG
606 mov r12, #15
607 moveq r2, #0x8000 @ adj
608 moveq r12, #16 @ shift
609
610 add r3, r0, #4*25
611 ldmia r3, {r3-r5} @ gteMAC|123
612
613 do_mac_flags r3, r4, r5
614
@ lm clear: (x + 0x8000) >> 16 is non-zero when x is outside -0x8000..0x7fff
@ lm set: x >> 15 is non-zero when x is outside 0..0x7fff
615 add r3, r2
616 add r4, r2
617 add r5, r2
618 asrs r3, r12
619 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
620 asrs r4, r12
621 orrne lr, #(1<<31)
622 orrne lr, #(1<<23) @ IR2/limB2
623 asrs r5, r12
624 orrne lr, #(1<<22) @ IR3/limB3
625 str lr, [r0, #4*(32+31)] @ gteFLAG
626
627 pop {r4-r5,pc}
628 .size gteMVMVA_neon, .-gteMVMVA_neon
629
630
631
8cfbda97 632@ vim:filetype=armasm