gpu_neon: frameskip: skip blits until flipped
[pcsx_rearmed.git] / libpcsxcore / gte_neon.s
CommitLineData
/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of any of these licenses
 * (at your option):
 *  - GNU GPL, version 2 or later.
 *  - GNU LGPL, version 2.1 or later.
 * See the COPYING file in the top-level directory.
 */
10
11
@ Scratch buffer through which gteRTPS_neon/gteRTPT_neon pass intermediate
@ NEON results (quotients, min/max values, pre-limit screen coordinates)
@ to the ARM side for flag calculation.
@ Size is 8*8*2 = 128 bytes; aligned to 2^6 = 64 bytes so that the
@ ":64" / ":128" aligned NEON stores into it are valid.
@ The buffer is static, so the routines using it are not reentrant.
.bss
.align 6 @ cacheline

scratch:
.rept 8*8*2/4
    .word 0
.endr

.text
.align 2
22
@ approximate gteMAC|123 flags
@ in: rr 123 as gteMAC|123, lr - gteFLAG accumulator
@ out: lr - gteFLAG with the MAC1-3 overflow bits ORed in
@ trash: nothing (condition flags are modified)
@
@ "cmp rr, #1" sets V only for rr == 0x80000000 and "cmn rr, #1" sets V
@ only for rr == 0x7fffffff, so a value sitting exactly on one of the s32
@ limits is reported as an overflow - that is why this is "approximate".
@ Negative overflow sets bits 27/26/25 together with bit 31,
@ positive overflow sets bits 30/29/28 only.
.macro do_mac_flags rr1 rr2 rr3
    cmp         \rr1, #1
    orrvs       lr, #(1<<31)|(1<<27)
    cmp         \rr2, #1
    orrvs       lr, #(1<<31)|(1<<26)
    cmp         \rr3, #1
    orrvs       lr, #(1<<31)|(1<<25)
    cmn         \rr1, #1       @ same as adds ...
    orrvs       lr, #(1<<30)
    cmn         \rr2, #1
    orrvs       lr, #(1<<29)
    cmn         \rr3, #1
    orrvs       lr, #(1<<28)
.endm
40
@ approximate 3x gteMACn flags
@ in: rr 123 as 3 instances gteMACn, *flags, lr - gteFLAG accumulator
@     nflags - bits to set if any instance equals 0x80000000
@     pflags - bits to set if any instance equals 0x7fffffff
@ out: lr - gteFLAG with nflags/pflags ORed in as needed
@ trash: nothing (condition flags are modified)
@
@ The cmpvc/cmnvc chain stops comparing as soon as one instance has set V,
@ so a single conditional orr covers all three values.
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    cmp         \rr1, #1
    cmpvc       \rr2, #1
    cmpvc       \rr3, #1
    orrvs       lr, #\nflags
    cmn         \rr1, #1       @ adds ...
    cmnvc       \rr2, #1
    cmnvc       \rr3, #1
    orrvs       lr, #\pflags
.endm
54
@ get gteIR|123 flags from gteMAC|123
@ in: rr 123 as gteMAC|123, lr - gteFLAG accumulator
@ out: lr - gteFLAG with the IR1-3 saturation bits ORed in
@ trash: r2,r3 (and condition flags)
@
@ (rr + 0x8000) >> 16 is non-zero exactly when rr does not fit in s16,
@ i.e. when narrowing gteMAC to gteIR had to saturate.
@ Bit 31 is set together with the IR1 and IR2 bits, but not with IR3.
.macro do_irs_flags rr1 rr2 rr3
    add         r2, \rr1, #0x8000
    add         r3, \rr2, #0x8000
    lsrs        r2, #16
    orrne       lr, #(1<<31)|(1<<24) @ IR1/limB1
    lsrs        r3, #16
    add         r2, \rr3, #0x8000    @ (add leaves the flags of lsrs intact)
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)   @ IR2/limB2
    lsrs        r2, #16
    orrne       lr, #(1<<22)   @ IR3/limB3
.endm
70
71
/*
 * RTPS/RTPT register map:
 *
 *  q | d | c code / phase 1   phase 2          scratch
 *  0   0   gteR1* [s16]       gteMAC3     =    gteMAC3  \ v=0 *
 *      1   gteR2*             gteIR1-3    =    gteIR1-3 /     *
 *  1   2   gteR3*             gteMAC3     =    gteMAC3  \ v=1
 *      3   *                  gteIR1-3    =    gteIR1-3 /
 *  2   4   gteTRX<<12 [s64]   gteOFX [s64]     gteMAC3  \ v=2
 *      5   gteTRY<<12         gteOFY [s64]     gteIR1-3 /
 *  3   6   gteTRZ<<12         gteDQA [s64]     min gteMAC|12 v=012
 *      7   0                  gteDQB [s64]     max gteMAC|12
 *  4   8   VXYZ(v)  /    gteMAC1,2 [s32]       min gteIR|123
 *      9   *        /    gteMAC3               max gteIR|123
 *  5  10   gteIR1-3 [s16]     gteIR1-3 v=2     quotients 12
 *     11   0                                   quotient 3
 *  6  12   gteH (adj. for cmp)
 *     13   gteH (float for div)
 * ...      <scratch>
 * 15  30   0
 *     31   0
 */
8cfbda97 94
@ load gteR*, gteTR* and gteH (see map above), clear q15
@ in: r0 - context (32 data regs followed by 32 control regs, 4 bytes each)
@ out: d0-d2 - rotation matrix rows, one row per d register (4th lane is
@               a leftover element that callers multiply by 0)
@      d4,d5,d6 - gteTRX/Y/Z << 12 as s64
@      d12[0] - gteH sign-extended to 32 bits
@      q15 - 0
@ trash: r3
.macro rtpx_preload
    add         r3, r0, #4*32
    vldmia      r3, {d0-d2}    @ gteR*  [16*9]
    vmov.i32    q15, #0
    add         r3, r0, #4*(32+5)
    vldmia      r3, {d4-d5}    @ gteTR*
    vext.16     d2, d1, d2, #2 @ xxx3 -> x321
    vext.16     d1, d0, d1, #3 @ xx32 -> x321
    add         r3, r0, #4*(32+26)
    vld1.32     d11[0], [r3]   @ gteH
    vshll.s32   q3, d5, #12    @ gteTRZ
    vshll.s32   q2, d4, #12    @ gteTR|XY
    vmovl.s16   q6, d11        @ gteH
.endm
8cfbda97 112
@ do RTP* gteMAC* calculation
@ computes (gteTR << 12 + gteR * VXYZ) >> 12 per row, saturated to s32,
@ then narrows the result to s16 (saturating) for gteIR
@ in: gteR*, gteTR* as in map, d8 - VXYZ (4th lane must be 0), r12 - 0
@ out: d8,d9 - gteMAC|123 (d9[1] = 0), d10 - gteIR|123
@ trash: d16-d21
.macro rtpx_mac
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpaddl.s32  q8, q8
    vpaddl.s32  q9, q9
    vpaddl.s32  q10, q10
    vadd.s64    d16, d17       @ d16=d0.16[2]*d8.16[2], as
    vadd.s64    d18, d19       @ d8[3]==0, so won't affect
    vadd.s64    d20, d21       @ QC
    vadd.s64    d16, d4
    vadd.s64    d18, d5
    vadd.s64    d20, d6
    vqshrn.s64  d8, q8, #12    @ gteMAC1
    vqshrn.s64  d18, q9, #12   @ gteMAC2
    vqshrn.s64  d9, q10, #12   @ gteMAC3
    vsli.u64    d8, d18, #32   @ gteMAC|12
    vmov.32     d9[1], r12
    vqmovn.s32  d10, q4        @ gteIR|123; losing 2 cycles?
.endm
137
@ gteRTPS_neon - GTE RTPS: rotate, translate and perspective-project the
@ single vertex VXYZ(0).
@ in: r0 - CP2 register block (32 data regs, then 32 control regs)
@ out (data regs): 8 gteIR0, 9-11 gteIR1-3, 12-14 fSXY (shifted down by
@      one entry, new value in 14), 16-19 gteSZ (shifted, new value in 19),
@      24 gteMAC0, 25-27 gteMAC1-3
@ out (control regs): 31 gteFLAG (rebuilt from 0)
@ trash: r1-r3, r12, flags, NEON registers, static "scratch" buffer
@ NOTE(review): d8-d15 are clobbered without being saved although AAPCS
@ treats them as callee-saved - confirm callers do not rely on them.
.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push        {r4-r6,lr}

@    fmrx        r4, fpscr      @ vmrs? at least 40 cycle hit
    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    vldmia      r0, {d8}       @ VXYZ(0)
    rtpx_preload

@    rtpx_mac                  @ slower here, faster in RTPT?
    vmov.16     d8[3], r12     @ kill unused upper vector
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9         @ d6, d18 is slow?
    vqshrn.s64  d8, q2, #12    @ gteMAC|12
    vqshrn.s64  d9, q3, #12    @ gteMAC3

    add         r3, r0, #4*25
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]    @ wb gteMAC|123
    vqmovn.s32  d10, q4        @ gteIR|123

    add         r3, r0, #4*17  @ gteSZ*
    vldmia      r3, {q7}       @ d14,d15 gteSZ|123x
    vmov.i32    d28, #0xffff   @ 0xffff[32]
    vmax.s32    d11, d9, d31
    vshr.s32    d16, d12, #1   @ | gteH/2 (adjust for cmp)
    vmov.i32    d26, #1
    vmin.u32    d11, d28       @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16   q9, d10        @ || expand gteIR|123
    vshl.u32    d13, d12, #16  @ | preparing gteH
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]

    vsli.u64    d15, d11, #32  @ new gteSZ|0123 in q7
    vclt.u32    d16, d16, d11  @ gteH/2 < fSZ3?

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]       @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]       @ || gteDQ|AB

    vand        d11, d16
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmax.u32    d11, d26       @ make divisor 1 if not
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    add         r3, r0, #4*16  @ | gteSZ*
    vstmia      r3, {q7}       @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13      @ gteH (float for div)
    vcvt.f32.u32 d11, d11      @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32    s22, s26, s22

    vcvt.u32.f32 d11, d11      @ quotient

    @ while NEON's busy we calculate some flags on ARM
    add         r3, r0, #4*25
    mov         lr, #0         @ gteFLAG
    ldmia       r3, {r4-r6}    @ gteMAC|123

    vst1.32     d11, [r1, :64] @ wb quotient for flags (pre-limE)
    vqshl.u32   d11, #15

    do_mac_flags r4, r5, r6

    vshr.u32    d11, #15       @ quotient (limE)

    do_irs_flags r4, r5, r6

    vmlal.s32   q2, d18, d11[0] @ gteOF|XY + gteIR|12 * quotient
    add         r3, r0, #4*13
    vld1.32     d16, [r3]      @ || load fS|XY12, new 01
    vqmovn.s64  d18, q2        @ saturate to 32
    vmull.s32   q10, d6, d11[0] @ | d20 = gteDQA * quotient
    vqshl.s32   d19, d18, #5   @ 11bit precision

    ldr         r4, [r1]       @ quotient
    movs        r3, r6, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)   @ fSZ (limD)

    vst1.32     d18, [r1, :64] @ writeback fS|XY2 before limG

    vshr.s32    d18, d19, #16+5 @ can't vqshrn because of insn
                               @ encoding doesn't allow 21 :(
    vadd.s64    d20, d7        @ | gteDQB + gteDQA * quotient
    vmovn.s32   d18, q9        @ fS|XY2 [s16]

    vqmovn.s64  d20, q10       @ | gteMAC0
    add         r3, r0, #4*12
    vst1.32     d16, [r3]!     @ writeback fS|XY01
    vst1.32     d18[0], [r3]   @ ...2
    add         r3, r0, #4*24
    vshr.s32    d21, d20, #12  @ gteMAC0 >> 12, clamped below for gteIR0
    vst1.32     d20[0], [r3]   @ gteMAC0

    movs        r4, r4, lsr #17
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<17)   @ limE

    vmax.s32    d21, d31
    vmov.i32    d22, #0x1000
    vmin.s32    d21, d22
    add         r3, r0, #4*8
    vst1.16     d21[0], [r3]   @ gteIR0

    ldmia       r1, {r4,r5}    @ fS|XY2 before limG, after 11bit sat
    add         r2, r4, #0x400<<16
    add         r3, r5, #0x400<<16
    lsrs        r2, #16+11
    orrne       lr, #(1<<14)   @ limG1
    orrne       lr, #(1<<31)
    lsrs        r3, #16+11
    orrne       lr, #(1<<13)   @ limG2
    orrne       lr, #(1<<31)
    adds        r2, r4, #1
    addvcs      r3, r5, #1
    orrvs       lr, #(1<<16)   @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    subvcs      r3, r5, #1
    orrvs       lr, #(1<<31)

    ldr         r4, [r0, #4*24] @ gteMAC0
    orrvs       lr, #(1<<15)

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)   @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)   @ F
    orrvs       lr, #(1<<31)
    @ gteIR0 was clamped from gteMAC0 >> 12 (see above), so limH has to be
    @ derived from the shifted value too; the unsigned "hi" compare catches
    @ both a negative value and one above 0x1000
    mov         r4, r4, asr #12
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)   @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r6,pc}
    .size	gteRTPS_neon, .-gteRTPS_neon
288
289
290
@ gteRTPT_neon - GTE RTPT: rotate, translate and perspective-project the
@ three vertices VXYZ(0..2).
@ Phase 1 runs rtpx_mac once per vertex and parks gteMAC3/gteIR of each
@ vertex plus running min/max of gteMAC|12 in "scratch"; phase 2 does the
@ three divisions and projections in parallel. Flags are derived on the
@ ARM side from the min/max values while NEON is busy.
@ in: r0 - CP2 register block (32 data regs, then 32 control regs)
@ out (data regs): 8 gteIR0, 9-11 gteIR1-3 (of v=2), 12-14 fSXY(v),
@      16 gteSZ0 (= old gteSZ3), 17-19 fSZ(v), 24 gteMAC0,
@      25-27 gteMAC1-3 (of v=2)
@ out (control regs): 31 gteFLAG (rebuilt from 0)
@ trash: r1-r3, r12, flags, NEON registers, static "scratch" buffer
@ NOTE(review): d8-d15 are clobbered without being saved although AAPCS
@ treats them as callee-saved - confirm callers do not rely on them.
.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push        {r4-r11,lr}

    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    rtpx_preload

    vmov.i32    d22, #0x7fffffff
    vmov.i32    d23, #0x80000000
    mov         r3, #3         @ counter
    mov         r2, r0         @ VXYZ(0)
0:
    vldmia      r2!, {d8}      @ VXYZ(v)
    vmov.16     d8[3], r12     @ kill unused upper vector

    rtpx_mac
    vmin.s32    d22, d8        @ min gteMAC|12
    vmax.s32    d23, d8        @ max gteMAC|12
    subs        r3, #1
    vst1.32     {d9,d10}, [r1, :128]!
    bgt         0b

    vst1.32     {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub         r1, r1, #8*2*4 @ rewind to the start of scratch
    vldmia      r1, {d0-d3}    @ note: d4,d5 is for gteOF|XY

    vmov        d20, d0        @ gteMAC3 v=0
    vmin.s16    d24, d1, d3    @ | find min IR
    vshr.s32    d22, d12, #1   @ || gteH/2 (adjust for cmp)
    vmax.s16    d25, d1, d3    @ | .. also max, for flag gen
    vsli.u64    d20, d2, #32   @ gteMAC3 v=1
    vmov        d21, d9        @ ... v=2

    vmov.i32    q14, #0xffff   @ 0xffff[32]
    vmax.s32    q10, q15
    vmov.i32    q13, #1
    vdup.32     q11, d22[0]    @ gteH/2
    vmin.u32    q10, q14       @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16    d24, d10       @ | find min/max IR
    vmax.s16    d25, d10       @ |

    add         r3, r0, #4*19  @ ||
    vld1.32     d14[0], [r3]   @ || gteSZ3

    vclt.u32    q11, q11, q10  @ gteH/2 < fSZ(v)?
    add         r3, r0, #4*17
    vst1.32     d20, [r3]!     @ | writeback fSZ(v)
    vand        q11, q10, q11
    vst1.32     d21[0], [r3]   @ |
    vmax.u32    q10, q11, q13  @ make divisor 1 if not
    add         r3, r1, #8*8
    vstmia      r3, {q12}      @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32    d13, d12, #16  @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add         r2, r1, #8*2*3
    mov         lr, #0         @ gteFLAG
    ldmia       r2, {r4-r7}    @ min/max gteMAC|12
    subs        r2, r4, #1
    orrvs       lr, #(1<<31)|(1<<27)
    subs        r3, r5, #1
    orrvs       lr, #(1<<31)|(1<<26)
    adds        r2, r6, #1
    orrvs       lr, #(1<<30)
    adds        r3, r7, #1
    orrvs       lr, #(1<<29)
    ldr         r4, [r1, #0]   @ gteMAC3 v=0
    ldr         r5, [r1, #8*2] @ ... v=1
    ldr         r6, [r1, #8*4] @ ... v=2

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]       @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]       @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32  q11, q10       @ inv
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    vrecps.f32  q12, q10, q11  @ step
    vcvt.f32.u32 d13, d13      @ | gteH (float for div)
    vmul.f32    q11, q12, q11  @ better inv
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]   @ gteSZ0 = gteSZ3
    vdup.32     q13, d13[0]    @ |
@   vrecps.f32  q12, q10, q11  @ step
@   vmul.f32    q11, q12, q11  @ better inv
    vmul.f32    q10, q13, q11  @ result
.else
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13      @ | gteH (float for div)
    vdup.32     q13, d13[0]    @ |
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]   @ gteSZ0 = gteSZ3

    vpush       {q0}
    vmov        q0, q10        @ to test against C code
    vdiv.f32    s0, s26, s0
    vdiv.f32    s1, s26, s1
    vdiv.f32    s2, s26, s2
    vmov        q10, q0
    vpop        {q0}
.endif

    @ MAC3 of the 3 vertices: negative overflow -> bits 31,25;
    @ positive overflow -> bit 28 (same bits do_mac_flags uses for rr3)
    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
    orr         r7, r4, r5
    add         r4, r1, #8*8
    orr         r3, r7, r6
    ldmia       r4, {r7,r8,r10,r11} @ min/max IR

    movs        r3, r3, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)   @ fSZ (limD)

@    vadd.f32     q10, q        @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16   q9, d1         @ expand gteIR|12 v=0
    vmovl.s16   q10, d3        @ expand gteIR|12 v=1
    add         r6, r1, #8*10
    vstmia      r6, {q8}       @ wb quotients for flags (pre-limE)
    vqshl.u32   q8, #15
    vmovl.s16   q11, d10       @ expand gteIR|12 v=2
    vshr.u32    q8, #15        @ quotients (limE)
    vdup.32     d24, d16[0]
    vdup.32     d25, d16[1]
    vdup.32     d26, d17[0]    @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    mov         r4, #0x10000
    cmp         r7, #1<<16
    cmnvc       r10, #1<<16
    orrvs       lr, #(1<<31)
    orrvs       lr, #(1<<23)   @ IR2/limB2
    rsbs        r2, r4, r7, lsl #16
    cmnvc       r4, r10, lsl #16
    orrvs       lr, #(1<<31)|(1<<24) @ IR1/limB1
    rsbs        r2, r4, r8, lsl #16
    cmnvc       r4, r11, lsl #16
    orrvs       lr, #(1<<22)   @ IR3/limB3

    vmull.s32   q9, d18, d24   @ gteIR|12 * quotient v=0
    vmull.s32   q10, d20, d25  @ ... v=1
    vmull.s32   q11, d22, d26  @ ... v=2
    vadd.s64    q9, q2         @ gteOF|XY + gteIR|12 * quotient
    vadd.s64    q10, q2        @ ... v=1
    vadd.s64    q11, q2        @ ... v=2
    vqmovn.s64  d18, q9        @ saturate to 32 v=0
    vqmovn.s64  d19, q10       @ ... v=1
    vqmovn.s64  d20, q11       @ ... v=2
    vmin.s32    d14, d18, d19  @ || find min/max fS|XY(v) [32]
    vmax.s32    d15, d18, d19  @ || for flags
    vmin.s32    d14, d20
    vmax.s32    d15, d20
    vqshl.s32   q11, q9, #5    @ 11bit precision, v=0,1
    vqshl.s32   d24, d20, #5   @ ... v=2
    vmull.s32   q13, d6, d17   @ | gteDQA * quotient v=2
    vpmin.s32   d16, d14, d31  @ || also find min/max in pair
    vpmax.s32   d17, d15, d31  @ ||
    vshr.s32    q11, #16+5     @ can't vqshrn because of insn
    vshr.s32    d24, #16+5     @ encoding doesn't allow 21 :(
    vsli.u64    d16, d17, #32  @ || pack in-pair min/max
    vadd.s64    d26, d7        @ | gteDQB + gteDQA * quotient
    vmovn.s32   d12, q11       @ fS|XY(v) [s16] v=0,1
    vmovn.s32   d13, q12       @ 3
    vstmia      r1, {d14-d16}  @ || other cacheline than quotients
    add         r3, r0, #4*12
    vst1.32     d12, [r3]!     @ writeback fS|XY v=0,1
    vst1.32     d13[0], [r3]

    vqmovn.s64  d26, q13       @ | gteMAC0
    @ NOTE(review): this zero-extends, while gteRTPS_neon sign-extends
    @ (vmovl.s16) before storing gteIR1-3 - confirm that the upper
    @ halves of the stored words are don't-care
    vmovl.u16   q5, d10        @ expand gteIR|123 v=2

    vmov.i32    d13, #0x1000
    vshr.s32    d12, d26, #12  @ gteMAC0 >> 12, clamped below for gteIR0

    add         r3, r0, #4*24
    vst1.32     d26[0], [r3]!  @ gteMAC0
    vmax.s32    d12, d30
    vst1.32     d8, [r3]!      @ gteMAC123 (last iteration)
    vst1.32     d9[0], [r3]

    vmin.s32    d12, d13       @ | gteIR0

    ldmia       r6, {r4-r6}    @ quotients
    orr         r4, r5
    orr         r4, r6
    add         r3, r0, #4*8
    movs        r4, r4, lsr #17

    vst1.32     d12[0], [r3]!  @ gteIR0
    vst1.32     d10, [r3]!     @ gteIR12
    vst1.32     d11[0], [r3]   @ ..3

    @ ~23 cycles
    orrne       lr, #(1<<31)   @ limE
    orrne       lr, #(1<<17)   @ limE
    ldmia       r1, {r4-r9}
    add         r2, r4, #0x400<<16 @ min fSX
    add         r3, r6, #0x400<<16 @ max fSX
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)   @ limG1
    orrne       lr, #(1<<14)
    add         r2, r5, #0x400<<16 @ min fSY
    add         r3, r7, #0x400<<16 @ max fSY
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)   @ limG2
    orrne       lr, #(1<<13)
    adds        r2, r9, #1
    orrvs       lr, #(1<<16)   @ F (31 already done by above)
    subs        r3, r8, #1

    ldr         r4, [r0, #4*24] @ gteMAC0
    orrvs       lr, #(1<<15)

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)
    orrvs       lr, #(1<<31)   @ F
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)
    orrvs       lr, #(1<<31)   @ F
    @ gteIR0 was clamped from gteMAC0 >> 12 (see above), so limH has to be
    @ derived from the shifted value too; the unsigned "hi" compare catches
    @ both a negative value and one above 0x1000
    mov         r4, r4, asr #12
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)   @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r11,pc}
    .size	gteRTPT_neon, .-gteRTPT_neon
528
529
530
@ gteMVMVA_neon - GTE MVMVA: gteMAC = (CV << 12 + MX * V) [>> 12 if sf]
@ in: r0 - CP2 register block (32 data regs, then 32 control regs)
@     r1 - op; the fields decoded here are:
@          bits 18:17 mx - matrix select (3: matrix stays all 0)
@          bits 16:15 v  - vector select (3: use gteIR1-3)
@          bits 14:13 cv - translation vector select (3: stays 0)
@          bit  19    sf - shift the result right by 12
@          bit  10    lm - clamp gteIR to >= 0
@ out (data regs): 9-11 gteIR1-3, 25-27 gteMAC1-3
@ out (control regs): 31 gteFLAG (rebuilt from 0)
@ trash: r1-r3, r12, flags, NEON registers
@ NOTE(review): d8-d10 are clobbered without being saved although AAPCS
@ treats d8-d15 as callee-saved - confirm callers do not rely on them.
.global gteMVMVA_neon @ r0=CP2 (d,c), op
gteMVMVA_neon:
    push        {r4-r5,lr}

    add         r12, r0, #4*32

    ubfx        r2, r1, #15, #2 @ v

    vmov.i32    q0, #0         @ d0,d1
    vmov.i32    q1, #0         @ d2,d3
    vmov.i32    q2, #0         @ d4,d5
    cmp         r2, #3
    addeq       r4, r0, #4*9
    addne       r3, r0, r2, lsl #3
    ldmeqia     r4, {r3-r5}    @ v==3: gteIR1-3
    ldmneia     r3, {r4,r5}    @ else: VXY(v), VZ(v)
    pkhbteq     r4, r3, r4, lsl #16 @ pack IR1 | IR2<<16
    uxth        r5, r5         @ zero the unused 4th lane
    vmov.32     d8[0], r4
    vmov.32     d8[1], r5      @ VXYZ(v)
    ubfx        r3, r1, #17, #2 @ mx
    ubfx        r2, r1, #13, #2 @ cv
    cmp         r3, #3
    beq         0f             @ very rare case
    add         r3, r12, r3, lsl #5
    vldmia      r3, {d0-d2}    @ MXxy/gteR*  [16*9]
0:
    cmp         r2, #3
    add         r3, r12, r2, lsl #5
    beq         0f
    add         r3, #4*5
    vldmia      r3, {d4-d5}    @ CVx/gteTR*

0:
    vmov.i32    q15, #0
    vext.16     d2, d1, d2, #2 @ xxx3 -> x321
    vext.16     d1, d0, d1, #3 @ xx32 -> x321
    vshll.s32   q3, d5, #12    @ gteTRZ/CV3
    vshll.s32   q2, d4, #12    @ gteTR|XY/CV12

    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9
    tst         r1, #1<<19     @ sf
    beq         0f
    vshr.s64    q2, q2, #12
    vshr.s64    q3, q3, #12
0:
    vqmovn.s64  d8, q2         @ gteMAC|12
    vqmovn.s64  d9, q3         @ gteMAC3

    tst         r1, #1<<10     @ lm (NEON ops below keep the flags)
    add         r3, r0, #4*25
    vqmovn.s32  d10, q4        @ gteIR|123
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]    @ wb gteMAC|123

    beq         0f
    vmax.s16    d10, d31       @ lm set: clamp gteIR to >= 0
0:
    vmovl.s16   q9, d10        @ expand gteIR|123
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]

    @ IR flags: with lm clear, (MAC + 0x8000) >> 16 != 0 means out of s16;
    @ with lm set, MAC >> 15 != 0 means out of 0..0x7fff
    tst         r1, #1<<10     @ lm
    mov         r2, #0
    mov         lr, #0         @ gteFLAG
    mov         r12, #15
    moveq       r2, #0x8000    @ adj
    moveq       r12, #16       @ shift

    add         r3, r0, #4*25
    ldmia       r3, {r3-r5}    @ gteMAC|123

    do_mac_flags r3, r4, r5

    add         r3, r2
    add         r4, r2
    add         r5, r2
    asrs        r3, r12
    orrne       lr, #(1<<31)|(1<<24) @ IR1/limB1
    asrs        r4, r12
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)   @ IR2/limB2
    asrs        r5, r12
    orrne       lr, #(1<<22)   @ IR3/limB3
    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r5,pc}
    .size	gteMVMVA_neon, .-gteMVMVA_neon
627
628
629
@ gteNCLIP_neon - GTE NCLIP:
@   gteMAC0 = gteSX0 * (gteSY1 - gteSY2)
@           + gteSX1 * (gteSY2 - gteSY0)
@           + gteSX2 * (gteSY0 - gteSY1)
@ accumulated in 64 bits and saturated to s32.
@ the name is a misnomer, this doesn't use NEON but oh well..
@ in: r0 - CP2 register block; data regs 12-14 hold gteSXY0-2 with SX in
@          the low and SY in the high halfword
@ out: data reg 24 gteMAC0, control reg 31 gteFLAG
@      (0, bits 31+16 on positive or bits 31+15 on negative overflow)
@ trash: r1-r3, r12, flags
.global gteNCLIP_neon @ r0=CP2 (d,c),
gteNCLIP_neon:
    push        {r4-r6,lr}

    add         r1, r0, #4*12
    ldmia       r1, {r1-r3}    @ gteSXY0, gteSXY1, gteSXY2
    mov         r4, r1, asr #16 @ gteSY0
    mov         r5, r2, asr #16 @ gteSY1
    mov         r6, r3, asr #16 @ gteSY2
    sub         r12, r4, r5    @ 3: gteSY0 - gteSY1
    sub         r5, r5, r6     @ 1: gteSY1 - gteSY2
    sxth        r1, r1         @ gteSX0
    smull       r1, r5, r1, r5 @ RdLo, RdHi
    sub         r6, r4         @ 2: gteSY2 - gteSY0
    sxth        r2, r2         @ gteSX1
    smlal       r1, r5, r2, r6
    mov         lr, #0         @ gteFLAG
    sxth        r3, r3         @ gteSX2
    smlal       r1, r5, r3, r12
    mov         r6, #1<<31
    orr         r6, #1<<15     @ flags for negative overflow
    movs        r2, r1, lsl #1 @ C = bit 31 of the low word
    adc         r5, r5         @ r5 = bits 62..31 of the 64 bit sum:
                               @ 0 or -1 if the sum fits in s32
    cmp         r5, #0
    movtgt      lr, #((1<<31)|(1<<16))>>16
    mvngt       r1, #1<<31     @ maxint
    cmn         r5, #1
    movmi       r1, #1<<31     @ minint
    orrmi       lr, r6
    str         r1, [r0, #4*24] @ gteMAC0
    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r6,pc}
    .size	gteNCLIP_neon, .-gteNCLIP_neon
665
8cfbda97 666
667@ vim:filetype=armasm