asm: use a macro for functions
[pcsx_rearmed.git] / libpcsxcore / gte_neon.S
CommitLineData
8cfbda97 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
0c2ca3ba 4 * This work is licensed under the terms of GNU GPL version 2 or later.
8cfbda97 5 * See the COPYING file in the top-level directory.
6 */
7
5c6457c3 8#include "arm_features.h"
8cfbda97 9
a53073ec 10.syntax unified
11
8cfbda97 12.bss
13.align 6 @ cacheline
14
15scratch:
16.rept 8*8*2/4
17 .word 0
18.endr
19
20.text
21.align 2
22
c67af2ac 23.macro ldr_scratch rd
24#ifndef __PIC__
25 movw \rd, #:lower16:scratch
26 movt \rd, #:upper16:scratch
27#else
28 ldr \rd, =scratch
29#endif
30.endm
31
59774ed0 32@ XXX: gteMAC calc shouldn't be saturating, but it is here
33
5d8e3bf8 34@ approximate gteMAC|123 flags
35@ in: rr 123 as gteMAC|123
36@ trash: nothing
37.macro do_mac_flags rr1 rr2 rr3
38 cmp \rr1, #1
39 orrvs lr, #(1<<31)|(1<<27)
40 cmp \rr2, #1
41 orrvs lr, #(1<<31)|(1<<26)
42 cmp \rr3, #1
43 orrvs lr, #(1<<31)|(1<<25)
44 cmn \rr1, #1 @ same as adds ...
45 orrvs lr, #(1<<30)
46 cmn \rr2, #1
47 orrvs lr, #(1<<29)
48 cmn \rr3, #1
49 orrvs lr, #(1<<28)
50.endm
51
52@ approximate 3x gteMACn flags
53@ in: rr 123 as 3 instances gteMACn, *flags
54@ trash: nothing
55.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
56 cmp \rr1, #1
57 cmpvc \rr2, #1
58 cmpvc \rr3, #1
59 orrvs lr, #\nflags
60 cmn \rr1, #1 @ adds ...
61 cmnvc \rr2, #1
62 cmnvc \rr3, #1
63 orrvs lr, #\pflags
64.endm
65
17ed0d69 66@ get gteIR|123 flags from gteMAC|123
67@ in: rr 123 as gteMAC|123
5d8e3bf8 68@ trash: r2,r3
69.macro do_irs_flags rr1 rr2 rr3
17ed0d69 70 add r2, \rr1, #0x8000
71 add r3, \rr2, #0x8000
72 lsrs r2, #16
73 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
74 lsrs r3, #16
75 add r2, \rr3, #0x8000
76 orrne lr, #(1<<31)
77 orrne lr, #(1<<23) @ IR2/limB2
78 lsrs r2, #16
79 orrne lr, #(1<<22) @ IR3/limB3
8cfbda97 80.endm
81
82
83/*
5d8e3bf8 84 * RTPS/RTPT register map:
85 *
8cfbda97 86 * q | d | c code / phase 1 phase 2 scratch
87 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
88 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
89 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
90 * 3 * gteIR1-3 = gteIR1-3 /
91 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
92 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
93 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
94 * 7 0 gteDQB [s64] max gteMAC|12
95 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
96 * 9 * / gteMAC3 max gteIR|123
97 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
98 * 11 0 quotient 3
99 * 6 12 gteH (adj. for cmp)
100 * 13 gteH (float for div)
101 * ... <scratch>
102 * 15 30 0
103 * 31 0
104 */
8cfbda97 105
5d8e3bf8 106@ load gteR*, gteTR* and gteH (see map above), clear q15
107@ in: r0 - context
108@ trash: r3
109.macro rtpx_preload
8cfbda97 110 add r3, r0, #4*32
111 vldmia r3, {d0-d2} @ gteR* [16*9]
4cc3050c 112 vmov.i32 q15, #0
8cfbda97 113 add r3, r0, #4*(32+5)
114 vldmia r3, {d4-d5} @ gteTR*
4cc3050c 115 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
116 vext.16 d1, d0, d1, #3 @ xx32 -> x321
8cfbda97 117 add r3, r0, #4*(32+26)
118 vld1.32 d11[0], [r3] @ gteH
5d8e3bf8 119 vshll.s32 q3, d5, #12 @ gteTRZ
120 vshll.s32 q2, d4, #12 @ gteTR|XY
8cfbda97 121 vmovl.s16 q6, d11 @ gteH
5d8e3bf8 122.endm
8cfbda97 123
5d8e3bf8 124@ do RTP* gteMAC* calculation
125@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
126@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
127@ trash: d16-d21
128.macro rtpx_mac
8cfbda97 129 vmull.s16 q8, d0, d8
130 vmull.s16 q9, d1, d8
131 vmull.s16 q10, d2, d8
132 vpaddl.s32 q8, q8
133 vpaddl.s32 q9, q9
134 vpaddl.s32 q10, q10
135 vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
136 vadd.s64 d18, d19 @ d8[3]==0, so won't affect
137 vadd.s64 d20, d21 @ QC
138 vadd.s64 d16, d4
139 vadd.s64 d18, d5
140 vadd.s64 d20, d6
141 vqshrn.s64 d8, q8, #12 @ gteMAC1
142 vqshrn.s64 d18, q9, #12 @ gteMAC2
143 vqshrn.s64 d9, q10, #12 @ gteMAC3
144 vsli.u64 d8, d18, #32 @ gteMAC|12
145 vmov.32 d9[1], r12
5d8e3bf8 146 vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
147.endm
148
5c6457c3 149FUNCTION(gteRTPS_neon): @ r0=CP2 (d,c),
17ed0d69 150 push {r4-r6,lr}
5d8e3bf8 151
152@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
c67af2ac 153 ldr_scratch r1
5d8e3bf8 154 mov r12, #0
155
4cc3050c 156 vldmia r0, {d8} @ VXYZ(0)
5d8e3bf8 157 rtpx_preload
158
4cc3050c 159@ rtpx_mac @ slower here, faster in RTPT?
5d8e3bf8 160 vmov.16 d8[3], r12 @ kill unused upper vector
4cc3050c 161 vmull.s16 q8, d0, d8
162 vmull.s16 q9, d1, d8
163 vmull.s16 q10, d2, d8
164 vpadd.s32 d16, d16, d17
165 vpadd.s32 d17, d18, d19
166 vpadd.s32 d18, d20, d21
167 vpadal.s32 q2, q8
168 vpadal.s32 q3, q9 @ d6, d18 is slow?
169 vqshrn.s64 d8, q2, #12 @ gteMAC|12
170 vqshrn.s64 d9, q3, #12 @ gteMAC3
5d8e3bf8 171
172 add r3, r0, #4*25
173 vst1.32 d8, [r3]!
174 vst1.32 d9[0], [r3] @ wb gteMAC|123
4cc3050c 175 vqmovn.s32 d10, q4 @ gteIR|123
5d8e3bf8 176
177 add r3, r0, #4*17 @ gteSZ*
178 vldmia r3, {q7} @ d14,d15 gteSZ|123x
179 vmov.i32 d28, #0xffff @ 0xffff[32]
180 vmax.s32 d11, d9, d31
181 vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp)
182 vmov.i32 d26, #1
183 vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3
4cc3050c 184 vmovl.s16 q9, d10 @ || expand gteIR|123
5d8e3bf8 185 vshl.u32 d13, d12, #16 @ | preparing gteH
186 add r3, r0, #4*9
187 vst1.32 d18, [r3]!
188 vst1.32 d19[0], [r3]
189
190 vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7
191 vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3?
192
193 add r3, r0, #4*(32+24)
194 vld1.32 d4, [r3] @ || gteOF|XY
195 add r3, r0, #4*(32+27)
196 vld1.32 d6, [r3] @ || gteDQ|AB
197
198 vand d11, d16
199 vmovl.s32 q2, d4 @ || gteOF|XY [64]
200 vmax.u32 d11, d26 @ make divisor 1 if not
201 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
202 add r3, r0, #4*16 @ | gteSZ*
203 vstmia r3, {q7} @ | d14,d15 gteSZ|123x
204
205 vcvt.f32.u32 d13, d13 @ gteH (float for div)
206 vcvt.f32.u32 d11, d11 @ divisor
207
208 @ divide.. it's not worth messing with reciprocals here
209 @ just for 1 value, let's just use VFP divider here
210 vdiv.f32 s22, s26, s22
211
02455d0d 212 vmov.f32 d20, #0.5
213 vadd.f32 d11, d20
5d8e3bf8 214 vcvt.u32.f32 d11, d11 @ quotient
215
216 @ while NEON's busy we calculate some flags on ARM
217 add r3, r0, #4*25
218 mov lr, #0 @ gteFLAG
219 ldmia r3, {r4-r6} @ gteMAC|123
220
4cc3050c 221 vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE)
5d8e3bf8 222 vqshl.u32 d11, #15
223
224 do_mac_flags r4, r5, r6
225
226 vshr.u32 d11, #15 @ quotient (limE)
5d8e3bf8 227
5d8e3bf8 228 do_irs_flags r4, r5, r6
229
230 vmlal.s32 q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient
231 add r3, r0, #4*13
232 vld1.32 d16, [r3] @ || load fS|XY12, new 01
233 vqmovn.s64 d18, q2 @ saturate to 32
234 vmull.s32 q10, d6, d11[0]@ | d20 = gteDQA * quotient
235 vqshl.s32 d19, d18, #5 @ 11bit precision
236
237 ldr r4, [r1] @ quotient
5d8e3bf8 238 movs r3, r6, lsr #16
239 orrne lr, #(1<<31)
240 orrne lr, #(1<<18) @ fSZ (limD)
241
17ed0d69 242 vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG
243
5d8e3bf8 244 vshr.s32 d18, d19, #16+5@ can't vqshrn because of insn
245 vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient
246 vmovn.s32 d18, q9 @ fS|XY2 [s16]
247
0e828e88 248 vqmovn.s64 d20, q10 @ | gteMAC0
5d8e3bf8 249 add r3, r0, #4*12
250 vst1.32 d16, [r3]! @ writeback fS|XY01
251 vst1.32 d18[0], [r3] @ ...2
252 add r3, r0, #4*24
0e828e88 253 vshr.s32 d21, d20, #12
5d8e3bf8 254 vst1.32 d20[0], [r3] @ gteMAC0
255
17ed0d69 256 movs r4, r4, lsr #17
257 orrne lr, #(1<<31)
258 orrne lr, #(1<<17) @ limE
259
0e828e88 260 vmax.s32 d21, d31
5d8e3bf8 261 vmov.i32 d22, #0x1000
262 vmin.s32 d21, d22
263 add r3, r0, #4*8
264 vst1.16 d21[0], [r3] @ gteIR0
265
17ed0d69 266 ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat
267 add r2, r4, #0x400<<16
268 add r3, r5, #0x400<<16
269 lsrs r2, #16+11
270 orrne lr, #(1<<14) @ limG1
271 orrne lr, #(1<<31)
272 lsrs r3, #16+11
273 orrne lr, #(1<<13) @ limG2
5d8e3bf8 274 orrne lr, #(1<<31)
5d8e3bf8 275 adds r2, r4, #1
a53073ec 276 addsvc r3, r5, #1
5d8e3bf8 277 orrvs lr, #(1<<16) @ F
278 orrvs lr, #(1<<31)
279 subs r2, r4, #1
a53073ec 280 subsvc r3, r5, #1
5d8e3bf8 281 orrvs lr, #(1<<31)
282
283 ldr r4, [r0, #4*24] @ gteMAC0
284 orrvs lr, #(1<<15)
285
286 adds r3, r4, #1
287 orrvs lr, #(1<<16) @ F
288 orrvs lr, #(1<<31)
289 subs r2, r4, #1
290 orrvs lr, #(1<<15) @ F
291 orrvs lr, #(1<<31)
292 cmp r4, #0x1000
293 orrhi lr, #(1<<12) @ limH
294
295 str lr, [r0, #4*(32+31)] @ gteFLAG
296
17ed0d69 297 pop {r4-r6,pc}
5d8e3bf8 298 .size gteRTPS_neon, .-gteRTPS_neon
299
300
301
5c6457c3 302FUNCTION(gteRTPT_neon): @ r0=CP2 (d,c),
5d8e3bf8 303 push {r4-r11,lr}
304
c67af2ac 305 ldr_scratch r1
5d8e3bf8 306 mov r12, #0
307
308 rtpx_preload
309
310 vmov.i32 d22, #0x7fffffff
311 vmov.i32 d23, #0x80000000
312 mov r3, #3 @ counter
313 mov r2, r0 @ VXYZ(0)
3140:
315 vldmia r2!, {d8} @ VXYZ(v)
316 vmov.16 d8[3], r12 @ kill unused upper vector
317
318 rtpx_mac
8cfbda97 319 vmin.s32 d22, d8 @ min gteMAC|12
320 vmax.s32 d23, d8 @ max gteMAC|12
321 subs r3, #1
4cc3050c 322 vst1.32 {d9,d10}, [r1, :128]!
8cfbda97 323 bgt 0b
324
4cc3050c 325 vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags
8cfbda97 326
327 @ - phase2 -
328 sub r1, r1, #8*2*4
329 vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY
330
331 vmov d20, d0 @ gteMAC3 v=0
332 vmin.s16 d24, d1, d3 @ | find min IR
5d8e3bf8 333 vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp)
8cfbda97 334 vmax.s16 d25, d1, d3 @ | .. also max, for flag gen
335 vsli.u64 d20, d2, #32 @ gteMAC3 v=1
336 vmov d21, d9 @ ... v=2
337
338 vmov.i32 q14, #0xffff @ 0xffff[32]
339 vmax.s32 q10, q15
340 vmov.i32 q13, #1
341 vdup.32 q11, d22[0] @ gteH/2
5d8e3bf8 342 vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v)
8cfbda97 343 vmin.s16 d24, d10 @ | find min/max IR
344 vmax.s16 d25, d10 @ |
345
5d8e3bf8 346 add r3, r0, #4*19 @ ||
347 vld1.32 d14[0], [r3] @ || gteSZ3
348
8cfbda97 349 vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)?
350 add r3, r0, #4*17
351 vst1.32 d20, [r3]! @ | writeback fSZ(v)
352 vand q11, q10, q11
353 vst1.32 d21[0], [r3] @ |
354 vmax.u32 q10, q11, q13 @ make divisor 1 if not
355 add r3, r1, #8*8
356 vstmia r3, {q12} @ min/max IR for flags
357 vcvt.f32.u32 q10, q10
358 vshl.u32 d13, d12, #16 @ | preparing gteH
359
360 @ while NEON's busy we calculate some flags on ARM
361 add r2, r1, #8*2*3
362 mov lr, #0 @ gteFLAG
363 ldmia r2, {r4-r7} @ min/max gteMAC|12
364 subs r2, r4, #1
365 orrvs lr, #(1<<31)|(1<<27)
366 subs r3, r5, #1
367 orrvs lr, #(1<<31)|(1<<26)
368 adds r2, r6, #1
369 orrvs lr, #(1<<30)
370 adds r3, r7, #1
371 orrvs lr, #(1<<29)
372 ldr r4, [r1, #0] @ gteMAC3 v=0
373 ldr r5, [r1, #8*2] @ ... v=1
374 ldr r6, [r1, #8*4] @ ... v=2
375
376 add r3, r0, #4*(32+24)
377 vld1.32 d4, [r3] @ || gteOF|XY
378 add r3, r0, #4*(32+27)
5d8e3bf8 379 vld1.32 d6, [r3] @ || gteDQ|AB
8cfbda97 380
381 @ divide
382.if 1
383 vrecpe.f32 q11, q10 @ inv
5d8e3bf8 384 vmovl.s32 q2, d4 @ || gteOF|XY [64]
385 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
8cfbda97 386 vrecps.f32 q12, q10, q11 @ step
387 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
4706bbe4 388 vmov.f32 q8, #0.5 @ |||
8cfbda97 389 vmul.f32 q11, q12, q11 @ better inv
5d8e3bf8 390 add r3, r0, #4*16
391 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
8cfbda97 392 vdup.32 q13, d13[0] @ |
4706bbe4 393@ vrecps.f32 q12, q10, q11 @ step
394@ vmul.f32 q11, q12, q11 @ better inv
8cfbda97 395 vmul.f32 q10, q13, q11 @ result
396.else
4706bbe4 397 vmov.f32 q8, #0.5 @ |||
5d8e3bf8 398 vmovl.s32 q2, d4 @ || gteOF|XY [64]
399 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
8cfbda97 400 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
401 vdup.32 q13, d13[0] @ |
5d8e3bf8 402 add r3, r0, #4*16
403 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
8cfbda97 404
405 vpush {q0}
406 vmov q0, q10 @ to test against C code
407 vdiv.f32 s0, s26, s0
408 vdiv.f32 s1, s26, s1
409 vdiv.f32 s2, s26, s2
410 vmov q10, q0
411 vpop {q0}
412.endif
413
5d8e3bf8 414 do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<27) @ MAC3
8cfbda97 415 orr r7, r4, r5
416 add r4, r1, #8*8
417 orr r3, r7, r6
418 ldmia r4, {r7,r8,r10,r11} @ min/max IR
419
420 movs r3, r3, lsr #16
421 orrne lr, #(1<<31)
422 orrne lr, #(1<<18) @ fSZ (limD)
423
4706bbe4 424 vadd.f32 q10, q8 @ adjust for vcvt rounding mode
8cfbda97 425 vcvt.u32.f32 q8, q10
426 vmovl.s16 q9, d1 @ expand gteIR|12 v=0
427 vmovl.s16 q10, d3 @ expand gteIR|12 v=1
428 add r6, r1, #8*10
429 vstmia r6, {q8} @ wb quotients for flags (pre-limE)
430 vqshl.u32 q8, #15
431 vmovl.s16 q11, d10 @ expand gteIR|12 v=2
432 vshr.u32 q8, #15 @ quotients (limE)
433 vdup.32 d24, d16[0]
434 vdup.32 d25, d16[1]
435 vdup.32 d26, d17[0] @ quotient (dup)
436
5d8e3bf8 437 @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
438 mov r4, #0x10000
439 cmp r7, #1<<16
440 cmnvc r10, #1<<16
8cfbda97 441 orrvs lr, #(1<<31)
442 orrvs lr, #(1<<23) @ IR2/limB2
5d8e3bf8 443 rsbs r2, r4, r7, lsl #16
444 cmnvc r4, r10, lsl #16
8cfbda97 445 orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1
5d8e3bf8 446 rsbs r2, r4, r8, lsl #16
447 cmnvc r4, r11, lsl #16
8cfbda97 448 orrvs lr, #(1<<22) @ IR3/limB3
449
450 vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0
451 vmull.s32 q10, d20, d25 @ ... v=1
452 vmull.s32 q11, d22, d26 @ ... v=2
453 vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient
454 vadd.s64 q10, q2 @ ... v=1
455 vadd.s64 q11, q2 @ ... v=2
456 vqmovn.s64 d18, q9 @ saturate to 32 v=0
457 vqmovn.s64 d19, q10 @ ... v=1
458 vqmovn.s64 d20, q11 @ ... v=2
459 vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32]
460 vmax.s32 d15, d18, d19 @ || for flags
461 vmin.s32 d14, d20
462 vmax.s32 d15, d20
463 vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1
464 vqshl.s32 d24, d20, #5 @ ... v=2
465 vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2
5d8e3bf8 466 vpmin.s32 d16, d14, d31 @ || also find min/max in pair
467 vpmax.s32 d17, d15, d31 @ ||
8cfbda97 468 vshr.s32 q11, #16+5 @ can't vqshrn because of insn
469 vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :(
8cfbda97 470 vsli.u64 d16, d17, #32 @ || pack in-pair min/max
471 vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient
472 vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1
473 vmovn.s32 d13, q12 @ 3
474 vstmia r1, {d14-d16} @ || other cacheline than quotients
475 add r3, r0, #4*12
476 vst1.32 d12, [r3]! @ writeback fS|XY v=0,1
477 vst1.32 d13[0], [r3]
478
0e828e88 479 vqmovn.s64 d26, q13 @ | gteMAC0
8cfbda97 480 vmovl.u16 q5, d10 @ expand gteIR|123 v=2
481
482 vmov.i32 d13, #0x1000
0e828e88 483 vshr.s32 d12, d26, #12
8cfbda97 484
485 add r3, r0, #4*24
486 vst1.32 d26[0], [r3]! @ gteMAC0
0e828e88 487 vmax.s32 d12, d30
8cfbda97 488 vst1.32 d8, [r3]! @ gteMAC123 (last iteration)
489 vst1.32 d9[0], [r3]
490
491 vmin.s32 d12, d13 @ | gteIR0
492
8cfbda97 493 ldmia r6, {r4-r6} @ quotients
494 orr r4, r5
495 orr r4, r6
5d8e3bf8 496 add r3, r0, #4*8
8cfbda97 497 movs r4, r4, lsr #17
8cfbda97 498
8cfbda97 499 vst1.32 d12[0], [r3]! @ gteIR0
500 vst1.32 d10, [r3]! @ gteIR12
501 vst1.32 d11[0], [r3] @ ..3
502
17ed0d69 503 @ ~23 cycles
5d8e3bf8 504 orrne lr, #(1<<31) @ limE
505 orrne lr, #(1<<17) @ limE
8cfbda97 506 ldmia r1, {r4-r9}
17ed0d69 507 add r2, r4, #0x400<<16 @ min fSX
508 add r3, r6, #0x400<<16 @ max fSX
509 lsrs r2, #16+11
a53073ec 510 lsrseq r3, #16+11
17ed0d69 511 orrne lr, #(1<<31) @ limG1
512 orrne lr, #(1<<14)
513 add r2, r5, #0x400<<16 @ min fSY
514 add r3, r7, #0x400<<16 @ max fSY
515 lsrs r2, #16+11
a53073ec 516 lsrseq r3, #16+11
17ed0d69 517 orrne lr, #(1<<31) @ limG2
518 orrne lr, #(1<<13)
8cfbda97 519 adds r2, r9, #1
17ed0d69 520 orrvs lr, #(1<<16) @ F (31 already done by above)
8cfbda97 521 subs r3, r8, #1
8cfbda97 522
523 ldr r4, [r0, #4*24] @ gteMAC0
524 orrvs lr, #(1<<15)
525
526 adds r3, r4, #1
527 orrvs lr, #(1<<16)
528 orrvs lr, #(1<<31) @ F
529 subs r2, r4, #1
530 orrvs lr, #(1<<15)
531 orrvs lr, #(1<<31) @ F
532 cmp r4, #0x1000
5d8e3bf8 533 orrhi lr, #(1<<12) @ limH
8cfbda97 534
535 str lr, [r0, #4*(32+31)] @ gteFLAG
536
537 pop {r4-r11,pc}
5d8e3bf8 538 .size gteRTPT_neon, .-gteRTPT_neon
539
540
541
054175e9 542@ note: non-std calling convention used
543@ r0 = CP2 (d,c) (must preserve)
544@ r1 = op
545@ r4,r5 = VXYZ(v) packed
546@ r6 = &MX11(mx)
547@ r7 = &CV1(cv)
5c6457c3 548FUNCTION(gteMVMVA_part_neon):
17ed0d69 549 uxth r5, r5
550 vmov.32 d8[0], r4
551 vmov.32 d8[1], r5 @ VXYZ(v)
054175e9 552 vldmia r6, {d0-d2} @ MXxy/gteR* [16*9]
553 vldmia r7, {d4-d5} @ CVx/gteTR*
17ed0d69 554
17ed0d69 555 vmov.i32 q15, #0
556 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
557 vext.16 d1, d0, d1, #3 @ xx32 -> x321
558 vshll.s32 q3, d5, #12 @ gteTRZ/CV3
559 vshll.s32 q2, d4, #12 @ gteTR|XY/CV12
560
561 vmull.s16 q8, d0, d8
562 vmull.s16 q9, d1, d8
563 vmull.s16 q10, d2, d8
564 vpadd.s32 d16, d16, d17
565 vpadd.s32 d17, d18, d19
566 vpadd.s32 d18, d20, d21
567 vpadal.s32 q2, q8
568 vpadal.s32 q3, q9
569 tst r1, #1<<19
570 beq 0f
571 vshr.s64 q2, q2, #12
572 vshr.s64 q3, q3, #12
5730:
574 vqmovn.s64 d8, q2 @ gteMAC|12
575 vqmovn.s64 d9, q3 @ gteMAC3
576
577 tst r1, #1<<10
578 add r3, r0, #4*25
579 vqmovn.s32 d10, q4 @ gteIR|123
580 vst1.32 d8, [r3]!
581 vst1.32 d9[0], [r3] @ wb gteMAC|123
582
583 beq 0f
584 vmax.s16 d10, d31
5850:
586 vmovl.s16 q9, d10 @ expand gteIR|123
587 add r3, r0, #4*9
588 vst1.32 d18, [r3]!
589 vst1.32 d19[0], [r3]
054175e9 590 bx lr
591 .size gteMVMVA_part_neon, .-gteMVMVA_part_neon
17ed0d69 592
054175e9 593
594@ get flags after gteMVMVA_part_neon operation
5c6457c3 595FUNCTION(gteMACtoIR_flags_neon): @ r0=CP2 (d,c), r1=lm
054175e9 596 push {r4,r5,lr}
597 tst r1, r1 @ lm
17ed0d69 598 mov lr, #0 @ gteFLAG
054175e9 599 mov r2, #0
17ed0d69 600 mov r12, #15
601 moveq r2, #0x8000 @ adj
602 moveq r12, #16 @ shift
603
604 add r3, r0, #4*25
605 ldmia r3, {r3-r5} @ gteMAC|123
606
607 do_mac_flags r3, r4, r5
608
609 add r3, r2
610 add r4, r2
611 add r5, r2
612 asrs r3, r12
613 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
614 asrs r4, r12
615 orrne lr, #(1<<31)
616 orrne lr, #(1<<23) @ IR2/limB2
617 asrs r5, r12
618 orrne lr, #(1<<22) @ IR3/limB3
619 str lr, [r0, #4*(32+31)] @ gteFLAG
620
054175e9 621 pop {r4,r5,pc}
622 .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon
17ed0d69 623
624
625
8cfbda97 626@ vim:filetype=armasm