/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */


.syntax unified

.bss
.align 6 @ cacheline

scratch:
.rept 8*8*2/4
    .word 0
.endr

.text
.align 2

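@ load the address of the scratch area above into \rd; movw/movt embed
@ an absolute address in the code, so the PIC build uses a literal-pool
@ load instead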
.macro ldr_scratch rd
#ifndef __PIC__
    movw \rd, #:lower16:scratch
    movt \rd, #:upper16:scratch
#else
    ldr \rd, =scratch
#endif
.endm

@ XXX: gteMAC calc shouldn't be saturating, but it is here

@ approximate gteMAC|123 flags
@ in: rr 123 as gteMAC|123
@ trash: nothing
.macro do_mac_flags rr1 rr2 rr3
    cmp \rr1, #1
    orrvs lr, #(1<<31)|(1<<27)
    cmp \rr2, #1
    orrvs lr, #(1<<31)|(1<<26)
    cmp \rr3, #1
    orrvs lr, #(1<<31)|(1<<25)
    cmn \rr1, #1 @ same as adds ...
    orrvs lr, #(1<<30)
    cmn \rr2, #1
    orrvs lr, #(1<<29)
    cmn \rr3, #1
    orrvs lr, #(1<<28)
.endm
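
@ For reference, a hedged C sketch of the approximation above: the NEON
@ code has already saturated gteMAC1-3 to 32 bits (vqshrn), so a value
@ pinned at INT32_MIN/INT32_MAX is taken to mean the 44-bit result
@ overflowed.  Illustrative only, not the emulator's C API:
/*
#include <stdint.h>

static uint32_t mac123_flags_approx(int32_t m1, int32_t m2, int32_t m3)
{
    uint32_t fl = 0;
    // cmp rr, #1 sets V only for INT32_MIN (negative saturation)
    if (m1 == INT32_MIN) fl |= (1u << 31) | (1u << 27);
    if (m2 == INT32_MIN) fl |= (1u << 31) | (1u << 26);
    if (m3 == INT32_MIN) fl |= (1u << 31) | (1u << 25);
    // cmn rr, #1 sets V only for INT32_MAX (positive saturation)
    if (m1 == INT32_MAX) fl |= 1u << 30;
    if (m2 == INT32_MAX) fl |= 1u << 29;
    if (m3 == INT32_MAX) fl |= 1u << 28;
    return fl;
}
*/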

@ approximate 3x gteMACn flags
@ in: rr 123 as 3 instances gteMACn, *flags
@ trash: nothing
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    cmp \rr1, #1
    cmpvc \rr2, #1
    cmpvc \rr3, #1
    orrvs lr, #\nflags
    cmn \rr1, #1 @ adds ...
    cmnvc \rr2, #1
    cmnvc \rr3, #1
    orrvs lr, #\pflags
.endm

@ get gteIR|123 flags from gteMAC|123
@ in: rr 123 as gteMAC|123
@ trash: r2,r3
.macro do_irs_flags rr1 rr2 rr3
    add r2, \rr1, #0x8000
    add r3, \rr2, #0x8000
    lsrs r2, #16
    orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
    lsrs r3, #16
    add r2, \rr3, #0x8000
    orrne lr, #(1<<31)
    orrne lr, #(1<<23) @ IR2/limB2
    lsrs r2, #16
    orrne lr, #(1<<22) @ IR3/limB3
.endm
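
@ Hedged C sketch of the range test above: adding 0x8000 and checking
@ the upper 16 bits flags any gteMAC value outside the signed 16-bit
@ IR range -0x8000..0x7fff.  Illustrative only:
/*
#include <stdint.h>

static uint32_t ir123_flags_approx(int32_t m1, int32_t m2, int32_t m3)
{
    uint32_t fl = 0;
    // (m + 0x8000) fits in 16 bits iff m is in [-0x8000, 0x7fff]
    if (((uint32_t)m1 + 0x8000) >> 16) fl |= (1u << 31) | (1u << 24); // IR1/limB1
    if (((uint32_t)m2 + 0x8000) >> 16) fl |= (1u << 31) | (1u << 23); // IR2/limB2
    if (((uint32_t)m3 + 0x8000) >> 16) fl |= 1u << 22;                // IR3/limB3
    return fl;
}
*/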


/*
 * RTPS/RTPT register map:
 *
 *  q | d | c code / phase 1    phase 2             scratch
 *  0   0   gteR1* [s16]        gteMAC3  = gteMAC3  \ v=0 *
 *      1   gteR2*              gteIR1-3 = gteIR1-3 /      *
 *  1   2   gteR3*              gteMAC3  = gteMAC3  \ v=1
 *      3   *                   gteIR1-3 = gteIR1-3 /
 *  2   4   gteTRX<<12 [s64]    gteOFX [s64]        gteMAC3  \ v=2
 *      5   gteTRY<<12          gteOFY [s64]        gteIR1-3 /
 *  3   6   gteTRZ<<12          gteDQA [s64]        min gteMAC|12 v=012
 *      7   0                   gteDQB [s64]        max gteMAC|12
 *  4   8   VXYZ(v)        /    gteMAC1,2 [s32]     min gteIR|123
 *      9   *              /    gteMAC3             max gteIR|123
 *  5  10   gteIR1-3 [s16]      gteIR1-3 v=2        quotients 12
 *     11   0                                       quotient 3
 *  6  12   gteH (adj. for cmp)
 *     13   gteH (float for div)
 * ...      <scratch>
 * 15  30   0
 *     31   0
 */

@ load gteR*, gteTR* and gteH (see map above), clear q15
@ in: r0 - context
@ trash: r3
.macro rtpx_preload
    add r3, r0, #4*32
    vldmia r3, {d0-d2} @ gteR* [16*9]
    vmov.i32 q15, #0
    add r3, r0, #4*(32+5)
    vldmia r3, {d4-d5} @ gteTR*
    vext.16 d2, d1, d2, #2 @ xxx3 -> x321
    vext.16 d1, d0, d1, #3 @ xx32 -> x321
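    @ after the two vexts, d0-d2 each hold one row of gteR in lanes 0-2
    @ (lane 3 is unrelated data, masked out by keeping VXYZ lane 3 zero)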
    add r3, r0, #4*(32+26)
    vld1.32 d11[0], [r3] @ gteH
    vshll.s32 q3, d5, #12 @ gteTRZ
    vshll.s32 q2, d4, #12 @ gteTR|XY
    vmovl.s16 q6, d11 @ gteH
.endm

@ do RTP* gteMAC* calculation
@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
@ trash: d16-d21
.macro rtpx_mac
    vmull.s16 q8, d0, d8
    vmull.s16 q9, d1, d8
    vmull.s16 q10, d2, d8
    vpaddl.s32 q8, q8
    vpaddl.s32 q9, q9
    vpaddl.s32 q10, q10
    vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
    vadd.s64 d18, d19 @ d8[3]==0, so won't affect
    vadd.s64 d20, d21 @ QC
    vadd.s64 d16, d4
    vadd.s64 d18, d5
    vadd.s64 d20, d6
    vqshrn.s64 d8, q8, #12 @ gteMAC1
    vqshrn.s64 d18, q9, #12 @ gteMAC2
    vqshrn.s64 d9, q10, #12 @ gteMAC3
    vsli.u64 d8, d18, #32 @ gteMAC|12
    vmov.32 d9[1], r12
    vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
.endm
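
@ In effect this is the sf=1 MAC step of RTPS/RTPT.  A hedged C sketch,
@ assuming the row/TR layout left by rtpx_preload (the saturation below
@ mirrors what the NEON code does, see the XXX note above; names are
@ illustrative):
/*
#include <stdint.h>

static int32_t sat32(int64_t v)
{ return v > INT32_MAX ? INT32_MAX : v < INT32_MIN ? INT32_MIN : (int32_t)v; }
static int16_t sat16(int32_t v)
{ return v > INT16_MAX ? INT16_MAX : v < INT16_MIN ? INT16_MIN : (int16_t)v; }

static void rtpx_mac_ref(const int16_t R[3][3], const int64_t TR12[3],
                         const int16_t V[3], int32_t MAC[3], int16_t IR[3])
{
    for (int i = 0; i < 3; i++) {
        int64_t m = TR12[i];                  // gteTR* pre-shifted left by 12
        for (int j = 0; j < 3; j++)
            m += (int64_t)R[i][j] * V[j];
        MAC[i] = sat32(m >> 12);              // vqshrn.s64 #12
        IR[i]  = sat16(MAC[i]);               // vqmovn.s32
    }
}
*/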

.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push {r4-r6,lr}

@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
    ldr_scratch r1
    mov r12, #0

    vldmia r0, {d8} @ VXYZ(0)
    rtpx_preload

@ rtpx_mac @ slower here, faster in RTPT?
    vmov.16 d8[3], r12 @ kill unused upper vector
    vmull.s16 q8, d0, d8
    vmull.s16 q9, d1, d8
    vmull.s16 q10, d2, d8
    vpadd.s32 d16, d16, d17
    vpadd.s32 d17, d18, d19
    vpadd.s32 d18, d20, d21
    vpadal.s32 q2, q8
    vpadal.s32 q3, q9 @ d6, d18 is slow?
    vqshrn.s64 d8, q2, #12 @ gteMAC|12
    vqshrn.s64 d9, q3, #12 @ gteMAC3

    add r3, r0, #4*25
    vst1.32 d8, [r3]!
    vst1.32 d9[0], [r3] @ wb gteMAC|123
    vqmovn.s32 d10, q4 @ gteIR|123

    add r3, r0, #4*17 @ gteSZ*
    vldmia r3, {q7} @ d14,d15 gteSZ|123x
    vmov.i32 d28, #0xffff @ 0xffff[32]
    vmax.s32 d11, d9, d31
    vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp)
    vmov.i32 d26, #1
    vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16 q9, d10 @ || expand gteIR|123
    vshl.u32 d13, d12, #16 @ | preparing gteH
    add r3, r0, #4*9
    vst1.32 d18, [r3]!
    vst1.32 d19[0], [r3]

    vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7
    vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3?

    add r3, r0, #4*(32+24)
    vld1.32 d4, [r3] @ || gteOF|XY
    add r3, r0, #4*(32+27)
    vld1.32 d6, [r3] @ || gteDQ|AB

    vand d11, d16
    vmovl.s32 q2, d4 @ || gteOF|XY [64]
    vmax.u32 d11, d26 @ make divisor 1 if not
    vmovl.s32 q3, d6 @ || gteDQ|AB [64]
    add r3, r0, #4*16 @ | gteSZ*
    vstmia r3, {q7} @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13 @ gteH (float for div)
    vcvt.f32.u32 d11, d11 @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32 s22, s26, s22

    vmov.f32 d20, #0.5
    vadd.f32 d11, d20
    vcvt.u32.f32 d11, d11 @ quotient
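    @ d11[0] now holds the quotient: roughly (gteH << 16) / fSZ3 with
    @ +0.5 rounding (the divisor was forced to 1 when gteH/2 >= fSZ3,
    @ i.e. the limE overflow case); the vqshl/vshr #15 pair below
    @ clamps it to 0x1ffff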

    @ while NEON's busy we calculate some flags on ARM
    add r3, r0, #4*25
    mov lr, #0 @ gteFLAG
    ldmia r3, {r4-r6} @ gteMAC|123

    vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE)
    vqshl.u32 d11, #15

    do_mac_flags r4, r5, r6

    vshr.u32 d11, #15 @ quotient (limE)

    do_irs_flags r4, r5, r6

    vmlal.s32 q2, d18, d11[0] @ gteOF|XY + gteIR|12 * quotient
    add r3, r0, #4*13
    vld1.32 d16, [r3] @ || load fS|XY12, new 01
    vqmovn.s64 d18, q2 @ saturate to 32
    vmull.s32 q10, d6, d11[0] @ | d20 = gteDQA * quotient
    vqshl.s32 d19, d18, #5 @ 11bit precision

    ldr r4, [r1] @ quotient
    movs r3, r6, lsr #16
    orrne lr, #(1<<31)
    orrne lr, #(1<<18) @ fSZ (limD)

    vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG

    vshr.s32 d18, d19, #16+5 @ can't vqshrn because of insn
    vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient
    vmovn.s32 d18, q9 @ fS|XY2 [s16]

    vqmovn.s64 d20, q10 @ | gteMAC0
    add r3, r0, #4*12
    vst1.32 d16, [r3]! @ writeback fS|XY01
    vst1.32 d18[0], [r3] @ ...2
    add r3, r0, #4*24
    vshr.s32 d21, d20, #12
    vst1.32 d20[0], [r3] @ gteMAC0

    movs r4, r4, lsr #17
    orrne lr, #(1<<31)
    orrne lr, #(1<<17) @ limE

    vmax.s32 d21, d31
    vmov.i32 d22, #0x1000
    vmin.s32 d21, d22
    add r3, r0, #4*8
    vst1.16 d21[0], [r3] @ gteIR0

    ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat
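    @ limG check: r4,r5 hold fSX2,fSY2 still scaled by 2^16, so the
    @ 16+11 shifts below test for values outside -0x400..+0x3ff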
    add r2, r4, #0x400<<16
    add r3, r5, #0x400<<16
    lsrs r2, #16+11
    orrne lr, #(1<<14) @ limG1
    orrne lr, #(1<<31)
    lsrs r3, #16+11
    orrne lr, #(1<<13) @ limG2
    orrne lr, #(1<<31)
    adds r2, r4, #1
    addsvc r3, r5, #1
    orrvs lr, #(1<<16) @ F
    orrvs lr, #(1<<31)
    subs r2, r4, #1
    subsvc r3, r5, #1
    orrvs lr, #(1<<31)

    ldr r4, [r0, #4*24] @ gteMAC0
    orrvs lr, #(1<<15)

    adds r3, r4, #1
    orrvs lr, #(1<<16) @ F
    orrvs lr, #(1<<31)
    subs r2, r4, #1
    orrvs lr, #(1<<15) @ F
    orrvs lr, #(1<<31)
    cmp r4, #0x1000
    orrhi lr, #(1<<12) @ limH

    str lr, [r0, #4*(32+31)] @ gteFLAG

    pop {r4-r6,pc}
    .size gteRTPS_neon, .-gteRTPS_neon



.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push {r4-r11,lr}

    ldr_scratch r1
    mov r12, #0

    rtpx_preload

    vmov.i32 d22, #0x7fffffff
    vmov.i32 d23, #0x80000000
    mov r3, #3 @ counter
    mov r2, r0 @ VXYZ(0)
0:
    vldmia r2!, {d8} @ VXYZ(v)
    vmov.16 d8[3], r12 @ kill unused upper vector

    rtpx_mac
    vmin.s32 d22, d8 @ min gteMAC|12
    vmax.s32 d23, d8 @ max gteMAC|12
    subs r3, #1
    vst1.32 {d9,d10}, [r1, :128]!
    bgt 0b

    vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub r1, r1, #8*2*4
    vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY

    vmov d20, d0 @ gteMAC3 v=0
    vmin.s16 d24, d1, d3 @ | find min IR
    vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp)
    vmax.s16 d25, d1, d3 @ | .. also max, for flag gen
    vsli.u64 d20, d2, #32 @ gteMAC3 v=1
    vmov d21, d9 @ ... v=2

    vmov.i32 q14, #0xffff @ 0xffff[32]
    vmax.s32 q10, q15
    vmov.i32 q13, #1
    vdup.32 q11, d22[0] @ gteH/2
    vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16 d24, d10 @ | find min/max IR
    vmax.s16 d25, d10 @ |

    add r3, r0, #4*19 @ ||
    vld1.32 d14[0], [r3] @ || gteSZ3

    vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)?
    add r3, r0, #4*17
    vst1.32 d20, [r3]! @ | writeback fSZ(v)
    vand q11, q10, q11
    vst1.32 d21[0], [r3] @ |
    vmax.u32 q10, q11, q13 @ make divisor 1 if not
    add r3, r1, #8*8
    vstmia r3, {q12} @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32 d13, d12, #16 @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add r2, r1, #8*2*3
    mov lr, #0 @ gteFLAG
    ldmia r2, {r4-r7} @ min/max gteMAC|12
    subs r2, r4, #1
    orrvs lr, #(1<<31)|(1<<27)
    subs r3, r5, #1
    orrvs lr, #(1<<31)|(1<<26)
    adds r2, r6, #1
    orrvs lr, #(1<<30)
    adds r3, r7, #1
    orrvs lr, #(1<<29)
    ldr r4, [r1, #0] @ gteMAC3 v=0
    ldr r5, [r1, #8*2] @ ... v=1
    ldr r6, [r1, #8*4] @ ... v=2

    add r3, r0, #4*(32+24)
    vld1.32 d4, [r3] @ || gteOF|XY
    add r3, r0, #4*(32+27)
    vld1.32 d6, [r3] @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32 q11, q10 @ inv
    vmovl.s32 q2, d4 @ || gteOF|XY [64]
    vmovl.s32 q3, d6 @ || gteDQ|AB [64]
    vrecps.f32 q12, q10, q11 @ step
    vcvt.f32.u32 d13, d13 @ | gteH (float for div)
    vmov.f32 q8, #0.5 @ |||
    vmul.f32 q11, q12, q11 @ better inv
    add r3, r0, #4*16
    vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
    vdup.32 q13, d13[0] @ |
@ vrecps.f32 q12, q10, q11 @ step
@ vmul.f32 q11, q12, q11 @ better inv
    vmul.f32 q10, q13, q11 @ result
.else
    vmov.f32 q8, #0.5 @ |||
    vmovl.s32 q2, d4 @ || gteOF|XY [64]
    vmovl.s32 q3, d6 @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13 @ | gteH (float for div)
    vdup.32 q13, d13[0] @ |
    add r3, r0, #4*16
    vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3

    vpush {q0}
    vmov q0, q10 @ to test against C code
    vdiv.f32 s0, s26, s0
    vdiv.f32 s1, s26, s1
    vdiv.f32 s2, s26, s2
    vmov q10, q0
    vpop {q0}
.endif
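
@ Hedged C sketch of the .if 1 path above: vrecpe gives a coarse 1/x
@ estimate and vrecps computes the Newton-Raphson step factor (2 - x*est).
@ coarse_recip_estimate() is a hypothetical stand-in for vrecpe; this
@ mirrors the NEON sequence, not the PSX hardware divider:
/*
static float recip_div_neon_style(float h, float x)
{
    float est  = coarse_recip_estimate(x);    // vrecpe.f32, ~8 bits accurate
    float step = 2.0f - x * est;              // vrecps.f32
    est = est * step;                         // one Newton-Raphson refinement
    // a second vrecps/vmul pair (commented out above) would refine further
    return h * est;                           // vmul: quotient ~= h / x
}
*/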

    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<27) @ MAC3
    orr r7, r4, r5
    add r4, r1, #8*8
    orr r3, r7, r6
    ldmia r4, {r7,r8,r10,r11} @ min/max IR

    movs r3, r3, lsr #16
    orrne lr, #(1<<31)
    orrne lr, #(1<<18) @ fSZ (limD)

    vadd.f32 q10, q8 @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16 q9, d1 @ expand gteIR|12 v=0
    vmovl.s16 q10, d3 @ expand gteIR|12 v=1
    add r6, r1, #8*10
    vstmia r6, {q8} @ wb quotients for flags (pre-limE)
    vqshl.u32 q8, #15
    vmovl.s16 q11, d10 @ expand gteIR|12 v=2
    vshr.u32 q8, #15 @ quotients (limE)
    vdup.32 d24, d16[0]
    vdup.32 d25, d16[1]
    vdup.32 d26, d17[0] @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    mov r4, #0x10000
    cmp r7, #1<<16
    cmnvc r10, #1<<16
    orrvs lr, #(1<<31)
    orrvs lr, #(1<<23) @ IR2/limB2
    rsbs r2, r4, r7, lsl #16
    cmnvc r4, r10, lsl #16
    orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1
    rsbs r2, r4, r8, lsl #16
    cmnvc r4, r11, lsl #16
    orrvs lr, #(1<<22) @ IR3/limB3

    vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0
    vmull.s32 q10, d20, d25 @ ... v=1
    vmull.s32 q11, d22, d26 @ ... v=2
    vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient
    vadd.s64 q10, q2 @ ... v=1
    vadd.s64 q11, q2 @ ... v=2
    vqmovn.s64 d18, q9 @ saturate to 32 v=0
    vqmovn.s64 d19, q10 @ ... v=1
    vqmovn.s64 d20, q11 @ ... v=2
    vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32]
    vmax.s32 d15, d18, d19 @ || for flags
    vmin.s32 d14, d20
    vmax.s32 d15, d20
    vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1
    vqshl.s32 d24, d20, #5 @ ... v=2
    vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2
    vpmin.s32 d16, d14, d31 @ || also find min/max in pair
    vpmax.s32 d17, d15, d31 @ ||
    vshr.s32 q11, #16+5 @ can't vqshrn because of insn
    vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :(
    vsli.u64 d16, d17, #32 @ || pack in-pair min/max
    vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient
    vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1
    vmovn.s32 d13, q12 @ 3
    vstmia r1, {d14-d16} @ || other cacheline than quotients
    add r3, r0, #4*12
    vst1.32 d12, [r3]! @ writeback fS|XY v=0,1
    vst1.32 d13[0], [r3]

    vqmovn.s64 d26, q13 @ | gteMAC0
    vmovl.u16 q5, d10 @ expand gteIR|123 v=2

    vmov.i32 d13, #0x1000
    vshr.s32 d12, d26, #12

    add r3, r0, #4*24
    vst1.32 d26[0], [r3]! @ gteMAC0
    vmax.s32 d12, d30
    vst1.32 d8, [r3]! @ gteMAC123 (last iteration)
    vst1.32 d9[0], [r3]

    vmin.s32 d12, d13 @ | gteIR0

    ldmia r6, {r4-r6} @ quotients
    orr r4, r5
    orr r4, r6
    add r3, r0, #4*8
    movs r4, r4, lsr #17

    vst1.32 d12[0], [r3]! @ gteIR0
    vst1.32 d10, [r3]! @ gteIR12
    vst1.32 d11[0], [r3] @ ..3

    @ ~23 cycles
    orrne lr, #(1<<31) @ limE
    orrne lr, #(1<<17) @ limE
    ldmia r1, {r4-r9}
    add r2, r4, #0x400<<16 @ min fSX
    add r3, r6, #0x400<<16 @ max fSX
    lsrs r2, #16+11
    lsrseq r3, #16+11
    orrne lr, #(1<<31) @ limG1
    orrne lr, #(1<<14)
    add r2, r5, #0x400<<16 @ min fSY
    add r3, r7, #0x400<<16 @ max fSY
    lsrs r2, #16+11
    lsrseq r3, #16+11
    orrne lr, #(1<<31) @ limG2
    orrne lr, #(1<<13)
    adds r2, r9, #1
    orrvs lr, #(1<<16) @ F (31 already done by above)
    subs r3, r8, #1

    ldr r4, [r0, #4*24] @ gteMAC0
    orrvs lr, #(1<<15)

    adds r3, r4, #1
    orrvs lr, #(1<<16)
    orrvs lr, #(1<<31) @ F
    subs r2, r4, #1
    orrvs lr, #(1<<15)
    orrvs lr, #(1<<31) @ F
    cmp r4, #0x1000
    orrhi lr, #(1<<12) @ limH

    str lr, [r0, #4*(32+31)] @ gteFLAG

    pop {r4-r11,pc}
    .size gteRTPT_neon, .-gteRTPT_neon



@ note: non-std calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r1 = op
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
.global gteMVMVA_part_neon
gteMVMVA_part_neon:
    uxth r5, r5
    vmov.32 d8[0], r4
    vmov.32 d8[1], r5 @ VXYZ(v)
    vldmia r6, {d0-d2} @ MXxy/gteR* [16*9]
    vldmia r7, {d4-d5} @ CVx/gteTR*

    vmov.i32 q15, #0
    vext.16 d2, d1, d2, #2 @ xxx3 -> x321
    vext.16 d1, d0, d1, #3 @ xx32 -> x321
    vshll.s32 q3, d5, #12 @ gteTRZ/CV3
    vshll.s32 q2, d4, #12 @ gteTR|XY/CV12

    vmull.s16 q8, d0, d8
    vmull.s16 q9, d1, d8
    vmull.s16 q10, d2, d8
    vpadd.s32 d16, d16, d17
    vpadd.s32 d17, d18, d19
    vpadd.s32 d18, d20, d21
    vpadal.s32 q2, q8
    vpadal.s32 q3, q9
    tst r1, #1<<19
    beq 0f
    vshr.s64 q2, q2, #12
    vshr.s64 q3, q3, #12
0:
    vqmovn.s64 d8, q2 @ gteMAC|12
    vqmovn.s64 d9, q3 @ gteMAC3

    tst r1, #1<<10
    add r3, r0, #4*25
    vqmovn.s32 d10, q4 @ gteIR|123
    vst1.32 d8, [r3]!
    vst1.32 d9[0], [r3] @ wb gteMAC|123

    beq 0f
    vmax.s16 d10, d31
0:
    vmovl.s16 q9, d10 @ expand gteIR|123
    add r3, r0, #4*9
    vst1.32 d18, [r3]!
    vst1.32 d19[0], [r3]
    bx lr
    .size gteMVMVA_part_neon, .-gteMVMVA_part_neon
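
@ For reference, a hedged C sketch of what gteMVMVA_part_neon computes
@ (flag generation is left to gteMACtoIR_flags_neon below); sf is op
@ bit 19, lm is op bit 10, sat32/sat16 as in the rtpx_mac sketch above,
@ names illustrative:
/*
static void mvmva_part_ref(const int16_t MX[3][3], const int32_t CV[3],
                           const int16_t V[3], int sf, int lm,
                           int32_t MAC[3], int16_t IR[3])
{
    for (int i = 0; i < 3; i++) {
        int64_t m = (int64_t)CV[i] << 12;     // vshll.s32 #12
        for (int j = 0; j < 3; j++)
            m += (int64_t)MX[i][j] * V[j];
        if (sf)
            m >>= 12;                         // vshr.s64 #12
        MAC[i] = sat32(m);                    // vqmovn.s64
        IR[i]  = sat16(MAC[i]);               // vqmovn.s32
        if (lm && IR[i] < 0)
            IR[i] = 0;                        // vmax.s16 against q15 == 0
    }
}
*/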


@ get flags after gteMVMVA_part_neon operation
.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm
gteMACtoIR_flags_neon:
    push {r4,r5,lr}
    tst r1, r1 @ lm
    mov lr, #0 @ gteFLAG
    mov r2, #0
    mov r12, #15
    moveq r2, #0x8000 @ adj
    moveq r12, #16 @ shift
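    @ lm=0: flag IRn if gteMACn is outside -0x8000..0x7fff ((MAC+0x8000)>>16 != 0)
    @ lm=1: flag IRn if gteMACn is outside 0..0x7fff (arithmetic MAC>>15 != 0)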

    add r3, r0, #4*25
    ldmia r3, {r3-r5} @ gteMAC|123

    do_mac_flags r3, r4, r5

    add r3, r2
    add r4, r2
    add r5, r2
    asrs r3, r12
    orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
    asrs r4, r12
    orrne lr, #(1<<31)
    orrne lr, #(1<<23) @ IR2/limB2
    asrs r5, r12
    orrne lr, #(1<<22) @ IR3/limB3
    str lr, [r0, #4*(32+31)] @ gteFLAG

    pop {r4,r5,pc}
    .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon



@ vim:filetype=armasm