Buildfix for LLVM
[pcsx_rearmed.git] / libpcsxcore / gte_neon.S
CommitLineData
8cfbda97 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
0c2ca3ba 4 * This work is licensed under the terms of GNU GPL version 2 or later.
8cfbda97 5 * See the COPYING file in the top-level directory.
6 */
7
5c6457c3 8#include "arm_features.h"
6f173b35 9#include "new_dynarec/arm/linkage_offsets.h"
8cfbda97 10
a53073ec 11.syntax unified
8cfbda97 12.text
13.align 2
14
59774ed0 15@ XXX: gteMAC calc shouldn't be saturating, but it is here
16
5d8e3bf8 17@ approximate gteMAC|123 flags
18@ in: rr 123 as gteMAC|123
19@ trash: nothing
@ note: "trash: nothing" covers the rr registers only; the macro ORs its
@ result bits into lr and rewrites the ARM condition flags.
@ "cmp rr, #1" sets V only when rr == 0x80000000 and "cmn rr, #1" sets V
@ only when rr == 0x7fffffff, so each test fires when the value sits
@ exactly on an s32 limit.
20.macro do_mac_flags rr1 rr2 rr3
@ lower limit: bits 27/26/25 for rr1/rr2/rr3, each together with bit 31
21 cmp \rr1, #1
22 orrvs lr, #(1<<31)|(1<<27)
23 cmp \rr2, #1
24 orrvs lr, #(1<<31)|(1<<26)
25 cmp \rr3, #1
26 orrvs lr, #(1<<31)|(1<<25)
@ upper limit: bits 30/29/28 for rr1/rr2/rr3 (bit 31 is not set here)
27 cmn \rr1, #1 @ same as adds ...
28 orrvs lr, #(1<<30)
29 cmn \rr2, #1
30 orrvs lr, #(1<<29)
31 cmn \rr3, #1
32 orrvs lr, #(1<<28)
33.endm
34
35@ approximate 3x gteMACn flags
36@ in: rr 123 as 3 instances gteMACn, *flags
37@ trash: nothing
@ nflags is ORed into lr when any rr equals 0x80000000,
@ pflags is ORed into lr when any rr equals 0x7fffffff.
@ note: lr and the ARM condition flags are modified; the rr registers are not.
38.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
@ the vc-conditional compares are skipped once V is set, so V survives
@ to the orrvs if any one of the three values hit the limit
39 cmp \rr1, #1
40 cmpvc \rr2, #1
41 cmpvc \rr3, #1
42 orrvs lr, #\nflags
43 cmn \rr1, #1 @ adds ...
44 cmnvc \rr2, #1
45 cmnvc \rr3, #1
46 orrvs lr, #\pflags
47.endm
48
17ed0d69 49@ get gteIR|123 flags from gteMAC|123
50@ in: rr 123 as gteMAC|123
5d8e3bf8 51@ trash: r2,r3
@ out: flag bits ORed into lr; the ARM condition flags are also modified.
@ Test used: (rr + 0x8000) >> 16 (logical) is non-zero exactly when rr is
@ outside -0x8000..0x7fff, i.e. when it does not fit in s16.
52.macro do_irs_flags rr1 rr2 rr3
17ed0d69 53 add r2, \rr1, #0x8000
54 add r3, \rr2, #0x8000
55 lsrs r2, #16
56 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
57 lsrs r3, #16
@ plain add does not touch the flags, so the orrne pair below still sees
@ the result of the rr2 shift above
58 add r2, \rr3, #0x8000
@ bits 31 and 23 are set by two instructions; presumably because
@ 0x80800000 cannot be encoded as a single ARM immediate - TODO confirm
59 orrne lr, #(1<<31)
60 orrne lr, #(1<<23) @ IR2/limB2
61 lsrs r2, #16
@ the IR3 flag is set without bit 31
62 orrne lr, #(1<<22) @ IR3/limB3
8cfbda97 63.endm
64
65
66/*
5d8e3bf8 67 * RTPS/RTPT register map:
68 *
8cfbda97 69 * q | d | c code / phase 1 phase 2 scratch
70 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
71 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
72 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
73 * 3 * gteIR1-3 = gteIR1-3 /
74 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
75 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
76 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
77 * 7 0 gteDQB [s64] max gteMAC|12
78 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
79 * 9 * / gteMAC3 max gteIR|123
80 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
81 * 11 0 quotient 3
82 * 6 12 gteH (adj. for cmp)
83 * 13 gteH (float for div)
84 * ... <scratch>
85 * 15 30 0
86 * 31 0
87 */
8cfbda97 88
5d8e3bf8 89@ load gteR*, gteTR* and gteH (see map above), clear q15
90@ in: r0 - context
91@ trash: r3
@ out: d0,d1,d2 - one matrix row each in their three low 16-bit lanes
@      q2 (d4,d5) - first two gteTR* words, sign-extended to s64 and << 12
@      q3 (d6,d7) - last two loaded words, same treatment (d6 = gteTRZ << 12)
@      q6 (d12,d13) - 16-bit lanes of the loaded gteH word sign-extended to s32
@      q15 - zero
@ Context offsets are counted in 32-bit words from r0.
92.macro rtpx_preload
8cfbda97 93 add r3, r0, #4*32
94 vldmia r3, {d0-d2} @ gteR* [16*9]
4cc3050c 95 vmov.i32 q15, #0
8cfbda97 96 add r3, r0, #4*(32+5)
97 vldmia r3, {d4-d5} @ gteTR*
@ the 9 matrix elements are packed back to back; realign rows 3 and 2 so
@ that each of them starts at lane 0 of its own d register
4cc3050c 98 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
99 vext.16 d1, d0, d1, #3 @ xx32 -> x321
8cfbda97 100 add r3, r0, #4*(32+26)
101 vld1.32 d11[0], [r3] @ gteH
5d8e3bf8 102 vshll.s32 q3, d5, #12 @ gteTRZ
103 vshll.s32 q2, d4, #12 @ gteTR|XY
8cfbda97 104 vmovl.s16 q6, d11 @ gteH
5d8e3bf8 105.endm
8cfbda97 106
5d8e3bf8 107@ do RTP* gteMAC* calculation
108@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
109@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
110@ trash: d16-d21
@ Per row: sum of the four 16x16 lane products, plus the matching
@ translation (already << 12), then a saturating >> 12 down to s32.
111.macro rtpx_mac
8cfbda97 112 vmull.s16 q8, d0, d8
113 vmull.s16 q9, d1, d8
114 vmull.s16 q10, d2, d8
@ widen while adding lane pairs: two s64 partial sums per row
115 vpaddl.s32 q8, q8
116 vpaddl.s32 q9, q9
117 vpaddl.s32 q10, q10
118 vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
119 vadd.s64 d18, d19 @ d8[3]==0, so won't affect
120 vadd.s64 d20, d21 @ QC
@ add translation: d4, d5, d6 are set up by rtpx_preload
121 vadd.s64 d16, d4
122 vadd.s64 d18, d5
123 vadd.s64 d20, d6
124 vqshrn.s64 d8, q8, #12 @ gteMAC1
125 vqshrn.s64 d18, q9, #12 @ gteMAC2
126 vqshrn.s64 d9, q10, #12 @ gteMAC3
@ put gteMAC2 into the upper 32 bits of d8, keeping gteMAC1 in the lower
127 vsli.u64 d8, d18, #32 @ gteMAC|12
@ clear the unused upper lane of d9 (r12 is 0 on entry)
128 vmov.32 d9[1], r12
@ saturating narrow of q4 (d8,d9) from s32 to s16
5d8e3bf8 129 vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
130.endm
131
@ gteRTPS_neon: RTP* calculation for the single vector VXYZ(0) at r0.
@ Writes gteMAC|123, gteIR|123, the gteSZ* fifo, fS|XY, gteMAC0, gteIR0
@ and gteFLAG back into the context (offsets are 32-bit words from r0;
@ control registers are addressed as 32+n).
@ in: r0 - context
@ The buffer whose address is read from [r0, #LO_cop2_to_scratch_buf] is
@ used to hand intermediate NEON values to the ARM flag code.
@ r4-r6 and lr are saved and restored; NEON registers are not saved here.
5c6457c3 132FUNCTION(gteRTPS_neon): @ r0=CP2 (d,c),
17ed0d69 133 push {r4-r6,lr}
5d8e3bf8 134
135@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
c6d5790c 136 ldr r1, [r0, #LO_cop2_to_scratch_buf]
5d8e3bf8 137 mov r12, #0
138
4cc3050c 139 vldmia r0, {d8} @ VXYZ(0)
5d8e3bf8 140 rtpx_preload
141
@ open-coded variant of rtpx_mac: products are pair-added in s32 and then
@ accumulated straight into the widened translation in q2/q3
4cc3050c 142@ rtpx_mac @ slower here, faster in RTPT?
5d8e3bf8 143 vmov.16 d8[3], r12 @ kill unused upper vector
4cc3050c 144 vmull.s16 q8, d0, d8
145 vmull.s16 q9, d1, d8
146 vmull.s16 q10, d2, d8
147 vpadd.s32 d16, d16, d17
148 vpadd.s32 d17, d18, d19
149 vpadd.s32 d18, d20, d21
150 vpadal.s32 q2, q8
151 vpadal.s32 q3, q9 @ d6, d18 is slow?
152 vqshrn.s64 d8, q2, #12 @ gteMAC|12
153 vqshrn.s64 d9, q3, #12 @ gteMAC3
5d8e3bf8 154
155 add r3, r0, #4*25
156 vst1.32 d8, [r3]!
157 vst1.32 d9[0], [r3] @ wb gteMAC|123
4cc3050c 158 vqmovn.s32 d10, q4 @ gteIR|123
5d8e3bf8 159
@ d31 is zero (q15 was cleared by rtpx_preload), so vmax/vmin below clamp
@ gteMAC3 into 0..0xffff
160 add r3, r0, #4*17 @ gteSZ*
161 vldmia r3, {q7} @ d14,d15 gteSZ|123x
162 vmov.i32 d28, #0xffff @ 0xffff[32]
163 vmax.s32 d11, d9, d31
164 vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp)
165 vmov.i32 d26, #1
166 vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3
4cc3050c 167 vmovl.s16 q9, d10 @ || expand gteIR|123
5d8e3bf8 168 vshl.u32 d13, d12, #16 @ | preparing gteH
169 add r3, r0, #4*9
170 vst1.32 d18, [r3]!
171 vst1.32 d19[0], [r3]
172
173 vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7
174 vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3?
175
176 add r3, r0, #4*(32+24)
177 vld1.32 d4, [r3] @ || gteOF|XY
178 add r3, r0, #4*(32+27)
179 vld1.32 d6, [r3] @ || gteDQ|AB
180
@ the compare result is an all-ones/all-zeroes mask: the divisor is kept
@ when the test passed and becomes 0 otherwise, then is raised to at least 1
181 vand d11, d16
182 vmovl.s32 q2, d4 @ || gteOF|XY [64]
183 vmax.u32 d11, d26 @ make divisor 1 if not
184 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
185 add r3, r0, #4*16 @ | gteSZ*
186 vstmia r3, {q7} @ | d14,d15 gteSZ|123x
187
188 vcvt.f32.u32 d13, d13 @ gteH (float for div)
189 vcvt.f32.u32 d11, d11 @ divisor
190
191 @ divide.. it's not worth messing with reciprocals here
192 @ just for 1 value, let's just use VFP divider here
@ s22 is lane 0 of d11 (divisor), s26 is lane 0 of d13 (gteH << 16)
193 vdiv.f32 s22, s26, s22
194
@ add 0.5 before the float -> u32 conversion
02455d0d 195 vmov.f32 d20, #0.5
196 vadd.f32 d11, d20
5d8e3bf8 197 vcvt.u32.f32 d11, d11 @ quotient
198
199 @ while NEON's busy we calculate some flags on ARM
200 add r3, r0, #4*25
201 mov lr, #0 @ gteFLAG
202 ldmia r3, {r4-r6} @ gteMAC|123
203
@ saturating << 15 followed by >> 15 clamps the quotient to 0x1ffff; the
@ unclamped value is stored first so the ARM side can test it
4cc3050c 204 vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE)
5d8e3bf8 205 vqshl.u32 d11, #15
206
207 do_mac_flags r4, r5, r6
208
209 vshr.u32 d11, #15 @ quotient (limE)
5d8e3bf8 210
5d8e3bf8 211 do_irs_flags r4, r5, r6
212
213 vmlal.s32 q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient
214 add r3, r0, #4*13
215 vld1.32 d16, [r3] @ || load fS|XY12, new 01
216 vqmovn.s64 d18, q2 @ saturate to 32
217 vmull.s32 q10, d6, d11[0]@ | d20 = gteDQA * quotient
218 vqshl.s32 d19, d18, #5 @ 11bit precision
219
@ r6 still holds gteMAC3: any bit at or above bit 16 set (which includes
@ negative values) means it did not fit 0..0xffff
220 ldr r4, [r1] @ quotient
5d8e3bf8 221 movs r3, r6, lsr #16
222 orrne lr, #(1<<31)
223 orrne lr, #(1<<18) @ fSZ (limD)
224
17ed0d69 225 vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG
226
5d8e3bf8 227 vshr.s32 d18, d19, #16+5@ can't vqshrn because of insn
228 vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient
229 vmovn.s32 d18, q9 @ fS|XY2 [s16]
230
0e828e88 231 vqmovn.s64 d20, q10 @ | gteMAC0
5d8e3bf8 232 add r3, r0, #4*12
233 vst1.32 d16, [r3]! @ writeback fS|XY01
234 vst1.32 d18[0], [r3] @ ...2
235 add r3, r0, #4*24
0e828e88 236 vshr.s32 d21, d20, #12
5d8e3bf8 237 vst1.32 d20[0], [r3] @ gteMAC0
238
@ r4 = unclamped quotient: non-zero after >> 17 means it exceeded 0x1ffff
17ed0d69 239 movs r4, r4, lsr #17
240 orrne lr, #(1<<31)
241 orrne lr, #(1<<17) @ limE
242
@ clamp gteMAC0 >> 12 into 0..0x1000 (d31 is zero)
0e828e88 243 vmax.s32 d21, d31
5d8e3bf8 244 vmov.i32 d22, #0x1000
245 vmin.s32 d21, d22
246 add r3, r0, #4*8
247 vst1.16 d21[0], [r3] @ gteIR0
248
@ (x + (0x400 << 16)) >> 27 is non-zero exactly when x >> 16 is outside
@ -0x400..0x3ff
17ed0d69 249 ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat
250 add r2, r4, #0x400<<16
251 add r3, r5, #0x400<<16
252 lsrs r2, #16+11
253 orrne lr, #(1<<14) @ limG1
254 orrne lr, #(1<<31)
255 lsrs r3, #16+11
256 orrne lr, #(1<<13) @ limG2
5d8e3bf8 257 orrne lr, #(1<<31)
@ adds/subs by 1 overflow only at the s32 limits, i.e. when the earlier
@ 64 -> 32 bit narrowing saturated either of the two values
5d8e3bf8 258 adds r2, r4, #1
a53073ec 259 addsvc r3, r5, #1
5d8e3bf8 260 orrvs lr, #(1<<16) @ F
261 orrvs lr, #(1<<31)
262 subs r2, r4, #1
a53073ec 263 subsvc r3, r5, #1
5d8e3bf8 264 orrvs lr, #(1<<31)
265
@ ldr leaves the flags alone, so the orrvs after it still belongs to the
@ subs/subsvc pair above
266 ldr r4, [r0, #4*24] @ gteMAC0
267 orrvs lr, #(1<<15)
268
269 adds r3, r4, #1
270 orrvs lr, #(1<<16) @ F
271 orrvs lr, #(1<<31)
272 subs r2, r4, #1
273 orrvs lr, #(1<<15) @ F
274 orrvs lr, #(1<<31)
@ unsigned compare: also true for negative gteMAC0
275 cmp r4, #0x1000
276 orrhi lr, #(1<<12) @ limH
277
278 str lr, [r0, #4*(32+31)] @ gteFLAG
279
17ed0d69 280 pop {r4-r6,pc}
5d8e3bf8 281 .size gteRTPS_neon, .-gteRTPS_neon
282
283
284
@ gteRTPT_neon: RTP* calculation for three vectors, read 8 bytes apart
@ starting at r0 (VXYZ(0..2)).
@ Writes the gteSZ* fifo, fS|XY for all three vectors, gteMAC0, gteIR0,
@ gteMAC|123 and gteIR|123 of the last vector, and gteFLAG back into the
@ context (offsets are 32-bit words from r0; control registers are 32+n).
@ in: r0 - context
@ The buffer whose address is read from [r0, #LO_cop2_to_scratch_buf] holds
@ per-vector results and min/max values so the ARM side can derive flags.
@ r4-r11 and lr are saved and restored; NEON registers are not saved here.
5c6457c3 285FUNCTION(gteRTPT_neon): @ r0=CP2 (d,c),
5d8e3bf8 286 push {r4-r11,lr}
287
c6d5790c 288 ldr r1, [r0, #LO_cop2_to_scratch_buf]
5d8e3bf8 289 mov r12, #0
290
291 rtpx_preload
292
@ running min starts at 0x7fffffff (0x80000000 - 1), running max at 0
9c9d02e4 293 vmov.i32 d23, #1
294 vmov.i32 d22, #0x80000000
295 vsub.i32 d22, d22, d23
296 vmov.i32 d23, #0
5d8e3bf8 297 mov r3, #3 @ counter
298 mov r2, r0 @ VXYZ(0)
@ phase 1 loop: one pass per vector; each pass appends 16 bytes
@ (d9 = gteMAC3, d10 = gteIR|123) to the scratch buffer
2990:
300 vldmia r2!, {d8} @ VXYZ(v)
301 vmov.16 d8[3], r12 @ kill unused upper vector
302
303 rtpx_mac
8cfbda97 304 vmin.s32 d22, d8 @ min gteMAC|12
305 vmax.s32 d23, d8 @ max gteMAC|12
306 subs r3, #1
4cc3050c 307 vst1.32 {d9,d10}, [r1, :128]!
8cfbda97 308 bgt 0b
309
4cc3050c 310 vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags
8cfbda97 311
312 @ - phase2 -
@ rewind r1 over the four 16-byte records written above, then reload the
@ records of v=0 and v=1 (v=2 is still live in d9/d10)
313 sub r1, r1, #8*2*4
314 vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY
315
316 vmov d20, d0 @ gteMAC3 v=0
317 vmin.s16 d24, d1, d3 @ | find min IR
5d8e3bf8 318 vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp)
8cfbda97 319 vmax.s16 d25, d1, d3 @ | .. also max, for flag gen
320 vsli.u64 d20, d2, #32 @ gteMAC3 v=1
321 vmov d21, d9 @ ... v=2
322
@ q15 is zero (cleared by rtpx_preload): clamp the three gteMAC3 values
@ into 0..0xffff
323 vmov.i32 q14, #0xffff @ 0xffff[32]
324 vmax.s32 q10, q15
325 vmov.i32 q13, #1
326 vdup.32 q11, d22[0] @ gteH/2
5d8e3bf8 327 vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v)
8cfbda97 328 vmin.s16 d24, d10 @ | find min/max IR
329 vmax.s16 d25, d10 @ |
330
5d8e3bf8 331 add r3, r0, #4*19 @ ||
332 vld1.32 d14[0], [r3] @ || gteSZ3
333
8cfbda97 334 vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)?
335 add r3, r0, #4*17
336 vst1.32 d20, [r3]! @ | writeback fSZ(v)
@ the compare result is a per-lane mask: a divisor is kept where the test
@ passed and becomes 0 otherwise, then is raised to at least 1
337 vand q11, q10, q11
338 vst1.32 d21[0], [r3] @ |
339 vmax.u32 q10, q11, q13 @ make divisor 1 if not
340 add r3, r1, #8*8
341 vstmia r3, {q12} @ min/max IR for flags
342 vcvt.f32.u32 q10, q10
343 vshl.u32 d13, d12, #16 @ | preparing gteH
344
345 @ while NEON's busy we calculate some flags on ARM
@ r4,r5 = min gteMAC1,2  r6,r7 = max gteMAC1,2 (stored after the loop);
@ subs/adds by 1 overflow only when a value sits on an s32 limit
346 add r2, r1, #8*2*3
347 mov lr, #0 @ gteFLAG
348 ldmia r2, {r4-r7} @ min/max gteMAC|12
349 subs r2, r4, #1
350 orrvs lr, #(1<<31)|(1<<27)
351 subs r3, r5, #1
352 orrvs lr, #(1<<31)|(1<<26)
353 adds r2, r6, #1
354 orrvs lr, #(1<<30)
355 adds r3, r7, #1
356 orrvs lr, #(1<<29)
357 ldr r4, [r1, #0] @ gteMAC3 v=0
358 ldr r5, [r1, #8*2] @ ... v=1
359 ldr r6, [r1, #8*4] @ ... v=2
360
361 add r3, r0, #4*(32+24)
362 vld1.32 d4, [r3] @ || gteOF|XY
363 add r3, r0, #4*(32+27)
5d8e3bf8 364 vld1.32 d6, [r3] @ || gteDQ|AB
8cfbda97 365
366 @ divide
@ active branch: reciprocal estimate refined by one vrecps step, then
@ multiplied by gteH; the .else branch uses the VFP divider instead
367.if 1
368 vrecpe.f32 q11, q10 @ inv
5d8e3bf8 369 vmovl.s32 q2, d4 @ || gteOF|XY [64]
370 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
8cfbda97 371 vrecps.f32 q12, q10, q11 @ step
372 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
4706bbe4 373 vmov.f32 q8, #0.5 @ |||
8cfbda97 374 vmul.f32 q11, q12, q11 @ better inv
5d8e3bf8 375 add r3, r0, #4*16
376 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
8cfbda97 377 vdup.32 q13, d13[0] @ |
4706bbe4 378@ vrecps.f32 q12, q10, q11 @ step
379@ vmul.f32 q11, q12, q11 @ better inv
8cfbda97 380 vmul.f32 q10, q13, q11 @ result
381.else
4706bbe4 382 vmov.f32 q8, #0.5 @ |||
5d8e3bf8 383 vmovl.s32 q2, d4 @ || gteOF|XY [64]
384 vmovl.s32 q3, d6 @ || gteDQ|AB [64]
8cfbda97 385 vcvt.f32.u32 d13, d13 @ | gteH (float for div)
386 vdup.32 q13, d13[0] @ |
5d8e3bf8 387 add r3, r0, #4*16
388 vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3
8cfbda97 389
390 vpush {q0}
391 vmov q0, q10 @ to test against C code
392 vdiv.f32 s0, s26, s0
393 vdiv.f32 s1, s26, s1
394 vdiv.f32 s2, s26, s2
395 vmov q10, q0
396 vpop {q0}
397.endif
398
@ r4,r5,r6 = gteMAC3 of v=0,1,2. Lower limit -> bits 31 and 25, upper
@ limit -> bit 28, the same bits do_mac_flags uses for its third register.
@ (the upper-limit flag used to be 1<<27, which is the bit this function
@ sets for the gteMAC1 lower limit above)
5d8e3bf8 399 do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
@ OR of the three gteMAC3 values: any bit at or above bit 16 set means at
@ least one of them did not fit 0..0xffff
8cfbda97 400 orr r7, r4, r5
401 add r4, r1, #8*8
402 orr r3, r7, r6
403 ldmia r4, {r7,r8,r10,r11} @ min/max IR
404
405 movs r3, r3, lsr #16
406 orrne lr, #(1<<31)
407 orrne lr, #(1<<18) @ fSZ (limD)
408
4706bbe4 409 vadd.f32 q10, q8 @ adjust for vcvt rounding mode
8cfbda97 410 vcvt.u32.f32 q8, q10
411 vmovl.s16 q9, d1 @ expand gteIR|12 v=0
412 vmovl.s16 q10, d3 @ expand gteIR|12 v=1
413 add r6, r1, #8*10
@ saturating << 15 followed by >> 15 clamps each quotient to 0x1ffff; the
@ unclamped values are stored first so the ARM side can test them
414 vstmia r6, {q8} @ wb quotients for flags (pre-limE)
415 vqshl.u32 q8, #15
416 vmovl.s16 q11, d10 @ expand gteIR|12 v=2
417 vshr.u32 q8, #15 @ quotients (limE)
418 vdup.32 d24, d16[0]
419 vdup.32 d25, d16[1]
420 vdup.32 d26, d17[0] @ quotient (dup)
421
5d8e3bf8 422 @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
@ each register packs two s16 lanes; the upper lane is tested in place,
@ the lower lane after shifting it up by 16
423 mov r4, #0x10000
424 cmp r7, #1<<16
425 cmnvc r10, #1<<16
8cfbda97 426 orrvs lr, #(1<<31)
427 orrvs lr, #(1<<23) @ IR2/limB2
5d8e3bf8 428 rsbs r2, r4, r7, lsl #16
429 cmnvc r4, r10, lsl #16
8cfbda97 430 orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1
5d8e3bf8 431 rsbs r2, r4, r8, lsl #16
432 cmnvc r4, r11, lsl #16
8cfbda97 433 orrvs lr, #(1<<22) @ IR3/limB3
434
435 vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0
436 vmull.s32 q10, d20, d25 @ ... v=1
437 vmull.s32 q11, d22, d26 @ ... v=2
438 vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient
439 vadd.s64 q10, q2 @ ... v=1
440 vadd.s64 q11, q2 @ ... v=2
441 vqmovn.s64 d18, q9 @ saturate to 32 v=0
442 vqmovn.s64 d19, q10 @ ... v=1
443 vqmovn.s64 d20, q11 @ ... v=2
444 vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32]
445 vmax.s32 d15, d18, d19 @ || for flags
446 vmin.s32 d14, d20
447 vmax.s32 d15, d20
448 vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1
449 vqshl.s32 d24, d20, #5 @ ... v=2
450 vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2
5d8e3bf8 451 vpmin.s32 d16, d14, d31 @ || also find min/max in pair
452 vpmax.s32 d17, d15, d31 @ ||
8cfbda97 453 vshr.s32 q11, #16+5 @ can't vqshrn because of insn
454 vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :(
8cfbda97 455 vsli.u64 d16, d17, #32 @ || pack in-pair min/max
456 vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient
457 vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1
458 vmovn.s32 d13, q12 @ 3
@ scratch layout for the ARM code below:
@ d14 = min fSX,fSY  d15 = max fSX,fSY  d16 = in-pair min, in-pair max
459 vstmia r1, {d14-d16} @ || other cacheline than quotients
460 add r3, r0, #4*12
461 vst1.32 d12, [r3]! @ writeback fS|XY v=0,1
462 vst1.32 d13[0], [r3]
463
0e828e88 464 vqmovn.s64 d26, q13 @ | gteMAC0
8cfbda97 465 vmovl.u16 q5, d10 @ expand gteIR|123 v=2
466
467 vmov.i32 d13, #0x1000
0e828e88 468 vshr.s32 d12, d26, #12
8cfbda97 469
470 add r3, r0, #4*24
471 vst1.32 d26[0], [r3]! @ gteMAC0
@ d30 is zero (q15 was cleared by rtpx_preload): clamp from below at 0
0e828e88 472 vmax.s32 d12, d30
8cfbda97 473 vst1.32 d8, [r3]! @ gteMAC123 (last iteration)
474 vst1.32 d9[0], [r3]
475
476 vmin.s32 d12, d13 @ | gteIR0
477
@ OR of the three unclamped quotients: non-zero after >> 17 means at
@ least one exceeded 0x1ffff. The NEON stores between the movs and the
@ orrne pair do not touch the ARM flags.
8cfbda97 478 ldmia r6, {r4-r6} @ quotients
479 orr r4, r5
480 orr r4, r6
5d8e3bf8 481 add r3, r0, #4*8
8cfbda97 482 movs r4, r4, lsr #17
8cfbda97 483
8cfbda97 484 vst1.32 d12[0], [r3]! @ gteIR0
485 vst1.32 d10, [r3]! @ gteIR12
486 vst1.32 d11[0], [r3] @ ..3
487
17ed0d69 488 @ ~23 cycles
5d8e3bf8 489 orrne lr, #(1<<31) @ limE
490 orrne lr, #(1<<17) @ limE
@ r4,r5 = min fSX,fSY  r6,r7 = max fSX,fSY  r8 = in-pair min  r9 = in-pair max
@ (x + (0x400 << 16)) >> 27 is non-zero exactly when x >> 16 is outside
@ -0x400..0x3ff; the max is only tested when the min passed
8cfbda97 491 ldmia r1, {r4-r9}
17ed0d69 492 add r2, r4, #0x400<<16 @ min fSX
493 add r3, r6, #0x400<<16 @ max fSX
494 lsrs r2, #16+11
a53073ec 495 lsrseq r3, #16+11
17ed0d69 496 orrne lr, #(1<<31) @ limG1
497 orrne lr, #(1<<14)
498 add r2, r5, #0x400<<16 @ min fSY
499 add r3, r7, #0x400<<16 @ max fSY
500 lsrs r2, #16+11
a53073ec 501 lsrseq r3, #16+11
17ed0d69 502 orrne lr, #(1<<31) @ limG2
503 orrne lr, #(1<<13)
@ overall max / min at an s32 limit means the 64 -> 32 bit narrowing of
@ some fS value saturated
8cfbda97 504 adds r2, r9, #1
17ed0d69 505 orrvs lr, #(1<<16) @ F (31 already done by above)
8cfbda97 506 subs r3, r8, #1
8cfbda97 507
@ ldr leaves the flags alone, so the orrvs after it still belongs to the
@ subs above
508 ldr r4, [r0, #4*24] @ gteMAC0
509 orrvs lr, #(1<<15)
510
511 adds r3, r4, #1
512 orrvs lr, #(1<<16)
513 orrvs lr, #(1<<31) @ F
514 subs r2, r4, #1
515 orrvs lr, #(1<<15)
516 orrvs lr, #(1<<31) @ F
@ unsigned compare: also true for negative gteMAC0
517 cmp r4, #0x1000
5d8e3bf8 518 orrhi lr, #(1<<12) @ limH
8cfbda97 519
520 str lr, [r0, #4*(32+31)] @ gteFLAG
521
522 pop {r4-r11,pc}
5d8e3bf8 523 .size gteRTPT_neon, .-gteRTPT_neon
524
525
526
054175e9 527@ note: non-std calling convention used
528@ r0 = CP2 (d,c) (must preserve)
529@ r1 = op
530@ r4,r5 = VXYZ(v) packed
531@ r6 = &MX11(mx)
532@ r7 = &CV1(cv)
@ op bits tested here: bit 19 selects the >> 12 of the sums,
@ bit 10 selects clamping of the IR results at 0 from below.
@ out: gteMAC|123 stored at word 25.., gteIR|123 stored at word 9..
@ trash: r3, r5, ARM flags, NEON registers d0-d10, d16-d21, q15
@ Flags are not computed here (see gteMACtoIR_flags_neon).
5c6457c3 533FUNCTION(gteMVMVA_part_neon):
@ keep only the low halfword of r5 so the unused 4th lane of d8 is zero
17ed0d69 534 uxth r5, r5
535 vmov.32 d8[0], r4
536 vmov.32 d8[1], r5 @ VXYZ(v)
054175e9 537 vldmia r6, {d0-d2} @ MXxy/gteR* [16*9]
538 vldmia r7, {d4-d5} @ CVx/gteTR*
17ed0d69 539
17ed0d69 540 vmov.i32 q15, #0
541 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
542 vext.16 d1, d0, d1, #3 @ xx32 -> x321
543 vshll.s32 q3, d5, #12 @ gteTRZ/CV3
544 vshll.s32 q2, d4, #12 @ gteTR|XY/CV12
545
@ row * vector products, pair-added and accumulated into q2/q3
546 vmull.s16 q8, d0, d8
547 vmull.s16 q9, d1, d8
548 vmull.s16 q10, d2, d8
549 vpadd.s32 d16, d16, d17
550 vpadd.s32 d17, d18, d19
551 vpadd.s32 d18, d20, d21
552 vpadal.s32 q2, q8
553 vpadal.s32 q3, q9
@ the >> 12 is skipped when op bit 19 is clear
554 tst r1, #1<<19
555 beq 0f
556 vshr.s64 q2, q2, #12
557 vshr.s64 q3, q3, #12
5580:
559 vqmovn.s64 d8, q2 @ gteMAC|12
560 vqmovn.s64 d9, q3 @ gteMAC3
561
@ the flags from this tst are consumed by the beq further down; add and
@ the NEON instructions in between leave them untouched
562 tst r1, #1<<10
563 add r3, r0, #4*25
564 vqmovn.s32 d10, q4 @ gteIR|123
565 vst1.32 d8, [r3]!
566 vst1.32 d9[0], [r3] @ wb gteMAC|123
567
@ op bit 10 set: clamp negative IR values to 0 (d31 is zero)
568 beq 0f
569 vmax.s16 d10, d31
5700:
571 vmovl.s16 q9, d10 @ expand gteIR|123
572 add r3, r0, #4*9
573 vst1.32 d18, [r3]!
574 vst1.32 d19[0], [r3]
054175e9 575 bx lr
576 .size gteMVMVA_part_neon, .-gteMVMVA_part_neon
17ed0d69 577
054175e9 578
579@ get flags after gteMVMVA_part_neon operation
@ Reads gteMAC|123 from the context, builds gteFLAG from them and stores
@ it at word 32+31.
@ in: r0 - context, r1 - lm
@ lm == 0: flag IR when the value is outside -0x8000..0x7fff
@ lm != 0: flag IR when the value is outside 0..0x7fff
@ trash: r2, r3, r12, ARM flags (r4, r5 and lr are saved and restored)
5c6457c3 580FUNCTION(gteMACtoIR_flags_neon): @ r0=CP2 (d,c), r1=lm
054175e9 581 push {r4,r5,lr}
@ Z from this tst selects the moveq pair below; the plain mov
@ instructions in between do not change the flags
582 tst r1, r1 @ lm
17ed0d69 583 mov lr, #0 @ gteFLAG
054175e9 584 mov r2, #0
17ed0d69 585 mov r12, #15
586 moveq r2, #0x8000 @ adj
587 moveq r12, #16 @ shift
588
589 add r3, r0, #4*25
590 ldmia r3, {r3-r5} @ gteMAC|123
591
592 do_mac_flags r3, r4, r5
593
@ (value + adj) >> shift (arithmetic) is zero only when the value is
@ inside the range allowed for the selected lm
594 add r3, r2
595 add r4, r2
596 add r5, r2
597 asrs r3, r12
598 orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
599 asrs r4, r12
600 orrne lr, #(1<<31)
601 orrne lr, #(1<<23) @ IR2/limB2
602 asrs r5, r12
@ the IR3 flag is set without bit 31
603 orrne lr, #(1<<22) @ IR3/limB3
604 str lr, [r0, #4*(32+31)] @ gteFLAG
605
054175e9 606 pop {r4,r5,pc}
607 .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon
17ed0d69 608
609
610
8cfbda97 611@ vim:filetype=armasm