add some missing license headers
[pcsx_rearmed.git] / libpcsxcore / gte_neon.S
CommitLineData
8cfbda97 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
0c2ca3ba 4 * This work is licensed under the terms of GNU GPL version 2 or later.
8cfbda97 5 * See the COPYING file in the top-level directory.
6 */
7
5c6457c3 8#include "arm_features.h"
c6d5790c 9#include "new_dynarec/linkage_offsets.h"
8cfbda97 10
a53073ec 11.syntax unified
8cfbda97 12.text
13.align 2
14
59774ed0 15@ XXX: gteMAC calc shouldn't be saturating, but it is here
16
@ do_mac_flags - approximate the gteMAC1..3 overflow bits of gteFLAG
@ in:    rr1, rr2, rr3 - registers holding gteMAC1, gteMAC2, gteMAC3
@        lr            - gteFLAG value being accumulated
@ out:   lr            - same value with the overflow bits ORed in
@ trash: nothing but the condition flags
@
@ "cmp rr, #1" raises V only when rr is 0x80000000 and "cmn rr, #1" only
@ when rr is 0x7fffffff, so a bit is set exactly when the value sits on
@ the matching s32 extreme.
@ NOTE(review): presumably those extremes are what the saturating
@ narrowing of the MAC results leaves behind on overflow - confirm
@ against whatever produced rr1-3.
.macro do_mac_flags rr1 rr2 rr3
    @ gteMAC1
    cmp         \rr1, #1
    orrvs       lr, #(1<<31)|(1<<27)    @ at s32 minimum
    cmn         \rr1, #1                @ behaves like adds ...
    orrvs       lr, #(1<<30)            @ at s32 maximum
    @ gteMAC2
    cmp         \rr2, #1
    orrvs       lr, #(1<<31)|(1<<26)
    cmn         \rr2, #1
    orrvs       lr, #(1<<29)
    @ gteMAC3
    cmp         \rr3, #1
    orrvs       lr, #(1<<31)|(1<<25)
    cmn         \rr3, #1
    orrvs       lr, #(1<<28)
.endm
34
@ do_mac_flags3x - approximate overflow bits for three instances of
@ one gteMACn value
@ in:    rr1, rr2, rr3 - the three instances of gteMACn
@        nflags        - immediate ORed into lr when any instance is
@                        0x80000000
@        pflags        - immediate ORed into lr when any instance is
@                        0x7fffffff
@        lr            - gteFLAG value being accumulated
@ out:   lr
@ trash: nothing but the condition flags
@
@ Each compare chain stops at the first instance that sets V (the
@ following compares are conditional on V being clear), so V survives
@ up to the orrvs.
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    @ any instance - 1 overflows?
    cmp         \rr1, #1
    cmpvc       \rr2, #1
    cmpvc       \rr3, #1
    orrvs       lr, #\nflags

    @ any instance + 1 overflows?
    cmn         \rr1, #1                @ adds ...
    cmnvc       \rr2, #1
    cmnvc       \rr3, #1
    orrvs       lr, #\pflags
.endm
48
@ do_irs_flags - derive the gteIR1..3 limiter bits from gteMAC1..3
@ in:    rr1, rr2, rr3 - registers holding gteMAC1, gteMAC2, gteMAC3
@        lr            - gteFLAG value being accumulated
@ out:   lr            - with the IR1/IR2/IR3 (limB1-3) bits ORed in
@ trash: r2, r3, condition flags
@
@ (rr + 0x8000) >> 16, taken as unsigned, is zero only when rr lies in
@ -0x8000..0x7fff, so a non-zero result means rr does not fit in s16.
@ The plain "add" in the middle does not touch the condition flags,
@ which lets it sit between a shift and the orrne that consumes it.
.macro do_irs_flags rr1 rr2 rr3
    add         r2, \rr1, #0x8000
    add         r3, \rr2, #0x8000
    lsrs        r2, #16
    orrne       lr, #(1<<31)|(1<<24)    @ IR1/limB1
    lsrs        r3, #16
    add         r2, \rr3, #0x8000       @ start on rr3, flags still rr2's
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)            @ IR2/limB2
    lsrs        r2, #16
    orrne       lr, #(1<<22)            @ IR3/limB3
.endm
64
65
66/*
5d8e3bf8 67 * RTPS/RTPT register map:
68 *
8cfbda97 69 * q | d | c code / phase 1 phase 2 scratch
70 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
71 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
72 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
73 * 3 * gteIR1-3 = gteIR1-3 /
74 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
75 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
76 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
77 * 7 0 gteDQB [s64] max gteMAC|12
78 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
79 * 9 * / gteMAC3 max gteIR|123
80 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
81 * 11 0 quotient 3
82 * 6 12 gteH (adj. for cmp)
83 * 13 gteH (float for div)
84 * ... <scratch>
85 * 15 30 0
86 * 31 0
87 */
8cfbda97 88
@ rtpx_preload - load gteR*, gteTR* and gteH into NEON registers
@ (placement as in the RTPS/RTPT register map) and clear q15
@ in:    r0 - context
@ out:   d0, d1, d2 - gteR* rearranged so that each d register starts
@                     with one group of three s16 values (lane 3 is a
@                     leftover of the neighbouring data)
@        q2         - the two words at 4*(32+5) as s64, shifted left 12
@        q3         - the following two words, same treatment
@                     (d6 = gteTRZ << 12)
@        d12[0]     - low halfword of the gteH word, sign-extended
@        q15        - 0
@ trash: r3
.macro rtpx_preload
    add         r3, r0, #4*32
    vldmia      r3, {d0-d2}             @ gteR* [16*9]
    vmov.i32    q15, #0
    add         r3, r0, #4*(32+5)
    vldmia      r3, {d4-d5}             @ gteTR*
    @ d2 = {d1[2], d1[3], d2[0], d2[1]}; must run before d1 is rewritten
    vext.16     d2, d1, d2, #2          @ xxx3 -> x321
    @ d1 = {d0[3], d1[0], d1[1], d1[2]}
    vext.16     d1, d0, d1, #3          @ xx32 -> x321
    add         r3, r0, #4*(32+26)
    vld1.32     d11[0], [r3]            @ gteH
    vshll.s32   q3, d5, #12             @ gteTRZ
    vshll.s32   q2, d4, #12             @ gteTR|XY
    vmovl.s16   q6, d11                 @ gteH
.endm
8cfbda97 106
@ rtpx_mac - the RTP* gteMAC1..3 / gteIR1..3 calculation for one vertex
@ in:    d0-d2  - gteR* rows and d4-d6 - gteTR* << 12, as in the map
@        d8     - VXYZ with lane 3 already zeroed
@        r12    - 0
@ out:   d8     - gteMAC1, gteMAC2
@        d9     - gteMAC3, 0
@        d10    - gteIR1, gteIR2, gteIR3 (s16, saturated from d8/d9)
@ trash: d16-d21
.macro rtpx_mac
    @ per-lane products of each row with the vertex
    vmull.s16   q8,  d0, d8
    vmull.s16   q9,  d1, d8
    vmull.s16   q10, d2, d8
    @ widen to s64 while summing lane pairs
    vpaddl.s32  q8,  q8
    vpaddl.s32  q9,  q9
    vpaddl.s32  q10, q10
    @ fold the two halves into one dot product per row
    vadd.s64    d16, d17                @ d16=d0.16[2]*d8.16[2], as
    vadd.s64    d18, d19                @ d8[3]==0, so won't affect
    vadd.s64    d20, d21                @ QC
    @ add the translation
    vadd.s64    d16, d4
    vadd.s64    d18, d5
    vadd.s64    d20, d6
    @ drop the 12 fraction bits, saturating to s32
    vqshrn.s64  d8,  q8,  #12           @ gteMAC1
    vqshrn.s64  d18, q9,  #12           @ gteMAC2
    vqshrn.s64  d9,  q10, #12           @ gteMAC3
    vsli.u64    d8,  d18, #32           @ gteMAC|12
    vmov.32     d9[1], r12              @ clear the lane next to gteMAC3
    vqmovn.s32  d10, q4                 @ gteIR|123; losing 2 cycles?
.endm
131
@ gteRTPS_neon - RTPS for the single vertex VXYZ(0)
@ in:    r0 - CP2 (d,c)
@ out:   written back through r0: gteMAC0-3, gteIR0-3, the shifted
@        gteSZ and fS|XY FIFOs and gteFLAG
@ uses:  the buffer pointed to by [r0, #LO_cop2_to_scratch_buf] to hand
@        NEON results over to the ARM flag code
@ saves: r4-r6
@ trash: r1-r3, r12, condition flags, NEON registers
@ NOTE(review): d8-d15 are overwritten without being saved - confirm the
@ callers do not depend on them being preserved.
FUNCTION(gteRTPS_neon): @ r0=CP2 (d,c),
    push        {r4-r6,lr}

@   fmrx        r4, fpscr               @ vmrs? at least 40 cycle hit
    ldr         r1, [r0, #LO_cop2_to_scratch_buf]
    mov         r12, #0

    vldmia      r0, {d8}                @ VXYZ(0)
    rtpx_preload

@   rtpx_mac                            @ slower here, faster in RTPT?
    vmov.16     d8[3], r12              @ kill unused upper vector
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8                  @ translation + row sums, as s64
    vpadal.s32  q3, q9                  @ d6, d18 is slow?
    vqshrn.s64  d8, q2, #12             @ gteMAC|12
    vqshrn.s64  d9, q3, #12             @ gteMAC3

    add         r3, r0, #4*25
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]             @ wb gteMAC|123
    vqmovn.s32  d10, q4                 @ gteIR|123

    add         r3, r0, #4*17           @ gteSZ*
    vldmia      r3, {q7}                @ d14,d15 gteSZ|123x
    vmov.i32    d28, #0xffff            @ 0xffff[32]
    vmax.s32    d11, d9, d31            @ max(gteMAC3, 0); d31 is 0
    vshr.s32    d16, d12, #1            @ | gteH/2 (adjust for cmp)
    vmov.i32    d26, #1
    vmin.u32    d11, d28                @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16   q9, d10                 @ || expand gteIR|123
    vshl.u32    d13, d12, #16           @ | preparing gteH
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!              @ wb gteIR1, gteIR2
    vst1.32     d19[0], [r3]            @ wb gteIR3

    vsli.u64    d15, d11, #32           @ new gteSZ|0123 in q7
    vclt.u32    d16, d16, d11           @ gteH/2 < fSZ3?

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]                @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]                @ || gteDQ|AB

    vand        d11, d16                @ fSZ3 if gteH/2 < fSZ3, else 0
    vmovl.s32   q2, d4                  @ || gteOF|XY [64]
    vmax.u32    d11, d26                @ make divisor 1 if not
    vmovl.s32   q3, d6                  @ || gteDQ|AB [64]
    add         r3, r0, #4*16           @ | gteSZ*
    vstmia      r3, {q7}                @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13               @ gteH (float for div)
    vcvt.f32.u32 d11, d11               @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32    s22, s26, s22           @ d11[0] = d13[0] / d11[0]

    vmov.f32    d20, #0.5
    vadd.f32    d11, d20                @ +0.5 before the conversion
    vcvt.u32.f32 d11, d11               @ quotient

    @ while NEON's busy we calculate some flags on ARM
    add         r3, r0, #4*25
    mov         lr, #0                  @ gteFLAG
    ldmia       r3, {r4-r6}             @ gteMAC|123

    vst1.32     d11, [r1, :64]          @ wb quotient for flags (pre-limE)
    vqshl.u32   d11, #15                @ saturating shift up, then back down:

    do_mac_flags r4, r5, r6

    vshr.u32    d11, #15                @ quotient (limE)

    do_irs_flags r4, r5, r6

    vmlal.s32   q2, d18, d11[0]         @ gteOF|XY + gteIR|12 * quotient
    add         r3, r0, #4*13
    vld1.32     d16, [r3]               @ || load fS|XY12, new 01
    vqmovn.s64  d18, q2                 @ saturate to 32
    vmull.s32   q10, d6, d11[0]         @ | d20 = gteDQA * quotient
    vqshl.s32   d19, d18, #5            @ 11bit precision

    ldr         r4, [r1]                @ quotient
    movs        r3, r6, lsr #16         @ gteMAC3 outside 0..0xffff?
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)            @ fSZ (limD)

    vst1.32     d18, [r1, :64]          @ writeback fS|XY2 before limG

    vshr.s32    d18, d19, #16+5         @ can't vqshrn because of insn
    vadd.s64    d20, d7                 @ | gteDQB + gteDQA * quotient
    vmovn.s32   d18, q9                 @ fS|XY2 [s16]

    vqmovn.s64  d20, q10                @ | gteMAC0
    add         r3, r0, #4*12
    vst1.32     d16, [r3]!              @ writeback fS|XY01
    vst1.32     d18[0], [r3]            @ ...2
    add         r3, r0, #4*24
    vshr.s32    d21, d20, #12           @ gteMAC0 >> 12, input of gteIR0
    vst1.32     d20[0], [r3]            @ gteMAC0

    movs        r4, r4, lsr #17         @ pre-limE quotient above 0x1ffff?
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<17)            @ limE

    vmax.s32    d21, d31                @ clamp to 0..
    vmov.i32    d22, #0x1000
    vmin.s32    d21, d22                @ ..0x1000
    add         r3, r0, #4*8
    vst1.16     d21[0], [r3]            @ gteIR0

    ldmia       r1, {r4,r5}             @ fS|XY2 before limG, after 11bit sat
    add         r2, r4, #0x400<<16
    add         r3, r5, #0x400<<16
    lsrs        r2, #16+11              @ non-zero: outside the 11bit range
    orrne       lr, #(1<<14)            @ limG1
    orrne       lr, #(1<<31)
    lsrs        r3, #16+11
    orrne       lr, #(1<<13)            @ limG2
    orrne       lr, #(1<<31)
    adds        r2, r4, #1              @ either one at 0x7fffffff?
    addsvc      r3, r5, #1
    orrvs       lr, #(1<<16)            @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1              @ either one at 0x80000000?
    subsvc      r3, r5, #1
    orrvs       lr, #(1<<31)

    ldr         r4, [r0, #4*24]         @ gteMAC0
    orrvs       lr, #(1<<15)            @ still the subs result: ldr keeps flags

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)            @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)            @ F
    orrvs       lr, #(1<<31)
    @ gteIR0 was clamped from gteMAC0 >> 12 (see the vshr/vmax/vmin above),
    @ so limH has to look at the shifted value too; comparing gteMAC0 itself
    @ raised the bit for nearly every non-trivial gteMAC0. The unsigned
    @ "hi" also covers negative values.
    asr         r3, r4, #12
    cmp         r3, #0x1000
    orrhi       lr, #(1<<12)            @ limH

    str         lr, [r0, #4*(32+31)]    @ gteFLAG

    pop         {r4-r6,pc}
    .size       gteRTPS_neon, .-gteRTPS_neon
282
283
284
@ gteRTPT_neon - RTPT: the RTP* calculation for the three vertices
@ VXYZ(0..2), with gteFLAG accumulated over all of them
@ in:    r0 - CP2 (d,c)
@ out:   written back through r0: gteMAC0-3 and gteIR0-3 (from the last
@        vertex), gteSZ0-3, fS|XY0-2 and gteFLAG
@ uses:  the buffer pointed to by [r0, #LO_cop2_to_scratch_buf]:
@          +0   per vertex {d9, d10} = gteMAC3 / gteIR|123, 16 bytes each
@               (the start is later reused for min/max fS|XY)
@          +48  min/max gteMAC|12
@          +64  min/max gteIR
@          +80  quotients before limE
@ saves: r4-r11
@ trash: r1-r3, r12, condition flags, NEON registers
@ NOTE(review): d8-d15 are overwritten without being saved - confirm the
@ callers do not depend on them being preserved.
FUNCTION(gteRTPT_neon): @ r0=CP2 (d,c),
    push        {r4-r11,lr}

    ldr         r1, [r0, #LO_cop2_to_scratch_buf]
    mov         r12, #0

    rtpx_preload

    vmvn.i32    d22, #0x80000000        @ #0x7fffffff
    vmov.i32    d23, #0x80000000
    mov         r3, #3                  @ counter
    mov         r2, r0                  @ VXYZ(0)
0:
    vldmia      r2!, {d8}               @ VXYZ(v)
    vmov.16     d8[3], r12              @ kill unused upper vector

    rtpx_mac
    vmin.s32    d22, d8                 @ min gteMAC|12
    vmax.s32    d23, d8                 @ max gteMAC|12
    subs        r3, #1
    vst1.32     {d9,d10}, [r1, :128]!
    bgt         0b

    vst1.32     {d22,d23}, [r1, :128]!  @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub         r1, r1, #8*2*4          @ back to the start of the buffer
    vldmia      r1, {d0-d3}             @ note: d4,d5 is for gteOF|XY

    vmov        d20, d0                 @ gteMAC3 v=0
    vmin.s16    d24, d1, d3             @ | find min IR
    vshr.s32    d22, d12, #1            @ || gteH/2 (adjust for cmp)
    vmax.s16    d25, d1, d3             @ | .. also max, for flag gen
    vsli.u64    d20, d2, #32            @ gteMAC3 v=1
    vmov        d21, d9                 @ ... v=2

    vmov.i32    q14, #0xffff            @ 0xffff[32]
    vmax.s32    q10, q15                @ max(gteMAC3, 0); q15 is 0
    vmov.i32    q13, #1
    vdup.32     q11, d22[0]             @ gteH/2
    vmin.u32    q10, q14                @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16    d24, d10                @ | find min/max IR
    vmax.s16    d25, d10                @ |

    add         r3, r0, #4*19           @ ||
    vld1.32     d14[0], [r3]            @ || gteSZ3

    vclt.u32    q11, q11, q10           @ gteH/2 < fSZ(v)?
    add         r3, r0, #4*17
    vst1.32     d20, [r3]!              @ | writeback fSZ(v)
    vand        q11, q10, q11           @ fSZ(v) if gteH/2 < fSZ(v), else 0
    vst1.32     d21[0], [r3]            @ |
    vmax.u32    q10, q11, q13           @ make divisor 1 if not
    add         r3, r1, #8*8
    vstmia      r3, {q12}               @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32    d13, d12, #16           @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add         r2, r1, #8*2*3
    mov         lr, #0                  @ gteFLAG
    ldmia       r2, {r4-r7}             @ min/max gteMAC|12
    subs        r2, r4, #1              @ min gteMAC1 at 0x80000000?
    orrvs       lr, #(1<<31)|(1<<27)
    subs        r3, r5, #1              @ min gteMAC2
    orrvs       lr, #(1<<31)|(1<<26)
    adds        r2, r6, #1              @ max gteMAC1 at 0x7fffffff?
    orrvs       lr, #(1<<30)
    adds        r3, r7, #1              @ max gteMAC2
    orrvs       lr, #(1<<29)
    ldr         r4, [r1, #0]            @ gteMAC3 v=0
    ldr         r5, [r1, #8*2]          @ ... v=1
    ldr         r6, [r1, #8*4]          @ ... v=2

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]                @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]                @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32  q11, q10                @ inv
    vmovl.s32   q2, d4                  @ || gteOF|XY [64]
    vmovl.s32   q3, d6                  @ || gteDQ|AB [64]
    vrecps.f32  q12, q10, q11           @ step
    vcvt.f32.u32 d13, d13               @ | gteH (float for div)
    vmov.f32    q8, #0.5                @ |||
    vmul.f32    q11, q12, q11           @ better inv
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]            @ gteSZ0 = gteSZ3
    vdup.32     q13, d13[0]             @ |
@   vrecps.f32  q12, q10, q11           @ step
@   vmul.f32    q11, q12, q11           @ better inv
    vmul.f32    q10, q13, q11           @ result
.else
    vmov.f32    q8, #0.5                @ |||
    vmovl.s32   q2, d4                  @ || gteOF|XY [64]
    vmovl.s32   q3, d6                  @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13               @ | gteH (float for div)
    vdup.32     q13, d13[0]             @ |
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]            @ gteSZ0 = gteSZ3

    vpush       {q0}
    vmov        q0, q10                 @ to test against C code
    vdiv.f32    s0, s26, s0
    vdiv.f32    s1, s26, s1
    vdiv.f32    s2, s26, s2
    vmov        q10, q0
    vpop        {q0}
.endif

    @ gteMAC3 of all three vertices. The positive-overflow bit is (1<<28),
    @ the same bit do_mac_flags uses for gteMAC3; (1<<27) is the gteMAC1
    @ negative-overflow bit and was wrong here.
    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
    orr         r7, r4, r5
    add         r4, r1, #8*8
    orr         r3, r7, r6              @ OR of the three gteMAC3 values
    ldmia       r4, {r7,r8,r10,r11}     @ min/max IR

    movs        r3, r3, lsr #16         @ any gteMAC3 outside 0..0xffff?
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)            @ fSZ (limD)

    vadd.f32    q10, q8                 @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16   q9, d1                  @ expand gteIR|12 v=0
    vmovl.s16   q10, d3                 @ expand gteIR|12 v=1
    add         r6, r1, #8*10
    vstmia      r6, {q8}                @ wb quotients for flags (pre-limE)
    vqshl.u32   q8, #15
    vmovl.s16   q11, d10                @ expand gteIR|12 v=2
    vshr.u32    q8, #15                 @ quotients (limE)
    vdup.32     d24, d16[0]
    vdup.32     d25, d16[1]
    vdup.32     d26, d17[0]             @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    @ each register packs two s16 values; V is raised when a half sits
    @ on an s16 extreme (upper half tested in place, lower after lsl #16)
    mov         r4, #0x10000
    cmp         r7, #1<<16
    cmnvc       r10, #1<<16
    orrvs       lr, #(1<<31)
    orrvs       lr, #(1<<23)            @ IR2/limB2
    rsbs        r2, r4, r7, lsl #16
    cmnvc       r4, r10, lsl #16
    orrvs       lr, #(1<<31)|(1<<24)    @ IR1/limB1
    rsbs        r2, r4, r8, lsl #16
    cmnvc       r4, r11, lsl #16
    orrvs       lr, #(1<<22)            @ IR3/limB3

    vmull.s32   q9, d18, d24            @ gteIR|12 * quotient v=0
    vmull.s32   q10, d20, d25           @ ... v=1
    vmull.s32   q11, d22, d26           @ ... v=2
    vadd.s64    q9, q2                  @ gteOF|XY + gteIR|12 * quotient
    vadd.s64    q10, q2                 @ ... v=1
    vadd.s64    q11, q2                 @ ... v=2
    vqmovn.s64  d18, q9                 @ saturate to 32 v=0
    vqmovn.s64  d19, q10                @ ... v=1
    vqmovn.s64  d20, q11                @ ... v=2
    vmin.s32    d14, d18, d19           @ || find min/max fS|XY(v) [32]
    vmax.s32    d15, d18, d19           @ || for flags
    vmin.s32    d14, d20
    vmax.s32    d15, d20
    vqshl.s32   q11, q9, #5             @ 11bit precision, v=0,1
    vqshl.s32   d24, d20, #5            @ ... v=2
    vmull.s32   q13, d6, d17            @ | gteDQA * quotient v=2
    vpmin.s32   d16, d14, d31           @ || also find min/max in pair
    vpmax.s32   d17, d15, d31           @ ||
    vshr.s32    q11, #16+5              @ can't vqshrn because of insn
    vshr.s32    d24, #16+5              @ encoding doesn't allow 21 :(
    vsli.u64    d16, d17, #32           @ || pack in-pair min/max
    vadd.s64    d26, d7                 @ | gteDQB + gteDQA * quotient
    vmovn.s32   d12, q11                @ fS|XY(v) [s16] v=0,1
    vmovn.s32   d13, q12                @ 3
    vstmia      r1, {d14-d16}           @ || other cacheline than quotients
    add         r3, r0, #4*12
    vst1.32     d12, [r3]!              @ writeback fS|XY v=0,1
    vst1.32     d13[0], [r3]

    vqmovn.s64  d26, q13                @ | gteMAC0
    @ NOTE(review): this zero-extends while gteRTPS_neon and
    @ gteMVMVA_part_neon sign-extend (vmovl.s16) before storing
    @ gteIR1-3 - confirm the upper halves are never consumed.
    vmovl.u16   q5, d10                 @ expand gteIR|123 v=2

    vmov.i32    d13, #0x1000
    vshr.s32    d12, d26, #12           @ gteMAC0 >> 12, input of gteIR0

    add         r3, r0, #4*24
    vst1.32     d26[0], [r3]!           @ gteMAC0
    vmax.s32    d12, d30                @ clamp to 0.. (d30 is 0)
    vst1.32     d8, [r3]!               @ gteMAC123 (last iteration)
    vst1.32     d9[0], [r3]

    vmin.s32    d12, d13                @ | gteIR0

    ldmia       r6, {r4-r6}             @ quotients
    orr         r4, r5
    orr         r4, r6
    add         r3, r0, #4*8
    movs        r4, r4, lsr #17         @ any pre-limE quotient above 0x1ffff?

    vst1.32     d12[0], [r3]!           @ gteIR0
    vst1.32     d10, [r3]!              @ gteIR12
    vst1.32     d11[0], [r3]            @ ..3

    @ ~23 cycles
    orrne       lr, #(1<<31)            @ limE
    orrne       lr, #(1<<17)            @ limE
    ldmia       r1, {r4-r9}             @ min fSX/fSY, max fSX/fSY, overall min, overall max
    add         r2, r4, #0x400<<16      @ min fSX
    add         r3, r6, #0x400<<16      @ max fSX
    lsrs        r2, #16+11
    lsrseq      r3, #16+11
    orrne       lr, #(1<<31)            @ limG1
    orrne       lr, #(1<<14)
    add         r2, r5, #0x400<<16      @ min fSY
    add         r3, r7, #0x400<<16      @ max fSY
    lsrs        r2, #16+11
    lsrseq      r3, #16+11
    orrne       lr, #(1<<31)            @ limG2
    orrne       lr, #(1<<13)
    adds        r2, r9, #1              @ overall max at 0x7fffffff?
    orrvs       lr, #(1<<16)            @ F (31 already done by above)
    subs        r3, r8, #1              @ overall min at 0x80000000?

    ldr         r4, [r0, #4*24]         @ gteMAC0
    orrvs       lr, #(1<<15)            @ still the subs result: ldr keeps flags

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)
    orrvs       lr, #(1<<31)            @ F
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)
    orrvs       lr, #(1<<31)            @ F
    @ gteIR0 was clamped from gteMAC0 >> 12 (see the vshr/vmax/vmin above),
    @ so limH has to look at the shifted value too; comparing gteMAC0 itself
    @ raised the bit for nearly every non-trivial gteMAC0. The unsigned
    @ "hi" also covers negative values.
    asr         r3, r4, #12
    cmp         r3, #0x1000
    orrhi       lr, #(1<<12)            @ limH

    str         lr, [r0, #4*(32+31)]    @ gteFLAG

    pop         {r4-r11,pc}
    .size       gteRTPT_neon, .-gteRTPT_neon
522
523
524
@ gteMVMVA_part_neon - matrix * vector + translation part of MVMVA;
@ writes gteMAC1-3 and gteIR1-3 but leaves gteFLAG alone
@ note: non-std calling convention used
@ in:    r0    = CP2 (d,c) (must preserve)
@        r1    = op; bit 19 selects the >> 12 of the sums, bit 10 the
@                clamping of negative gteIR values to 0
@        r4,r5 = VXYZ(v) packed (only the low half of r5 is used)
@        r6    = &MX11(mx)
@        r7    = &CV1(cv)
@ trash: r3, r5 (upper half cleared), condition flags, NEON registers
FUNCTION(gteMVMVA_part_neon):
    uxth        r5, r5                  @ lane 3 of the vector must be 0
    vmov.32     d8[0], r4
    vmov.32     d8[1], r5               @ VXYZ(v)
    vldmia      r6, {d0-d2}             @ MXxy/gteR* [16*9]
    vldmia      r7, {d4-d5}             @ CVx/gteTR*

    vmov.i32    q15, #0
    vext.16     d2, d1, d2, #2          @ xxx3 -> x321
    vext.16     d1, d0, d1, #3          @ xx32 -> x321
    vshll.s32   q3, d5, #12             @ gteTRZ/CV3
    vshll.s32   q2, d4, #12             @ gteTR|XY/CV12

    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8                  @ translation + row sums, as s64
    vpadal.s32  q3, q9
    tst         r1, #1<<19
    beq         0f                      @ bit 19 clear: keep the sums unshifted
    vshr.s64    q2, q2, #12
    vshr.s64    q3, q3, #12
0:
    vqmovn.s64  d8, q2                  @ gteMAC|12
    vqmovn.s64  d9, q3                  @ gteMAC3

    tst         r1, #1<<10              @ consumed by the beq below; nothing
    add         r3, r0, #4*25           @ in between touches the flags
    vqmovn.s32  d10, q4                 @ gteIR|123
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]             @ wb gteMAC|123

    beq         1f
    vmax.s16    d10, d31                @ bit 10 set: no negative gteIR
1:
    vmovl.s16   q9, d10                 @ expand gteIR|123
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]
    bx          lr
    .size       gteMVMVA_part_neon, .-gteMVMVA_part_neon
17ed0d69 575
054175e9 576
@ gteMACtoIR_flags_neon - compute gteFLAG after a
@ gteMVMVA_part_neon operation
@ in:    r0 = CP2 (d,c), r1 = lm
@ out:   gteFLAG stored through r0
@ saves: r4, r5
@ trash: r2, r3, r12, condition flags
@
@ The gteIR range test depends on lm:
@   lm == 0: (gteMAC + 0x8000) asr 16 != 0
@   lm != 0:  gteMAC           asr 15 != 0
FUNCTION(gteMACtoIR_flags_neon): @ r0=CP2 (d,c), r1=lm
    push        {r4,r5,lr}
    mov         lr, #0                  @ gteFLAG
    tst         r1, r1                  @ lm
    mov         r2, #0                  @ defaults, used when lm != 0
    mov         r12, #15
    moveq       r2, #0x8000             @ adj
    moveq       r12, #16                @ shift

    add         r3, r0, #4*25
    ldmia       r3, {r3-r5}             @ gteMAC|123

    do_mac_flags r3, r4, r5

    add         r3, r2
    add         r4, r2
    add         r5, r2
    asrs        r3, r12
    orrne       lr, #(1<<31)|(1<<24)    @ IR1/limB1
    asrs        r4, r12
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)            @ IR2/limB2
    asrs        r5, r12
    orrne       lr, #(1<<22)            @ IR3/limB3
    str         lr, [r0, #4*(32+31)]    @ gteFLAG

    pop         {r4,r5,pc}
    .size       gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon
17ed0d69 606
607
608
8cfbda97 609@ vim:filetype=armasm