gte_neon: trying to improve code
[pcsx_rearmed.git] / libpcsxcore / gte_neon.s
CommitLineData
8cfbda97 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
4 * This work is licensed under the terms of any of these licenses
5 * (at your option):
6 * - GNU GPL, version 2 or later.
7 * - GNU LGPL, version 2.1 or later.
8 * See the COPYING file in the top-level directory.
9 */
10
11
.bss
.p2align 6                          @ 2^6 = 64 bytes, i.e. one cacheline

@ 8*8*2 = 128 zero-initialised bytes; gteRTPS_neon and gteRTPT_neon use
@ this area to hand intermediate NEON results over to the ARM core.
scratch:
    .space      8*8*2

.text
.p2align 2
22
@ Approximate the gteMAC1/2/3 overflow bits of gteFLAG.
@ "rr - 1" overflows only when rr == 0x80000000 and "rr + 1" overflows only
@ when rr == 0x7fffffff, so a value sitting on either 32-bit limit is
@ reported as negative / positive overflow respectively.
@ in:    \rr1-\rr3 - gteMAC1-3, lr - gteFLAG accumulator
@ out:   lr - flag bits OR-ed in
@ trash: nothing (the condition flags are modified)
.macro do_mac_flags rr1 rr2 rr3
    @ gteMAC1
    cmp         \rr1, #1
    orrvs       lr, lr, #(1<<31)|(1<<27)
    cmp         \rr2, #1
    orrvs       lr, lr, #(1<<31)|(1<<26)
    cmp         \rr3, #1
    orrvs       lr, lr, #(1<<31)|(1<<25)
    @ positive side; cmn is the flag-only form of adds
    cmn         \rr2, #1
    orrvs       lr, lr, #(1<<29)
    cmn         \rr1, #1
    orrvs       lr, lr, #(1<<30)
    cmn         \rr3, #1
    orrvs       lr, lr, #(1<<28)
.endm
40
@ Approximate overflow flags for three instances of one gteMACn register.
@ The conditional compare chain stops updating the flags as soon as one
@ value has produced V, so V at the end means "at least one value sits on
@ the limit": 0x80000000 for the first chain, 0x7fffffff for the second.
@ in:    \rr1-\rr3 - three instances of gteMACn
@        \nflags   - bits to OR into lr on negative overflow
@        \pflags   - bits to OR into lr on positive overflow
@        lr        - gteFLAG accumulator
@ out:   lr - flag bits OR-ed in
@ trash: nothing (the condition flags are modified)
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    cmp         \rr1, #1            @ rr - 1 overflows for 0x80000000
    cmpvc       \rr2, #1
    cmpvc       \rr3, #1
    orrvs       lr, lr, #\nflags
    cmn         \rr1, #1            @ rr + 1 overflows for 0x7fffffff
    cmnvc       \rr2, #1
    cmnvc       \rr3, #1
    orrvs       lr, lr, #\pflags
.endm
54
@ approximate signed gteIR|123 [32] flags
@ Only the low 16 bits of each input are examined (the value is shifted
@ left by 16 before testing).  A 16-bit value of 0x7fff is reported as
@ positive saturation and -0x8000 as negative saturation:
@   0x10000 + (rr << 16) overflows only for rr == 0x7fff
@   (rr << 16) - 0x10000 overflows only for rr == -0x8000
@ in:    rr 123 as gteIR|123, lr - gteFLAG accumulator
@ out:   lr - limB1/limB2/limB3 bits OR-ed in
@ trash: r2,r3 (and the condition flags)
.macro do_irs_flags rr1 rr2 rr3
    mov         r2, #0x10000
    cmn         r2, \rr1, lsl #16       @ adds ...
    rsbvcs      r3, r2, \rr1, lsl #16
    orrvs       lr, #(1<<31)|(1<<24)    @ IR1/limB1
    cmn         r2, \rr2, lsl #16
    rsbvcs      r3, r2, \rr2, lsl #16
    orrvs       lr, #(1<<31)
    orrvs       lr, #(1<<23)            @ IR2/limB2
    cmn         r2, \rr3, lsl #16
    @ rsb, exactly as for IR1/IR2: "sub" would compute 0x10000 - (rr << 16),
    @ which also overflows for rr == -0x7fff and raises a false limB3.
    rsbvcs      r3, r2, \rr3, lsl #16
    orrvs       lr, #(1<<22)            @ IR3/limB3 (bit 31 is not set here)
.endm
71
72
73/*
5d8e3bf8 74 * RTPS/RTPT register map:
75 *
8cfbda97 76 * q | d | c code / phase 1 phase 2 scratch
77 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
78 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
79 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
80 * 3 * gteIR1-3 = gteIR1-3 /
81 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
82 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
83 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
84 * 7 0 gteDQB [s64] max gteMAC|12
85 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
86 * 9 * / gteMAC3 max gteIR|123
87 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
88 * 11 0 quotient 3
89 * 6 12 gteH (adj. for cmp)
90 * 13 gteH (float for div)
91 * ... <scratch>
92 * 15 30 0
93 * 31 0
94 */
8cfbda97 95
5d8e3bf8 96@ load gteR*, gteTR* and gteH (see map above), clear q15
97@ in: r0 - context
98@ trash: r3
@ Result layout, as produced by the instructions below:
@   d0,d1,d2 - the three rotation matrix rows, one row per d register
@   d4,d5,d6 - gteTRX, gteTRY, gteTRZ, each widened to 64 bit and << 12
@   q6       - the gteH word widened s16 -> s32 (gteH itself in d12[0])
@   q15      - all zeroes
@ r3 is only used as an address temporary.
@ Several steps below depend on their order because source and destination
@ registers overlap; do not reschedule without checking.
99.macro rtpx_preload
8cfbda97 100 add r3, r0, #4*32
101 vldmia r3, {d0-d2} @ gteR* [16*9]
4cc3050c 102 vmov.i32 q15, #0
8cfbda97 103 add r3, r0, #4*(32+5)
104 vldmia r3, {d4-d5} @ gteTR*
@ d2 has to be rebuilt first: it still needs the unmodified d1.
4cc3050c 105 vext.16 d2, d1, d2, #2 @ xxx3 -> x321
106 vext.16 d1, d0, d1, #3 @ xx32 -> x321
8cfbda97 107 add r3, r0, #4*(32+26)
@ only lane 0 of d11 is loaded, the upper lane keeps its previous contents
108 vld1.32 d11[0], [r3] @ gteH
@ q3 is produced from d5 before q2 is written, because q2 is d4:d5.
5d8e3bf8 109 vshll.s32 q3, d5, #12 @ gteTRZ
110 vshll.s32 q2, d4, #12 @ gteTR|XY
@ NOTE(review): this is a signed widening of the low halfword; presumably
@ gteH is expected to stay below 0x8000 - confirm.
8cfbda97 111 vmovl.s16 q6, d11 @ gteH
5d8e3bf8 112.endm
8cfbda97 113
5d8e3bf8 114@ do RTP* gteMAC* calculation
115@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
116@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
117@ trash: d16-d21
@ For each matrix row (d0, d1, d2) this forms the dot product with the
@ vector in d8, adds the pre-shifted translation (d4, d5, d6), then shifts
@ right by 12 with saturation to 32 bit.
@ The caller must have cleared d8.16[3], see the note at the vadd.s64 lines.
118.macro rtpx_mac
@ four 16x16 -> 32 bit products per row
8cfbda97 119 vmull.s16 q8, d0, d8
120 vmull.s16 q9, d1, d8
121 vmull.s16 q10, d2, d8
@ pairwise add, widening to 64 bit: each q now holds two partial sums
122 vpaddl.s32 q8, q8
123 vpaddl.s32 q9, q9
124 vpaddl.s32 q10, q10
125 vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
126 vadd.s64 d18, d19 @ d8[3]==0, so won't affect
127 vadd.s64 d20, d21 @ QC
@ add the translation vector (already << 12)
128 vadd.s64 d16, d4
129 vadd.s64 d18, d5
130 vadd.s64 d20, d6
@ Each narrowing shift works on a whole q register; only lane 0 of every
@ result is meaningful here, lane 1 is replaced by the vsli/vmov below.
131 vqshrn.s64 d8, q8, #12 @ gteMAC1
132 vqshrn.s64 d18, q9, #12 @ gteMAC2
133 vqshrn.s64 d9, q10, #12 @ gteMAC3
@ insert gteMAC2 (d18 lane 0) into the upper 32 bits of d8
134 vsli.u64 d8, d18, #32 @ gteMAC|12
@ r12 is 0 on entry, so this clears the unused upper lane of d9
135 vmov.32 d9[1], r12
@ saturating narrow of q4 (d8:d9) to four s16 values
5d8e3bf8 136 vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
137.endm
138
@ gteRTPS_neon - RTPS (perspective transformation, single vertex)
@ in:    r0 - CP2 register block: data registers at +0, control registers
@             at +4*32 (32 bit each)
@ out:   gteMAC0-3, gteIR0-3, gteSZ0-3, the screen XY fifo and gteFLAG are
@        written back into the register block
@ trash: r1-r3, r12, flags; NEON registers including d8-d15, which are
@        modified and not saved here
@ scratch usage: +0  quotient before limE, later replaced by
@                    fS|XY2 before limG (+0) and after 11bit saturation (+8)
.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push        {r4-r7,lr}

@   fmrx        r4, fpscr          @ vmrs? at least 40 cycle hit
    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    vldmia      r0, {d8}            @ VXYZ(0)
    rtpx_preload

@   rtpx_mac                        @ slower here, faster in RTPT?
    vmov.16     d8[3], r12          @ kill unused upper vector
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9              @ d6, d18 is slow?
    vqshrn.s64  d8, q2, #12         @ gteMAC|12
    vqshrn.s64  d9, q3, #12         @ gteMAC3

    add         r3, r0, #4*25
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]         @ wb gteMAC|123
    vqmovn.s32  d10, q4             @ gteIR|123

    add         r3, r0, #4*17       @ gteSZ*
    vldmia      r3, {q7}            @ d14,d15 gteSZ|123x
    vmov.i32    d28, #0xffff        @ 0xffff[32]
    vmax.s32    d11, d9, d31
    vshr.s32    d16, d12, #1        @ | gteH/2 (adjust for cmp)
    vmov.i32    d26, #1
    vmin.u32    d11, d28            @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16   q9, d10             @ || expand gteIR|123
    vshl.u32    d13, d12, #16       @ | preparing gteH
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]

    vsli.u64    d15, d11, #32       @ new gteSZ|0123 in q7
    vclt.u32    d16, d16, d11       @ gteH/2 < fSZ3?

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]            @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]            @ || gteDQ|AB

    vand        d11, d16
    vmovl.s32   q2, d4              @ || gteOF|XY [64]
    vmax.u32    d11, d26            @ make divisor 1 if not
    vmovl.s32   q3, d6              @ || gteDQ|AB [64]
    add         r3, r0, #4*16       @ | gteSZ*
    vstmia      r3, {q7}            @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13           @ gteH (float for div)
    vcvt.f32.u32 d11, d11           @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32    s22, s26, s22

    vcvt.u32.f32 d11, d11           @ quotient

    @ while NEON's busy we calculate some flags on ARM
    add         r3, r0, #4*25
    mov         lr, #0              @ gteFLAG
    ldmia       r3, {r4-r6}         @ gteMAC|123

    vst1.32     d11, [r1, :64]      @ wb quotient for flags (pre-limE)
    vqshl.u32   d11, #15

    do_mac_flags r4, r5, r6

    @ limD has to be derived from gteMAC3, so it is evaluated here while r6
    @ still holds it; further down r4-r6 are reloaded with gteIR|123, and
    @ IR3 is already clamped to 16 bit so it cannot show a value > 0xffff.
    movs        r3, r6, lsr #16     @ non-zero if gteMAC3 < 0 or > 0xffff
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)        @ fSZ (limD)

    vshr.u32    d11, #15            @ quotient (limE)
    add         r3, r0, #4*9

    ldmia       r3, {r4-r6}         @ gteIR|123
    do_irs_flags r4, r5, r6

    vmlal.s32   q2, d18, d11[0]     @ gteOF|XY + gteIR|12 * quotient
    add         r3, r0, #4*13
    vld1.32     d16, [r3]           @ || load fS|XY12, new 01
    vqmovn.s64  d18, q2             @ saturate to 32
    vmull.s32   q10, d6, d11[0]     @ | d20 = gteDQA * quotient
    vqshl.s32   d19, d18, #5        @ 11bit precision

    @ must be read before the two stores below reuse the same scratch slot
    ldr         r4, [r1]            @ quotient

    mov         r2, r1
    vst1.32     d18, [r2]!          @ || writeback fS|XY2 before limG
    vst1.32     d19, [r2]           @ || and after 11bit saturation

    vshr.s32    d18, d19, #16+5     @ can't vqshrn because of insn
    vadd.s64    d20, d7             @ | gteDQB + gteDQA * quotient
    vmovn.s32   d18, q9             @ fS|XY2 [s16]

    vqshrn.s64  d20, q10, #12       @ | gteMAC0
    add         r3, r0, #4*12
    vst1.32     d16, [r3]!          @ writeback fS|XY01
    vst1.32     d18[0], [r3]        @ ...2
    add         r3, r0, #4*24
    vst1.32     d20[0], [r3]        @ gteMAC0

    vmax.s32    d21, d20, d31       @ d31 is zero (q15 cleared by preload)
    vmov.i32    d22, #0x1000
    vmin.s32    d21, d22
    add         r3, r0, #4*8
    vst1.16     d21[0], [r3]        @ gteIR0

    movs        r4, r4, lsr #17
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<17)        @ limE

    ldmia       r1, {r4-r7}         @ fS|XY2 before limG, after 11bit sat
    subs        r2, r6, #1<<21
    addvcs      r3, r6, #1<<21
    orrvs       lr, #(1<<14)        @ limG1
    orrvs       lr, #(1<<31)
    subs        r2, r7, #1<<21
    addvcs      r3, r7, #1<<21
    orrvs       lr, #(1<<13)        @ limG2
    orrvs       lr, #(1<<31)
    adds        r2, r4, #1
    addvcs      r3, r5, #1
    orrvs       lr, #(1<<16)        @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    subvcs      r3, r5, #1
    orrvs       lr, #(1<<31)

    ldr         r4, [r0, #4*24]     @ gteMAC0
    orrvs       lr, #(1<<15)        @ still the V from the subs/subvcs above

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)        @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)        @ F
    orrvs       lr, #(1<<31)
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)        @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r7,pc}
    .size       gteRTPS_neon, .-gteRTPS_neon
293
294
295
@ gteRTPT_neon - RTPT (perspective transformation, three vertices)
@ in:    r0 - CP2 register block: data registers at +0, control registers
@             at +4*32 (32 bit each)
@ out:   gteMAC0-3, gteIR0-3, gteSZ0-3, the screen XY fifo and gteFLAG are
@        written back into the register block
@ trash: r1-r3, r12, flags; NEON registers including d8-d15, which are
@        modified and not saved here
@ scratch usage (byte offsets):
@   +0  .. +47  per vertex {d9,d10} = gteMAC3 / gteIR|123, v=0..2;
@               +0..+23 is later reused for min/max fS|XY (d14-d16)
@   +48 .. +63  min/max gteMAC|12
@   +64 .. +79  min/max gteIR
@   +80 .. +95  quotients before limE
.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push        {r4-r11,lr}

    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    rtpx_preload

    vmov.i32    d22, #0x7fffffff
    vmov.i32    d23, #0x80000000
    mov         r3, #3              @ counter
    mov         r2, r0              @ VXYZ(0)
0:
    vldmia      r2!, {d8}           @ VXYZ(v)
    vmov.16     d8[3], r12          @ kill unused upper vector

    rtpx_mac
    vmin.s32    d22, d8             @ min gteMAC|12
    vmax.s32    d23, d8             @ max gteMAC|12
    subs        r3, #1
    vst1.32     {d9,d10}, [r1, :128]!
    bgt         0b

    vst1.32     {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub         r1, r1, #8*2*4      @ back to the start of scratch
    vldmia      r1, {d0-d3}         @ note: d4,d5 is for gteOF|XY

    vmov        d20, d0             @ gteMAC3 v=0
    vmin.s16    d24, d1, d3         @ | find min IR
    vshr.s32    d22, d12, #1        @ || gteH/2 (adjust for cmp)
    vmax.s16    d25, d1, d3         @ | .. also max, for flag gen
    vsli.u64    d20, d2, #32        @ gteMAC3 v=1
    vmov        d21, d9             @ ... v=2

    vmov.i32    q14, #0xffff        @ 0xffff[32]
    vmax.s32    q10, q15
    vmov.i32    q13, #1
    vdup.32     q11, d22[0]         @ gteH/2
    vmin.u32    q10, q14            @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16    d24, d10            @ | find min/max IR
    vmax.s16    d25, d10            @ |

    add         r3, r0, #4*19       @ ||
    vld1.32     d14[0], [r3]        @ || gteSZ3

    vclt.u32    q11, q11, q10       @ gteH/2 < fSZ(v)?
    add         r3, r0, #4*17
    vst1.32     d20, [r3]!          @ | writeback fSZ(v)
    vand        q11, q10, q11
    vst1.32     d21[0], [r3]        @ |
    vmax.u32    q10, q11, q13       @ make divisor 1 if not
    add         r3, r1, #8*8
    vstmia      r3, {q12}           @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32    d13, d12, #16       @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add         r2, r1, #8*2*3
    mov         lr, #0              @ gteFLAG
    ldmia       r2, {r4-r7}         @ min/max gteMAC|12
    subs        r2, r4, #1
    orrvs       lr, #(1<<31)|(1<<27)
    subs        r3, r5, #1
    orrvs       lr, #(1<<31)|(1<<26)
    adds        r2, r6, #1
    orrvs       lr, #(1<<30)
    adds        r3, r7, #1
    orrvs       lr, #(1<<29)
    ldr         r4, [r1, #0]        @ gteMAC3 v=0
    ldr         r5, [r1, #8*2]      @ ... v=1
    ldr         r6, [r1, #8*4]      @ ... v=2

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]            @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]            @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32  q11, q10            @ inv
    vmovl.s32   q2, d4              @ || gteOF|XY [64]
    vmovl.s32   q3, d6              @ || gteDQ|AB [64]
    vrecps.f32  q12, q10, q11       @ step
    vcvt.f32.u32 d13, d13           @ | gteH (float for div)
    vmul.f32    q11, q12, q11       @ better inv
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]        @ gteSZ0 = gteSZ3
    vdup.32     q13, d13[0]         @ |
@   vrecps.f32  q12, q10, q11       @ step
@   vmul.f32    q11, q12, q11       @ better inv
    vmul.f32    q10, q13, q11       @ result
.else
    vmovl.s32   q2, d4              @ || gteOF|XY [64]
    vmovl.s32   q3, d6              @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13           @ | gteH (float for div)
    vdup.32     q13, d13[0]         @ |
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]        @ gteSZ0 = gteSZ3

    vpush       {q0}
    vmov        q0, q10             @ to test against C code
    vdiv.f32    s0, s26, s0
    vdiv.f32    s1, s26, s1
    vdiv.f32    s2, s26, s2
    vmov        q10, q0
    vpop        {q0}
.endif

    @ gteMAC3 overflow for all three vertices.  Bit assignment follows
    @ do_mac_flags: negative overflow -> (1<<31)|(1<<25), positive
    @ overflow -> (1<<28).  ((1<<27) is the gteMAC1 negative bit.)
    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
    orr         r7, r4, r5
    add         r4, r1, #8*8
    orr         r3, r7, r6          @ OR of all three gteMAC3 values
    ldmia       r4, {r7,r8,r10,r11} @ min/max IR

    movs        r3, r3, lsr #16     @ any of them < 0 or > 0xffff?
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)        @ fSZ (limD)

@   vadd.f32    q10, q              @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16   q9, d1              @ expand gteIR|12 v=0
    vmovl.s16   q10, d3             @ expand gteIR|12 v=1
    add         r6, r1, #8*10
    vstmia      r6, {q8}            @ wb quotients for flags (pre-limE)
    vqshl.u32   q8, #15
    vmovl.s16   q11, d10            @ expand gteIR|12 v=2
    vshr.u32    q8, #15             @ quotients (limE)
    vdup.32     d24, d16[0]
    vdup.32     d25, d16[1]
    vdup.32     d26, d17[0]         @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    @ r7/r10 hold IR1 in the low and IR2 in the high halfword,
    @ r8/r11 hold IR3 in the low halfword
    mov         r4, #0x10000
    cmp         r7, #1<<16
    cmnvc       r10, #1<<16
    orrvs       lr, #(1<<31)
    orrvs       lr, #(1<<23)        @ IR2/limB2
    rsbs        r2, r4, r7, lsl #16
    cmnvc       r4, r10, lsl #16
    orrvs       lr, #(1<<31)|(1<<24) @ IR1/limB1
    rsbs        r2, r4, r8, lsl #16
    cmnvc       r4, r11, lsl #16
    orrvs       lr, #(1<<22)        @ IR3/limB3

    vmull.s32   q9, d18, d24        @ gteIR|12 * quotient v=0
    vmull.s32   q10, d20, d25       @ ... v=1
    vmull.s32   q11, d22, d26       @ ... v=2
    vadd.s64    q9, q2              @ gteOF|XY + gteIR|12 * quotient
    vadd.s64    q10, q2             @ ... v=1
    vadd.s64    q11, q2             @ ... v=2
    vqmovn.s64  d18, q9             @ saturate to 32 v=0
    vqmovn.s64  d19, q10            @ ... v=1
    vqmovn.s64  d20, q11            @ ... v=2
    vmin.s32    d14, d18, d19       @ || find min/max fS|XY(v) [32]
    vmax.s32    d15, d18, d19       @ || for flags
    vmin.s32    d14, d20
    vmax.s32    d15, d20
    vqshl.s32   q11, q9, #5         @ 11bit precision, v=0,1
    vqshl.s32   d24, d20, #5        @ ... v=2
    vmull.s32   q13, d6, d17        @ | gteDQA * quotient v=2
    vpmin.s32   d16, d14, d31       @ || also find min/max in pair
    vpmax.s32   d17, d15, d31       @ ||
    vshr.s32    q11, #16+5          @ can't vqshrn because of insn
    vshr.s32    d24, #16+5          @ encoding doesn't allow 21 :(
    vqshl.s32   q7, #5              @ || min/max pairs shifted
    vsli.u64    d16, d17, #32       @ || pack in-pair min/max
    vadd.s64    d26, d7             @ | gteDQB + gteDQA * quotient
    vmovn.s32   d12, q11            @ fS|XY(v) [s16] v=0,1
    vmovn.s32   d13, q12            @ 3
    vstmia      r1, {d14-d16}       @ || other cacheline than quotients
    add         r3, r0, #4*12
    vst1.32     d12, [r3]!          @ writeback fS|XY v=0,1
    vst1.32     d13[0], [r3]

    vqshrn.s64  d26, q13, #12       @ | gteMAC0
    @ NOTE(review): zero extension here, while gteRTPS_neon sign-extends
    @ (vmovl.s16) gteIR|123 before writing it back - confirm this is intended.
    vmovl.u16   q5, d10             @ expand gteIR|123 v=2

    vmov.i32    d13, #0x1000
    vmax.s32    d12, d26, d30       @ d30 is zero (q15 cleared by preload)

    add         r3, r0, #4*24
    vst1.32     d26[0], [r3]!       @ gteMAC0
    vst1.32     d8, [r3]!           @ gteMAC123 (last iteration)
    vst1.32     d9[0], [r3]

    vmin.s32    d12, d13            @ | gteIR0

    ldmia       r6, {r4-r6}         @ quotients
    orr         r4, r5
    orr         r4, r6
    add         r3, r0, #4*8
    movs        r4, r4, lsr #17     @ flags consumed by the orrne pair below

    vst1.32     d12[0], [r3]!       @ gteIR0
    vst1.32     d10, [r3]!          @ gteIR12
    vst1.32     d11[0], [r3]        @ ..3

    @ ~20 cycles
    orrne       lr, #(1<<31)        @ limE
    orrne       lr, #(1<<17)        @ limE
    ldmia       r1, {r4-r9}         @ min fS|XY, max fS|XY (<<5), pair min, pair max
    subs        r2, r4, #1<<21      @ min fSX
    addvcs      r3, r6, #1<<21      @ max fSX
    orrvs       lr, #(1<<31)        @ limG1
    orrvs       lr, #(1<<14)
    subs        r2, r5, #1<<21      @ min fSY
    addvcs      r3, r7, #1<<21      @ max fSY
    orrvs       lr, #(1<<31)        @ limG2
    orrvs       lr, #(1<<13)
    adds        r2, r9, #1
    orrvs       lr, #(1<<31)        @ F
    orrvs       lr, #(1<<16)
    subs        r3, r8, #1
    orrvs       lr, #(1<<31)        @ F

    ldr         r4, [r0, #4*24]     @ gteMAC0
    orrvs       lr, #(1<<15)        @ still the V from "subs r3, r8, #1"

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)
    orrvs       lr, #(1<<31)        @ F
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)
    orrvs       lr, #(1<<31)        @ F
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)        @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r11,pc}
    .size       gteRTPT_neon, .-gteRTPT_neon
531
532
533
@ The name is a misnomer: no NEON instruction is used here, but oh well..
@ Computes SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1) as a 64 bit sum,
@ stores it clamped to 32 bit at +4*24 and the resulting flags at
@ +4*(32+31).
@ in:    r0 - CP2 register block; the three words at +4*12 each hold
@             an X in the low and a Y in the high halfword
@ trash: r1-r3, r12, flags
.global gteNCLIP_neon @ r0=CP2 (d,c),
gteNCLIP_neon:
    push        {r4-r6,lr}

    add         r1, r0, #4*12
    ldmia       r1, {r1-r3}         @ packed XY 0, 1, 2
    mov         lr, #0              @ gteFLAG

    @ split every word into its signed halves
    mov         r4, r1, asr #16     @ gteSY0
    mov         r5, r2, asr #16     @ gteSY1
    mov         r6, r3, asr #16     @ gteSY2
    sxth        r1, r1              @ gteSX0
    sxth        r2, r2              @ gteSX1
    sxth        r3, r3              @ gteSX2

    @ Y differences; the order matters, r5 and r6 are reused in place
    sub         r12, r4, r5         @ 3: gteSY0 - gteSY1
    sub         r5, r5, r6          @ 1: gteSY1 - gteSY2
    sub         r6, r6, r4          @ 2: gteSY2 - gteSY0

    @ 64 bit accumulation in r5:r1 (RdHi:RdLo)
    smull       r1, r5, r1, r5
    smlal       r1, r5, r2, r6
    smlal       r1, r5, r3, r12

    @ r5 = bits 62..31 of the sum: 0 or -1 means it fits into 32 bit
    movs        r2, r1, lsl #1      @ C = bit 31 of the low word
    adc         r5, r5, r5
    cmp         r5, #0
    movtgt      lr, #((1<<31)|(1<<16))>>16
    mvngt       r1, #1<<31          @ maxint
    cmn         r5, #1
    movmi       r1, #1<<31          @ minint
    orrmi       lr, lr, #1<<31
    orrmi       lr, lr, #1<<15
    str         r1, [r0, #4*24]
    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r6,pc}
    .size       gteNCLIP_neon, .-gteNCLIP_neon
569
8cfbda97 570
571@ vim:filetype=armasm