gte_neon: implement RTPS and NCLIP
[pcsx_rearmed.git] / libpcsxcore / gte_neon.s
CommitLineData
8cfbda97 1/*
2 * (C) Gražvydas "notaz" Ignotas, 2011
3 *
4 * This work is licensed under the terms of any of these licenses
5 * (at your option):
6 * - GNU GPL, version 2 or later.
7 * - GNU LGPL, version 2.1 or later.
8 * See the COPYING file in the top-level directory.
9 */
10
11
@ scratch buffer used by gteRTPS_neon/gteRTPT_neon to hand intermediate
@ NEON results over to the ARM core for flag calculation
.bss
.p2align 6                              @ 64 bytes, cacheline

scratch:
    .space      8*8*2                   @ 128 zero bytes

.text
.p2align 2
22
@ approximate gteMAC|123 flags, ORed into lr (gteFLAG)
@ The V flag of "rr - 1" and "rr + 1" is the indicator, so a register is
@ only reported when it sits exactly at the s32 minimum / maximum.
@ in: rr 123 as gteMAC|123
@ trash: nothing
.macro do_mac_flags rr1 rr2 rr3
    @ gteMAC1
    cmp         \rr1, #1                @ V if rr1 == 0x80000000
    orrvs       lr, lr, #(1<<31)|(1<<27)
    cmn         \rr1, #1                @ V if rr1 == 0x7fffffff
    orrvs       lr, lr, #(1<<30)
    @ gteMAC2
    cmp         \rr2, #1
    orrvs       lr, lr, #(1<<31)|(1<<26)
    cmn         \rr2, #1
    orrvs       lr, lr, #(1<<29)
    @ gteMAC3
    cmp         \rr3, #1
    orrvs       lr, lr, #(1<<31)|(1<<25)
    cmn         \rr3, #1
    orrvs       lr, lr, #(1<<28)
.endm
40
@ approximate flags for 3 instances of the same gteMACn, ORed into lr
@ pflags is set when any of the registers equals the s32 maximum,
@ nflags when any of them equals the s32 minimum.
@ in: rr 123 as 3 instances gteMACn, *flags
@ trash: nothing
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
    @ positive limit: V of "rr + 1", first hit stops the chain
    cmn         \rr1, #1
    cmnvc       \rr2, #1
    cmnvc       \rr3, #1
    orrvs       lr, lr, #\pflags
    @ negative limit: V of "rr - 1"
    cmp         \rr1, #1
    cmpvc       \rr2, #1
    cmpvc       \rr3, #1
    orrvs       lr, lr, #\nflags
.endm
54
@ approximate signed gteIR|123 [32] flags, ORed into lr (gteFLAG)
@ Each value is shifted into the top halfword and tested with
@   0x10000 + (rr << 16)    -> V only if the low 16 bits are 0x7fff
@   (rr << 16) - 0x10000    -> V only if the low 16 bits are 0x8000
@ so a register is reported only when it sits exactly at an s16 limit.
@ All three registers use the same cmn/rsb pair; the reversed operand
@ order matters, as 0x10000 - (rr << 16) would also overflow for -0x7fff.
@ in: rr 123 as gteIR|123
@ trash: r2,r3
.macro do_irs_flags rr1 rr2 rr3
    mov         r2, #0x10000
    cmn         r2, \rr1, lsl #16       @ adds ...
    rsbvcs      r3, r2, \rr1, lsl #16
    orrvs       lr, lr, #(1<<31)|(1<<24) @ IR1/limB1
    cmn         r2, \rr2, lsl #16
    rsbvcs      r3, r2, \rr2, lsl #16
    orrvs       lr, lr, #(1<<31)        @ (bits 31 and 23 don't fit one imm)
    orrvs       lr, lr, #(1<<23)        @ IR2/limB2
    cmn         r2, \rr3, lsl #16
    rsbvcs      r3, r2, \rr3, lsl #16
    orrvs       lr, lr, #(1<<22)        @ IR3/limB3
.endm
71
72
73/*
5d8e3bf8 74 * RTPS/RTPT register map:
75 *
8cfbda97 76 * q | d | c code / phase 1 phase 2 scratch
77 * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
78 * 1 gteR2* gteIR1-3 = gteIR1-3 / *
79 * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
80 * 3 * gteIR1-3 = gteIR1-3 /
81 * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
82 * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
83 * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
84 * 7 0 gteDQB [s64] max gteMAC|12
85 * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
86 * 9 * / gteMAC3 max gteIR|123
87 * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
88 * 11 0 quotient 3
89 * 6 12 gteH (adj. for cmp)
90 * 13 gteH (float for div)
91 * ... <scratch>
92 * 15 30 0
93 * 31 0
94 */
8cfbda97 95
@ fetch gteR*, gteTR* and gteH into NEON registers (layout as in the
@ register map above) and zero q15
@ The 9 packed s16 matrix elements are realigned so that d0, d1 and d2
@ each start with one matrix row.
@ in: r0 - context
@ trash: r3
.macro rtpx_preload
    add         r3, r0, #4*32
    vldmia      r3, {d0-d2}             @ gteR* [16*9]
    add         r3, r0, #4*(32+5)
    vldmia      r3, {d4-d5}             @ gteTR*
    vmov.i32    q15, #0
    vshl.i64    d2, d2, #32             @ | row 3: make room, then ...
    add         r3, r0, #4*(32+26)
    vld1.32     {d11[0]}, [r3]          @ gteH
    vsri.u64    d2, d1, #32             @ | ... pull in upper half of d1
    vshll.s32   q3, d5, #12             @ gteTRZ << 12 [s64]
    vshll.s32   q2, d4, #12             @ gteTR|XY << 12 [s64]
    vshl.i64    d1, d1, #16             @ | row 2: make room, then ...
    vmovl.s16   q6, d11                 @ gteH
    vsri.u64    d1, d0, #48             @ | ... pull in top s16 of d0
.endm
8cfbda97 115
@ RTP* gteMAC* calculation: matrix rows times vector plus translation,
@ shifted right by 12 with saturation to s32, then narrowed again to
@ s16 for gteIR*
@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
@ trash: d16-d21
.macro rtpx_mac
    @ per-element products, one matrix row per q register
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    @ sum each row's 4 products into one s64; the gteRTPS_neon and
    @ gteRTPT_neon call sites zero d8[3], so the 4th product adds nothing
    vpaddl.s32  q8, q8
    vpaddl.s32  q9, q9
    vpaddl.s32  q10, q10
    vadd.s64    d16, d16, d17
    vadd.s64    d18, d18, d19
    vadd.s64    d20, d20, d21
    @ add the pre-shifted translation vector
    vadd.s64    d16, d16, d4
    vadd.s64    d18, d18, d5
    vadd.s64    d20, d20, d6
    vqshrn.s64  d8, q8, #12             @ gteMAC1
    vqshrn.s64  d18, q9, #12            @ gteMAC2
    vqshrn.s64  d9, q10, #12            @ gteMAC3
    vsli.u64    d8, d18, #32            @ gteMAC|12
    vmov.32     d9[1], r12              @ clear the unused upper word
    vqmovn.s32  d10, q4                 @ gteIR|123; losing 2 cycles?
.endm
140
@ RTPS for a single vector (VXYZ(0)); results and gteFLAG are written
@ back into the context.
@ in: r0 - CP2 context (d,c)
@ Uses the first 16 bytes of scratch to pass NEON results to ARM code.
@ NOTE(review): d8-d15 are overwritten here without being saved, while
@ AAPCS lists them as callee-saved - confirm all callers tolerate that.
.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push        {r4-r7,lr}

@   fmrx        r4, fpscr      @ vmrs? at least 40 cycle hit
    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    rtpx_preload

    vldmia      r0, {d8}       @ VXYZ(0)
    vmov.16     d8[3], r12     @ kill unused upper vector

    rtpx_mac

    add         r3, r0, #4*25
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]    @ wb gteMAC|123
    vmovl.s16   q9, d10        @ expand gteIR|123

    add         r3, r0, #4*17  @ gteSZ*
    vldmia      r3, {q7}       @ d14,d15 gteSZ|123x
    vmov.i32    d28, #0xffff   @ 0xffff[32]
    vmax.s32    d11, d9, d31
    vshr.s32    d16, d12, #1   @ | gteH/2 (adjust for cmp)
    vmov.i32    d26, #1
    vmin.u32    d11, d28       @ saturate to 0..0xffff limD/fSZ3
    vshl.u32    d13, d12, #16  @ | preparing gteH
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]   @ wb gteIR|123

    vsli.u64    d15, d11, #32  @ new gteSZ|0123 in q7
    vclt.u32    d16, d16, d11  @ gteH/2 < fSZ3?

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]       @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]       @ || gteDQ|AB

    vand        d11, d16
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmax.u32    d11, d26       @ make divisor 1 if not
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    add         r3, r0, #4*16  @ | gteSZ*
    vstmia      r3, {q7}       @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13      @ gteH (float for div)
    vcvt.f32.u32 d11, d11      @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32    s22, s26, s22

    vcvt.u32.f32 d11, d11      @ quotient

    @ while NEON's busy we calculate some flags on ARM
    add         r3, r0, #4*25
    mov         lr, #0         @ gteFLAG
    ldmia       r3, {r4-r6}    @ gteMAC|123

    vst1.32     d11, [r1]      @ wb quotient for flags (pre-limE)
    vqshl.u32   d11, #15

    do_mac_flags r4, r5, r6

    vshr.u32    d11, #15       @ quotient (limE)
    add         r3, r0, #4*9

    @ gteIR3 goes to r7 so that r6 still holds gteMAC3 for the limD
    @ test further down; r7 is reloaded from scratch before its next use
    ldmia       r3, {r4,r5,r7} @ gteIR|123
    do_irs_flags r4, r5, r7

    vmlal.s32   q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient
    add         r3, r0, #4*13
    vld1.32     d16, [r3]      @ || load fS|XY12, new 01
    vqmovn.s64  d18, q2        @ saturate to 32
    vmull.s32   q10, d6, d11[0]@ | d20 = gteDQA * quotient
    vqshl.s32   d19, d18, #5   @ 11bit precision

    ldr         r4, [r1]       @ quotient

    mov         r2, r1
    vst1.32     d18, [r2]!     @ || writeback fS|XY2 before limG
    vst1.32     d19, [r2]      @ || and after 11bit saturation

    movs        r3, r6, lsr #16 @ gteMAC3 outside 0..0xffff?
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)   @ fSZ (limD)

    vshr.s32    d18, d19, #16+5@ can't vqshrn because of insn
    vadd.s64    d20, d7        @ | gteDQB + gteDQA * quotient
    vmovn.s32   d18, q9        @ fS|XY2 [s16]

    vqshrn.s64  d20, q10, #12  @ | gteMAC0
    add         r3, r0, #4*12
    vst1.32     d16, [r3]!     @ writeback fS|XY01
    vst1.32     d18[0], [r3]   @ ...2
    add         r3, r0, #4*24
    vst1.32     d20[0], [r3]   @ gteMAC0

    vmax.s32    d21, d20, d31
    vmov.i32    d22, #0x1000
    vmin.s32    d21, d22
    add         r3, r0, #4*8
    vst1.16     d21[0], [r3]   @ gteIR0

    movs        r4, r4, lsr #17
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<17)   @ limE

    ldmia       r1, {r4-r7}    @ fS|XY2 before limG, after 11bit sat
    subs        r2, r6, #1<<21
    addvcs      r3, r6, #1<<21
    orrvs       lr, #(1<<14)   @ limG1
    orrvs       lr, #(1<<31)
    subs        r2, r7, #1<<21
    addvcs      r3, r7, #1<<21
    orrvs       lr, #(1<<13)   @ limG2
    orrvs       lr, #(1<<31)
    adds        r2, r4, #1
    addvcs      r3, r5, #1
    orrvs       lr, #(1<<16)   @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    subvcs      r3, r5, #1
    orrvs       lr, #(1<<31)

    ldr         r4, [r0, #4*24] @ gteMAC0
    orrvs       lr, #(1<<15)   @ F (flags still from the subs pair above)

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)   @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)   @ F
    orrvs       lr, #(1<<31)
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)   @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r7,pc}
    .size       gteRTPS_neon, .-gteRTPS_neon
285
286
287
@ RTPT: the RTPS calculation for the 3 vectors VXYZ(0..2); results and
@ gteFLAG are written back into the context.
@ in: r0 - CP2 context (d,c)
@ Phase 1 runs rtpx_mac per vector and parks gteMAC3/gteIR|123 of each
@ one in scratch; phase 2 does the 3 perspective divisions in parallel.
@ scratch layout during phase 2 (byte offsets from r1):
@    0,16,32  {gteMAC3, gteIR|123} of v=0,1,2
@          48  min/max gteMAC|12
@          64  min/max gteIR
@          80  quotients before limE
@ and later 0..23 is reused for min/max fS|XY and the in-pair min/max.
@ NOTE(review): d8-d15 are overwritten here without being saved, while
@ AAPCS lists them as callee-saved - confirm all callers tolerate that.
.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push        {r4-r11,lr}

    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    rtpx_preload

    vmov.i32    d22, #0x7fffffff
    vmov.i32    d23, #0x80000000
    mov         r3, #3         @ counter
    mov         r2, r0         @ VXYZ(0)
0:
    vldmia      r2!, {d8}      @ VXYZ(v)
    vmov.16     d8[3], r12     @ kill unused upper vector

    rtpx_mac
    vmin.s32    d22, d8        @ min gteMAC|12
    vmax.s32    d23, d8        @ max gteMAC|12
    subs        r3, #1
    vst1.32     {d9,d10}, [r1, :64]!
    bgt         0b

    vst1.32     {d22,d23}, [r1, :64]! @ min/max gteMAC|12 (for flags)

    @ - phase2 -
    sub         r1, r1, #8*2*4 @ back to the start of scratch
    vldmia      r1, {d0-d3}    @ note: d4,d5 is for gteOF|XY

    vmov        d20, d0        @ gteMAC3 v=0
    vmin.s16    d24, d1, d3    @ | find min IR
    vshr.s32    d22, d12, #1   @ || gteH/2 (adjust for cmp)
    vmax.s16    d25, d1, d3    @ | .. also max, for flag gen
    vsli.u64    d20, d2, #32   @ gteMAC3 v=1
    vmov        d21, d9        @ ... v=2

    vmov.i32    q14, #0xffff   @ 0xffff[32]
    vmax.s32    q10, q15
    vmov.i32    q13, #1
    vdup.32     q11, d22[0]    @ gteH/2
    vmin.u32    q10, q14       @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16    d24, d10       @ | find min/max IR
    vmax.s16    d25, d10       @ |

    add         r3, r0, #4*19  @ ||
    vld1.32     d14[0], [r3]   @ || gteSZ3

    vclt.u32    q11, q11, q10  @ gteH/2 < fSZ(v)?
    add         r3, r0, #4*17
    vst1.32     d20, [r3]!     @ | writeback fSZ(v)
    vand        q11, q10, q11
    vst1.32     d21[0], [r3]   @ |
    vmax.u32    q10, q11, q13  @ make divisor 1 if not
    add         r3, r1, #8*8
    vstmia      r3, {q12}      @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32    d13, d12, #16  @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add         r2, r1, #8*2*3
    mov         lr, #0         @ gteFLAG
    ldmia       r2, {r4-r7}    @ min/max gteMAC|12
    subs        r2, r4, #1
    orrvs       lr, #(1<<31)|(1<<27)
    subs        r3, r5, #1
    orrvs       lr, #(1<<31)|(1<<26)
    adds        r2, r6, #1
    orrvs       lr, #(1<<30)
    adds        r3, r7, #1
    orrvs       lr, #(1<<29)
    ldr         r4, [r1, #0]   @ gteMAC3 v=0
    ldr         r5, [r1, #8*2] @ ... v=1
    ldr         r6, [r1, #8*4] @ ... v=2

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]       @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]       @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32  q11, q10       @ inv
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    vrecps.f32  q12, q10, q11  @ step
    vcvt.f32.u32 d13, d13      @ | gteH (float for div)
    vmul.f32    q11, q12, q11  @ better inv
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]   @ gteSZ0 = gteSZ3
    vdup.32     q13, d13[0]    @ |
@   vrecps.f32  q12, q10, q11  @ step
@   vmul.f32    q11, q12, q11  @ better inv
    vmul.f32    q10, q13, q11  @ result
.else
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13      @ | gteH (float for div)
    vdup.32     q13, d13[0]    @ |
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]   @ gteSZ0 = gteSZ3

    vpush       {q0}
    vmov        q0, q10        @ to test against C code
    vdiv.f32    s0, s26, s0
    vdiv.f32    s1, s26, s1
    vdiv.f32    s2, s26, s2
    vmov        q10, q0
    vpop        {q0}
.endif

    @ gteMAC3 of all 3 vectors: bit 25 for the negative limit, bit 28
    @ for the positive one (same bits as do_mac_flags uses for rr3)
    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
    orr         r7, r4, r5
    add         r4, r1, #8*8
    orr         r3, r7, r6
    ldmia       r4, {r7,r8,r10,r11} @ min/max IR

    movs        r3, r3, lsr #16 @ any gteMAC3 outside 0..0xffff?
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)   @ fSZ (limD)

@   vadd.f32    q10, q         @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16   q9, d1         @ expand gteIR|12 v=0
    vmovl.s16   q10, d3        @ expand gteIR|12 v=1
    add         r6, r1, #8*10
    vstmia      r6, {q8}       @ wb quotients for flags (pre-limE)
    vqshl.u32   q8, #15
    vmovl.s16   q11, d10       @ expand gteIR|12 v=2
    vshr.u32    q8, #15        @ quotients (limE)
    vdup.32     d24, d16[0]
    vdup.32     d25, d16[1]
    vdup.32     d26, d17[0]    @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    @ r7/r10 hold IR1 in the low and IR2 in the high halfword
    mov         r4, #0x10000
    cmp         r7, #1<<16
    cmnvc       r10, #1<<16
    orrvs       lr, #(1<<31)
    orrvs       lr, #(1<<23)   @ IR2/limB2
    rsbs        r2, r4, r7, lsl #16
    cmnvc       r4, r10, lsl #16
    orrvs       lr, #(1<<31)|(1<<24) @ IR1/limB1
    rsbs        r2, r4, r8, lsl #16
    cmnvc       r4, r11, lsl #16
    orrvs       lr, #(1<<22)   @ IR3/limB3

    vmull.s32   q9, d18, d24   @ gteIR|12 * quotient v=0
    vmull.s32   q10, d20, d25  @ ... v=1
    vmull.s32   q11, d22, d26  @ ... v=2
    vadd.s64    q9, q2         @ gteOF|XY + gteIR|12 * quotient
    vadd.s64    q10, q2        @ ... v=1
    vadd.s64    q11, q2        @ ... v=2
    vqmovn.s64  d18, q9        @ saturate to 32 v=0
    vqmovn.s64  d19, q10       @ ... v=1
    vqmovn.s64  d20, q11       @ ... v=2
    vmin.s32    d14, d18, d19  @ || find min/max fS|XY(v) [32]
    vmax.s32    d15, d18, d19  @ || for flags
    vmin.s32    d14, d20
    vmax.s32    d15, d20
    vqshl.s32   q11, q9, #5    @ 11bit precision, v=0,1
    vqshl.s32   d24, d20, #5   @ ... v=2
    vmull.s32   q13, d6, d17   @ | gteDQA * quotient v=2
    vpmin.s32   d16, d14, d31  @ || also find min/max in pair
    vpmax.s32   d17, d15, d31  @ ||
    vshr.s32    q11, #16+5     @ can't vqshrn because of insn
    vshr.s32    d24, #16+5     @ encoding doesn't allow 21 :(
    vqshl.s32   q7, #5         @ || min/max pairs shifted
    vsli.u64    d16, d17, #32  @ || pack in-pair min/max
    vadd.s64    d26, d7        @ | gteDQB + gteDQA * quotient
    vmovn.s32   d12, q11       @ fS|XY(v) [s16] v=0,1
    vmovn.s32   d13, q12       @ 3
    vstmia      r1, {d14-d16}  @ || other cacheline than quotients
    add         r3, r0, #4*12
    vst1.32     d12, [r3]!     @ writeback fS|XY v=0,1
    vst1.32     d13[0], [r3]

    vqshrn.s64  d26, q13, #12  @ | gteMAC0
    @ NOTE(review): zero-extends here while gteRTPS_neon sign-extends
    @ gteIR|123 before the writeback - confirm the upper halfwords of
    @ the stored words are don't-care for every reader
    vmovl.u16   q5, d10        @ expand gteIR|123 v=2

    vmov.i32    d13, #0x1000
    vmax.s32    d12, d26, d30

    add         r3, r0, #4*24
    vst1.32     d26[0], [r3]!  @ gteMAC0
    vst1.32     d8, [r3]!      @ gteMAC123 (last iteration)
    vst1.32     d9[0], [r3]

    vmin.s32    d12, d13       @ | gteIR0

    ldmia       r6, {r4-r6}    @ quotients
    orr         r4, r5
    orr         r4, r6
    add         r3, r0, #4*8
    movs        r4, r4, lsr #17 @ flags consumed by the limE orrne below

    vst1.32     d12[0], [r3]!  @ gteIR0
    vst1.32     d10, [r3]!     @ gteIR12
    vst1.32     d11[0], [r3]   @ ..3

    @ ~20 cycles
    orrne       lr, #(1<<31)   @ limE
    orrne       lr, #(1<<17)   @ limE
    ldmia       r1, {r4-r9}    @ min fS|XY, max fS|XY, in-pair min/max
    subs        r2, r4, #1<<21 @ min fSX
    addvcs      r3, r6, #1<<21 @ max fSX
    orrvs       lr, #(1<<31)   @ limG1
    orrvs       lr, #(1<<14)
    subs        r2, r5, #1<<21 @ min fSY
    addvcs      r3, r7, #1<<21 @ max fSY
    orrvs       lr, #(1<<31)   @ limG2
    orrvs       lr, #(1<<13)
    adds        r2, r9, #1
    orrvs       lr, #(1<<31)   @ F
    orrvs       lr, #(1<<16)
    subs        r3, r8, #1
    orrvs       lr, #(1<<31)   @ F

    ldr         r4, [r0, #4*24] @ gteMAC0
    orrvs       lr, #(1<<15)   @ F (flags still from the subs above)

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)
    orrvs       lr, #(1<<31)   @ F
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)
    orrvs       lr, #(1<<31)   @ F
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)   @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    pop         {r4-r11,pc}
    .size       gteRTPT_neon, .-gteRTPT_neon
523
524
525
@ the name is a misnomer, this doesn't use NEON but oh well..
@ Computes SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1) from the three
@ packed screen coordinates at context word 12..14 as a 64bit sum, stores
@ it to gteMAC0 (clamped to the s32 limits if it doesn't fit) and writes
@ gteFLAG.
@ in: r0 - CP2 context (d,c)
.global gteNCLIP_neon @ r0=CP2 (d,c),
gteNCLIP_neon:
    push        {r4-r6,lr}

    add         r1, r0, #4*12
    ldmia       r1, {r1-r3}             @ packed SXY 0,1,2
    mov         lr, #0                  @ gteFLAG
    @ Y is the upper halfword, X the lower one
    mov         r4, r1, asr #16
    mov         r5, r2, asr #16
    mov         r6, r3, asr #16
    sxth        r1, r1
    sxth        r2, r2
    sxth        r3, r3
    sub         r12, r4, r5             @ 3: gteSY0 - gteSY1
    sub         r5, r5, r6              @ 1: gteSY1 - gteSY2
    sub         r6, r6, r4              @ 2: gteSY2 - gteSY0
    smull       r1, r5, r1, r5          @ RdLo, RdHi
    smlal       r1, r5, r2, r6
    smlal       r1, r5, r3, r12
    mov         r6, #1<<31
    orr         r6, r6, #(1<<15)        @ flag bits for the negative case
    @ r5 = 2*hi + bit31(lo): 0 or -1 exactly when the sum fits in s32
    movs        r2, r1, lsl #1
    adc         r5, r5, r5
    cmp         r5, #0
    movtgt      lr, #((1<<31)|(1<<16))>>16
    mvngt       r1, #1<<31              @ maxint
    cmn         r5, #1
    movmi       r1, #1<<31              @ minint
    orrmi       lr, lr, r6
    str         r1, [r0, #4*24]         @ context word 24
    str         lr, [r0, #4*(32+31)]    @ gteFLAG

    pop         {r4-r6,pc}
    .size       gteNCLIP_neon, .-gteNCLIP_neon
561
8cfbda97 562
563@ vim:filetype=armasm