@ gte: starting some NEON code
@ pcsx_rearmed: libpcsxcore/gte_neon.s
/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of any of these licenses
 * (at your option):
 *  - GNU GPL, version 2 or later.
 *  - GNU LGPL, version 2.1 or later.
 * See the COPYING file in the top-level directory.
 */


.bss
.align 6 @ cacheline (64-byte) alignment

@ 8*8*2 = 128 bytes (32 words) of zero-initialized workspace, addressed
@ through r1 by the NEON GTE routines as a staging area for intermediate
@ per-vertex results and flag material.
scratch:
.rept 8*8*2/4
    .word 0
.endr

.text
.align 2

@ approximate signed gteIR|123 flags
@ Each test places one packed s16 IR value in the top halfword and then
@ adds/subtracts 1<<16: V is set when that halfword sits at a saturation
@ bound (adds overflows for 0x7fff, the conditional subvcs for -0x8000),
@ i.e. the value was (most likely) clamped by limB.  This is approximate:
@ carry from the low halfword can disturb the test, and a value that
@ merely equals a bound also trips it.
@ in:    rl/rh with packed gteIR|123 (halfword layout per the flag
@        comments below -- NOTE(review): ordering differs from the inline
@        IR checks in gteRTPT_neon; confirm against the macro's callers)
@ out:   lr accumulates gteFLAG bits (31=error sum, 24/23/22=limB1/2/3)
@ trash: r2,r3,r4
.macro do_irs_flags rl rh
    mov     r4, \rl, ror #16            @ bring \rl's other halfword on top
    adds    r2, \rl, #1<<16             @ V: top halfword == 0x7fff
    subvcs  r3, \rl, #1<<16             @ V: top halfword == -0x8000
    orrvs   lr, #(1<<31)|(1<<24)        @ IR1/limB1
    adds    r2, r4, #1<<16
    subvcs  r3, r4, #1<<16
    mov     r4, \rh, lsl #16            @ move \rh's low halfword on top (flags preserved)
    orrvs   lr, #(1<<31)
    orrvs   lr, #(1<<23)                @ IR2/limB2
    adds    r2, r4, #1<<16
    subvcs  r3, r4, #1<<16
    orrvs   lr, #(1<<22)                @ IR3/limB3
.endm


/*
 * NEON register allocation map for gteRTPT_neon:
 *
 *  q |  d | code / phase 1        phase 2             scratch
 *  0 |  0 | gteR1* [s16]          gteMAC3 =           \ v=0 *
 *    |  1 | gteR2*                gteIR1-3 =          /      *
 *  1 |  2 | gteR3*                gteMAC3 =           \ v=1
 *    |  3 | *                     gteIR1-3 =          /
 *  2 |  4 | gteTRX<<12 [s64]      gteOFX [s64]        gteMAC3  \ v=2
 *    |  5 | gteTRY<<12            gteOFY [s64]        gteIR1-3 /
 *  3 |  6 | gteTRZ<<12            gteDQA [s64]        min gteMAC|12 v=012
 *    |  7 | 0                     gteDQB [s64]        max gteMAC|12
 *  4 |  8 | VXYZ(v)  /            gteMAC1,2 [s32]     min gteIR|123
 *    |  9 | *        /            gteMAC3             max gteIR|123
 *  5 | 10 | gteIR1-3 [s16]        gteIR1-3 v=2        quotients 12
 *    | 11 | 0                                         quotient 3
 *  6 | 12 | gteH (adj. for cmp)
 *    | 13 | gteH (float for div)
 *  ...                                                <scratch>
 * 15 | 30 | 0
 *    | 31 | 0
 */
@-----------------------------------------------------------------------
@ gteRTPT: rotate, translate and perspective-transform three vertices
@ (v=0,1,2).  C prototype: void gteRTPT_neon(void *cp2_regs);
@ in:    r0 = CP2 register file; data regs at [r0], control regs at
@        [r0 + 4*32] (all #4*(32+n) offsets below index control regs)
@ out:   gteSZ0-3, fS|XY, gteMAC0-3, gteIR0-3 and gteFLAG written back
@        into the register file; nothing returned in registers
@ trash: r1-r12, lr, q0-q15
@ NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS but are
@ clobbered here without save/restore -- presumably the emulator's
@ callers tolerate this; confirm before calling from generic C code.
@ Comment markers "|" / "||" tag instructions of secondary computations
@ interleaved with the main stream to hide NEON pipeline latency.
.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push    {r4-r11,lr}

@ fmrx r4, fpscr @ vmrs?
    movw    r1, #:lower16:scratch       @ r1 = scratch workspace pointer
    movt    r1, #:upper16:scratch
    mov     r12, #0                     @ r12 = 0, used to clear vector lanes
    veor    q15, q15                    @ q15 = 0

    @ load rotation matrix, translation vector, H; shift the SZ fifo
    add     r3, r0, #4*32
    vldmia  r3, {d0-d2}                 @ gteR* [16*9]
    add     r3, r0, #4*(32+5)
    vldmia  r3, {d4-d5}                 @ gteTR*
    vshl.i64 d2, d2, #32                @ | repack gteR so each of d0-d2
    add     r3, r0, #4*(32+26)
    vld1.32 d11[0], [r3]                @ gteH
    vsri.u64 d2, d1, #32                @ | holds one 3x s16 matrix row
    add     r3, r0, #4*19
    vld1.32 d14[0], [r3]                @ gteSZ3
    vshll.s32 q3, d5, #12               @ gteTRZ -> s64, pre-shifted by 12
    vshll.s32 q2, d4, #12               @ gteTRX
    vshl.i64 d1, d1, #16                @ |
    add     r3, r0, #4*16
    vst1.32 d14[0], [r3]                @ gteSZ0 = gteSZ3
    vmovl.s16 q6, d11                   @ gteH
    vsri.u64 d1, d0, #48                @ |

    vmov.i32 d22, #0x7fffffff           @ running min gteMAC|12
    vmov.i32 d23, #0x80000000           @ running max gteMAC|12
    mov     r3, #3                      @ counter
    mov     r2, r0                      @ VXYZ(0)
0:  @ phase 1, one iteration per vertex: MAC1-3 = (TR<<12 + R*V) >> 12
    vldmia  r2!, {d8}                   @ VXYZ(v)
    vmov.16 d8[3], r12                  @ kill unused upper vector

    vmull.s16 q8, d0, d8                @ matrix row * V, 4x s32 products
    vmull.s16 q9, d1, d8
    vmull.s16 q10, d2, d8
    vpaddl.s32 q8, q8                   @ pairwise widen-add to s64
    vpaddl.s32 q9, q9
    vpaddl.s32 q10, q10
    vadd.s64 d16, d17                   @ d16=d0.16[2]*d8.16[2], as
    vadd.s64 d18, d19                   @ d8[3]==0, so won't affect
    vadd.s64 d20, d21                   @ QC
    vadd.s64 d16, d4                    @ + gteTR*<<12
    vadd.s64 d18, d5
    vadd.s64 d20, d6
    vqshrn.s64 d8, q8, #12              @ gteMAC1
    vqshrn.s64 d18, q9, #12             @ gteMAC2
    vqshrn.s64 d9, q10, #12             @ gteMAC3
    vsli.u64 d8, d18, #32               @ gteMAC|12
    vmov.32 d9[1], r12
    vqmovn.s32 d10, q4                  @ gteIR1-3; losing 2 cycles?
    vmin.s32 d22, d8                    @ min gteMAC|12
    vmax.s32 d23, d8                    @ max gteMAC|12
    subs    r3, #1
    vst1.32 {d9,d10}, [r1, :64]!        @ stash gteMAC3 / gteIR1-3 per vertex
    bgt     0b

    vst1.32 {d22,d23}, [r1, :64]!       @ min/max gteMAC|12 (for flags)

    @ - phase2 -
    sub     r1, r1, #8*2*4              @ rewind r1 to start of scratch
    vldmia  r1, {d0-d3}                 @ note: d4,d5 is for gteOF|XY

    vmov    d20, d0                     @ gteMAC3 v=0
    vmin.s16 d24, d1, d3                @ | find min IR
    vshr.s32 d22, d12, #1               @ || gteH (adjust for cmp)
    vmax.s16 d25, d1, d3                @ | .. also max, for flag gen
    vsli.u64 d20, d2, #32               @ gteMAC3 v=1
    vmov    d21, d9                     @ ... v=2

    vmov.i32 q14, #0xffff               @ 0xffff[32]
    vmax.s32 q10, q15                   @ clamp gteMAC3 below at 0
    vmov.i32 q13, #1
    vdup.32 q11, d22[0]                 @ gteH/2
    vmin.u32 q10, q14                   @ saturate to 0..0xffff - fSZ(v)
    vmin.s16 d24, d10                   @ | find min/max IR
    vmax.s16 d25, d10                   @ |

    vclt.u32 q11, q11, q10              @ gteH/2 < fSZ(v)?
    add     r3, r0, #4*17
    vst1.32 d20, [r3]!                  @ | writeback fSZ(v) to gteSZ1-3
    vand    q11, q10, q11
    vst1.32 d21[0], [r3]                @ |
    vmax.u32 q10, q11, q13              @ make divisor 1 if not
    add     r3, r1, #8*8
    vstmia  r3, {q12}                   @ min/max IR for flags
    vcvt.f32.u32 q10, q10               @ divisors -> float
    vshl.u32 d13, d12, #16              @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add     r2, r1, #8*2*3
    mov     lr, #0                      @ gteFLAG
    ldmia   r2, {r4-r7}                 @ min/max gteMAC|12
    subs    r2, r4, #1                  @ V: min gteMAC1 == INT_MIN
    orrvs   lr, #(1<<31)|(1<<27)        @ MAC1 negative overflow
    subs    r3, r5, #1
    orrvs   lr, #(1<<31)|(1<<26)        @ MAC2 negative overflow
    adds    r2, r6, #1                  @ V: max gteMAC1 == INT_MAX
    orrvs   lr, #(1<<31)|(1<<30)        @ MAC1 positive; bit31 added (error sum covers bits 30-23)
    adds    r3, r7, #1
    orrvs   lr, #(1<<31)|(1<<29)        @ MAC2 positive; bit31 added likewise
    ldr     r4, [r1, #0]                @ gteMAC3 v=0
    ldr     r5, [r1, #8*2]              @ ... v=1
    ldr     r6, [r1, #8*4]              @ ... v=2

    add     r3, r0, #4*(32+24)
    vld1.32 d4, [r3]                    @ || gteOF|XY
    add     r3, r0, #4*(32+27)
    vld1.32 d6, [r3]                    @ || gteDQAB

    @ divide: gteH / fSZ(v) via Newton-Raphson reciprocal estimate
.if 1
    vrecpe.f32 q11, q10                 @ inv
    vmovl.s32 q2, d4                    @ || gteOFXY [64]
    vmovl.s32 q3, d6                    @ || gteDQAB [64]
    vrecps.f32 q12, q10, q11            @ step
    vcvt.f32.u32 d13, d13               @ | gteH (float for div)
    vmul.f32 q11, q12, q11              @ better inv
    vdup.32 q13, d13[0]                 @ |
@   vrecps.f32 q12, q10, q11            @ step
@   vmul.f32 q11, q12, q11              @ better inv
    vmul.f32 q10, q13, q11              @ result
.else
    @ reference path: exact (but slow) VFP divides, for testing against C
    vmovl.s32 q2, d4                    @ || gteOFXY [64]
    vmovl.s32 q3, d6                    @ || gteDQAB [64]
    vcvt.f32.u32 d13, d13               @ | gteH (float for div)
    vdup.32 q13, d13[0]                 @ |

    vpush   {q0}
    vmov    q0, q10                     @ to test against C code
    vdiv.f32 s0, s26, s0
    vdiv.f32 s1, s26, s1
    vdiv.f32 s2, s26, s2
    vmov    q10, q0
    vpop    {q0}
.endif

@ approximate gteMACx flags
@ Sets \nflags when any of \rr1-\rr3 equals INT_MIN (the s32 negative
@ saturation bound from vqshrn), \pflags when any equals INT_MAX.
@ The conditional sub/addvcs chain stops probing once V is set.
@ in: rr 123 as gteMAC 123, *flags
@ trash: r2,r3
.macro do_mac_flags rr1 rr2 rr3 nflags pflags
    subs    r2, \rr1, #1
    subvcs  r3, \rr2, #1
    subvcs  r2, \rr3, #1
    orrvs   lr, #\nflags
    adds    r3, \rr1, #1
    addvcs  r2, \rr2, #1
    addvcs  r3, \rr3, #1
    orrvs   lr, #\pflags
.endm

    @ FIX: MAC3 positive overflow is bit 28 (bit 27 is MAC1-negative);
    @ bit31 error-sum added to match the negative side.  Was (1<<27).
    do_mac_flags r4, r5, r6, (1<<31)|(1<<25), (1<<31)|(1<<28) @ MAC3
    orr     r7, r4, r5
    add     r4, r1, #8*8
    orr     r3, r7, r6                  @ r3 = MAC3 v=0|1|2
    ldmia   r4, {r7,r8,r10,r11}         @ min/max IR

    movs    r3, r3, lsr #16             @ any gteMAC3 outside s16?
    orrne   lr, #(1<<31)
    orrne   lr, #(1<<18)                @ fSZ (limD)

@ vadd.f32 q10, q @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10                @ quotients back to integer
    vmovl.s16 q9, d1                    @ expand gteIR|12 v=0
    vmovl.s16 q10, d3                   @ expand gteIR|12 v=1
    add     r6, r1, #8*10
    vstmia  r6, {q8}                    @ wb quotients for flags (pre-limE)
    vqshl.u32 q8, #15                   @ saturate quotients to 0x1ffff
    vmovl.s16 q11, d10                  @ expand gteIR|12 v=2
    vshr.u32 q8, #15                    @ quotients (limE)
    vdup.32 d24, d16[0]
    vdup.32 d25, d16[1]
    vdup.32 d26, d17[0]                 @ quotient (dup)

    @ approximate IR1-3 saturation flags from min/max IR (packed s16)
    mov     r4, r7, ror #16
    mov     r5, r10, ror #16
    subs    r2, r7, #1<<16              @ V: min IR2 == -0x8000
    addvcs  r3, r10, #1<<16             @ V: max IR2 == 0x7fff
    orrvs   lr, #(1<<31)
    orrvs   lr, #(1<<23)                @ IR2/limB2
    subs    r2, r4, #1<<16
    addvcs  r3, r5, #1<<16
    mov     r4, r8, lsl #16             @ (flags preserved across mov)
    mov     r5, r11, lsl #16
    orrvs   lr, #(1<<31)|(1<<24)        @ IR1/limB1
    subs    r2, r4, #1<<16
    addvcs  r3, r5, #1<<16
    orrvs   lr, #(1<<22)                @ IR3/limB3

    @ screen coordinates: fS|XY = gteOF|XY + gteIR|12 * quotient
    vmull.s32 q9, d18, d24              @ gteIR|12 * quotient v=0
    vmull.s32 q10, d20, d25             @ ... v=1
    vmull.s32 q11, d22, d26             @ ... v=2
    vadd.s64 q9, q2                     @ gteOF|XY + gteIR|12 * quotient
    vadd.s64 q10, q2                    @ ... v=1
    vadd.s64 q11, q2                    @ ... v=2
    vqmovn.s64 d18, q9                  @ saturate to 32 v=0
    vqmovn.s64 d19, q10                 @ ... v=1
    vqmovn.s64 d20, q11                 @ ... v=2
    vmin.s32 d14, d18, d19              @ || find min/max fS|XY(v) [32]
    vmax.s32 d15, d18, d19              @ || for flags
    vmin.s32 d14, d20
    vmax.s32 d15, d20
    vqshl.s32 q11, q9, #5               @ 11bit precision, v=0,1
    vqshl.s32 d24, d20, #5              @ ... v=2
    vmull.s32 q13, d6, d17              @ | gteDQA * quotient v=2
    vpmin.s32 d16, d14, d15             @ || also find min/max in pair
    vpmax.s32 d17, d14, d15             @ ||
    vshr.s32 q11, #16+5                 @ can't vqshrn because of insn
    vshr.s32 d24, #16+5                 @ encoding doesn't allow 21 :(
    vqshl.s32 q7, #5                    @ || min/max pairs shifted
    vsli.u64 d16, d17, #32              @ || pack in-pair min/max
    vadd.s64 d26, d7                    @ | gteDQB + gteDQA * quotient
    vmovn.s32 d12, q11                  @ fS|XY(v) [s16] v=0,1
    vmovn.s32 d13, q12                  @ 3
    vstmia  r1, {d14-d16}               @ || other cacheline than quotients
    add     r3, r0, #4*12
    vst1.32 d12, [r3]!                  @ writeback fS|XY v=0,1
    vst1.32 d13[0], [r3]

    vqshrn.s64 d26, q13, #12            @ | gteMAC0
    vmovl.u16 q5, d10                   @ expand gteIR|123 v=2

    vmov.i32 d13, #0x1000
    vmax.s32 d12, d26, d30              @ clamp gteIR0 below at 0

    add     r3, r0, #4*24
    vst1.32 d26[0], [r3]!               @ gteMAC0
    vst1.32 d8, [r3]!                   @ gteMAC123 (last iteration)
    vst1.32 d9[0], [r3]

    vmin.s32 d12, d13                   @ | gteIR0, clamped above at 0x1000

    @ ~6 cycles
    ldmia   r6, {r4-r6}                 @ quotients
    orr     r4, r5
    orr     r4, r6
    add     r3, r0, #4*12
    movs    r4, r4, lsr #17             @ any pre-limE quotient > 0x1ffff?
    orrne   lr, #(1<<31)                @ limE
    orrne   lr, #(1<<17)                @ limE

    add     r3, r0, #4*8
    vst1.32 d12[0], [r3]!               @ gteIR0
    vst1.32 d10, [r3]!                  @ gteIR12
    vst1.32 d11[0], [r3]                @ ..3

    @ ~19 cycles
    ldmia   r1, {r4-r9}                 @ min/max fS|XY and in-pair min/max
    subs    r2, r4, #1<<21              @ min fSX
    addvcs  r3, r6, #1<<21              @ max fSX
    orrvs   lr, #(1<<31)                @ limG1
    orrvs   lr, #(1<<14)
    subs    r2, r5, #1<<21              @ min fSY
    addvcs  r3, r7, #1<<21              @ max fSY
    orrvs   lr, #(1<<31)                @ limG2
    orrvs   lr, #(1<<13)
    adds    r2, r9, #1
    orrvs   lr, #(1<<31)                @ F
    orrvs   lr, #(1<<16)
    subs    r3, r8, #1
    orrvs   lr, #(1<<31)                @ F

    ldr     r4, [r0, #4*24]             @ gteMAC0
    orrvs   lr, #(1<<15)

    adds    r3, r4, #1                  @ MAC0 positive overflow (approx.)
    orrvs   lr, #(1<<16)
    orrvs   lr, #(1<<31)                @ F
    subs    r2, r4, #1                  @ MAC0 negative overflow (approx.)
    orrvs   lr, #(1<<15)
    orrvs   lr, #(1<<31)                @ F
    cmp     r4, #0x1000
    orrhi   lr, #(1<<12)                @ IR0 saturated (limH)

    str     lr, [r0, #4*(32+31)]        @ gteFLAG

    pop     {r4-r11,pc}

@ vim:filetype=armasm