/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */

/* .equiv HAVE_ARMV7, 1 */

.text
.align 2

.macro sgnxt16 rd
.if HAVE_ARMV7
    sxth    \rd, \rd
.else
    lsl     \rd, \rd, #16
    asr     \rd, \rd, #16
.endif
.endm
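
@ sgnxt16 is the usual sign-extend-halfword idiom; in C terms:
@   rd = (int32_t)(int16_t)rd;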

@ prepare work reg for ssatx
@ in: wr reg, bit to saturate to
.macro ssatx_prep wr bit
.if !HAVE_ARMV7
    mov     \wr, #(1<<(\bit-1))
.endif
.endm

.macro ssatx rd wr bit
.if HAVE_ARMV7
    ssat    \rd, #\bit, \rd
.else
    cmp     \rd, \wr
    subge   \rd, \wr, #1
    cmn     \rd, \wr
    rsblt   \rd, \wr, #0
.endif
.endm
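
@ ssatx clamps rd to the signed \bit-bit range; with wr = 1<<(bit-1)
@ from ssatx_prep, the pre-v7 path is, in C terms:
@   if (rd >= wr)  rd = wr - 1;   /* clamp to  (1<<(bit-1))-1 */
@   if (rd < -wr)  rd = -wr;      /* clamp to -(1<<(bit-1))   */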

.macro usat16_ rd rs
.if HAVE_ARMV7
    usat    \rd, #16, \rs
.else
    subs    \rd, \rs, #0
    movlt   \rd, #0
    cmp     \rd, #0x10000
    movge   \rd, #0x0ff00
    orrge   \rd, #0x000ff
.endif
.endm
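
@ usat16_ clamps a signed value to [0, 0xffff]; in C terms:
@   rd = rs < 0 ? 0 : (rs >= 0x10000 ? 0xffff : rs);
@ (movge/orrge build 0xffff in two steps because it is not encodable
@ as a single ARM immediate)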

@ unsigned divide rd = rm / rs
@ no div by 0 check
@ in: rm, rs
@ trash: rm rs
.macro udiv rd rm rs
    clz     \rd, \rs
    lsl     \rs, \rs, \rd       @ shift up divisor
    orr     \rd, \rd, #1<<31
    lsr     \rd, \rd, \rd
0:
    cmp     \rm, \rs
    subcs   \rm, \rs
    adcs    \rd, \rd, \rd
    lsr     \rs, #1
    bcc     0b
.endm
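
@ classic normalized shift-and-subtract division: clz/lsl scale the
@ divisor up so its top bit is set, and the seed bit 1<<(31-clz) planted
@ in rd falls out of bit 31 on the final adcs, setting carry and ending
@ the loop (bcc).  Roughly, in C:
@   n = clz(rs); rs <<= n; rd = 0;
@   for (i = 0; i <= n; i++, rs >>= 1)
@       { rd <<= 1; if (rm >= rs) { rm -= rs; rd |= 1; } }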


@ calculate RTPS/RTPT MAC values
@ in: r0 context, r8,r9 VXYZ
@ out: r10-r12 MAC123
@ trash: r1-r7
.macro do_rtpx_mac
    add     r1, r0, #4*32
    add     r2, r0, #4*(32+5)   @ gteTRX
    ldmia   r1!,{r5-r7}         @ gteR1*,gteR2*
    ldmia   r2, {r10-r12}
    smulbb  r2, r5, r8          @ gteR11 * gteVX0
    smultt  r3, r5, r8          @ gteR12 * gteVY0
    smulbb  r4, r6, r9          @ gteR13 * gteVZ0
    qadd    r2, r2, r3
    asr     r4, r4, #1          @ prevent oflow, lose a bit
    add     r3, r4, r2, asr #1
    add     r10,r10,r3, asr #11 @ gteMAC1
    smultb  r2, r6, r8          @ gteR21 * gteVX0
    smulbt  r3, r7, r8          @ gteR22 * gteVY0
    smultb  r4, r7, r9          @ gteR23 * gteVZ0
    ldmia   r1!,{r5-r6}         @ gteR3*
    qadd    r2, r2, r3
    asr     r4, r4, #1
    add     r3, r4, r2, asr #1
    add     r11,r11,r3, asr #11 @ gteMAC2
    @ be more accurate for gteMAC3, since it's also a divider
    smulbb  r2, r5, r8          @ gteR31 * gteVX0
    smultt  r3, r5, r8          @ gteR32 * gteVY0
    smulbb  r4, r6, r9          @ gteR33 * gteVZ0
    qadd    r2, r2, r3
    asr     r3, r4, #31         @ expand to 64bit
    adds    r1, r2, r4
    adc     r3, r2, asr #31     @ 64bit sum in r3,r1
    add     r12,r12,r3, lsl #20
    add     r12,r12,r1, lsr #12 @ gteMAC3
.endm
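
@ each row above computes, in GTE terms:
@   MACi = TRi + (Ri1*VX + Ri2*VY + Ri3*VZ) >> 12
@ MAC1/MAC2 halve the partial products first (losing the lowest bit) to
@ keep the sum in 32 bits; MAC3 keeps a full 64-bit intermediate since
@ its saturated value becomes SZ3, the divider input.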


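@ gteRTPS: rotate, translate and perspective-transform a single vertex.
@ In outline (after the do_rtpx_mac step above):
@   IR1..3 = sat16(MAC1..3);  SZ3 = usat16(MAC3)            (limD)
@   q   = min((gteH << 16) / SZ3, 0x1ffff)
@   SX2 = sat11((gteOFX + IR1 * q) >> 16)
@   SY2 = sat11((gteOFY + IR2 * q) >> 16)
@   IR0 = clamp((gteDQB + gteDQA * q) >> 12, 0, 0x1000)     (limH)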
.global gteRTPS_nf_arm @ r0=CP2 (d,c)
gteRTPS_nf_arm:
    push    {r4-r11,lr}

    ldmia   r0, {r8,r9}         @ VXYZ(0)
    do_rtpx_mac
    add     r1, r0, #4*25       @ gteMAC1
    add     r2, r0, #4*17       @ gteSZ1
    stmia   r1, {r10-r12}       @ gteMAC123 save
    ldmia   r2, {r3-r5}
    add     r1, r0, #4*16       @ gteSZ0
    add     r2, r0, #4*9        @ gteIR1
    ssatx_prep r6, 16
    usat16_ lr, r12             @ limD
    ssatx   r10,r6, 16
    ssatx   r11,r6, 16
    ssatx   r12,r6, 16
    stmia   r1, {r3-r5,lr}      @ gteSZ*
    ldr     r3, [r0,#4*(32+26)] @ gteH
    stmia   r2, {r10,r11,r12}   @ gteIR123 save
    cmp     r3, lr, lsl #1      @ gteH < gteSZ3*2 ?
    mov     r9, #1<<30
    bhs     1f
.if 1
    lsl     r3, #16
    udiv    r9, r3, lr
.else
    push    {r0, r12}
    mov     r0, r3
    mov     r1, lr
    bl      DIVIDE
    mov     r9, r0
    pop     {r0, r12}
.endif
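    @ r9 now holds (gteH << 16) / gteSZ3, or the 1<<30 marker when
    @ gteH >= gteSZ3*2; either way it is clamped to 0x1ffff below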
1:
    ldrd    r6, [r0,#4*(32+24)] @ gteOFXY
    cmp     r9, #0x20000
    add     r1, r0, #4*12       @ gteSXY0
    movhs   r9, #0x20000
    ldmia   r1, {r2-r4}
    subhs   r9, #1              @ quotient
    mov     r2, #0
    smlal   r6, r2, r10, r9
    stmia   r1!,{r3,r4}         @ shift gteSXY
    mov     r3, #0
    smlal   r7, r3, r11, r9
    lsr     r6, #16
    ldrd    r10,[r0, #4*(32+27)] @ gteDQA, gteDQB
    orr     r6, r2, lsl #16     @ (gteOFX + gteIR1 * q) >> 16
    ssatx_prep r2, 11
    lsr     r7, #16
    mla     r4, r10, r9, r11    @ gteDQB + gteDQA * q
    orr     r7, r3, lsl #16     @ (gteOFY + gteIR2 * q) >> 16
    ssatx   r6, r2, 11          @ gteSX2
    ssatx   r7, r2, 11          @ gteSY2
    strh    r6, [r1]
    strh    r7, [r1, #2]
    str     r4, [r0,#4*24]      @ gteMAC0
    asrs    r4, #12
    movmi   r4, #0
    cmp     r4, #0x1000         @ limH
    movgt   r4, #0x1000
    str     r4, [r0,#4*8]       @ gteIR0

    pop     {r4-r11,pc}
    .size   gteRTPS_nf_arm, .-gteRTPS_nf_arm


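@ gteRTPT: as gteRTPS, but for three vertices; gteSZ3 is shifted into
@ gteSZ0 up front, each iteration writes SZ/SXY for one vertex, and the
@ IR123/depth-cue results of the last vertex are the ones kept.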
.global gteRTPT_nf_arm @ r0=CP2 (d,c)
gteRTPT_nf_arm:
    ldr     r1, [r0, #4*19]     @ gteSZ3
    push    {r4-r11,lr}
    str     r1, [r0, #4*16]     @ gteSZ0
    mov     lr, #0

rtpt_arm_loop:
    add     r1, r0, lr, lsl #1
    ldrd    r8, [r1]            @ VXYZ(v)
    do_rtpx_mac

    ssatx_prep r6, 16
    usat16_ r2, r12             @ limD
    add     r1, r0, #4*25       @ gteMAC1
    ldr     r3, [r0,#4*(32+26)] @ gteH
    stmia   r1, {r10-r12}       @ gteMAC123 save
    add     r1, r0, #4*17
    ssatx   r10,r6, 16
    ssatx   r11,r6, 16
    ssatx   r12,r6, 16
    str     r2, [r1, lr]        @ fSZ(v)
    cmp     r3, r2, lsl #1      @ gteH < gteSZ3*2 ?
    mov     r9, #1<<30
    bhs     1f
.if 1
    lsl     r3, #16
    udiv    r9, r3, r2
.else
    push    {r0, r12, lr}
    mov     r0, r3
    mov     r1, r2
    bl      DIVIDE
    mov     r9, r0
    pop     {r0, r12, lr}
.endif
1:
    cmp     r9, #0x20000
    add     r1, r0, #4*12
    movhs   r9, #0x20000
    ldrd    r6, [r0,#4*(32+24)] @ gteOFXY
    subhs   r9, #1              @ quotient
    mov     r2, #0
    smlal   r6, r2, r10, r9
    mov     r3, #0
    smlal   r7, r3, r11, r9
    lsr     r6, #16
    orr     r6, r2, lsl #16     @ (gteOFX + gteIR1 * q) >> 16
    ssatx_prep r2, 11
    lsr     r7, #16
    orr     r7, r3, lsl #16     @ (gteOFY + gteIR2 * q) >> 16
    ssatx   r6, r2, 11          @ gteSX(v)
    ssatx   r7, r2, 11          @ gteSY(v)
    strh    r6, [r1, lr]!
    add     lr, #4
    strh    r7, [r1, #2]
    cmp     lr, #12
    blt     rtpt_arm_loop

    ldrd    r4, [r0, #4*(32+27)] @ gteDQA, gteDQB
    add     r1, r0, #4*9        @ gteIR1
    mla     r3, r4, r9, r5      @ gteDQB + gteDQA * q
    stmia   r1, {r10,r11,r12}   @ gteIR123 save

    str     r3, [r0,#4*24]      @ gteMAC0
    asrs    r3, #12
    movmi   r3, #0
    cmp     r3, #0x1000         @ limH
    movgt   r3, #0x1000
    str     r3, [r0,#4*8]       @ gteIR0

    pop     {r4-r11,pc}
    .size   gteRTPT_nf_arm, .-gteRTPT_nf_arm


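@ gteNCLIP: normal clipping - computes the (doubled) signed area of the
@ screen triangle, whose sign gives the winding:
@   MAC0 = SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1)
@ evaluated in 64 bits, then saturated to 32; gteFLAG gets bits 31|16
@ on positive overflow, bits 31|15 on negative overflow.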
.global gteNCLIP_arm @ r0=CP2 (d,c)
gteNCLIP_arm:
    push    {r4-r6,lr}

    add     r1, r0, #4*12
    ldmia   r1, {r1-r3}
    mov     r4, r1, asr #16
    mov     r5, r2, asr #16
    mov     r6, r3, asr #16
    sub     r12, r4, r5         @ 3: gteSY0 - gteSY1
    sub     r5, r5, r6          @ 1: gteSY1 - gteSY2
    sgnxt16 r1
    smull   r1, r5, r1, r5      @ RdLo, RdHi
    sub     r6, r4              @ 2: gteSY2 - gteSY0
    sgnxt16 r2
    smlal   r1, r5, r2, r6
    mov     lr, #0              @ gteFLAG
    sgnxt16 r3
    smlal   r1, r5, r3, r12
    mov     r6, #1<<31
    orr     r6, #1<<15
    movs    r2, r1, lsl #1
    adc     r5, r5
    cmp     r5, #0
.if HAVE_ARMV7
    movtgt  lr, #((1<<31)|(1<<16))>>16
.else
    movgt   lr, #(1<<31)
    orrgt   lr, #(1<<16)
.endif
    mvngt   r1, #1<<31          @ maxint
    cmn     r5, #1
    movmi   r1, #1<<31          @ minint
    orrmi   lr, r6
    str     r1, [r0, #4*24]
    str     lr, [r0, #4*(32+31)] @ gteFLAG

    pop     {r4-r6,pc}
    .size   gteNCLIP_arm, .-gteNCLIP_arm


@ vim:filetype=armasm