59774ed0 |
1 | /* |
2 | * (C) Gražvydas "notaz" Ignotas, 2011 |
3 | * |
0c2ca3ba |
4 | * This work is licensed under the terms of GNU GPL version 2 or later. |
59774ed0 |
5 | * See the COPYING file in the top-level directory. |
6 | */ |
7 | |
8 | /* .equiv HAVE_ARMV7, 1 */ |
9 | |
10 | .text |
11 | .align 2 |
12 | |
0c2ca3ba |
@ sgnxt16: sign-extend the low 16 bits of \rd in place.
@ ARMv7 has a dedicated sxth instruction; older cores emulate it with the
@ classic shift-up / arithmetic-shift-down pair.
13 | .macro sgnxt16 rd |
59774ed0 |
14 | .if HAVE_ARMV7 |
0c2ca3ba |
15 | sxth \rd, \rd |
59774ed0 |
16 | .else |
0c2ca3ba |
17 | lsl \rd, \rd, #16 |
18 | asr \rd, \rd, #16 |
59774ed0 |
19 | .endif |
20 | .endm |
21 | |
0c2ca3ba |
22 | @ prepare work reg for ssatx |
23 | @ in: wr reg, bit to saturate to |
@ Loads \wr with 2^(bit-1), the positive bound used by the pre-ARMv7
@ fallback in ssatx below; expands to nothing when ssat is available.
24 | .macro ssatx_prep wr bit |
25 | .if !HAVE_ARMV7 |
26 | mov \wr, #(1<<(\bit-1)) |
27 | .endif |
28 | .endm |
29 | |
@ ssatx: signed-saturate \rd to \bit bits, i.e. clamp to
@ [-2^(bit-1), 2^(bit-1)-1].  \wr must hold 2^(bit-1) (set by ssatx_prep);
@ it is unused on ARMv7 where a single ssat does the whole job.
30 | .macro ssatx rd wr bit |
31 | .if HAVE_ARMV7 |
32 | ssat \rd, #\bit, \rd |
33 | .else |
@ positive clamp: if rd >= 2^(bit-1), rd = 2^(bit-1) - 1
34 | cmp \rd, \wr |
35 | subge \rd, \wr, #1 |
@ negative clamp: if rd < -2^(bit-1), rd = -2^(bit-1)
36 | cmn \rd, \wr |
37 | rsblt \rd, \wr, #0 |
38 | .endif |
39 | .endm |
40 | |
@ usat16_: unsigned-saturate \rs into \rd to 16 bits (clamp to 0..0xffff).
@ Trailing underscore keeps the name distinct from the usat16 opcode.
41 | .macro usat16_ rd rs |
42 | .if HAVE_ARMV7 |
43 | usat \rd, #16, \rs |
44 | .else |
@ copy \rs to \rd while setting flags; negative values clamp to 0
45 | subs \rd, \rs, #0 |
46 | movlt \rd, #0 |
@ values >= 0x10000 clamp to 0xffff, built as 0xff00|0x00ff because
@ 0xffff is not encodable as a single ARM immediate
47 | cmp \rd, #0x10000 |
48 | movge \rd, #0x0ff00 |
49 | orrge \rd, #0x000ff |
50 | .endif |
51 | .endm |
52 | |
53 | @ unsigned divide rd = rm / rs |
54 | @ no div by 0 check |
55 | @ in: rm, rs |
56 | @ trash: rm rs |
@ Shift-and-subtract restoring division: clz left-aligns the divisor, and a
@ marker bit derived from the shift count is seeded into \rd so that, as
@ quotient bits are shifted in via adcs, the loop terminates (carry set,
@ bcc falls through) after exactly the right number of iterations.
57 | .macro udiv rd rm rs |
58 | clz \rd, \rs |
59 | lsl \rs, \rs, \rd @ shift up divisor |
60 | orr \rd, \rd, #1<<31 |
61 | lsr \rd, \rd, \rd |
62 | 0: |
@ subtract divisor when it fits; carry-out of cmp is the next quotient bit
63 | cmp \rm, \rs |
64 | subcs \rm, \rs |
65 | adcs \rd, \rd, \rd |
66 | lsr \rs, #1 |
67 | bcc 0b |
68 | .endm |
69 | |
70 | |
71 | @ calculate RTPS/RTPT MAC values |
72 | @ in: r0 context, r8,r9 VXYZ |
73 | @ out: r10-r12 MAC123 |
74 | @ trash: r1-r7 |
@ Computes MACn = gteTRn + (rotation-matrix row n . vertex) >> 12.
@ Rows 1 and 2 give up one low bit of precision (asr #1 halves each term,
@ then asr #11 completes the >>12) to keep the sum inside 32 bits; row 3
@ is accumulated as a full 64-bit sum because gteMAC3 feeds the divider.
75 | .macro do_rtpx_mac |
76 | add r1, r0, #4*32 |
77 | add r2, r0, #4*(32+5) @ gteTRX |
78 | ldmia r1!,{r5-r7} @ gteR1*,gteR2* |
79 | ldmia r2, {r10-r12} @ r10-r12 = gteTRX,Y,Z |
80 | smulbb r2, r5, r8 @ gteR11 * gteVX0 |
81 | smultt r3, r5, r8 @ gteR12 * gteVY0 |
82 | smulbb r4, r6, r9 @ gteR13 * gteVZ0 |
83 | qadd r2, r2, r3 @ saturating add of first two products |
84 | asr r4, r4, #1 @ prevent oflow, lose a bit |
85 | add r3, r4, r2, asr #1 |
86 | add r10,r10,r3, asr #11 @ gteMAC1 |
87 | smultb r2, r6, r8 @ gteR21 * gteVX0 |
88 | smulbt r3, r7, r8 @ gteR22 * gteVY0 |
89 | smultb r4, r7, r9 @ gteR23 * gteVZ0 |
90 | ldmia r1!,{r5-r6} @ gteR3* |
91 | qadd r2, r2, r3 |
92 | asr r4, r4, #1 |
93 | add r3, r4, r2, asr #1 |
94 | add r11,r11,r3, asr #11 @ gteMAC2 |
95 | @ be more accurate for gteMAC3, since it's also a divider |
96 | smulbb r2, r5, r8 @ gteR31 * gteVX0 |
97 | smultt r3, r5, r8 @ gteR32 * gteVY0 |
98 | smulbb r4, r6, r9 @ gteR33 * gteVZ0 |
99 | qadd r2, r2, r3 |
100 | asr r3, r4, #31 @ expand to 64bit |
101 | adds r1, r2, r4 |
102 | adc r3, r2, asr #31 @ 64bit sum in r3,r1 |
@ r12 += (r3:r1) >> 12, assembled from the two 32-bit halves
103 | add r12,r12,r3, lsl #20 |
104 | add r12,r12,r1, lsr #12 @ gteMAC3 |
105 | .endm |
106 | |
107 | |
@ gteRTPS_nf_arm: GTE RTPS (perspective transform of a single vertex).
@ "nf" presumably means the overflow flag register is not maintained here
@ (gteFLAG is never written in this routine) — TODO confirm against caller.
@ in:    r0 = CP2 register file (data regs first, control regs at +4*32)
@ steps: MAC123 -> IR123 (signed 16-bit limit), SZ fifo push (limD),
@        q = (gteH << 16) / gteSZ3 clamped to 0x1ffff, screen XY via
@        OFXY + IR * q (11-bit limit), depth cue MAC0/IR0 (limH).
108 | .global gteRTPS_nf_arm @ r0=CP2 (d,c), |
109 | gteRTPS_nf_arm: |
110 | push {r4-r11,lr} |
111 | |
112 | ldmia r0, {r8,r9} @ VXYZ(0) |
113 | do_rtpx_mac |
114 | add r1, r0, #4*25 @ gteMAC1 |
115 | add r2, r0, #4*17 @ gteSZ1 |
116 | stmia r1, {r10-r12} @ gteMAC123 save |
117 | ldmia r2, {r3-r5} |
118 | add r1, r0, #4*16 @ gteSZ0 |
119 | add r2, r0, #4*9 @ gteIR1 |
120 | ssatx_prep r6, 16 |
121 | usat16_ lr, r12 @ limD |
122 | ssatx r10,r6, 16 |
123 | ssatx r11,r6, 16 |
124 | ssatx r12,r6, 16 |
@ advance the SZ fifo: old SZ1-SZ3 shift down, new SZ3 = limD result in lr
125 | stmia r1, {r3-r5,lr} @ gteSZ* |
126 | ldr r3, [r0,#4*(32+26)] @ gteH |
127 | stmia r2, {r10,r11,r12} @ gteIR123 save |
128 | cmp r3, lr, lsl #1 @ gteH < gteSZ3*2 ? |
@ r9 = quotient q; the 1<<30 default (taken when gteH >= gteSZ3*2, i.e.
@ division would overflow) is clamped to 0x1ffff below
129 | mov r9, #1<<30 |
130 | bhs 1f |
131 | .if 1 |
132 | lsl r3, #16 |
133 | udiv r9, r3, lr |
134 | .else |
@ alternative: out-of-line divide routine (disabled)
135 | push {r0, r12} |
136 | mov r0, r3 |
137 | mov r1, lr |
138 | bl DIVIDE |
139 | mov r9, r0 |
140 | pop {r0, r12} |
141 | .endif |
142 | 1: |
143 | ldrd r6, [r0,#4*(32+24)] @ gteOFXY |
144 | cmp r9, #0x20000 |
145 | add r1, r0, #4*12 @ gteSXY0 |
146 | movhs r9, #0x20000 |
147 | ldmia r1, {r2-r4} |
148 | /* quotient */ subhs r9, #1 |
@ 64-bit r2:r6 = gteOFX + gteIR1 * q (smlal accumulates into the pair)
149 | mov r2, #0 |
150 | smlal r6, r2, r10, r9 |
151 | stmia r1!,{r3,r4} @ shift gteSXY |
152 | mov r3, #0 |
153 | smlal r7, r3, r11, r9 |
154 | lsr r6, #16 |
155 | /* gteDQA, gteDQB */ ldrd r10,[r0, #4*(32+27)] |
156 | orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16 |
157 | ssatx_prep r2, 11 |
158 | lsr r7, #16 |
159 | /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11 |
160 | orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16 |
161 | ssatx r6, r2, 11 @ gteSX2 |
162 | ssatx r7, r2, 11 @ gteSY2 |
163 | strh r6, [r1] |
164 | strh r7, [r1, #2] |
165 | str r4, [r0,#4*24] @ gteMAC0 |
@ IR0 = clamp(MAC0 >> 12, 0, 0x1000)
166 | asrs r4, #12 |
167 | movmi r4, #0 |
168 | cmp r4, #0x1000 @ limH |
169 | movgt r4, #0x1000 |
170 | str r4, [r0,#4*8] @ gteIR0 |
171 | |
172 | pop {r4-r11,pc} |
173 | .size gteRTPS_nf_arm, .-gteRTPS_nf_arm |
174 | |
175 | |
@ gteRTPT_nf_arm: GTE RTPT — same perspective transform as RTPS but applied
@ to three vertices in a loop.  lr is the per-vertex byte offset (0, 4, 8;
@ loop ends at 12), doubled for the 8-byte VXYZ stride and used directly to
@ index the fSZ and SXY arrays.  Only the last vertex's quotient feeds the
@ depth-cue MAC0/IR0 calculation after the loop.  gteFLAG is not updated.
@ in: r0 = CP2 register file (data regs first, control regs at +4*32)
176 | .global gteRTPT_nf_arm @ r0=CP2 (d,c), |
177 | gteRTPT_nf_arm: |
@ pre-shift the SZ fifo: SZ0 = old SZ3 (the loop writes SZ1..SZ3)
178 | ldr r1, [r0, #4*19] @ gteSZ3 |
179 | push {r4-r11,lr} |
180 | str r1, [r0, #4*16] @ gteSZ0 |
181 | mov lr, #0 |
182 | |
183 | rtpt_arm_loop: |
184 | add r1, r0, lr, lsl #1 |
185 | ldrd r8, [r1] @ VXYZ(v) |
186 | do_rtpx_mac |
187 | |
188 | ssatx_prep r6, 16 |
189 | usat16_ r2, r12 @ limD |
190 | add r1, r0, #4*25 @ gteMAC1 |
191 | ldr r3, [r0,#4*(32+26)] @ gteH |
192 | stmia r1, {r10-r12} @ gteMAC123 save |
193 | add r1, r0, #4*17 |
194 | ssatx r10,r6, 16 |
195 | ssatx r11,r6, 16 |
196 | ssatx r12,r6, 16 |
197 | str r2, [r1, lr] @ fSZ(v) |
198 | cmp r3, r2, lsl #1 @ gteH < gteSZ3*2 ? |
@ r9 = quotient q; 1<<30 default (division-overflow case) clamps to 0x1ffff
199 | mov r9, #1<<30 |
200 | bhs 1f |
201 | .if 1 |
202 | lsl r3, #16 |
203 | udiv r9, r3, r2 |
204 | .else |
@ alternative: out-of-line divide routine (disabled)
205 | push {r0, r12, lr} |
206 | mov r0, r3 |
207 | mov r1, r2 |
208 | bl DIVIDE |
209 | mov r9, r0 |
210 | pop {r0, r12, lr} |
211 | .endif |
212 | 1: |
213 | cmp r9, #0x20000 |
214 | add r1, r0, #4*12 |
215 | movhs r9, #0x20000 |
216 | ldrd r6, [r0,#4*(32+24)] @ gteOFXY |
217 | /* quotient */ subhs r9, #1 |
@ 64-bit accumulate: r2:r6 = gteOFX + gteIR1*q, r3:r7 = gteOFY + gteIR2*q
218 | mov r2, #0 |
219 | smlal r6, r2, r10, r9 |
220 | mov r3, #0 |
221 | smlal r7, r3, r11, r9 |
222 | lsr r6, #16 |
223 | orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16 |
224 | ssatx_prep r2, 11 |
225 | lsr r7, #16 |
226 | orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16 |
227 | ssatx r6, r2, 11 @ gteSX(v) |
228 | ssatx r7, r2, 11 @ gteSY(v) |
229 | strh r6, [r1, lr]! |
230 | add lr, #4 |
231 | strh r7, [r1, #2] |
232 | cmp lr, #12 |
233 | blt rtpt_arm_loop |
234 | |
@ depth cue uses the quotient from the final (third) vertex
235 | ldrd r4, [r0, #4*(32+27)] @ gteDQA, gteDQB |
236 | add r1, r0, #4*9 @ gteIR1 |
237 | mla r3, r4, r9, r5 @ gteDQB + gteDQA * q |
238 | stmia r1, {r10,r11,r12} @ gteIR123 save |
239 | |
240 | str r3, [r0,#4*24] @ gteMAC0 |
@ IR0 = clamp(MAC0 >> 12, 0, 0x1000)
241 | asrs r3, #12 |
242 | movmi r3, #0 |
243 | cmp r3, #0x1000 @ limH |
244 | movgt r3, #0x1000 |
245 | str r3, [r0,#4*8] @ gteIR0 |
246 | |
247 | pop {r4-r11,pc} |
248 | .size gteRTPT_nf_arm, .-gteRTPT_nf_arm |
249 | |
59774ed0 |
250 | |
@ gteNCLIP_arm: GTE NCLIP — winding/outer product of the three screen
@ coordinates packed at gteSXY0..2 (SX in the low, SY in the high halfword):
@   MAC0 = SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1)
@ computed as a 64-bit sum, saturated to 32 bits, stored to gteMAC0, with
@ the matching overflow bits set in gteFLAG.
@ in: r0 = CP2 register file (data regs first, control regs at +4*32)
251 | .global gteNCLIP_arm @ r0=CP2 (d,c), |
252 | gteNCLIP_arm: |
253 | push {r4-r6,lr} |
254 | |
255 | add r1, r0, #4*12 |
256 | ldmia r1, {r1-r3} @ r1-r3 = packed SXY0..SXY2 |
@ r4-r6 = SY0..SY2 (high halfwords, sign-extended by asr)
257 | mov r4, r1, asr #16 |
258 | mov r5, r2, asr #16 |
259 | mov r6, r3, asr #16 |
260 | sub r12, r4, r5 @ 3: gteSY0 - gteSY1 |
261 | sub r5, r5, r6 @ 1: gteSY1 - gteSY2 |
262 | sgnxt16 r1 |
263 | smull r1, r5, r1, r5 @ RdLo, RdHi |
264 | sub r6, r4 @ 2: gteSY2 - gteSY0 |
265 | sgnxt16 r2 |
266 | smlal r1, r5, r2, r6 |
267 | mov lr, #0 @ gteFLAG |
268 | sgnxt16 r3 |
269 | smlal r1, r5, r3, r12 |
@ r6 = extra flag bits OR'd in on negative overflow (see orrmi below)
270 | mov r6, #1<<31 |
271 | orr r6, #1<<15 |
@ fold the top 33 bits of the 64-bit result into r5: the value fits in
@ 32 bits iff r5 ends up 0 (positive) or -1 (negative)
272 | movs r2, r1, lsl #1 |
273 | adc r5, r5 |
274 | cmp r5, #0 |
275 | .if HAVE_ARMV7 |
@ positive overflow: set flag bits 31|16 in one movt
276 | movtgt lr, #((1<<31)|(1<<16))>>16 |
277 | .else |
278 | movgt lr, #(1<<31) |
279 | orrgt lr, #(1<<16) |
280 | .endif |
281 | mvngt r1, #1<<31 @ maxint |
282 | cmn r5, #1 |
283 | movmi r1, #1<<31 @ minint |
284 | orrmi lr, r6 |
285 | str r1, [r0, #4*24] @ gteMAC0 |
286 | str lr, [r0, #4*(32+31)] @ gteFLAG |
287 | |
288 | pop {r4-r6,pc} |
289 | .size gteNCLIP_arm, .-gteNCLIP_arm |
290 | |
291 | |
292 | @ vim:filetype=armasm |
293 | |