/*
 * (C) Gražvydas "notaz" Ignotas, 2011
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "arm_features.h"

.text
.align 2

.macro sgnxt16 rd rs
#ifdef HAVE_ARMV7
    sxth     \rd, \rs
#else
    lsl      \rd, \rs, #16
    asr      \rd, \rd, #16
#endif
.endm
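@ i.e. rd = (s32)(s16)rs (sign-extend the low halfword)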

@ prepare work reg for ssatx
@ in: wr reg, bit to saturate to
.macro ssatx_prep wr bit
#ifndef HAVE_ARMV7
    mov      \wr, #(1<<(\bit-1))
#endif
.endm

.macro ssatx rd wr bit
#ifdef HAVE_ARMV7
    ssat     \rd, #\bit, \rd
#else
    cmp      \rd, \wr
    subge    \rd, \wr, #1
    cmn      \rd, \wr
    rsblt    \rd, \wr, #0
#endif
.endm
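@ the pre-ARMV7 path above is, in C terms (with wr == 1<<(bit-1)):
@   if (rd >= wr)  rd = wr - 1;   // clamp to  2^(bit-1)-1
@   if (rd < -wr)  rd = -wr;      // clamp to -2^(bit-1)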

@ prepare work reg for ssatx0 (sat to 0..2^(bit-1)-1)
@ in: wr reg, bit to saturate to
.macro ssatx0_prep wr bit
    mov      \wr, #(1<<(\bit-1))
.endm

.macro ssatx0 rd wr bit
    cmp      \rd, \wr
    subge    \rd, \wr, #1
    cmn      \rd, #0
    movlt    \rd, #0
.endm
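@ i.e. if (rd >= wr) rd = wr - 1; if (rd < 0) rd = 0;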

.macro usat16_ rd rs
#ifdef HAVE_ARMV7
    usat     \rd, #16, \rs
#else
    subs     \rd, \rs, #0
    movlt    \rd, #0
    cmp      \rd, #0x10000
    movge    \rd, #0x0ff00
    orrge    \rd, #0x000ff
#endif
.endm
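@ note: 0xffff is not an encodable ARM immediate, hence the movge+orrge pair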

#ifdef HAVE_ARMV5

.macro udiv_ rd rm rs
    lsl      \rm, #16
    clz      \rd, \rs
    lsl      \rs, \rs, \rd        @ shift up divisor
    orr      \rd, \rd, #1<<31
    lsr      \rd, \rd, \rd
0:
    cmp      \rm, \rs
    subcs    \rm, \rs
    adcs     \rd, \rd, \rd
    lsr      \rs, #1
    bcc      0b
.endm
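@ classic shift-and-subtract division: the divisor is left-justified, then
@ adcs produces one quotient bit per iteration; the lone bit planted in rd
@ before the loop shifts out as carry after the right number of iterations,
@ which is what terminates the loop (bcc)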

.macro newton_step rcp den zero t1 t2
    umull    \t2, \t1, \den, \rcp @ \t2 is dummy
    sub      \t1, \zero, \t1, lsl #2
    smlal    \t2, \rcp, \t1, \rcp
.endm
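@ one Newton-Raphson step for the reciprocal, rcp' = rcp * (2 - den * rcp):
@ den is the divisor normalized to [0.5,1) as a 0.32 fixed-point value and
@ rcp the estimate in 2.30; t1 ends up holding (1 - den*rcp) in signed 0.32,
@ so the smlal adds rcp*(1 - den*rcp) to the estimate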

.macro udiv_newton rd rm rs t1 t2 t3 t4
    lsl      \rd, \rm, #16
    clz      \t1, \rs
    mov      \t2, #0
    lsl      \rs, \t1             @ normalize for the algo
    mov      \rm, #0x4d000000     @ initial estimate ~1.2

    newton_step \rm, \rs, \t2, \t3, \t4
    newton_step \rm, \rs, \t2, \t3, \t4
    newton_step \rm, \rs, \t2, \t3, \t4
    newton_step \rm, \rs, \t2, \t3, \t4

    umull    \t4, \rd, \rm, \rd
    rsb      \t2, \t1, #30        @ here t1 is 1..15
    mov      \rd, \rd, lsr \t2
.endm

@ unsigned divide rd = rm / rs; 16.16 result
@ no div by 0 check
@ in: rm, rs
@ trash: rm rs t*
.macro udiv rd rm rs t1 t2 t3 t4
    @udiv_   \rd, \rm, \rs
    udiv_newton \rd, \rm, \rs, \t1, \t2, \t3, \t4
.endm

@ calculate RTPS/RTPT MAC values
@ in: r0 context, r8,r9 VXYZ
@ out: r10-r12 MAC123
@ trash: r1-r7
.macro do_rtpx_mac
    add      r1, r0, #4*32
    add      r2, r0, #4*(32+5)    @ gteTRX
    ldmia    r1!,{r5-r7}          @ gteR1*,gteR2*
    ldmia    r2, {r10-r12}
    smulbb   r2, r5, r8           @ gteR11 * gteVX0
    smultt   r3, r5, r8           @ gteR12 * gteVY0
    smulbb   r4, r6, r9           @ gteR13 * gteVZ0
    qadd     r2, r2, r3
    asr      r4, r4, #1           @ prevent oflow, lose a bit
    add      r3, r4, r2, asr #1
    add      r10,r10,r3, asr #11  @ gteMAC1
    smultb   r2, r6, r8           @ gteR21 * gteVX0
    smulbt   r3, r7, r8           @ gteR22 * gteVY0
    smultb   r4, r7, r9           @ gteR23 * gteVZ0
    ldmia    r1!,{r5-r6}          @ gteR3*
    qadd     r2, r2, r3
    asr      r4, r4, #1
    add      r3, r4, r2, asr #1
    add      r11,r11,r3, asr #11  @ gteMAC2
    @ be more accurate for gteMAC3, since it's also a divider
    smulbb   r2, r5, r8           @ gteR31 * gteVX0
    smultt   r3, r5, r8           @ gteR32 * gteVY0
    smulbb   r4, r6, r9           @ gteR33 * gteVZ0
    qadd     r2, r2, r3
    asr      r3, r4, #31          @ expand to 64bit
    adds     r1, r2, r4
    adc      r3, r2, asr #31      @ 64bit sum in r3,r1
    add      r12,r12,r3, lsl #20
    add      r12,r12,r1, lsr #12  @ gteMAC3
.endm
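@ the macro above computes MACi = TRi + (Ri1*VX + Ri2*VY + Ri3*VZ) >> 12:
@ for MAC1/MAC2 the three products are summed at half scale (qadd + asr #1)
@ so the sum fits in 32 bits, trading away the lowest bit; MAC3 instead
@ keeps a full 64-bit sum since it later feeds the divider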


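@ the "_nf" (no flags) variants in this file skip gteFLAG computation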
.global gteRTPS_nf_arm @ r0=CP2 (d,c),
gteRTPS_nf_arm:
    push     {r4-r11,lr}

    ldmia    r0, {r8,r9}          @ VXYZ(0)
    do_rtpx_mac
    add      r1, r0, #4*25        @ gteMAC1
    add      r2, r0, #4*17        @ gteSZ1
    stmia    r1, {r10-r12}        @ gteMAC123 save
    ldmia    r2, {r3-r5}
    add      r1, r0, #4*16        @ gteSZ0
    add      r2, r0, #4*9         @ gteIR1
    ssatx_prep r6, 16
    usat16_  lr, r12              @ limD
    ssatx    r10,r6, 16
    ssatx    r11,r6, 16
    ssatx    r12,r6, 16
    stmia    r1, {r3-r5,lr}       @ gteSZ*
    ldr      r3, [r0,#4*(32+26)]  @ gteH
    stmia    r2, {r10,r11,r12}    @ gteIR123 save
    cmp      r3, lr, lsl #1       @ gteH < gteSZ3*2 ?
    mov      r9, #1<<30
    bhs      1f
.if 1
    udiv     r9, r3, lr, r1, r2, r6, r7
.else
    push     {r0, r12}
    mov      r0, r3
    mov      r1, lr
    bl       DIVIDE
    mov      r9, r0
    pop      {r0, r12}
.endif
1:
    ldrd     r6, [r0,#4*(32+24)]  @ gteOFXY
    cmp      r9, #0x20000
    add      r1, r0, #4*12        @ gteSXY0
    movhs    r9, #0x20000
    ldmia    r1, {r2-r4}
/* quotient */ subhs r9, #1
    mov      r2, r6, asr #31
    smlal    r6, r2, r10, r9
    stmia    r1!,{r3,r4}          @ shift gteSXY
    mov      r3, r7, asr #31
    smlal    r7, r3, r11, r9
    lsr      r6, #16
/* gteDQA, gteDQB */ ldrd r10,[r0, #4*(32+27)]
    orr      r6, r2, lsl #16      @ (gteOFX + gteIR1 * q) >> 16
    ssatx_prep r2, 11
    lsr      r7, #16
/* gteDQB + gteDQA * q */ mla r4, r10, r9, r11
    orr      r7, r3, lsl #16      @ (gteOFY + gteIR2 * q) >> 16
    ssatx    r6, r2, 11           @ gteSX2
    ssatx    r7, r2, 11           @ gteSY2
    strh     r6, [r1]
    strh     r7, [r1, #2]
    str      r4, [r0,#4*24]       @ gteMAC0
    asrs     r4, #12
    movmi    r4, #0
    cmp      r4, #0x1000          @ limH
    movgt    r4, #0x1000
    str      r4, [r0,#4*8]        @ gteIR0

    pop      {r4-r11,pc}
.size gteRTPS_nf_arm, .-gteRTPS_nf_arm


.global gteRTPT_nf_arm @ r0=CP2 (d,c),
gteRTPT_nf_arm:
    ldr      r1, [r0, #4*19]      @ gteSZ3
    push     {r4-r11,lr}
    str      r1, [r0, #4*16]      @ gteSZ0
    mov      lr, #0

rtpt_arm_loop:
    add      r1, r0, lr, lsl #1
    ldrd     r8, [r1]             @ VXYZ(v)
    do_rtpx_mac

    ssatx_prep r6, 16
    usat16_  r2, r12              @ limD
    add      r1, r0, #4*25        @ gteMAC1
    ldr      r3, [r0,#4*(32+26)]  @ gteH
    stmia    r1, {r10-r12}        @ gteMAC123 save
    add      r1, r0, #4*17
    ssatx    r10,r6, 16
    ssatx    r11,r6, 16
    ssatx    r12,r6, 16
    str      r2, [r1, lr]         @ fSZ(v)
    cmp      r3, r2, lsl #1       @ gteH < gteSZ3*2 ?
    mov      r9, #1<<30
    bhs      1f
.if 1
    udiv     r9, r3, r2, r1, r4, r6, r7
.else
    push     {r0, r12, lr}
    mov      r0, r3
    mov      r1, r2
    bl       DIVIDE
    mov      r9, r0
    pop      {r0, r12, lr}
.endif
1:  cmp      r9, #0x20000
    add      r1, r0, #4*12
    movhs    r9, #0x20000
    ldrd     r6, [r0,#4*(32+24)]  @ gteOFXY
/* quotient */ subhs r9, #1
    mov      r2, r6, asr #31
    smlal    r6, r2, r10, r9
    mov      r3, r7, asr #31
    smlal    r7, r3, r11, r9
    lsr      r6, #16
    orr      r6, r2, lsl #16      @ (gteOFX + gteIR1 * q) >> 16
    ssatx_prep r2, 11
    lsr      r7, #16
    orr      r7, r3, lsl #16      @ (gteOFY + gteIR2 * q) >> 16
    ssatx    r6, r2, 11           @ gteSX(v)
    ssatx    r7, r2, 11           @ gteSY(v)
    strh     r6, [r1, lr]!
    add      lr, #4
    strh     r7, [r1, #2]
    cmp      lr, #12
    blt      rtpt_arm_loop

    ldrd     r4, [r0, #4*(32+27)] @ gteDQA, gteDQB
    add      r1, r0, #4*9         @ gteIR1
    mla      r3, r4, r9, r5       @ gteDQB + gteDQA * q
    stmia    r1, {r10,r11,r12}    @ gteIR123 save

    str      r3, [r0,#4*24]       @ gteMAC0
    asrs     r3, #12
    movmi    r3, #0
    cmp      r3, #0x1000          @ limH
    movgt    r3, #0x1000
    str      r3, [r0,#4*8]        @ gteIR0

    pop      {r4-r11,pc}
.size gteRTPT_nf_arm, .-gteRTPT_nf_arm


@ note: not std calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r1 = needs_shift12
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
.macro mvma_op do_flags
    push     {r8-r11}

.if \do_flags
    ands     r3, r1, #1           @ gteFLAG, shift_need
.else
    tst      r1, #1
.endif
    ldmia    r7, {r7-r9}          @ CV123
    ldmia    r6!,{r10-r12}        @ MX1*,MX2*
    asr      r1, r7, #20
    lsl      r7, #12              @ expand to 64bit
    smlalbb  r7, r1, r10, r4      @ MX11 * vx
    smlaltt  r7, r1, r10, r4      @ MX12 * vy
    smlalbb  r7, r1, r11, r5      @ MX13 * vz
    lsrne    r7, #12
    orrne    r7, r1, lsl #20      @ gteMAC1
.if \do_flags
    asrne    r1, #20
    adds     r2, r7, #0x80000000
    adcs     r1, #0
    orrgt    r3, #(1<<30)
    orrmi    r3, #(1<<31)|(1<<27)
    tst      r3, #1               @ repeat shift test
.endif
    asr      r1, r8, #20
    lsl      r8, #12              @ expand to 64bit
    smlaltb  r8, r1, r11, r4      @ MX21 * vx
    smlalbt  r8, r1, r12, r4      @ MX22 * vy
    smlaltb  r8, r1, r12, r5      @ MX23 * vz
    lsrne    r8, #12
    orrne    r8, r1, lsl #20      @ gteMAC2
.if \do_flags
    asrne    r1, #20
    adds     r2, r8, #0x80000000
    adcs     r1, #0
    orrgt    r3, #(1<<29)
    orrmi    r3, #(1<<31)|(1<<26)
    tst      r3, #1               @ repeat shift test
.endif
    ldmia    r6!,{r10-r11}        @ MX3*
    asr      r1, r9, #20
    lsl      r9, #12              @ expand to 64bit
    smlalbb  r9, r1, r10, r4      @ MX31 * vx
    smlaltt  r9, r1, r10, r4      @ MX32 * vy
    smlalbb  r9, r1, r11, r5      @ MX33 * vz
    lsrne    r9, #12
    orrne    r9, r1, lsl #20      @ gteMAC3
.if \do_flags
    asrne    r1, #20
    adds     r2, r9, #0x80000000
    adcs     r1, #0
    orrgt    r3, #(1<<28)
    orrmi    r3, #(1<<31)|(1<<25)
    bic      r3, #1
.else
    mov      r3, #0
.endif
    str      r3, [r0, #4*(32+31)] @ gteFLAG
    add      r1, r0, #4*25
    stmia    r1, {r7-r9}

    pop      {r8-r11}
    bx       lr
.endm
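@ overflow test used above: adding 0x80000000 to the low word and folding
@ the carry into the high word yields 0 iff the 64-bit sum fits in s32;
@ a gt result sets the positive-, mi the negative-overflow gteFLAG bit
@ (bits 30-28 / 27-25 for MAC1-3, bit 31 being the error summary)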

.global gteMVMVA_part_arm
gteMVMVA_part_arm:
    mvma_op 1
.size gteMVMVA_part_arm, .-gteMVMVA_part_arm

.global gteMVMVA_part_nf_arm
gteMVMVA_part_nf_arm:
    mvma_op 0
.size gteMVMVA_part_nf_arm, .-gteMVMVA_part_nf_arm

@ common version of MVMVA with cv3 (== 0) and shift12,
@ can't overflow so no gteMAC flags needed
@ note: not std calling convention used
@ r0 = CP2 (d,c) (must preserve)
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
.global gteMVMVA_part_cv3sh12_arm
gteMVMVA_part_cv3sh12_arm:
    push     {r8-r9}
    ldmia    r6!,{r7-r9}          @ MX1*,MX2*
    smulbb   r1, r7, r4           @ MX11 * vx
    smultt   r2, r7, r4           @ MX12 * vy
    smulbb   r3, r8, r5           @ MX13 * vz
    qadd     r1, r1, r2
    asr      r3, #1               @ prevent oflow, lose a bit
    add      r1, r3, r1, asr #1
    asr      r7, r1, #11
    smultb   r1, r8, r4           @ MX21 * vx
    smulbt   r2, r9, r4           @ MX22 * vy
    smultb   r3, r9, r5           @ MX23 * vz
    qadd     r1, r1, r2
    asr      r3, #1
    add      r1, r3, r1, asr #1
    asr      r8, r1, #11
    ldmia    r6, {r6,r9}          @ MX3*
    smulbb   r1, r6, r4           @ MX31 * vx
    smultt   r2, r6, r4           @ MX32 * vy
    smulbb   r3, r9, r5           @ MX33 * vz
    qadd     r1, r1, r2
    asr      r3, #1
    add      r1, r3, r1, asr #1
    asr      r9, r1, #11
    add      r1, r0, #4*25
    mov      r2, #0
    stmia    r1, {r7-r9}
    str      r2, [r0, #4*(32+31)] @ gteFLAG
    pop      {r8-r9}
    bx       lr
.size gteMVMVA_part_cv3sh12_arm, .-gteMVMVA_part_cv3sh12_arm

#endif /* HAVE_ARMV5 */

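@ gteMAC0 = SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1)
@ sets FLAG bit 16 on positive, bit 15 on negative 32-bit overflow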
.global gteNCLIP_arm @ r0=CP2 (d,c),
gteNCLIP_arm:
    push     {r4-r6,lr}
    ldrsh    r4, [r0, #4*12+2]
    ldrsh    r5, [r0, #4*13+2]
    ldrsh    r6, [r0, #4*14+2]
    ldrsh    lr, [r0, #4*12]
    ldrsh    r2, [r0, #4*13]
    sub      r12, r4, r5          @ 3: gteSY0 - gteSY1
    sub      r5, r5, r6           @ 1: gteSY1 - gteSY2
    smull    r1, r5, lr, r5       @ RdLo, RdHi
    sub      r6, r4               @ 2: gteSY2 - gteSY0
    ldrsh    r3, [r0, #4*14]
    smlal    r1, r5, r2, r6
    mov      lr, #0               @ gteFLAG
    smlal    r1, r5, r3, r12
    mov      r6, #1<<31
    orr      r6, #1<<15
    movs     r2, r1, lsl #1
    adc      r5, r5
    cmp      r5, #0
#ifdef HAVE_ARMV7
    movtgt   lr, #((1<<31)|(1<<16))>>16
#else
    movgt    lr, #(1<<31)
    orrgt    lr, #(1<<16)
#endif
    cmn      r5, #1
    orrmi    lr, r6
    str      r1, [r0, #4*24]
    str      lr, [r0, #4*(32+31)] @ gteFLAG

    pop      {r4-r6,pc}
.size gteNCLIP_arm, .-gteNCLIP_arm


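@ saturate gteMAC1-3 into gteIR1-3; \lm selects the lower clamp bound,
@ 0 (lm=1) or -0x8000 (lm=0), the upper bound is 0x7fff; FLAG bits
@ 24/23/22 (plus the bit-31 summary for IR1/IR2) mark the saturations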
.macro gteMACtoIR lm
    ldr      r2, [r0, #4*25]      @ gteMAC1
    mov      r1, #1<<15
    ldr      r12,[r0, #4*(32+31)] @ gteFLAG
    cmp      r2, r1
    subge    r2, r1, #1
    orrge    r12, #(1<<31)|(1<<24)
.if \lm
    cmp      r2, #0
    movlt    r2, #0
.else
    cmn      r2, r1
    rsblt    r2, r1, #0
.endif
    str      r2, [r0, #4*9]
#ifdef HAVE_ARMV5
    ldrd     r2, [r0, #4*26]      @ gteMAC23
#else
    ldr      r2, [r0, #4*26]
    ldr      r3, [r0, #4*27]
#endif
    orrlt    r12, #(1<<31)|(1<<24)
    cmp      r2, r1
    subge    r2, r1, #1
    orrge    r12, #1<<23
    orrge    r12, #1<<31
.if \lm
    cmp      r2, #0
    movlt    r2, #0
.else
    cmn      r2, r1
    rsblt    r2, r1, #0
.endif
    orrlt    r12, #1<<23
    orrlt    r12, #1<<31
    cmp      r3, r1
    subge    r3, r1, #1
    orrge    r12, #1<<22
.if \lm
    cmp      r3, #0
    movlt    r3, #0
.else
    cmn      r3, r1
    rsblt    r3, r1, #0
.endif
    orrlt    r12, #1<<22
#ifdef HAVE_ARMV5
    strd     r2, [r0, #4*10]      @ gteIR23
#else
    str      r2, [r0, #4*10]
    str      r3, [r0, #4*11]
#endif
    str      r12,[r0, #4*(32+31)] @ gteFLAG
    bx       lr
.endm

.global gteMACtoIR_lm0 @ r0=CP2 (d,c)
gteMACtoIR_lm0:
    gteMACtoIR 0
.size gteMACtoIR_lm0, .-gteMACtoIR_lm0

.global gteMACtoIR_lm1 @ r0=CP2 (d,c)
gteMACtoIR_lm1:
    gteMACtoIR 1
.size gteMACtoIR_lm1, .-gteMACtoIR_lm1


.global gteMACtoIR_lm0_nf @ r0=CP2 (d,c)
gteMACtoIR_lm0_nf:
    add      r12, r0, #4*25
    ldmia    r12, {r1-r3}
    ssatx_prep r12, 16
    ssatx    r1, r12, 16
    ssatx    r2, r12, 16
    ssatx    r3, r12, 16
    add      r12, r0, #4*9
    stmia    r12, {r1-r3}
    bx       lr
.size gteMACtoIR_lm0_nf, .-gteMACtoIR_lm0_nf


.global gteMACtoIR_lm1_nf @ r0=CP2 (d,c)
gteMACtoIR_lm1_nf:
    add      r12, r0, #4*25
    ldmia    r12, {r1-r3}
    ssatx0_prep r12, 16
    ssatx0   r1, r12, 16
    ssatx0   r2, r12, 16
    ssatx0   r3, r12, 16
    add      r12, r0, #4*9
    stmia    r12, {r1-r3}
    bx       lr
.size gteMACtoIR_lm1_nf, .-gteMACtoIR_lm1_nf


.if 0
.global gteMVMVA_test
gteMVMVA_test:
    push     {r4-r7,lr}
    push     {r1}
    and      r2, r1, #0x18000     @ v
    cmp      r2, #0x18000         @ v == 3?
    addeq    r4, r0, #4*9
    addne    r3, r0, r2, lsr #12
    ldmeqia  r4, {r3-r5}
    ldmneia  r3, {r4,r5}
    lsleq    r3, #16
    lsreq    r3, #16
    orreq    r4, r3, r4, lsl #16  @ r4,r5 = VXYZ(v)
    @and     r5, #0xffff
    add      r12, r0, #4*32
    and      r3, r1, #0x60000     @ mx
    lsr      r3, #17
    add      r6, r12, r3, lsl #5
    cmp      r3, #3
    adreq    r6, zeroes
    and      r2, r1, #0x06000     @ cv
    lsr      r2, #13
    add      r7, r12, r2, lsl #5
    add      r7, #4*5
    cmp      r2, #3
    adreq    r7, zeroes
.if 1
    adr      lr, 1f
    bne      0f
    tst      r1, #1<<19
    bne      gteMVMVA_part_cv3sh12_arm
0:
    and      r1, #1<<19
    lsr      r1, #19
    b        gteMVMVA_part_arm
1:
    pop      {r1}
    tst      r1, #1<<10
    adr      lr, 0f
    beq      gteMACtoIR_lm0
    bne      gteMACtoIR_lm1
0:
.else
    bl       gteMVMVA_part_neon
    pop      {r1}
    and      r1, #1<<10
    bl       gteMACtoIR_flags_neon
.endif
    pop      {r4-r7,pc}

zeroes:
    .word 0,0,0,0,0
.endif


@ vim:filetype=armasm