| 1 | /* |
 * (C) Gražvydas "notaz" Ignotas, 2011
| 3 | * |
| 4 | * This work is licensed under the terms of GNU GPL version 2 or later. |
| 5 | * See the COPYING file in the top-level directory. |
| 6 | */ |
| 7 | |
| 8 | /* .equiv HAVE_ARMV7, 1 */ |
| 9 | |
| 10 | .text |
| 11 | .align 2 |
| 12 | |
| 13 | .macro sgnxt16 rd |
| 14 | .if HAVE_ARMV7 |
| 15 | sxth \rd, \rd |
| 16 | .else |
| 17 | lsl \rd, \rd, #16 |
| 18 | asr \rd, \rd, #16 |
| 19 | .endif |
| 20 | .endm |
| 21 | |
| 22 | @ prepare work reg for ssatx |
| 23 | @ in: wr reg, bit to saturate to |
| 24 | .macro ssatx_prep wr bit |
| 25 | .if !HAVE_ARMV7 |
| 26 | mov \wr, #(1<<(\bit-1)) |
| 27 | .endif |
| 28 | .endm |
| 29 | |
| 30 | .macro ssatx rd wr bit |
| 31 | .if HAVE_ARMV7 |
| 32 | ssat \rd, #\bit, \rd |
| 33 | .else |
| 34 | cmp \rd, \wr |
| 35 | subge \rd, \wr, #1 |
| 36 | cmn \rd, \wr |
| 37 | rsblt \rd, \wr, #0 |
| 38 | .endif |
| 39 | .endm |
| 40 | |
| 41 | .macro usat16_ rd rs |
| 42 | .if HAVE_ARMV7 |
| 43 | usat \rd, #16, \rs |
| 44 | .else |
| 45 | subs \rd, \rs, #0 |
| 46 | movlt \rd, #0 |
| 47 | cmp \rd, #0x10000 |
| 48 | movge \rd, #0x0ff00 |
| 49 | orrge \rd, #0x000ff |
| 50 | .endif |
| 51 | .endm |
| 52 | |
| 53 | @ unsigned divide rd = rm / rs |
| 54 | @ no div by 0 check |
| 55 | @ in: rm, rs |
| 56 | @ trash: rm rs |
| 57 | .macro udiv rd rm rs |
| 58 | clz \rd, \rs |
| 59 | lsl \rs, \rs, \rd @ shift up divisor |
| 60 | orr \rd, \rd, #1<<31 |
| 61 | lsr \rd, \rd, \rd |
| 62 | 0: |
| 63 | cmp \rm, \rs |
| 64 | subcs \rm, \rs |
| 65 | adcs \rd, \rd, \rd |
| 66 | lsr \rs, #1 |
| 67 | bcc 0b |
| 68 | .endm |
| 69 | |
| 70 | |
| 71 | @ calculate RTPS/RTPT MAC values |
| 72 | @ in: r0 context, r8,r9 VXYZ |
| 73 | @ out: r10-r12 MAC123 |
| 74 | @ trash: r1-r7 |
| 75 | .macro do_rtpx_mac |
| 76 | add r1, r0, #4*32 |
| 77 | add r2, r0, #4*(32+5) @ gteTRX |
| 78 | ldmia r1!,{r5-r7} @ gteR1*,gteR2* |
| 79 | ldmia r2, {r10-r12} |
| 80 | smulbb r2, r5, r8 @ gteR11 * gteVX0 |
| 81 | smultt r3, r5, r8 @ gteR12 * gteVY0 |
| 82 | smulbb r4, r6, r9 @ gteR13 * gteVZ0 |
| 83 | qadd r2, r2, r3 |
| 84 | asr r4, r4, #1 @ prevent oflow, lose a bit |
| 85 | add r3, r4, r2, asr #1 |
| 86 | add r10,r10,r3, asr #11 @ gteMAC1 |
| 87 | smultb r2, r6, r8 @ gteR21 * gteVX0 |
| 88 | smulbt r3, r7, r8 @ gteR22 * gteVY0 |
| 89 | smultb r4, r7, r9 @ gteR23 * gteVZ0 |
| 90 | ldmia r1!,{r5-r6} @ gteR3* |
| 91 | qadd r2, r2, r3 |
| 92 | asr r4, r4, #1 |
| 93 | add r3, r4, r2, asr #1 |
| 94 | add r11,r11,r3, asr #11 @ gteMAC2 |
| 95 | @ be more accurate for gteMAC3, since it's also a divider |
| 96 | smulbb r2, r5, r8 @ gteR31 * gteVX0 |
| 97 | smultt r3, r5, r8 @ gteR32 * gteVY0 |
| 98 | smulbb r4, r6, r9 @ gteR33 * gteVZ0 |
| 99 | qadd r2, r2, r3 |
| 100 | asr r3, r4, #31 @ expand to 64bit |
| 101 | adds r1, r2, r4 |
| 102 | adc r3, r2, asr #31 @ 64bit sum in r3,r1 |
| 103 | add r12,r12,r3, lsl #20 |
| 104 | add r12,r12,r1, lsr #12 @ gteMAC3 |
| 105 | .endm |
| 106 | |
| 107 | |
| 108 | .global gteRTPS_nf_arm @ r0=CP2 (d,c), |
| 109 | gteRTPS_nf_arm: |
| 110 | push {r4-r11,lr} |
| 111 | |
| 112 | ldmia r0, {r8,r9} @ VXYZ(0) |
| 113 | do_rtpx_mac |
| 114 | add r1, r0, #4*25 @ gteMAC1 |
| 115 | add r2, r0, #4*17 @ gteSZ1 |
| 116 | stmia r1, {r10-r12} @ gteMAC123 save |
| 117 | ldmia r2, {r3-r5} |
| 118 | add r1, r0, #4*16 @ gteSZ0 |
| 119 | add r2, r0, #4*9 @ gteIR1 |
| 120 | ssatx_prep r6, 16 |
| 121 | usat16_ lr, r12 @ limD |
| 122 | ssatx r10,r6, 16 |
| 123 | ssatx r11,r6, 16 |
| 124 | ssatx r12,r6, 16 |
| 125 | stmia r1, {r3-r5,lr} @ gteSZ* |
| 126 | ldr r3, [r0,#4*(32+26)] @ gteH |
| 127 | stmia r2, {r10,r11,r12} @ gteIR123 save |
| 128 | cmp r3, lr, lsl #1 @ gteH < gteSZ3*2 ? |
| 129 | mov r9, #1<<30 |
| 130 | bhs 1f |
| 131 | .if 1 |
| 132 | lsl r3, #16 |
| 133 | udiv r9, r3, lr |
| 134 | .else |
| 135 | push {r0, r12} |
| 136 | mov r0, r3 |
| 137 | mov r1, lr |
| 138 | bl DIVIDE |
| 139 | mov r9, r0 |
| 140 | pop {r0, r12} |
| 141 | .endif |
| 142 | 1: |
| 143 | ldrd r6, [r0,#4*(32+24)] @ gteOFXY |
| 144 | cmp r9, #0x20000 |
| 145 | add r1, r0, #4*12 @ gteSXY0 |
| 146 | movhs r9, #0x20000 |
| 147 | ldmia r1, {r2-r4} |
| 148 | /* quotient */ subhs r9, #1 |
| 149 | mov r2, #0 |
| 150 | smlal r6, r2, r10, r9 |
| 151 | stmia r1!,{r3,r4} @ shift gteSXY |
| 152 | mov r3, #0 |
| 153 | smlal r7, r3, r11, r9 |
| 154 | lsr r6, #16 |
| 155 | /* gteDQA, gteDQB */ ldrd r10,[r0, #4*(32+27)] |
| 156 | orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16 |
| 157 | ssatx_prep r2, 11 |
| 158 | lsr r7, #16 |
| 159 | /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11 |
| 160 | orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16 |
| 161 | ssatx r6, r2, 11 @ gteSX2 |
| 162 | ssatx r7, r2, 11 @ gteSY2 |
| 163 | strh r6, [r1] |
| 164 | strh r7, [r1, #2] |
| 165 | str r4, [r0,#4*24] @ gteMAC0 |
| 166 | asrs r4, #12 |
| 167 | movmi r4, #0 |
| 168 | cmp r4, #0x1000 @ limH |
| 169 | movgt r4, #0x1000 |
| 170 | str r4, [r0,#4*8] @ gteIR0 |
| 171 | |
| 172 | pop {r4-r11,pc} |
| 173 | .size gteRTPS_nf_arm, .-gteRTPS_nf_arm |
| 174 | |
| 175 | |
| 176 | .global gteRTPT_nf_arm @ r0=CP2 (d,c), |
| 177 | gteRTPT_nf_arm: |
| 178 | ldr r1, [r0, #4*19] @ gteSZ3 |
| 179 | push {r4-r11,lr} |
| 180 | str r1, [r0, #4*16] @ gteSZ0 |
| 181 | mov lr, #0 |
| 182 | |
| 183 | rtpt_arm_loop: |
| 184 | add r1, r0, lr, lsl #1 |
| 185 | ldrd r8, [r1] @ VXYZ(v) |
| 186 | do_rtpx_mac |
| 187 | |
| 188 | ssatx_prep r6, 16 |
| 189 | usat16_ r2, r12 @ limD |
| 190 | add r1, r0, #4*25 @ gteMAC1 |
| 191 | ldr r3, [r0,#4*(32+26)] @ gteH |
| 192 | stmia r1, {r10-r12} @ gteMAC123 save |
| 193 | add r1, r0, #4*17 |
| 194 | ssatx r10,r6, 16 |
| 195 | ssatx r11,r6, 16 |
| 196 | ssatx r12,r6, 16 |
| 197 | str r2, [r1, lr] @ fSZ(v) |
| 198 | cmp r3, r2, lsl #1 @ gteH < gteSZ3*2 ? |
| 199 | mov r9, #1<<30 |
| 200 | bhs 1f |
| 201 | .if 1 |
| 202 | lsl r3, #16 |
| 203 | udiv r9, r3, r2 |
| 204 | .else |
| 205 | push {r0, r12, lr} |
| 206 | mov r0, r3 |
| 207 | mov r1, r2 |
| 208 | bl DIVIDE |
| 209 | mov r9, r0 |
| 210 | pop {r0, r12, lr} |
| 211 | .endif |
| 212 | 1: |
| 213 | cmp r9, #0x20000 |
| 214 | add r1, r0, #4*12 |
| 215 | movhs r9, #0x20000 |
| 216 | ldrd r6, [r0,#4*(32+24)] @ gteOFXY |
| 217 | /* quotient */ subhs r9, #1 |
| 218 | mov r2, #0 |
| 219 | smlal r6, r2, r10, r9 |
| 220 | mov r3, #0 |
| 221 | smlal r7, r3, r11, r9 |
| 222 | lsr r6, #16 |
| 223 | orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16 |
| 224 | ssatx_prep r2, 11 |
| 225 | lsr r7, #16 |
| 226 | orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16 |
| 227 | ssatx r6, r2, 11 @ gteSX(v) |
| 228 | ssatx r7, r2, 11 @ gteSY(v) |
| 229 | strh r6, [r1, lr]! |
| 230 | add lr, #4 |
| 231 | strh r7, [r1, #2] |
| 232 | cmp lr, #12 |
| 233 | blt rtpt_arm_loop |
| 234 | |
| 235 | ldrd r4, [r0, #4*(32+27)] @ gteDQA, gteDQB |
| 236 | add r1, r0, #4*9 @ gteIR1 |
| 237 | mla r3, r4, r9, r5 @ gteDQB + gteDQA * q |
| 238 | stmia r1, {r10,r11,r12} @ gteIR123 save |
| 239 | |
| 240 | str r3, [r0,#4*24] @ gteMAC0 |
| 241 | asrs r3, #12 |
| 242 | movmi r3, #0 |
| 243 | cmp r3, #0x1000 @ limH |
| 244 | movgt r3, #0x1000 |
| 245 | str r3, [r0,#4*8] @ gteIR0 |
| 246 | |
| 247 | pop {r4-r11,pc} |
| 248 | .size gteRTPT_nf_arm, .-gteRTPT_nf_arm |
| 249 | |
| 250 | |
| 251 | .global gteNCLIP_arm @ r0=CP2 (d,c), |
| 252 | gteNCLIP_arm: |
| 253 | push {r4-r6,lr} |
| 254 | |
| 255 | add r1, r0, #4*12 |
| 256 | ldmia r1, {r1-r3} |
| 257 | mov r4, r1, asr #16 |
| 258 | mov r5, r2, asr #16 |
| 259 | mov r6, r3, asr #16 |
| 260 | sub r12, r4, r5 @ 3: gteSY0 - gteSY1 |
| 261 | sub r5, r5, r6 @ 1: gteSY1 - gteSY2 |
| 262 | sgnxt16 r1 |
| 263 | smull r1, r5, r1, r5 @ RdLo, RdHi |
| 264 | sub r6, r4 @ 2: gteSY2 - gteSY0 |
| 265 | sgnxt16 r2 |
| 266 | smlal r1, r5, r2, r6 |
| 267 | mov lr, #0 @ gteFLAG |
| 268 | sgnxt16 r3 |
| 269 | smlal r1, r5, r3, r12 |
| 270 | mov r6, #1<<31 |
| 271 | orr r6, #1<<15 |
| 272 | movs r2, r1, lsl #1 |
| 273 | adc r5, r5 |
| 274 | cmp r5, #0 |
| 275 | .if HAVE_ARMV7 |
| 276 | movtgt lr, #((1<<31)|(1<<16))>>16 |
| 277 | .else |
| 278 | movgt lr, #(1<<31) |
| 279 | orrgt lr, #(1<<16) |
| 280 | .endif |
| 281 | mvngt r1, #1<<31 @ maxint |
| 282 | cmn r5, #1 |
| 283 | movmi r1, #1<<31 @ minint |
| 284 | orrmi lr, r6 |
| 285 | str r1, [r0, #4*24] |
| 286 | str lr, [r0, #4*(32+31)] @ gteFLAG |
| 287 | |
| 288 | pop {r4-r6,pc} |
| 289 | .size gteNCLIP_arm, .-gteNCLIP_arm |
| 290 | |
| 291 | |
| 292 | @ vim:filetype=armasm |
| 293 | |