| 1 | /* |
| 2 | * (C) Gražvydas "notaz" Ignotas, 2014 |
| 3 | * |
| 4 | * This work is licensed under the terms of GNU GPL version 2 or later. |
| 5 | * See the COPYING file in the top-level directory. |
| 6 | */ |
| 7 | |
| 8 | #include "arm_features.h" |
| 9 | #include "RenderBase_neon.h" |
| 10 | |
| 11 | .syntax unified @ unified ARM/Thumb assembler syntax for the whole file |
| 12 | .text |
| 13 | .align 3 @ GAS on ARM takes a power of two here: 2^3 = 8 byte alignment |
| 14 | |
| 15 | /* |
| 16 | * pv_neon: per-vertex transform, projection, clip flags and color, two vertices per loop pass. |
| 17 | * ProcessVertexData register map: |
| 18 | * q | d | c code |
| 19 | * ... |
| 20 | * 12 24 gRSPworldProject _11,_12,_13,_14 |
| 21 | * 25 |
| 22 | * 13 26 gRSPworldProject _21,_22,_23,_24 |
| 23 | * 27 |
| 24 | * 14 28 gRSPworldProject _31,_32,_33,_34 |
| 25 | * 29 |
| 26 | * 15 30 gRSPworldProject _41,_42,_43,_44 |
| 27 | * 31 |
| 28 | * |
| 29 | * r4 vtx[], 16 bytes: |
| 30 | * short y, x, flag, z, tv, tu; |
| 31 | * / uint8 a, b, g, r; |
| 32 | * \ char a, z, y, x; |
| 33 | * (the last 4 bytes are read as color bytes, or as signed normal bytes when lighting is on) |
| 34 | * outputs: |
| 35 | * r0 - XVECTOR4 *g_vtxTransformed |
| 36 | * r1 - XVECTOR4 *g_vecProjected |
| 37 | * r2 - uint32 *g_dwVtxDifColor |
| 38 | * r3 - VECTOR2 *g_fVtxTxtCoords |
| 39 | * sp+00 - float *g_fFogCoord (not accessed below) |
| 40 | * r6 sp+04 - uint32 *g_clipFlag2 |
| 41 | * inputs: |
| 42 | * r11 sp+08 - uint32 dwNum |
| 43 | * r10 sp+0c - int neon_flags |
| 44 | * r4 sp+10 - FiddledVtx vtx[], (r4 [0], r5 [1]) |
| 45 | * r7 sp+14 - Light *gRSPlights |
| 46 | * sp+18 - float *fRSPAmbientLightRGBA |
| 47 | * sp+1c - XMATRIX *gRSPworldProject |
| 48 | * sp+20 - XMATRIX *gRSPmodelViewTop |
| 49 | * sp+24 - uint32 gRSPnumLights |
| 50 | * sp+28 - float gRSPfFogMin (not accessed below) |
| 51 | * sp+2c - uint32 primitiveColor |
| 52 | * sp+30 - uint32 primitiveColor (again; both slots are fetched with one 64-bit vldr) |
| 53 | */ |
| 54 | FUNCTION(pv_neon): |
| 55 | ldr r12, [sp, #0x10] @ vtx, read before the pushes below move sp |
| 56 | pld [r12] @ start fetching the first input vertices |
| 57 | |
| 58 | push {r4-r11,lr} @ 9 core registers = 0x24 bytes |
| 59 | vpush {q4-q7} @ + 0x40 bytes: stack arguments are now at sp+0x64+n |
| 60 | |
| 61 | mov r4, r12 @ vtx |
| 62 | ldr r12, [sp, #0x64+0x1c] @ gRSPworldProject |
| 63 | vld1.32 {q12,q13}, [r12, :128]! @ load gRSPworldProject |
| 64 | vld1.32 {q14,q15}, [r12, :128] |
| 65 | ldr r6, [sp, #0x64+0x04] @ g_clipFlag2 |
| 66 | add r5, r4, #16 @ vtx + 1 |
| 67 | ldr r11, [sp, #0x64+0x08] @ dwNum |
| 68 | ldr r10, [sp, #0x64+0x0c] @ neon_flags |
| 69 | |
| 70 | 0: @ main loop: r4 walks vtx[i], r5 walks vtx[i + 1] |
| 71 | vld1.16 d12, [r4]! @ vtx[0] .z .flag .x .y (reg) |
| 72 | vmovl.s16 q6, d12 |
| 73 | vld1.16 d14, [r5]! @ vtx[1] .z .flag .x .y |
| 74 | vmovl.s16 q7, d14 |
| 75 | vcvt.f32.s32 q6, q6 @ q6 = vtx_raw0 |
| 76 | vcvt.f32.s32 q7, q7 @ q7 = vtx_raw1 |
| 77 | vdup.32 q0, d12[1] @ vtx_raw0.x (dup) |
| 78 | vdup.32 q1, d12[0] @ vtx_raw0.y (dup) |
| 79 | vdup.32 q2, d13[1] @ vtx_raw0.z (dup) |
| 80 | vdup.32 q3, d14[1] @ vtx_raw1.x (dup) |
| 81 | vdup.32 q4, d14[0] @ vtx_raw1.y (dup) |
| 82 | vdup.32 q5, d15[1] @ vtx_raw1.z (dup) |
| 83 | /* note: order of operations matters greatly, |
| 84 | * may cause like 20 fraction bits to differ! */ |
| 85 | vmul.f32 q0, q0, q12 |
| 86 | vmul.f32 q3, q3, q12 |
| 87 | vmla.f32 q0, q1, q13 |
| 88 | vmla.f32 q3, q4, q13 |
| 89 | vmul.f32 q2, q2, q14 @ yes, mul+add is |
| 90 | vmul.f32 q5, q5, q14 @ faster than mla |
| 91 | vadd.f32 q0, q2 |
| 92 | vadd.f32 q3, q5 |
| 93 | vadd.f32 q0, q15 @ q0 = g_vtxTransformed[i] |
| 94 | vadd.f32 q3, q15 @ q3 = g_vtxTransformed[i + 1] |
| 95 | |
| 96 | vld1.16 d16[1], [r4]! @ [0].v |
| 97 | vmov d2, d1 @ d2 = [0] .z (low), .w (high) |
| 98 | vld1.16 d16[0], [r4]! @ [0].u |
| 99 | vsri.64 d2, d7, #32 @ d2 = [1] .w (low), [0] .w (high) |
| 100 | vld1.16 d18[1], [r5]! @ [1].v |
| 101 | #if 1 |
| 102 | vrecpe.f32 d4, d2 @ inv [0][1] .w |
| 103 | vld1.16 d18[0], [r5]! @ [1].u |
| 104 | vrecps.f32 d5, d2, d4 @ step |
| 105 | vmovl.s16 q8, d16 @ widen [0] u,v to 32 bit |
| 106 | /* g_vtxTransformed[0] */ vst1.32 {q0}, [r0, :128]! |
| 107 | vmovl.s16 q9, d18 @ widen [1] u,v to 32 bit |
| 108 | vcvt.f32.s32 d16, d16 @ [0] u,v as float |
| 109 | vcvt.f32.s32 d18, d18 @ [1] u,v as float |
| 110 | vmul.f32 d4, d5, d4 @ better inv |
| 111 | bic r9, r5, #63 @ round r5 down to a multiple of 64 |
| 112 | pld [r9, #64] @ prefetch the 64-byte block after it |
| 113 | vrecps.f32 d5, d2, d4 @ step |
| 114 | cmp r11, #1 @ only one vertex left? |
| 115 | /* u,v g_fVtxTxtCoords[0] */ vst1.32 {d16}, [r3]! |
| 116 | beq 99f @ then skip the [1] stores |
| 117 | /* g_vtxTransformed[1] */ vst1.32 {q3}, [r0, :128]! |
| 118 | /* ... [1] */ vst1.32 {d18}, [r3]! |
| 119 | 99: |
| 120 | vmov.f32 d20, #1.0 @ upper limit for projected .xy |
| 121 | vmov.f32 d21, #-1.0 @ lower limit for projected .xy |
| 122 | vmul.f32 d4, d5, d4 @ better inv [0][1] .w |
| 123 | #if 0 |
| 124 | vrecps.f32 d5, d2, d4 @ step |
| 125 | vmul.f32 d4, d5, d4 @ better inv |
| 126 | #endif |
| 127 | #else |
| 128 | mov r12, #0x3f800000 @ 1.0f |
| 129 | vmov.f32 s6, r12 |
| 130 | vdiv.f32 s8, s6, s4 |
| 131 | vdiv.f32 s9, s6, s5 |
| 132 | #error incomplete |
| 133 | #endif |
| 134 | |
| 135 | mov r8, #X_CLIP_MAX |
| 136 | mov r9, #Y_CLIP_MAX |
| 137 | vmov d22, r8, r9 @ d22 = X_CLIP_MAX, Y_CLIP_MAX flag bits |
| 138 | vmul.f32 q0, q0, d4[1] @ .x .y .z .w *= inv [0] .w |
| 139 | vmul.f32 q1, q3, d4[0] @ q1 = [1] .x .y .z .w * inv [1] .w |
| 140 | vshr.u64 d5, d4, #32 @ inv [0] .w |
| 141 | mov r8, #X_CLIP_MIN |
| 142 | mov r9, #Y_CLIP_MIN |
| 143 | vmov d23, r8, r9 @ d23 = X_CLIP_MIN, Y_CLIP_MIN flag bits |
| 144 | vsli.64 d3, d4, #32 @ insert [1] .w |
| 145 | vsli.64 d1, d5, #32 @ insert [0] .w |
| 146 | vsli.u64 d5, d4, #32 @ [0] [1] .w |
| 147 | vcgt.f32 d6, d0, d20 @ .xy > 1.0? |
| 148 | vcgt.f32 d7, d21, d0 @ .xy < -1.0? |
| 149 | vcgt.f32 d4, d5, #0 @ .w > 0? |
| 150 | vst1.32 {q0}, [r1]! @ g_vecProjected[0] |
| 151 | vcgt.f32 d8, d2, d20 @ same two tests for [1] |
| 152 | vcgt.f32 d9, d21, d2 |
| 153 | vld1.32 d0[0], [r4]! @ mem: [0] .azyx |
| 154 | vand q3, q11 @ compare masks -> clip flag bits, [0] |
| 155 | vand q4, q11 @ and [1] |
| 156 | cmp r11, #1 |
| 157 | beq 99f @ last single vertex: no [1] store |
| 158 | vst1.32 {q1}, [r1]! @ g_vecProjected[1] |
| 159 | 99: |
| 160 | vorr d6, d6, d7 @ [0]: max flags | min flags |
| 161 | vorr d7, d8, d9 @ [1]: max flags | min flags |
| 162 | vld1.32 d0[1], [r5]! @ mem: [1] .azyx |
| 163 | vpadd.u32 d6, d7 @ d6 = [0] x+y flags, [1] x+y flags |
| 164 | vrev32.8 d0, d0 @ make 0xaazzyyxx [1][0] |
| 165 | vsli.u64 d1, d3, #32 @ d1 = [1] [0] .z |
| 166 | vmovl.s8 q4, d0 @ sign-extend the 8 bytes to 16 bit |
| 167 | vand d6, d4 @ keep flags only where .w > 0 |
| 168 | vmovl.s16 q1, d8 @ q1 = [0] bytes as 32 bit |
| 169 | vmovl.s16 q2, d9 @ q2 = [1] bytes as 32 bit |
| 170 | vst1.32 {d6}, [r6]! @ g_clipFlag2 |
| 171 | |
| 172 | tst r10, #PV_NEON_ENABLE_LIGHT |
| 173 | beq pv_neon_no_light @ flag clear: skip the lighting path |
| 174 | @ pv_neon_light: |
| 175 | @ live NEON registers: |
| 176 | @ d1 = [1][0] .z (must preserve) |
| 177 | @ q1,q2 = azyx [1][0] |
| 178 | @ q12+ = gRSPworldProject |
| 179 | ldr r12, [sp, #0x64+0x20] @ gRSPmodelViewTop |
| 180 | vcvt.f32.s32 q1, q1 @ [0] normal bytes as float |
| 181 | vcvt.f32.s32 q2, q2 @ [1] normal bytes as float |
| 182 | vld1.32 {q8,q9}, [r12, :128]! @ load gRSPmodelViewTop |
| 183 | vld1.32 {q10}, [r12, :128] @ third row; the fourth is not loaded |
| 184 | |
| 185 | vdup.32 q5, d4[0] @ [1] .x (dup) |
| 186 | vdup.32 q6, d4[1] @ [1] .y (dup) |
| 187 | vdup.32 q7, d5[0] @ [1] .z (dup) |
| 188 | vdup.32 q2, d2[0] @ [0] .x (dup) |
| 189 | vdup.32 q3, d2[1] @ [0] .y (dup) |
| 190 | vdup.32 q4, d3[0] @ [0] .z (dup) |
| 191 | vmul.f32 q2, q2, q8 |
| 192 | vmul.f32 q5, q5, q8 |
| 193 | vmla.f32 q2, q3, q9 |
| 194 | vmla.f32 q5, q6, q9 |
| 195 | vmul.f32 q4, q4, q10 |
| 196 | vmul.f32 q7, q7, q10 |
| 197 | vadd.f32 q4, q2 @ q4 = temp[0] .xyz0 |
| 198 | vadd.f32 q5, q7 @ q5 = temp[1] .xyz0 |
| 199 | vmul.f32 q2, q4, q4 @ temp .xyz0 ^2 |
| 200 | vmul.f32 q3, q5, q5 |
| 201 | vpadd.f32 d2, d4, d5 @ [0] pairwise sums |
| 202 | vpadd.f32 d3, d6, d7 @ [1] pairwise sums |
| 203 | movw r8, #0x0000ffff |
| 204 | movt r8, #0x7f7f @ max normal float, ~3.4e+38 |
| 205 | vdup.32 d4, r8 |
| 206 | vpadd.f32 d2, d2, d3 @ d2 = [1][0] x^2 + y^2 + z^2 |
| 207 | vcgt.f32 d5, d2, #0 @ mask of lanes with a nonzero sum |
| 208 | vbif d2, d4, d5 @ if (d2 == 0) d2 = MAXFLOAT |
| 209 | |
| 210 | vrsqrte.f32 d3, d2 @ ~ 1/sqrt(d2), d2 = [1][0] .sqrsum |
| 211 | vmul.f32 d4, d3, d2 @ estimate * sqrsum, input for the step |
| 212 | ldr r9, [sp, #0x64+0x18] @ &fRSPAmbientLightRGBA |
| 213 | ldr r7, [sp, #0x64+0x14] @ gRSPlights |
| 214 | ldr r8, [sp, #0x64+0x24] @ gRSPnumLights |
| 215 | vrsqrts.f32 d4, d3, d4 @ step |
| 216 | vld1.32 {q6}, [r9] @ rgb, accumulator for [0] |
| 217 | vld1.32 {q7}, [r9] @ rgb, accumulator for [1] |
| 218 | vmul.f32 d3, d3, d4 @ 1/sqrt(d2) |
| 219 | #if 0 /* not necessary? */ |
| 220 | vmul.f32 d4, d3, d2 |
| 221 | vrsqrts.f32 d4, d3, d4 @ step |
| 222 | vmul.f32 d3, d3, d4 @ 1/sqrt(d2) |
| 223 | #endif |
| 224 | vmul.f32 q2, q4, d3[0] @ q2 = normal[0] .xyz |
| 225 | vmul.f32 q3, q5, d3[1] @ q3 = normal[1] .xyz |
| 226 | |
| 227 | 1: @ light loop: r7 = current light, r8 = lights left; the body runs at least once |
| 228 | vld1.32 {q8}, [r7] @ first 4 floats of gRSPlights[l] |
| 229 | vmul.f32 q4, q8, q2 @ gRSPlights[l] * normal |
| 230 | vmul.f32 q5, q8, q3 |
| 231 | vpadd.f32 d8, d8, d9 |
| 232 | vpadd.f32 d10, d10, d11 |
| 233 | vpadd.f32 d8, d8, d10 @ d8 = [1][0] fCosT |
| 234 | vcgt.f32 d9, d8, #0 @ if (!(fCosT > 0)) |
| 235 | vand d8, d9 @ fCosT = 0 |
| 236 | add r9, r7, #OFFSETOF_Light_fr |
| 237 | vld1.32 {q8}, [r9] @ .fr .fg .fb |
| 238 | vdup.32 q5, d8[1] @ [1] fCosT (dup) |
| 239 | vdup.32 q4, d8[0] @ |
| 240 | vmla.f32 q7, q8, q5 @ .rgb += frgb * fCosT |
| 241 | vmla.f32 q6, q8, q4 |
| 242 | add r7, #SIZEOF_Light @ next light |
| 243 | subs r8, #1 |
| 244 | bgt 1b |
| 245 | |
| 246 | movt r8, #0x437f @ float 255; relies on the low half of r8 being 0 after the loop (true for a count >= 1) |
| 247 | vdup.32 q8, r8 |
| 248 | vcgt.f32 q4, q6, q8 @ if (.rgb > 255) |
| 249 | vcgt.f32 q5, q7, q8 |
| 250 | vbit q6, q8, q4 @ .rgb = 255 |
| 251 | vbit q7, q8, q5 |
| 252 | vcvt.u32.f32 q6, q6 @ float -> u32 |
| 253 | vcvt.u32.f32 q7, q7 |
| 254 | ldrb r8, [r4, #-4] @ .a from vtx |
| 255 | ldrb r9, [r5, #-4] |
| 256 | vext.32 q4, q6, q6, #3 @ reg: .abgr -> .bgra |
| 257 | vext.32 q5, q7, q7, #3 |
| 258 | vmov.32 d8[0], r8 @ use .a from input |
| 259 | vmov.32 d10[0], r9 |
| 260 | vmovn.u32 d8, q4 @ 32 -> 16 bit per channel |
| 261 | vmovn.u32 d10, q5 |
| 262 | vmovn.u16 d0, q4 @ 16 -> 8 bit; low word of d0 = [0] |
| 263 | vmovn.u16 d2, q5 @ low word of d2 = [1] |
| 264 | vsli.u64 d0, d2, #32 @ d0 = [0] (low), [1] (high) |
| 265 | vrev32.8 d0, d0 @ 0xbbggrraa -> 0xaarrggbb |
| 266 | b pv_neon_fog_alpha |
| 267 | |
| 268 | pv_neon_no_light: |
| 269 | tst r10, #PV_NEON_ENABLE_SHADE |
| 270 | vldr d0, [sp, #0x64+0x2c] @ primitiveColor [0] [1] |
| 271 | beq pv_neon_fog_alpha @ flags are still from tst: no shade, keep primitiveColor |
| 272 | @ easier to do with ARM |
| 273 | ldr r8, [r4, #-4] @ [0] color word from vtx |
| 274 | ldr r9, [r5, #-4] @ [1] color word from vtx |
| 275 | ror r8, #8 @ mem: .argb -> .rgba |
| 276 | ror r9, #8 @ reg: 0xbbggrraa -> .. |
| 277 | vmov d0, r8, r9 @ d0 = [0] (low), [1] (high) |
| 278 | |
| 279 | pv_neon_fog_alpha: |
| 280 | tst r10, #PV_NEON_FOG_ALPHA |
| 281 | beq pv_neon_next @ flag clear: store the color as is |
| 282 | vmov.f32 d20, #1.0 |
| 283 | vcgt.f32 d2, d1, d20 @ [0] [1] .z > 1.0? |
| 284 | vcgt.f32 d3, d1, #0 @ > 0? |
| 285 | movw r8, #0 |
| 286 | movt r8, #0x4f7f @ r8 = (float)(255<<24) |
| 287 | vbit d1, d20, d2 @ make 1.0 if needed |
| 288 | vand d1, d3 @ make 0 if not > 0 |
| 289 | vdup.32 d4, r8 |
| 290 | vmul.f32 d1, d1, d4 @ scale clamped .z up to bits 31:24 |
| 291 | vcvt.u32.f32 d1, d1 |
| 292 | vmov.u32 d5, #0xff000000 @ mask of the top byte |
| 293 | vbit d0, d1, d5 @ overwrite the top byte of both colors |
| 294 | |
| 295 | pv_neon_next: |
| 296 | subs r11, #2 @ two vertices consumed |
| 297 | vst1.32 {d0}, [r2]! @ g_dwVtxDifColor |
| 298 | add r4, #16 @ step over the vertex r5 handled |
| 299 | add r5, #16 @ step over the vertex r4 will handle next |
| 300 | bgt 0b @ uses the flags from subs above |
| 301 | nop |
| 302 | |
| 303 | vpop {q4-q7} |
| 304 | pop {r4-r11,pc} |
| 305 | .size pv_neon, .-pv_neon |
| 306 | |
| 307 | |
| 308 | @ (float *d, const float *m1, const float *m2, const float *s): d[i] = m1[i] * m2[i] - s[i], i = 0..1 |
| 309 | FUNCTION(multiply_subtract2): |
| 310 | vld1.32 {d2}, [r1] @ d2 = m1[0], m1[1] |
| 311 | vld1.32 {d3}, [r2] @ d3 = m2[0], m2[1] |
| 312 | vld1.32 {d1}, [r3] @ d1 = s[0], s[1] |
| 313 | vmul.f32 d0, d2, d3 @ d0 = m1 * m2 |
| 314 | vsub.f32 d0, d0, d1 @ d0 = m1 * m2 - s |
| 315 | vst1.32 {d0}, [r0] @ write both results to d |
| 316 | bx lr |
| 317 | .size multiply_subtract2, .-multiply_subtract2 |
| 318 | |
| 319 | |
| 320 | @ (const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2); r0 = float bits of (V1*W2 - V2*W1) * v0.w * v1.w * v2.w |
| 321 | FUNCTION(tv_direction): |
| 322 | vld1.32 {q0}, [r0] @ q0 = v0: d0 = .x .y, d1 = .z .w |
| 323 | vld1.32 {q2}, [r2] @ q2 = v2: d4 = .x .y, d5 = .z .w |
| 324 | vld1.32 {q1}, [r1] @ q1 = v1: d2 = .x .y, d3 = .z .w |
| 325 | vsub.f32 d6, d4, d0 @ d6 = V2,V1 (v2.yx - v0.yx) |
| 326 | vsub.f32 d7, d4, d2 @ d7 = W2,W1 (v2.yx - v1.yx) |
| 327 | vmul.f32 d1, d5 @ d1 = v0.w * v2.w |
| 328 | vrev64.32 d7, d7 @ d7 = W1,W2 (lanes swapped for the cross terms) |
| 329 | vmul.f32 d6, d7 @ d6 = V2*W1,V1*W2 |
| 330 | vmul.f32 d1, d3 @ d1 *= v1.w |
| 331 | vshr.u64 d7, d6, #32 @ d7[0] = V2*W1, d7[1] = 0 |
| 332 | vsub.f32 d6, d7 @ d6[0] = V1*W2 - V2*W1 |
| 333 | vshr.u64 d1, d1, #32 @ d1[0] = v0.w * v1.w * v2.w |
| 334 | vmul.f32 d0, d1, d6 @ d0[0] = w product * cross term |
| 335 | vmov.32 r0, d0[0] @ result leaves in a core register as raw float bits |
| 336 | bx lr |
| 337 | .size tv_direction, .-tv_direction |
| 338 | |
| 339 | @ vim:filetype=armasm:expandtab |