X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=source%2Fgles2rice%2Fsrc%2FRenderBase_neon.S;h=da769c7a99039f0f6e6a7868be0decc69d8ec9c9;hb=42669f3e5b8ddb0f03dda8ea5b3313a1dc0ae749;hp=08df333791f98b31d257ee2672967a8217e4a201;hpb=61b9f2dfb3e20d2e2e7efda30cf459df5134d88f;p=mupen64plus-pandora.git diff --git a/source/gles2rice/src/RenderBase_neon.S b/source/gles2rice/src/RenderBase_neon.S index 08df333..da769c7 100644 --- a/source/gles2rice/src/RenderBase_neon.S +++ b/source/gles2rice/src/RenderBase_neon.S @@ -103,17 +103,20 @@ FUNCTION(pv_neon): vld1.16 d18[0], [r5]! @ [0].u vrecps.f32 d5, d2, d4 @ step vmovl.s16 q8, d16 - /* write g_vtxTransformed */ vst1.32 {q0}, [r0, :128]! + /* g_vtxTransformed[0] */ vst1.32 {q0}, [r0, :128]! vmovl.s16 q9, d18 - /* ... [1] */ vst1.32 {q3}, [r0, :128]! vcvt.f32.s32 d16, d16 vcvt.f32.s32 d18, d18 vmul.f32 d4, d5, d4 @ better inv bic r9, r5, #63 pld [r9, #64] vrecps.f32 d5, d2, d4 @ step - /* wrt u,v to g_fVtxTxtCoords */ vst1.32 {d16}, [r3]! + cmp r11, #1 + /* u,v g_fVtxTxtCoords[0] */ vst1.32 {d16}, [r3]! + beq 99f + /* g_vtxTransformed[1] */ vst1.32 {q3}, [r0, :128]! /* ... [1] */ vst1.32 {d18}, [r3]! + 99: vmov.f32 d20, #1.0 vmov.f32 d21, #-1.0 vmul.f32 d4, d5, d4 @ better inv [0][1] .w @@ -144,12 +147,16 @@ FUNCTION(pv_neon): vcgt.f32 d6, d0, d20 @ .xy > 1.0? vcgt.f32 d7, d21, d0 vcgt.f32 d4, d5, #0 @ .w > 0? - vst1.32 {q0,q1}, [r1]! @ wrt g_vecProjected + vst1.32 {q0}, [r1]! @ g_vecProjected[0] vcgt.f32 d8, d2, d20 vcgt.f32 d9, d21, d2 vld1.32 d0[0], [r4]! @ mem: [0] .azyx vand q3, q11 vand q4, q11 + cmp r11, #1 + beq 99f + vst1.32 {q1}, [r1]! @ g_vecProjected[1] +99: vorr d6, d6, d7 vorr d7, d8, d9 vld1.32 d0[1], [r5]! 
@ mem: [1] .azyx
@@ -298,4 +305,48 @@ pv_neon_next:
     .size pv_neon, .-pv_neon
 
 
+@ (float *d, const float *m1, const float *m2, const float *s)
+@ Two-lane single-precision multiply-subtract:
+@   d[i] = m1[i] * m2[i] - s[i]   for i = 0, 1
+@ In:   r0 = d, r1 = m1, r2 = m2, r3 = s (two floats accessed through each)
+@ Out:  the two results are stored to d[0], d[1]
+@ Uses: d0-d3; no core registers changed, no stack
+FUNCTION(multiply_subtract2):
+    vld1.32     {d1}, [r1]              @ d1 = m1[0..1]
+    vld1.32     {d2}, [r2]              @ d2 = m2[0..1]
+    vmul.f32    d0, d1, d2              @ d0 = m1 * m2
+    vld1.32     {d3}, [r3]              @ d3 = s[0..1]
+    vsub.f32    d0, d3                  @ d0 -= s
+    vst1.32     {d0}, [r0]              @ d[0..1] = d0
+    bx          lr
+    .size multiply_subtract2, .-multiply_subtract2
+
+
+@ (const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2)
+@ With x, y, w standing for the 1st, 2nd and 4th float of a vector, computes:
+@   V1 = v2.x - v0.x    V2 = v2.y - v0.y
+@   W1 = v2.x - v1.x    W2 = v2.y - v1.y
+@   result = (V1*W2 - V2*W1) * v0.w * v1.w * v2.w
+@ In:   r0 = v0, r1 = v1, r2 = v2 (four floats loaded from each)
+@ Out:  r0 = bit pattern of the float result (moved out of d0[0])
+@ Uses: d0-d7; no stack
+FUNCTION(tv_direction):
+    vld1.32     {q0}, [r0]              @ q0 = v0 (d0 = 1st,2nd float; d1 = 3rd,4th)
+    vld1.32     {q2}, [r2]              @ q2 = v2 (d4, d5 likewise)
+    vld1.32     {q1}, [r1]              @ q1 = v1 (d2, d3 likewise)
+    vsub.f32    d6, d4, d0              @ d6 = V2,V1
+    vsub.f32    d7, d4, d2              @ d7 = W2,W1
+    vmul.f32    d1, d5                  @ d1 = v0.w * v2.w (in the upper lane)
+    vrev64.32   d7, d7                  @ d7 = W1,W2 (lanes swapped)
+    vmul.f32    d6, d7                  @ d6 = V2*W1,V1*W2
+    vmul.f32    d1, d3                  @ d1 *= v1.w
+    vshr.u64    d7, d6, #32             @ d7[0] = V2*W1, upper lane cleared
+    vsub.f32    d6, d7                  @ d6[0] = V1*W2 - V2*W1
+    vshr.u64    d1, d1, #32             @ d1[0] = v0.w * v1.w * v2.w
+    vmul.f32    d0, d1, d6              @ d0[0] = result
+    vmov.32     r0, d0[0]               @ hand the float back in a core register
+    bx          lr
+    .size tv_direction, .-tv_direction
+
+
 @ vim:filetype=armasm:expandtab