ProcessVertexData = ProcessVertexDataSSE;
}
else
+#elif defined(__ARM_NEON__)
+ if( !g_curRomInfo.bPrimaryDepthHack && options.enableHackForGames != HACK_FOR_NASCAR && options.enableHackForGames != HACK_FOR_ZELDA_MM && !options.bWinFrameMode)
+ {
+ ProcessVertexData = ProcessVertexDataNEON;
+ }
+ else
#endif
{
ProcessVertexData = ProcessVertexDataNoSSE;
DEBUGGER_PAUSE_AND_DUMP(NEXT_VERTEX_CMD,{TRACE0("Paused at Vertex Cmd");});
}
+/* NEON code */
+
+#include "RenderBase_neon.h"
+
+// Batch vertex-processing routine with C linkage; it is only declared here
+// and defined elsewhere (presumably the NEON assembly implementation - TODO
+// confirm). ProcessVertexDataNEON() hands it the output arrays positioned at
+// the first vertex slot to fill, the vertex count, the PV_NEON_* state flags,
+// the source vertices and the current light / matrix / fog / primitive-color
+// state. The caller passes the same primitive color for both primitiveColor
+// and primitiveColor_.
+extern "C" void pv_neon(XVECTOR4 *g_vtxTransformed, XVECTOR4 *g_vecProjected,
+ uint32 *g_dwVtxDifColor, VECTOR2 *g_fVtxTxtCoords,
+ float *g_fFogCoord, uint32 *g_clipFlag2,
+ uint32 dwNum, int neon_state,
+ const FiddledVtx *vtx,
+ const Light *gRSPlights, const float *fRSPAmbientLightRGBA,
+ const XMATRIX *gRSPworldProject, const XMATRIX *gRSPmodelViewTop,
+ uint32 gRSPnumLights, float gRSPfFogMin,
+ uint32 primitiveColor, uint32 primitiveColor_);
+
+// debug
+//#define DO_CMP
+#ifdef DO_CMP
+// note: don't forget -fno-associative-math
+// Scratch outputs for pv_neon() in DO_CMP builds. do_cmp() compares slot s
+// (0 or 1) of these arrays with the matching entries of the global vertex
+// arrays; n_fogcoord is filled by pv_neon() but is not compared.
+static XVECTOR4 n_transformed[2], n_projected[2];
+static uint32 n_color[2];
+static VECTOR2 n_vtxcoords[2];
+static float n_fogcoord[2];
+static uint32 n_clipflag2[2];
+
+// Compares c consecutive 32-bit words at a and b as signed integers (the
+// callers pass float vectors, so this is a comparison of raw bit patterns).
+// Returns 1, after printing the offending difference, as soon as a pair
+// differs by more than 7; returns 0 when all pairs are within tolerance.
+//
+// The difference is computed in 64 bits. The former int subtraction
+// overflowed for operands of opposite sign (e.g. the bit patterns of x and
+// -x differ by exactly 2^31); abs() of the wrapped INT_MIN result stayed
+// negative, so sign-flipped values were silently accepted as equal.
+static int do_cmp_f(const void *a, const void *b, int c)
+{
+    const int *ia = (const int *)a, *ib = (const int *)b;
+    for (int i = 0; i < c; i++) {
+        long long di = (long long)ia[i] - (long long)ib[i];
+        if (di < 0)
+            di = -di;
+        if (di > 7) {
+            printf("di: %lld\n", di);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Compares two packed colors channel by channel (four 8-bit fields at bit
+// offsets 0, 8, 16 and 24). Returns 1 when any channel differs by more than
+// 1, otherwise 0.
+static int do_cmp_c(uint32 a, uint32 b)
+{
+    for (int shift = 0; shift <= 24; shift += 8) {
+        if (abs(((a >> shift) & 0xff) - ((b >> shift) & 0xff)) > 1)
+            return 1;
+    }
+
+    return 0;
+}
+
+// DO_CMP helper: checks the pv_neon() results held in slot s of the n_*
+// scratch arrays against the values stored for vertex i in the global vertex
+// arrays. Each differing field is printed. If anything differed, a summary
+// line is printed and the process exits, using the number of vertices that
+// compared clean so far as the exit status.
+static void do_cmp(int i, int s, int neon_state)
+{
+    static int passed;   // vertices that compared clean so far
+    int mismatch = 0;
+
+    // Raw bit patterns of the two .w members, printed next to the floats.
+    const uint32 neon_w_bits = *(uint32 *)&n_projected[s].w;
+    const uint32 ref_w_bits = *(uint32 *)&g_vecProjected[i].w;
+
+    // if (memcmp(&n_transformed, &g_vtxTransformed[i], sizeof(XVECTOR4)))
+    if (do_cmp_f(&n_transformed[s], &g_vtxTransformed[i], 4)) {
+        mismatch = 1;
+        printf("transformed:\n%13.8e %13.8e %13.8e %13.8e\n"
+               "%13.8e %13.8e %13.8e %13.8e\n",
+               n_transformed[s].x, n_transformed[s].y,
+               n_transformed[s].z, n_transformed[s].w,
+               g_vtxTransformed[i].x, g_vtxTransformed[i].y,
+               g_vtxTransformed[i].z, g_vtxTransformed[i].w);
+    }
+
+    if (do_cmp_f(&n_projected[s], &g_vecProjected[i], 4)) {
+        mismatch = 1;
+        printf("projected:\n%13.8e %13.8e %13.8e %13.8e |%08x\n"
+               "%13.8e %13.8e %13.8e %13.8e |%08x\n",
+               n_projected[s].x, n_projected[s].y,
+               n_projected[s].z, n_projected[s].w,
+               neon_w_bits,
+               g_vecProjected[i].x, g_vecProjected[i].y,
+               g_vecProjected[i].z, g_vecProjected[i].w,
+               ref_w_bits);
+    }
+
+    // Texture coordinates must match exactly.
+    const bool coords_differ =
+        n_vtxcoords[s].x != g_fVtxTxtCoords[i].x ||
+        n_vtxcoords[s].y != g_fVtxTxtCoords[i].y;
+    if (coords_differ) {
+        mismatch = 1;
+        printf("vtxcoords:\n%13.8e %13.8e\n%13.8e %13.8e\n",
+               n_vtxcoords[s].x, n_vtxcoords[s].y,
+               g_fVtxTxtCoords[i].x, g_fVtxTxtCoords[i].y);
+    }
+
+    if (n_clipflag2[s] != g_clipFlag2[i]) {
+        mismatch = 1;
+        printf("clipflag2: %08x %08x\n", n_clipflag2[s], g_clipFlag2[i]);
+    }
+
+    if (do_cmp_c(n_color[s], g_dwVtxDifColor[i])) {
+        mismatch = 1;
+        printf("n_color: %08x %08x\n", n_color[s], g_dwVtxDifColor[i]);
+    }
+
+    // Printed for every vertex processed without the shade flag, whether or
+    // not a mismatch was found.
+    if ((neon_state & PV_NEON_ENABLE_SHADE) == 0)
+        printf("!ENABLE_SHADE!\n");
+
+    if (!mismatch) {
+        passed++;
+        return;
+    }
+
+    printf("%d s=%d, state %02x\n", passed, s, neon_state);
+    printf(".w %08x %08x\n", neon_w_bits, ref_w_bits);
+    exit(passed);
+}
+#endif
+
+// Vertex-load handler that delegates the per-vertex work to pv_neon().
+// dwAddr - byte offset of the source FiddledVtx array inside g_pRDRAMu8
+// dwV0 - index of the first destination slot in the g_* vertex arrays
+// dwNum - number of vertices to process
+// When texture generation and lighting are both enabled the call is
+// forwarded unchanged to ProcessVertexDataNoSSE() instead.
+void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum)
+{
+ if (gRSP.bTextureGen && gRSP.bLightingEnable) {
+ ProcessVertexDataNoSSE(dwAddr, dwV0,dwNum);
+ return;
+ }
+
+ // assumptions:
+ // - g_clipFlag is not used at all
+ // - g_fFogCoord is not used at all
+ // - g_vtxNonTransformed is not used after ProcessVertexData*() returns
+ // - g_normal - same
+
+ // Translate the current RSP/RDP state into the PV_NEON_* flag word that is
+ // handed to pv_neon() (and tested by the reference C loop below).
+ int neon_state = 0;
+ if ( gRSP.bLightingEnable )
+ neon_state |= PV_NEON_ENABLE_LIGHT;
+ if ( (gRDP.geometryMode & G_SHADE) || gRSP.ucode >= 5 )
+ neon_state |= PV_NEON_ENABLE_SHADE;
+ if ( gRSP.bFogEnabled )
+ neon_state |= PV_NEON_ENABLE_FOG;
+ if ( gRDP.geometryMode & G_FOG )
+ neon_state |= PV_NEON_FOG_ALPHA;
+
+ uint32 i;
+#ifdef DO_CMP
+ // Toggles between the two scratch slots compared by do_cmp().
+ uint32 s = 0;
+#endif
+
+ UpdateCombinedMatrix();
+
+ // This function is called upon SPvertex
+ // - do vertex matrix transform
+ // - do vertex lighting
+ // - do texture coordinate transform if needed
+ // - calculate normal vector
+
+ // Output: - g_vecProjected[i] -> transformed vertex x,y,z
+ // - g_vecProjected[i].w -> saved vertex 1/w
+ // - g_vtxTransformed[i]
+ // - g_dwVtxDifColor[i] -> vertex color
+ // - g_fVtxTxtCoords[i] -> vertex texture coordinates
+ // - g_fFogCoord[i] -> unused
+ // - g_clipFlag2[i]
+
+ const FiddledVtx * pVtxBase = (const FiddledVtx*)(g_pRDRAMu8 + dwAddr);
+ g_pVtxBase = (FiddledVtx *)pVtxBase;
+
+ // Clears the _14/_24/_34 members of the global model-view matrix in place.
+ // NOTE(review): presumably so the matrix can be applied to normals without
+ // a fourth-component contribution - confirm; the change is not undone.
+ gRSPmodelViewTop._14 = gRSPmodelViewTop._24 =
+ gRSPmodelViewTop._34 = 0;
+
+ // SP_Timing(RSP_GBI0_Vtx);
+ status.SPCycleCount += Timing_RSP_GBI0_Vtx * dwNum;
+
+ // DO_CC: optional profiling. A counter is read through coprocessor p15
+ // before and after the vertex work; the per-vertex average is printed at
+ // the end of this function.
+//#define DO_CC
+#ifdef DO_CC
+ asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(i));
+ i |= 5; // master enable, ccnt reset
+ i &= ~8; // ccnt divider 0
+ asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(i));
+ // enable cycle counter
+ asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(1<<31));
+ unsigned int cc_start;
+ asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(cc_start));
+#endif
+
+ // Normal path: the whole batch is processed by a single pv_neon() call
+ // that writes directly into the global arrays starting at slot dwV0.
+#if 1
+ i = dwV0;
+ pv_neon(&g_vtxTransformed[i], &g_vecProjected[i],
+ &g_dwVtxDifColor[i], &g_fVtxTxtCoords[i],
+ &g_fFogCoord[i], &g_clipFlag2[i],
+ dwNum, neon_state, &pVtxBase[i - dwV0],
+ gRSPlights, gRSP.fAmbientColors,
+ &gRSPworldProject, &gRSPmodelViewTop,
+ gRSPnumLights, gRSPfFogMin,
+ gRDP.primitiveColor, gRDP.primitiveColor);
+#else
+ // Reference C implementation, compiled only when the "#if 1" above is
+ // changed to 0. With DO_CMP defined, its results are checked against
+ // pv_neon() output one vertex at a time.
+ for (i = dwV0; i < dwV0 + dwNum; i++)
+ {
+#ifdef DO_CMP
+ // NOTE(review): pv_neon() is asked for one vertex on even iterations,
+ // yet do_cmp() reads scratch slot 1 on odd iterations - presumably
+ // pv_neon() always produces vertices in pairs; confirm.
+ if (!(s & 1))
+ pv_neon(n_transformed, n_projected,
+ n_color, n_vtxcoords,
+ n_fogcoord, n_clipflag2,
+ 1, neon_state, &pVtxBase[i - dwV0],
+ gRSPlights, gRSP.fAmbientColors,
+ &gRSPworldProject, &gRSPmodelViewTop,
+ gRSPnumLights, gRSPfFogMin,
+ gRDP.primitiveColor, gRDP.primitiveColor);
+#endif
+
+ const FiddledVtx & vert = pVtxBase[i - dwV0];
+ XVECTOR3 vtx_raw; // was g_vtxNonTransformed
+
+ vtx_raw.x = (float)vert.x;
+ vtx_raw.y = (float)vert.y;
+ vtx_raw.z = (float)vert.z;
+
+ Vec3Transform(&g_vtxTransformed[i], &vtx_raw, &gRSPworldProject); // Convert to w=1
+
+ // Store 1/w and the perspective-divided x, y, z.
+ g_vecProjected[i].w = 1.0f / g_vtxTransformed[i].w;
+ g_vecProjected[i].x = g_vtxTransformed[i].x * g_vecProjected[i].w;
+ g_vecProjected[i].y = g_vtxTransformed[i].y * g_vecProjected[i].w;
+ g_vecProjected[i].z = g_vtxTransformed[i].z * g_vecProjected[i].w;
+
+ // RSP_Vtx_Clipping(i);
+ // Clip flags are only set for vertices with positive 1/w.
+ g_clipFlag2[i] = 0;
+ if( g_vecProjected[i].w > 0 )
+ {
+ if( g_vecProjected[i].x > 1 ) g_clipFlag2[i] |= X_CLIP_MAX;
+ if( g_vecProjected[i].x < -1 ) g_clipFlag2[i] |= X_CLIP_MIN;
+ if( g_vecProjected[i].y > 1 ) g_clipFlag2[i] |= Y_CLIP_MAX;
+ if( g_vecProjected[i].y < -1 ) g_clipFlag2[i] |= Y_CLIP_MIN;
+ }
+
+ if( neon_state & PV_NEON_ENABLE_LIGHT )
+ {
+ XVECTOR3 normal; // was g_normal
+ float r, g, b;
+
+ normal.x = (float)vert.norma.nx;
+ normal.y = (float)vert.norma.ny;
+ normal.z = (float)vert.norma.nz;
+
+ Vec3TransformNormal(normal, gRSPmodelViewTop);
+
+ // Start from the ambient color and add each light's contribution
+ // scaled by the dot product with the normal (front-facing only).
+ r = gRSP.fAmbientLightR;
+ g = gRSP.fAmbientLightG;
+ b = gRSP.fAmbientLightB;
+
+ for (unsigned int l=0; l < gRSPnumLights; l++)
+ {
+ float fCosT = normal.x * gRSPlights[l].x + normal.y * gRSPlights[l].y + normal.z * gRSPlights[l].z;
+
+ if (fCosT > 0 )
+ {
+ r += gRSPlights[l].fr * fCosT;
+ g += gRSPlights[l].fg * fCosT;
+ b += gRSPlights[l].fb * fCosT;
+ }
+ }
+ // Clamp each channel to 255 before packing; alpha comes from the vertex.
+ if (r > 255) r = 255;
+ if (g > 255) g = 255;
+ if (b > 255) b = 255;
+ g_dwVtxDifColor[i] = ((vert.rgba.a<<24)|(((uint32)r)<<16)|(((uint32)g)<<8)|((uint32)b));
+ }
+ else if( neon_state & PV_NEON_ENABLE_SHADE )
+ {
+ // Copy the vertex color; the r and b fields are exchanged between the
+ // source vertex and the IColor view of the destination.
+ IColor &color = *(IColor*)&g_dwVtxDifColor[i];
+ color.b = vert.rgba.r;
+ color.g = vert.rgba.g;
+ color.r = vert.rgba.b;
+ color.a = vert.rgba.a;
+ }
+ else
+ g_dwVtxDifColor[i] = gRDP.primitiveColor; // FLAT shade
+
+ // ReplaceAlphaWithFogFactor(i);
+ if( neon_state & PV_NEON_FOG_ALPHA )
+ {
+ // Use fog factor to replace vertex alpha
+ // (byte 3 of the packed color is overwritten with projected z
+ // clamped to [0, 1] and scaled by 255)
+ if( g_vecProjected[i].z > 1 )
+ *(((uint8*)&(g_dwVtxDifColor[i]))+3) = 0xFF;
+ // missing 'else' in original code??
+ else if( g_vecProjected[i].z < 0 )
+ *(((uint8*)&(g_dwVtxDifColor[i]))+3) = 0;
+ else
+ *(((uint8*)&(g_dwVtxDifColor[i]))+3) = (uint8)(g_vecProjected[i].z*255);
+ }
+
+ g_fVtxTxtCoords[i].x = (float)vert.tu;
+ g_fVtxTxtCoords[i].y = (float)vert.tv;
+#ifdef DO_CMP
+ do_cmp(i, s++ & 1, neon_state);
+#endif
+ }
+#endif
+#ifdef DO_CC
+ // Accumulate elapsed counter ticks and vertex counts across calls; once
+ // more than 20000 vertices have been seen, print the average per vertex
+ // and restart the accumulation.
+ // NOTE(review): "%.u" is an unsigned conversion with precision 0 applied
+ // to an int argument - likely meant "%u"; confirm.
+ static int total, total_c;
+ unsigned int cc;
+ asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(cc));
+ total += cc - cc_start;
+ total_c += dwNum;
+ if (total_c > 20000) {
+ printf("%.u\n", total / total_c);
+ total = total_c = 0;
+ }
+#endif
+}
+
bool PrepareTriangle(uint32 dwV0, uint32 dwV1, uint32 dwV2)
{
if( status.isVertexShaderEnabled || status.bUseHW_T_L )