$(SRCDIR)/osal_files_unix.c
endif
+# Hand-written NEON assembly (pv_neon, multiply_subtract2, tv_direction), added for every ARM build.
+# NOTE(review): the C++ code selects these routines with __ARM_NEON__ while this test only looks at CPU;
+# confirm that ARM builds without NEON enabled still assemble and link.
+ifeq ($(CPU),ARM)
+SOURCE += \
+ $(SRCDIR)/RenderBase_neon.S
+endif
+
# generate a list of object files build, make a temporary directory for them
OBJECTS := $(patsubst $(SRCDIR)/%.c, $(OBJDIR)/%.o, $(filter %.c, $(SOURCE)))
OBJECTS += $(patsubst $(SRCDIR)/%.cpp, $(OBJDIR)/%.o, $(filter %.cpp, $(SOURCE)))
+OBJECTS += $(patsubst $(SRCDIR)/%.S, $(OBJDIR)/%.o, $(filter %.S, $(SOURCE)))
OBJDIRS = $(dir $(OBJECTS))
$(shell $(MKDIR) $(OBJDIRS))
$(OBJDIR)/%.o: $(SRCDIR)/%.c
$(COMPILE.c) -o $@ $<
+# .S files are built with the same $(COMPILE.c) command as the .c files;
+# RenderBase_neon.S contains #include lines and macros, so it needs preprocessing.
+$(OBJDIR)/%.o: $(SRCDIR)/%.S
+ $(COMPILE.c) -o $@ $<
+
$(OBJDIR)/%.o: $(SRCDIR)/%.cpp
$(COMPILE.cc) -o $@ $<
#endif
extern RenderTextureInfo gRenderTextureInfos[];
-void SetVertexTextureUVCoord(TexCord &dst, float s, float t, int tile, TxtrCacheEntry *pEntry)
+// The source coordinates now arrive as one TexCord (src.u, src.v) instead of two floats;
+// the result is still written to dst.
+void SetVertexTextureUVCoord(TexCord &dst, const TexCord &src, int tile, TxtrCacheEntry *pEntry)
{
RenderTexture &txtr = g_textures[tile];
RenderTextureInfo &info = gRenderTextureInfos[pEntry->txtrBufIdx-1];
+ // Unpack src into the s/t locals that the remaining, unchanged body works on.
+ float s = src.u;
+ float t = src.v;
uint32 addrOffset = g_TI.dwAddr-info.CI_Info.dwAddr;
uint32 extraTop = (addrOffset>>(info.CI_Info.dwSize-1)) /info.CI_Info.dwWidth;
dst.v = t;
}
-void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T)
+// Stores the coordinate pair fTex0 into v.tcord[0].
+// If texture 0 has an entry with txtrBufIdx > 0 the pair goes through the
+// file-scope ::SetVertexTextureUVCoord for tile 0; otherwise it is copied unchanged.
+void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, const TexCord &fTex0)
{
RenderTexture &txtr = g_textures[0];
if( txtr.pTextureEntry && txtr.pTextureEntry->txtrBufIdx > 0 )
{
- ::SetVertexTextureUVCoord(v.tcord[0], fTex0S, fTex0T, 0, txtr.pTextureEntry);
+ ::SetVertexTextureUVCoord(v.tcord[0], fTex0, 0, txtr.pTextureEntry);
}
else
{
- v.tcord[0].u = fTex0S;
- v.tcord[0].v = fTex0T;
+ v.tcord[0] = fTex0;
}
}
-void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T, float fTex1S, float fTex1T)
+// Float overload kept for existing callers: packs (fTex0S, fTex0T) into a TexCord
+// and forwards to the single-TexCord overload.
+void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T)
+{
+ // Aggregate initialization: the two floats fill the first two members of TexCord.
+ TexCord t = { fTex0S, fTex0T };
+ SetVertexTextureUVCoord(v, t);
+}
+
+// Two-texture variant: stores fTex0_ into v.tcord[0] and fTex1_ into v.tcord[1].
+// For each texture whose entry has txtrBufIdx > 0 the pair goes through
+// ::SetVertexTextureUVCoord for that tile; otherwise it is copied unchanged.
+void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, const TexCord &fTex0_, const TexCord &fTex1_)
{
+ // Work on local copies: the Zelda hack below halves the coordinates in place
+ // and the arguments are const references.
+ TexCord fTex0 = fTex0_;
+ TexCord fTex1 = fTex1_;
+
if( (options.enableHackForGames == HACK_FOR_ZELDA||options.enableHackForGames == HACK_FOR_ZELDA_MM) && m_Mux == 0x00262a60150c937fLL && gRSP.curTile == 0 )
{
// Hack for Zelda Sun
t1.dwFormat == TXT_FMT_I && t1.dwSize == TXT_SIZE_8b && t1.dwWidth == 64 &&
t0.dwHeight == t1.dwHeight )
{
- fTex0S /= 2;
- fTex0T /= 2;
- fTex1S /= 2;
- fTex1T /= 2;
+ fTex0.u /= 2;
+ fTex0.v /= 2;
+ fTex1.u /= 2;
+ fTex1.v /= 2;
}
}
RenderTexture &txtr0 = g_textures[0];
if( txtr0.pTextureEntry && txtr0.pTextureEntry->txtrBufIdx > 0 )
{
- ::SetVertexTextureUVCoord(v.tcord[0], fTex0S, fTex0T, 0, txtr0.pTextureEntry);
+ ::SetVertexTextureUVCoord(v.tcord[0], fTex0, 0, txtr0.pTextureEntry);
}
else
{
- v.tcord[0].u = fTex0S;
- v.tcord[0].v = fTex0T;
+ v.tcord[0] = fTex0;
}
RenderTexture &txtr1 = g_textures[1];
if( txtr1.pTextureEntry && txtr1.pTextureEntry->txtrBufIdx > 0 )
{
- ::SetVertexTextureUVCoord(v.tcord[1], fTex1S, fTex1T, 1, txtr1.pTextureEntry);
+ ::SetVertexTextureUVCoord(v.tcord[1], fTex1, 1, txtr1.pTextureEntry);
}
else
{
- v.tcord[1].u = fTex1S;
- v.tcord[1].v = fTex1T;
+ v.tcord[1] = fTex1;
}
}
+// Four-float overload kept for existing callers: packs each (S, T) pair into a
+// TexCord and forwards to the two-TexCord overload.
+void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T, float fTex1S, float fTex1T)
+{
+ TexCord t0 = { fTex0S, fTex0T };
+ TexCord t1 = { fTex1S, fTex1T };
+ SetVertexTextureUVCoord(v, t0, t1);
+}
+
+
void CRender::SetClipRatio(uint32 type, uint32 w1)
{
bool modified = false;
void SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T, float fTex1S, float fTex1T);
void SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T);
+ // TexCord-based overloads; each coordinate pair is passed as one const reference.
+ void SetVertexTextureUVCoord(TLITVERTEX &v, const TexCord &fTex0, const TexCord &fTex1);
+ void SetVertexTextureUVCoord(TLITVERTEX &v, const TexCord &fTex0);
virtual COLOR PostProcessDiffuseColor(COLOR curDiffuseColor)=0;
virtual COLOR PostProcessSpecularColor()=0;
bool bHalfTxtScale=false;
extern uint32 lastSetTile;
+// Shorthand for the GCC-style attribute; applied to the InitVertex_* helpers below,
+// whose call sites are all marked unlikely with __builtin_expect(..., 0).
+// NOTE(review): the macro is never #undef'd and the attribute syntax is compiler-specific —
+// confirm no later header uses "noinline" as an identifier and that only GCC/Clang build this file.
+#define noinline __attribute__((noinline))
+
+// Sets bHalfTxtScale when the last set tile is 32-bit or 4-bit and the scaled
+// texture coordinate of vertex dwV equals twice the tile width or twice the tile height.
+// The caller (InitVertex) only calls this when g_curRomInfo.bTextureScaleHack is set
+// and bHalfTxtScale is still false.
+static noinline void InitVertex_scale_hack_check(uint32 dwV)
+{
+ // Check for txt scale hack
+ if( gRDP.tiles[lastSetTile].dwSize == TXT_SIZE_32b || gRDP.tiles[lastSetTile].dwSize == TXT_SIZE_4b )
+ {
+ // (extent + 1) << 1, i.e. twice the tile size in each direction
+ int width = ((gRDP.tiles[lastSetTile].sh-gRDP.tiles[lastSetTile].sl+1)<<1);
+ int height = ((gRDP.tiles[lastSetTile].th-gRDP.tiles[lastSetTile].tl+1)<<1);
+ // exact float == int comparison, unchanged from the code this was extracted from
+ if( g_fVtxTxtCoords[dwV].x*gRSP.fTexScaleX == width || g_fVtxTxtCoords[dwV].y*gRSP.fTexScaleY == height )
+ {
+ bHalfTxtScale=true;
+ }
+ }
+}
+
+
+// Fills v.x/y/z/rhw from g_vecProjected[dwV] and sets v.dcSpecular.
+// Called from InitVertex when openGL is false or options.bOGLVertexClipper is TRUE.
+static noinline void InitVertex_notopengl_or_clipper_adjust(TLITVERTEX &v, uint32 dwV)
+{
+ v.x = g_vecProjected[dwV].x*gRSP.vtxXMul+gRSP.vtxXAdd;
+ v.y = g_vecProjected[dwV].y*gRSP.vtxYMul+gRSP.vtxYAdd;
+ v.z = (g_vecProjected[dwV].z + 1.0f) * 0.5f; // DirectX minZ=0, maxZ=1
+ //v.z = g_vecProjected[dwV].z; // DirectX minZ=0, maxZ=1
+ v.rhw = g_vecProjected[dwV].w;
+ VTX_DUMP(TRACE4(" Proj : x=%f, y=%f, z=%f, rhw=%f", v.x,v.y,v.z,v.rhw));
+
+ if( gRSP.bProcessSpecularColor )
+ {
+ v.dcSpecular = CRender::g_pRender->PostProcessSpecularColor();
+ if( gRSP.bFogEnabled )
+ {
+ // keep the low 24 bits and put the fog factor into the top byte
+ v.dcSpecular &= 0x00FFFFFF;
+ uint32 fogFct = 0xFF-(uint8)((g_fFogCoord[dwV]-gRSPfFogMin)*gRSPfFogDivider);
+ v.dcSpecular |= (fogFct<<24);
+ }
+ }
+ else if( gRSP.bFogEnabled )
+ {
+ // no specular processing: dcSpecular carries only the fog factor in its top byte
+ uint32 fogFct = 0xFF-(uint8)((g_fFogCoord[dwV]-gRSPfFogMin)*gRSPfFogDivider);
+ v.dcSpecular = (fogFct<<24);
+ }
+}
+
+
+// Rescales the generated texture coordinates of vertex dwV for the current tile
+// (and for tile (curTile+1)&7 when texel 1 is enabled) and stores them in v through
+// CRender::g_pRender->SetVertexTextureUVCoord.
+// Called from InitVertex when both gRSP.bTextureGen and gRSP.bLightingEnable are set.
+static noinline void InitVertex_texgen_correct(TLITVERTEX &v, uint32 dwV)
+{
+ // Correction for texGen result
+ float u0,u1,v0,v1;
+ RenderTexture &tex0 = g_textures[gRSP.curTile];
+ u0 = g_fVtxTxtCoords[dwV].x * 32 * 1024 * gRSP.fTexScaleX / tex0.m_fTexWidth;
+ v0 = g_fVtxTxtCoords[dwV].y * 32 * 1024 * gRSP.fTexScaleY / tex0.m_fTexHeight;
+ u0 *= (gRDP.tiles[gRSP.curTile].fShiftScaleS);
+ v0 *= (gRDP.tiles[gRSP.curTile].fShiftScaleT);
+
+ if( CRender::g_pRender->IsTexel1Enable() )
+ {
+ // same computation against the next tile, index wrapped with & 7
+ RenderTexture &tex1 = g_textures[(gRSP.curTile+1)&7];
+ u1 = g_fVtxTxtCoords[dwV].x * 32 * 1024 * gRSP.fTexScaleX / tex1.m_fTexWidth;
+ v1 = g_fVtxTxtCoords[dwV].y * 32 * 1024 * gRSP.fTexScaleY / tex1.m_fTexHeight;
+ u1 *= gRDP.tiles[(gRSP.curTile+1)&7].fShiftScaleS;
+ v1 *= gRDP.tiles[(gRSP.curTile+1)&7].fShiftScaleT;
+ CRender::g_pRender->SetVertexTextureUVCoord(v, u0, v0, u1, v1);
+ }
+ else
+ {
+ CRender::g_pRender->SetVertexTextureUVCoord(v, u0, v0);
+ }
+}
+
+
+// multiply_subtract2: d[i] = m1[i] * m2[i] - s[i] for i = 0, 1.
+// Each pointer must address two consecutive floats.
+// Without __ARM_NEON__ the C loop below is used; with it, the routine of the
+// same name from the assembly file is declared instead.
+#ifndef __ARM_NEON__
+static void multiply_subtract2(float *d, const float *m1, const float *m2, const float *s)
+{
+ int i;
+ for (i = 0; i < 2; i++)
+ d[i] = m1[i] * m2[i] - s[i];
+}
+#else
+extern "C" void multiply_subtract2(float *d, const float *m1, const float *m2, const float *s);
+#endif
+/*
+ * InitVertex: fills vertex-buffer slot vtxIndex (g_vtxBuffer, g_vtxProjected5,
+ * g_vtxIndex) from the already processed vertex dwV and sets its texture
+ * coordinates through CRender::g_pRender->SetVertexTextureUVCoord.
+ * The rarely taken paths (non-OpenGL / vertex clipper, texgen correction,
+ * texture scale hack) live in the noinline InitVertex_* helpers.
+ */
void InitVertex(uint32 dwV, uint32 vtxIndex, bool bTexture, bool openGL)
{
VTX_DUMP(TRACE2("Init vertex (%d) to vtx buf[%d]:", dwV, vtxIndex));
+#ifdef __linux__
+ // On Linux the openGL argument is ignored and always treated as true.
+ openGL = 1; // what else there is?
+#endif
TLITVERTEX &v = g_vtxBuffer[vtxIndex];
VTX_DUMP(TRACE4(" Trans: x=%f, y=%f, z=%f, w=%f", g_vtxTransformed[dwV].x,g_vtxTransformed[dwV].y,g_vtxTransformed[dwV].z,g_vtxTransformed[dwV].w));
if( openGL )
g_vtxProjected5[vtxIndex][3] = g_vtxTransformed[dwV].w;
g_vtxProjected5[vtxIndex][4] = g_vecProjected[dwV].z;
- if( g_vtxTransformed[dwV].w < 0 )
+ // Sign-bit test on w (true for negative values and for -0.0f). The builtin is used
+ // instead of an int* cast so the float is not read through an incompatible pointer type.
+ if( __builtin_signbitf(g_vtxTransformed[dwV].w) )
g_vtxProjected5[vtxIndex][4] = 0;
g_vtxIndex[vtxIndex] = vtxIndex;
}
- if( !openGL || options.bOGLVertexClipper == TRUE )
+ if( __builtin_expect(!openGL || options.bOGLVertexClipper == TRUE, 0) )
{
- v.x = g_vecProjected[dwV].x*gRSP.vtxXMul+gRSP.vtxXAdd;
- v.y = g_vecProjected[dwV].y*gRSP.vtxYMul+gRSP.vtxYAdd;
- v.z = (g_vecProjected[dwV].z + 1.0f) * 0.5f; // DirectX minZ=0, maxZ=1
- //v.z = g_vecProjected[dwV].z; // DirectX minZ=0, maxZ=1
- v.rhw = g_vecProjected[dwV].w;
- VTX_DUMP(TRACE4(" Proj : x=%f, y=%f, z=%f, rhw=%f", v.x,v.y,v.z,v.rhw));
-
- if( gRSP.bProcessSpecularColor )
- {
- v.dcSpecular = CRender::g_pRender->PostProcessSpecularColor();
- if( gRSP.bFogEnabled )
- {
- v.dcSpecular &= 0x00FFFFFF;
- uint32 fogFct = 0xFF-(uint8)((g_fFogCoord[dwV]-gRSPfFogMin)*gRSPfFogDivider);
- v.dcSpecular |= (fogFct<<24);
- }
- }
- else if( gRSP.bFogEnabled )
- {
- uint32 fogFct = 0xFF-(uint8)((g_fFogCoord[dwV]-gRSPfFogMin)*gRSPfFogDivider);
- v.dcSpecular = (fogFct<<24);
- }
+ InitVertex_notopengl_or_clipper_adjust(v, dwV);
}
VTX_DUMP(TRACE2(" (U,V): %f, %f", g_fVtxTxtCoords[dwV].x,g_fVtxTxtCoords[dwV].y));
{
// If the vert is already lit, then there is no normal (and hence we can't generate tex coord)
// Only scale if not generated automatically
- if (gRSP.bTextureGen && gRSP.bLightingEnable)
+ if ( __builtin_expect(gRSP.bTextureGen && gRSP.bLightingEnable, 0) )
{
- // Correction for texGen result
- float u0,u1,v0,v1;
- RenderTexture &tex0 = g_textures[gRSP.curTile];
- u0 = g_fVtxTxtCoords[dwV].x * 32 * 1024 * gRSP.fTexScaleX / tex0.m_fTexWidth;
- v0 = g_fVtxTxtCoords[dwV].y * 32 * 1024 * gRSP.fTexScaleY / tex0.m_fTexHeight;
- u0 *= (gRDP.tiles[gRSP.curTile].fShiftScaleS);
- v0 *= (gRDP.tiles[gRSP.curTile].fShiftScaleT);
-
- if( CRender::g_pRender->IsTexel1Enable() )
- {
- RenderTexture &tex1 = g_textures[(gRSP.curTile+1)&7];
- u1 = g_fVtxTxtCoords[dwV].x * 32 * 1024 * gRSP.fTexScaleX / tex1.m_fTexWidth;
- v1 = g_fVtxTxtCoords[dwV].y * 32 * 1024 * gRSP.fTexScaleY / tex1.m_fTexHeight;
- u1 *= gRDP.tiles[(gRSP.curTile+1)&7].fShiftScaleS;
- v1 *= gRDP.tiles[(gRSP.curTile+1)&7].fShiftScaleT;
- CRender::g_pRender->SetVertexTextureUVCoord(v, u0, v0, u1, v1);
- }
- else
- {
- CRender::g_pRender->SetVertexTextureUVCoord(v, u0, v0);
- }
+ InitVertex_texgen_correct(v, dwV);
}
else
{
- float tex0u = g_fVtxTxtCoords[dwV].x *gRSP.tex0scaleX - gRSP.tex0OffsetX ;
- float tex0v = g_fVtxTxtCoords[dwV].y *gRSP.tex0scaleY - gRSP.tex0OffsetY ;
+ // (u, v) = (x*scaleX - offsetX, y*scaleY - offsetY), computed as one pair of floats.
+ // NOTE(review): multiply_subtract2 reads two consecutive floats from each pointer, so this
+ // relies on x/y, tex0scaleX/Y, tex0OffsetX/Y and u/v being adjacent in memory — confirm
+ // against the struct definitions.
+ TexCord tex0;
+ multiply_subtract2(&tex0.u, &g_fVtxTxtCoords[dwV].x, &gRSP.tex0scaleX, &gRSP.tex0OffsetX);
if( CRender::g_pRender->IsTexel1Enable() )
{
- float tex1u = g_fVtxTxtCoords[dwV].x *gRSP.tex1scaleX - gRSP.tex1OffsetX ;
- float tex1v = g_fVtxTxtCoords[dwV].y *gRSP.tex1scaleY - gRSP.tex1OffsetY ;
+ TexCord tex1;
+ multiply_subtract2(&tex1.u, &g_fVtxTxtCoords[dwV].x, &gRSP.tex1scaleX, &gRSP.tex1OffsetX);
- CRender::g_pRender->SetVertexTextureUVCoord(v, tex0u, tex0v, tex1u, tex1v);
- VTX_DUMP(TRACE2(" (tex0): %f, %f", tex0u,tex0v));
- VTX_DUMP(TRACE2(" (tex1): %f, %f", tex1u,tex1v));
+ CRender::g_pRender->SetVertexTextureUVCoord(v, tex0, tex1);
+ VTX_DUMP(TRACE2(" (tex0): %f, %f", tex0.u,tex0.v));
+ VTX_DUMP(TRACE2(" (tex1): %f, %f", tex1.u,tex1.v));
}
else
{
- CRender::g_pRender->SetVertexTextureUVCoord(v, tex0u, tex0v);
- VTX_DUMP(TRACE2(" (tex0): %f, %f", tex0u,tex0v));
+ CRender::g_pRender->SetVertexTextureUVCoord(v, tex0);
+ VTX_DUMP(TRACE2(" (tex0): %f, %f", tex0.u,tex0.v));
}
}
- // Check for txt scale hack
- if( !bHalfTxtScale && g_curRomInfo.bTextureScaleHack &&
- (gRDP.tiles[lastSetTile].dwSize == TXT_SIZE_32b || gRDP.tiles[lastSetTile].dwSize == TXT_SIZE_4b ) )
- {
- int width = ((gRDP.tiles[lastSetTile].sh-gRDP.tiles[lastSetTile].sl+1)<<1);
- int height = ((gRDP.tiles[lastSetTile].th-gRDP.tiles[lastSetTile].tl+1)<<1);
- if( g_fVtxTxtCoords[dwV].x*gRSP.fTexScaleX == width || g_fVtxTxtCoords[dwV].y*gRSP.fTexScaleY == height )
- {
- bHalfTxtScale=true;
- }
- }
- }
-
- if( g_curRomInfo.bEnableTxtLOD && vtxIndex == 1 && gRDP.otherMode.text_lod )
- {
- if( CRender::g_pRender->IsTexel1Enable() && CRender::g_pRender->m_pColorCombiner->m_pDecodedMux->isUsed(MUX_LODFRAC) )
- {
- ComputeLOD(openGL);
- }
- else
- {
- gRDP.LODFrac = 0;
- }
+ // The texture LOD computation that used to follow here is removed from this function.
+ if( __builtin_expect(g_curRomInfo.bTextureScaleHack && !bHalfTxtScale, 0) )
+ InitVertex_scale_hack_check(dwV);
}
VTX_DUMP(TRACE2(" DIF(%08X), SPE(%08X)", v.dcDiffuse, v.dcSpecular));
DEBUGGER_PAUSE_AND_DUMP(NEXT_VERTEX_CMD,{TRACE0("Paused at Vertex Cmd");});
}
+/* NEON code */
+
+#include "RenderBase_neon.h"
+
+// pv_neon is implemented in assembly; the argument order here must match the
+// register map documented in that file.
+// NOTE(review): the assembly reads every argument after g_fVtxTxtCoords from fixed stack
+// offsets, with gRSPfFogMin occupying one stack slot. Under a hard-float calling convention
+// the float would travel in a VFP register and the primitiveColor offsets would shift —
+// confirm the build passes float arguments in core registers / on the stack.
extern "C" void pv_neon(XVECTOR4 *g_vtxTransformed, XVECTOR4 *g_vecProjected,
uint32 *g_dwVtxDifColor, VECTOR2 *g_fVtxTxtCoords,
float *g_fFogCoord, uint32 *g_clipFlag2,
- uint32 dwNum, const FiddledVtx *vtx,
+ uint32 dwNum, int neon_state,
+ const FiddledVtx *vtx,
const Light *gRSPlights, const float *fRSPAmbientLightRGBA,
const XMATRIX *gRSPworldProject, const XMATRIX *gRSPmodelViewTop,
- uint32 gRSPnumLights, float gRSPfFogMin);
+ uint32 gRSPnumLights, float gRSPfFogMin,
+ uint32 primitiveColor, uint32 primitiveColor_);
+
+// Returns the raw bits of the float triangle-direction value; callers only look at the sign.
+extern "C" int tv_direction(const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2);
void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum)
{
// assumtions:
// - g_clipFlag is not used at all
+ // - g_fFogCoord is not used at all
// - g_vtxNonTransformed is not used after ProcessVertexData*() returns
// - g_normal - same
-#define PV_NEON_ENABLE_LIGHT (1 << 0)
-#define PV_NEON_ENABLE_SHADE (1 << 1)
-#define PV_NEON_ENABLE_FOG (1 << 2)
-#define PV_NEON_FOG_ALPHA (1 << 3)
-
int neon_state = 0;
if ( gRSP.bLightingEnable )
neon_state |= PV_NEON_ENABLE_LIGHT;
// - g_vtxTransformed[i]
// - g_dwVtxDifColor[i] -> vertex color
// - g_fVtxTxtCoords[i] -> vertex texture cooridinates
- // - g_fFogCoord[i]
+ // - g_fFogCoord[i] -> unused
// - g_clipFlag2[i]
const FiddledVtx * pVtxBase = (const FiddledVtx*)(g_pRDRAMu8 + dwAddr);
g_pVtxBase = (FiddledVtx *)pVtxBase;
+ // pv_neon multiplies the normal by whole rows of gRSPmodelViewTop, so the 4th
+ // column is zeroed to keep the .w lane of the result at 0.
+ // NOTE(review): this overwrites the global matrix itself — confirm nothing else
+ // depends on _14/_24/_34.
+ gRSPmodelViewTop._14 = gRSPmodelViewTop._24 =
+ gRSPmodelViewTop._34 = 0;
+
// SP_Timing(RSP_GBI0_Vtx);
status.SPCycleCount += Timing_RSP_GBI0_Vtx * dwNum;
- if (!(neon_state & (PV_NEON_ENABLE_LIGHT | PV_NEON_ENABLE_SHADE))) {
- for (i = dwV0; i < dwV0 + dwNum; i++)
- g_dwVtxDifColor[i] = gRDP.primitiveColor; // FLAT shade
- }
+
+ // The assembly routine does the whole batch; the #else branch is the C version of
+ // the same loop and is not compiled while this is "#if 1".
+ // NOTE(review): pv_neon works on vertex pairs. For an odd dwNum it still loads one
+ // vertex past the batch and stores one extra g_clipFlag2 / g_dwVtxDifColor entry —
+ // confirm the buffers have room for that.
+#if 1
+ i = dwV0;
+ pv_neon(&g_vtxTransformed[i], &g_vecProjected[i],
+ &g_dwVtxDifColor[i], &g_fVtxTxtCoords[i],
+ &g_fFogCoord[i], &g_clipFlag2[i],
+ dwNum, neon_state, &pVtxBase[i - dwV0],
+ gRSPlights, gRSP.fAmbientColors,
+ &gRSPworldProject, &gRSPmodelViewTop,
+ gRSPnumLights, gRSPfFogMin,
+ gRDP.primitiveColor, gRDP.primitiveColor);
+#else
for (i = dwV0; i < dwV0 + dwNum; i++)
{
g_vecProjected[i].y = g_vtxTransformed[i].y * g_vecProjected[i].w;
g_vecProjected[i].z = g_vtxTransformed[i].z * g_vecProjected[i].w;
- if( neon_state & PV_NEON_ENABLE_FOG )
- {
- g_fFogCoord[i] = g_vecProjected[i].z;
- if( g_vecProjected[i].w < 0 || g_vecProjected[i].z < 0 || g_fFogCoord[i] < gRSPfFogMin )
- g_fFogCoord[i] = gRSPfFogMin;
- }
-
// RSP_Vtx_Clipping(i);
g_clipFlag2[i] = 0;
if( g_vecProjected[i].w > 0 )
color.r = vert.rgba.b;
color.a = vert.rgba.a;
}
+ else
+ g_dwVtxDifColor[i] = gRDP.primitiveColor; // FLAT shade
// ReplaceAlphaWithFogFactor(i);
if( neon_state & PV_NEON_FOG_ALPHA )
// Use fog factor to replace vertex alpha
if( g_vecProjected[i].z > 1 )
*(((uint8*)&(g_dwVtxDifColor[i]))+3) = 0xFF;
- if( g_vecProjected[i].z < 0 )
+ // missing 'else' in original code??
+ else if( g_vecProjected[i].z < 0 )
*(((uint8*)&(g_dwVtxDifColor[i]))+3) = 0;
else
*(((uint8*)&(g_dwVtxDifColor[i]))+3) = (uint8)(g_vecProjected[i].z*255);
g_fVtxTxtCoords[i].x = (float)vert.tu;
g_fVtxTxtCoords[i].y = (float)vert.tv;
}
+#endif
}
bool PrepareTriangle(uint32 dwV0, uint32 dwV1, uint32 dwV2)
InitVertex(dwV1, gRSP.numVertices+1, textureFlag, openGL);
InitVertex(dwV2, gRSP.numVertices+2, textureFlag, openGL);
+ // Texture LOD handling, done here after the InitVertex calls and only while
+ // gRSP.numVertices is still 0, i.e. before this triangle is counted.
+ if( __builtin_expect(gRSP.numVertices == 0 && g_curRomInfo.bEnableTxtLOD && gRDP.otherMode.text_lod, 0) )
+ {
+ if( CRender::g_pRender->IsTexel1Enable() && CRender::g_pRender->m_pColorCombiner->m_pDecodedMux->isUsed(MUX_LODFRAC) )
+ {
+ ComputeLOD(openGL);
+ }
+ else
+ {
+ gRDP.LODFrac = 0;
+ }
+ }
+
gRSP.numVertices += 3;
status.dwNumTrisRendered++;
}
// method doesnt' work well when the z value is outside of screenspace
//if (v0.z < 1 && v1.z < 1 && v2.z < 1)
{
+#ifndef __ARM_NEON__
float V1 = v2.x - v0.x;
float V2 = v2.y - v0.y;
float fDirection = (V1 * W2) - (V2 * W1);
fDirection = fDirection * v1.w * v2.w * v0.w;
//float fDirection = v0.x*v1.y-v1.x*v0.y+v1.x*v2.y-v2.x*v1.y+v2.x*v0.y-v0.x*v2.y;
+#else
+ // really returns float, but we only need sign
+ // NOTE(review): the float bits are compared as an int, so a result of -0.0f counts
+ // as negative here, unlike the float comparison in the non-NEON path.
+ int fDirection = tv_direction(&v0, &v1, &v2);
+#endif
if (fDirection < 0 && gRSP.bCullBack)
{
void ProcessVertexDataSSE(uint32 dwAddr, uint32 dwV0, uint32 dwNum);
#endif
void ProcessVertexDataNoSSE(uint32 dwAddr, uint32 dwV0, uint32 dwNum);
+// NEON variant; same signature as the other ProcessVertexData* functions.
+void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum);
void ProcessVertexDataExternal(uint32 dwAddr, uint32 dwV0, uint32 dwNum);
void SetPrimitiveColor(uint32 dwCol, uint32 LODMin, uint32 LODFrac);
void SetPrimitiveDepth(uint32 z, uint32 dwDZ);
--- /dev/null
+/*
+ * (C) Gražvydas "notaz" Ignotas, 2014
+ *
+ * This work is licensed under the terms of GNU GPL version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "arm_features.h"
+#include "RenderBase_neon.h"
+
+.syntax unified
+.text
+.align 3
+
+/*
+ * ProcessVertexData register map:
+ *
+ * q | d | c code
+ * ...
+ * 12 24 gRSPworldProject _11,_12,_13,_14
+ * 25
+ * 13 26 gRSPworldProject _21,_22,_23,_24
+ * 27
+ * 14 28 gRSPworldProject _31,_32,_33,_34
+ * 29
+ * 15 30 gRSPworldProject _41,_42,_43,_44
+ * 31
+ *
+ * r4 vtx[], 16 bytes:
+ * short y, x, flag, z, tv, tu;
+ * / uint8 a, b, g, r;
+ * \ char a, z, y, x;
+ *
+ * outputs:
+ * r0 - XVECTOR4 *g_vtxTransformed
+ * r1 - XVECTOR4 *g_vecProjected
+ * r2 - uint32 *g_dwVtxDifColor
+ * r3 - VECTOR2 *g_fVtxTxtCoords
+ * sp+00 - float *g_fFogCoord
+ * r6 sp+04 - uint32 *g_clipFlag2
+ * inputs:
+ * r11 sp+08 - uint32 dwNum
+ * r10 sp+0c - int neon_flags
+ * r4 sp+10 - FiddledVtx vtx[], (r4 [0], r5 [1])
+ * r7 sp+14 - Light *gRSPlights
+ * sp+18 - float *fRSPAmbientLightRGBA
+ * sp+1c - XMATRIX *gRSPworldProject
+ * sp+20 - XMATRIX *gRSPmodelViewTop
+ * sp+24 - uint32 gRSPnumLights
+ * sp+28 - float gRSPfFogMin
+ * sp+2c - uint32 primitiveColor (for vertex [0])
+ * sp+30 - uint32 primitiveColor (same value again, for vertex [1]; both are loaded with one vldr)
+ */
+/*
+ * pv_neon: per-vertex work for ProcessVertexDataNEON, two vertices per loop
+ * iteration ([0] is read through r4, [1] through r5).
+ * Per pair: transform by gRSPworldProject, multiply by 1/w, build the clip
+ * flags, then pick the color (lighting, vertex color or primitiveColor) and
+ * optionally replace alpha with the fog factor.
+ *
+ * NOTE(review): stack arguments are read at fixed offsets, with gRSPfFogMin
+ * occupying the slot at sp+28. That layout holds only if float arguments are
+ * passed like integers (soft-float style argument passing); confirm.
+ * NOTE(review): when dwNum is odd, the second vertex of the last pair is
+ * still loaded, and g_clipFlag2 / g_dwVtxDifColor still receive two entries.
+ */
+FUNCTION(pv_neon):
+ ldr r12, [sp, #0x10]
+ pld [r12]
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ @ 0x64 = bytes pushed above: 9 core registers (0x24) + q4-q7 (0x40)
+ mov r4, r12 @ vtx
+ ldr r12, [sp, #0x64+0x1c]
+ vld1.32 {q12,q13}, [r12, :128]! @ load gRSPworldProject
+ vld1.32 {q14,q15}, [r12, :128]
+ ldr r6, [sp, #0x64+0x04] @ g_clipFlag2
+ add r5, r4, #16 @ vtx + 1
+ ldr r11, [sp, #0x64+0x08] @ dwNum
+ ldr r10, [sp, #0x64+0x0c] @ neon_flags
+
+ @ main loop: one pass handles vertices [0] and [1]
+0:
+ vld1.16 d12, [r4]! @ vtx[0] .z .flag .x .y (reg)
+ vmovl.s16 q6, d12
+ vld1.16 d14, [r5]! @ vtx[1] .z .flag .x .y
+ vmovl.s16 q7, d14
+ vcvt.f32.s32 q6, q6 @ q6 = vtx_raw0
+ vcvt.f32.s32 q7, q7 @ q7 = vtx_raw1
+ vdup.32 q0, d12[1] @ vtx_raw0.x (dup)
+ vdup.32 q1, d12[0] @ vtx_raw0.y (dup)
+ vdup.32 q2, d13[1] @ vtx_raw0.z (dup)
+ vdup.32 q3, d14[1] @ vtx_raw1.x (dup)
+ vdup.32 q4, d14[0] @ vtx_raw1.y (dup)
+ vdup.32 q5, d15[1] @ vtx_raw1.z (dup)
+ /* note: order of operations matters greatly,
+ * may cause like 20 fraction bits to differ! */
+ vmul.f32 q0, q0, q12
+ vmul.f32 q3, q3, q12
+ vmla.f32 q0, q1, q13
+ vmla.f32 q3, q4, q13
+ vmul.f32 q2, q2, q14 @ yes, mul+add is
+ vmul.f32 q5, q5, q14 @ faster than mla
+ vadd.f32 q0, q2
+ vadd.f32 q3, q5
+ vadd.f32 q0, q15 @ q0 = g_vtxTransformed[i]
+ vadd.f32 q3, q15 @ q3 = g_vtxTransformed[i + 1]
+
+ vld1.16 d16[1], [r4]! @ [0].v
+ vmov d2, d1
+ vld1.16 d16[0], [r4]! @ [0].u
+ vsri.64 d2, d7, #32
+ vld1.16 d18[1], [r5]! @ [1].v
+#if 1
+ vrecpe.f32 d4, d2 @ inv [0][1] .w
+ vld1.16 d18[0], [r5]! @ [1].u
+ vrecps.f32 d5, d2, d4 @ step
+ vmovl.s16 q8, d16
+ /* g_vtxTransformed[0] */ vst1.32 {q0}, [r0, :128]!
+ vmovl.s16 q9, d18
+ vcvt.f32.s32 d16, d16
+ vcvt.f32.s32 d18, d18
+ vmul.f32 d4, d5, d4 @ better inv
+ bic r9, r5, #63
+ pld [r9, #64]
+ vrecps.f32 d5, d2, d4 @ step
+ cmp r11, #1
+ /* u,v g_fVtxTxtCoords[0] */ vst1.32 {d16}, [r3]!
+ @ only one vertex left: skip the stores for [1]
+ beq 99f
+ /* g_vtxTransformed[1] */ vst1.32 {q3}, [r0, :128]!
+ /* ... [1] */ vst1.32 {d18}, [r3]!
+ 99:
+ vmov.f32 d20, #1.0
+ vmov.f32 d21, #-1.0
+ vmul.f32 d4, d5, d4 @ better inv [0][1] .w
+ #if 0
+ vrecps.f32 d5, d2, d4 @ step
+ vmul.f32 d4, d5, d4 @ better inv
+ #endif
+#else
+ mov r12, #0x3f800000 @ 1.0f
+ vmov.f32 s6, r12
+ vdiv.f32 s8, s6, s4
+ vdiv.f32 s9, s6, s5
+ #error incomplete
+#endif
+
+ mov r8, #X_CLIP_MAX
+ mov r9, #Y_CLIP_MAX
+ vmov d22, r8, r9
+ vmul.f32 q0, q0, d4[1] @ .x .y .z .w *= [0] .w
+ vmul.f32 q1, q3, d4[0]
+ vshr.u64 d5, d4, #32 @ [0] .w
+ mov r8, #X_CLIP_MIN
+ mov r9, #Y_CLIP_MIN
+ vmov d23, r8, r9
+ vsli.64 d3, d4, #32 @ insert [1] .w
+ vsli.64 d1, d5, #32
+ vsli.u64 d5, d4, #32 @ [0] [1] .w
+ vcgt.f32 d6, d0, d20 @ .xy > 1.0?
+ vcgt.f32 d7, d21, d0
+ vcgt.f32 d4, d5, #0 @ .w > 0?
+ vst1.32 {q0}, [r1]! @ g_vecProjected[0]
+ vcgt.f32 d8, d2, d20
+ vcgt.f32 d9, d21, d2
+ vld1.32 d0[0], [r4]! @ mem: [0] .azyx
+ vand q3, q11
+ vand q4, q11
+ cmp r11, #1
+ beq 99f
+ vst1.32 {q1}, [r1]! @ g_vecProjected[1]
+99:
+ vorr d6, d6, d7
+ vorr d7, d8, d9
+ vld1.32 d0[1], [r5]! @ mem: [1] .azyx
+ vpadd.u32 d6, d7
+ vrev32.8 d0, d0 @ make 0xaazzyyxx [1][0]
+ vsli.u64 d1, d3, #32 @ d1 = [1] [0] .z
+ vmovl.s8 q4, d0
+ vand d6, d4
+ vmovl.s16 q1, d8
+ vmovl.s16 q2, d9
+ @ both clip flag words are stored, whether or not [1] is a real vertex
+ vst1.32 {d6}, [r6]! @ g_clipFlag2
+
+ tst r10, #PV_NEON_ENABLE_LIGHT
+ beq pv_neon_no_light
+@ pv_neon_light:
+ @ live NEON registers:
+ @ d1 = [1][0] .z (must preserve)
+ @ q1,q2 = azyx [1][0]
+ @ q12+ = gRSPworldProject
+ ldr r12, [sp, #0x64+0x20]
+ vcvt.f32.s32 q1, q1
+ vcvt.f32.s32 q2, q2
+ vld1.32 {q8,q9}, [r12, :128]! @ load gRSPmodelViewTop
+ vld1.32 {q10}, [r12, :128]
+
+ vdup.32 q5, d4[0] @ [1] .x (dup)
+ vdup.32 q6, d4[1] @ [1] .y (dup)
+ vdup.32 q7, d5[0] @ [1] .z (dup)
+ vdup.32 q2, d2[0] @ [0] .x (dup)
+ vdup.32 q3, d2[1] @ [0] .y (dup)
+ vdup.32 q4, d3[0] @ [0] .z (dup)
+ vmul.f32 q2, q2, q8
+ vmul.f32 q5, q5, q8
+ vmla.f32 q2, q3, q9
+ vmla.f32 q5, q6, q9
+ vmul.f32 q4, q4, q10
+ vmul.f32 q7, q7, q10
+ vadd.f32 q4, q2 @ q4 = temp[0] .xyz0
+ vadd.f32 q5, q7 @ q5 = temp[1] .xyz0
+ vmul.f32 q2, q4, q4 @ temp .xyz0 ^2
+ vmul.f32 q3, q5, q5
+ vpadd.f32 d2, d4, d5
+ vpadd.f32 d3, d6, d7
+ movw r8, #0x0000ffff
+ movt r8, #0x7f7f @ max normal float, ~3.4e+38
+ vdup.32 d4, r8
+ vpadd.f32 d2, d2, d3 @ d2 = [1][0] x^2 + y^2 + z^2
+ vcgt.f32 d5, d2, #0
+ vbif d2, d4, d5 @ if (d2 == 0) d2 = MAXFLOAT
+
+ vrsqrte.f32 d3, d2 @ ~ 1/sqrt(d2), d2 = [1][0] .sqrsum
+ vmul.f32 d4, d3, d2
+ ldr r9, [sp, #0x64+0x18] @ &fRSPAmbientLightRGBA
+ ldr r7, [sp, #0x64+0x14] @ gRSPlights
+ ldr r8, [sp, #0x64+0x24] @ gRSPnumLights
+ vrsqrts.f32 d4, d3, d4 @ step
+ vld1.32 {q6}, [r9] @ rgb
+ vld1.32 {q7}, [r9] @ rgb
+ vmul.f32 d3, d3, d4 @ 1/sqrt(d2)
+#if 0 /* not necessary? */
+ vmul.f32 d4, d3, d2
+ vrsqrts.f32 d4, d3, d4 @ step
+ vmul.f32 d3, d3, d4 @ 1/sqrt(d2)
+#endif
+ vmul.f32 q2, q4, d3[0] @ q2 = normal[0] .xyz
+ vmul.f32 q3, q5, d3[1] @ q3 = normal[1] .xyz
+
+ @ light loop, r8 counts down from gRSPnumLights. The test is at the bottom,
+ @ so the body runs once even if the count is 0.
+ @ NOTE(review): confirm the count is never 0 when lighting is enabled.
+1:
+ vld1.32 {q8}, [r7]
+ vmul.f32 q4, q8, q2 @ gRSPlights[l] * normal
+ vmul.f32 q5, q8, q3
+ vpadd.f32 d8, d8, d9
+ vpadd.f32 d10, d10, d11
+ vpadd.f32 d8, d8, d10 @ d8 = [1][0] fCosT
+ vcgt.f32 d9, d8, #0 @ if (!(fCosT > 0))
+ vand d8, d9 @ fCosT = 0
+ add r9, r7, #OFFSETOF_Light_fr
+ vld1.32 {q8}, [r9] @ .fr .fg .fb
+ vdup.32 q5, d8[1] @ [1] fCosT (dup)
+ vdup.32 q4, d8[0] @
+ vmla.f32 q7, q8, q5 @ .rgb += frgb * fCosT
+ vmla.f32 q6, q8, q4
+ add r7, #SIZEOF_Light
+ subs r8, #1
+ bgt 1b
+
+ @ movt writes only the upper half of r8; the lower half is the 0 left by the
+ @ loop counter. NOTE(review): with a light count of 0 the counter ends at -1
+ @ and the constant becomes 0x437fffff instead of 255.0.
+ movt r8, #0x437f @ float 255
+ vdup.32 q8, r8
+ vcgt.f32 q4, q6, q8 @ if (.rgb > 255)
+ vcgt.f32 q5, q7, q8
+ vbit q6, q8, q4 @ .rgb = 255
+ vbit q7, q8, q5
+ vcvt.u32.f32 q6, q6
+ vcvt.u32.f32 q7, q7
+ ldrb r8, [r4, #-4] @ .a from vtx
+ ldrb r9, [r5, #-4]
+ vext.32 q4, q6, q6, #3 @ reg: .abgr -> .bgra
+ vext.32 q5, q7, q7, #3
+ vmov.32 d8[0], r8 @ use .a from input
+ vmov.32 d10[0], r9
+ vmovn.u32 d8, q4
+ vmovn.u32 d10, q5
+ vmovn.u16 d0, q4
+ vmovn.u16 d2, q5
+ vsli.u64 d0, d2, #32
+ vrev32.8 d0, d0 @ 0xbbggrraa -> 0xaarrggbb
+ b pv_neon_fog_alpha
+
+pv_neon_no_light:
+ tst r10, #PV_NEON_ENABLE_SHADE
+ @ default color: both primitiveColor words with one 64-bit load (flags from tst are kept)
+ vldr d0, [sp, #0x64+0x2c] @ primitiveColor [0] [1]
+ beq pv_neon_fog_alpha
+ @ easier to do with ARM
+ ldr r8, [r4, #-4]
+ ldr r9, [r5, #-4]
+ ror r8, #8 @ mem: .argb -> .rgba
+ ror r9, #8 @ reg: 0xbbggrraa -> ..
+ vmov d0, r8, r9
+
+pv_neon_fog_alpha:
+ tst r10, #PV_NEON_FOG_ALPHA
+ beq pv_neon_next
+ @ alpha = clamp(z, 0, 1) scaled into the top byte of each color word
+ vmov.f32 d20, #1.0
+ vcgt.f32 d2, d1, d20 @ [0] [1] .z > 1.0?
+ vcgt.f32 d3, d1, #0 @ > 0?
+ movw r8, #0
+ movt r8, #0x4f7f @ r8 = (float)(255<<24)
+ vbit d1, d20, d2 @ make 1.0 if needed
+ vand d1, d3
+ vdup.32 d4, r8
+ vmul.f32 d1, d1, d4
+ vcvt.u32.f32 d1, d1
+ vmov.u32 d5, #0xff000000
+ vbit d0, d1, d5
+
+pv_neon_next:
+ subs r11, #2
+ @ both color words are stored, whether or not [1] is a real vertex
+ vst1.32 {d0}, [r2]! @ g_dwVtxDifColor
+ @ r4/r5 each advanced 16 bytes inside the loop body; skip the other vertex of the pair
+ add r4, #16
+ add r5, #16
+ bgt 0b
+ nop
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+ .size pv_neon, .-pv_neon
+
+
+@ (float *d, const float *m1, const float *m2, const float *s)
+@ d[0..1] = m1[0..1] * m2[0..1] - s[0..1]; every pointer addresses two floats.
+@ Only d0-d3 are used, nothing is saved or restored.
+FUNCTION(multiply_subtract2):
+ vld1.32 {d1}, [r1]
+ vld1.32 {d2}, [r2]
+ vmul.f32 d0, d1, d2
+ vld1.32 {d3}, [r3]
+ vsub.f32 d0, d3
+ vst1.32 {d0}, [r0]
+ bx lr
+ .size multiply_subtract2, .-multiply_subtract2
+
+
+@ (const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2)
+@ Computes (V1*W2 - V2*W1) * v0.w * v1.w * v2.w with V = v2.xy - v0.xy and
+@ W = v2.xy - v1.xy, and returns the raw float bits in r0.
+@ Only d0-d7 are used, nothing is saved or restored.
+FUNCTION(tv_direction):
+ vld1.32 {q0}, [r0]
+ vld1.32 {q2}, [r2]
+ vld1.32 {q1}, [r1]
+ vsub.f32 d6, d4, d0 @ d6 = V2,V1
+ vsub.f32 d7, d4, d2 @ d7 = W2,W1
+ vmul.f32 d1, d5 @ d1 = v0.w * v2.w
+ vrev64.32 d7, d7
+ vmul.f32 d6, d7 @ d6 = V2*W1,V1*W2
+ vmul.f32 d1, d3 @ d1 *= v1.w
+ vshr.u64 d7, d6, #32
+ vsub.f32 d6, d7 @ d6[0] = V1*W2 - V2*W1
+ vshr.u64 d1, d1, #32
+ vmul.f32 d0, d1, d6
+ vmov.32 r0, d0[0]
+ bx lr
+ .size tv_direction, .-tv_direction
+
+
+@ vim:filetype=armasm:expandtab
--- /dev/null
+
+/*
+ * Constants shared between the C++ caller and the NEON assembly.
+ * This header is included from a .S file as well, so it contains only
+ * preprocessor definitions and uses only block comments.
+ */
+#ifndef RENDERBASE_NEON_H
+#define RENDERBASE_NEON_H
+
+/* bits of the neon_state argument of pv_neon */
+#define PV_NEON_ENABLE_LIGHT (1 << 0)
+#define PV_NEON_ENABLE_SHADE (1 << 1)
+#define PV_NEON_ENABLE_FOG (1 << 2) /* unused */
+#define PV_NEON_FOG_ALPHA (1 << 3)
+
+/* clip flag bits written to g_clipFlag2 by pv_neon */
+#define X_CLIP_MAX 0x1
+#define X_CLIP_MIN 0x2
+#define Y_CLIP_MAX 0x4
+#define Y_CLIP_MIN 0x8
+
+/*
+ * Layout of the Light structure as used by the assembly.
+ * NOTE(review): these are hard-coded and not checked against the C++ struct
+ * at compile time; confirm they match its definition.
+ */
+#define OFFSETOF_Light_fr 0x14
+#define SIZEOF_Light 0x44
+
+#endif /* RENDERBASE_NEON_H */
--- /dev/null
+/*
+ * ARM feature detection and assembler helper macros for .S files.
+ * Defines HAVE_ARMV5 / HAVE_ARMV6 / HAVE_ARMV7 from the compiler's
+ * __ARM_ARCH_* macros, plus ESYM() and FUNCTION() for declaring global symbols.
+ */
+#ifndef __ARM_FEATURES_H__
+#define __ARM_FEATURES_H__
+
+/* Each architecture level also defines the HAVE_* macros of the older levels. */
+#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+ || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+ || defined(__ARM_ARCH_7EM__)
+
+#define HAVE_ARMV7
+#define HAVE_ARMV6
+#define HAVE_ARMV5
+
+#elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
+ || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__)
+
+#define HAVE_ARMV6
+#define HAVE_ARMV5
+
+#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5E__) \
+ || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+
+#define HAVE_ARMV5
+
+#endif
+
+/* no need for HAVE_NEON - GCC defines __ARM_NEON__ consistently */
+
+/* global function/external symbol */
+/*
+ * FUNCTION(name) is written as "FUNCTION(name):" - the macro ends in a label
+ * name and the user supplies the colon.
+ * Without __MACH__ it emits .globl and .type for name.
+ * With __MACH__ it exports the underscore-prefixed symbol and defines both
+ * the plain and the prefixed label.
+ */
+#ifndef __MACH__
+#define ESYM(name) name
+
+#define FUNCTION(name) \
+ .globl name; \
+ .type name, %function; \
+ name
+
+/* expands to nothing here; a register-list fragment in the __MACH__ case */
+#define EXTRA_UNSAVED_REGS
+
+#else
+#define ESYM(name) _##name
+
+#define FUNCTION(name) \
+ .globl ESYM(name); \
+ name: \
+ ESYM(name)
+
+// r7 is preserved, but add it for EABI alignment..
+#define EXTRA_UNSAVED_REGS r7, r9,
+
+#endif
+
+#endif /* __ARM_FEATURES_H__ */