From 48d77f736bea02afeb362cff05c81375752b3015 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Wed, 2 Jul 2014 23:34:32 +0200 Subject: [PATCH] RICE: Copy of Notaz optim to GLES1.1 version --- source/rice_gles/projects/unix/Makefile | 9 + source/rice_gles/src/Render.cpp | 47 ++-- source/rice_gles/src/Render.h | 2 + source/rice_gles/src/RenderBase.cpp | 243 ++++++++++------- source/rice_gles/src/RenderBase.h | 1 + source/rice_gles/src/RenderBase_neon.S | 339 ++++++++++++++++++++++++ source/rice_gles/src/RenderBase_neon.h | 13 + source/rice_gles/src/arm_features.h | 52 ++++ 8 files changed, 592 insertions(+), 114 deletions(-) create mode 100644 source/rice_gles/src/RenderBase_neon.S create mode 100644 source/rice_gles/src/RenderBase_neon.h create mode 100644 source/rice_gles/src/arm_features.h diff --git a/source/rice_gles/projects/unix/Makefile b/source/rice_gles/projects/unix/Makefile index 466f183..edcdcea 100755 --- a/source/rice_gles/projects/unix/Makefile +++ b/source/rice_gles/projects/unix/Makefile @@ -363,9 +363,15 @@ SOURCE += \ $(SRCDIR)/osal_files_unix.c endif +ifeq ($(CPU),ARM) +SOURCE += \ + $(SRCDIR)/RenderBase_neon.S +endif + # generate a list of object files build, make a temporary directory for them OBJECTS := $(patsubst $(SRCDIR)/%.c, $(OBJDIR)/%.o, $(filter %.c, $(SOURCE))) OBJECTS += $(patsubst $(SRCDIR)/%.cpp, $(OBJDIR)/%.o, $(filter %.cpp, $(SOURCE))) +OBJECTS += $(patsubst $(SRCDIR)/%.S, $(OBJDIR)/%.o, $(filter %.S, $(SOURCE))) OBJDIRS = $(dir $(OBJECTS)) $(shell $(MKDIR) $(OBJDIRS)) @@ -425,6 +431,9 @@ CXXFLAGS += $(CFLAGS) $(OBJDIR)/%.o: $(SRCDIR)/%.c $(COMPILE.c) -o $@ $< +$(OBJDIR)/%.o: $(SRCDIR)/%.S + $(COMPILE.c) -o $@ $< + $(OBJDIR)/%.o: $(SRCDIR)/%.cpp $(COMPILE.cc) -o $@ $< diff --git a/source/rice_gles/src/Render.cpp b/source/rice_gles/src/Render.cpp index 8359ee4..68e287f 100755 --- a/source/rice_gles/src/Render.cpp +++ b/source/rice_gles/src/Render.cpp @@ -1677,10 +1677,12 @@ void CRender::SaveTextureToFile(int tex, TextureChannel 
channel, bool bShow) #endif extern RenderTextureInfo gRenderTextureInfos[]; -void SetVertexTextureUVCoord(TexCord &dst, float s, float t, int tile, TxtrCacheEntry *pEntry) +void SetVertexTextureUVCoord(TexCord &dst, const TexCord &src, int tile, TxtrCacheEntry *pEntry) { RenderTexture &txtr = g_textures[tile]; RenderTextureInfo &info = gRenderTextureInfos[pEntry->txtrBufIdx-1]; + float s = src.u; + float t = src.v; uint32 addrOffset = g_TI.dwAddr-info.CI_Info.dwAddr; uint32 extraTop = (addrOffset>>(info.CI_Info.dwSize-1)) /info.CI_Info.dwWidth; @@ -1700,21 +1702,29 @@ void SetVertexTextureUVCoord(TexCord &dst, float s, float t, int tile, TxtrCache dst.v = t; } -void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T) +void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, const TexCord &fTex0) { RenderTexture &txtr = g_textures[0]; if( txtr.pTextureEntry && txtr.pTextureEntry->txtrBufIdx > 0 ) { - ::SetVertexTextureUVCoord(v.tcord[0], fTex0S, fTex0T, 0, txtr.pTextureEntry); + ::SetVertexTextureUVCoord(v.tcord[0], fTex0, 0, txtr.pTextureEntry); } else { - v.tcord[0].u = fTex0S; - v.tcord[0].v = fTex0T; + v.tcord[0] = fTex0; } } -void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T, float fTex1S, float fTex1T) +void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T) +{ + TexCord t = { fTex0S, fTex0T }; + SetVertexTextureUVCoord(v, t); +} + +void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, const TexCord &fTex0_, const TexCord &fTex1_) { + TexCord fTex0 = fTex0_; + TexCord fTex1 = fTex1_; + if( (options.enableHackForGames == HACK_FOR_ZELDA||options.enableHackForGames == HACK_FOR_ZELDA_MM) && m_Mux == 0x00262a60150c937fLL && gRSP.curTile == 0 ) { // Hack for Zelda Sun @@ -1724,36 +1734,41 @@ void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T, t1.dwFormat == TXT_FMT_I && t1.dwSize == TXT_SIZE_8b && t1.dwWidth == 64 && t0.dwHeight == t1.dwHeight ) { - fTex0S /= 2; - 
fTex0T /= 2; - fTex1S /= 2; - fTex1T /= 2; + fTex0.u /= 2; + fTex0.v /= 2; + fTex1.u /= 2; + fTex1.v /= 2; } } RenderTexture &txtr0 = g_textures[0]; if( txtr0.pTextureEntry && txtr0.pTextureEntry->txtrBufIdx > 0 ) { - ::SetVertexTextureUVCoord(v.tcord[0], fTex0S, fTex0T, 0, txtr0.pTextureEntry); + ::SetVertexTextureUVCoord(v.tcord[0], fTex0, 0, txtr0.pTextureEntry); } else { - v.tcord[0].u = fTex0S; - v.tcord[0].v = fTex0T; + v.tcord[0] = fTex0; } RenderTexture &txtr1 = g_textures[1]; if( txtr1.pTextureEntry && txtr1.pTextureEntry->txtrBufIdx > 0 ) { - ::SetVertexTextureUVCoord(v.tcord[1], fTex1S, fTex1T, 1, txtr1.pTextureEntry); + ::SetVertexTextureUVCoord(v.tcord[1], fTex1, 1, txtr1.pTextureEntry); } else { - v.tcord[1].u = fTex1S; - v.tcord[1].v = fTex1T; + v.tcord[1] = fTex1; } } +void CRender::SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T, float fTex1S, float fTex1T) +{ + TexCord t0 = { fTex0S, fTex0T }; + TexCord t1 = { fTex1S, fTex1T }; + SetVertexTextureUVCoord(v, t0, t1); +} + void CRender::SetClipRatio(uint32 type, uint32 w1) { bool modified = false; diff --git a/source/rice_gles/src/Render.h b/source/rice_gles/src/Render.h index 9ae2849..02e7b3b 100644 --- a/source/rice_gles/src/Render.h +++ b/source/rice_gles/src/Render.h @@ -168,6 +168,8 @@ public: void SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T, float fTex1S, float fTex1T); void SetVertexTextureUVCoord(TLITVERTEX &v, float fTex0S, float fTex0T); + void SetVertexTextureUVCoord(TLITVERTEX &v, const TexCord &fTex0, const TexCord &fTex1); + void SetVertexTextureUVCoord(TLITVERTEX &v, const TexCord &fTex0); virtual COLOR PostProcessDiffuseColor(COLOR curDiffuseColor)=0; virtual COLOR PostProcessSpecularColor()=0; diff --git a/source/rice_gles/src/RenderBase.cpp b/source/rice_gles/src/RenderBase.cpp index 045041b..eabae7a 100755 --- a/source/rice_gles/src/RenderBase.cpp +++ b/source/rice_gles/src/RenderBase.cpp @@ -917,11 +917,91 @@ void ComputeLOD(bool openGL) 
bool bHalfTxtScale=false; extern uint32 lastSetTile; +#define noinline __attribute__((noinline)) + +static noinline void InitVertex_scale_hack_check(uint32 dwV) +{ + // Check for txt scale hack + if( gRDP.tiles[lastSetTile].dwSize == TXT_SIZE_32b || gRDP.tiles[lastSetTile].dwSize == TXT_SIZE_4b ) + { + int width = ((gRDP.tiles[lastSetTile].sh-gRDP.tiles[lastSetTile].sl+1)<<1); + int height = ((gRDP.tiles[lastSetTile].th-gRDP.tiles[lastSetTile].tl+1)<<1); + if( g_fVtxTxtCoords[dwV].x*gRSP.fTexScaleX == width || g_fVtxTxtCoords[dwV].y*gRSP.fTexScaleY == height ) + { + bHalfTxtScale=true; + } + } +} + +static noinline void InitVertex_notopengl_or_clipper_adjust(TLITVERTEX &v, uint32 dwV) +{ + v.x = g_vecProjected[dwV].x*gRSP.vtxXMul+gRSP.vtxXAdd; + v.y = g_vecProjected[dwV].y*gRSP.vtxYMul+gRSP.vtxYAdd; + v.z = (g_vecProjected[dwV].z + 1.0f) * 0.5f; // DirectX minZ=0, maxZ=1 + //v.z = g_vecProjected[dwV].z; // DirectX minZ=0, maxZ=1 + v.rhw = g_vecProjected[dwV].w; + VTX_DUMP(TRACE4(" Proj : x=%f, y=%f, z=%f, rhw=%f", v.x,v.y,v.z,v.rhw)); + + if( gRSP.bProcessSpecularColor ) + { + v.dcSpecular = CRender::g_pRender->PostProcessSpecularColor(); + if( gRSP.bFogEnabled ) + { + v.dcSpecular &= 0x00FFFFFF; + uint32 fogFct = 0xFF-(uint8)((g_fFogCoord[dwV]-gRSPfFogMin)*gRSPfFogDivider); + v.dcSpecular |= (fogFct<<24); + } + } + else if( gRSP.bFogEnabled ) + { + uint32 fogFct = 0xFF-(uint8)((g_fFogCoord[dwV]-gRSPfFogMin)*gRSPfFogDivider); + v.dcSpecular = (fogFct<<24); + } +} + +static noinline void InitVertex_texgen_correct(TLITVERTEX &v, uint32 dwV) +{ + // Correction for texGen result + float u0,u1,v0,v1; + RenderTexture &tex0 = g_textures[gRSP.curTile]; + u0 = g_fVtxTxtCoords[dwV].x * 32 * 1024 * gRSP.fTexScaleX / tex0.m_fTexWidth; + v0 = g_fVtxTxtCoords[dwV].y * 32 * 1024 * gRSP.fTexScaleY / tex0.m_fTexHeight; + u0 *= (gRDP.tiles[gRSP.curTile].fShiftScaleS); + v0 *= (gRDP.tiles[gRSP.curTile].fShiftScaleT); + + if( CRender::g_pRender->IsTexel1Enable() ) + { + RenderTexture 
&tex1 = g_textures[(gRSP.curTile+1)&7]; + u1 = g_fVtxTxtCoords[dwV].x * 32 * 1024 * gRSP.fTexScaleX / tex1.m_fTexWidth; + v1 = g_fVtxTxtCoords[dwV].y * 32 * 1024 * gRSP.fTexScaleY / tex1.m_fTexHeight; + u1 *= gRDP.tiles[(gRSP.curTile+1)&7].fShiftScaleS; + v1 *= gRDP.tiles[(gRSP.curTile+1)&7].fShiftScaleT; + CRender::g_pRender->SetVertexTextureUVCoord(v, u0, v0, u1, v1); + } + else + { + CRender::g_pRender->SetVertexTextureUVCoord(v, u0, v0); + } +} + +#ifndef __ARM_NEON__ +static void multiply_subtract2(float *d, const float *m1, const float *m2, const float *s) +{ + int i; + for (i = 0; i < 2; i++) + d[i] = m1[i] * m2[i] - s[i]; +} +#else +extern "C" void multiply_subtract2(float *d, const float *m1, const float *m2, const float *s); +#endif void InitVertex(uint32 dwV, uint32 vtxIndex, bool bTexture, bool openGL) { VTX_DUMP(TRACE2("Init vertex (%d) to vtx buf[%d]:", dwV, vtxIndex)); +#ifdef __linux__ + openGL = 1; // what else there is? +#endif TLITVERTEX &v = g_vtxBuffer[vtxIndex]; VTX_DUMP(TRACE4(" Trans: x=%f, y=%f, z=%f, w=%f", g_vtxTransformed[dwV].x,g_vtxTransformed[dwV].y,g_vtxTransformed[dwV].z,g_vtxTransformed[dwV].w)); if( openGL ) @@ -932,36 +1012,15 @@ void InitVertex(uint32 dwV, uint32 vtxIndex, bool bTexture, bool openGL) g_vtxProjected5[vtxIndex][3] = g_vtxTransformed[dwV].w; g_vtxProjected5[vtxIndex][4] = g_vecProjected[dwV].z; - if( g_vtxTransformed[dwV].w < 0 ) + if( *(int *)&g_vtxTransformed[dwV].w < 0 ) g_vtxProjected5[vtxIndex][4] = 0; g_vtxIndex[vtxIndex] = vtxIndex; } - if( !openGL || options.bOGLVertexClipper == TRUE ) + if( __builtin_expect(!openGL || options.bOGLVertexClipper == TRUE, 0) ) { - v.x = g_vecProjected[dwV].x*gRSP.vtxXMul+gRSP.vtxXAdd; - v.y = g_vecProjected[dwV].y*gRSP.vtxYMul+gRSP.vtxYAdd; - v.z = (g_vecProjected[dwV].z + 1.0f) * 0.5f; // DirectX minZ=0, maxZ=1 - //v.z = g_vecProjected[dwV].z; // DirectX minZ=0, maxZ=1 - v.rhw = g_vecProjected[dwV].w; - VTX_DUMP(TRACE4(" Proj : x=%f, y=%f, z=%f, rhw=%f", v.x,v.y,v.z,v.rhw)); 
- - if( gRSP.bProcessSpecularColor ) - { - v.dcSpecular = CRender::g_pRender->PostProcessSpecularColor(); - if( gRSP.bFogEnabled ) - { - v.dcSpecular &= 0x00FFFFFF; - uint32 fogFct = 0xFF-(uint8)((g_fFogCoord[dwV]-gRSPfFogMin)*gRSPfFogDivider); - v.dcSpecular |= (fogFct<<24); - } - } - else if( gRSP.bFogEnabled ) - { - uint32 fogFct = 0xFF-(uint8)((g_fFogCoord[dwV]-gRSPfFogMin)*gRSPfFogDivider); - v.dcSpecular = (fogFct<<24); - } + InitVertex_notopengl_or_clipper_adjust(v, dwV); } VTX_DUMP(TRACE2(" (U,V): %f, %f", g_fVtxTxtCoords[dwV].x,g_fVtxTxtCoords[dwV].y)); @@ -997,74 +1056,33 @@ void InitVertex(uint32 dwV, uint32 vtxIndex, bool bTexture, bool openGL) { // If the vert is already lit, then there is no normal (and hence we can't generate tex coord) // Only scale if not generated automatically - if (gRSP.bTextureGen && gRSP.bLightingEnable) + if ( __builtin_expect(gRSP.bTextureGen && gRSP.bLightingEnable, 0) ) { - // Correction for texGen result - float u0,u1,v0,v1; - RenderTexture &tex0 = g_textures[gRSP.curTile]; - u0 = g_fVtxTxtCoords[dwV].x * 32 * 1024 * gRSP.fTexScaleX / tex0.m_fTexWidth; - v0 = g_fVtxTxtCoords[dwV].y * 32 * 1024 * gRSP.fTexScaleY / tex0.m_fTexHeight; - u0 *= (gRDP.tiles[gRSP.curTile].fShiftScaleS); - v0 *= (gRDP.tiles[gRSP.curTile].fShiftScaleT); - - if( CRender::g_pRender->IsTexel1Enable() ) - { - RenderTexture &tex1 = g_textures[(gRSP.curTile+1)&7]; - u1 = g_fVtxTxtCoords[dwV].x * 32 * 1024 * gRSP.fTexScaleX / tex1.m_fTexWidth; - v1 = g_fVtxTxtCoords[dwV].y * 32 * 1024 * gRSP.fTexScaleY / tex1.m_fTexHeight; - u1 *= gRDP.tiles[(gRSP.curTile+1)&7].fShiftScaleS; - v1 *= gRDP.tiles[(gRSP.curTile+1)&7].fShiftScaleT; - CRender::g_pRender->SetVertexTextureUVCoord(v, u0, v0, u1, v1); - } - else - { - CRender::g_pRender->SetVertexTextureUVCoord(v, u0, v0); - } + InitVertex_texgen_correct(v, dwV); } else { - float tex0u = g_fVtxTxtCoords[dwV].x *gRSP.tex0scaleX - gRSP.tex0OffsetX ; - float tex0v = g_fVtxTxtCoords[dwV].y *gRSP.tex0scaleY - 
gRSP.tex0OffsetY ; + TexCord tex0; + multiply_subtract2(&tex0.u, &g_fVtxTxtCoords[dwV].x, &gRSP.tex0scaleX, &gRSP.tex0OffsetX); if( CRender::g_pRender->IsTexel1Enable() ) { - float tex1u = g_fVtxTxtCoords[dwV].x *gRSP.tex1scaleX - gRSP.tex1OffsetX ; - float tex1v = g_fVtxTxtCoords[dwV].y *gRSP.tex1scaleY - gRSP.tex1OffsetY ; + TexCord tex1; + multiply_subtract2(&tex1.u, &g_fVtxTxtCoords[dwV].x, &gRSP.tex1scaleX, &gRSP.tex1OffsetX); - CRender::g_pRender->SetVertexTextureUVCoord(v, tex0u, tex0v, tex1u, tex1v); - VTX_DUMP(TRACE2(" (tex0): %f, %f", tex0u,tex0v)); - VTX_DUMP(TRACE2(" (tex1): %f, %f", tex1u,tex1v)); + CRender::g_pRender->SetVertexTextureUVCoord(v, tex0, tex1); + VTX_DUMP(TRACE2(" (tex0): %f, %f", tex0.u,tex0.v)); + VTX_DUMP(TRACE2(" (tex1): %f, %f", tex1.u,tex1.v)); } else { - CRender::g_pRender->SetVertexTextureUVCoord(v, tex0u, tex0v); - VTX_DUMP(TRACE2(" (tex0): %f, %f", tex0u,tex0v)); + CRender::g_pRender->SetVertexTextureUVCoord(v, tex0); + VTX_DUMP(TRACE2(" (tex0): %f, %f", tex0.u,tex0.v)); } } - // Check for txt scale hack - if( !bHalfTxtScale && g_curRomInfo.bTextureScaleHack && - (gRDP.tiles[lastSetTile].dwSize == TXT_SIZE_32b || gRDP.tiles[lastSetTile].dwSize == TXT_SIZE_4b ) ) - { - int width = ((gRDP.tiles[lastSetTile].sh-gRDP.tiles[lastSetTile].sl+1)<<1); - int height = ((gRDP.tiles[lastSetTile].th-gRDP.tiles[lastSetTile].tl+1)<<1); - if( g_fVtxTxtCoords[dwV].x*gRSP.fTexScaleX == width || g_fVtxTxtCoords[dwV].y*gRSP.fTexScaleY == height ) - { - bHalfTxtScale=true; - } - } - } - - if( g_curRomInfo.bEnableTxtLOD && vtxIndex == 1 && gRDP.otherMode.text_lod ) - { - if( CRender::g_pRender->IsTexel1Enable() && CRender::g_pRender->m_pColorCombiner->m_pDecodedMux->isUsed(MUX_LODFRAC) ) - { - ComputeLOD(openGL); - } - else - { - gRDP.LODFrac = 0; - } + if( __builtin_expect(g_curRomInfo.bTextureScaleHack && !bHalfTxtScale, 0) ) + InitVertex_scale_hack_check(dwV); } VTX_DUMP(TRACE2(" DIF(%08X), SPE(%08X)", v.dcDiffuse, v.dcSpecular)); @@ -1592,13 
+1610,21 @@ void ProcessVertexDataNoSSE(uint32 dwAddr, uint32 dwV0, uint32 dwNum) DEBUGGER_PAUSE_AND_DUMP(NEXT_VERTEX_CMD,{TRACE0("Paused at Vertex Cmd");}); } +/* NEON code */ + +#include "RenderBase_neon.h" + extern "C" void pv_neon(XVECTOR4 *g_vtxTransformed, XVECTOR4 *g_vecProjected, uint32 *g_dwVtxDifColor, VECTOR2 *g_fVtxTxtCoords, float *g_fFogCoord, uint32 *g_clipFlag2, - uint32 dwNum, const FiddledVtx *vtx, + uint32 dwNum, int neon_state, + const FiddledVtx *vtx, const Light *gRSPlights, const float *fRSPAmbientLightRGBA, const XMATRIX *gRSPworldProject, const XMATRIX *gRSPmodelViewTop, - uint32 gRSPnumLights, float gRSPfFogMin); + uint32 gRSPnumLights, float gRSPfFogMin, + uint32 primitiveColor, uint32 primitiveColor_); + +extern "C" int tv_direction(const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2); void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum) { @@ -1609,14 +1635,10 @@ void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum) // assumtions: // - g_clipFlag is not used at all + // - g_fFogCoord is not used at all // - g_vtxNonTransformed is not used after ProcessVertexData*() returns // - g_normal - same -#define PV_NEON_ENABLE_LIGHT (1 << 0) -#define PV_NEON_ENABLE_SHADE (1 << 1) -#define PV_NEON_ENABLE_FOG (1 << 2) -#define PV_NEON_FOG_ALPHA (1 << 3) - int neon_state = 0; if ( gRSP.bLightingEnable ) neon_state |= PV_NEON_ENABLE_LIGHT; @@ -1642,19 +1664,30 @@ void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum) // - g_vtxTransformed[i] // - g_dwVtxDifColor[i] -> vertex color // - g_fVtxTxtCoords[i] -> vertex texture cooridinates - // - g_fFogCoord[i] + // - g_fFogCoord[i] -> unused // - g_clipFlag2[i] const FiddledVtx * pVtxBase = (const FiddledVtx*)(g_pRDRAMu8 + dwAddr); g_pVtxBase = (FiddledVtx *)pVtxBase; + gRSPmodelViewTop._14 = gRSPmodelViewTop._24 = + gRSPmodelViewTop._34 = 0; + // SP_Timing(RSP_GBI0_Vtx); status.SPCycleCount += Timing_RSP_GBI0_Vtx * dwNum; - if (!(neon_state & 
(PV_NEON_ENABLE_LIGHT | PV_NEON_ENABLE_SHADE))) { - for (i = dwV0; i < dwV0 + dwNum; i++) - g_dwVtxDifColor[i] = gRDP.primitiveColor; // FLAT shade - } + +#if 1 + i = dwV0; + pv_neon(&g_vtxTransformed[i], &g_vecProjected[i], + &g_dwVtxDifColor[i], &g_fVtxTxtCoords[i], + &g_fFogCoord[i], &g_clipFlag2[i], + dwNum, neon_state, &pVtxBase[i - dwV0], + gRSPlights, gRSP.fAmbientColors, + &gRSPworldProject, &gRSPmodelViewTop, + gRSPnumLights, gRSPfFogMin, + gRDP.primitiveColor, gRDP.primitiveColor); +#else for (i = dwV0; i < dwV0 + dwNum; i++) { @@ -1672,13 +1705,6 @@ void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum) g_vecProjected[i].y = g_vtxTransformed[i].y * g_vecProjected[i].w; g_vecProjected[i].z = g_vtxTransformed[i].z * g_vecProjected[i].w; - if( neon_state & PV_NEON_ENABLE_FOG ) - { - g_fFogCoord[i] = g_vecProjected[i].z; - if( g_vecProjected[i].w < 0 || g_vecProjected[i].z < 0 || g_fFogCoord[i] < gRSPfFogMin ) - g_fFogCoord[i] = gRSPfFogMin; - } - // RSP_Vtx_Clipping(i); g_clipFlag2[i] = 0; if( g_vecProjected[i].w > 0 ) @@ -1728,6 +1754,8 @@ void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum) color.r = vert.rgba.b; color.a = vert.rgba.a; } + else + g_dwVtxDifColor[i] = gRDP.primitiveColor; // FLAT shade // ReplaceAlphaWithFogFactor(i); if( neon_state & PV_NEON_FOG_ALPHA ) @@ -1735,7 +1763,8 @@ void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum) // Use fog factor to replace vertex alpha if( g_vecProjected[i].z > 1 ) *(((uint8*)&(g_dwVtxDifColor[i]))+3) = 0xFF; - if( g_vecProjected[i].z < 0 ) + // missing 'else' in original code?? 
+ else if( g_vecProjected[i].z < 0 ) *(((uint8*)&(g_dwVtxDifColor[i]))+3) = 0; else *(((uint8*)&(g_dwVtxDifColor[i]))+3) = (uint8)(g_vecProjected[i].z*255); @@ -1744,6 +1773,7 @@ void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum) g_fVtxTxtCoords[i].x = (float)vert.tu; g_fVtxTxtCoords[i].y = (float)vert.tv; } +#endif } bool PrepareTriangle(uint32 dwV0, uint32 dwV1, uint32 dwV2) @@ -1767,6 +1797,18 @@ void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum) InitVertex(dwV1, gRSP.numVertices+1, textureFlag, openGL); InitVertex(dwV2, gRSP.numVertices+2, textureFlag, openGL); + if( __builtin_expect(gRSP.numVertices == 0 && g_curRomInfo.bEnableTxtLOD && gRDP.otherMode.text_lod, 0) ) + { + if( CRender::g_pRender->IsTexel1Enable() && CRender::g_pRender->m_pColorCombiner->m_pDecodedMux->isUsed(MUX_LODFRAC) ) + { + ComputeLOD(openGL); + } + else + { + gRDP.LODFrac = 0; + } + } + gRSP.numVertices += 3; status.dwNumTrisRendered++; } @@ -1806,6 +1848,7 @@ bool IsTriangleVisible(uint32 dwV0, uint32 dwV1, uint32 dwV2) // method doesnt' work well when the z value is outside of screenspace //if (v0.z < 1 && v1.z < 1 && v2.z < 1) { +#ifndef __ARM_NEON__ float V1 = v2.x - v0.x; float V2 = v2.y - v0.y; @@ -1815,6 +1858,10 @@ bool IsTriangleVisible(uint32 dwV0, uint32 dwV1, uint32 dwV2) float fDirection = (V1 * W2) - (V2 * W1); fDirection = fDirection * v1.w * v2.w * v0.w; //float fDirection = v0.x*v1.y-v1.x*v0.y+v1.x*v2.y-v2.x*v1.y+v2.x*v0.y-v0.x*v2.y; +#else + // really returns float, but we only need sign + int fDirection = tv_direction(&v0, &v1, &v2); +#endif if (fDirection < 0 && gRSP.bCullBack) { diff --git a/source/rice_gles/src/RenderBase.h b/source/rice_gles/src/RenderBase.h index ceeb385..b2e37bb 100755 --- a/source/rice_gles/src/RenderBase.h +++ b/source/rice_gles/src/RenderBase.h @@ -243,6 +243,7 @@ extern void (*ProcessVertexData)(uint32 dwAddr, uint32 dwV0, uint32 dwNum); void ProcessVertexDataSSE(uint32 dwAddr, uint32 dwV0, uint32 dwNum); 
#endif void ProcessVertexDataNoSSE(uint32 dwAddr, uint32 dwV0, uint32 dwNum); +void ProcessVertexDataNEON(uint32 dwAddr, uint32 dwV0, uint32 dwNum); void ProcessVertexDataExternal(uint32 dwAddr, uint32 dwV0, uint32 dwNum); void SetPrimitiveColor(uint32 dwCol, uint32 LODMin, uint32 LODFrac); void SetPrimitiveDepth(uint32 z, uint32 dwDZ); diff --git a/source/rice_gles/src/RenderBase_neon.S b/source/rice_gles/src/RenderBase_neon.S new file mode 100644 index 0000000..da769c7 --- /dev/null +++ b/source/rice_gles/src/RenderBase_neon.S @@ -0,0 +1,339 @@ +/* + * (C) Gražvydas "notaz" Ignotas, 2014 + * + * This work is licensed under the terms of GNU GPL version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "arm_features.h" +#include "RenderBase_neon.h" + +.syntax unified +.text +.align 3 + +/* + * ProcessVertexData register map: + * + * q | d | c code + * ... + * 12 24 gRSPworldProject _11,_12,_13,_14 + * 25 + * 13 26 gRSPworldProject _21,_22,_23,_24 + * 27 + * 14 28 gRSPworldProject _31,_32,_33,_34 + * 29 + * 15 30 gRSPworldProject _41,_42,_43,_44 + * 31 + * + * r4 vtx[], 16 bytes: + * short y, x, flag, z, tv, tu; + * / uint8 a, b, g, r; + * \ char a, z, y, x; + * + * outputs: + * r0 - XVECTOR4 *g_vtxTransformed + * r1 - XVECTOR4 *g_vecProjected + * r2 - uint32 *g_dwVtxDifColor + * r3 - VECTOR2 *g_fVtxTxtCoords + * sp+00 - float *g_fFogCoord + * r6 sp+04 - uint32 *g_clipFlag2 + * inputs: + * r11 sp+08 - uint32 dwNum + * r10 sp+0c - int neon_flags + * r4 sp+10 - FiddledVtx vtx[], (r4 [0], r5 [1]) + * r7 sp+14 - Light *gRSPlights + * sp+18 - float *fRSPAmbientLightRGBA + * sp+1c - XMATRIX *gRSPworldProject + * sp+20 - XMATRIX *gRSPmodelViewTop + * sp+24 - uint32 gRSPnumLights + * sp+28 - float gRSPfFogMin + * sp+2c - uint32 primitiveColor + * sp+30 - uint32 primitiveColor + */ +FUNCTION(pv_neon): + ldr r12, [sp, #0x10] + pld [r12] + + push {r4-r11,lr} + vpush {q4-q7} + + mov r4, r12 @ vtx + ldr r12, [sp, #0x64+0x1c] + vld1.32 {q12,q13}, 
[r12, :128]! @ load gRSPworldProject + vld1.32 {q14,q15}, [r12, :128] + ldr r6, [sp, #0x64+0x04] @ g_clipFlag2 + add r5, r4, #16 @ vtx + 1 + ldr r11, [sp, #0x64+0x08] @ dwNum + ldr r10, [sp, #0x64+0x0c] @ neon_flags + +0: + vld1.16 d12, [r4]! @ vtx[0] .z .flag .x .y (reg) + vmovl.s16 q6, d12 + vld1.16 d14, [r5]! @ vtx[1] .z .flag .x .y + vmovl.s16 q7, d14 + vcvt.f32.s32 q6, q6 @ q6 = vtx_raw0 + vcvt.f32.s32 q7, q7 @ q7 = vtx_raw1 + vdup.32 q0, d12[1] @ vtx_raw0.x (dup) + vdup.32 q1, d12[0] @ vtx_raw0.y (dup) + vdup.32 q2, d13[1] @ vtx_raw0.z (dup) + vdup.32 q3, d14[1] @ vtx_raw1.x (dup) + vdup.32 q4, d14[0] @ vtx_raw1.y (dup) + vdup.32 q5, d15[1] @ vtx_raw1.z (dup) + /* note: order of operations matters greatly, + * may cause like 20 fraction bits to differ! */ + vmul.f32 q0, q0, q12 + vmul.f32 q3, q3, q12 + vmla.f32 q0, q1, q13 + vmla.f32 q3, q4, q13 + vmul.f32 q2, q2, q14 @ yes, mul+add is + vmul.f32 q5, q5, q14 @ faster than mla + vadd.f32 q0, q2 + vadd.f32 q3, q5 + vadd.f32 q0, q15 @ q0 = g_vtxTransformed[i] + vadd.f32 q3, q15 @ q3 = g_vtxTransformed[i + 1] + + vld1.16 d16[1], [r4]! @ [0].v + vmov d2, d1 + vld1.16 d16[0], [r4]! @ [0].u + vsri.64 d2, d7, #32 + vld1.16 d18[1], [r5]! @ [1].v +#if 1 + vrecpe.f32 d4, d2 @ inv [0][1] .w + vld1.16 d18[0], [r5]! @ [1].u + vrecps.f32 d5, d2, d4 @ step + vmovl.s16 q8, d16 + /* g_vtxTransformed[0] */ vst1.32 {q0}, [r0, :128]! + vmovl.s16 q9, d18 + vcvt.f32.s32 d16, d16 + vcvt.f32.s32 d18, d18 + vmul.f32 d4, d5, d4 @ better inv + bic r9, r5, #63 + pld [r9, #64] + vrecps.f32 d5, d2, d4 @ step + cmp r11, #1 + /* u,v g_fVtxTxtCoords[0] */ vst1.32 {d16}, [r3]! + beq 99f + /* g_vtxTransformed[1] */ vst1.32 {q3}, [r0, :128]! + /* ... [1] */ vst1.32 {d18}, [r3]! 
+ 99: + vmov.f32 d20, #1.0 + vmov.f32 d21, #-1.0 + vmul.f32 d4, d5, d4 @ better inv [0][1] .w + #if 0 + vrecps.f32 d5, d2, d4 @ step + vmul.f32 d4, d5, d4 @ better inv + #endif +#else + mov r12, #0x3f800000 @ 1.0f + vmov.f32 s6, r12 + vdiv.f32 s8, s6, s4 + vdiv.f32 s9, s6, s5 + #error incomplete +#endif + + mov r8, #X_CLIP_MAX + mov r9, #Y_CLIP_MAX + vmov d22, r8, r9 + vmul.f32 q0, q0, d4[1] @ .x .y .z .w *= [0] .w + vmul.f32 q1, q3, d4[0] + vshr.u64 d5, d4, #32 @ [0] .w + mov r8, #X_CLIP_MIN + mov r9, #Y_CLIP_MIN + vmov d23, r8, r9 + vsli.64 d3, d4, #32 @ insert [1] .w + vsli.64 d1, d5, #32 + vsli.u64 d5, d4, #32 @ [0] [1] .w + vcgt.f32 d6, d0, d20 @ .xy > 1.0? + vcgt.f32 d7, d21, d0 + vcgt.f32 d4, d5, #0 @ .w > 0? + vst1.32 {q0}, [r1]! @ g_vecProjected[0] + vcgt.f32 d8, d2, d20 + vcgt.f32 d9, d21, d2 + vld1.32 d0[0], [r4]! @ mem: [0] .azyx + vand q3, q11 + vand q4, q11 + cmp r11, #1 + beq 99f + vst1.32 {q1}, [r1]! @ g_vecProjected[1] +99: + vorr d6, d6, d7 + vorr d7, d8, d9 + vld1.32 d0[1], [r5]! @ mem: [1] .azyx + vpadd.u32 d6, d7 + vrev32.8 d0, d0 @ make 0xaazzyyxx [1][0] + vsli.u64 d1, d3, #32 @ d1 = [1] [0] .z + vmovl.s8 q4, d0 + vand d6, d4 + vmovl.s16 q1, d8 + vmovl.s16 q2, d9 + vst1.32 {d6}, [r6]! @ g_clipFlag2 + + tst r10, #PV_NEON_ENABLE_LIGHT + beq pv_neon_no_light +@ pv_neon_light: + @ live NEON registers: + @ d1 = [1][0] .z (must preserve) + @ q1,q2 = azyx [1][0] + @ q12+ = gRSPworldProject + ldr r12, [sp, #0x64+0x20] + vcvt.f32.s32 q1, q1 + vcvt.f32.s32 q2, q2 + vld1.32 {q8,q9}, [r12, :128]! 
@ load gRSPmodelViewTop + vld1.32 {q10}, [r12, :128] + + vdup.32 q5, d4[0] @ [1] .x (dup) + vdup.32 q6, d4[1] @ [1] .y (dup) + vdup.32 q7, d5[0] @ [1] .z (dup) + vdup.32 q2, d2[0] @ [0] .x (dup) + vdup.32 q3, d2[1] @ [0] .y (dup) + vdup.32 q4, d3[0] @ [0] .z (dup) + vmul.f32 q2, q2, q8 + vmul.f32 q5, q5, q8 + vmla.f32 q2, q3, q9 + vmla.f32 q5, q6, q9 + vmul.f32 q4, q4, q10 + vmul.f32 q7, q7, q10 + vadd.f32 q4, q2 @ q4 = temp[0] .xyz0 + vadd.f32 q5, q7 @ q5 = temp[1] .xyz0 + vmul.f32 q2, q4, q4 @ temp .xyz0 ^2 + vmul.f32 q3, q5, q5 + vpadd.f32 d2, d4, d5 + vpadd.f32 d3, d6, d7 + movw r8, #0x0000ffff + movt r8, #0x7f7f @ max normal float, ~3.4e+38 + vdup.32 d4, r8 + vpadd.f32 d2, d2, d3 @ d2 = [1][0] x^2 + y^2 + z^2 + vcgt.f32 d5, d2, #0 + vbif d2, d4, d5 @ if (d2 == 0) d2 = MAXFLOAT + + vrsqrte.f32 d3, d2 @ ~ 1/sqrt(d2), d2 = [1][0] .sqrsum + vmul.f32 d4, d3, d2 + ldr r9, [sp, #0x64+0x18] @ &fRSPAmbientLightRGBA + ldr r7, [sp, #0x64+0x14] @ gRSPlights + ldr r8, [sp, #0x64+0x24] @ gRSPnumLights + vrsqrts.f32 d4, d3, d4 @ step + vld1.32 {q6}, [r9] @ rgb + vld1.32 {q7}, [r9] @ rgb + vmul.f32 d3, d3, d4 @ 1/sqrt(d2) +#if 0 /* not necessary? 
*/ + vmul.f32 d4, d3, d2 + vrsqrts.f32 d4, d3, d4 @ step + vmul.f32 d3, d3, d4 @ 1/sqrt(d2) +#endif + vmul.f32 q2, q4, d3[0] @ q2 = normal[0] .xyz + vmul.f32 q3, q5, d3[1] @ q3 = normal[1] .xyz + +1: + vld1.32 {q8}, [r7] + vmul.f32 q4, q8, q2 @ gRSPlights[l] * normal + vmul.f32 q5, q8, q3 + vpadd.f32 d8, d8, d9 + vpadd.f32 d10, d10, d11 + vpadd.f32 d8, d8, d10 @ d8 = [1][0] fCosT + vcgt.f32 d9, d8, #0 @ if (!(fCosT > 0)) + vand d8, d9 @ fCosT = 0 + add r9, r7, #OFFSETOF_Light_fr + vld1.32 {q8}, [r9] @ .fr .fg .fb + vdup.32 q5, d8[1] @ [1] fCosT (dup) + vdup.32 q4, d8[0] @ + vmla.f32 q7, q8, q5 @ .rgb += frgb * fCosT + vmla.f32 q6, q8, q4 + add r7, #SIZEOF_Light + subs r8, #1 + bgt 1b + + movt r8, #0x437f @ float 255 + vdup.32 q8, r8 + vcgt.f32 q4, q6, q8 @ if (.rgb > 255) + vcgt.f32 q5, q7, q8 + vbit q6, q8, q4 @ .rgb = 255 + vbit q7, q8, q5 + vcvt.u32.f32 q6, q6 + vcvt.u32.f32 q7, q7 + ldrb r8, [r4, #-4] @ .a from vtx + ldrb r9, [r5, #-4] + vext.32 q4, q6, q6, #3 @ reg: .abgr -> .bgra + vext.32 q5, q7, q7, #3 + vmov.32 d8[0], r8 @ use .a from input + vmov.32 d10[0], r9 + vmovn.u32 d8, q4 + vmovn.u32 d10, q5 + vmovn.u16 d0, q4 + vmovn.u16 d2, q5 + vsli.u64 d0, d2, #32 + vrev32.8 d0, d0 @ 0xbbggrraa -> 0xaarrggbb + b pv_neon_fog_alpha + +pv_neon_no_light: + tst r10, #PV_NEON_ENABLE_SHADE + vldr d0, [sp, #0x64+0x2c] @ primitiveColor [0] [1] + beq pv_neon_fog_alpha + @ easier to do with ARM + ldr r8, [r4, #-4] + ldr r9, [r5, #-4] + ror r8, #8 @ mem: .argb -> .rgba + ror r9, #8 @ reg: 0xbbggrraa -> .. + vmov d0, r8, r9 + +pv_neon_fog_alpha: + tst r10, #PV_NEON_FOG_ALPHA + beq pv_neon_next + vmov.f32 d20, #1.0 + vcgt.f32 d2, d1, d20 @ [0] [1] .z > 1.0? + vcgt.f32 d3, d1, #0 @ > 0? + movw r8, #0 + movt r8, #0x4f7f @ r8 = (float)(255<<24) + vbit d1, d20, d2 @ make 1.0 if needed + vand d1, d3 + vdup.32 d4, r8 + vmul.f32 d1, d1, d4 + vcvt.u32.f32 d1, d1 + vmov.u32 d5, #0xff000000 + vbit d0, d1, d5 + +pv_neon_next: + subs r11, #2 + vst1.32 {d0}, [r2]! 
@ g_dwVtxDifColor + add r4, #16 + add r5, #16 + bgt 0b + nop + + vpop {q4-q7} + pop {r4-r11,pc} + .size pv_neon, .-pv_neon + + +@ (float *d, const float *m1, const float *m2, const float *s) +FUNCTION(multiply_subtract2): + vld1.32 {d1}, [r1] + vld1.32 {d2}, [r2] + vmul.f32 d0, d1, d2 + vld1.32 {d3}, [r3] + vsub.f32 d0, d3 + vst1.32 {d0}, [r0] + bx lr + .size multiply_subtract2, .-multiply_subtract2 + + +@ (const XVECTOR4 *v0, const XVECTOR4 *v1, const XVECTOR4 *v2) +FUNCTION(tv_direction): + vld1.32 {q0}, [r0] + vld1.32 {q2}, [r2] + vld1.32 {q1}, [r1] + vsub.f32 d6, d4, d0 @ d6 = V2,V1 + vsub.f32 d7, d4, d2 @ d7 = W2,W1 + vmul.f32 d1, d5 @ d1 = v0.w * v2.w + vrev64.32 d7, d7 + vmul.f32 d6, d7 @ d6 = V2*W1,V1*W2 + vmul.f32 d1, d3 @ d1 *= v1.w + vshr.u64 d7, d6, #32 + vsub.f32 d6, d7 @ d6[0] = V1*W2 - V2*W1 + vshr.u64 d1, d1, #32 + vmul.f32 d0, d1, d6 + vmov.32 r0, d0[0] + bx lr + + +@ vim:filetype=armasm:expandtab diff --git a/source/rice_gles/src/RenderBase_neon.h b/source/rice_gles/src/RenderBase_neon.h new file mode 100644 index 0000000..0c252d0 --- /dev/null +++ b/source/rice_gles/src/RenderBase_neon.h @@ -0,0 +1,13 @@ + +#define PV_NEON_ENABLE_LIGHT (1 << 0) +#define PV_NEON_ENABLE_SHADE (1 << 1) +#define PV_NEON_ENABLE_FOG (1 << 2) // unused +#define PV_NEON_FOG_ALPHA (1 << 3) + +#define X_CLIP_MAX 0x1 +#define X_CLIP_MIN 0x2 +#define Y_CLIP_MAX 0x4 +#define Y_CLIP_MIN 0x8 + +#define OFFSETOF_Light_fr 0x14 +#define SIZEOF_Light 0x44 diff --git a/source/rice_gles/src/arm_features.h b/source/rice_gles/src/arm_features.h new file mode 100644 index 0000000..fdec522 --- /dev/null +++ b/source/rice_gles/src/arm_features.h @@ -0,0 +1,52 @@ +#ifndef __ARM_FEATURES_H__ +#define __ARM_FEATURES_H__ + +#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__) + +#define HAVE_ARMV7 +#define HAVE_ARMV6 +#define HAVE_ARMV5 + +#elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ 
+ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \ + || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) + +#define HAVE_ARMV6 +#define HAVE_ARMV5 + +#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5E__) \ + || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) + +#define HAVE_ARMV5 + +#endif + +/* no need for HAVE_NEON - GCC defines __ARM_NEON__ consistently */ + +/* global function/external symbol */ +#ifndef __MACH__ +#define ESYM(name) name + +#define FUNCTION(name) \ + .globl name; \ + .type name, %function; \ + name + +#define EXTRA_UNSAVED_REGS + +#else +#define ESYM(name) _##name + +#define FUNCTION(name) \ + .globl ESYM(name); \ + name: \ + ESYM(name) + +// r7 is preserved, but add it for EABI alignment.. +#define EXTRA_UNSAVED_REGS r7, r9, + +#endif + +#endif /* __ARM_FEATURES_H__ */ -- 2.39.5