[mupen64plus-pandora.git] / source / gles2glide64 / src / Glide64 / 3dmath.cpp

/*
* Glide64 - Glide video plugin for Nintendo 64 emulators.
* Copyright (c) 2002  Dave2001
* Copyright (c) 2003-2009  Sergey 'Gonetz' Lipski
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

//****************************************************************
//
// Glide64 - Glide Plugin for Nintendo 64 emulators
// Project started on December 29th, 2001
//
// Authors:
// Dave2001, original author, founded the project in 2001, left it in 2002
// Gugaman, joined the project in 2002, left it in 2002
// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
// Hiroshi 'KoolSmoky' Morii, joined the project in 2007
//
//****************************************************************
//
// To modify Glide64:
// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
// * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
//
//****************************************************************

#include "Gfx_1.3.h"
extern "C" {
#ifndef NOSSE
#include <xmmintrin.h>
#endif
}

#include <math.h>
#include "3dmath.h"

// Computes the lit color of a vertex and stores it in v->r, v->g, v->b.
//
// The accumulation starts from the color held in rdp.light[rdp.num_lights]
// (the entry just past the rdp.num_lights lights iterated below; presumably
// the ambient term - confirm against the code that fills rdp.light). Each
// light then contributes its color scaled by the dot product of its vector
// with the vertex vector v->vec, but only when that dot product is positive.
// Every channel is clamped to 1.0 before being converted to an 8-bit value.
void calc_light (VERTEX *v)
{
  // No `register` here: the keyword was removed in C++17, and an array
  // cannot meaningfully live in a register since it is indexed below.
  float color[3] = {rdp.light[rdp.num_lights].r, rdp.light[rdp.num_lights].g, rdp.light[rdp.num_lights].b};
  for (wxUint32 l=0; l<rdp.num_lights; l++)
  {
    const float light_intensity = DotProduct (rdp.light_vector[l], v->vec);

    // Lights with a non-positive dot product contribute nothing.
    if (light_intensity > 0.0f)
    {
      color[0] += rdp.light[l].r * light_intensity;
      color[1] += rdp.light[l].g * light_intensity;
      color[2] += rdp.light[l].b * light_intensity;
    }
  }

  // Clamp the upper bound so the 8-bit conversion below cannot overflow.
  if (color[0] > 1.0f) color[0] = 1.0f;
  if (color[1] > 1.0f) color[1] = 1.0f;
  if (color[2] > 1.0f) color[2] = 1.0f;

  v->r = (wxUint8)(color[0]*255.0f);
  v->g = (wxUint8)(color[1]*255.0f);
  v->b = (wxUint8)(color[2]*255.0f);
}

//*
// Generates "linear" texture coordinates for a vertex from its vector v->vec.
//
// When settings.force_calc_sphere is set the work is delegated to
// calc_sphere(). Otherwise the vector is transformed by rdp.model (not
// rdp.combined), normalized, optionally projected onto the two look-at
// vectors, clamped to [-1, 1] and mapped through acos()/pi onto the tile
// size (org_*_scale >> 6). The coordinates are only written when
// rdp.cur_cache[0] is non-null; v->uv_scaled is always set to 1.
void calc_linear (VERTEX *v)
{
  if (settings.force_calc_sphere)
  {
    calc_sphere(v);
    return;
  }

  DECLAREALIGN16VAR(normal[3]);
  TransformVector (v->vec, normal, rdp.model);
  NormalizeVector (normal);

  float x, y;
  if (rdp.use_lookat)
  {
    x = DotProduct (rdp.lookat[0], normal);
    y = DotProduct (rdp.lookat[1], normal);
  }
  else
  {
    x = normal[0];
    y = normal[1];
  }

  // Keep both values inside the domain of acosf().
  if (x < -1.0f)
    x = -1.0f;
  else if (x > 1.0f)
    x = 1.0f;
  if (y < -1.0f)
    y = -1.0f;
  else if (y > 1.0f)
    y = 1.0f;

  if (rdp.cur_cache[0])
  {
    // scale >> 6 is the size to map to
    v->ou = (acosf(x)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_s_scale >> 6);
    v->ov = (acosf(y)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_t_scale >> 6);
  }
  v->uv_scaled = 1;
#ifdef EXTREME_LOGGING
  FRDP ("calc linear u: %f, v: %f\n", v->ou, v->ov);
#endif
}

// Generates spherical texture coordinates for a vertex from its vector v->vec.
//
// The vector is transformed by rdp.model (not rdp.combined), normalized and,
// when rdp.use_lookat is set, projected onto the two look-at vectors. The
// resulting x/y values are remapped from [-1, 1] to [0, 1] and scaled by the
// current tile's size (org_*_scale >> 6). With the hack_Chopper hack enabled
// that size is additionally limited by the tile's lr_s / lr_t values.
// v->uv_scaled is always set to 1.
void calc_sphere (VERTEX *v)
{
  DECLAREALIGN16VAR(normal[3]);
  int s_scale, t_scale;
  if (!(settings.hacks&hack_Chopper))
  {
    s_scale = rdp.tiles[rdp.cur_tile].org_s_scale >> 6;
    t_scale = rdp.tiles[rdp.cur_tile].org_t_scale >> 6;
  }
  else
  {
    s_scale = min(rdp.tiles[rdp.cur_tile].org_s_scale >> 6, rdp.tiles[rdp.cur_tile].lr_s);
    t_scale = min(rdp.tiles[rdp.cur_tile].org_t_scale >> 6, rdp.tiles[rdp.cur_tile].lr_t);
  }

  TransformVector (v->vec, normal, rdp.model);
  NormalizeVector (normal);

  float x, y;
  if (rdp.use_lookat)
  {
    x = DotProduct (rdp.lookat[0], normal);
    y = DotProduct (rdp.lookat[1], normal);
  }
  else
  {
    x = normal[0];
    y = normal[1];
  }

  v->ou = (x * 0.5f + 0.5f) * s_scale;
  v->ov = (y * 0.5f + 0.5f) * t_scale;
  v->uv_scaled = 1;
#ifdef EXTREME_LOGGING
  FRDP ("calc sphere u: %f, v: %f\n", v->ou, v->ov);
#endif
}

// Portable dot product of two 3-component vectors.
//
// v1, v2: pointers to at least three floats each (only elements 0..2 are read).
// Returns v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2].
//
// The `register` specifiers were dropped: the keyword was removed in C++17
// and is not part of the function type, so callers and function-pointer
// bindings are unaffected.
float DotProductC(float *v1, float *v2)
{
    return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
}

// Portable in-place normalization of a 3-component vector.
//
// v: pointer to at least three floats; elements 0..2 are divided by the
//    vector's Euclidean length.
// A vector whose length is not greater than zero is left untouched, which
// avoids a division by zero.
//
// The `register` specifier on the local was dropped (removed in C++17).
void NormalizeVectorC(float *v)
{
    const float len = sqrtf(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
    if (!(len > 0.0f))
        return;

    v[0] /= len;
    v[1] /= len;
    v[2] /= len;
}

// Multiplies the 3-component row vector src by the upper-left 3x3 part of
// mat and writes the result to dst: dst[c] = sum over k of mat[k][c]*src[k].
// The fourth row and column of mat are not used. Components are written in
// order 0, 1, 2.
void TransformVectorC(float *src, float *dst, float mat[4][4])
{
  for (int col = 0; col < 3; ++col)
  {
    dst[col] = mat[0][col]*src[0] + mat[1][col]*src[1] + mat[2][col]*src[2];
  }
}

// Multiplies the 3-component vector src by the transpose of the upper-left
// 3x3 part of mat and writes the result to dst:
// dst[r] = sum over k of mat[r][k]*src[k].
// The fourth row and column of mat are not used. Components are written in
// order 0, 1, 2.
void InverseTransformVectorC (float *src, float *dst, float mat[4][4])
{
  for (int row = 0; row < 3; ++row)
  {
    dst[row] = mat[row][0]*src[0] + mat[row][1]*src[1] + mat[row][2]*src[2];
  }
}

/*
void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
{
  for (int i=0; i<4; i++)
  {
    for (int j=0; j<4; j++)
    {
      r[i][j] = m1[i][0] * m2[0][j] +
                m1[i][1] * m2[1][j] +
                m1[i][2] * m2[2][j] +
                m1[i][3] * m2[3][j];
    }
  }
}
*/
// Portable 4x4 matrix product: r = m1 * m2, i.e.
// r[i][j] = sum over k of m1[i][k] * m2[k][j].
// The result is produced one column at a time (column index in the outer
// loop). r is written while m1 and m2 are still being read, so it must not
// alias either input.
void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
{
  for (int col = 0; col < 4; col++)
  {
    for (int row = 0; row < 4; row++)
    {
      r[row][col] = m1[row][0] * m2[0][col] +
                    m1[row][1] * m2[1][col] +
                    m1[row][2] * m2[2][col] +
                    m1[row][3] * m2[3][col];
    }
  }
}

#ifdef __ARM_NEON__
// NEON 4x4 matrix product: dest = m0 * m1.
// Each matrix is read/written as four consecutive groups of four floats
// (one row per q register). Row i of dest is built as
//   m0[i][0]*row0(m1) + m0[i][1]*row1(m1) + m0[i][2]*row2(m1) + m0[i][3]*row3(m1).
// Operand numbering: %0 is the m1 pointer, %1 is the m0 pointer, %2 is dest.
// All three pointers are post-incremented by the asm, hence the "+r" constraints.
void MultMatrix_neon( float m0[4][4], float m1[4][4], float dest[4][4])
{
    asm volatile (
	"vld1.32 		{d0, d1}, [%1]!			\n\t"	//q0 = row 0 of m0 (%1)
	"vld1.32 		{d2, d3}, [%1]!	    	\n\t"	//q1 = row 1 of m0
	"vld1.32 		{d4, d5}, [%1]!	    	\n\t"	//q2 = row 2 of m0
	"vld1.32 		{d6, d7}, [%1]	    	\n\t"	//q3 = row 3 of m0
	"vld1.32 		{d16, d17}, [%0]!		\n\t"	//q8 = row 0 of m1 (%0)
	"vld1.32 		{d18, d19}, [%0]!   	\n\t"	//q9 = row 1 of m1
	"vld1.32 		{d20, d21}, [%0]!   	\n\t"	//q10 = row 2 of m1
	"vld1.32 		{d22, d23}, [%0]    	\n\t"	//q11 = row 3 of m1

	"vmul.f32 		q12, q8, d0[0] 			\n\t"	//q12 = q8 * d0[0]
	"vmul.f32 		q13, q8, d2[0] 		    \n\t"	//q13 = q8 * d2[0]
	"vmul.f32 		q14, q8, d4[0] 		    \n\t"	//q14 = q8 * d4[0]
	"vmul.f32 		q15, q8, d6[0]	 		\n\t"	//q15 = q8 * d6[0]
	"vmla.f32 		q12, q9, d0[1] 			\n\t"	//q12 += q9 * d0[1]
	"vmla.f32 		q13, q9, d2[1] 		    \n\t"	//q13 += q9 * d2[1]
	"vmla.f32 		q14, q9, d4[1] 		    \n\t"	//q14 += q9 * d4[1]
	"vmla.f32 		q15, q9, d6[1] 		    \n\t"	//q15 += q9 * d6[1]
	"vmla.f32 		q12, q10, d1[0] 		\n\t"	//q12 += q10 * d1[0]
	"vmla.f32 		q13, q10, d3[0] 		\n\t"	//q13 += q10 * d3[0]
	"vmla.f32 		q14, q10, d5[0] 		\n\t"	//q14 += q10 * d5[0]
	"vmla.f32 		q15, q10, d7[0] 		\n\t"	//q15 += q10 * d7[0]
	"vmla.f32 		q12, q11, d1[1] 		\n\t"	//q12 += q11 * d1[1]
	"vmla.f32 		q13, q11, d3[1] 		\n\t"	//q13 += q11 * d3[1]
	"vmla.f32 		q14, q11, d5[1] 		\n\t"	//q14 += q11 * d5[1]
	"vmla.f32 		q15, q11, d7[1]	 	    \n\t"	//q15 += q11 * d7[1]

	"vst1.32 		{d24, d25}, [%2]! 		\n\t"	//row 0 of dest = q12
	"vst1.32 		{d26, d27}, [%2]! 	    \n\t"	//row 1 of dest = q13
	"vst1.32 		{d28, d29}, [%2]! 	    \n\t"	//row 2 of dest = q14
	"vst1.32 		{d30, d31}, [%2] 	    \n\t"	//row 3 of dest = q15

	:"+r"(m1), "+r"(m0), "+r"(dest):
    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
    "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
    "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
    "memory"
	);
}

// NEON in-place normalization of the 3-component vector v.
// The squared length is computed, its reciprocal square root is estimated
// with vrsqrte and refined by two vrsqrts steps, and x, y, z are multiplied
// by the result.
// NOTE(review): unlike NormalizeVectorC there is no zero-length guard here -
// confirm what this produces for a zero vector.
void Normalize_neon(float v[3])
{
	asm volatile (
	"vld1.32 		{d4}, [%0]!	    		\n\t"	//d4={x,y}; %0 advances by 8 bytes
	"flds    		s10, [%0]   	    	\n\t"	//d5[0] = z
	"sub    		%0, %0, #8   	    	\n\t"	//rewind %0 to the start of v
	"vmul.f32 		d0, d4, d4				\n\t"	//d0 = {x*x, y*y}
	"vpadd.f32 		d0, d0, d0				\n\t"	//d0 = d0[0] + d0[1] in both lanes
    "vmla.f32 		d0, d5, d5				\n\t"	//d0 = d0 + d5*d5 (only lane 0 is used below)

	"vmov.f32 		d1, d0					\n\t"	//d1 = d0 (squared length)
	"vrsqrte.f32 	d0, d0					\n\t"	//d0 = ~ 1.0 / sqrt(d0)
	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2
	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3 (first refinement)
	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2
	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3 (second refinement)

	"vmul.f32 		q2, q2, d0[0]			\n\t"	//q2 (d4, d5) = q2 * d0[0]
	"vst1.32 		{d4}, [%0]!  			\n\t"	//store normalized x, y
	"fsts    		s10, [%0]     			\n\t"	//store normalized z

	:"+r"(v) :
    : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
	);
}

float DotProduct_neon( float v0[3], float v1[3] )
{
    float dot;
	asm volatile (
	"vld1.32 		{d8}, [%1]!			\n\t"	//d8={x0,y0}
	"vld1.32 		{d10}, [%2]!		\n\t"	//d10={x1,y1}
	"flds 			s18, [%1, #0]	    \n\t"	//d9[0]={z0}
	"flds 			s22, [%2, #0]	    \n\t"	//d11[0]={z1}
	"vmul.f32 		d12, d8, d10		\n\t"	//d0= d2*d4
	"vpadd.f32 		d12, d12, d12		\n\t"	//d0 = d[0] + d[1]
	"vmla.f32 		d12, d9, d11		\n\t"	//d0 = d0 + d3*d5
    "fmrs	        %0, s24	    		\n\t"	//r0 = s0
	: "=r"(dot), "+r"(v0), "+r"(v1):
    : "d8", "d9", "d10", "d11", "d12"

	);
    return dot;
}

#endif

// 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication
//                      and 3DNOW! 4x4 4x4 matrix multiplication
// 2011-01-03 Balrog - removed because it is in NASM format and not 64-bit compatible.
// This will need fixing.
#ifndef __ARM_NEON__
// Function-pointer entry points for the vector/matrix routines. On non-NEON
// builds they are initially bound to the portable C implementations above;
// math_init() may rebind MulMatrices to MulMatricesSSE at run time.
// NOTE(review): NEON builds presumably provide these names elsewhere
// (e.g. in 3dmath.h) - confirm.
MULMATRIX MulMatrices = MulMatricesC;
TRANSFORMVECTOR TransformVector = TransformVectorC;
TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
DOTPRODUCT DotProduct = DotProductC;
NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;
#endif

// SSE 4x4 matrix product: r = m1 * m2.
//
// Row i of r is computed as
//   m1[i][0]*row0(m2) + m1[i][1]*row1(m2) + m1[i][2]*row2(m2) + m1[i][3]*row3(m2),
// accumulated in that order.
//
// The implementation uses only the portable <xmmintrin.h> intrinsics, so the
// same code serves GCC, clang and MSVC (32- and 64-bit). It replaces
//  - the GCC-only builtin __builtin_ia32_storeups, and
//  - the MSVC inline-assembly path, which was 32-bit only and used movaps,
//    i.e. required 16-byte aligned matrices.
// Unaligned loads/stores are used throughout, so no alignment is required.
// All rows of m2 are loaded before anything is stored, and each row of m1 is
// loaded before the matching row of r is stored.
//
// When SSE is disabled at build time (NO_ASM or NOSSE defined) the function
// falls back to MulMatricesC instead of silently leaving r untouched.
void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
{
#if !defined(NO_ASM) && !defined(NOSSE)
  /* [row][col] */
  const __m128 row0 = _mm_loadu_ps(m2[0]);
  const __m128 row1 = _mm_loadu_ps(m2[1]);
  const __m128 row2 = _mm_loadu_ps(m2[2]);
  const __m128 row3 = _mm_loadu_ps(m2[3]);

  for (int i = 0; i < 4; ++i)
  {
    const __m128 leftrow = _mm_loadu_ps(m1[i]);

    // Broadcast leftrow[0] to all four lanes and form the first summands.
    __m128 destrow = _mm_mul_ps(
        _mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(0, 0, 0, 0)), row0);

    // Same for leftrow[1], leftrow[2] and leftrow[3], accumulating in order.
    destrow = _mm_add_ps(destrow, _mm_mul_ps(
        _mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(1, 1, 1, 1)), row1));
    destrow = _mm_add_ps(destrow, _mm_mul_ps(
        _mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(2, 2, 2, 2)), row2));
    destrow = _mm_add_ps(destrow, _mm_mul_ps(
        _mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(3, 3, 3, 3)), row3));

    _mm_storeu_ps(r[i], destrow);
  }
#else
  // SSE support compiled out: use the portable implementation.
  MulMatricesC(m1, m2, r);
#endif // !NO_ASM && !NOSSE
}


  // Selects the matrix-multiplication implementation at start-up.
  //
  // On non-NEON release builds (neither __ARM_NEON__ nor _DEBUG defined) the
  // CPU is queried with CPUID leaf 1; when EDX bit 25 (SSE) is set,
  // MulMatrices is rebound to MulMatricesSSE. On NEON or debug builds, or
  // when NO_ASM / NOSSE is defined, the function leaves the bindings alone.
  // If the CPUID probe faults, the function returns without changing anything.
  //
  // The log line now reports SSE, which is what is actually detected; the
  // previous text claimed 3DNOW!.
  void math_init()
  {
#ifndef __ARM_NEON__
#ifndef _DEBUG
    int IsSSE = FALSE;
#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
    int edx, eax;
    GLIDE64_TRY
    {
  #if defined(__x86_64__)
      // CPUID overwrites rbx and rcx as well, so they are listed as clobbers.
      asm volatile(" cpuid;        "
        : "=a"(eax), "=d"(edx)
        : "0"(1)
        : "rbx", "rcx"
        );
  #else
      // 32-bit build: ebx and ecx are saved and restored by hand around CPUID.
      asm volatile(" push %%ebx;   "
        " push %%ecx;   "
        " cpuid;        "
        " pop %%ecx;    "
        " pop %%ebx;    "
        : "=a"(eax), "=d"(edx)
        : "0"(1)
        :
      );
  #endif
    }
    GLIDE64_CATCH
      { return; }
    // Check for SSE
    if (edx & (1 << 25))
      IsSSE = TRUE;
#elif !defined(NO_ASM) && !defined(NOSSE)
    DWORD dwEdx;
    __try
    {
      __asm
      {
        mov  eax,1
        cpuid
        mov dwEdx,edx
      }
    }
    __except(EXCEPTION_EXECUTE_HANDLER)
    {
      return;
    }

    // Bit 25 is the SSE flag checked above for GCC; bit 24 is required too
    // (presumably the FXSR flag - confirm against the CPUID documentation).
    if (dwEdx & (1<<25))
    {
      if (dwEdx & (1<<24))
      {
        // Execute one SSE instruction to verify it does not fault.
        __try
        {
          __asm xorps xmm0, xmm0
          IsSSE = TRUE;
        }
        __except(EXCEPTION_EXECUTE_HANDLER)
        {
          return;
        }
      }
    }
#endif // compiler-specific CPUID probe
    if (IsSSE)
    {
      MulMatrices = MulMatricesSSE;
      LOG("SSE detected.\n");
    }

#endif //_DEBUG
#endif	//__ARM_NEON__
  }
Commit	Line	Data
98e75f2d	1	/*
	2	* Glide64 - Glide video plugin for Nintendo 64 emulators.
	3	* Copyright (c) 2002 Dave2001
	4	* Copyright (c) 2003-2009 Sergey 'Gonetz' Lipski
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	19	*/
	20
	21	//****************************************************************
	22	//
	23	// Glide64 - Glide Plugin for Nintendo 64 emulators
	24	// Project started on December 29th, 2001
	25	//
	26	// Authors:
	27	// Dave2001, original author, founded the project in 2001, left it in 2002
	28	// Gugaman, joined the project in 2002, left it in 2002
	29	// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
	30	// Hiroshi 'KoolSmoky' Morii, joined the project in 2007
	31	//
	32	//****************************************************************
	33	//
	34	// To modify Glide64:
	35	// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
	36	// * Do NOT send me the whole project or file that you modified. Take out your modified code sections, and tell me where to put them. If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
	37	//
	38	//****************************************************************
	39
	40	#include "Gfx_1.3.h"
	41	extern "C" {
	42	#ifndef NOSSE
	43	#include <xmmintrin.h>
	44	#endif
	45	}
	46
	47	#include <math.h>
	48	#include "3dmath.h"
	49
	50	void calc_light (VERTEX *v)
	51	{
	52	float light_intensity = 0.0f;
	53	register float color[3] = {rdp.light[rdp.num_lights].r, rdp.light[rdp.num_lights].g, rdp.light[rdp.num_lights].b};
	54	for (wxUint32 l=0; l<rdp.num_lights; l++)
	55	{
	56	light_intensity = DotProduct (rdp.light_vector[l], v->vec);
	57
	58	if (light_intensity > 0.0f)
	59	{
	60	color[0] += rdp.light[l].r * light_intensity;
	61	color[1] += rdp.light[l].g * light_intensity;
	62	color[2] += rdp.light[l].b * light_intensity;
	63	}
	64	}
65
66	if (color[0] > 1.0f) color[0] = 1.0f;
67	if (color[1] > 1.0f) color[1] = 1.0f;
68	if (color[2] > 1.0f) color[2] = 1.0f;
69
70	v->r = (wxUint8)(color[0]*255.0f);
71	v->g = (wxUint8)(color[1]*255.0f);
72	v->b = (wxUint8)(color[2]*255.0f);
73	}
74
75	//*
76	void calc_linear (VERTEX *v)
77	{
78	if (settings.force_calc_sphere)
79	{
80	calc_sphere(v);
81	return;
82	}
83	DECLAREALIGN16VAR(vec[3]);
84
85	TransformVector (v->vec, vec, rdp.model);
86	// TransformVector (v->vec, vec, rdp.combined);
87	NormalizeVector (vec);
88	float x, y;
89	if (!rdp.use_lookat)
90	{
91	x = vec[0];
92	y = vec[1];
93	}
94	else
95	{
96	x = DotProduct (rdp.lookat[0], vec);
97	y = DotProduct (rdp.lookat[1], vec);
98	}
99
100	if (x > 1.0f)
101	x = 1.0f;
102	else if (x < -1.0f)
103	x = -1.0f;
104	if (y > 1.0f)
105	y = 1.0f;
106	else if (y < -1.0f)
107	y = -1.0f;
108
109	if (rdp.cur_cache[0])
110	{
111	// scale >> 6 is size to map to
112	v->ou = (acosf(x)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_s_scale >> 6);
113	v->ov = (acosf(y)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_t_scale >> 6);
114	}
115	v->uv_scaled = 1;
116	#ifdef EXTREME_LOGGING
117	FRDP ("calc linear u: %f, v: %f\n", v->ou, v->ov);
118	#endif
119	}
120
121	void calc_sphere (VERTEX *v)
122	{
123	// LRDP("calc_sphere\n");
124	DECLAREALIGN16VAR(vec[3]);
125	int s_scale, t_scale;
126	if (settings.hacks&hack_Chopper)
127	{
128	s_scale = min(rdp.tiles[rdp.cur_tile].org_s_scale >> 6, rdp.tiles[rdp.cur_tile].lr_s);
129	t_scale = min(rdp.tiles[rdp.cur_tile].org_t_scale >> 6, rdp.tiles[rdp.cur_tile].lr_t);
130	}
131	else
132	{
133	s_scale = rdp.tiles[rdp.cur_tile].org_s_scale >> 6;
134	t_scale = rdp.tiles[rdp.cur_tile].org_t_scale >> 6;
135	}
136	TransformVector (v->vec, vec, rdp.model);
137	// TransformVector (v->vec, vec, rdp.combined);
138	NormalizeVector (vec);
139	float x, y;
140	if (!rdp.use_lookat)
141	{
142	x = vec[0];
143	y = vec[1];
144	}
145	else
146	{
147	x = DotProduct (rdp.lookat[0], vec);
148	y = DotProduct (rdp.lookat[1], vec);
149	}
150	v->ou = (x * 0.5f + 0.5f) * s_scale;
151	v->ov = (y * 0.5f + 0.5f) * t_scale;
152	v->uv_scaled = 1;
153	#ifdef EXTREME_LOGGING
154	FRDP ("calc sphere u: %f, v: %f\n", v->ou, v->ov);
155	#endif
156	}
157
	158	float DotProductC(register float *v1, register float *v2)
159	{
160	register float result;
	161	result = v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
162	return(result);
163	}
164
165	void NormalizeVectorC(float *v)
166	{
167	register float len;
	168	len = sqrtf(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
169	if (len > 0.0f)
170	{
171	v[0] /= len;
172	v[1] /= len;
173	v[2] /= len;
174	}
175	}
176
	177	void TransformVectorC(float *src, float *dst, float mat[4][4])
178	{
	179	dst[0] = mat[0][0]*src[0] + mat[1][0]*src[1] + mat[2][0]*src[2];
	180	dst[1] = mat[0][1]*src[0] + mat[1][1]*src[1] + mat[2][1]*src[2];
	181	dst[2] = mat[0][2]*src[0] + mat[1][2]*src[1] + mat[2][2]*src[2];
182	}
183
	184	void InverseTransformVectorC (float *src, float *dst, float mat[4][4])
185	{
	186	dst[0] = mat[0][0]*src[0] + mat[0][1]*src[1] + mat[0][2]*src[2];
	187	dst[1] = mat[1][0]*src[0] + mat[1][1]*src[1] + mat[1][2]*src[2];
	188	dst[2] = mat[2][0]*src[0] + mat[2][1]*src[1] + mat[2][2]*src[2];
189	}
190
191	/*
192	void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
193	{
194	for (int i=0; i<4; i++)
195	{
196	for (int j=0; j<4; j++)
197	{
198	r[i][j] = m1[i][0] * m2[0][j] +
199	m1[i][1] * m2[1][j] +
200	m1[i][2] * m2[2][j] +
201	m1[i][3] * m2[3][j];
202	}
203	}
204	}
205	*/
206	void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
207	{
208	for (int j=0; j<4; j++)
209	{
210	r[0][j] = m1[0][0] * m2[0][j] +
211	m1[0][1] * m2[1][j] +
212	m1[0][2] * m2[2][j] +
213	m1[0][3] * m2[3][j];
214	r[1][j] = m1[1][0] * m2[0][j] +
215	m1[1][1] * m2[1][j] +
216	m1[1][2] * m2[2][j] +
217	m1[1][3] * m2[3][j];
218	r[2][j] = m1[2][0] * m2[0][j] +
219	m1[2][1] * m2[1][j] +
220	m1[2][2] * m2[2][j] +
221	m1[2][3] * m2[3][j];
222	r[3][j] = m1[3][0] * m2[0][j] +
223	m1[3][1] * m2[1][j] +
224	m1[3][2] * m2[2][j] +
225	m1[3][3] * m2[3][j];
226	}
227	}
228
229	#ifdef __ARM_NEON__
230	void MultMatrix_neon( float m0[4][4], float m1[4][4], float dest[4][4])
231	{
232	asm volatile (
233	"vld1.32 {d0, d1}, [%1]! \n\t" //q0 = m1
234	"vld1.32 {d2, d3}, [%1]! \n\t" //q1 = m1+4
235	"vld1.32 {d4, d5}, [%1]! \n\t" //q2 = m1+8
236	"vld1.32 {d6, d7}, [%1] \n\t" //q3 = m1+12
237	"vld1.32 {d16, d17}, [%0]! \n\t" //q8 = m0
238	"vld1.32 {d18, d19}, [%0]! \n\t" //q9 = m0+4
239	"vld1.32 {d20, d21}, [%0]! \n\t" //q10 = m0+8
240	"vld1.32 {d22, d23}, [%0] \n\t" //q11 = m0+12
241
242	"vmul.f32 q12, q8, d0[0] \n\t" //q12 = q8 * d0[0]
243	"vmul.f32 q13, q8, d2[0] \n\t" //q13 = q8 * d2[0]
244	"vmul.f32 q14, q8, d4[0] \n\t" //q14 = q8 * d4[0]
245	"vmul.f32 q15, q8, d6[0] \n\t" //q15 = q8 * d6[0]
246	"vmla.f32 q12, q9, d0[1] \n\t" //q12 = q9 * d0[1]
247	"vmla.f32 q13, q9, d2[1] \n\t" //q13 = q9 * d2[1]
248	"vmla.f32 q14, q9, d4[1] \n\t" //q14 = q9 * d4[1]
249	"vmla.f32 q15, q9, d6[1] \n\t" //q15 = q9 * d6[1]
250	"vmla.f32 q12, q10, d1[0] \n\t" //q12 = q10 * d0[0]
251	"vmla.f32 q13, q10, d3[0] \n\t" //q13 = q10 * d2[0]
252	"vmla.f32 q14, q10, d5[0] \n\t" //q14 = q10 * d4[0]
253	"vmla.f32 q15, q10, d7[0] \n\t" //q15 = q10 * d6[0]
254	"vmla.f32 q12, q11, d1[1] \n\t" //q12 = q11 * d0[1]
255	"vmla.f32 q13, q11, d3[1] \n\t" //q13 = q11 * d2[1]
256	"vmla.f32 q14, q11, d5[1] \n\t" //q14 = q11 * d4[1]
257	"vmla.f32 q15, q11, d7[1] \n\t" //q15 = q11 * d6[1]
258
259	"vst1.32 {d24, d25}, [%2]! \n\t" //d = q12
260	"vst1.32 {d26, d27}, [%2]! \n\t" //d+4 = q13
261	"vst1.32 {d28, d29}, [%2]! \n\t" //d+8 = q14
262	"vst1.32 {d30, d31}, [%2] \n\t" //d+12 = q15
263
264	:"+r"(m1), "+r"(m0), "+r"(dest):
265	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
266	"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
267	"d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
268	"memory"
269	);
270	}
271
272	void Normalize_neon(float v[3])
273	{
274	asm volatile (
275	"vld1.32 {d4}, [%0]! \n\t" //d4={x,y}
276	"flds s10, [%0] \n\t" //d5[0] = z
277	"sub %0, %0, #8 \n\t" //d5[0] = z
278	"vmul.f32 d0, d4, d4 \n\t" //d0= d4*d4
279	"vpadd.f32 d0, d0, d0 \n\t" //d0 = d[0] + d[1]
280	"vmla.f32 d0, d5, d5 \n\t" //d0 = d0 + d5*d5
281
282	"vmov.f32 d1, d0 \n\t" //d1 = d0
283	"vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0)
284	"vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
285	"vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
286	"vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3
287	"vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
288	"vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d3) / 2
289	"vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4
290
291	"vmul.f32 q2, q2, d0[0] \n\t" //d0= d2*d4
292	"vst1.32 {d4}, [%0]! \n\t" //d2={x0,y0}, d3={z0, w0}
293	"fsts s10, [%0] \n\t" //d2={x0,y0}, d3={z0, w0}
294
295	:"+r"(v) :
296	: "d0", "d1", "d2", "d3", "d4", "d5", "memory"
297	);
298	}
299
300	float DotProduct_neon( float v0[3], float v1[3] )
301	{
302	float dot;
303	asm volatile (
304	"vld1.32 {d8}, [%1]! \n\t" //d8={x0,y0}
305	"vld1.32 {d10}, [%2]! \n\t" //d10={x1,y1}
306	"flds s18, [%1, #0] \n\t" //d9[0]={z0}
307	"flds s22, [%2, #0] \n\t" //d11[0]={z1}
308	"vmul.f32 d12, d8, d10 \n\t" //d0= d2*d4
309	"vpadd.f32 d12, d12, d12 \n\t" //d0 = d[0] + d[1]
310	"vmla.f32 d12, d9, d11 \n\t" //d0 = d0 + d3*d5
311	"fmrs %0, s24 \n\t" //r0 = s0
312	: "=r"(dot), "+r"(v0), "+r"(v1):
313	: "d8", "d9", "d10", "d11", "d12"
314
315	);
316	return dot;
317	}
318
319	#endif
320
321	// 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication
322	// and 3DNOW! 4x4 4x4 matrix multiplication
323	// 2011-01-03 Balrog - removed because is in NASM format and not 64-bit compatible
324	// This will need fixing.
325	#ifndef __ARM_NEON__
326	MULMATRIX MulMatrices = MulMatricesC;
327	TRANSFORMVECTOR TransformVector = TransformVectorC;
328	TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
329	DOTPRODUCT DotProduct = DotProductC;
330	NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;
331	#endif
332
333	void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
334	{
335	#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
336	/* [row][col]*/
337	typedef float v4sf __attribute__ ((vector_size (16)));
338	v4sf row0 = _mm_loadu_ps(m2[0]);
339	v4sf row1 = _mm_loadu_ps(m2[1]);
340	v4sf row2 = _mm_loadu_ps(m2[2]);
341	v4sf row3 = _mm_loadu_ps(m2[3]);
342
343	for (int i = 0; i < 4; ++i)
344	{
345	v4sf leftrow = _mm_loadu_ps(m1[i]);
346
347	// Fill tmp with four copies of leftrow[0]
348	v4sf tmp = leftrow;
349	tmp = _mm_shuffle_ps (tmp, tmp, 0);
350	// Calculate the four first summands
351	v4sf destrow = tmp * row0;
352
353	// Fill tmp with four copies of leftrow[1]
354	tmp = leftrow;
355	tmp = _mm_shuffle_ps (tmp, tmp, 1 + (1 << 2) + (1 << 4) + (1 << 6));
356	destrow += tmp * row1;
357
358	// Fill tmp with four copies of leftrow[2]
359	tmp = leftrow;
360	tmp = _mm_shuffle_ps (tmp, tmp, 2 + (2 << 2) + (2 << 4) + (2 << 6));
361	destrow += tmp * row2;
362
363	// Fill tmp with four copies of leftrow[3]
364	tmp = leftrow;
365	tmp = _mm_shuffle_ps (tmp, tmp, 3 + (3 << 2) + (3 << 4) + (3 << 6));
366	destrow += tmp * row3;
367
368	__builtin_ia32_storeups(r[i], destrow);
369	}
370	#elif !defined(NO_ASM) && !defined(NOSSE)
371	__asm
372	{
373	mov eax, dword ptr [r]
374	mov ecx, dword ptr [m1]
375	mov edx, dword ptr [m2]
376
377	movaps xmm0,[edx]
378	movaps xmm1,[edx+16]
379	movaps xmm2,[edx+32]
380	movaps xmm3,[edx+48]
381
382	// r[0][0],r[0][1],r[0][2],r[0][3]
383
384	movaps xmm4,xmmword ptr[ecx]
385	movaps xmm5,xmm4
386	movaps xmm6,xmm4
387	movaps xmm7,xmm4
388
389	shufps xmm4,xmm4,00000000b
390	shufps xmm5,xmm5,01010101b
391	shufps xmm6,xmm6,10101010b
392	shufps xmm7,xmm7,11111111b
393
394	mulps xmm4,xmm0
395	mulps xmm5,xmm1
396	mulps xmm6,xmm2
397	mulps xmm7,xmm3
398
399	addps xmm4,xmm5
400	addps xmm4,xmm6
401	addps xmm4,xmm7
402
403	movaps xmmword ptr[eax],xmm4
404
405	// r[1][0],r[1][1],r[1][2],r[1][3]
406
407	movaps xmm4,xmmword ptr[ecx+16]
408	movaps xmm5,xmm4
409	movaps xmm6,xmm4
410	movaps xmm7,xmm4
411
412	shufps xmm4,xmm4,00000000b
413	shufps xmm5,xmm5,01010101b
414	shufps xmm6,xmm6,10101010b
415	shufps xmm7,xmm7,11111111b
416
417	mulps xmm4,xmm0
418	mulps xmm5,xmm1
419	mulps xmm6,xmm2
420	mulps xmm7,xmm3
421
422	addps xmm4,xmm5
423	addps xmm4,xmm6
424	addps xmm4,xmm7
425
426	movaps xmmword ptr[eax+16],xmm4
427
428
429	// r[2][0],r[2][1],r[2][2],r[2][3]
430
431	movaps xmm4,xmmword ptr[ecx+32]
432	movaps xmm5,xmm4
433	movaps xmm6,xmm4
434	movaps xmm7,xmm4
435
436	shufps xmm4,xmm4,00000000b
437	shufps xmm5,xmm5,01010101b
438	shufps xmm6,xmm6,10101010b
439	shufps xmm7,xmm7,11111111b
440
441	mulps xmm4,xmm0
442	mulps xmm5,xmm1
443	mulps xmm6,xmm2
444	mulps xmm7,xmm3
445
446	addps xmm4,xmm5
447	addps xmm4,xmm6
448	addps xmm4,xmm7
449
450	movaps xmmword ptr[eax+32],xmm4
451
452	// r[3][0],r[3][1],r[3][2],r[3][3]
453
454	movaps xmm4,xmmword ptr[ecx+48]
455	movaps xmm5,xmm4
456	movaps xmm6,xmm4
457	movaps xmm7,xmm4
458
459	shufps xmm4,xmm4,00000000b
460	shufps xmm5,xmm5,01010101b
461	shufps xmm6,xmm6,10101010b
462	shufps xmm7,xmm7,11111111b
463
464	mulps xmm4,xmm0
465	mulps xmm5,xmm1
466	mulps xmm6,xmm2
467	mulps xmm7,xmm3
468
469	addps xmm4,xmm5
470	addps xmm4,xmm6
471	addps xmm4,xmm7
472
473	movaps xmmword ptr[eax+48],xmm4
474	}
475	#endif // _WIN32
476	}
477
478
479
480	void math_init()
481	{
482	#ifndef __ARM_NEON__
483	#ifndef _DEBUG
484	int IsSSE = FALSE;
485	#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
486	int edx, eax;
487	GLIDE64_TRY
488	{
489	#if defined(__x86_64__)
490	asm volatile(" cpuid; "
491	: "=a"(eax), "=d"(edx)
492	: "0"(1)
493	: "rbx", "rcx"
494	);
495	#else
496	asm volatile(" push %%ebx; "
497	" push %%ecx; "
498	" cpuid; "
499	" pop %%ecx; "
500	" pop %%ebx; "
501	: "=a"(eax), "=d"(edx)
502	: "0"(1)
503	:
504	);
505	#endif
506	}
507	GLIDE64_CATCH
508	{ return; }
509	// Check for SSE
510	if (edx & (1 << 25))
511	IsSSE = TRUE;
512	#elif !defined(NO_ASM) && !defined(NOSSE)
513	DWORD dwEdx;
514	__try
515	{
516	__asm
517	{
518	mov eax,1
519	cpuid
520	mov dwEdx,edx
521	}
522	}
523	__except(EXCEPTION_EXECUTE_HANDLER)
524	{
525	return;
526	}
527
528	if (dwEdx & (1<<25))
529	{
530	if (dwEdx & (1<<24))
531	{
532	__try
533	{
534	__asm xorps xmm0, xmm0
535	IsSSE = TRUE;
536	}
537	__except(EXCEPTION_EXECUTE_HANDLER)
538	{
539	return;
540	}
541	}
542	}
543	#endif // _WIN32
544	if (IsSSE)
545	{
546	MulMatrices = MulMatricesSSE;
547	LOG("3DNOW! detected.\n");
548	}
549
550	#endif //_DEBUG
551	#endif //__ARM_NEON__
552	}