[mupen64plus-pandora.git] / 3dmath.cpp

/*
* Glide64 - Glide video plugin for Nintendo 64 emulators.
* Copyright (c) 2002  Dave2001
* Copyright (c) 2003-2009  Sergey 'Gonetz' Lipski
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

//****************************************************************
//
// Glide64 - Glide Plugin for Nintendo 64 emulators
// Project started on December 29th, 2001
//
// Authors:
// Dave2001, original author, founded the project in 2001, left it in 2002
// Gugaman, joined the project in 2002, left it in 2002
// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
// Hiroshi 'KoolSmoky' Morii, joined the project in 2007
//
//****************************************************************
//
// To modify Glide64:
// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
// * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
//
//****************************************************************

#include "Gfx_1.3.h"
extern "C" {
#ifndef NOSSE
#include <xmmintrin.h>
#endif
}

#include <math.h>
#include "3dmath.h"

// Computes the lit colour of a vertex and stores it in v->r / v->g / v->b.
//
// The colour starts from rdp.light[rdp.num_lights] (the entry just past the
// indexed lights - presumably the ambient term; confirm against the loader).
// Each light l < rdp.num_lights then adds its colour, weighted by the dot
// product of rdp.light_vector[l] with v->vec, but only when that product is
// positive.  Every channel is clamped to 1.0 before being scaled to 8 bits.
void calc_light (VERTEX *v)
{
  // Plain automatic array: the 'register' storage class was removed in C++17
  // and never had a useful meaning for an array that is indexed.
  float color[3] = {rdp.light[rdp.num_lights].r, rdp.light[rdp.num_lights].g, rdp.light[rdp.num_lights].b};

  for (wxUint32 l=0; l<rdp.num_lights; l++)
  {
    const float light_intensity = DotProduct (rdp.light_vector[l], v->vec);

    // Lights with a non-positive dot product contribute nothing.
    if (light_intensity > 0.0f)
    {
      color[0] += rdp.light[l].r * light_intensity;
      color[1] += rdp.light[l].g * light_intensity;
      color[2] += rdp.light[l].b * light_intensity;
    }
  }

  // Saturate so the 8-bit conversion below cannot exceed 255.
  for (int c = 0; c < 3; c++)
  {
    if (color[c] > 1.0f)
      color[c] = 1.0f;
  }

  v->r = (wxUint8)(color[0]*255.0f);
  v->g = (wxUint8)(color[1]*255.0f);
  v->b = (wxUint8)(color[2]*255.0f);
}

//*
// Generates "linear" texture coordinates for a vertex.
//
// When settings.force_calc_sphere is set the work is delegated to
// calc_sphere().  Otherwise v->vec is transformed by rdp.model and
// normalised; the two values used are either the first two components of
// that vector or, when rdp.use_lookat is set, its dot products with
// rdp.lookat[0] and rdp.lookat[1].  Both are clamped to [-1, 1] and, if
// rdp.cur_cache[0] is non-null, converted with acosf(value)/pi and scaled by
// the current tile's org_s_scale / org_t_scale shifted right by 6.
// v->uv_scaled is set to 1 in every case.
void calc_linear (VERTEX *v)
{
  if (settings.force_calc_sphere)
  {
    calc_sphere(v);
    return;
  }

  DECLAREALIGN16VAR(vec[3]);
  TransformVector (v->vec, vec, rdp.model);
  NormalizeVector (vec);

  float nx, ny;
  if (rdp.use_lookat)
  {
    nx = DotProduct (rdp.lookat[0], vec);
    ny = DotProduct (rdp.lookat[1], vec);
  }
  else
  {
    nx = vec[0];
    ny = vec[1];
  }

  // Keep both values inside acosf()'s domain.
  nx = (nx > 1.0f) ? 1.0f : ((nx < -1.0f) ? -1.0f : nx);
  ny = (ny > 1.0f) ? 1.0f : ((ny < -1.0f) ? -1.0f : ny);

  if (rdp.cur_cache[0])
  {
    // scale >> 6 is the size to map to
    const float s_fraction = acosf(nx)/3.141592654f;
    const float t_fraction = acosf(ny)/3.141592654f;
    v->ou = s_fraction * (rdp.tiles[rdp.cur_tile].org_s_scale >> 6);
    v->ov = t_fraction * (rdp.tiles[rdp.cur_tile].org_t_scale >> 6);
  }
  v->uv_scaled = 1;
#ifdef EXTREME_LOGGING
  FRDP ("calc linear u: %f, v: %f\n", v->ou, v->ov);
#endif
}

// Generates sphere-mapped texture coordinates for a vertex.
//
// v->vec is transformed by rdp.model and normalised.  The two values used are
// either the first two components of that vector or, when rdp.use_lookat is
// set, its dot products with rdp.lookat[0] and rdp.lookat[1].  Each value is
// passed through value*0.5 + 0.5 and multiplied by the current tile's scale
// (org_s_scale / org_t_scale shifted right by 6).  With hack_Chopper enabled
// the scale is additionally limited to the tile's lr_s / lr_t.
// v->uv_scaled is set to 1.
void calc_sphere (VERTEX *v)
{
  DECLAREALIGN16VAR(vec[3]);
  TransformVector (v->vec, vec, rdp.model);
  NormalizeVector (vec);

  float nx, ny;
  if (rdp.use_lookat)
  {
    nx = DotProduct (rdp.lookat[0], vec);
    ny = DotProduct (rdp.lookat[1], vec);
  }
  else
  {
    nx = vec[0];
    ny = vec[1];
  }

  int s_scale, t_scale;
  if (settings.hacks&hack_Chopper)
  {
    s_scale = min(rdp.tiles[rdp.cur_tile].org_s_scale >> 6, rdp.tiles[rdp.cur_tile].lr_s);
    t_scale = min(rdp.tiles[rdp.cur_tile].org_t_scale >> 6, rdp.tiles[rdp.cur_tile].lr_t);
  }
  else
  {
    s_scale = rdp.tiles[rdp.cur_tile].org_s_scale >> 6;
    t_scale = rdp.tiles[rdp.cur_tile].org_t_scale >> 6;
  }

  v->ou = (nx * 0.5f + 0.5f) * s_scale;
  v->ov = (ny * 0.5f + 0.5f) * t_scale;
  v->uv_scaled = 1;
#ifdef EXTREME_LOGGING
  FRDP ("calc sphere u: %f, v: %f\n", v->ou, v->ov);
#endif
}

// Portable C implementation of the 3-component dot product.
// Reads elements [0..2] of v1 and v2 and returns
// v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2].
// The 'register' storage class was dropped: it was removed in C++17 and is
// not part of the function's type, so callers are unaffected.
float DotProductC(float *v1, float *v2)
{
    return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
}

// Portable C implementation of in-place 3-component vector normalisation.
// Divides v[0..2] by the vector's Euclidean length.  A vector whose length is
// not greater than zero is left unmodified.
// The 'register' storage class was dropped (removed in C++17).
void NormalizeVectorC(float *v)
{
    const float len = sqrtf(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
    if (len > 0.0f)
    {
        v[0] /= len;
        v[1] /= len;
        v[2] /= len;
    }
}

// Portable C implementation of the 3-component vector transform.
// Computes dst[j] = mat[0][j]*src[0] + mat[1][j]*src[1] + mat[2][j]*src[2]
// for j = 0..2; row 3 and column 3 of mat are not used.
// The source components are read before anything is written, so the call is
// also correct when src and dst refer to the same storage.
void TransformVectorC(float *src, float *dst, float mat[4][4])
{
  const float x = src[0];
  const float y = src[1];
  const float z = src[2];

  dst[0] = mat[0][0]*x + mat[1][0]*y + mat[2][0]*z;
  dst[1] = mat[0][1]*x + mat[1][1]*y + mat[2][1]*z;
  dst[2] = mat[0][2]*x + mat[1][2]*y + mat[2][2]*z;
}

// Portable C implementation of the transposed 3-component vector transform.
// Computes dst[i] = mat[i][0]*src[0] + mat[i][1]*src[1] + mat[i][2]*src[2]
// for i = 0..2, i.e. it applies the transpose of the 3x3 block used by
// TransformVectorC; row 3 and column 3 of mat are not used.
// The source components are read before anything is written, so the call is
// also correct when src and dst refer to the same storage.
void InverseTransformVectorC (float *src, float *dst, float mat[4][4])
{
  const float x = src[0];
  const float y = src[1];
  const float z = src[2];

  dst[0] = mat[0][0]*x + mat[0][1]*y + mat[0][2]*z;
  dst[1] = mat[1][0]*x + mat[1][1]*y + mat[1][2]*z;
  dst[2] = mat[2][0]*x + mat[2][1]*y + mat[2][2]*z;
}

/*
void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
{
  for (int i=0; i<4; i++)
  {
    for (int j=0; j<4; j++)
    {
      r[i][j] = m1[i][0] * m2[0][j] +
                m1[i][1] * m2[1][j] +
                m1[i][2] * m2[2][j] +
                m1[i][3] * m2[3][j];
    }
  }
}
*/
// Portable C implementation of the 4x4 matrix product r = m1 * m2, with
// matrices indexed as [row][col].
// The result is produced one column at a time (rows 0..3 of column 0, then
// column 1, and so on).  r is written while m1 and m2 are still being read,
// so r must not be the same storage as either input.
void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
{
  for (int col = 0; col < 4; ++col)
  {
    for (int row = 0; row < 4; ++row)
    {
      r[row][col] = m1[row][0] * m2[0][col] +
                    m1[row][1] * m2[1][col] +
                    m1[row][2] * m2[2][col] +
                    m1[row][3] * m2[3][col];
    }
  }
}

#ifdef __ARM_NEON__
// NEON implementation of the 4x4 matrix product dest = m0 * m1, using the
// same [row][col] layout as MulMatricesC.
//
// Operand numbering follows the output list below: %0 is m1, %1 is m0 and
// %2 is dest.  Rows of m0 are loaded into q0-q3 and rows of m1 into q8-q11;
// result row i is accumulated in q12+i as
//   m1 row 0 * m0[i][0] + m1 row 1 * m0[i][1] + m1 row 2 * m0[i][2] + m1 row 3 * m0[i][3].
// All loads are issued before the first store.
void MultMatrix_neon( float m0[4][4], float m1[4][4], float dest[4][4])
{
    asm volatile (
	"vld1.32 		{d0, d1}, [%1]!			\n\t"	//q0 = m0 row 0 (%1 is m0)
	"vld1.32 		{d2, d3}, [%1]!	    	\n\t"	//q1 = m0 row 1
	"vld1.32 		{d4, d5}, [%1]!	    	\n\t"	//q2 = m0 row 2
	"vld1.32 		{d6, d7}, [%1]	    	\n\t"	//q3 = m0 row 3
	"vld1.32 		{d16, d17}, [%0]!		\n\t"	//q8 = m1 row 0 (%0 is m1)
	"vld1.32 		{d18, d19}, [%0]!   	\n\t"	//q9 = m1 row 1
	"vld1.32 		{d20, d21}, [%0]!   	\n\t"	//q10 = m1 row 2
	"vld1.32 		{d22, d23}, [%0]    	\n\t"	//q11 = m1 row 3

	"vmul.f32 		q12, q8, d0[0] 			\n\t"	//q12 = q8 * d0[0]   (m0[0][0])
	"vmul.f32 		q13, q8, d2[0] 		    \n\t"	//q13 = q8 * d2[0]   (m0[1][0])
	"vmul.f32 		q14, q8, d4[0] 		    \n\t"	//q14 = q8 * d4[0]   (m0[2][0])
	"vmul.f32 		q15, q8, d6[0]	 		\n\t"	//q15 = q8 * d6[0]   (m0[3][0])
	"vmla.f32 		q12, q9, d0[1] 			\n\t"	//q12 += q9 * d0[1]  (m0[0][1])
	"vmla.f32 		q13, q9, d2[1] 		    \n\t"	//q13 += q9 * d2[1]  (m0[1][1])
	"vmla.f32 		q14, q9, d4[1] 		    \n\t"	//q14 += q9 * d4[1]  (m0[2][1])
	"vmla.f32 		q15, q9, d6[1] 		    \n\t"	//q15 += q9 * d6[1]  (m0[3][1])
	"vmla.f32 		q12, q10, d1[0] 		\n\t"	//q12 += q10 * d1[0] (m0[0][2])
	"vmla.f32 		q13, q10, d3[0] 		\n\t"	//q13 += q10 * d3[0] (m0[1][2])
	"vmla.f32 		q14, q10, d5[0] 		\n\t"	//q14 += q10 * d5[0] (m0[2][2])
	"vmla.f32 		q15, q10, d7[0] 		\n\t"	//q15 += q10 * d7[0] (m0[3][2])
	"vmla.f32 		q12, q11, d1[1] 		\n\t"	//q12 += q11 * d1[1] (m0[0][3])
	"vmla.f32 		q13, q11, d3[1] 		\n\t"	//q13 += q11 * d3[1] (m0[1][3])
	"vmla.f32 		q14, q11, d5[1] 		\n\t"	//q14 += q11 * d5[1] (m0[2][3])
	"vmla.f32 		q15, q11, d7[1]	 	    \n\t"	//q15 += q11 * d7[1] (m0[3][3])

	"vst1.32 		{d24, d25}, [%2]! 		\n\t"	//dest row 0 = q12
	"vst1.32 		{d26, d27}, [%2]! 	    \n\t"	//dest row 1 = q13
	"vst1.32 		{d28, d29}, [%2]! 	    \n\t"	//dest row 2 = q14
	"vst1.32 		{d30, d31}, [%2] 	    \n\t"	//dest row 3 = q15

	// Pointers are read-write operands because the post-increment
	// addressing above modifies them; there are no pure inputs.
	:"+r"(m1), "+r"(m0), "+r"(dest):
    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
    "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
    "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
    "memory"
	);
}

// NEON implementation of in-place 3-component vector normalisation.
// Computes the squared length, estimates its reciprocal square root with
// vrsqrte refined by two vrsqrts steps, and scales x, y and z by the result.
// NOTE(review): unlike NormalizeVectorC there is no zero-length guard here;
// the result for an all-zero vector should be confirmed on hardware.
void Normalize_neon(float v[3])
{
	asm volatile (
	"vld1.32 		{d4}, [%0]!	    		\n\t"	//d4 = {x, y}; %0 now points at z
	"flds    		s10, [%0]   	    	\n\t"	//d5[0] = z
	"sub    		%0, %0, #8   	    	\n\t"	//rewind %0 back to the start of v
	"vmul.f32 		d0, d4, d4				\n\t"	//d0 = {x*x, y*y}
	"vpadd.f32 		d0, d0, d0				\n\t"	//d0 = x*x + y*y in both lanes
    "vmla.f32 		d0, d5, d5				\n\t"	//d0 += d5*d5 (lane 0 = x*x + y*y + z*z)

	"vmov.f32 		d1, d0					\n\t"	//d1 = d0 (keep the squared length)
	"vrsqrte.f32 	d0, d0					\n\t"	//d0 = ~ 1.0 / sqrt(d0)
	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d2 * d0) / 2
	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3 (first refinement)
	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d2 * d0) / 2
	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3 (second refinement)

	"vmul.f32 		q2, q2, d0[0]			\n\t"	//q2 (d4, d5) = q2 * d0[0], i.e. scale x, y, z
	"vst1.32 		{d4}, [%0]!  			\n\t"	//store x, y; %0 now points at z
	"fsts    		s10, [%0]     			\n\t"	//store z

	// v is a read-write operand because the addressing above modifies it.
	:"+r"(v) :
    : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
	);
}

// NEON implementation of the 3-component dot product:
// returns v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2].
//
// The asm reads the vectors through the pointer operands only, so the
// compiler cannot see those memory accesses.  A "memory" clobber is declared
// so that pending stores to the vectors are completed before the asm runs;
// without it the compiler is free to keep freshly written components in
// registers and the asm would read stale data.
float DotProduct_neon( float v0[3], float v1[3] )
{
    float dot;
	asm volatile (
	"vld1.32 		{d8}, [%1]!			\n\t"	//d8 = {x0, y0}; %1 now points at z0
	"vld1.32 		{d10}, [%2]!		\n\t"	//d10 = {x1, y1}; %2 now points at z1
	"flds 			s18, [%1, #0]	    \n\t"	//d9[0] = z0
	"flds 			s22, [%2, #0]	    \n\t"	//d11[0] = z1
	"vmul.f32 		d12, d8, d10		\n\t"	//d12 = {x0*x1, y0*y1}
	"vpadd.f32 		d12, d12, d12		\n\t"	//d12 = x0*x1 + y0*y1 in both lanes
	"vmla.f32 		d12, d9, d11		\n\t"	//d12 += d9*d11 (lane 0 += z0*z1)
    "fmrs	        %0, s24	    		\n\t"	//dot = d12[0]
	// v0 and v1 are read-write operands because the post-increment
	// addressing above modifies them; there are no pure inputs.
	: "=r"(dot), "+r"(v0), "+r"(v1):
    : "d8", "d9", "d10", "d11", "d12", "memory"

	);
    return dot;
}

#endif

// 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication
//                      and 3DNOW! 4x4 4x4 matrix multiplication
// 2011-01-03 Balrog - removed because it is in NASM format and not 64-bit compatible.
// This will need fixing.
#ifndef __ARM_NEON__
// Run-time dispatch pointers for the math routines.  Each one starts out
// bound to the portable C implementation defined in this file; math_init()
// may rebind MulMatrices to MulMatricesSSE.  These definitions are compiled
// only when __ARM_NEON__ is not defined.
MULMATRIX MulMatrices = MulMatricesC;
TRANSFORMVECTOR TransformVector = TransformVectorC;
TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
DOTPRODUCT DotProduct = DotProductC;
NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;
#endif

// SSE implementation of the 4x4 matrix product r = m1 * m2, with matrices
// indexed as [row][col].
//
// Result row i is m1[i][0]*m2 row 0 + m1[i][1]*m2 row 1 + m1[i][2]*m2 row 2 +
// m1[i][3]*m2 row 3, accumulated in that order.
//
// Three build variants:
//  * GCC-compatible compilers: <xmmintrin.h> intrinsics with unaligned loads
//    and stores.  The store uses _mm_storeu_ps rather than the compiler
//    specific __builtin_ia32_storeups so the code is not tied to one compiler.
//  * Other compilers with inline asm enabled: MSVC-style __asm block; it uses
//    movaps, so all three matrices must be 16-byte aligned on that path.
//  * NO_ASM or NOSSE defined: falls back to MulMatricesC so the function never
//    returns with r left unwritten.
void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
{
#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
  /* [row][col] */
  const __m128 row0 = _mm_loadu_ps(m2[0]);
  const __m128 row1 = _mm_loadu_ps(m2[1]);
  const __m128 row2 = _mm_loadu_ps(m2[2]);
  const __m128 row3 = _mm_loadu_ps(m2[3]);

  for (int i = 0; i < 4; ++i)
  {
    const __m128 leftrow = _mm_loadu_ps(m1[i]);

    // Broadcast leftrow[0] and form the first four summands.
    __m128 destrow = _mm_mul_ps(_mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(0, 0, 0, 0)), row0);

    // Broadcast leftrow[1], leftrow[2], leftrow[3] in turn and accumulate.
    destrow = _mm_add_ps(destrow,
                         _mm_mul_ps(_mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(1, 1, 1, 1)), row1));
    destrow = _mm_add_ps(destrow,
                         _mm_mul_ps(_mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(2, 2, 2, 2)), row2));
    destrow = _mm_add_ps(destrow,
                         _mm_mul_ps(_mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(3, 3, 3, 3)), row3));

    _mm_storeu_ps(r[i], destrow);
  }
#elif !defined(NO_ASM) && !defined(NOSSE)
  __asm
  {
    mov     eax, dword ptr [r]  
      mov     ecx, dword ptr [m1]
      mov     edx, dword ptr [m2]

      movaps  xmm0,[edx]
      movaps  xmm1,[edx+16]
      movaps  xmm2,[edx+32]
      movaps  xmm3,[edx+48]

// r[0][0],r[0][1],r[0][2],r[0][3]

      movaps  xmm4,xmmword ptr[ecx]
      movaps  xmm5,xmm4
      movaps  xmm6,xmm4
      movaps  xmm7,xmm4

      shufps  xmm4,xmm4,00000000b
      shufps  xmm5,xmm5,01010101b
      shufps  xmm6,xmm6,10101010b
      shufps  xmm7,xmm7,11111111b

      mulps   xmm4,xmm0
      mulps   xmm5,xmm1
      mulps   xmm6,xmm2
      mulps   xmm7,xmm3

      addps   xmm4,xmm5
      addps   xmm4,xmm6
      addps   xmm4,xmm7

      movaps  xmmword ptr[eax],xmm4

// r[1][0],r[1][1],r[1][2],r[1][3]

      movaps  xmm4,xmmword ptr[ecx+16]
      movaps  xmm5,xmm4
      movaps  xmm6,xmm4
      movaps  xmm7,xmm4

      shufps  xmm4,xmm4,00000000b
      shufps  xmm5,xmm5,01010101b
      shufps  xmm6,xmm6,10101010b
      shufps  xmm7,xmm7,11111111b

      mulps   xmm4,xmm0
      mulps   xmm5,xmm1
      mulps   xmm6,xmm2
      mulps   xmm7,xmm3

      addps   xmm4,xmm5
      addps   xmm4,xmm6
      addps   xmm4,xmm7

      movaps  xmmword ptr[eax+16],xmm4


// r[2][0],r[2][1],r[2][2],r[2][3]

      movaps  xmm4,xmmword ptr[ecx+32]
      movaps  xmm5,xmm4
      movaps  xmm6,xmm4
      movaps  xmm7,xmm4

      shufps  xmm4,xmm4,00000000b
      shufps  xmm5,xmm5,01010101b
      shufps  xmm6,xmm6,10101010b
      shufps  xmm7,xmm7,11111111b

      mulps   xmm4,xmm0
      mulps   xmm5,xmm1
      mulps   xmm6,xmm2
      mulps   xmm7,xmm3

      addps   xmm4,xmm5
      addps   xmm4,xmm6
      addps   xmm4,xmm7

      movaps  xmmword ptr[eax+32],xmm4

// r[3][0],r[3][1],r[3][2],r[3][3]

      movaps  xmm4,xmmword ptr[ecx+48]
      movaps  xmm5,xmm4
      movaps  xmm6,xmm4
      movaps  xmm7,xmm4

      shufps  xmm4,xmm4,00000000b
      shufps  xmm5,xmm5,01010101b
      shufps  xmm6,xmm6,10101010b
      shufps  xmm7,xmm7,11111111b

      mulps   xmm4,xmm0
      mulps   xmm5,xmm1
      mulps   xmm6,xmm2
      mulps   xmm7,xmm3

      addps   xmm4,xmm5
      addps   xmm4,xmm6
      addps   xmm4,xmm7

      movaps  xmmword ptr[eax+48],xmm4
    }
#else
  // No SSE code path was compiled in: use the portable implementation.
  MulMatricesC(m1, m2, r);
#endif // SSE implementation selection
}


  // One-time selection of the matrix-multiply implementation.
  //
  // In builds without __ARM_NEON__ and without _DEBUG this executes CPUID
  // leaf 1 and, when EDX bit 25 (SSE) is set, rebinds MulMatrices to
  // MulMatricesSSE.  The non-GCC path additionally requires EDX bit 24 and a
  // successful trial execution of an SSE instruction.  If CPUID or the trial
  // instruction faults, the function returns without changing anything.
  // When NO_ASM or NOSSE is defined no detection is compiled in and the C
  // implementation stays selected.  In _DEBUG or __ARM_NEON__ builds the
  // function body is empty.
  void math_init()
  {
#ifndef __ARM_NEON__
#ifndef _DEBUG
    int IsSSE = FALSE;
#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
    int edx, eax;
    GLIDE64_TRY
    {
  #if defined(__x86_64__)
      // CPUID also overwrites EBX and ECX, so they are declared clobbered.
      asm volatile(" cpuid;        "
        : "=a"(eax), "=d"(edx)
        : "0"(1)
        : "rbx", "rcx"
        );
  #else
      // 32-bit build: EBX and ECX are saved and restored by hand instead of
      // being listed as clobbers.
      asm volatile(" push %%ebx;   "
        " push %%ecx;   "
        " cpuid;        "
        " pop %%ecx;    "
        " pop %%ebx;    "
        : "=a"(eax), "=d"(edx)
        : "0"(1)
        :
      );
  #endif
    }
    GLIDE64_CATCH
      { return; }
    // Check for SSE
    if (edx & (1 << 25))
      IsSSE = TRUE;
#elif !defined(NO_ASM) && !defined(NOSSE)
    DWORD dwEdx;
    __try
    {
      __asm
      {
        mov  eax,1
        cpuid
        mov dwEdx,edx
      }
    }
    __except(EXCEPTION_EXECUTE_HANDLER)
    {
      return;
    }

    if (dwEdx & (1<<25))
    {
      if (dwEdx & (1<<24))
      {
        // Execute one SSE instruction to confirm it does not fault.
        __try
        {
          __asm xorps xmm0, xmm0
          IsSSE = TRUE;
        }
        __except(EXCEPTION_EXECUTE_HANDLER)
        {
          return;
        }
      }
    }
#endif // SSE detection
    if (IsSSE)
    {
      MulMatrices = MulMatricesSSE;
      // The feature tested above is SSE, so report it as such
      // (the message used to say "3DNOW!").
      LOG("SSE detected.\n");
    }

#endif //_DEBUG
#endif	//__ARM_NEON__
  }
Commit	Line	Data
	1	/*
	2	* Glide64 - Glide video plugin for Nintendo 64 emulators.
	3	* Copyright (c) 2002 Dave2001
	4	* Copyright (c) 2003-2009 Sergey 'Gonetz' Lipski
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	19	*/
	20
	21	//****************************************************************
	22	//
	23	// Glide64 - Glide Plugin for Nintendo 64 emulators
	24	// Project started on December 29th, 2001
	25	//
	26	// Authors:
	27	// Dave2001, original author, founded the project in 2001, left it in 2002
	28	// Gugaman, joined the project in 2002, left it in 2002
	29	// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
	30	// Hiroshi 'KoolSmoky' Morii, joined the project in 2007
	31	//
	32	//****************************************************************
	33	//
	34	// To modify Glide64:
	35	// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
	36	// * Do NOT send me the whole project or file that you modified. Take out your modified code sections, and tell me where to put them. If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
	37	//
	38	//****************************************************************
	39
	40	#include "Gfx_1.3.h"
	41	extern "C" {
	42	#ifndef NOSSE
	43	#include <xmmintrin.h>
	44	#endif
	45	}
	46
	47	#include <math.h>
	48	#include "3dmath.h"
	49
	50	void calc_light (VERTEX *v)
	51	{
	52	float light_intensity = 0.0f;
	53	register float color[3] = {rdp.light[rdp.num_lights].r, rdp.light[rdp.num_lights].g, rdp.light[rdp.num_lights].b};
	54	for (wxUint32 l=0; l<rdp.num_lights; l++)
	55	{
	56	light_intensity = DotProduct (rdp.light_vector[l], v->vec);
	57
	58	if (light_intensity > 0.0f)
	59	{
	60	color[0] += rdp.light[l].r * light_intensity;
	61	color[1] += rdp.light[l].g * light_intensity;
	62	color[2] += rdp.light[l].b * light_intensity;
	63	}
	64	}
	65
	66	if (color[0] > 1.0f) color[0] = 1.0f;
	67	if (color[1] > 1.0f) color[1] = 1.0f;
	68	if (color[2] > 1.0f) color[2] = 1.0f;
	69
	70	v->r = (wxUint8)(color[0]*255.0f);
	71	v->g = (wxUint8)(color[1]*255.0f);
	72	v->b = (wxUint8)(color[2]*255.0f);
	73	}
	74
	75	//*
	76	void calc_linear (VERTEX *v)
	77	{
	78	if (settings.force_calc_sphere)
	79	{
	80	calc_sphere(v);
	81	return;
	82	}
	83	DECLAREALIGN16VAR(vec[3]);
	84
	85	TransformVector (v->vec, vec, rdp.model);
	86	// TransformVector (v->vec, vec, rdp.combined);
	87	NormalizeVector (vec);
	88	float x, y;
	89	if (!rdp.use_lookat)
	90	{
	91	x = vec[0];
	92	y = vec[1];
	93	}
	94	else
	95	{
	96	x = DotProduct (rdp.lookat[0], vec);
	97	y = DotProduct (rdp.lookat[1], vec);
	98	}
	99
	100	if (x > 1.0f)
	101	x = 1.0f;
	102	else if (x < -1.0f)
	103	x = -1.0f;
	104	if (y > 1.0f)
	105	y = 1.0f;
	106	else if (y < -1.0f)
	107	y = -1.0f;
	108
	109	if (rdp.cur_cache[0])
	110	{
	111	// scale >> 6 is size to map to
	112	v->ou = (acosf(x)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_s_scale >> 6);
	113	v->ov = (acosf(y)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_t_scale >> 6);
	114	}
	115	v->uv_scaled = 1;
	116	#ifdef EXTREME_LOGGING
	117	FRDP ("calc linear u: %f, v: %f\n", v->ou, v->ov);
	118	#endif
	119	}
	120
	121	void calc_sphere (VERTEX *v)
	122	{
	123	// LRDP("calc_sphere\n");
	124	DECLAREALIGN16VAR(vec[3]);
	125	int s_scale, t_scale;
	126	if (settings.hacks&hack_Chopper)
	127	{
	128	s_scale = min(rdp.tiles[rdp.cur_tile].org_s_scale >> 6, rdp.tiles[rdp.cur_tile].lr_s);
	129	t_scale = min(rdp.tiles[rdp.cur_tile].org_t_scale >> 6, rdp.tiles[rdp.cur_tile].lr_t);
	130	}
	131	else
	132	{
	133	s_scale = rdp.tiles[rdp.cur_tile].org_s_scale >> 6;
	134	t_scale = rdp.tiles[rdp.cur_tile].org_t_scale >> 6;
	135	}
	136	TransformVector (v->vec, vec, rdp.model);
	137	// TransformVector (v->vec, vec, rdp.combined);
	138	NormalizeVector (vec);
	139	float x, y;
	140	if (!rdp.use_lookat)
	141	{
	142	x = vec[0];
	143	y = vec[1];
	144	}
	145	else
	146	{
	147	x = DotProduct (rdp.lookat[0], vec);
	148	y = DotProduct (rdp.lookat[1], vec);
	149	}
	150	v->ou = (x * 0.5f + 0.5f) * s_scale;
	151	v->ov = (y * 0.5f + 0.5f) * t_scale;
	152	v->uv_scaled = 1;
	153	#ifdef EXTREME_LOGGING
	154	FRDP ("calc sphere u: %f, v: %f\n", v->ou, v->ov);
	155	#endif
	156	}
	157
	158	float DotProductC(register float *v1, register float *v2)
	159	{
	160	register float result;
	161	result = v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
	162	return(result);
	163	}
	164
	165	void NormalizeVectorC(float *v)
	166	{
	167	register float len;
	168	len = sqrtf(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
	169	if (len > 0.0f)
	170	{
	171	v[0] /= len;
	172	v[1] /= len;
	173	v[2] /= len;
	174	}
	175	}
	176
	177	void TransformVectorC(float *src, float *dst, float mat[4][4])
	178	{
	179	dst[0] = mat[0][0]*src[0] + mat[1][0]*src[1] + mat[2][0]*src[2];
	180	dst[1] = mat[0][1]*src[0] + mat[1][1]*src[1] + mat[2][1]*src[2];
	181	dst[2] = mat[0][2]*src[0] + mat[1][2]*src[1] + mat[2][2]*src[2];
	182	}
	183
	184	void InverseTransformVectorC (float *src, float *dst, float mat[4][4])
	185	{
	186	dst[0] = mat[0][0]*src[0] + mat[0][1]*src[1] + mat[0][2]*src[2];
	187	dst[1] = mat[1][0]*src[0] + mat[1][1]*src[1] + mat[1][2]*src[2];
	188	dst[2] = mat[2][0]*src[0] + mat[2][1]*src[1] + mat[2][2]*src[2];
	189	}
	190
	191	/*
	192	void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
	193	{
	194	for (int i=0; i<4; i++)
	195	{
	196	for (int j=0; j<4; j++)
	197	{
	198	r[i][j] = m1[i][0] * m2[0][j] +
	199	m1[i][1] * m2[1][j] +
	200	m1[i][2] * m2[2][j] +
	201	m1[i][3] * m2[3][j];
	202	}
	203	}
	204	}
	205	*/
	206	void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
	207	{
	208	for (int j=0; j<4; j++)
	209	{
	210	r[0][j] = m1[0][0] * m2[0][j] +
	211	m1[0][1] * m2[1][j] +
	212	m1[0][2] * m2[2][j] +
	213	m1[0][3] * m2[3][j];
	214	r[1][j] = m1[1][0] * m2[0][j] +
	215	m1[1][1] * m2[1][j] +
	216	m1[1][2] * m2[2][j] +
	217	m1[1][3] * m2[3][j];
	218	r[2][j] = m1[2][0] * m2[0][j] +
	219	m1[2][1] * m2[1][j] +
	220	m1[2][2] * m2[2][j] +
	221	m1[2][3] * m2[3][j];
	222	r[3][j] = m1[3][0] * m2[0][j] +
	223	m1[3][1] * m2[1][j] +
	224	m1[3][2] * m2[2][j] +
	225	m1[3][3] * m2[3][j];
	226	}
	227	}
	228
	229	#ifdef __ARM_NEON__
	230	void MultMatrix_neon( float m0[4][4], float m1[4][4], float dest[4][4])
	231	{
	232	asm volatile (
	233	"vld1.32 {d0, d1}, [%1]! \n\t" //q0 = m1
	234	"vld1.32 {d2, d3}, [%1]! \n\t" //q1 = m1+4
	235	"vld1.32 {d4, d5}, [%1]! \n\t" //q2 = m1+8
	236	"vld1.32 {d6, d7}, [%1] \n\t" //q3 = m1+12
	237	"vld1.32 {d16, d17}, [%0]! \n\t" //q8 = m0
	238	"vld1.32 {d18, d19}, [%0]! \n\t" //q9 = m0+4
	239	"vld1.32 {d20, d21}, [%0]! \n\t" //q10 = m0+8
	240	"vld1.32 {d22, d23}, [%0] \n\t" //q11 = m0+12
	241
	242	"vmul.f32 q12, q8, d0[0] \n\t" //q12 = q8 * d0[0]
	243	"vmul.f32 q13, q8, d2[0] \n\t" //q13 = q8 * d2[0]
	244	"vmul.f32 q14, q8, d4[0] \n\t" //q14 = q8 * d4[0]
	245	"vmul.f32 q15, q8, d6[0] \n\t" //q15 = q8 * d6[0]
	246	"vmla.f32 q12, q9, d0[1] \n\t" //q12 = q9 * d0[1]
	247	"vmla.f32 q13, q9, d2[1] \n\t" //q13 = q9 * d2[1]
	248	"vmla.f32 q14, q9, d4[1] \n\t" //q14 = q9 * d4[1]
	249	"vmla.f32 q15, q9, d6[1] \n\t" //q15 = q9 * d6[1]
	250	"vmla.f32 q12, q10, d1[0] \n\t" //q12 = q10 * d0[0]
	251	"vmla.f32 q13, q10, d3[0] \n\t" //q13 = q10 * d2[0]
	252	"vmla.f32 q14, q10, d5[0] \n\t" //q14 = q10 * d4[0]
	253	"vmla.f32 q15, q10, d7[0] \n\t" //q15 = q10 * d6[0]
	254	"vmla.f32 q12, q11, d1[1] \n\t" //q12 = q11 * d0[1]
	255	"vmla.f32 q13, q11, d3[1] \n\t" //q13 = q11 * d2[1]
	256	"vmla.f32 q14, q11, d5[1] \n\t" //q14 = q11 * d4[1]
	257	"vmla.f32 q15, q11, d7[1] \n\t" //q15 = q11 * d6[1]
	258
	259	"vst1.32 {d24, d25}, [%2]! \n\t" //d = q12
	260	"vst1.32 {d26, d27}, [%2]! \n\t" //d+4 = q13
	261	"vst1.32 {d28, d29}, [%2]! \n\t" //d+8 = q14
	262	"vst1.32 {d30, d31}, [%2] \n\t" //d+12 = q15
	263
	264	:"+r"(m1), "+r"(m0), "+r"(dest):
	265	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
	266	"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
	267	"d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
	268	"memory"
	269	);
	270	}
	271
	272	void Normalize_neon(float v[3])
	273	{
	274	asm volatile (
	275	"vld1.32 {d4}, [%0]! \n\t" //d4={x,y}
	276	"flds s10, [%0] \n\t" //d5[0] = z
	277	"sub %0, %0, #8 \n\t" //d5[0] = z
	278	"vmul.f32 d0, d4, d4 \n\t" //d0= d4*d4
	279	"vpadd.f32 d0, d0, d0 \n\t" //d0 = d[0] + d[1]
	280	"vmla.f32 d0, d5, d5 \n\t" //d0 = d0 + d5*d5
	281
	282	"vmov.f32 d1, d0 \n\t" //d1 = d0
	283	"vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0)
	284	"vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
	285	"vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
	286	"vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3
	287	"vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
	288	"vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d3) / 2
	289	"vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4
	290
	291	"vmul.f32 q2, q2, d0[0] \n\t" //d0= d2*d4
	292	"vst1.32 {d4}, [%0]! \n\t" //d2={x0,y0}, d3={z0, w0}
	293	"fsts s10, [%0] \n\t" //d2={x0,y0}, d3={z0, w0}
	294
	295	:"+r"(v) :
	296	: "d0", "d1", "d2", "d3", "d4", "d5", "memory"
	297	);
	298	}
	299
	300	float DotProduct_neon( float v0[3], float v1[3] )
	301	{
	302	float dot;
	303	asm volatile (
	304	"vld1.32 {d8}, [%1]! \n\t" //d8={x0,y0}
	305	"vld1.32 {d10}, [%2]! \n\t" //d10={x1,y1}
	306	"flds s18, [%1, #0] \n\t" //d9[0]={z0}
	307	"flds s22, [%2, #0] \n\t" //d11[0]={z1}
	308	"vmul.f32 d12, d8, d10 \n\t" //d0= d2*d4
	309	"vpadd.f32 d12, d12, d12 \n\t" //d0 = d[0] + d[1]
	310	"vmla.f32 d12, d9, d11 \n\t" //d0 = d0 + d3*d5
	311	"fmrs %0, s24 \n\t" //r0 = s0
	312	: "=r"(dot), "+r"(v0), "+r"(v1):
	313	: "d8", "d9", "d10", "d11", "d12"
	314
	315	);
	316	return dot;
	317	}
	318
	319	#endif
	320
	321	// 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication
	322	// and 3DNOW! 4x4 4x4 matrix multiplication
	323	// 2011-01-03 Balrog - removed because is in NASM format and not 64-bit compatible
	324	// This will need fixing.
	325	#ifndef __ARM_NEON__
	326	MULMATRIX MulMatrices = MulMatricesC;
	327	TRANSFORMVECTOR TransformVector = TransformVectorC;
	328	TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
	329	DOTPRODUCT DotProduct = DotProductC;
	330	NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;
	331	#endif
	332
	333	void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
	334	{
	335	#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
	336	/* [row][col]*/
	337	typedef float v4sf __attribute__ ((vector_size (16)));
	338	v4sf row0 = _mm_loadu_ps(m2[0]);
	339	v4sf row1 = _mm_loadu_ps(m2[1]);
	340	v4sf row2 = _mm_loadu_ps(m2[2]);
	341	v4sf row3 = _mm_loadu_ps(m2[3]);
	342
	343	for (int i = 0; i < 4; ++i)
	344	{
	345	v4sf leftrow = _mm_loadu_ps(m1[i]);
	346
	347	// Fill tmp with four copies of leftrow[0]
	348	v4sf tmp = leftrow;
	349	tmp = _mm_shuffle_ps (tmp, tmp, 0);
	350	// Calculate the four first summands
	351	v4sf destrow = tmp * row0;
	352
	353	// Fill tmp with four copies of leftrow[1]
	354	tmp = leftrow;
	355	tmp = _mm_shuffle_ps (tmp, tmp, 1 + (1 << 2) + (1 << 4) + (1 << 6));
	356	destrow += tmp * row1;
	357
	358	// Fill tmp with four copies of leftrow[2]
	359	tmp = leftrow;
	360	tmp = _mm_shuffle_ps (tmp, tmp, 2 + (2 << 2) + (2 << 4) + (2 << 6));
	361	destrow += tmp * row2;
	362
	363	// Fill tmp with four copies of leftrow[3]
	364	tmp = leftrow;
	365	tmp = _mm_shuffle_ps (tmp, tmp, 3 + (3 << 2) + (3 << 4) + (3 << 6));
	366	destrow += tmp * row3;
	367
	368	__builtin_ia32_storeups(r[i], destrow);
	369	}
	370	#elif !defined(NO_ASM) && !defined(NOSSE)
	371	__asm
	372	{
	373	mov eax, dword ptr [r]
	374	mov ecx, dword ptr [m1]
	375	mov edx, dword ptr [m2]
	376
	377	movaps xmm0,[edx]
	378	movaps xmm1,[edx+16]
	379	movaps xmm2,[edx+32]
	380	movaps xmm3,[edx+48]
	381
	382	// r[0][0],r[0][1],r[0][2],r[0][3]
	383
	384	movaps xmm4,xmmword ptr[ecx]
	385	movaps xmm5,xmm4
	386	movaps xmm6,xmm4
	387	movaps xmm7,xmm4
	388
	389	shufps xmm4,xmm4,00000000b
	390	shufps xmm5,xmm5,01010101b
	391	shufps xmm6,xmm6,10101010b
	392	shufps xmm7,xmm7,11111111b
	393
	394	mulps xmm4,xmm0
	395	mulps xmm5,xmm1
	396	mulps xmm6,xmm2
	397	mulps xmm7,xmm3
	398
	399	addps xmm4,xmm5
	400	addps xmm4,xmm6
	401	addps xmm4,xmm7
	402
	403	movaps xmmword ptr[eax],xmm4
	404
	405	// r[1][0],r[1][1],r[1][2],r[1][3]
	406
	407	movaps xmm4,xmmword ptr[ecx+16]
	408	movaps xmm5,xmm4
	409	movaps xmm6,xmm4
	410	movaps xmm7,xmm4
	411
	412	shufps xmm4,xmm4,00000000b
	413	shufps xmm5,xmm5,01010101b
	414	shufps xmm6,xmm6,10101010b
	415	shufps xmm7,xmm7,11111111b
	416
	417	mulps xmm4,xmm0
	418	mulps xmm5,xmm1
	419	mulps xmm6,xmm2
	420	mulps xmm7,xmm3
	421
	422	addps xmm4,xmm5
	423	addps xmm4,xmm6
	424	addps xmm4,xmm7
	425
	426	movaps xmmword ptr[eax+16],xmm4
	427
	428
	429	// r[2][0],r[2][1],r[2][2],r[2][3]
	430
	431	movaps xmm4,xmmword ptr[ecx+32]
	432	movaps xmm5,xmm4
	433	movaps xmm6,xmm4
	434	movaps xmm7,xmm4
	435
	436	shufps xmm4,xmm4,00000000b
	437	shufps xmm5,xmm5,01010101b
	438	shufps xmm6,xmm6,10101010b
	439	shufps xmm7,xmm7,11111111b
	440
	441	mulps xmm4,xmm0
	442	mulps xmm5,xmm1
	443	mulps xmm6,xmm2
	444	mulps xmm7,xmm3
	445
	446	addps xmm4,xmm5
	447	addps xmm4,xmm6
	448	addps xmm4,xmm7
	449
	450	movaps xmmword ptr[eax+32],xmm4
	451
	452	// r[3][0],r[3][1],r[3][2],r[3][3]
	453
	454	movaps xmm4,xmmword ptr[ecx+48]
	455	movaps xmm5,xmm4
	456	movaps xmm6,xmm4
	457	movaps xmm7,xmm4
	458
	459	shufps xmm4,xmm4,00000000b
	460	shufps xmm5,xmm5,01010101b
	461	shufps xmm6,xmm6,10101010b
	462	shufps xmm7,xmm7,11111111b
	463
	464	mulps xmm4,xmm0
	465	mulps xmm5,xmm1
	466	mulps xmm6,xmm2
	467	mulps xmm7,xmm3
	468
	469	addps xmm4,xmm5
	470	addps xmm4,xmm6
	471	addps xmm4,xmm7
	472
	473	movaps xmmword ptr[eax+48],xmm4
	474	}
	475	#endif // _WIN32
	476	}
	477
	478
	479
	480	void math_init()
	481	{
	482	#ifndef __ARM_NEON__
	483	#ifndef _DEBUG
	484	int IsSSE = FALSE;
	485	#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
	486	int edx, eax;
	487	GLIDE64_TRY
	488	{
	489	#if defined(__x86_64__)
	490	asm volatile(" cpuid; "
	491	: "=a"(eax), "=d"(edx)
	492	: "0"(1)
	493	: "rbx", "rcx"
	494	);
	495	#else
	496	asm volatile(" push %%ebx; "
	497	" push %%ecx; "
	498	" cpuid; "
	499	" pop %%ecx; "
	500	" pop %%ebx; "
	501	: "=a"(eax), "=d"(edx)
	502	: "0"(1)
	503	:
	504	);
	505	#endif
	506	}
	507	GLIDE64_CATCH
	508	{ return; }
	509	// Check for SSE
	510	if (edx & (1 << 25))
	511	IsSSE = TRUE;
	512	#elif !defined(NO_ASM) && !defined(NOSSE)
	513	DWORD dwEdx;
	514	__try
	515	{
	516	__asm
	517	{
	518	mov eax,1
	519	cpuid
	520	mov dwEdx,edx
	521	}
	522	}
	523	__except(EXCEPTION_EXECUTE_HANDLER)
	524	{
	525	return;
	526	}
	527
	528	if (dwEdx & (1<<25))
	529	{
	530	if (dwEdx & (1<<24))
	531	{
	532	__try
	533	{
	534	__asm xorps xmm0, xmm0
	535	IsSSE = TRUE;
	536	}
	537	__except(EXCEPTION_EXECUTE_HANDLER)
	538	{
	539	return;
	540	}
	541	}
	542	}
	543	#endif // _WIN32
	544	if (IsSSE)
	545	{
	546	MulMatrices = MulMatricesSSE;
	547	LOG("3DNOW! detected.\n");
	548	}
	549
	550	#endif //_DEBUG
	551	#endif //__ARM_NEON__
	552	}