source/gles2glide64/src/Glide64/3dmath.cpp

   1 /*
   2 * Glide64 - Glide video plugin for Nintendo 64 emulators.
   3 * Copyright (c) 2002  Dave2001
   4 * Copyright (c) 2003-2009  Sergey 'Gonetz' Lipski
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19 */
  20
  21 //****************************************************************
  22 //
  23 // Glide64 - Glide Plugin for Nintendo 64 emulators
  24 // Project started on December 29th, 2001
  25 //
  26 // Authors:
  27 // Dave2001, original author, founded the project in 2001, left it in 2002
  28 // Gugaman, joined the project in 2002, left it in 2002
  29 // Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
  30 // Hiroshi 'KoolSmoky' Morii, joined the project in 2007
  31 //
  32 //****************************************************************
  33 //
  34 // To modify Glide64:
  35 // * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
  36 // * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
  37 //
  38 //****************************************************************
  39
  40 #include "Gfx_1.3.h"
  41 extern "C" {
  42 #ifndef NOSSE
  43 #include <xmmintrin.h>
  44 #endif
  45 }
  46
  47 #include <math.h>
  48 #include "3dmath.h"
  49
  50 void calc_light (VERTEX *v)
  51 {
  52   float light_intensity = 0.0f;
  53   register float color[3] = {rdp.light[rdp.num_lights].r, rdp.light[rdp.num_lights].g, rdp.light[rdp.num_lights].b};
  54   for (wxUint32 l=0; l<rdp.num_lights; l++)
  55   {
  56     light_intensity = DotProduct (rdp.light_vector[l], v->vec);
  57
  58     if (light_intensity > 0.0f)
  59     {
  60       color[0] += rdp.light[l].r * light_intensity;
  61       color[1] += rdp.light[l].g * light_intensity;
  62       color[2] += rdp.light[l].b * light_intensity;
  63     }
  64   }
  65
  66   if (color[0] > 1.0f) color[0] = 1.0f;
  67   if (color[1] > 1.0f) color[1] = 1.0f;
  68   if (color[2] > 1.0f) color[2] = 1.0f;
  69
  70   v->r = (wxUint8)(color[0]*255.0f);
  71   v->g = (wxUint8)(color[1]*255.0f);
  72   v->b = (wxUint8)(color[2]*255.0f);
  73 }
  74
  75 //*
  76 void calc_linear (VERTEX *v)
  77 {
  78   if (settings.force_calc_sphere)
  79   {
  80     calc_sphere(v);
  81     return;
  82   }
  83   DECLAREALIGN16VAR(vec[3]);
  84
  85   TransformVector (v->vec, vec, rdp.model);
  86   //    TransformVector (v->vec, vec, rdp.combined);
  87   NormalizeVector (vec);
  88   float x, y;
  89   if (!rdp.use_lookat)
  90   {
  91     x = vec[0];
  92     y = vec[1];
  93   }
  94   else
  95   {
  96     x = DotProduct (rdp.lookat[0], vec);
  97     y = DotProduct (rdp.lookat[1], vec);
  98   }
  99
 100   if (x > 1.0f)
 101     x = 1.0f;
 102   else if (x < -1.0f)
 103     x = -1.0f;
 104   if (y > 1.0f)
 105     y = 1.0f;
 106   else if (y < -1.0f)
 107     y = -1.0f;
 108
 109   if (rdp.cur_cache[0])
 110   {
 111     // scale >> 6 is size to map to
 112     v->ou = (acosf(x)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_s_scale >> 6);
 113     v->ov = (acosf(y)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_t_scale >> 6);
 114   }
 115   v->uv_scaled = 1;
 116 #ifdef EXTREME_LOGGING
 117   FRDP ("calc linear u: %f, v: %f\n", v->ou, v->ov);
 118 #endif
 119 }
 120
 121 void calc_sphere (VERTEX *v)
 122 {
 123 //  LRDP("calc_sphere\n");
 124   DECLAREALIGN16VAR(vec[3]);
 125   int s_scale, t_scale;
 126   if (settings.hacks&hack_Chopper)
 127   {
 128     s_scale = min(rdp.tiles[rdp.cur_tile].org_s_scale >> 6, rdp.tiles[rdp.cur_tile].lr_s);
 129     t_scale = min(rdp.tiles[rdp.cur_tile].org_t_scale >> 6, rdp.tiles[rdp.cur_tile].lr_t);
 130   }
 131   else
 132   {
 133     s_scale = rdp.tiles[rdp.cur_tile].org_s_scale >> 6;
 134     t_scale = rdp.tiles[rdp.cur_tile].org_t_scale >> 6;
 135   }
 136   TransformVector (v->vec, vec, rdp.model);
 137   //    TransformVector (v->vec, vec, rdp.combined);
 138   NormalizeVector (vec);
 139   float x, y;
 140   if (!rdp.use_lookat)
 141   {
 142     x = vec[0];
 143     y = vec[1];
 144   }
 145   else
 146   {
 147     x = DotProduct (rdp.lookat[0], vec);
 148     y = DotProduct (rdp.lookat[1], vec);
 149   }
 150   v->ou = (x * 0.5f + 0.5f) * s_scale;
 151   v->ov = (y * 0.5f + 0.5f) * t_scale;
 152   v->uv_scaled = 1;
 153 #ifdef EXTREME_LOGGING
 154   FRDP ("calc sphere u: %f, v: %f\n", v->ou, v->ov);
 155 #endif
 156 }
 157
 158 float DotProductC(register float *v1, register float *v2)
 159 {
 160     register float result;
 161     result = v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
 162     return(result);
 163 }
 164
 165 void NormalizeVectorC(float *v)
 166 {
 167     register float len;
 168     len = sqrtf(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
 169     if (len > 0.0f)
 170     {
 171         v[0] /= len;
 172         v[1] /= len;
 173         v[2] /= len;
 174     }
 175 }
 176
 177 void TransformVectorC(float *src, float *dst, float mat[4][4])
 178 {
 179   dst[0] = mat[0][0]*src[0] + mat[1][0]*src[1] + mat[2][0]*src[2];
 180   dst[1] = mat[0][1]*src[0] + mat[1][1]*src[1] + mat[2][1]*src[2];
 181   dst[2] = mat[0][2]*src[0] + mat[1][2]*src[1] + mat[2][2]*src[2];
 182 }
 183
 184 void InverseTransformVectorC (float *src, float *dst, float mat[4][4])
 185 {
 186   dst[0] = mat[0][0]*src[0] + mat[0][1]*src[1] + mat[0][2]*src[2];
 187   dst[1] = mat[1][0]*src[0] + mat[1][1]*src[1] + mat[1][2]*src[2];
 188   dst[2] = mat[2][0]*src[0] + mat[2][1]*src[1] + mat[2][2]*src[2];
 189 }
 190
 191 /*
 192 void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
 193 {
 194   for (int i=0; i<4; i++)
 195   {
 196     for (int j=0; j<4; j++)
 197     {
 198       r[i][j] = m1[i][0] * m2[0][j] +
 199                 m1[i][1] * m2[1][j] +
 200                 m1[i][2] * m2[2][j] +
 201                 m1[i][3] * m2[3][j];
 202     }
 203   }
 204 }
 205 */
 206 void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
 207 {
 208   for (int j=0; j<4; j++)
 209   {
 210       r[0][j] = m1[0][0] * m2[0][j] +
 211                 m1[0][1] * m2[1][j] +
 212                 m1[0][2] * m2[2][j] +
 213                 m1[0][3] * m2[3][j];
 214       r[1][j] = m1[1][0] * m2[0][j] +
 215                 m1[1][1] * m2[1][j] +
 216                 m1[1][2] * m2[2][j] +
 217                 m1[1][3] * m2[3][j];
 218       r[2][j] = m1[2][0] * m2[0][j] +
 219                 m1[2][1] * m2[1][j] +
 220                 m1[2][2] * m2[2][j] +
 221                 m1[2][3] * m2[3][j];
 222       r[3][j] = m1[3][0] * m2[0][j] +
 223                 m1[3][1] * m2[1][j] +
 224                 m1[3][2] * m2[2][j] +
 225                 m1[3][3] * m2[3][j];
 226   }
 227 }
 228
 229 #ifdef __ARM_NEON__
// NEON 4x4 matrix product: dest = m0 * m1 (row-major, [row][col]).
//
// Operand numbering follows the constraint list at the bottom of the asm
// statement: %0 is m1, %1 is m0 and %2 is dest.  All eight loads are issued
// before the first store, so every input element is read before anything is
// written to dest.  The three pointers are read/write operands because the
// post-increment addressing modes advance them.
void MultMatrix_neon( float m0[4][4], float m1[4][4], float dest[4][4])
{
    asm volatile (
        "vld1.32                {d0, d1}, [%1]!                 \n\t"   //q0 = row 0 of m0 (%1)
        "vld1.32                {d2, d3}, [%1]!         \n\t"   //q1 = row 1 of m0
        "vld1.32                {d4, d5}, [%1]!         \n\t"   //q2 = row 2 of m0
        "vld1.32                {d6, d7}, [%1]          \n\t"   //q3 = row 3 of m0
        "vld1.32                {d16, d17}, [%0]!               \n\t"   //q8 = row 0 of m1 (%0)
        "vld1.32                {d18, d19}, [%0]!       \n\t"   //q9 = row 1 of m1
        "vld1.32                {d20, d21}, [%0]!       \n\t"   //q10 = row 2 of m1
        "vld1.32                {d22, d23}, [%0]        \n\t"   //q11 = row 3 of m1

        // Result row i (q12..q15) = sum over k of m0[i][k] * (row k of m1).
        "vmul.f32               q12, q8, d0[0]                  \n\t"   //q12  = q8 * m0[0][0]
        "vmul.f32               q13, q8, d2[0]              \n\t"       //q13  = q8 * m0[1][0]
        "vmul.f32               q14, q8, d4[0]              \n\t"       //q14  = q8 * m0[2][0]
        "vmul.f32               q15, q8, d6[0]                  \n\t"   //q15  = q8 * m0[3][0]
        "vmla.f32               q12, q9, d0[1]                  \n\t"   //q12 += q9 * m0[0][1]
        "vmla.f32               q13, q9, d2[1]              \n\t"       //q13 += q9 * m0[1][1]
        "vmla.f32               q14, q9, d4[1]              \n\t"       //q14 += q9 * m0[2][1]
        "vmla.f32               q15, q9, d6[1]              \n\t"       //q15 += q9 * m0[3][1]
        "vmla.f32               q12, q10, d1[0]                 \n\t"   //q12 += q10 * m0[0][2]
        "vmla.f32               q13, q10, d3[0]                 \n\t"   //q13 += q10 * m0[1][2]
        "vmla.f32               q14, q10, d5[0]                 \n\t"   //q14 += q10 * m0[2][2]
        "vmla.f32               q15, q10, d7[0]                 \n\t"   //q15 += q10 * m0[3][2]
        "vmla.f32               q12, q11, d1[1]                 \n\t"   //q12 += q11 * m0[0][3]
        "vmla.f32               q13, q11, d3[1]                 \n\t"   //q13 += q11 * m0[1][3]
        "vmla.f32               q14, q11, d5[1]                 \n\t"   //q14 += q11 * m0[2][3]
        "vmla.f32               q15, q11, d7[1]             \n\t"       //q15 += q11 * m0[3][3]

        "vst1.32                {d24, d25}, [%2]!               \n\t"   //dest row 0 = q12
        "vst1.32                {d26, d27}, [%2]!           \n\t"       //dest row 1 = q13
        "vst1.32                {d28, d29}, [%2]!           \n\t"       //dest row 2 = q14
        "vst1.32                {d30, d31}, [%2]            \n\t"       //dest row 3 = q15

        // %0 = m1, %1 = m0, %2 = dest; no input-only operands.
        :"+r"(m1), "+r"(m0), "+r"(dest):
    // Every D register touched above, plus "memory" because the asm reads and
    // writes through the pointers.
    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
    "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
    "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
    "memory"
        );
}
 271
// NEON in-place normalisation of the 3-component vector v.
//
// The squared length is accumulated in lane 0 of d0, a reciprocal square
// root estimate is taken with vrsqrte and refined by two vrsqrts steps, and
// x, y, z are multiplied by the refined value before being written back.
//
// NOTE(review): unlike NormalizeVectorC there is no guard for a zero-length
// vector here; the result for an all-zero input is presumably NaN - confirm
// whether callers can pass one.
void Normalize_neon(float v[3])
{
        asm volatile (
        "vld1.32                {d4}, [%0]!                     \n\t"   //d4 = {x, y}; %0 advances by 8 bytes
        "flds                   s10, [%0]               \n\t"   //s10 (d5[0]) = z
        "sub                    %0, %0, #8              \n\t"   //rewind %0 to the start of v
        "vmul.f32               d0, d4, d4                              \n\t"   //d0 = {x*x, y*y}
        "vpadd.f32              d0, d0, d0                              \n\t"   //d0 = x*x + y*y in both lanes
    "vmla.f32           d0, d5, d5                              \n\t"   //d0[0] += z*z (only lane 0 is used below)

        "vmov.f32               d1, d0                                  \n\t"   //d1 = squared length
        "vrsqrte.f32    d0, d0                                  \n\t"   //d0 = ~ 1.0 / sqrt(d0)
        "vmul.f32               d2, d0, d1                              \n\t"   //d2 = d0 * d1
        "vrsqrts.f32    d3, d2, d0                              \n\t"   //d3 = (3 - d2 * d0) / 2
        "vmul.f32               d0, d0, d3                              \n\t"   //d0 = d0 * d3 (first refinement)
        "vmul.f32               d2, d0, d1                              \n\t"   //d2 = d0 * d1
        "vrsqrts.f32    d3, d2, d0                              \n\t"   //d3 = (3 - d2 * d0) / 2
        "vmul.f32               d0, d0, d3                              \n\t"   //d0 = d0 * d3 (second refinement)

        "vmul.f32               q2, q2, d0[0]                   \n\t"   //q2 (d4, d5) = {x, y, z, -} * d0[0]
        "vst1.32                {d4}, [%0]!                     \n\t"   //store normalised x, y
        "fsts                   s10, [%0]                       \n\t"   //store normalised z

        // v is read/write because the post-increment addressing modifies it.
        :"+r"(v) :
    : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
        );
}
 299
 300 float DotProduct_neon( float v0[3], float v1[3] )
 301 {
 302     float dot;
 303         asm volatile (
 304         "vld1.32                {d8}, [%1]!                     \n\t"   //d8={x0,y0}
 305         "vld1.32                {d10}, [%2]!            \n\t"   //d10={x1,y1}
 306         "flds                   s18, [%1, #0]       \n\t"       //d9[0]={z0}
 307         "flds                   s22, [%2, #0]       \n\t"       //d11[0]={z1}
 308         "vmul.f32               d12, d8, d10            \n\t"   //d0= d2*d4
 309         "vpadd.f32              d12, d12, d12           \n\t"   //d0 = d[0] + d[1]
 310         "vmla.f32               d12, d9, d11            \n\t"   //d0 = d0 + d3*d5
 311     "fmrs               %0, s24                 \n\t"   //r0 = s0
 312         : "=r"(dot), "+r"(v0), "+r"(v1):
 313     : "d8", "d9", "d10", "d11", "d12"
 314
 315         );
 316     return dot;
 317 }
 318
 319 #endif
 320
 321 // 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication
 322 //                      and 3DNOW! 4x4 4x4 matrix multiplication
// 2011-01-03 Balrog - removed because it was in NASM format and not 64-bit compatible.
// This will need fixing.
#ifndef __ARM_NEON__
// Math entry points for builds without NEON.  Each pointer starts out bound
// to the portable C implementation defined in this file; math_init() may
// later rebind MulMatrices to MulMatricesSSE.
// NOTE(review): on NEON builds these names are presumably provided elsewhere
// (e.g. mapped to the *_neon routines in 3dmath.h) - confirm.
MULMATRIX MulMatrices = MulMatricesC;
TRANSFORMVECTOR TransformVector = TransformVectorC;
TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
DOTPRODUCT DotProduct = DotProductC;
NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;
#endif
 332
 333 void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
 334 {
 335 #if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
 336    /* [row][col]*/
 337   typedef float v4sf __attribute__ ((vector_size (16)));
 338   v4sf row0 = _mm_loadu_ps(m2[0]);
 339   v4sf row1 = _mm_loadu_ps(m2[1]);
 340   v4sf row2 = _mm_loadu_ps(m2[2]);
 341   v4sf row3 = _mm_loadu_ps(m2[3]);
 342
 343   for (int i = 0; i < 4; ++i)
 344   {
 345     v4sf leftrow = _mm_loadu_ps(m1[i]);
 346
 347     // Fill tmp with four copies of leftrow[0]
 348     v4sf tmp = leftrow;
 349     tmp = _mm_shuffle_ps (tmp, tmp, 0);
 350     // Calculate the four first summands
 351     v4sf destrow = tmp * row0;
 352
 353     // Fill tmp with four copies of leftrow[1]
 354     tmp = leftrow;
 355     tmp = _mm_shuffle_ps (tmp, tmp, 1 + (1 << 2) + (1 << 4) + (1 << 6));
 356     destrow += tmp * row1;
 357
 358     // Fill tmp with four copies of leftrow[2]
 359     tmp = leftrow;
 360     tmp = _mm_shuffle_ps (tmp, tmp, 2 + (2 << 2) + (2 << 4) + (2 << 6));
 361     destrow += tmp * row2;
 362
 363     // Fill tmp with four copies of leftrow[3]
 364     tmp = leftrow;
 365     tmp = _mm_shuffle_ps (tmp, tmp, 3 + (3 << 2) + (3 << 4) + (3 << 6));
 366     destrow += tmp * row3;
 367
 368     __builtin_ia32_storeups(r[i], destrow);
 369   }
 370  #elif !defined(NO_ASM) && !defined(NOSSE)
 371   __asm
 372   {
 373     mov     eax, dword ptr [r]
 374       mov     ecx, dword ptr [m1]
 375       mov     edx, dword ptr [m2]
 376
 377       movaps  xmm0,[edx]
 378       movaps  xmm1,[edx+16]
 379       movaps  xmm2,[edx+32]
 380       movaps  xmm3,[edx+48]
 381
 382 // r[0][0],r[0][1],r[0][2],r[0][3]
 383
 384       movaps  xmm4,xmmword ptr[ecx]
 385       movaps  xmm5,xmm4
 386       movaps  xmm6,xmm4
 387       movaps  xmm7,xmm4
 388
 389       shufps  xmm4,xmm4,00000000b
 390       shufps  xmm5,xmm5,01010101b
 391       shufps  xmm6,xmm6,10101010b
 392       shufps  xmm7,xmm7,11111111b
 393
 394       mulps   xmm4,xmm0
 395       mulps   xmm5,xmm1
 396       mulps   xmm6,xmm2
 397       mulps   xmm7,xmm3
 398
 399       addps   xmm4,xmm5
 400       addps   xmm4,xmm6
 401       addps   xmm4,xmm7
 402
 403       movaps  xmmword ptr[eax],xmm4
 404
 405 // r[1][0],r[1][1],r[1][2],r[1][3]
 406
 407       movaps  xmm4,xmmword ptr[ecx+16]
 408       movaps  xmm5,xmm4
 409       movaps  xmm6,xmm4
 410       movaps  xmm7,xmm4
 411
 412       shufps  xmm4,xmm4,00000000b
 413       shufps  xmm5,xmm5,01010101b
 414       shufps  xmm6,xmm6,10101010b
 415       shufps  xmm7,xmm7,11111111b
 416
 417       mulps   xmm4,xmm0
 418       mulps   xmm5,xmm1
 419       mulps   xmm6,xmm2
 420       mulps   xmm7,xmm3
 421
 422       addps   xmm4,xmm5
 423       addps   xmm4,xmm6
 424       addps   xmm4,xmm7
 425
 426       movaps  xmmword ptr[eax+16],xmm4
 427
 428
 429 // r[2][0],r[2][1],r[2][2],r[2][3]
 430
 431       movaps  xmm4,xmmword ptr[ecx+32]
 432       movaps  xmm5,xmm4
 433       movaps  xmm6,xmm4
 434       movaps  xmm7,xmm4
 435
 436       shufps  xmm4,xmm4,00000000b
 437       shufps  xmm5,xmm5,01010101b
 438       shufps  xmm6,xmm6,10101010b
 439       shufps  xmm7,xmm7,11111111b
 440
 441       mulps   xmm4,xmm0
 442       mulps   xmm5,xmm1
 443       mulps   xmm6,xmm2
 444       mulps   xmm7,xmm3
 445
 446       addps   xmm4,xmm5
 447       addps   xmm4,xmm6
 448       addps   xmm4,xmm7
 449
 450       movaps  xmmword ptr[eax+32],xmm4
 451
 452 // r[3][0],r[3][1],r[3][2],r[3][3]
 453
 454       movaps  xmm4,xmmword ptr[ecx+48]
 455       movaps  xmm5,xmm4
 456       movaps  xmm6,xmm4
 457       movaps  xmm7,xmm4
 458
 459       shufps  xmm4,xmm4,00000000b
 460       shufps  xmm5,xmm5,01010101b
 461       shufps  xmm6,xmm6,10101010b
 462       shufps  xmm7,xmm7,11111111b
 463
 464       mulps   xmm4,xmm0
 465       mulps   xmm5,xmm1
 466       mulps   xmm6,xmm2
 467       mulps   xmm7,xmm3
 468
 469       addps   xmm4,xmm5
 470       addps   xmm4,xmm6
 471       addps   xmm4,xmm7
 472
 473       movaps  xmmword ptr[eax+48],xmm4
 474     }
 475 #endif // _WIN32
 476   }
 477
 478
 479
 480   void math_init()
 481   {
 482 #ifndef __ARM_NEON__
 483 #ifndef _DEBUG
 484     int IsSSE = FALSE;
 485 #if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
 486     int edx, eax;
 487     GLIDE64_TRY
 488     {
 489   #if defined(__x86_64__)
 490       asm volatile(" cpuid;        "
 491         : "=a"(eax), "=d"(edx)
 492         : "0"(1)
 493         : "rbx", "rcx"
 494         );
 495   #else
 496       asm volatile(" push %%ebx;   "
 497         " push %%ecx;   "
 498         " cpuid;        "
 499         " pop %%ecx;    "
 500         " pop %%ebx;    "
 501         : "=a"(eax), "=d"(edx)
 502         : "0"(1)
 503         :
 504       );
 505   #endif
 506     }
 507     GLIDE64_CATCH
 508       { return; }
 509     // Check for SSE
 510     if (edx & (1 << 25))
 511       IsSSE = TRUE;
 512 #elif !defined(NO_ASM) && !defined(NOSSE)
 513     DWORD dwEdx;
 514     __try
 515     {
 516       __asm
 517       {
 518         mov  eax,1
 519           cpuid
 520           mov dwEdx,edx
 521         }
 522       }
 523       __except(EXCEPTION_EXECUTE_HANDLER)
 524       {
 525         return;
 526       }
 527
 528       if (dwEdx & (1<<25))
 529       {
 530         if (dwEdx & (1<<24))
 531         {
 532           __try
 533           {
 534             __asm xorps xmm0, xmm0
 535               IsSSE = TRUE;
 536           }
 537           __except(EXCEPTION_EXECUTE_HANDLER)
 538           {
 539             return;
 540           }
 541         }
 542       }
 543 #endif // _WIN32
 544       if (IsSSE)
 545       {
 546         MulMatrices = MulMatricesSSE;
 547         LOG("3DNOW! detected.\n");
 548       }
 549
 550 #endif //_DEBUG
 551 #endif  //__ARM_NEON__
 552     }