Glide Plugin GLES2 port from mupen64plus-ae, but with special FrameSkip code
[mupen64plus-pandora.git] / source / gles2glide64 / src / Glide64 / 3dmath.cpp
CommitLineData
98e75f2d 1/*
2* Glide64 - Glide video plugin for Nintendo 64 emulators.
3* Copyright (c) 2002 Dave2001
4* Copyright (c) 2003-2009 Sergey 'Gonetz' Lipski
5*
6* This program is free software; you can redistribute it and/or modify
7* it under the terms of the GNU General Public License as published by
8* the Free Software Foundation; either version 2 of the License, or
9* any later version.
10*
11* This program is distributed in the hope that it will be useful,
12* but WITHOUT ANY WARRANTY; without even the implied warranty of
13* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14* GNU General Public License for more details.
15*
16* You should have received a copy of the GNU General Public License
17* along with this program; if not, write to the Free Software
18* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19*/
20
21//****************************************************************
22//
23// Glide64 - Glide Plugin for Nintendo 64 emulators
24// Project started on December 29th, 2001
25//
26// Authors:
27// Dave2001, original author, founded the project in 2001, left it in 2002
28// Gugaman, joined the project in 2002, left it in 2002
29// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
30// Hiroshi 'KoolSmoky' Morii, joined the project in 2007
31//
32//****************************************************************
33//
34// To modify Glide64:
35// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
36// * Do NOT send me the whole project or file that you modified. Take out your modified code sections, and tell me where to put them. If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
37//
38//****************************************************************
39
40#include "Gfx_1.3.h"
41extern "C" {
42#ifndef NOSSE
43#include <xmmintrin.h>
44#endif
45}
46
47#include <math.h>
48#include "3dmath.h"
49
50void calc_light (VERTEX *v)
51{
52 float light_intensity = 0.0f;
53 register float color[3] = {rdp.light[rdp.num_lights].r, rdp.light[rdp.num_lights].g, rdp.light[rdp.num_lights].b};
54 for (wxUint32 l=0; l<rdp.num_lights; l++)
55 {
56 light_intensity = DotProduct (rdp.light_vector[l], v->vec);
57
58 if (light_intensity > 0.0f)
59 {
60 color[0] += rdp.light[l].r * light_intensity;
61 color[1] += rdp.light[l].g * light_intensity;
62 color[2] += rdp.light[l].b * light_intensity;
63 }
64 }
65
66 if (color[0] > 1.0f) color[0] = 1.0f;
67 if (color[1] > 1.0f) color[1] = 1.0f;
68 if (color[2] > 1.0f) color[2] = 1.0f;
69
70 v->r = (wxUint8)(color[0]*255.0f);
71 v->g = (wxUint8)(color[1]*255.0f);
72 v->b = (wxUint8)(color[2]*255.0f);
73}
74
75//*
76void calc_linear (VERTEX *v)
77{
78 if (settings.force_calc_sphere)
79 {
80 calc_sphere(v);
81 return;
82 }
83 DECLAREALIGN16VAR(vec[3]);
84
85 TransformVector (v->vec, vec, rdp.model);
86 // TransformVector (v->vec, vec, rdp.combined);
87 NormalizeVector (vec);
88 float x, y;
89 if (!rdp.use_lookat)
90 {
91 x = vec[0];
92 y = vec[1];
93 }
94 else
95 {
96 x = DotProduct (rdp.lookat[0], vec);
97 y = DotProduct (rdp.lookat[1], vec);
98 }
99
100 if (x > 1.0f)
101 x = 1.0f;
102 else if (x < -1.0f)
103 x = -1.0f;
104 if (y > 1.0f)
105 y = 1.0f;
106 else if (y < -1.0f)
107 y = -1.0f;
108
109 if (rdp.cur_cache[0])
110 {
111 // scale >> 6 is size to map to
112 v->ou = (acosf(x)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_s_scale >> 6);
113 v->ov = (acosf(y)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_t_scale >> 6);
114 }
115 v->uv_scaled = 1;
116#ifdef EXTREME_LOGGING
117 FRDP ("calc linear u: %f, v: %f\n", v->ou, v->ov);
118#endif
119}
120
121void calc_sphere (VERTEX *v)
122{
123// LRDP("calc_sphere\n");
124 DECLAREALIGN16VAR(vec[3]);
125 int s_scale, t_scale;
126 if (settings.hacks&hack_Chopper)
127 {
128 s_scale = min(rdp.tiles[rdp.cur_tile].org_s_scale >> 6, rdp.tiles[rdp.cur_tile].lr_s);
129 t_scale = min(rdp.tiles[rdp.cur_tile].org_t_scale >> 6, rdp.tiles[rdp.cur_tile].lr_t);
130 }
131 else
132 {
133 s_scale = rdp.tiles[rdp.cur_tile].org_s_scale >> 6;
134 t_scale = rdp.tiles[rdp.cur_tile].org_t_scale >> 6;
135 }
136 TransformVector (v->vec, vec, rdp.model);
137 // TransformVector (v->vec, vec, rdp.combined);
138 NormalizeVector (vec);
139 float x, y;
140 if (!rdp.use_lookat)
141 {
142 x = vec[0];
143 y = vec[1];
144 }
145 else
146 {
147 x = DotProduct (rdp.lookat[0], vec);
148 y = DotProduct (rdp.lookat[1], vec);
149 }
150 v->ou = (x * 0.5f + 0.5f) * s_scale;
151 v->ov = (y * 0.5f + 0.5f) * t_scale;
152 v->uv_scaled = 1;
153#ifdef EXTREME_LOGGING
154 FRDP ("calc sphere u: %f, v: %f\n", v->ou, v->ov);
155#endif
156}
157
// Portable 3-component dot product.
//   v1, v2 - pointers to at least three floats each (only [0..2] are read).
// Returns v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2].
// The obsolete 'register' specifiers were dropped: the keyword was removed in
// C++17 and has no effect on the function's type.
float DotProductC(float *v1, float *v2)
{
  return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
}
164
// Portable in-place normalization of a 3-component vector.
//   v - pointer to at least three floats; v[0..2] are divided by the
//       Euclidean length of the vector.
// A vector whose length is not greater than zero is left untouched, so a zero
// vector never causes a division by zero.
// The obsolete 'register' specifier was dropped (removed in C++17).
void NormalizeVectorC(float *v)
{
  const float len = sqrtf(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
  if (len > 0.0f)
  {
    v[0] /= len;
    v[1] /= len;
    v[2] /= len;
  }
}
176
// Multiplies the row vector src (3 floats) by the upper-left 3x3 part of mat:
//   dst[c] = mat[0][c]*src[0] + mat[1][c]*src[1] + mat[2][c]*src[2]
// The fourth row and column of mat are not used.  Components are written in
// the order 0, 1, 2.
void TransformVectorC(float *src, float *dst, float mat[4][4])
{
  for (int col = 0; col < 3; ++col)
  {
    dst[col] = mat[0][col]*src[0] + mat[1][col]*src[1] + mat[2][col]*src[2];
  }
}
183
// Multiplies src (3 floats) by the transpose of the upper-left 3x3 part of
// mat, i.e. each output component is the dot product of a matrix row with src:
//   dst[r] = mat[r][0]*src[0] + mat[r][1]*src[1] + mat[r][2]*src[2]
// The fourth row and column of mat are not used.  Components are written in
// the order 0, 1, 2.
void InverseTransformVectorC (float *src, float *dst, float mat[4][4])
{
  for (int row = 0; row < 3; ++row)
  {
    dst[row] = mat[row][0]*src[0] + mat[row][1]*src[1] + mat[row][2]*src[2];
  }
}
190
191/*
192void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
193{
194 for (int i=0; i<4; i++)
195 {
196 for (int j=0; j<4; j++)
197 {
198 r[i][j] = m1[i][0] * m2[0][j] +
199 m1[i][1] * m2[1][j] +
200 m1[i][2] * m2[2][j] +
201 m1[i][3] * m2[3][j];
202 }
203 }
204}
205*/
// Portable 4x4 matrix product: r = m1 * m2 with [row][col] indexing, i.e.
//   r[i][j] = m1[i][0]*m2[0][j] + m1[i][1]*m2[1][j]
//           + m1[i][2]*m2[2][j] + m1[i][3]*m2[3][j]
// The result is produced one column at a time (column index outermost).
void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
{
  for (int col = 0; col < 4; ++col)
  {
    for (int row = 0; row < 4; ++row)
    {
      const float *lhs = m1[row];
      r[row][col] = lhs[0] * m2[0][col] +
                    lhs[1] * m2[1][col] +
                    lhs[2] * m2[2][col] +
                    lhs[3] * m2[3][col];
    }
  }
}
228
229#ifdef __ARM_NEON__
// NEON 4x4 matrix product: dest = m0 * m1 with [row][col] indexing, i.e.
// dest[i][j] = sum over k of m0[i][k] * m1[k][j].
//
// Operand numbering follows the constraint list at the bottom of the asm
// statement: %0 is m1, %1 is m0 and %2 is dest.  All three pointers are
// declared read-write ("+r") because the post-incrementing loads/stores
// advance them.  Both inputs are fully loaded into registers before the first
// store to dest.
void MultMatrix_neon( float m0[4][4], float m1[4][4], float dest[4][4])
{
 asm volatile (
 "vld1.32 {d0, d1}, [%1]! \n\t" //q0 = m0 row 0 (%1 is m0)
 "vld1.32 {d2, d3}, [%1]! \n\t" //q1 = m0 row 1
 "vld1.32 {d4, d5}, [%1]! \n\t" //q2 = m0 row 2
 "vld1.32 {d6, d7}, [%1] \n\t" //q3 = m0 row 3
 "vld1.32 {d16, d17}, [%0]! \n\t" //q8 = m1 row 0 (%0 is m1)
 "vld1.32 {d18, d19}, [%0]! \n\t" //q9 = m1 row 1
 "vld1.32 {d20, d21}, [%0]! \n\t" //q10 = m1 row 2
 "vld1.32 {d22, d23}, [%0] \n\t" //q11 = m1 row 3

 "vmul.f32 q12, q8, d0[0] \n\t" //q12 = q8 * m0[0][0]
 "vmul.f32 q13, q8, d2[0] \n\t" //q13 = q8 * m0[1][0]
 "vmul.f32 q14, q8, d4[0] \n\t" //q14 = q8 * m0[2][0]
 "vmul.f32 q15, q8, d6[0] \n\t" //q15 = q8 * m0[3][0]
 "vmla.f32 q12, q9, d0[1] \n\t" //q12 += q9 * m0[0][1]
 "vmla.f32 q13, q9, d2[1] \n\t" //q13 += q9 * m0[1][1]
 "vmla.f32 q14, q9, d4[1] \n\t" //q14 += q9 * m0[2][1]
 "vmla.f32 q15, q9, d6[1] \n\t" //q15 += q9 * m0[3][1]
 "vmla.f32 q12, q10, d1[0] \n\t" //q12 += q10 * m0[0][2]
 "vmla.f32 q13, q10, d3[0] \n\t" //q13 += q10 * m0[1][2]
 "vmla.f32 q14, q10, d5[0] \n\t" //q14 += q10 * m0[2][2]
 "vmla.f32 q15, q10, d7[0] \n\t" //q15 += q10 * m0[3][2]
 "vmla.f32 q12, q11, d1[1] \n\t" //q12 += q11 * m0[0][3]
 "vmla.f32 q13, q11, d3[1] \n\t" //q13 += q11 * m0[1][3]
 "vmla.f32 q14, q11, d5[1] \n\t" //q14 += q11 * m0[2][3]
 "vmla.f32 q15, q11, d7[1] \n\t" //q15 += q11 * m0[3][3]

 "vst1.32 {d24, d25}, [%2]! \n\t" //dest row 0 = q12
 "vst1.32 {d26, d27}, [%2]! \n\t" //dest row 1 = q13
 "vst1.32 {d28, d29}, [%2]! \n\t" //dest row 2 = q14
 "vst1.32 {d30, d31}, [%2] \n\t" //dest row 3 = q15

 // outputs (read-write pointers), no inputs, then the clobber list
 :"+r"(m1), "+r"(m0), "+r"(dest):
 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
 "memory"
 );
}
271
// NEON in-place normalization of a 3-component vector: v[0..2] are multiplied
// by an approximation of 1/sqrt(x*x + y*y + z*z), obtained from the VRSQRTE
// estimate refined by two VRSQRTS steps.
//
// NOTE(review): unlike NormalizeVectorC there is no zero-length guard here; a
// zero vector looks like it would yield non-finite components - confirm that
// callers never pass one.
void Normalize_neon(float v[3])
{
 asm volatile (
 "vld1.32 {d4}, [%0]! \n\t" //d4 = {x, y}; %0 now points at z
 "flds s10, [%0] \n\t" //s10 (d5[0]) = z
 "sub %0, %0, #8 \n\t" //rewind %0 to the start of v
 "vmul.f32 d0, d4, d4 \n\t" //d0 = {x*x, y*y}
 "vpadd.f32 d0, d0, d0 \n\t" //d0 = x*x + y*y (in both lanes)
 "vmla.f32 d0, d5, d5 \n\t" //d0[0] += z*z (only lane 0 of d5 was loaded)

 "vmov.f32 d1, d0 \n\t" //d1 = squared length
 "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0)
 "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
 "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d2 * d0) / 2
 "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 (first refinement)
 "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
 "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d2 * d0) / 2
 "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 (second refinement)

 "vmul.f32 q2, q2, d0[0] \n\t" //q2 (d4, d5) scaled by d0[0]
 "vst1.32 {d4}, [%0]! \n\t" //store normalized x, y
 "fsts s10, [%0] \n\t" //store normalized z

 // v is a read-write operand because the loads/stores move the pointer
 :"+r"(v) :
 : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
 );
}
299
300float DotProduct_neon( float v0[3], float v1[3] )
301{
302 float dot;
303 asm volatile (
304 "vld1.32 {d8}, [%1]! \n\t" //d8={x0,y0}
305 "vld1.32 {d10}, [%2]! \n\t" //d10={x1,y1}
306 "flds s18, [%1, #0] \n\t" //d9[0]={z0}
307 "flds s22, [%2, #0] \n\t" //d11[0]={z1}
308 "vmul.f32 d12, d8, d10 \n\t" //d0= d2*d4
309 "vpadd.f32 d12, d12, d12 \n\t" //d0 = d[0] + d[1]
310 "vmla.f32 d12, d9, d11 \n\t" //d0 = d0 + d3*d5
311 "fmrs %0, s24 \n\t" //r0 = s0
312 : "=r"(dot), "+r"(v0), "+r"(v1):
313 : "d8", "d9", "d10", "d11", "d12"
314
315 );
316 return dot;
317}
318
319#endif
320
321// 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication
322// and 3DNOW! 4x4 4x4 matrix multiplication
// 2011-01-03 Balrog - removed because it is in NASM format and not 64-bit compatible.
// This will need fixing.
#ifndef __ARM_NEON__
// Math entry points used by the rest of this file, defined here only when not
// building for NEON.  Each one starts out bound to the portable C
// implementation above; math_init() may later rebind MulMatrices to
// MulMatricesSSE.
// NOTE(review): for __ARM_NEON__ builds these names are presumably provided
// by 3dmath.h (mapped to the *_neon routines) - confirm.
MULMATRIX MulMatrices = MulMatricesC;
TRANSFORMVECTOR TransformVector = TransformVectorC;
TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
DOTPRODUCT DotProduct = DotProductC;
NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;
#endif
332
// SSE 4x4 matrix product: r = m1 * m2 with [row][col] indexing.
//
// Three build variants:
//  * GCC-compatible compilers with SSE enabled: <xmmintrin.h> intrinsics with
//    unaligned loads and stores, so the matrices need no special alignment.
//  * Other compilers with inline assembly enabled: MSVC-style __asm block.
//    It uses movaps, so it presumably relies on all three matrices being
//    16-byte aligned - confirm at the call sites.
//  * NO_ASM or NOSSE builds: falls back to MulMatricesC so the function never
//    silently leaves r unwritten.
void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
{
#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
  /* [row][col]*/
  // All rows of m2 are read up front, before anything is stored to r.
  const __m128 row0 = _mm_loadu_ps(m2[0]);
  const __m128 row1 = _mm_loadu_ps(m2[1]);
  const __m128 row2 = _mm_loadu_ps(m2[2]);
  const __m128 row3 = _mm_loadu_ps(m2[3]);

  for (int i = 0; i < 4; ++i)
  {
    const __m128 leftrow = _mm_loadu_ps(m1[i]);

    // Broadcast leftrow[0] and calculate the four first summands
    __m128 destrow = _mm_mul_ps(_mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(0, 0, 0, 0)), row0);

    // Broadcast leftrow[1], leftrow[2], leftrow[3] in turn and accumulate
    destrow = _mm_add_ps(destrow, _mm_mul_ps(_mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(1, 1, 1, 1)), row1));
    destrow = _mm_add_ps(destrow, _mm_mul_ps(_mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(2, 2, 2, 2)), row2));
    destrow = _mm_add_ps(destrow, _mm_mul_ps(_mm_shuffle_ps(leftrow, leftrow, _MM_SHUFFLE(3, 3, 3, 3)), row3));

    // Portable unaligned store (replaces the GCC-only __builtin_ia32_storeups)
    _mm_storeu_ps(r[i], destrow);
  }
#elif !defined(NO_ASM) && !defined(NOSSE)
  __asm
  {
    mov eax, dword ptr [r]
    mov ecx, dword ptr [m1]
    mov edx, dword ptr [m2]

    movaps xmm0,[edx]
    movaps xmm1,[edx+16]
    movaps xmm2,[edx+32]
    movaps xmm3,[edx+48]

// r[0][0],r[0][1],r[0][2],r[0][3]

    movaps xmm4,xmmword ptr[ecx]
    movaps xmm5,xmm4
    movaps xmm6,xmm4
    movaps xmm7,xmm4

    shufps xmm4,xmm4,00000000b
    shufps xmm5,xmm5,01010101b
    shufps xmm6,xmm6,10101010b
    shufps xmm7,xmm7,11111111b

    mulps xmm4,xmm0
    mulps xmm5,xmm1
    mulps xmm6,xmm2
    mulps xmm7,xmm3

    addps xmm4,xmm5
    addps xmm4,xmm6
    addps xmm4,xmm7

    movaps xmmword ptr[eax],xmm4

// r[1][0],r[1][1],r[1][2],r[1][3]

    movaps xmm4,xmmword ptr[ecx+16]
    movaps xmm5,xmm4
    movaps xmm6,xmm4
    movaps xmm7,xmm4

    shufps xmm4,xmm4,00000000b
    shufps xmm5,xmm5,01010101b
    shufps xmm6,xmm6,10101010b
    shufps xmm7,xmm7,11111111b

    mulps xmm4,xmm0
    mulps xmm5,xmm1
    mulps xmm6,xmm2
    mulps xmm7,xmm3

    addps xmm4,xmm5
    addps xmm4,xmm6
    addps xmm4,xmm7

    movaps xmmword ptr[eax+16],xmm4


// r[2][0],r[2][1],r[2][2],r[2][3]

    movaps xmm4,xmmword ptr[ecx+32]
    movaps xmm5,xmm4
    movaps xmm6,xmm4
    movaps xmm7,xmm4

    shufps xmm4,xmm4,00000000b
    shufps xmm5,xmm5,01010101b
    shufps xmm6,xmm6,10101010b
    shufps xmm7,xmm7,11111111b

    mulps xmm4,xmm0
    mulps xmm5,xmm1
    mulps xmm6,xmm2
    mulps xmm7,xmm3

    addps xmm4,xmm5
    addps xmm4,xmm6
    addps xmm4,xmm7

    movaps xmmword ptr[eax+32],xmm4

// r[3][0],r[3][1],r[3][2],r[3][3]

    movaps xmm4,xmmword ptr[ecx+48]
    movaps xmm5,xmm4
    movaps xmm6,xmm4
    movaps xmm7,xmm4

    shufps xmm4,xmm4,00000000b
    shufps xmm5,xmm5,01010101b
    shufps xmm6,xmm6,10101010b
    shufps xmm7,xmm7,11111111b

    mulps xmm4,xmm0
    mulps xmm5,xmm1
    mulps xmm6,xmm2
    mulps xmm7,xmm3

    addps xmm4,xmm5
    addps xmm4,xmm6
    addps xmm4,xmm7

    movaps xmmword ptr[eax+48],xmm4
  }
#else
  // No SSE code path was compiled in: use the portable implementation.
  MulMatricesC(m1, m2, r);
#endif // SSE implementation selection
}
477
478
479
480 void math_init()
481 {
482#ifndef __ARM_NEON__
483#ifndef _DEBUG
484 int IsSSE = FALSE;
485#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
486 int edx, eax;
487 GLIDE64_TRY
488 {
489 #if defined(__x86_64__)
490 asm volatile(" cpuid; "
491 : "=a"(eax), "=d"(edx)
492 : "0"(1)
493 : "rbx", "rcx"
494 );
495 #else
496 asm volatile(" push %%ebx; "
497 " push %%ecx; "
498 " cpuid; "
499 " pop %%ecx; "
500 " pop %%ebx; "
501 : "=a"(eax), "=d"(edx)
502 : "0"(1)
503 :
504 );
505 #endif
506 }
507 GLIDE64_CATCH
508 { return; }
509 // Check for SSE
510 if (edx & (1 << 25))
511 IsSSE = TRUE;
512#elif !defined(NO_ASM) && !defined(NOSSE)
513 DWORD dwEdx;
514 __try
515 {
516 __asm
517 {
518 mov eax,1
519 cpuid
520 mov dwEdx,edx
521 }
522 }
523 __except(EXCEPTION_EXECUTE_HANDLER)
524 {
525 return;
526 }
527
528 if (dwEdx & (1<<25))
529 {
530 if (dwEdx & (1<<24))
531 {
532 __try
533 {
534 __asm xorps xmm0, xmm0
535 IsSSE = TRUE;
536 }
537 __except(EXCEPTION_EXECUTE_HANDLER)
538 {
539 return;
540 }
541 }
542 }
543#endif // _WIN32
544 if (IsSSE)
545 {
546 MulMatrices = MulMatricesSSE;
547 LOG("3DNOW! detected.\n");
548 }
549
550#endif //_DEBUG
551#endif //__ARM_NEON__
552 }