rice: initial NEON transform code
[mupen64plus-pandora.git] / source / gles2rice / src / RenderBase_neon.S
CommitLineData
/*
 * (C) Gražvydas "notaz" Ignotas, 2014
 *
 * This work is licensed under the terms of GNU GPL version 2 or later.
 * See the COPYING file in the top-level directory.
 */
7
8#include "arm_features.h"
9
@ Assembler setup: unified ARM/Thumb syntax, code section, 8-byte (2^3) alignment.
.syntax         unified
.text
.p2align        3
13
/*
 * do_mac_flags rr1, rr2, rr3
 *
 * Compares \rr1 against 1 and leaves the result in the condition flags.
 * \rr2 and \rr3 are accepted but not referenced by the macro body.
 */
.macro do_mac_flags rr1 rr2 rr3
    cmp         \rr1, #1
.endm
17
18
/*
 * pv_neon - transform and project dwNum vertices with NEON.
 *
 * For every input vertex:
 *   - x, y, z are sign-extended, converted to float and, with w forced to
 *     1.0f, multiplied by the 4x4 matrix at gRSPworldProjectTransported;
 *     the 4-float result is stored to g_vtxTransformed,
 *   - the result is scaled by an estimate of 1/w and stored to
 *     g_vecProjected, with the .w lane replaced by 1/w itself,
 *   - tu, tv are converted to float and stored (u first) to g_fVtxTxtCoords.
 * Nothing is written when dwNum is 0.
 *
 * register map:
 *   r0-r3    output cursors (see below), post-incremented per vertex
 *   r4       input vertex cursor
 *   r11      vertices left to process
 *   r12      scratch, then the constant 1.0f inside the loop
 *   q0       transformed vertex       (d0 = .x .y, d1 = .z .w)
 *   q1       s32 tu/tv, then d2/d3 hold the reciprocal estimate and step
 *   q2       float { x, y, z, 1.0f }, then partial sums
 *   q3       d6 = float { tu, tv }, then partial sums
 *   q4-q7    per-row products; q7 is reused for the projected vertex
 *   q12      gRSPworldProject _11,_21,_31,_41
 *   q13      gRSPworldProject _12,_22,_32,_42
 *   q14      gRSPworldProject _13,_23,_33,_43
 *   q15      gRSPworldProject _14,_24,_34,_44
 *
 * input vertex (r4), 16 bytes:
 *   short y, x, flag, z, tv, tu;
 *   then 4 bytes, either uint8 a, b, g, r or char a, z, y, x
 *   (these 4 bytes are skipped, not read, by this routine)
 *
 * outputs:
 *   r0       - XVECTOR4 *g_vtxTransformed
 *   r1       - XVECTOR4 *g_vecProjected
 *   r2       - uint32   *g_dwVtxDifColor        (not written yet)
 *   r3       - VECTOR2  *g_fVtxTxtCoords
 *   sp+00    - float    *g_fFogCoord            (not written yet)
 *   sp+04    - uint32   *g_clipFlag2            (not written yet)
 * inputs (stack offsets are relative to sp on entry):
 *   sp+08    - uint32    dwNum
 *   sp+0c    - int       neon_flags             (not read yet)
 *   sp+10    - FiddledVtx vtx[]
 *   sp+14    - Light    *gRSPlights             (not read yet)
 *   sp+18    - float    *fRSPAmbientLightRGBA   (not read yet)
 *   sp+1c    - XMATRIX  *gRSPworldProjectTransported (loaded with a
 *              :128 alignment hint, so it must be 16-byte aligned)
 *   sp+20    - XMATRIX  *gRSPmodelViewTop       (not read yet)
 *   sp+24    - uint32    gRSPnumLights          (not read yet)
 *   sp+28    - float     gRSPfFogMin            (not read yet)
 *
 * NOTE(review): the per-vertex loop assumes every output array has room for
 * dwNum entries - confirm against the caller.
 */

@ bytes pushed by the prologue: r4-r11,lr (9 * 4) + q4-q7 (4 * 16) = 0x64
.equ PV_SAVED_BYTES, (9 * 4 + 4 * 16)

FUNCTION(pv_neon): @
    ldr         r12, [sp, #0x10]        @ vtx (read before sp moves)
    pld         [r12]

    push        {r4-r11,lr}
    vpush       {q4-q7}

    mov         r4, r12                 @ r4 = vtx cursor
    ldr         r11, [sp, #PV_SAVED_BYTES+0x08] @ r11 = dwNum
    ldr         r12, [sp, #PV_SAVED_BYTES+0x1c] @ gRSPworldProjectTransported
    cmp         r11, #0
    beq         1f                      @ nothing to do for an empty batch

    vld1.32     {q12,q13}, [r12, :128]! @ load gRSPworldProject
    vld1.32     {q14,q15}, [r12, :128]
    mov         r12, #0x3f800000        @ 1.0f, constant for the whole loop

0:
    @ The halfwords are stored pairwise swapped, so each one is loaded
    @ into the lane that puts q-lanes in x, y, z, flag / u, v order.
    vld1.16     {d4[1]}, [r4]!          @ y
    vld1.16     {d4[0]}, [r4]!          @ x
    vld1.16     {d4[3]}, [r4]!          @ flag
    vld1.16     {d4[2]}, [r4]!          @ z
    vld1.16     {d5[1]}, [r4]!          @ v
    vld1.16     {d5[0]}, [r4]!          @ u
    add         r4, r4, #4              @ skip the 4 colour/normal bytes
    vmovl.s16   q0, d4                  @ q0 = s32 { x, y, z, flag }
    vmovl.s16   q1, d5                  @ d2 = s32 { u, v }
    vcvt.f32.s32 q2, q0                 @ q2 = vtx_raw as float
    vcvt.f32.s32 q3, q1                 @ d6 = float u, v
    vmov.32     d5[1], r12              @ q2 = { x, y, z, 1.0f }

    @ one product vector per matrix row; summed horizontally below
    vmul.f32    q4, q2, q12
    vmul.f32    q5, q2, q13
    vmul.f32    q6, q2, q14
    vmul.f32    q7, q2, q15
    vst1.32     {d6}, [r3]!             @ wrt u,v to g_fVtxTxtCoords
    vpadd.f32   d7, d14, d15
    vpadd.f32   d6, d12, d13
    vpadd.f32   d5, d10, d11
    vpadd.f32   d4, d8, d9
    vpadd.f32   d1, d6, d7              @ g_vtxTransformed .z .w
    vpadd.f32   d0, d4, d5              @ g_vtxTransformed .x .y

    @ 1/w: hardware estimate refined by two Newton-Raphson steps
    @ (vrecps yields 2 - d*x, which scales the previous estimate x)
    vrecpe.f32  d2, d1                  @ inv .z(unused) .w
    vrecps.f32  d3, d1, d2              @ step
    vst1.32     {q0}, [r0]!             @ wrt g_vtxTransformed
    vmul.f32    d2, d3, d2              @ better inv
    vrecps.f32  d3, d1, d2              @ step
    vmul.f32    d2, d3, d2              @ better inv

    vmul.f32    q7, q0, d2[1]           @ q7 = transformed * (1/w)
    vshr.u64    d2, d2, #32             @ move 1/w down to lane 0
    vsli.64     d15, d2, #32            @ d15 = { z/w, 1/w }
    vst1.32     {q7}, [r1]!             @ wrt g_vecProjected

    subs        r11, r11, #1
    bne         0b

1:
    vpop        {q4-q7}
    pop         {r4-r11,pc}
    .size       pv_neon, .-pv_neon
137
138
139@ vim:filetype=armasm