add M-HT's neon scalers
[fceu.git] / drivers / arm / neon_eagle2x.S
CommitLineData
7127faf3 1@@\r
2@@ Copyright (C) 2012 Roman Pauer\r
3@@\r
4@@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r
5@@ this software and associated documentation files (the "Software"), to deal in\r
6@@ the Software without restriction, including without limitation the rights to\r
7@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r
8@@ of the Software, and to permit persons to whom the Software is furnished to do\r
9@@ so, subject to the following conditions:\r
10@@\r
11@@ The above copyright notice and this permission notice shall be included in all\r
12@@ copies or substantial portions of the Software.\r
13@@\r
14@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
15@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
16@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
17@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
18@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
19@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
20@@ SOFTWARE.\r
21@@\r
22\r
23.arm\r
24\r
25.include "neon_eagle2x.Sinc"\r
26.include "neon_normalxx.Sinc"\r
27\r
28.global neon_eagle2x_8_8\r
29.global neon_eagle2x_16_16\r
30.global neon_eagle2x_8_16\r
31\r
32.align 4\r
@ neon_eagle2x_8_8
@
@ Drives the neon_eagle2x_8_8_line macro over an image of 8-bit pixels:
@ one "first" pass, (height - 2) "middle" passes and one "last" pass.
@ Every pass is handed three source row pointers (previous, current, next)
@ and two destination row pointers (dst and dst + dststride).
@
@ q4-q7 (d8-d15) are callee-saved, so they are spilled to a 32-byte aligned
@ 64-byte area carved out of the stack and reloaded before returning.
@
@ height < 2 returns immediately without writing anything: the first/last
@ split below needs at least two rows.
@ NOTE(review): confirm callers never rely on height == 1 producing output.
neon_eagle2x_8_8:

@ r0 = const uint8_t *src
@ r1 = uint8_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address

    ldr     ip, [sp, #4]                @ ip = height
    cmp     ip, #2
    bxlo    lr                          @ height < 2: nothing can be scaled

    ldr     ip, [sp]                    @ ip = dststride
    push    {r4-r10}
    ldr     r9, [sp, #(8*4)]            @ r9 = height (7 pushed words + 1 arg)
    sub     r4, r0, r3                  @ r4 = src - srcstride
    mov     r10, sp                     @ oldsp = sp
    add     r5, r0, r3                  @ r5 = src + srcstride
    bic     sp, sp, #31                 @ align sp to 32 bytes
    add     r6, r1, ip                  @ r6 = dst + dststride
    sub     sp, sp, #64                 @ sp -= 64 (save area for q4-q7)
    sub     r3, r3, r2                  @ r3 = srcstride - width
    vst1.64 {d8-d11}, [sp:256]          @ save q4,q5
    add     r7, sp, #32                 @ r7 = sp + 32
    sub     ip, ip, r2                  @ ip = dststride - width
    vst1.64 {d12-d15}, [r7:256]         @ save q6,q7
    lsl     ip, #1                      @ ip = 2 * dststride - 2 * width
    mov     r7, r2                      @ r7 = width
    sub     r9, r9, #2                  @ r9 = height - 2 (middle line count)


@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = counter
@ r8 = tmpreg
@ r9 = remaining middle lines (height - 2)
@ r10 = oldsp
@ ip = dstdiff (2 * dststride - 2 * width)

    @ first line
    neon_eagle2x_8_8_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0

    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip

    @ height == 2 has no middle lines; entering the do-while loop below with
    @ r9 == 0 would wrap the counter, so skip it.
    cmp     r9, #0
    beq     .Lneon_eagle2x_8_8_last

    @ middle lines
    101:
    mov     r7, r2

    neon_eagle2x_8_8_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0

    subs    r9, r9, #1
    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip
    bne     101b

.Lneon_eagle2x_8_8_last:
    @ last line
    mov     r7, r2

    neon_eagle2x_8_8_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0

    @ Reload q4-q7 while sp still covers the save area; memory below sp may
    @ be overwritten asynchronously (e.g. by a signal handler).
    add     ip, sp, #32                 @ ip = sp + 32
    vld1.64 {d8-d11}, [sp:256]          @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256]         @ restore q6,q7
    mov     sp, r10                     @ sp = oldsp
    pop     {r4-r10}
    bx      lr

@ end procedure neon_eagle2x_8_8
111\r
112\r
@ neon_eagle2x_16_16
@
@ Drives the neon_eagle2x_16_16_line macro over an image of 16-bit pixels:
@ one "first" pass, (height - 2) "middle" passes and one "last" pass.
@ Every pass is handed three source row pointers (previous, current, next)
@ and two destination row pointers (dst and dst + dststride).
@
@ q4-q7 (d8-d15) are callee-saved, so they are spilled to a 32-byte aligned
@ 64-byte area carved out of the stack and reloaded before returning.
@
@ height < 2 returns immediately without writing anything: the first/last
@ split below needs at least two rows.
@ NOTE(review): confirm callers never rely on height == 1 producing output.
neon_eagle2x_16_16:

@ r0 = const uint16_t *src
@ r1 = uint16_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address

    ldr     ip, [sp, #4]                @ ip = height
    cmp     ip, #2
    bxlo    lr                          @ height < 2: nothing can be scaled

    ldr     ip, [sp]                    @ ip = dststride
    push    {r4-r10}
    ldr     r9, [sp, #(8*4)]            @ r9 = height (7 pushed words + 1 arg)
    sub     r4, r0, r3                  @ r4 = src - srcstride
    mov     r10, sp                     @ oldsp = sp
    add     r5, r0, r3                  @ r5 = src + srcstride
    bic     sp, sp, #31                 @ align sp to 32 bytes
    add     r6, r1, ip                  @ r6 = dst + dststride
    sub     sp, sp, #64                 @ sp -= 64 (save area for q4-q7)
    sub     r3, r3, r2, lsl #1          @ r3 = srcstride - 2 * width
    vst1.64 {d8-d11}, [sp:256]          @ save q4,q5
    add     r7, sp, #32                 @ r7 = sp + 32
    sub     ip, ip, r2, lsl #1          @ ip = dststride - 2 * width
    vst1.64 {d12-d15}, [r7:256]         @ save q6,q7
    lsl     ip, #1                      @ ip = 2 * dststride - 4 * width
    mov     r7, r2                      @ r7 = width
    sub     r9, r9, #2                  @ r9 = height - 2 (middle line count)

@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - 2 * width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = counter
@ r8 = tmpreg
@ r9 = remaining middle lines (height - 2)
@ r10 = oldsp
@ ip = dstdiff (2 * dststride - 4 * width)

    @ first line
    neon_eagle2x_16_16_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0

    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip

    @ height == 2 has no middle lines; entering the do-while loop below with
    @ r9 == 0 would wrap the counter, so skip it.
    cmp     r9, #0
    beq     .Lneon_eagle2x_16_16_last

    @ middle lines
    101:
    mov     r7, r2

    neon_eagle2x_16_16_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0

    subs    r9, r9, #1
    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip
    bne     101b

.Lneon_eagle2x_16_16_last:
    @ last line
    mov     r7, r2

    neon_eagle2x_16_16_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0

    @ Reload q4-q7 while sp still covers the save area; memory below sp may
    @ be overwritten asynchronously (e.g. by a signal handler).
    add     ip, sp, #32                 @ ip = sp + 32
    vld1.64 {d8-d11}, [sp:256]          @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256]         @ restore q6,q7
    mov     sp, r10                     @ sp = oldsp
    pop     {r4-r10}
    bx      lr

@ end procedure neon_eagle2x_16_16
190\r
191\r
@ neon_eagle2x_8_16
@
@ Scales an 8-bit source image into a 16-bit destination using a palette.
@ Source rows are first run through neon_normal1x_8_16_line into one of three
@ temporary line buffers on the stack (2 * width bytes each); the temporary
@ lines are then fed to neon_eagle2x_16_16_line as previous/current/next rows.
@ The three buffer pointers are rotated after every middle line.
@ NOTE(review): the macros live in the included .Sinc files; the description
@ of what they compute is inferred from their names and arguments.
@
@ q4-q7 (d8-d15) are callee-saved and are spilled to a 32-byte aligned area
@ that sits directly above the block of local variables.
@
@ height < 2 returns immediately without writing anything: the first/last
@ split below needs at least two rows.
@ NOTE(review): confirm callers never rely on height == 1 producing output.
neon_eagle2x_8_16:

@ r0 = const uint8_t *src
@ r1 = uint16_t *dst
@ r2 = const uint32_t *palette
@ r3 = unsigned int width (pixels)
@ [sp] = unsigned int srcstride (bytes)
@ [sp+4] = unsigned int dststride (bytes)
@ [sp+8] = unsigned int height
@ lr = return address

@ three temporary lines

    ldr     ip, [sp, #8]                @ ip = height
    cmp     ip, #2
    bxlo    lr                          @ height < 2: nothing can be scaled

    ldr     ip, [sp]                    @ ip = srcstride
    push    {r4-r11,lr}
    ldr     r4, [sp, #(4*10)]           @ r4 = dststride (9 pushed words + 1 arg)
    ldr     r5, [sp, #(4*11)]           @ r5 = height
    mov     r6, sp                      @ r6 = sp
    sub     ip, ip, r3                  @ ip = srcstride - width
    bic     sp, sp, #31                 @ align sp to 32 bytes
    sub     r7, r4, r3, lsl #1          @ r7 = dststride - 2 * width
    sub     sp, sp, r3, lsl #1          @ sp -= 2 * width
    sub     r5, r5, #2                  @ height -= 2
    mov     r10, sp                     @ tmpline3 = sp
    lsl     r7, #1                      @ r7 = 2 * dststride - 4 * width
    bic     sp, sp, #31                 @ align sp to 32 bytes
    sub     sp, sp, r3, lsl #1          @ sp -= 2 * width
    mov     r11, sp                     @ tmpline2 = sp
    bic     sp, sp, #31                 @ align sp to 32 bytes
    sub     sp, sp, r3, lsl #1          @ sp -= 2 * width
    mov     lr, sp                      @ tmpline1 = sp
    bic     sp, sp, #31                 @ align sp to 32 bytes

    @ Lower sp over the q4-q7 save area before writing to it, so the stores
    @ never touch memory below sp.
    sub     sp, sp, #64                 @ sp -= 64 (save area for q4-q7)
    mov     r8, sp                      @ r8 = save area
    vst1.64 {d8-d11}, [r8:256]          @ save q4,q5
    add     r9, r8, #32                 @ r9 = save area + 32
    vst1.64 {d12-d15}, [r9:256]         @ save q6,q7
    sub     sp, sp, #36                 @ sp -= 36 (nine local words)
    str     r6, [sp]                    @ oldsp = r6
    str     r5, [sp, #4]                @ height = r5
    str     ip, [sp, #8]                @ srcdiff = ip
    str     r7, [sp, #12]               @ dstdiff = r7
    str     r4, [sp, #16]               @ dststride = r4
    str     lr, [sp, #20]               @ tmpline1 = lr
    str     r11, [sp, #24]              @ tmpline2 = r11
    str     r10, [sp, #28]              @ tmpline3 = r10
    str     r3, [sp, #32]               @ width = r3

@ r0 = src
@ r1 = dst
@ r2 = palette
@ r3 = counter
@ r4 = dst2

@ r11 = bufptr1
@ ip = bufptr2
@ lr = bufptr3

@ [sp] = oldsp
@ [sp, #4] = remaining middle lines (height - 2)
@ [sp, #8] = srcdiff (srcstride - width)
@ [sp, #12] = dstdiff (2 * dststride - 4 * width)
@ [sp, #16] = dststride
@ [sp, #20] = tmpline1
@ [sp, #24] = tmpline2
@ [sp, #28] = tmpline3
@ [sp, #32] = width
@ [sp, #36] = saved q4-q7 (64 bytes, 32-byte aligned)

    @ lr = tmpline1
    @ r3 = counter

    @ first line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr     r7, [sp, #8]                @ r7 = srcdiff
    ldr     r3, [sp, #32]               @ counter = width
    ldr     lr, [sp, #24]               @ bufptr3 = tmpline2
    add     r0, r0, r7                  @ src += srcdiff

    @ second line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr     r9, [sp, #16]               @ r9 = dststride
    ldr     r3, [sp, #32]               @ counter = width
    ldr     ip, [sp, #20]               @ bufptr2 = tmpline1
    ldr     lr, [sp, #24]               @ bufptr3 = tmpline2
    add     r4, r1, r9                  @ dst2 = dst + dststride

    @ first temporary line
    neon_eagle2x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, 1, 0

    ldr     r7, [sp, #8]                @ r7 = srcdiff
    ldr     r8, [sp, #12]               @ r8 = dstdiff
    ldr     r3, [sp, #32]               @ counter = width
    ldr     lr, [sp, #28]               @ bufptr3 = tmpline3
    add     r0, r0, r7                  @ src += srcdiff
    add     r1, r1, r8                  @ dst += dstdiff

    @ height == 2 has no middle lines and no third source row: skip the
    @ do-while loop, which would otherwise wrap the stored counter.
    @ tmpline1/tmpline2 already hold the two rows the last pass needs.
    ldr     r6, [sp, #4]                @ r6 = remaining middle lines
    cmp     r6, #0
    beq     .Lneon_eagle2x_8_16_last

    100:

    @ line n+1
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr     r9, [sp, #16]               @ r9 = dststride
    ldr     r11, [sp, #20]              @ bufptr1 = tmpline1
    ldr     ip, [sp, #24]               @ bufptr2 = tmpline2
    ldr     lr, [sp, #28]               @ bufptr3 = tmpline3
    add     r4, r1, r9                  @ dst2 = dst + dststride
    ldr     r3, [sp, #32]               @ counter = width
    str     r11, [sp, #28]              @ tmpline3 = bufptr1
    str     ip, [sp, #20]               @ tmpline1 = bufptr2
    str     lr, [sp, #24]               @ tmpline2 = bufptr3

    @ temporary line n
    neon_eagle2x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, 1, 0

    ldr     r6, [sp, #4]                @ r6 = height
    ldr     r7, [sp, #8]                @ r7 = srcdiff
    ldr     r8, [sp, #12]               @ r8 = dstdiff
    ldr     r3, [sp, #32]               @ counter = width
    subs    r6, r6, #1                  @ height--
    ldr     lr, [sp, #28]               @ bufptr3 = tmpline3
    add     r0, r0, r7                  @ src += srcdiff
    add     r1, r1, r8                  @ dst += dstdiff
    str     r6, [sp, #4]                @ height = r6
    bne     100b


.Lneon_eagle2x_8_16_last:
    ldr     r9, [sp, #16]               @ r9 = dststride
    ldr     r11, [sp, #20]              @ bufptr1 = tmpline1
    ldr     ip, [sp, #24]               @ bufptr2 = tmpline2
    add     r4, r1, r9                  @ dst2 = dst + dststride

    @ last temporary line
    neon_eagle2x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, 1, 0


    @ Reload q4-q7 while sp still covers the save area; memory below sp may
    @ be overwritten asynchronously (e.g. by a signal handler).
    add     r6, sp, #36                 @ r6 = sp + 36
    add     ip, sp, #(36 + 32)          @ ip = sp + 36 + 32
    vld1.64 {d8-d11}, [r6:256]          @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256]         @ restore q6,q7
    ldr     sp, [sp]                    @ sp = oldsp
    pop     {r4-r11,lr}
    bx      lr

@ end procedure neon_eagle2x_8_16
337\r