integrate M-HT's neon scalers
[fceu.git] / drivers / arm / neon_scale2x.S
CommitLineData
7127faf3 1@@\r
2@@ Copyright (C) 2012 Roman Pauer\r
3@@\r
4@@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r
5@@ this software and associated documentation files (the "Software"), to deal in\r
6@@ the Software without restriction, including without limitation the rights to\r
7@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r
8@@ of the Software, and to permit persons to whom the Software is furnished to do\r
9@@ so, subject to the following conditions:\r
10@@\r
11@@ The above copyright notice and this permission notice shall be included in all\r
12@@ copies or substantial portions of the Software.\r
13@@\r
14@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
15@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
16@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
17@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
18@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
19@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
20@@ SOFTWARE.\r
21@@\r
22\r
23.arm\r
24\r
88e59df3 25#include "neon_scale2x.Sinc"\r
26#include "neon_normalxx.Sinc"\r
7127faf3 27\r
28.global neon_scale2x_8_8\r
29.global neon_scale2x_16_16\r
30.global neon_scale2x_8_16\r
31\r
32.align 4\r
neon_scale2x_8_8:

@ Scale2x for 8-bit pixels: every source line produces two destination lines.
@ The per-line work is done by the neon_scale2x_8_8_line macro, which comes
@ from one of the included .Sinc files. This routine only sets up and advances
@ the line pointers and selects the first, middle or last variant of the macro.
@
@ r0     = const uint8_t *src
@ r1     = uint8_t *dst
@ r2     = unsigned int width (pixels)
@ r3     = unsigned int srcstride (bytes)
@ [sp]   = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr     = return address
@
@ Precondition: height >= 2 (one first line and one last line are always
@ processed). Any restriction on width comes from the line macro and is not
@ visible here.
@ r4-r9 are saved and restored. r0-r3, ip and the flags are modified.
@ NOTE(review): the NEON registers used by the line macro are not visible
@ here - check the macro before relying on any of them being preserved.

        ldr     ip, [sp]                @ ip = dststride (read before the push moves sp)
        push    {r4-r9}
        ldr     r9, [sp, #(7*4)]        @ r9 = height (6 saved registers + dststride slot)
        sub     r4, r0, r3              @ r4 = src - srcstride
        add     r5, r0, r3              @ r5 = src + srcstride
        add     r6, r1, ip              @ r6 = dst + dststride
        sub     r3, r3, r2              @ r3 = srcstride - width
        sub     ip, ip, r2              @ ip = dststride - width
        lsl     ip, #1                  @ ip = 2 * dststride - 2 * width
        mov     r7, r2                  @ r7 = width
        sub     r9, r9, #2              @ r9 = height - 2 = number of middle lines

@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - width)
@ r4 = src - srcstride (line above)
@ r5 = src + srcstride (line below)
@ r6 = dst + dststride (second output line)
@ r7 = counter (reloaded with width before every line)
@ r8 = tmpreg
@ r9 = remaining middle lines (height - 2)
@ ip = dstdiff (2 * dststride - 2 * width)
@
@ The srcdiff and dstdiff additions assume the line macro leaves the source
@ pointers advanced by width bytes and the destination pointers advanced by
@ 2 * width bytes.

        @ first line
        neon_scale2x_8_8_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0

        add     r0, r0, r3
        add     r4, r4, r3
        add     r5, r5, r3
        add     r1, r1, ip
        add     r6, r6, ip

        @ With height == 2 there are no middle lines. Entering the loop below
        @ with a zero counter would process a line that does not exist and
        @ then wrap the counter around, so go straight to the last line.
        cmp     r9, #0
        beq     .Lneon_scale2x_8_8_last

        @ middle lines
    101:
        mov     r7, r2                  @ counter = width

        neon_scale2x_8_8_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0

        subs    r9, r9, #1              @ sets Z for the bne below, the adds keep the flags
        add     r0, r0, r3
        add     r4, r4, r3
        add     r5, r5, r3
        add     r1, r1, ip
        add     r6, r6, ip
        bne     101b

.Lneon_scale2x_8_8_last:
        @ last line
        mov     r7, r2                  @ counter = width

        neon_scale2x_8_8_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0

        pop     {r4-r9}
        bx      lr

@ end procedure neon_scale2x_8_8
99\r
100\r
neon_scale2x_16_16:

@ Scale2x for 16-bit pixels: every source line produces two destination lines.
@ The per-line work is done by the neon_scale2x_16_16_line macro, which comes
@ from one of the included .Sinc files. This routine only sets up and advances
@ the line pointers and selects the first, middle or last variant of the macro.
@
@ r0     = const uint16_t *src
@ r1     = uint16_t *dst
@ r2     = unsigned int width (pixels)
@ r3     = unsigned int srcstride (bytes)
@ [sp]   = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr     = return address
@
@ Precondition: height >= 2 (one first line and one last line are always
@ processed). Any restriction on width comes from the line macro and is not
@ visible here.
@ r4-r9 are saved and restored. r0-r3, ip and the flags are modified.
@ NOTE(review): the NEON registers used by the line macro are not visible
@ here - check the macro before relying on any of them being preserved.

        ldr     ip, [sp]                @ ip = dststride (read before the push moves sp)
        push    {r4-r9}
        ldr     r9, [sp, #(7*4)]        @ r9 = height (6 saved registers + dststride slot)
        sub     r4, r0, r3              @ r4 = src - srcstride
        add     r5, r0, r3              @ r5 = src + srcstride
        add     r6, r1, ip              @ r6 = dst + dststride
        sub     r3, r3, r2, lsl #1      @ r3 = srcstride - 2 * width
        sub     ip, ip, r2, lsl #1      @ ip = dststride - 2 * width
        lsl     ip, #1                  @ ip = 2 * dststride - 4 * width
        mov     r7, r2                  @ r7 = width
        sub     r9, r9, #2              @ r9 = height - 2 = number of middle lines

@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - 2 * width)
@ r4 = src - srcstride (line above)
@ r5 = src + srcstride (line below)
@ r6 = dst + dststride (second output line)
@ r7 = counter (reloaded with width before every line)
@ r8 = tmpreg
@ r9 = remaining middle lines (height - 2)
@ ip = dstdiff (2 * dststride - 4 * width)
@
@ The srcdiff and dstdiff additions assume the line macro leaves the source
@ pointers advanced by 2 * width bytes and the destination pointers advanced
@ by 4 * width bytes.

        @ first line
        neon_scale2x_16_16_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0

        add     r0, r0, r3
        add     r4, r4, r3
        add     r5, r5, r3
        add     r1, r1, ip
        add     r6, r6, ip

        @ With height == 2 there are no middle lines. Entering the loop below
        @ with a zero counter would process a line that does not exist and
        @ then wrap the counter around, so go straight to the last line.
        cmp     r9, #0
        beq     .Lneon_scale2x_16_16_last

        @ middle lines
    101:
        mov     r7, r2                  @ counter = width

        neon_scale2x_16_16_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0

        subs    r9, r9, #1              @ sets Z for the bne below, the adds keep the flags
        add     r0, r0, r3
        add     r4, r4, r3
        add     r5, r5, r3
        add     r1, r1, ip
        add     r6, r6, ip
        bne     101b

.Lneon_scale2x_16_16_last:
        @ last line
        mov     r7, r2                  @ counter = width

        neon_scale2x_16_16_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0

        pop     {r4-r9}
        bx      lr

@ end procedure neon_scale2x_16_16
167\r
168\r
neon_scale2x_8_16:

@ Scale2x from 8-bit source pixels to 16-bit destination pixels.
@ Each source line is first passed through neon_normal1x_8_16_line together
@ with the palette into one of three temporary lines on the stack (2 * width
@ bytes each); the temporary lines are then fed to neon_scale2x_16_16_line,
@ which writes two destination lines per source line. Both macros come from
@ the included .Sinc files.
@ NOTE(review): the exact palette lookup performed by neon_normal1x_8_16_line
@ is not visible here - confirm against the macro.
@
@ r0     = const uint8_t *src
@ r1     = uint16_t *dst
@ r2     = const uint32_t *palette
@ r3     = unsigned int width (pixels)
@ [sp]   = unsigned int srcstride (bytes)
@ [sp+4] = unsigned int dststride (bytes)
@ [sp+8] = unsigned int height
@ lr     = return address
@
@ Precondition: height >= 2 (one first line and one last line are always
@ processed). Any restriction on width comes from the line macros and is not
@ visible here.
@ r4-r11 and lr are saved and restored, sp is restored from the saved copy.

@ three temporary lines

        ldr     ip, [sp]                @ ip = srcstride (read before the push moves sp)
        push    {r4-r11,lr}
        ldr     r4, [sp, #(4*10)]       @ r4 = dststride (9 saved registers + srcstride slot)
        ldr     r5, [sp, #(4*11)]       @ r5 = height
        mov     r6, sp                  @ r6 = sp, kept so the frame can be dropped on exit
        sub     ip, ip, r3              @ ip = srcstride - width
        bic     sp, sp, #31             @ align sp to 32 bytes
        sub     r7, r4, r3, lsl #1      @ r7 = dststride - 2 * width
        sub     sp, sp, r3, lsl #1      @ sp -= 2 * width
        sub     r5, r5, #2              @ height -= 2 (number of middle lines)
        mov     r10, sp                 @ tmpline3 = sp
        lsl     r7, #1                  @ r7 = 2 * dststride - 4 * width
        bic     sp, sp, #31             @ align sp to 32 bytes
        sub     sp, sp, r3, lsl #1      @ sp -= 2 * width
        mov     r11, sp                 @ tmpline2 = sp
        bic     sp, sp, #31             @ align sp to 32 bytes
        sub     sp, sp, r3, lsl #1      @ sp -= 2 * width
        mov     lr, sp                  @ tmpline1 = sp
        bic     sp, sp, #31             @ align sp to 32 bytes
        sub     sp, sp, #36             @ room for the nine local words below
        str     r6, [sp]                @ oldsp = r6
        str     r5, [sp, #4]            @ height = r5
        str     ip, [sp, #8]            @ srcdiff = ip
        str     r7, [sp, #12]           @ dstdiff = r7
        str     r4, [sp, #16]           @ dststride = r4
        str     lr, [sp, #20]           @ tmpline1 = lr
        str     r11, [sp, #24]          @ tmpline2 = r11
        str     r10, [sp, #28]          @ tmpline3 = r10
        str     r3, [sp, #32]           @ width = r3

@ NOTE(review): each temporary line ends on a 32-byte boundary, so its start
@ is only aligned when 2 * width is a multiple of the alignment in question.
@ The scale macro is invoked below with 1 as its ninth argument - if that is
@ a source alignment hint, confirm that it holds for every supported width.

@ r0 = src
@ r1 = dst
@ r2 = palette
@ r3 = counter
@ r4 = dst2

@ r11 = bufptr1
@ ip = bufptr2
@ lr = bufptr3

@ [sp] = oldsp
@ [sp, #4] = remaining middle lines (height - 2)
@ [sp, #8] = srcdiff (srcstride - width)
@ [sp, #12] = dstdiff (2 * dststride - 4 * width)
@ [sp, #16] = dststride
@ [sp, #20] = tmpline1
@ [sp, #24] = tmpline2
@ [sp, #28] = tmpline3
@ [sp, #32] = width

        @ lr = tmpline1
        @ r3 = counter

        @ first line
        neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

        ldr     r7, [sp, #8]            @ r7 = srcdiff
        ldr     r3, [sp, #32]           @ counter = width
        ldr     lr, [sp, #24]           @ bufptr3 = tmpline2
        add     r0, r0, r7              @ src += srcdiff

        @ second line
        neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

        ldr     r9, [sp, #16]           @ r9 = dststride
        ldr     r3, [sp, #32]           @ counter = width
        ldr     ip, [sp, #20]           @ bufptr2 = tmpline1
        ldr     lr, [sp, #24]           @ bufptr3 = tmpline2
        add     r4, r1, r9              @ dst2 = dst + dststride

        @ first temporary line
        neon_scale2x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, 1, 0

        ldr     r7, [sp, #8]            @ r7 = srcdiff
        ldr     r8, [sp, #12]           @ r8 = dstdiff
        ldr     r3, [sp, #32]           @ counter = width
        ldr     lr, [sp, #28]           @ bufptr3 = tmpline3
        add     r0, r0, r7              @ src += srcdiff
        add     r1, r1, r8              @ dst += dstdiff

        @ With height == 2 there are no middle lines. Entering the loop below
        @ with a zero counter would convert and scale a source line that does
        @ not exist and then wrap the counter around. tmpline1 and tmpline2
        @ already hold both lines, and r3 and lr are set up as on loop exit,
        @ so go straight to the last line. r6 is free to use as scratch here:
        @ the loop reloads it before use.
        ldr     r6, [sp, #4]            @ r6 = remaining middle lines
        cmp     r6, #0
        beq     .Lneon_scale2x_8_16_last

    100:

        @ line n+1
        neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

        ldr     r9, [sp, #16]           @ r9 = dststride
        ldr     r11, [sp, #20]          @ bufptr1 = tmpline1
        ldr     ip, [sp, #24]           @ bufptr2 = tmpline2
        ldr     lr, [sp, #28]           @ bufptr3 = tmpline3
        add     r4, r1, r9              @ dst2 = dst + dststride
        ldr     r3, [sp, #32]           @ counter = width
        @ rotate the three buffers so that the oldest line is reused next
        str     r11, [sp, #28]          @ tmpline3 = bufptr1
        str     ip, [sp, #20]           @ tmpline1 = bufptr2
        str     lr, [sp, #24]           @ tmpline2 = bufptr3

        @ temporary line n
        neon_scale2x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, 1, 0

        ldr     r6, [sp, #4]            @ r6 = height
        ldr     r7, [sp, #8]            @ r7 = srcdiff
        ldr     r8, [sp, #12]           @ r8 = dstdiff
        ldr     r3, [sp, #32]           @ counter = width
        subs    r6, r6, #1              @ height--, sets Z for the bne below
        ldr     lr, [sp, #28]           @ bufptr3 = tmpline3
        add     r0, r0, r7              @ src += srcdiff
        add     r1, r1, r8              @ dst += dstdiff
        str     r6, [sp, #4]            @ height = r6
        bne     100b

.Lneon_scale2x_8_16_last:

        ldr     r9, [sp, #16]           @ r9 = dststride
        ldr     r11, [sp, #20]          @ bufptr1 = tmpline1
        ldr     ip, [sp, #24]           @ bufptr2 = tmpline2
        add     r4, r1, r9              @ dst2 = dst + dststride

        @ last temporary line
        neon_scale2x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, 1, 0


        ldr     sp, [sp]                @ sp = oldsp
        pop     {r4-r11,lr}
        bx      lr

@ end procedure neon_scale2x_8_16
306\r