@@
@@ Source: gpsp.git / arm / neon_scale3x.S
@@ Commit e38fee1b: integrate the NEON scalers by M-HT
@@
@@ Copyright (C) 2012 Roman Pauer
@@
@@ Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ this software and associated documentation files (the "Software"), to deal in
@@ the Software without restriction, including without limitation the rights to
@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
@@ of the Software, and to permit persons to whom the Software is furnished to do
@@ so, subject to the following conditions:
@@
@@ The above copyright notice and this permission notice shall be included in all
@@ copies or substantial portions of the Software.
@@
@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
@@ SOFTWARE.
@@
22\r
@ Assemble everything below as 32-bit ARM (not Thumb) instructions.
.arm

@ Row-processing macros used by the entry points in this file
@ (neon_scale3x_8_8_line, neon_scale3x_16_16_line, neon_normal1x_8_16_line).
@ Their definitions are not visible here; presumably they live in these
@ two include files.
#include "neon_scale3x.Sinc"
#include "neon_normalxx.Sinc"

@ Entry points exported to the linker.
.global neon_scale3x_8_8
.global neon_scale3x_16_16
.global neon_scale3x_8_16

@ On ARM targets the .align operand is a power of two, so this requests
@ 16-byte alignment for the first entry point.
.align 4
neon_scale3x_8_8:

@ 3x upscaler entry point, 8-bit source pixels to 8-bit destination pixels.
@ Each source row produces three destination rows (dst, dst + dststride and
@ dst + 2 * dststride). The per-row work is done by the
@ neon_scale3x_8_8_line macro, which is defined outside this file; its
@ first / middle / last variants are used for the top row, the inner rows
@ and the bottom row respectively.
@
@ r0 = const uint8_t *src
@ r1 = uint8_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address
@
@ Preserves r4-r11 and q4-q7 (saved on entry, restored on exit).
@ A height below 2 is not supported by the first/last row structure; the
@ routine then returns without reading src or writing dst.

        ldr     ip, [sp, #4]            @ ip = height
        cmp     ip, #2
        bxlo    lr                      @ height < 2: return, nothing saved yet

        ldr     ip, [sp]                @ ip = dststride
        push    {r4-r11}
        ldr     r9, [sp, #(9*4)]        @ r9 = height (8 pushed words + 1)
        sub     r4, r0, r3              @ r4 = src - srcstride
        mov     r11, sp                 @ oldsp = sp
        add     r5, r0, r3              @ r5 = src + srcstride
        bic     sp, sp, #31             @ align sp to 32 bytes
        add     r6, r1, ip              @ r6 = dst + dststride
        sub     sp, sp, #64             @ reserve 64 bytes for q4-q7
        sub     r3, r3, r2              @ r3 = srcstride - width
        vst1.64 {d8-d11}, [sp:256]      @ save q4,q5
        add     r7, r1, ip, lsl #1      @ r7 = dst + 2 * dststride
        add     r8, sp, #32             @ r8 = sp + 32
        sub     ip, ip, r2              @ ip = dststride - width
        vst1.64 {d12-d15}, [r8:256]     @ save q6,q7
        add     ip, ip, ip, lsl #1      @ ip = 3 * dststride - 3 * width
        mov     r8, r2                  @ r8 = width
        sub     r9, r9, #2              @ r9 = height - 2 (middle row count)

@ Register roles from here on:
@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = dst + 2 * dststride
@ r8 = counter
@ r9 = remaining middle rows
@ r10 = tmpreg
@ r11 = oldsp
@ ip = dstdiff (3 * dststride - 3 * width)

        @ first line
        neon_scale3x_8_8_line first, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

        @ step all source and destination pointers to the next row
        add     r0, r0, r3
        add     r4, r4, r3
        add     r5, r5, r3
        add     r1, r1, ip
        add     r6, r6, ip
        add     r7, r7, ip

        @ The loop below tests its counter only after the body, so it must
        @ not be entered when there are no middle rows (height == 2).
        cmp     r9, #0
        beq     .Lneon_scale3x_8_8_last

        @ middle lines
.Lneon_scale3x_8_8_middle:
        mov     r8, r2                  @ counter = width

        neon_scale3x_8_8_line middle, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

        subs    r9, r9, #1              @ sets the flags tested by bne below
        add     r0, r0, r3              @ plain add leaves the flags alone
        add     r4, r4, r3
        add     r5, r5, r3
        add     r1, r1, ip
        add     r6, r6, ip
        add     r7, r7, ip
        bne     .Lneon_scale3x_8_8_middle

        @ last line
.Lneon_scale3x_8_8_last:
        mov     r8, r2                  @ counter = width

        neon_scale3x_8_8_line last, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

        @ Reload both halves of the save area before sp is moved back up,
        @ so that no data is ever read from below the stack pointer.
        add     ip, sp, #32             @ ip = sp + 32
        vld1.64 {d8-d11}, [sp:256]      @ restore q4,q5
        vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7
        mov     sp, r11                 @ sp = oldsp
        pop     {r4-r11}
        bx      lr

@ end procedure neon_scale3x_8_8
115\r
116\r
neon_scale3x_16_16:

@ 3x upscaler entry point, 16-bit source pixels to 16-bit destination pixels.
@ Each source row produces three destination rows (dst, dst + dststride and
@ dst + 2 * dststride). The per-row work is done by the
@ neon_scale3x_16_16_line macro, which is defined outside this file; its
@ first / middle / last variants are used for the top row, the inner rows
@ and the bottom row respectively.
@
@ r0 = const uint16_t *src
@ r1 = uint16_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address
@
@ Preserves r4-r11 and q4-q7 (saved on entry, restored on exit).
@ A height below 2 is not supported by the first/last row structure; the
@ routine then returns without reading src or writing dst.

        ldr     ip, [sp, #4]            @ ip = height
        cmp     ip, #2
        bxlo    lr                      @ height < 2: return, nothing saved yet

        ldr     ip, [sp]                @ ip = dststride
        push    {r4-r11}
        ldr     r9, [sp, #(9*4)]        @ r9 = height (8 pushed words + 1)
        sub     r4, r0, r3              @ r4 = src - srcstride
        mov     r11, sp                 @ oldsp = sp
        add     r5, r0, r3              @ r5 = src + srcstride
        bic     sp, sp, #31             @ align sp to 32 bytes
        add     r6, r1, ip              @ r6 = dst + dststride
        sub     sp, sp, #64             @ reserve 64 bytes for q4-q7
        sub     r3, r3, r2, lsl #1      @ r3 = srcstride - 2 * width
        vst1.64 {d8-d11}, [sp:256]      @ save q4,q5
        add     r7, r1, ip, lsl #1      @ r7 = dst + 2 * dststride
        add     r8, sp, #32             @ r8 = sp + 32
        sub     ip, ip, r2, lsl #1      @ ip = dststride - 2 * width
        vst1.64 {d12-d15}, [r8:256]     @ save q6,q7
        add     ip, ip, ip, lsl #1      @ ip = 3 * dststride - 6 * width
        mov     r8, r2                  @ r8 = width
        sub     r9, r9, #2              @ r9 = height - 2 (middle row count)

@ Register roles from here on:
@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - 2 * width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = dst + 2 * dststride
@ r8 = counter
@ r9 = remaining middle rows
@ r10 = tmpreg
@ r11 = oldsp
@ ip = dstdiff (3 * dststride - 6 * width)

        @ first line
        neon_scale3x_16_16_line first, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

        @ step all source and destination pointers to the next row
        add     r0, r0, r3
        add     r4, r4, r3
        add     r5, r5, r3
        add     r1, r1, ip
        add     r6, r6, ip
        add     r7, r7, ip

        @ The loop below tests its counter only after the body, so it must
        @ not be entered when there are no middle rows (height == 2).
        cmp     r9, #0
        beq     .Lneon_scale3x_16_16_last

        @ middle lines
.Lneon_scale3x_16_16_middle:
        mov     r8, r2                  @ counter = width

        neon_scale3x_16_16_line middle, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

        subs    r9, r9, #1              @ sets the flags tested by bne below
        add     r0, r0, r3              @ plain add leaves the flags alone
        add     r4, r4, r3
        add     r5, r5, r3
        add     r1, r1, ip
        add     r6, r6, ip
        add     r7, r7, ip
        bne     .Lneon_scale3x_16_16_middle

        @ last line
.Lneon_scale3x_16_16_last:
        mov     r8, r2                  @ counter = width

        neon_scale3x_16_16_line last, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

        @ Reload both halves of the save area before sp is moved back up,
        @ so that no data is ever read from below the stack pointer.
        add     ip, sp, #32             @ ip = sp + 32
        vld1.64 {d8-d11}, [sp:256]      @ restore q4,q5
        vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7
        mov     sp, r11                 @ sp = oldsp
        pop     {r4-r11}
        bx      lr

@ end procedure neon_scale3x_16_16
198\r
199\r
neon_scale3x_8_16:

@ 3x upscaler entry point, 8-bit source pixels to 16-bit destination pixels.
@ Source rows are first expanded into temporary 16-bit lines on the stack by
@ the neon_normal1x_8_16_line macro (which takes the palette pointer), and
@ those temporary lines are then fed to the neon_scale3x_16_16_line macro.
@ Both macros are defined outside this file. Three temporary lines are kept
@ and rotated so that previous / current / next rows are always available.
@
@ r0 = const uint8_t *src
@ r1 = dst (2 bytes per destination pixel, see the dstdiff computation)
@ r2 = const uint32_t *palette
@ r3 = unsigned int width (pixels)
@ [sp] = unsigned int srcstride (bytes)
@ [sp+4] = unsigned int dststride (bytes)
@ [sp+8] = unsigned int height
@ lr = return address
@
@ Preserves r4-r11 and q4-q7 (saved on entry, restored on exit).
@ Stack use grows with width: three temporary lines of 2 * width bytes each.
@ A height below 2 is not supported by the first/last row structure; the
@ routine then returns without reading src or writing dst.

        ldr     ip, [sp, #8]            @ ip = height
        cmp     ip, #2
        bxlo    lr                      @ height < 2: return, nothing saved yet

        ldr     ip, [sp]                @ ip = srcstride
        push    {r4-r11,lr}
        ldr     r4, [sp, #(4*10)]       @ r4 = dststride (9 pushed words + 1)
        ldr     r5, [sp, #(4*11)]       @ r5 = height
        mov     r6, sp                  @ r6 = sp (restored on exit)
        sub     ip, ip, r3              @ ip = srcstride - width
        bic     sp, sp, #31             @ align sp to 32 bytes
        sub     r7, r4, r3, lsl #1      @ r7 = dststride - 2 * width
        sub     sp, sp, r3, lsl #1      @ sp -= 2 * width
        sub     r5, r5, #2              @ height -= 2 (middle row count)
        mov     r10, sp                 @ tmpline3 = sp
        add     r7, r7, r7, lsl #1      @ r7 = 3 * dststride - 6 * width
        bic     sp, sp, #31             @ align sp to 32 bytes
        sub     sp, sp, r3, lsl #1      @ sp -= 2 * width
        mov     r11, sp                 @ tmpline2 = sp
        bic     sp, sp, #31             @ align sp to 32 bytes
        sub     sp, sp, r3, lsl #1      @ sp -= 2 * width
        mov     lr, sp                  @ tmpline1 = sp
        bic     sp, sp, #31             @ align sp to 32 bytes

        @ Lower sp first, then fill the frame, so that nothing is ever
        @ stored below the stack pointer. sp + 36 is 32-byte aligned because
        @ sp was 32-byte aligned before subtracting 36 + 64.
        sub     sp, sp, #(36 + 64)      @ sp -= (36 + 64)
        add     r8, sp, #36             @ r8 = q4,q5 save slot
        vst1.64 {d8-d11}, [r8:256]      @ save q4,q5
        add     r9, sp, #(36 + 32)      @ r9 = q6,q7 save slot
        vst1.64 {d12-d15}, [r9:256]     @ save q6,q7
        str     r6, [sp]                @ oldsp = r6
        str     r5, [sp, #4]            @ height = r5
        str     ip, [sp, #8]            @ srcdiff = ip
        str     r7, [sp, #12]           @ dstdiff = r7
        str     r4, [sp, #16]           @ dststride = r4
        str     lr, [sp, #20]           @ tmpline1 = lr
        str     r11, [sp, #24]          @ tmpline2 = r11
        str     r10, [sp, #28]          @ tmpline3 = r10
        str     r3, [sp, #32]           @ width = r3

@ Register roles from here on:
@ r0 = src
@ r1 = dst
@ r2 = palette
@ r3 = counter
@ r4 = dst2
@ r5 = dst3
@
@ r11 = bufptr1
@ ip = bufptr2
@ lr = bufptr3
@
@ Frame layout:
@ [sp] = oldsp
@ [sp, #4] = remaining middle rows (height - 2 on entry to the loop)
@ [sp, #8] = srcdiff (srcstride - width)
@ [sp, #12] = dstdiff (3 * dststride - 6 * width)
@ [sp, #16] = dststride
@ [sp, #20] = tmpline1
@ [sp, #24] = tmpline2
@ [sp, #28] = tmpline3
@ [sp, #32] = width
@ [sp, #36] = saved q4,q5 (32 bytes)
@ [sp, #68] = saved q6,q7 (32 bytes)

        @ lr = tmpline1
        @ r3 = counter

        @ first line
        neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

        ldr     r7, [sp, #8]            @ r7 = srcdiff
        ldr     r3, [sp, #32]           @ counter = width
        ldr     lr, [sp, #24]           @ bufptr3 = tmpline2
        add     r0, r0, r7              @ src += srcdiff

        @ second line
        neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

        ldr     r9, [sp, #16]           @ r9 = dststride
        ldr     r3, [sp, #32]           @ counter = width
        ldr     ip, [sp, #20]           @ bufptr2 = tmpline1
        ldr     lr, [sp, #24]           @ bufptr3 = tmpline2
        add     r4, r1, r9              @ dst2 = dst + dststride
        add     r5, r1, r9, lsl #1      @ dst3 = dst + 2 * dststride

        @ first temporary line
        neon_scale3x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0

        ldr     r7, [sp, #8]            @ r7 = srcdiff
        ldr     r8, [sp, #12]           @ r8 = dstdiff
        ldr     r3, [sp, #32]           @ counter = width
        ldr     lr, [sp, #28]           @ bufptr3 = tmpline3
        add     r0, r0, r7              @ src += srcdiff
        add     r1, r1, r8              @ dst += dstdiff

        @ The loop below tests its counter only after the body, so it must
        @ not be entered when there are no middle rows (height == 2). The
        @ registers and frame slots read by the last-line code (r1, r3,
        @ tmpline1, tmpline2) already hold the right values at this point.
        ldr     r6, [sp, #4]            @ r6 = remaining middle rows
        cmp     r6, #0
        beq     .Lneon_scale3x_8_16_last

.Lneon_scale3x_8_16_rows:

        @ line n+1
        neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

        ldr     r9, [sp, #16]           @ r9 = dststride
        ldr     r11, [sp, #20]          @ bufptr1 = tmpline1
        ldr     ip, [sp, #24]           @ bufptr2 = tmpline2
        ldr     lr, [sp, #28]           @ bufptr3 = tmpline3
        add     r4, r1, r9              @ dst2 = dst + dststride
        ldr     r3, [sp, #32]           @ counter = width
        add     r5, r1, r9, lsl #1      @ dst3 = dst + 2 * dststride
        @ rotate the three temporary lines for the next iteration
        str     r11, [sp, #28]          @ tmpline3 = bufptr1
        str     ip, [sp, #20]           @ tmpline1 = bufptr2
        str     lr, [sp, #24]           @ tmpline2 = bufptr3

        @ temporary line n
        neon_scale3x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0

        ldr     r6, [sp, #4]            @ r6 = remaining middle rows
        ldr     r7, [sp, #8]            @ r7 = srcdiff
        ldr     r8, [sp, #12]           @ r8 = dstdiff
        ldr     r3, [sp, #32]           @ counter = width
        subs    r6, r6, #1              @ sets the flags tested by bne below
        ldr     lr, [sp, #28]           @ bufptr3 = tmpline3
        add     r0, r0, r7              @ src += srcdiff
        add     r1, r1, r8              @ dst += dstdiff
        str     r6, [sp, #4]            @ store the decremented row count
        bne     .Lneon_scale3x_8_16_rows

.Lneon_scale3x_8_16_last:

        ldr     r9, [sp, #16]           @ r9 = dststride
        ldr     r11, [sp, #20]          @ bufptr1 = tmpline1
        ldr     ip, [sp, #24]           @ bufptr2 = tmpline2
        add     r4, r1, r9              @ dst2 = dst + dststride
        add     r5, r1, r9, lsl #1      @ dst3 = dst + 2 * dststride

        @ last temporary line
        neon_scale3x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0

        @ Reload q4-q7 while the frame is still above sp; only then switch
        @ back to the caller stack pointer.
        add     r6, sp, #36             @ r6 = sp + 36
        vld1.64 {d8-d11}, [r6:256]      @ restore q4,q5
        add     ip, r6, #32             @ ip = r6 + 32
        vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7
        ldr     sp, [sp]                @ sp = oldsp
        pop     {r4-r11,lr}
        bx      lr

@ end procedure neon_scale3x_8_16
349\r