e38fee1b |
1 | @@\r |
2 | @@ Copyright (C) 2012 Roman Pauer\r |
3 | @@\r |
4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r |
5 | @@ this software and associated documentation files (the "Software"), to deal in\r |
6 | @@ the Software without restriction, including without limitation the rights to\r |
7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r |
8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r |
9 | @@ so, subject to the following conditions:\r |
10 | @@\r |
11 | @@ The above copyright notice and this permission notice shall be included in all\r |
12 | @@ copies or substantial portions of the Software.\r |
13 | @@\r |
14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r |
15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r |
16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r |
17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r |
18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r |
19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r |
20 | @@ SOFTWARE.\r |
21 | @@\r |
22 | \r |
23 | .arm\r |
24 | \r |
25 | #include "neon_scale3x.Sinc"\r |
26 | #include "neon_normalxx.Sinc"\r |
27 | \r |
28 | .global neon_scale3x_8_8\r |
29 | .global neon_scale3x_16_16\r |
30 | .global neon_scale3x_8_16\r |
31 | \r |
32 | .align 4\r |
neon_scale3x_8_8:

@ Scale3x for 8-bit source pixels and 8-bit destination pixels.
@
@ Walks the source image one row at a time and hands each row, together with
@ pointers to the rows above and below it, to the neon_scale3x_8_8_line macro
@ (defined in neon_scale3x.Sinc, not visible in this file).  Each pass is
@ given three destination row pointers, so one source row yields three
@ destination rows.
@
@ On entry:
@ r0     = const uint8_t *src
@ r1     = uint8_t *dst
@ r2     = unsigned int width (pixels)
@ r3     = unsigned int srcstride (bytes)
@ [sp]   = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr     = return address
@
@ r4-r11 and d8-d15 are saved on entry and restored before returning.
@
@ NOTE: the middle-line loop tests its counter only after the body has run
@ once, so height - 2 must be at least 1 (the caller must pass height >= 3).

    ldr     ip, [sp]                    @ ip = dststride (read before push moves sp)
    push    {r4-r11}
    ldr     r9, [sp, #(9*4)]            @ r9 = height (8 pushed words + 1 stack arg)
    sub     r4, r0, r3                  @ r4 = src - srcstride
    mov     r11, sp                     @ oldsp = sp
    add     r5, r0, r3                  @ r5 = src + srcstride
    bic     sp, sp, #31                 @ align sp to 32 bytes
    add     r6, r1, ip                  @ r6 = dst + dststride
    sub     sp, sp, #64                 @ sp -= 64 (save area for q4-q7)
    sub     r3, r3, r2                  @ r3 = srcstride - width
    vst1.64 {d8-d11}, [sp:256]          @ save q4,q5
    add     r7, r1, ip, lsl #1          @ r7 = dst + 2 * dststride
    add     r8, sp, #32                 @ r8 = sp + 32
    sub     ip, ip, r2                  @ ip = dststride - width
    vst1.64 {d12-d15}, [r8:256]         @ save q6,q7
    add     ip, ip, ip, lsl #1          @ ip = 3 * dststride - 3 * width
    mov     r8, r2                      @ r8 = width
    sub     r9, r9, #2                  @ r9 = height - 2


@ Register use from here on:
@ r0  = src
@ r1  = dst
@ r2  = width
@ r3  = srcdiff (srcstride - width)
@ r4  = src - srcstride
@ r5  = src + srcstride
@ r6  = dst + dststride
@ r7  = dst + 2 * dststride
@ r8  = counter (reloaded with width before every line)
@ r9  = remaining middle lines (height - 2, counted down)
@ r10 = tmpreg
@ r11 = oldsp
@ ip  = dstdiff (3 * dststride - 3 * width)
@
@ The row pointers are bumped by srcdiff / dstdiff after every line; this
@ presumably relies on the line macro leaving the source pointers advanced by
@ width bytes and the destination pointers by 3 * width bytes -- confirm
@ against neon_scale3x.Sinc.

    @ first line
    neon_scale3x_8_8_line first, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip
    add     r7, r7, ip

    @ middle lines
    101:
    mov     r8, r2                      @ counter = width

    neon_scale3x_8_8_line middle, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    subs    r9, r9, #1                  @ one middle line done; flags feed the bne below
    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip
    add     r7, r7, ip
    bne     101b

    @ last line
    mov     r8, r2                      @ counter = width

    neon_scale3x_8_8_line last, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    @ Reload both NEON save slots while sp still covers them.  Restoring sp
    @ first would leave the q6,q7 slot below the stack pointer, where an
    @ asynchronous signal frame could overwrite it before the load.
    add     ip, sp, #32                 @ ip = sp + 32
    vld1.64 {d8-d11}, [sp:256]          @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256]         @ restore q6,q7
    mov     sp, r11                     @ sp = oldsp
    pop     {r4-r11}
    bx      lr

@ end procedure neon_scale3x_8_8
115 | \r |
116 | \r |
neon_scale3x_16_16:

@ Scale3x for 16-bit source pixels and 16-bit destination pixels.
@
@ Walks the source image one row at a time and hands each row, together with
@ pointers to the rows above and below it, to the neon_scale3x_16_16_line
@ macro (defined in neon_scale3x.Sinc, not visible in this file).  Each pass
@ is given three destination row pointers, so one source row yields three
@ destination rows.
@
@ On entry:
@ r0     = const uint16_t *src
@ r1     = uint16_t *dst
@ r2     = unsigned int width (pixels)
@ r3     = unsigned int srcstride (bytes)
@ [sp]   = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr     = return address
@
@ r4-r11 and d8-d15 are saved on entry and restored before returning.
@
@ NOTE: the middle-line loop tests its counter only after the body has run
@ once, so height - 2 must be at least 1 (the caller must pass height >= 3).

    ldr     ip, [sp]                    @ ip = dststride (read before push moves sp)
    push    {r4-r11}
    ldr     r9, [sp, #(9*4)]            @ r9 = height (8 pushed words + 1 stack arg)
    sub     r4, r0, r3                  @ r4 = src - srcstride
    mov     r11, sp                     @ oldsp = sp
    add     r5, r0, r3                  @ r5 = src + srcstride
    bic     sp, sp, #31                 @ align sp to 32 bytes
    add     r6, r1, ip                  @ r6 = dst + dststride
    sub     sp, sp, #64                 @ sp -= 64 (save area for q4-q7)
    sub     r3, r3, r2, lsl #1          @ r3 = srcstride - 2 * width
    vst1.64 {d8-d11}, [sp:256]          @ save q4,q5
    add     r7, r1, ip, lsl #1          @ r7 = dst + 2 * dststride
    add     r8, sp, #32                 @ r8 = sp + 32
    sub     ip, ip, r2, lsl #1          @ ip = dststride - 2 * width
    vst1.64 {d12-d15}, [r8:256]         @ save q6,q7
    add     ip, ip, ip, lsl #1          @ ip = 3 * dststride - 6 * width
    mov     r8, r2                      @ r8 = width
    sub     r9, r9, #2                  @ r9 = height - 2

@ Register use from here on:
@ r0  = src
@ r1  = dst
@ r2  = width
@ r3  = srcdiff (srcstride - 2 * width)
@ r4  = src - srcstride
@ r5  = src + srcstride
@ r6  = dst + dststride
@ r7  = dst + 2 * dststride
@ r8  = counter (reloaded with width before every line)
@ r9  = remaining middle lines (height - 2, counted down)
@ r10 = tmpreg
@ r11 = oldsp
@ ip  = dstdiff (3 * dststride - 6 * width)
@
@ The row pointers are bumped by srcdiff / dstdiff after every line; this
@ presumably relies on the line macro leaving the source pointers advanced by
@ 2 * width bytes and the destination pointers by 6 * width bytes -- confirm
@ against neon_scale3x.Sinc.

    @ first line
    neon_scale3x_16_16_line first, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip
    add     r7, r7, ip

    @ middle lines
    101:
    mov     r8, r2                      @ counter = width

    neon_scale3x_16_16_line middle, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    subs    r9, r9, #1                  @ one middle line done; flags feed the bne below
    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip
    add     r7, r7, ip
    bne     101b

    @ last line
    mov     r8, r2                      @ counter = width

    neon_scale3x_16_16_line last, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    @ Reload both NEON save slots while sp still covers them.  Restoring sp
    @ first would leave the q6,q7 slot below the stack pointer, where an
    @ asynchronous signal frame could overwrite it before the load.
    add     ip, sp, #32                 @ ip = sp + 32
    vld1.64 {d8-d11}, [sp:256]          @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256]         @ restore q6,q7
    mov     sp, r11                     @ sp = oldsp
    pop     {r4-r11}
    bx      lr

@ end procedure neon_scale3x_16_16
198 | \r |
199 | \r |
neon_scale3x_8_16:

@ Scale3x for 8-bit source pixels with a palette, writing 16-bit pixels.
@
@ Each source row is first run through neon_normal1x_8_16_line (from
@ neon_normalxx.Sinc) into one of three temporary line buffers carved out of
@ the stack; neon_scale3x_16_16_line (from neon_scale3x.Sinc) then works on
@ those buffers and writes three destination rows per source row.  Neither
@ macro is visible in this file.
@
@ On entry:
@ r0     = const uint8_t *src
@ r1     = dst (the original comment says uint8_t *; rows advance by
@          6 * width bytes, so presumably 16-bit pixels -- confirm with caller)
@ r2     = const uint32_t *palette
@ r3     = unsigned int width (pixels)
@ [sp]   = unsigned int srcstride (bytes)
@ [sp+4] = unsigned int dststride (bytes)
@ [sp+8] = unsigned int height
@ lr     = return address
@
@ r4-r11, lr and d8-d15 are saved on entry and restored before returning.
@
@ NOTE: the main loop tests its counter only after the body has run once, so
@ height - 2 must be at least 1 (the caller must pass height >= 3).

@ three temporary lines

    ldr     ip, [sp]                    @ ip = srcstride (read before push moves sp)
    push    {r4-r11,lr}
    ldr     r4, [sp, #(4*10)]           @ r4 = dststride (9 pushed words + 1 stack arg)
    ldr     r5, [sp, #(4*11)]           @ r5 = height
    mov     r6, sp                      @ r6 = sp
    sub     ip, ip, r3                  @ ip = srcstride - width
    bic     sp, sp, #31                 @ align sp to 32 bytes
    sub     r7, r4, r3, lsl #1          @ r7 = dststride - 2 * width
    sub     sp, sp, r3, lsl #1          @ sp -= 2 * width
    sub     r5, r5, #2                  @ height -= 2
    mov     r10, sp                     @ tmpline3 = sp
    add     r7, r7, r7, lsl #1          @ r7 = 3 * dststride - 6 * width
    bic     sp, sp, #31                 @ align sp to 32 bytes
    sub     sp, sp, r3, lsl #1          @ sp -= 2 * width
    mov     r11, sp                     @ tmpline2 = sp
    bic     sp, sp, #31                 @ align sp to 32 bytes
    sub     sp, sp, r3, lsl #1          @ sp -= 2 * width
    mov     lr, sp                      @ tmpline1 = sp
    bic     sp, sp, #31                 @ align sp to 32 bytes

    @ Reserve the local frame before storing into it, so the NEON save area is
    @ never below the stack pointer (an asynchronous signal frame could
    @ otherwise overwrite the saved registers).  The addresses are the same
    @ 32-byte aligned slots as before: aligned sp - 64 and aligned sp - 32.
    sub     sp, sp, #(36 + 64)          @ sp -= (36 + 64)
    add     r8, sp, #36                 @ r8 = sp + 36 (q4,q5 slot)
    vst1.64 {d8-d11}, [r8:256]          @ save q4,q5
    add     r9, sp, #(36 + 32)          @ r9 = sp + 68 (q6,q7 slot)
    vst1.64 {d12-d15}, [r9:256]         @ save q6,q7
    str     r6, [sp]                    @ oldsp = r6
    str     r5, [sp, #4]                @ height = r5
    str     ip, [sp, #8]                @ srcdiff = ip
    str     r7, [sp, #12]               @ dstdiff = r7
    str     r4, [sp, #16]               @ dststride = r4
    str     lr, [sp, #20]               @ tmpline1 = lr
    str     r11, [sp, #24]              @ tmpline2 = r11
    str     r10, [sp, #28]              @ tmpline3 = r10
    str     r3, [sp, #32]               @ width = r3

@ Register use from here on:
@ r0 = src
@ r1 = dst
@ r2 = palette
@ r3 = counter
@ r4 = dst2
@ r5 = dst3
@
@ r11 = bufptr1
@ ip = bufptr2
@ lr = bufptr3
@
@ Stack frame:
@ [sp] = oldsp
@ [sp, #4] = height (remaining loop iterations, starts at height - 2)
@ [sp, #8] = srcdiff (srcstride - width)
@ [sp, #12] = dstdiff (3 * dststride - 6 * width)
@ [sp, #16] = dststride
@ [sp, #20] = tmpline1
@ [sp, #24] = tmpline2
@ [sp, #28] = tmpline3
@ [sp, #32] = width
@ [sp, #36] = saved q4,q5
@ [sp, #68] = saved q6,q7
@
@ At this point:
@ lr = tmpline1
@ r3 = counter

    @ first line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr     r7, [sp, #8]                @ r7 = srcdiff
    ldr     r3, [sp, #32]               @ counter = width
    ldr     lr, [sp, #24]               @ bufptr3 = tmpline2
    add     r0, r0, r7                  @ src += srcdiff

    @ second line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr     r9, [sp, #16]               @ r9 = dststride
    ldr     r3, [sp, #32]               @ counter = width
    ldr     ip, [sp, #20]               @ bufptr2 = tmpline1
    ldr     lr, [sp, #24]               @ bufptr3 = tmpline2
    add     r4, r1, r9                  @ dst2 = dst + dststride
    add     r5, r1, r9, lsl #1          @ dst3 = dst + 2 * dststride

    @ first temporary line
    neon_scale3x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0

    ldr     r7, [sp, #8]                @ r7 = srcdiff
    ldr     r8, [sp, #12]               @ r8 = dstdiff
    ldr     r3, [sp, #32]               @ counter = width
    ldr     lr, [sp, #28]               @ bufptr3 = tmpline3
    add     r0, r0, r7                  @ src += srcdiff
    add     r1, r1, r8                  @ dst += dstdiff

    100:

    @ line n+1
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr     r9, [sp, #16]               @ r9 = dststride
    ldr     r11, [sp, #20]              @ bufptr1 = tmpline1
    ldr     ip, [sp, #24]               @ bufptr2 = tmpline2
    ldr     lr, [sp, #28]               @ bufptr3 = tmpline3
    add     r4, r1, r9                  @ dst2 = dst + dststride
    ldr     r3, [sp, #32]               @ counter = width
    add     r5, r1, r9, lsl #1          @ dst3 = dst + 2 * dststride
    @ rotate the three buffers for the next iteration: the oldest line
    @ (bufptr1) becomes the buffer that receives the next converted row
    str     r11, [sp, #28]              @ tmpline3 = bufptr1
    str     ip, [sp, #20]               @ tmpline1 = bufptr2
    str     lr, [sp, #24]               @ tmpline2 = bufptr3

    @ temporary line n
    neon_scale3x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0

    ldr     r6, [sp, #4]                @ r6 = height
    ldr     r7, [sp, #8]                @ r7 = srcdiff
    ldr     r8, [sp, #12]               @ r8 = dstdiff
    ldr     r3, [sp, #32]               @ counter = width
    subs    r6, r6, #1                  @ height-- (flags feed the bne below)
    ldr     lr, [sp, #28]               @ bufptr3 = tmpline3
    add     r0, r0, r7                  @ src += srcdiff
    add     r1, r1, r8                  @ dst += dstdiff
    str     r6, [sp, #4]                @ height = r6
    bne     100b


    ldr     r9, [sp, #16]               @ r9 = dststride
    ldr     r11, [sp, #20]              @ bufptr1 = tmpline1
    ldr     ip, [sp, #24]               @ bufptr2 = tmpline2
    add     r4, r1, r9                  @ dst2 = dst + dststride
    add     r5, r1, r9, lsl #1          @ dst3 = dst + 2 * dststride

    @ last temporary line
    neon_scale3x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0


    @ Reload the NEON registers while sp still covers the save area; only
    @ then switch back to the caller's stack pointer.
    add     r6, sp, #36                 @ r6 = sp + 36
    vld1.64 {d8-d11}, [r6:256]          @ restore q4,q5
    add     ip, r6, #32                 @ ip = r6 + 32
    vld1.64 {d12-d15}, [ip:256]         @ restore q6,q7
    ldr     sp, [sp]                    @ sp = oldsp
    pop     {r4-r11,lr}
    bx      lr

@ end procedure neon_scale3x_8_16
349 | \r |