2 @@ Copyright (C) 2012 Roman Pauer
\r
4 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of
\r
5 @@ this software and associated documentation files (the "Software"), to deal in
\r
6 @@ the Software without restriction, including without limitation the rights to
\r
7 @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
\r
8 @@ of the Software, and to permit persons to whom the Software is furnished to do
\r
9 @@ so, subject to the following conditions:
\r
11 @@ The above copyright notice and this permission notice shall be included in all
\r
12 @@ copies or substantial portions of the Software.
\r
14 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
\r
15 @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
\r
16 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
\r
17 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
\r
18 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
\r
19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
\r
25 #include "neon_scale3x.Sinc"
\r
26 #include "neon_normalxx.Sinc"
\r
@ Make the three scaler entry points visible to the linker.
\r
28 .global neon_scale3x_8_8
\r
29 .global neon_scale3x_16_16
\r
30 .global neon_scale3x_8_16
\r
@ neon_scale3x_8_8: Scale3x of an 8-bit source image into an 8-bit destination.
\r
@ Three source row pointers (r4 = row above, r0 = current row, r5 = row below)
\r
@ and three destination row pointers (r1, r6, r7) are passed to the
\r
@ neon_scale3x_8_8_line macro in its first / middle / last variants.
\r
@ d8-d15 (q4-q7) are spilled to a 32-byte aligned, 64-byte area at the new sp
\r
@ and reloaded before sp is restored, so no access ever lands below sp.
\r
@ NOTE(review): the entry label, the core-register push/pop, the row loop and
\r
@ the return are not part of this listing; the #(9*4) height offset suggests
\r
@ eight core registers are pushed right after dststride is loaded - confirm.
\r
35 @ r0 = const uint8_t *src
\r
37 @ r2 = unsigned int width (pixels)
\r
38 @ r3 = unsigned int srcstride (bytes)
\r
39 @ [sp] = unsigned int dststride (bytes)
\r
40 @ [sp+4] = unsigned int height
\r
41 @ lr = return address
\r
43 ldr ip, [sp] @ ip = dststride
\r
45 ldr r9, [sp, #(9*4)] @ r9 = height
\r
46 sub r4, r0, r3 @ r4 = src - srcstride
\r
47 mov r11, sp @ oldsp = sp (kept in r11 until the epilogue)
\r
48 add r5, r0, r3 @ r5 = src + srcstride
\r
49 bic sp, sp, #31 @ align sp to 32 bytes
\r
50 add r6, r1, ip @ r6 = dst + dststride
\r
51 sub sp, sp, #64 @ sp -= 64 (save area for q4-q7)
\r
52 sub r3, r3, r2 @ r3 = srcstride - width
\r
53 vst1.64 {d8-d11}, [sp:256] @ save q4,q5
\r
54 add r7, r1, ip, lsl #1 @ r7 = dst + 2 * dststride
\r
55 add r8, sp, #32 @ r8 = sp + 32
\r
56 sub ip, ip, r2 @ ip = dststride - width
\r
57 vst1.64 {d12-d15}, [r8:256] @ save q6,q7
\r
58 add ip, ip, ip, lsl #1 @ ip = 3 * dststride - 3 * width
\r
59 mov r8, r2 @ r8 = width
\r
60 sub r9, r9, #2 @ r9 = height - 2
\r
66 @ r3 = srcdiff (srcstride - width)
\r
67 @ r4 = src - srcstride
\r
68 @ r5 = src + srcstride
\r
69 @ r6 = dst + dststride
\r
70 @ r7 = dst + 2 * dststride
\r
75 @ ip = dstdiff (3 * dststride - 3 * width)
\r
78 neon_scale3x_8_8_line first, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
\r
91 neon_scale3x_8_8_line middle, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
\r
105 neon_scale3x_8_8_line last, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
\r
107 add ip, sp, #32 @ ip = sp + 32
\r
108 vld1.64 {d8-d11}, [sp:256] @ restore q4,q5
\r
109 vld1.64 {d12-d15}, [ip:256] @ restore q6,q7 while the save area is still above sp
\r
110 mov sp, r11 @ sp = oldsp (only after both reloads are done)
\r
114 @ end procedure neon_scale3x_8_8
\r
@ neon_scale3x_16_16: Scale3x of a 16-bit source image into a 16-bit destination.
\r
@ Three source row pointers (r4 = row above, r0 = current row, r5 = row below)
\r
@ and three destination row pointers (r1, r6, r7) are passed to the
\r
@ neon_scale3x_16_16_line macro in its first / middle / last variants.
\r
@ Pixels are two bytes wide, hence the "lsl #1" on width when the row
\r
@ differences are computed.
\r
@ d8-d15 (q4-q7) are spilled to a 32-byte aligned, 64-byte area at the new sp
\r
@ and reloaded before sp is restored, so no access ever lands below sp.
\r
@ NOTE(review): the core-register push/pop, the row loop and the return are
\r
@ not part of this listing; the #(9*4) height offset suggests eight core
\r
@ registers are pushed right after dststride is loaded - confirm.
\r
117 neon_scale3x_16_16:
\r
119 @ r0 = const uint16_t *src
\r
120 @ r1 = uint16_t *dst
\r
121 @ r2 = unsigned int width (pixels)
\r
122 @ r3 = unsigned int srcstride (bytes)
\r
123 @ [sp] = unsigned int dststride (bytes)
\r
124 @ [sp+4] = unsigned int height
\r
125 @ lr = return address
\r
127 ldr ip, [sp] @ ip = dststride
\r
129 ldr r9, [sp, #(9*4)] @ r9 = height
\r
130 sub r4, r0, r3 @ r4 = src - srcstride
\r
131 mov r11, sp @ oldsp = sp (kept in r11 until the epilogue)
\r
132 add r5, r0, r3 @ r5 = src + srcstride
\r
133 bic sp, sp, #31 @ align sp to 32 bytes
\r
134 add r6, r1, ip @ r6 = dst + dststride
\r
135 sub sp, sp, #64 @ sp -= 64 (save area for q4-q7)
\r
136 sub r3, r3, r2, lsl #1 @ r3 = srcstride - 2 * width
\r
137 vst1.64 {d8-d11}, [sp:256] @ save q4,q5
\r
138 add r7, r1, ip, lsl #1 @ r7 = dst + 2 * dststride
\r
139 add r8, sp, #32 @ r8 = sp + 32
\r
140 sub ip, ip, r2, lsl #1 @ ip = dststride - 2 * width
\r
141 vst1.64 {d12-d15}, [r8:256] @ save q6,q7
\r
142 add ip, ip, ip, lsl #1 @ ip = 3 * dststride - 6 * width
\r
143 mov r8, r2 @ r8 = width
\r
144 sub r9, r9, #2 @ r9 = height - 2
\r
149 @ r3 = srcdiff (srcstride - 2 * width)
\r
150 @ r4 = src - srcstride
\r
151 @ r5 = src + srcstride
\r
152 @ r6 = dst + dststride
\r
153 @ r7 = dst + 2 * dststride
\r
158 @ ip = dstdiff (3 * dststride - 6 * width)
\r
161 neon_scale3x_16_16_line first, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
\r
174 neon_scale3x_16_16_line middle, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
\r
188 neon_scale3x_16_16_line last, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
\r
190 add ip, sp, #32 @ ip = sp + 32
\r
191 vld1.64 {d8-d11}, [sp:256] @ restore q4,q5
\r
192 vld1.64 {d12-d15}, [ip:256] @ restore q6,q7 while the save area is still above sp
\r
193 mov sp, r11 @ sp = oldsp (only after both reloads are done)
\r
197 @ end procedure neon_scale3x_16_16
\r
@ neon_scale3x_8_16: Scale3x of an 8-bit paletted source into a 16-bit destination.
\r
@ Source rows are first expanded by neon_normal1x_8_16_line (which is given the
\r
@ palette pointer in r2) into temporary lines of 2 * width bytes on the stack;
\r
@ neon_scale3x_16_16_line then runs on three such lines (bufptr1..3) and
\r
@ writes three destination rows (dst, dst2, dst3).
\r
@ Stack frame built below (all accesses stay at or above sp):
\r
@   [sp, #0..#35]   nine 32-bit locals (oldsp, height, srcdiff, ...)
\r
@   [sp, #36..#99]  saved d8-d15 (q4-q7), 32-byte aligned
\r
@   above that      tmpline1, tmpline2, tmpline3 (each start 32-byte aligned)
\r
@ NOTE(review): the entry label, the core-register push/pop, the loop branches
\r
@ and the return are not part of this listing; the #(4*10) / #(4*11) offsets
\r
@ suggest nine core registers are pushed after srcstride is loaded - confirm.
\r
202 @ r0 = const uint8_t *src
\r
203 @ r1 = uint16_t *dst (16-bit output pixels: dstdiff below uses 6 * width bytes per tripled row)
\r
204 @ r2 = const uint32_t *palette
\r
205 @ r3 = unsigned int width (pixels)
\r
206 @ [sp] = unsigned int srcstride (bytes)
\r
207 @ [sp+4] = unsigned int dststride (bytes)
\r
208 @ [sp+8] = unsigned int height
\r
209 @ lr = return address
\r
211 @ three temporary lines
\r
213 ldr ip, [sp] @ ip = srcstride
\r
215 ldr r4, [sp, #(4*10)] @ r4 = dststride
\r
216 ldr r5, [sp, #(4*11)] @ r5 = height
\r
217 mov r6, sp @ r6 = sp
\r
218 sub ip, ip, r3 @ ip = srcstride - width
\r
219 bic sp, sp, #31 @ align sp to 32 bytes
\r
220 sub r7, r4, r3, lsl #1 @ r7 = dststride - 2 * width
\r
221 sub sp, sp, r3, lsl #1 @ sp -= 2 * width
\r
222 sub r5, r5, #2 @ height -= 2
\r
223 mov r10, sp @ tmpline3 = sp
\r
224 add r7, r7, r7, lsl #1 @ r7 = 3 * dststride - 6 * width
\r
225 bic sp, sp, #31 @ align sp to 32 bytes
\r
226 sub sp, sp, r3, lsl #1 @ sp -= 2 * width
\r
227 mov r11, sp @ tmpline2 = sp
\r
228 bic sp, sp, #31 @ align sp to 32 bytes
\r
229 sub sp, sp, r3, lsl #1 @ sp -= 2 * width
\r
230 mov lr, sp @ tmpline1 = sp
\r
231 bic sp, sp, #31 @ align sp to 32 bytes
\r
232 sub sp, sp, #(36 + 64) @ sp -= (36 + 64), reserve the frame before writing to it
\r
233 add r8, sp, #36 @ r8 = sp + 36 (32-byte aligned save area)
\r
234 vst1.64 {d8-d11}, [r8:256] @ save q4,q5
\r
235 add r9, sp, #(36 + 32) @ r9 = sp + 68
\r
236 vst1.64 {d12-d15}, [r9:256] @ save q6,q7
\r
237 str r6, [sp] @ oldsp = r6
\r
238 str r5, [sp, #4] @ height = r5
\r
239 str ip, [sp, #8] @ srcdiff = ip
\r
240 str r7, [sp, #12] @ dstdiff = r7
\r
241 str r4, [sp, #16] @ dststride = r4
\r
242 str lr, [sp, #20] @ tmpline1 = lr
\r
243 str r11, [sp, #24] @ tmpline2 = r11
\r
244 str r10, [sp, #28] @ tmpline3 = r10
\r
245 str r3, [sp, #32] @ width = r3
\r
259 @ [sp, #4] = height
\r
260 @ [sp, #8] = srcdiff (srcstride - width)
\r
261 @ [sp, #12] = dstdiff (3 * dststride - 6 * width)
\r
262 @ [sp, #16] = dststride
\r
263 @ [sp, #20] = tmpline1
\r
264 @ [sp, #24] = tmpline2
\r
265 @ [sp, #28] = tmpline3
\r
266 @ [sp, #32] = width
\r
272 neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
\r
274 ldr r7, [sp, #8] @ r7 = srcdiff
\r
275 ldr r3, [sp, #32] @ counter = width
\r
276 ldr lr, [sp, #24] @ bufptr3 = tmpline2
\r
277 add r0, r0, r7 @ src += srcdiff
\r
280 neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
\r
282 ldr r9, [sp, #16] @ r9 = dststride
\r
283 ldr r3, [sp, #32] @ counter = width
\r
284 ldr ip, [sp, #20] @ bufptr2 = tmpline1
\r
285 ldr lr, [sp, #24] @ bufptr3 = tmpline2
\r
286 add r4, r1, r9 @ dst2 = dst + dststride
\r
287 add r5, r1, r9, lsl #1 @ dst3 = dst + 2 * dststride
\r
289 @ first temporary line
\r
290 neon_scale3x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0
\r
292 ldr r7, [sp, #8] @ r7 = srcdiff
\r
293 ldr r8, [sp, #12] @ r8 = dstdiff
\r
294 ldr r3, [sp, #32] @ counter = width
\r
295 ldr lr, [sp, #28] @ bufptr3 = tmpline3
\r
296 add r0, r0, r7 @ src += srcdiff
\r
297 add r1, r1, r8 @ dst += dstdiff
\r
302 neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
\r
304 ldr r9, [sp, #16] @ r9 = dststride
\r
305 ldr r11, [sp, #20] @ bufptr1 = tmpline1
\r
306 ldr ip, [sp, #24] @ bufptr2 = tmpline2
\r
307 ldr lr, [sp, #28] @ bufptr3 = tmpline3
\r
308 add r4, r1, r9 @ dst2 = dst + dststride
\r
309 ldr r3, [sp, #32] @ counter = width
\r
310 add r5, r1, r9, lsl #1 @ dst3 = dst + 2 * dststride
\r
311 str r11, [sp, #28] @ tmpline3 = bufptr1 (rotate the three line buffers)
\r
312 str ip, [sp, #20] @ tmpline1 = bufptr2
\r
313 str lr, [sp, #24] @ tmpline2 = bufptr3
\r
316 neon_scale3x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0
\r
318 ldr r6, [sp, #4] @ r6 = height
\r
319 ldr r7, [sp, #8] @ r7 = srcdiff
\r
320 ldr r8, [sp, #12] @ r8 = dstdiff
\r
321 ldr r3, [sp, #32] @ counter = width
\r
322 subs r6, r6, #1 @ height-- (sets flags; the ldr/add/str below leave them intact)
\r
323 ldr lr, [sp, #28] @ bufptr3 = tmpline3
\r
324 add r0, r0, r7 @ src += srcdiff
\r
325 add r1, r1, r8 @ dst += dstdiff
\r
326 str r6, [sp, #4] @ height = r6
\r
330 ldr r9, [sp, #16] @ r9 = dststride
\r
331 ldr r11, [sp, #20] @ bufptr1 = tmpline1
\r
332 ldr ip, [sp, #24] @ bufptr2 = tmpline2
\r
333 add r4, r1, r9 @ dst2 = dst + dststride
\r
334 add r5, r1, r9, lsl #1 @ dst3 = dst + 2 * dststride
\r
336 @ last temporary line
\r
337 neon_scale3x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0
\r
340 add r6, sp, #36 @ r6 = sp + 36
\r
341 vld1.64 {d8-d11}, [r6:256] @ restore q4,q5
\r
342 add ip, r6, #32 @ ip = r6 + 32
\r
343 vld1.64 {d12-d15}, [ip:256] @ restore q6,q7
\r
344 ldr sp, [sp] @ sp = oldsp (last, so the reloads above never read below sp)
\r
348 @ end procedure neon_scale3x_8_16
\r