--- /dev/null
+@@\r
+@@ Copyright (C) 2012 Roman Pauer\r
+@@\r
+@@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r
+@@ this software and associated documentation files (the "Software"), to deal in\r
+@@ the Software without restriction, including without limitation the rights to\r
+@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r
+@@ of the Software, and to permit persons to whom the Software is furnished to do\r
+@@ so, subject to the following conditions:\r
+@@\r
+@@ The above copyright notice and this permission notice shall be included in all\r
+@@ copies or substantial portions of the Software.\r
+@@\r
+@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
+@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
+@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
+@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
+@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
+@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+@@ SOFTWARE.\r
+@@\r
+\r
+@ Assemble everything below as ARM (A32) instructions rather than Thumb.\r
+.arm\r
+\r
+@ Macro files pulled in textually. The *_line macros expanded by the routines\r
+@ below are not defined in this file; presumably they come from these two\r
+@ includes (confirm against the .Sinc sources).\r
+.include "neon_eagle2x.Sinc"\r
+.include "neon_normalxx.Sinc"\r
+\r
+@ Entry points exported to the linker.\r
+.global neon_eagle2x_8_8\r
+.global neon_eagle2x_16_16\r
+.global neon_eagle2x_8_16\r
+\r
+@ On ARM targets the .align operand is a power of two, so this places the\r
+@ first routine on a 16-byte boundary.\r
+.align 4\r
+neon_eagle2x_8_8:\r
+\r
+@ 2x scaler entry point, 8-bit source pixels to 8-bit destination pixels.\r
+@ The per-line work is done by the neon_eagle2x_8_8_line macro (not defined\r
+@ in this file). It is expanded once in "first" mode, height - 2 times in\r
+@ "middle" mode and once in "last" mode.\r
+@\r
+@ height must be at least 2; smaller values are not handled.\r
+@\r
+@ r0 = const uint8_t *src\r
+@ r1 = uint8_t *dst\r
+@ r2 = unsigned int width (pixels)\r
+@ r3 = unsigned int srcstride (bytes)\r
+@ [sp] = unsigned int dststride (bytes)\r
+@ [sp+4] = unsigned int height\r
+@ lr = return address\r
+@\r
+@ r4-r10 and d8-d15 (q4-q7) are saved on entry and restored before return.\r
+\r
+ ldr ip, [sp] @ ip = dststride\r
+ push {r4-r10} @ 7 words, so stack args move up by 28 bytes\r
+ ldr r9, [sp, #(8*4)] @ r9 = height (was [sp+4] before the push)\r
+ sub r4, r0, r3 @ r4 = src - srcstride\r
+ mov r10, sp @ oldsp = sp\r
+ add r5, r0, r3 @ r5 = src + srcstride\r
+ bic sp, sp, #31 @ align sp to 32 bytes\r
+ add r6, r1, ip @ r6 = dst + dststride\r
+ sub sp, sp, #64 @ sp -= 64 (save area for q4-q7)\r
+ sub r3, r3, r2 @ r3 = srcstride - width\r
+ vst1.64 {d8-d11}, [sp:256] @ save q4,q5\r
+ add r7, sp, #32 @ r7 = sp + 32\r
+ sub ip, ip, r2 @ ip = dststride - width\r
+ vst1.64 {d12-d15}, [r7:256] @ save q6,q7\r
+ lsl ip, #1 @ ip = 2 * dststride - 2 * width\r
+ mov r7, r2 @ r7 = width\r
+ sub r9, r9, #2 @ r9 = height - 2 (number of middle lines)\r
+\r
+\r
+@ r0 = src\r
+@ r1 = dst\r
+@ r2 = width\r
+@ r3 = srcdiff (srcstride - width)\r
+@ r4 = src - srcstride\r
+@ r5 = src + srcstride\r
+@ r6 = dst + dststride\r
+@ r7 = counter\r
+@ r8 = tmpreg\r
+@ r9 = remaining middle lines (height - 2)\r
+@ r10 = oldsp\r
+@ ip = dstdiff (2 * dststride - 2 * width)\r
+\r
+ @ first line\r
+ neon_eagle2x_8_8_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+ add r0, r0, r3\r
+ add r4, r4, r3\r
+ add r5, r5, r3\r
+ add r1, r1, ip\r
+ add r6, r6, ip\r
+\r
+ @ The loop below tests its counter only after the body, so with\r
+ @ height == 2 (no middle lines) it must be skipped entirely; otherwise\r
+ @ the counter would wrap around from 0.\r
+ cmp r9, #0\r
+ beq .Lneon_eagle2x_8_8_last\r
+\r
+ @ middle lines\r
+ 101:\r
+ mov r7, r2\r
+\r
+ neon_eagle2x_8_8_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+ subS r9, r9, #1\r
+ add r0, r0, r3\r
+ add r4, r4, r3\r
+ add r5, r5, r3\r
+ add r1, r1, ip\r
+ add r6, r6, ip\r
+ bne 101b\r
+\r
+.Lneon_eagle2x_8_8_last:\r
+ @ last line\r
+ mov r7, r2\r
+\r
+ neon_eagle2x_8_8_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+ @ Reload q4-q7 while the save area is still above sp; once sp is\r
+ @ restored that memory is free stack and may be overwritten\r
+ @ asynchronously (e.g. by a signal frame).\r
+ add ip, sp, #32 @ ip = sp + 32\r
+ vld1.64 {d8-d11}, [sp:256] @ restore q4,q5\r
+ vld1.64 {d12-d15}, [ip:256] @ restore q6,q7\r
+ mov sp, r10 @ sp = oldsp\r
+ pop {r4-r10}\r
+ bx lr\r
+\r
+@ end procedure neon_eagle2x_8_8\r
+\r
+\r
+neon_eagle2x_16_16:\r
+\r
+@ 2x scaler entry point, 16-bit source pixels to 16-bit destination pixels.\r
+@ The per-line work is done by the neon_eagle2x_16_16_line macro (not defined\r
+@ in this file). It is expanded once in "first" mode, height - 2 times in\r
+@ "middle" mode and once in "last" mode.\r
+@\r
+@ height must be at least 2; smaller values are not handled.\r
+@\r
+@ r0 = const uint16_t *src\r
+@ r1 = uint16_t *dst\r
+@ r2 = unsigned int width (pixels)\r
+@ r3 = unsigned int srcstride (bytes)\r
+@ [sp] = unsigned int dststride (bytes)\r
+@ [sp+4] = unsigned int height\r
+@ lr = return address\r
+@\r
+@ r4-r10 and d8-d15 (q4-q7) are saved on entry and restored before return.\r
+\r
+ ldr ip, [sp] @ ip = dststride\r
+ push {r4-r10} @ 7 words, so stack args move up by 28 bytes\r
+ ldr r9, [sp, #(8*4)] @ r9 = height (was [sp+4] before the push)\r
+ sub r4, r0, r3 @ r4 = src - srcstride\r
+ mov r10, sp @ oldsp = sp\r
+ add r5, r0, r3 @ r5 = src + srcstride\r
+ bic sp, sp, #31 @ align sp to 32 bytes\r
+ add r6, r1, ip @ r6 = dst + dststride\r
+ sub sp, sp, #64 @ sp -= 64 (save area for q4-q7)\r
+ sub r3, r3, r2, lsl #1 @ r3 = srcstride - 2 * width\r
+ vst1.64 {d8-d11}, [sp:256] @ save q4,q5\r
+ add r7, sp, #32 @ r7 = sp + 32\r
+ sub ip, ip, r2, lsl #1 @ ip = dststride - 2 * width\r
+ vst1.64 {d12-d15}, [r7:256] @ save q6,q7\r
+ lsl ip, #1 @ ip = 2 * dststride - 4 * width\r
+ mov r7, r2 @ r7 = width\r
+ sub r9, r9, #2 @ r9 = height - 2 (number of middle lines)\r
+\r
+@ r0 = src\r
+@ r1 = dst\r
+@ r2 = width\r
+@ r3 = srcdiff (srcstride - 2 * width)\r
+@ r4 = src - srcstride\r
+@ r5 = src + srcstride\r
+@ r6 = dst + dststride\r
+@ r7 = counter\r
+@ r8 = tmpreg\r
+@ r9 = remaining middle lines (height - 2)\r
+@ r10 = oldsp\r
+@ ip = dstdiff (2 * dststride - 4 * width)\r
+\r
+ @ first line\r
+ neon_eagle2x_16_16_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+ add r0, r0, r3\r
+ add r4, r4, r3\r
+ add r5, r5, r3\r
+ add r1, r1, ip\r
+ add r6, r6, ip\r
+\r
+ @ The loop below tests its counter only after the body, so with\r
+ @ height == 2 (no middle lines) it must be skipped entirely; otherwise\r
+ @ the counter would wrap around from 0.\r
+ cmp r9, #0\r
+ beq .Lneon_eagle2x_16_16_last\r
+\r
+ @ middle lines\r
+ 101:\r
+ mov r7, r2\r
+\r
+ neon_eagle2x_16_16_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+ subS r9, r9, #1\r
+ add r0, r0, r3\r
+ add r4, r4, r3\r
+ add r5, r5, r3\r
+ add r1, r1, ip\r
+ add r6, r6, ip\r
+ bne 101b\r
+\r
+.Lneon_eagle2x_16_16_last:\r
+ @ last line\r
+ mov r7, r2\r
+\r
+ neon_eagle2x_16_16_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+ @ Reload q4-q7 while the save area is still above sp; once sp is\r
+ @ restored that memory is free stack and may be overwritten\r
+ @ asynchronously (e.g. by a signal frame).\r
+ add ip, sp, #32 @ ip = sp + 32\r
+ vld1.64 {d8-d11}, [sp:256] @ restore q4,q5\r
+ vld1.64 {d12-d15}, [ip:256] @ restore q6,q7\r
+ mov sp, r10 @ sp = oldsp\r
+ pop {r4-r10}\r
+ bx lr\r
+\r
+@ end procedure neon_eagle2x_16_16\r
+\r
+\r
+neon_eagle2x_8_16:\r
+\r
+@ 2x scaler entry point, 8-bit source pixels plus a palette to 16-bit\r
+@ destination pixels. Each source line is first passed through the\r
+@ neon_normal1x_8_16_line macro together with the palette pointer, writing\r
+@ into one of three temporary line buffers on the stack (2 * width bytes\r
+@ each). The neon_eagle2x_16_16_line macro then reads those buffers and\r
+@ writes two destination rows per pass. Both macros are defined outside\r
+@ this file.\r
+@\r
+@ height must be at least 2; smaller values are not handled.\r
+@\r
+@ r0 = const uint8_t *src\r
+@ r1 = uint16_t *dst\r
+@ r2 = const uint32_t *palette\r
+@ r3 = unsigned int width (pixels)\r
+@ [sp] = unsigned int srcstride (bytes)\r
+@ [sp+4] = unsigned int dststride (bytes)\r
+@ [sp+8] = unsigned int height\r
+@ lr = return address\r
+@\r
+@ r4-r11, lr and d8-d15 (q4-q7) are saved on entry and restored before return.\r
+\r
+@ three temporary lines\r
+\r
+ ldr ip, [sp] @ ip = srcstride\r
+ push {r4-r11,lr} @ 9 words, so stack args move up by 36 bytes\r
+ ldr r4, [sp, #(4*10)] @ r4 = dststride (was [sp+4] before the push)\r
+ ldr r5, [sp, #(4*11)] @ r5 = height (was [sp+8] before the push)\r
+ mov r6, sp @ r6 = sp\r
+ sub ip, ip, r3 @ ip = srcstride - width\r
+ bic sp, sp, #31 @ align sp to 32 bytes\r
+ sub r7, r4, r3, lsl #1 @ r7 = dststride - 2 * width\r
+ sub sp, sp, r3, lsl #1 @ sp -= 2 * width\r
+ sub r5, r5, #2 @ height -= 2 (number of middle lines)\r
+ mov r10, sp @ tmpline3 = sp\r
+ lsl r7, #1 @ r7 = 2 * dststride - 4 * width\r
+ bic sp, sp, #31 @ align sp to 32 bytes\r
+ sub sp, sp, r3, lsl #1 @ sp -= 2 * width\r
+ mov r11, sp @ tmpline2 = sp\r
+ bic sp, sp, #31 @ align sp to 32 bytes\r
+ sub sp, sp, r3, lsl #1 @ sp -= 2 * width\r
+ mov lr, sp @ tmpline1 = sp\r
+ bic sp, sp, #31 @ align sp to 32 bytes\r
+ @ Allocate the frame before writing to it, so that nothing is ever\r
+ @ stored below sp. Layout: 36 bytes of locals at [sp], then the\r
+ @ 32-byte aligned 64-byte NEON save area at [sp + 36].\r
+ sub sp, sp, #(36 + 64) @ sp -= (36 + 64)\r
+ add r8, sp, #36 @ r8 = save area for q4,q5\r
+ vst1.64 {d8-d11}, [r8:256] @ save q4,q5\r
+ add r9, sp, #(36 + 32) @ r9 = save area for q6,q7\r
+ vst1.64 {d12-d15}, [r9:256] @ save q6,q7\r
+ str r6, [sp] @ oldsp = r6\r
+ str r5, [sp, #4] @ height = r5\r
+ str ip, [sp, #8] @ srcdiff = ip\r
+ str r7, [sp, #12] @ dstdiff = r7\r
+ str r4, [sp, #16] @ dststride = r4\r
+ str lr, [sp, #20] @ tmpline1 = lr\r
+ str r11, [sp, #24] @ tmpline2 = r11\r
+ str r10, [sp, #28] @ tmpline3 = r10\r
+ str r3, [sp, #32] @ width = r3\r
+\r
+@ r0 = src\r
+@ r1 = dst\r
+@ r2 = palette\r
+@ r3 = counter\r
+@ r4 = dst2\r
+\r
+@ r11 = bufptr1\r
+@ ip = bufptr2\r
+@ lr = bufptr3\r
+\r
+@ [sp] = oldsp\r
+@ [sp, #4] = remaining middle lines (height - 2)\r
+@ [sp, #8] = srcdiff (srcstride - width)\r
+@ [sp, #12] = dstdiff (2 * dststride - 4 * width)\r
+@ [sp, #16] = dststride\r
+@ [sp, #20] = tmpline1\r
+@ [sp, #24] = tmpline2\r
+@ [sp, #28] = tmpline3\r
+@ [sp, #32] = width\r
+\r
+ @ lr = tmpline1\r
+ @ r3 = counter\r
+\r
+ @ first line\r
+ neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip\r
+\r
+ ldr r7, [sp, #8] @ r7 = srcdiff\r
+ ldr r3, [sp, #32] @ counter = width\r
+ ldr lr, [sp, #24] @ bufptr3 = tmpline2\r
+ add r0, r0, r7 @ src += srcdiff\r
+\r
+ @ second line\r
+ neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip\r
+\r
+ ldr r9, [sp, #16] @ r9 = dststride\r
+ ldr r3, [sp, #32] @ counter = width\r
+ ldr ip, [sp, #20] @ bufptr2 = tmpline1\r
+ ldr lr, [sp, #24] @ bufptr3 = tmpline2\r
+ add r4, r1, r9 @ dst2 = dst + dststride\r
+\r
+ @ first temporary line\r
+ neon_eagle2x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, 1, 0\r
+\r
+ ldr r7, [sp, #8] @ r7 = srcdiff\r
+ ldr r8, [sp, #12] @ r8 = dstdiff\r
+ ldr r3, [sp, #32] @ counter = width\r
+ ldr lr, [sp, #28] @ bufptr3 = tmpline3\r
+ add r0, r0, r7 @ src += srcdiff\r
+ add r1, r1, r8 @ dst += dstdiff\r
+\r
+ @ The loop below tests its counter only after the body, so with\r
+ @ height == 2 (no middle lines) it must be skipped entirely; otherwise\r
+ @ the counter would wrap around from 0. r3, r1 and the tmpline slots\r
+ @ already hold what the last-line code expects.\r
+ ldr r6, [sp, #4] @ r6 = remaining middle lines\r
+ cmp r6, #0\r
+ beq .Lneon_eagle2x_8_16_last\r
+\r
+ 100:\r
+\r
+ @ line n+1\r
+ neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip\r
+\r
+ ldr r9, [sp, #16] @ r9 = dststride\r
+ ldr r11, [sp, #20] @ bufptr1 = tmpline1\r
+ ldr ip, [sp, #24] @ bufptr2 = tmpline2\r
+ ldr lr, [sp, #28] @ bufptr3 = tmpline3\r
+ add r4, r1, r9 @ dst2 = dst + dststride\r
+ ldr r3, [sp, #32] @ counter = width\r
+ @ rotate the three buffers for the next iteration\r
+ str r11, [sp, #28] @ tmpline3 = bufptr1\r
+ str ip, [sp, #20] @ tmpline1 = bufptr2\r
+ str lr, [sp, #24] @ tmpline2 = bufptr3\r
+\r
+ @ temporary line n\r
+ neon_eagle2x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, 1, 0\r
+\r
+ ldr r6, [sp, #4] @ r6 = height\r
+ ldr r7, [sp, #8] @ r7 = srcdiff\r
+ ldr r8, [sp, #12] @ r8 = dstdiff\r
+ ldr r3, [sp, #32] @ counter = width\r
+ subS r6, r6, #1 @ height--\r
+ ldr lr, [sp, #28] @ bufptr3 = tmpline3\r
+ add r0, r0, r7 @ src += srcdiff\r
+ add r1, r1, r8 @ dst += dstdiff\r
+ str r6, [sp, #4] @ height = r6\r
+ bne 100b\r
+\r
+.Lneon_eagle2x_8_16_last:\r
+\r
+ ldr r9, [sp, #16] @ r9 = dststride\r
+ ldr r11, [sp, #20] @ bufptr1 = tmpline1\r
+ ldr ip, [sp, #24] @ bufptr2 = tmpline2\r
+ add r4, r1, r9 @ dst2 = dst + dststride\r
+\r
+ @ last temporary line\r
+ neon_eagle2x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, 1, 0\r
+\r
+\r
+ @ Reload q4-q7 while the save area is still above sp; once sp is\r
+ @ restored that memory is free stack and may be overwritten\r
+ @ asynchronously (e.g. by a signal frame).\r
+ add r6, sp, #36 @ r6 = sp + 36\r
+ add ip, sp, #(36 + 32) @ ip = sp + 36 + 32\r
+ vld1.64 {d8-d11}, [r6:256] @ restore q4,q5\r
+ vld1.64 {d12-d15}, [ip:256] @ restore q6,q7\r
+ ldr sp, [sp] @ sp = oldsp\r
+ pop {r4-r11,lr}\r
+ bx lr\r
+\r
+@ end procedure neon_eagle2x_8_16\r
+\r