add M-HT's neon scalers
[fceu.git] / drivers / arm / neon_eagle2x.S
diff --git a/drivers/arm/neon_eagle2x.S b/drivers/arm/neon_eagle2x.S
new file mode 100644 (file)
index 0000000..c4e96c2
--- /dev/null
@@ -0,0 +1,337 @@
+@@\r
+@@  Copyright (C) 2012 Roman Pauer\r
+@@\r
+@@  Permission is hereby granted, free of charge, to any person obtaining a copy of\r
+@@  this software and associated documentation files (the "Software"), to deal in\r
+@@  the Software without restriction, including without limitation the rights to\r
+@@  use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r
+@@  of the Software, and to permit persons to whom the Software is furnished to do\r
+@@  so, subject to the following conditions:\r
+@@\r
+@@  The above copyright notice and this permission notice shall be included in all\r
+@@  copies or substantial portions of the Software.\r
+@@\r
+@@  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
+@@  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
+@@  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
+@@  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
+@@  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
+@@  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+@@  SOFTWARE.\r
+@@\r
+\r
+.arm\r
+\r
+.include "neon_eagle2x.Sinc"\r
+.include "neon_normalxx.Sinc"\r
+\r
+.global neon_eagle2x_8_8\r
+.global neon_eagle2x_16_16\r
+.global neon_eagle2x_8_16\r
+\r
+.align 4\r
+neon_eagle2x_8_8:\r
+\r
+@ 2x scaler entry point: 8-bit source pixels -> 8-bit destination pixels.\r
+@ Each source line is handed to the neon_eagle2x_8_8_line macro (defined in\r
+@ neon_eagle2x.Sinc) together with the lines above and below it and two\r
+@ destination line pointers (dst and dst + dststride).\r
+@\r
+@ r0     = const uint8_t *src\r
+@ r1     = uint8_t *dst\r
+@ r2     = unsigned int width (pixels)\r
+@ r3     = unsigned int srcstride (bytes)\r
+@ [sp]   = unsigned int dststride (bytes)\r
+@ [sp+4] = unsigned int height\r
+@ lr     = return address\r
+@\r
+@ r4-r10 and q4-q7 are saved on entry and restored before returning.\r
+@ NOTE: height must be at least 3 - the middle-line loop counts height - 2\r
+@ down to zero but only tests after its first pass, so height == 2 would\r
+@ wrap the counter.\r
+\r
+        ldr    ip, [sp]                    @ ip = dststride\r
+        push {r4-r10}                   @ 7 words pushed: stack args move up by 28 bytes\r
+        ldr    r9, [sp, #(8*4)]            @ r9 = height\r
+        sub r4, r0, r3                  @ r4 = src - srcstride\r
+        mov r10, sp                     @ oldsp = sp\r
+        add r5, r0, r3                  @ r5 = src + srcstride\r
+        bic sp, sp, #31                 @ align sp to 32 bytes\r
+        add r6, r1, ip                  @ r6 = dst + dststride\r
+        sub sp, sp, #64                 @ sp -= 64 (save area for q4-q7)\r
+        sub r3, r3, r2                  @ r3 = srcstride - width\r
+        vst1.64 {d8-d11}, [sp:256]      @ save q4,q5\r
+        add r7, sp, #32                 @ r7 = sp + 32\r
+        sub ip, ip, r2                  @ ip = dststride - width\r
+        vst1.64 {d12-d15}, [r7:256]     @ save q6,q7\r
+        lsl ip, #1                      @ ip = 2 * dststride - 2 * width\r
+        mov r7, r2                      @ r7 = width\r
+        sub r9, r9, #2                  @ r9 = height - 2\r
+\r
+\r
+@ r0  = src\r
+@ r1  = dst\r
+@ r2  = width\r
+@ r3  = srcdiff (srcstride - width)\r
+@ r4  = src - srcstride\r
+@ r5  = src + srcstride\r
+@ r6  = dst + dststride\r
+@ r7  = counter\r
+@ r8  = tmpreg\r
+@ r9  = height\r
+@ r10 = oldsp\r
+@ ip  = dstdiff (2 * dststride - 2 * width)\r
+\r
+    @ first line\r
+        neon_eagle2x_8_8_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+    @ step all line pointers to the start of the next line\r
+        add r0, r0, r3\r
+        add r4, r4, r3\r
+        add r5, r5, r3\r
+        add r1, r1, ip\r
+        add r6, r6, ip\r
+\r
+    @ middle lines (executed height - 2 times)\r
+    101:\r
+        mov r7, r2                      @ counter = width\r
+\r
+        neon_eagle2x_8_8_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+        subS r9, r9, #1                 @ flags set here are consumed by the bne below\r
+        add r0, r0, r3\r
+        add r4, r4, r3\r
+        add r5, r5, r3\r
+        add r1, r1, ip\r
+        add r6, r6, ip\r
+        bne 101b\r
+\r
+    @ last line\r
+        mov r7, r2                      @ counter = width\r
+\r
+        neon_eagle2x_8_8_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+    @ Restore q4-q7 while sp still covers the save area. The area lies below\r
+    @ oldsp, so it must not be read after sp has been raised (anything that\r
+    @ uses the stack asynchronously, e.g. a signal frame, could overwrite it).\r
+        add ip, sp, #32                 @ ip = sp + 32\r
+        vld1.64 {d8-d11}, [sp:256]      @ restore q4,q5\r
+        vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7\r
+        mov sp, r10                     @ sp = oldsp\r
+        pop {r4-r10}\r
+        bx lr\r
+\r
+@ end procedure neon_eagle2x_8_8\r
+\r
+\r
+neon_eagle2x_16_16:\r
+\r
+@ 2x scaler entry point: 16-bit source pixels -> 16-bit destination pixels.\r
+@ Each source line is handed to the neon_eagle2x_16_16_line macro (defined in\r
+@ neon_eagle2x.Sinc) together with the lines above and below it and two\r
+@ destination line pointers (dst and dst + dststride).\r
+@\r
+@ r0     = const uint16_t *src\r
+@ r1     = uint16_t *dst\r
+@ r2     = unsigned int width (pixels)\r
+@ r3     = unsigned int srcstride (bytes)\r
+@ [sp]   = unsigned int dststride (bytes)\r
+@ [sp+4] = unsigned int height\r
+@ lr     = return address\r
+@\r
+@ r4-r10 and q4-q7 are saved on entry and restored before returning.\r
+@ NOTE: height must be at least 3 - the middle-line loop counts height - 2\r
+@ down to zero but only tests after its first pass, so height == 2 would\r
+@ wrap the counter.\r
+\r
+        ldr    ip, [sp]                    @ ip = dststride\r
+        push {r4-r10}                   @ 7 words pushed: stack args move up by 28 bytes\r
+        ldr    r9, [sp, #(8*4)]            @ r9 = height\r
+        sub r4, r0, r3                  @ r4 = src - srcstride\r
+        mov r10, sp                     @ oldsp = sp\r
+        add r5, r0, r3                  @ r5 = src + srcstride\r
+        bic sp, sp, #31                 @ align sp to 32 bytes\r
+        add r6, r1, ip                  @ r6 = dst + dststride\r
+        sub sp, sp, #64                 @ sp -= 64 (save area for q4-q7)\r
+        sub r3, r3, r2, lsl #1          @ r3 = srcstride - 2 * width\r
+        vst1.64 {d8-d11}, [sp:256]      @ save q4,q5\r
+        add r7, sp, #32                 @ r7 = sp + 32\r
+        sub ip, ip, r2, lsl #1          @ ip = dststride - 2 * width\r
+        vst1.64 {d12-d15}, [r7:256]     @ save q6,q7\r
+        lsl ip, #1                      @ ip = 2 * dststride - 4 * width\r
+        mov r7, r2                      @ r7 = width\r
+        sub r9, r9, #2                  @ r9 = height - 2\r
+\r
+@ r0  = src\r
+@ r1  = dst\r
+@ r2  = width\r
+@ r3  = srcdiff (srcstride - 2 * width)\r
+@ r4  = src - srcstride\r
+@ r5  = src + srcstride\r
+@ r6  = dst + dststride\r
+@ r7  = counter\r
+@ r8  = tmpreg\r
+@ r9  = height\r
+@ r10 = oldsp\r
+@ ip  = dstdiff (2 * dststride - 4 * width)\r
+\r
+    @ first line\r
+        neon_eagle2x_16_16_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+    @ step all line pointers to the start of the next line\r
+        add r0, r0, r3\r
+        add r4, r4, r3\r
+        add r5, r5, r3\r
+        add r1, r1, ip\r
+        add r6, r6, ip\r
+\r
+    @ middle lines (executed height - 2 times)\r
+    101:\r
+        mov r7, r2                      @ counter = width\r
+\r
+        neon_eagle2x_16_16_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+        subS r9, r9, #1                 @ flags set here are consumed by the bne below\r
+        add r0, r0, r3\r
+        add r4, r4, r3\r
+        add r5, r5, r3\r
+        add r1, r1, ip\r
+        add r6, r6, ip\r
+        bne 101b\r
+\r
+    @ last line\r
+        mov r7, r2                      @ counter = width\r
+\r
+        neon_eagle2x_16_16_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0\r
+\r
+    @ Restore q4-q7 while sp still covers the save area. The area lies below\r
+    @ oldsp, so it must not be read after sp has been raised (anything that\r
+    @ uses the stack asynchronously, e.g. a signal frame, could overwrite it).\r
+        add ip, sp, #32                 @ ip = sp + 32\r
+        vld1.64 {d8-d11}, [sp:256]      @ restore q4,q5\r
+        vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7\r
+        mov sp, r10                     @ sp = oldsp\r
+        pop {r4-r10}\r
+        bx lr\r
+\r
+@ end procedure neon_eagle2x_16_16\r
+\r
+\r
+neon_eagle2x_8_16:\r
+\r
+@ 2x scaler entry point: 8-bit source pixels + palette -> 16-bit destination\r
+@ pixels. Source lines are first passed through neon_normal1x_8_16_line\r
+@ (from neon_normalxx.Sinc) into temporary lines on the stack; those\r
+@ temporary lines are then fed to neon_eagle2x_16_16_line (from\r
+@ neon_eagle2x.Sinc) as the above / current / below source lines.\r
+@\r
+@ r0     = const uint8_t *src\r
+@ r1     = uint16_t *dst (dstdiff below uses 4 * width bytes per output line)\r
+@ r2     = const uint32_t *palette\r
+@ r3     = unsigned int width (pixels)\r
+@ [sp]   = unsigned int srcstride (bytes)\r
+@ [sp+4] = unsigned int dststride (bytes)\r
+@ [sp+8] = unsigned int height\r
+@ lr     = return address\r
+@\r
+@ r4-r11, lr and q4-q7 are saved on entry and restored before returning.\r
+@ NOTE: height must be at least 3 - two lines are converted up front and the\r
+@ main loop counts height - 2 down to zero, testing only after its first\r
+@ pass.\r
+\r
+@ three temporary lines (2 * width bytes each, carved out of the stack)\r
+@ NOTE(review): each tmpline pointer is taken after subtracting 2 * width\r
+@ from a 32-byte aligned sp, so the lines are 32-byte aligned only when\r
+@ 2 * width is a multiple of 32 - confirm this matches the alignment the\r
+@ line macros expect for their source arguments.\r
+\r
+        ldr    ip, [sp]                @ ip = srcstride\r
+        push {r4-r11,lr}            @ 9 words pushed: stack args move up by 36 bytes\r
+        ldr r4, [sp, #(4*10)]       @ r4 = dststride\r
+        ldr r5, [sp, #(4*11)]       @ r5 = height\r
+        mov r6, sp                  @ r6 = sp\r
+        sub ip, ip, r3              @ ip = srcstride - width\r
+        bic sp, sp, #31             @ align sp to 32 bytes\r
+        sub r7, r4, r3, lsl #1      @ r7 = dststride - 2 * width\r
+        sub sp, sp, r3, lsl #1      @ sp -= 2 * width\r
+        sub r5, r5, #2              @ height -= 2\r
+        mov r10, sp                 @ tmpline3 = sp\r
+        lsl r7, #1                  @ r7 = 2 * dststride - 4 * width\r
+        bic sp, sp, #31             @ align sp to 32 bytes\r
+        sub sp, sp, r3, lsl #1      @ sp -= 2 * width\r
+        mov r11, sp                 @ tmpline2 = sp\r
+        bic sp, sp, #31             @ align sp to 32 bytes\r
+        sub sp, sp, r3, lsl #1      @ sp -= 2 * width\r
+        mov lr, sp                  @ tmpline1 = sp\r
+        bic sp, sp, #31             @ align sp to 32 bytes\r
+    @ Reserve the frame first, then fill it: nothing may be stored below sp,\r
+    @ because anything that uses the stack asynchronously (e.g. a signal\r
+    @ frame) could overwrite it.\r
+        sub sp, sp, #(36 + 64)      @ sp -= (36 + 64): 36 bytes of locals + q4-q7\r
+        add r8, sp, #36             @ r8 = sp + 36 (32-byte aligned)\r
+        add r9, sp, #(36 + 32)      @ r9 = sp + 68\r
+        vst1.64 {d8-d11}, [r8:256]  @ save q4,q5\r
+        vst1.64 {d12-d15}, [r9:256] @ save q6,q7\r
+        str r6, [sp]                @ oldsp = r6\r
+        str r5, [sp, #4]            @ height = r5\r
+        str ip, [sp, #8]            @ srcdiff = ip\r
+        str r7, [sp, #12]           @ dstdiff = r7\r
+        str r4, [sp, #16]           @ dststride = r4\r
+        str lr, [sp, #20]           @ tmpline1 = lr\r
+        str r11, [sp, #24]          @ tmpline2 = r11\r
+        str r10, [sp, #28]          @ tmpline3 = r10\r
+        str r3, [sp, #32]           @ width = r3\r
+\r
+@ r0  = src\r
+@ r1  = dst\r
+@ r2  = palette\r
+@ r3  = counter\r
+@ r4  = dst2\r
+\r
+@ r11 = bufptr1\r
+@ ip  = bufptr2\r
+@ lr  = bufptr3\r
+\r
+@ [sp]      = oldsp\r
+@ [sp, #4]  = height\r
+@ [sp, #8]  = srcdiff (srcstride - width)\r
+@ [sp, #12] = dstdiff (2 * dststride - 4 * width)\r
+@ [sp, #16] = dststride\r
+@ [sp, #20] = tmpline1\r
+@ [sp, #24] = tmpline2\r
+@ [sp, #28] = tmpline3\r
+@ [sp, #32] = width\r
+@ [sp, #36] = saved q4,q5\r
+@ [sp, #68] = saved q6,q7\r
+\r
+    @ lr = tmpline1\r
+    @ r3 = counter\r
+\r
+    @ first line (source line 0 -> tmpline1)\r
+        neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip\r
+\r
+        ldr r7, [sp, #8]            @ r7 = srcdiff\r
+        ldr r3, [sp, #32]           @ counter = width\r
+        ldr lr, [sp, #24]           @ bufptr3 = tmpline2\r
+        add r0, r0, r7              @ src += srcdiff\r
+\r
+    @ second line (source line 1 -> tmpline2)\r
+        neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip\r
+\r
+        ldr r9, [sp, #16]           @ r9 = dststride\r
+        ldr r3, [sp, #32]           @ counter = width\r
+        ldr ip, [sp, #20]           @ bufptr2 = tmpline1\r
+        ldr lr, [sp, #24]           @ bufptr3 = tmpline2\r
+        add r4, r1, r9              @ dst2 = dst + dststride\r
+\r
+    @ first temporary line (r11 is not reloaded here; it still holds whatever\r
+    @ the previous macro left in it)\r
+        neon_eagle2x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, 1, 0\r
+\r
+        ldr r7, [sp, #8]            @ r7 = srcdiff\r
+        ldr r8, [sp, #12]           @ r8 = dstdiff\r
+        ldr r3, [sp, #32]           @ counter = width\r
+        ldr lr, [sp, #28]           @ bufptr3 = tmpline3\r
+        add r0, r0, r7              @ src += srcdiff\r
+        add r1, r1, r8              @ dst += dstdiff\r
+\r
+    100:\r
+\r
+    @ line n+1 (next source line -> the temporary line in lr)\r
+        neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip\r
+\r
+        ldr r9, [sp, #16]           @ r9 = dststride\r
+        ldr r11, [sp, #20]          @ bufptr1 = tmpline1\r
+        ldr ip, [sp, #24]           @ bufptr2 = tmpline2\r
+        ldr lr, [sp, #28]           @ bufptr3 = tmpline3\r
+        add r4, r1, r9              @ dst2 = dst + dststride\r
+        ldr r3, [sp, #32]           @ counter = width\r
+    @ rotate the three buffers for the next pass: 1 <- 2, 2 <- 3, 3 <- 1\r
+        str r11, [sp, #28]          @ tmpline3 = bufptr1\r
+        str ip, [sp, #20]           @ tmpline1 = bufptr2\r
+        str lr, [sp, #24]           @ tmpline2 = bufptr3\r
+\r
+    @ temporary line n\r
+        neon_eagle2x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, 1, 0\r
+\r
+        ldr r6, [sp, #4]            @ r6 = height\r
+        ldr r7, [sp, #8]            @ r7 = srcdiff\r
+        ldr r8, [sp, #12]           @ r8 = dstdiff\r
+        ldr r3, [sp, #32]           @ counter = width\r
+        subS r6, r6, #1             @ height-- (flags consumed by the bne below)\r
+        ldr lr, [sp, #28]           @ bufptr3 = tmpline3\r
+        add r0, r0, r7              @ src += srcdiff\r
+        add r1, r1, r8              @ dst += dstdiff\r
+        str r6, [sp, #4]            @ height = r6\r
+        bne 100b\r
+\r
+\r
+        ldr r9, [sp, #16]           @ r9 = dststride\r
+        ldr r11, [sp, #20]          @ bufptr1 = tmpline1\r
+        ldr ip, [sp, #24]           @ bufptr2 = tmpline2\r
+        add r4, r1, r9              @ dst2 = dst + dststride\r
+\r
+    @ last temporary line (r3 = width and lr = tmpline3 were loaded at the end\r
+    @ of the loop above)\r
+        neon_eagle2x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, 1, 0\r
+\r
+\r
+    @ Restore q4-q7 while sp still covers the save area; it lies below oldsp\r
+    @ and must not be read after sp has been raised.\r
+        add r6, sp, #36             @ r6 = sp + 36\r
+        vld1.64 {d8-d11}, [r6:256]  @ restore q4,q5\r
+        add ip, r6, #32             @ ip = r6 + 32\r
+        vld1.64 {d12-d15}, [ip:256] @ restore q6,q7\r
+        ldr sp, [sp]                @ sp = oldsp\r
+        pop {r4-r11,lr}\r
+        bx lr\r
+\r
+@ end procedure neon_eagle2x_8_16\r
+\r