@@ @@ Copyright (C) 2012 Roman Pauer @@ @@ Permission is hereby granted, free of charge, to any person obtaining a copy of @@ this software and associated documentation files (the "Software"), to deal in @@ the Software without restriction, including without limitation the rights to @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies @@ of the Software, and to permit persons to whom the Software is furnished to do @@ so, subject to the following conditions: @@ @@ The above copyright notice and this permission notice shall be included in all @@ copies or substantial portions of the Software. @@ @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE @@ SOFTWARE. 
@@

@ ARM (A32) NEON entry points for the 2x "eagle" scaler, GNU as syntax.
@
@ The per-line work is done by macros that are defined in the two include
@ files below and are not visible in this file:
@   neon_eagle2x_8_8_line / neon_eagle2x_16_16_line  (neon_eagle2x.Sinc)
@   neon_normal1x_8_16_line                          (neon_normalxx.Sinc)
@ Every function here produces two destination lines per source line
@ (dst and dst + dststride are both handed to the line macro).
@
@ Common notes:
@ - q4-q7 (d8-d15) are saved on entry and restored on exit in a 32-byte
@   aligned 64-byte stack area.
@ - The stack is only ever accessed at or above sp: the save area is
@   written after sp has been lowered and read back before sp is raised
@   again, so an asynchronous signal handler running on the same stack
@   cannot overwrite it.
@ - height must be at least 2. The first and the last source line are
@   handled by dedicated "first"/"last" macro variants; the lines in
@   between are handled by a loop that runs (height - 2) times.

.arm

.include "neon_eagle2x.Sinc"
.include "neon_normalxx.Sinc"

.global neon_eagle2x_8_8
.global neon_eagle2x_16_16
.global neon_eagle2x_8_16

.align 4
neon_eagle2x_8_8:

@ 8-bit source pixels to 8-bit destination pixels.
@
@ r0     = const uint8_t *src
@ r1     = uint8_t *dst
@ r2     = unsigned int width (pixels)
@ r3     = unsigned int srcstride (bytes)
@ [sp]   = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height (must be >= 2)
@ lr     = return address

    ldr         ip, [sp]                @ ip = dststride
    push        {r4-r10}                @ 7 registers = 28 bytes
    ldr         r9, [sp, #(8*4)]        @ r9 = height (28 bytes pushed + 4)
    sub         r4, r0, r3              @ r4 = src - srcstride
    mov         r10, sp                 @ oldsp = sp
    add         r5, r0, r3              @ r5 = src + srcstride
    bic         sp, sp, #31             @ align sp to 32 bytes
    add         r6, r1, ip              @ r6 = dst + dststride
    sub         sp, sp, #64             @ sp -= 64 (save area for q4-q7)
    sub         r3, r3, r2              @ r3 = srcstride - width
    vst1.64     {d8-d11}, [sp:256]      @ save q4,q5
    add         r7, sp, #32             @ r7 = sp + 32
    sub         ip, ip, r2              @ ip = dststride - width
    vst1.64     {d12-d15}, [r7:256]     @ save q6,q7
    lsl         ip, #1                  @ ip = 2 * dststride - 2 * width
    mov         r7, r2                  @ r7 = width
    sub         r9, r9, #2              @ r9 = height - 2

@ r0  = src
@ r1  = dst
@ r2  = width
@ r3  = srcdiff (srcstride - width)
@ r4  = src - srcstride
@ r5  = src + srcstride
@ r6  = dst + dststride
@ r7  = counter
@ r8  = tmpreg
@ r9  = number of middle lines left (height - 2)
@ r10 = oldsp
@ ip  = dstdiff (2 * dststride - 2 * width)

    @ first line
    neon_eagle2x_8_8_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0

    add         r0, r0, r3              @ advance the three source row
    add         r4, r4, r3              @ pointers to the next line
    add         r5, r5, r3
    add         r1, r1, ip              @ advance both destination row
    add         r6, r6, ip              @ pointers by two lines

    @ The loop below is a do-while: with no middle lines (height == 2)
    @ it would run once and then wrap the counter, so skip it entirely.
    cmp         r9, #0
    beq         .Lneon_eagle2x_8_8_last

    @ middle lines
    101:
    mov         r7, r2                  @ counter = width

    neon_eagle2x_8_8_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0

    subS        r9, r9, #1              @ one middle line done; sets Z for bne
    add         r0, r0, r3
    add         r4, r4, r3
    add         r5, r5, r3
    add         r1, r1, ip
    add         r6, r6, ip
    bne         101b

    @ last line
.Lneon_eagle2x_8_8_last:
    mov         r7, r2                  @ counter = width

    neon_eagle2x_8_8_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0

    add         ip, sp, #32             @ ip = sp + 32
    vld1.64     {d8-d11}, [sp:256]      @ restore q4,q5
    vld1.64     {d12-d15}, [ip:256]     @ restore q6,q7 (before sp moves up)
    mov         sp, r10                 @ sp = oldsp
    pop         {r4-r10}
    bx          lr

@ end procedure neon_eagle2x_8_8


neon_eagle2x_16_16:

@ 16-bit source pixels to 16-bit destination pixels.
@
@ r0     = const uint16_t *src
@ r1     = uint16_t *dst
@ r2     = unsigned int width (pixels)
@ r3     = unsigned int srcstride (bytes)
@ [sp]   = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height (must be >= 2)
@ lr     = return address

    ldr         ip, [sp]                @ ip = dststride
    push        {r4-r10}                @ 7 registers = 28 bytes
    ldr         r9, [sp, #(8*4)]        @ r9 = height (28 bytes pushed + 4)
    sub         r4, r0, r3              @ r4 = src - srcstride
    mov         r10, sp                 @ oldsp = sp
    add         r5, r0, r3              @ r5 = src + srcstride
    bic         sp, sp, #31             @ align sp to 32 bytes
    add         r6, r1, ip              @ r6 = dst + dststride
    sub         sp, sp, #64             @ sp -= 64 (save area for q4-q7)
    sub         r3, r3, r2, lsl #1      @ r3 = srcstride - 2 * width
    vst1.64     {d8-d11}, [sp:256]      @ save q4,q5
    add         r7, sp, #32             @ r7 = sp + 32
    sub         ip, ip, r2, lsl #1      @ ip = dststride - 2 * width
    vst1.64     {d12-d15}, [r7:256]     @ save q6,q7
    lsl         ip, #1                  @ ip = 2 * dststride - 4 * width
    mov         r7, r2                  @ r7 = width
    sub         r9, r9, #2              @ r9 = height - 2

@ r0  = src
@ r1  = dst
@ r2  = width
@ r3  = srcdiff (srcstride - 2 * width)
@ r4  = src - srcstride
@ r5  = src + srcstride
@ r6  = dst + dststride
@ r7  = counter
@ r8  = tmpreg
@ r9  = number of middle lines left (height - 2)
@ r10 = oldsp
@ ip  = dstdiff (2 * dststride - 4 * width)

    @ first line
    neon_eagle2x_16_16_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0

    add         r0, r0, r3              @ advance the three source row
    add         r4, r4, r3              @ pointers to the next line
    add         r5, r5, r3
    add         r1, r1, ip              @ advance both destination row
    add         r6, r6, ip              @ pointers by two lines

    @ The loop below is a do-while: with no middle lines (height == 2)
    @ it would run once and then wrap the counter, so skip it entirely.
    cmp         r9, #0
    beq         .Lneon_eagle2x_16_16_last

    @ middle lines
    101:
    mov         r7, r2                  @ counter = width

    neon_eagle2x_16_16_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0

    subS        r9, r9, #1              @ one middle line done; sets Z for bne
    add         r0, r0, r3
    add         r4, r4, r3
    add         r5, r5, r3
    add         r1, r1, ip
    add         r6, r6, ip
    bne         101b

    @ last line
.Lneon_eagle2x_16_16_last:
    mov         r7, r2                  @ counter = width

    neon_eagle2x_16_16_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0

    add         ip, sp, #32             @ ip = sp + 32
    vld1.64     {d8-d11}, [sp:256]      @ restore q4,q5
    vld1.64     {d12-d15}, [ip:256]     @ restore q6,q7 (before sp moves up)
    mov         sp, r10                 @ sp = oldsp
    pop         {r4-r10}
    bx          lr

@ end procedure neon_eagle2x_16_16


neon_eagle2x_8_16:

@ 8-bit source pixels, looked up through a palette, to 16-bit destination.
@ Each source line is first expanded by neon_normal1x_8_16_line into one of
@ three temporary line buffers on the stack; the 16-bit eagle line macro
@ then runs on those buffers, which are rotated after every line.
@
@ r0     = const uint8_t *src
@ r1     = uint8_t *dst
@ r2     = const uint32_t *palette
@ r3     = unsigned int width (pixels)
@ [sp]   = unsigned int srcstride (bytes)
@ [sp+4] = unsigned int dststride (bytes)
@ [sp+8] = unsigned int height (must be >= 2)
@ lr     = return address
@
@ Stack use grows with width: three temporary lines of 2 * width bytes
@ each, plus alignment padding, 64 bytes for q4-q7 and 36 bytes of locals.

@ three temporary lines

    ldr         ip, [sp]                @ ip = srcstride
    push        {r4-r11,lr}             @ 9 registers = 36 bytes
    ldr         r4, [sp, #(4*10)]       @ r4 = dststride (36 bytes pushed + 4)
    ldr         r5, [sp, #(4*11)]       @ r5 = height    (36 bytes pushed + 8)
    mov         r6, sp                  @ r6 = sp
    sub         ip, ip, r3              @ ip = srcstride - width
    bic         sp, sp, #31             @ align sp to 32 bytes
    sub         r7, r4, r3, lsl #1      @ r7 = dststride - 2 * width
    sub         sp, sp, r3, lsl #1      @ sp -= 2 * width
    sub         r5, r5, #2              @ height -= 2
    mov         r10, sp                 @ tmpline3 = sp
    lsl         r7, #1                  @ r7 = 2 * dststride - 4 * width
    bic         sp, sp, #31             @ align sp to 32 bytes
    sub         sp, sp, r3, lsl #1      @ sp -= 2 * width
    mov         r11, sp                 @ tmpline2 = sp
    bic         sp, sp, #31             @ align sp to 32 bytes
    sub         sp, sp, r3, lsl #1      @ sp -= 2 * width
    mov         lr, sp                  @ tmpline1 = sp
    bic         sp, sp, #31             @ align sp to 32 bytes
    sub         sp, sp, #(36 + 64)      @ sp -= (36 + 64): locals + q4-q7 area
    add         r8, sp, #36             @ r8 = sp + 36 (32-byte aligned)
    vst1.64     {d8-d11}, [r8:256]      @ save q4,q5
    add         r9, sp, #(36 + 32)      @ r9 = sp + 68 (32-byte aligned)
    vst1.64     {d12-d15}, [r9:256]     @ save q6,q7
    str         r6, [sp]                @ oldsp = r6
    str         r5, [sp, #4]            @ height = r5
    str         ip, [sp, #8]            @ srcdiff = ip
    str         r7, [sp, #12]           @ dstdiff = r7
    str         r4, [sp, #16]           @ dststride = r4
    str         lr, [sp, #20]           @ tmpline1 = lr
    str         r11, [sp, #24]          @ tmpline2 = r11
    str         r10, [sp, #28]          @ tmpline3 = r10
    str         r3, [sp, #32]           @ width = r3

@ r0  = src
@ r1  = dst
@ r2  = palette
@ r3  = counter
@ r4  = dst2
@ r11 = bufptr1
@ ip  = bufptr2
@ lr  = bufptr3

@ [sp]      = oldsp
@ [sp, #4]  = number of middle lines left (height - 2)
@ [sp, #8]  = srcdiff (srcstride - width)
@ [sp, #12] = dstdiff (2 * dststride - 4 * width)
@ [sp, #16] = dststride
@ [sp, #20] = tmpline1
@ [sp, #24] = tmpline2
@ [sp, #28] = tmpline3
@ [sp, #32] = width
@ [sp, #36] = saved q4,q5
@ [sp, #68] = saved q6,q7

    @ lr = tmpline1
    @ r3 = counter

    @ first line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr         r7, [sp, #8]            @ r7 = srcdiff
    ldr         r3, [sp, #32]           @ counter = width
    ldr         lr, [sp, #24]           @ bufptr3 = tmpline2
    add         r0, r0, r7              @ src += srcdiff

    @ second line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr         r9, [sp, #16]           @ r9 = dststride
    ldr         r3, [sp, #32]           @ counter = width
    ldr         ip, [sp, #20]           @ bufptr2 = tmpline1
    ldr         lr, [sp, #24]           @ bufptr3 = tmpline2
    add         r4, r1, r9              @ dst2 = dst + dststride

    @ first temporary line
    neon_eagle2x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, 1, 0

    ldr         r7, [sp, #8]            @ r7 = srcdiff
    ldr         r8, [sp, #12]           @ r8 = dstdiff
    ldr         r3, [sp, #32]           @ counter = width
    ldr         lr, [sp, #28]           @ bufptr3 = tmpline3
    add         r0, r0, r7              @ src += srcdiff
    add         r1, r1, r8              @ dst += dstdiff

    @ The loop below is a do-while that also converts source line n+1.
    @ With no middle lines (height == 2) it would read a line that does
    @ not exist and then wrap the counter, so skip it entirely.
    ldr         r6, [sp, #4]            @ r6 = number of middle lines
    cmp         r6, #0
    beq         .Lneon_eagle2x_8_16_last

    100:

    @ line n+1
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr         r9, [sp, #16]           @ r9 = dststride
    ldr         r11, [sp, #20]          @ bufptr1 = tmpline1
    ldr         ip, [sp, #24]           @ bufptr2 = tmpline2
    ldr         lr, [sp, #28]           @ bufptr3 = tmpline3
    add         r4, r1, r9              @ dst2 = dst + dststride
    ldr         r3, [sp, #32]           @ counter = width
    @ rotate the three buffers for the next iteration
    str         r11, [sp, #28]          @ tmpline3 = bufptr1
    str         ip, [sp, #20]           @ tmpline1 = bufptr2
    str         lr, [sp, #24]           @ tmpline2 = bufptr3

    @ temporary line n
    neon_eagle2x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, 1, 0

    ldr         r6, [sp, #4]            @ r6 = height
    ldr         r7, [sp, #8]            @ r7 = srcdiff
    ldr         r8, [sp, #12]           @ r8 = dstdiff
    ldr         r3, [sp, #32]           @ counter = width
    subS        r6, r6, #1              @ height--
    ldr         lr, [sp, #28]           @ bufptr3 = tmpline3
    add         r0, r0, r7              @ src += srcdiff
    add         r1, r1, r8              @ dst += dstdiff
    str         r6, [sp, #4]            @ height = r6
    bne         100b                    @ flags still from subS (ldr/add/str keep them)

.Lneon_eagle2x_8_16_last:
    ldr         r9, [sp, #16]           @ r9 = dststride
    ldr         r11, [sp, #20]          @ bufptr1 = tmpline1
    ldr         ip, [sp, #24]           @ bufptr2 = tmpline2
    add         r4, r1, r9              @ dst2 = dst + dststride

    @ last temporary line
    neon_eagle2x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, 1, 0

    add         r6, sp, #36             @ r6 = sp + 36
    vld1.64     {d8-d11}, [r6:256]      @ restore q4,q5
    add         ip, r6, #32             @ ip = r6 + 32
    vld1.64     {d12-d15}, [ip:256]     @ restore q6,q7 (before sp moves up)
    ldr         sp, [sp]                @ sp = oldsp
    pop         {r4-r11,lr}
    bx          lr

@ end procedure neon_eagle2x_8_16