| 1 | @@\r |
| 2 | @@ Copyright (C) 2012 Roman Pauer\r |
| 3 | @@\r |
| 4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r |
| 5 | @@ this software and associated documentation files (the "Software"), to deal in\r |
| 6 | @@ the Software without restriction, including without limitation the rights to\r |
| 7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r |
| 8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r |
| 9 | @@ so, subject to the following conditions:\r |
| 10 | @@\r |
| 11 | @@ The above copyright notice and this permission notice shall be included in all\r |
| 12 | @@ copies or substantial portions of the Software.\r |
| 13 | @@\r |
| 14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r |
| 15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r |
| 16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r |
| 17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r |
| 18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r |
| 19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r |
| 20 | @@ SOFTWARE.\r |
| 21 | @@\r |
| 22 | \r |
| 23 | .arm\r |
| 24 | \r |
| 25 | .include "neon_eagle2x.Sinc"\r |
| 26 | .include "neon_normalxx.Sinc"\r |
| 27 | \r |
| 28 | .global neon_eagle2x_8_8\r |
| 29 | .global neon_eagle2x_16_16\r |
| 30 | .global neon_eagle2x_8_16\r |
| 31 | \r |
| 32 | .align 4\r |
neon_eagle2x_8_8:

@ 8-bit source -> 8-bit destination, 2x in both directions.
@ Every source line is handed to the neon_eagle2x_8_8_line macro (defined
@ in an included file), which writes two destination lines: one at dst and
@ one at dst + dststride.
@
@ r0 = const uint8_t *src
@ r1 = uint8_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address
@
@ height must be at least 2 (one "first" and one "last" line are always
@ processed); smaller heights are not handled.
@ d8-d15 (q4-q7) are saved on entry and restored on exit.

    ldr ip, [sp]                    @ ip = dststride
    push {r4-r10}                   @ 7 words, so the stack args move up by 28 bytes
    ldr r9, [sp, #(8*4)]            @ r9 = height (28 bytes of pushes + 4 for dststride)
    sub r4, r0, r3                  @ r4 = src - srcstride
    mov r10, sp                     @ oldsp = sp
    add r5, r0, r3                  @ r5 = src + srcstride
    bic sp, sp, #31                 @ align sp to 32 bytes
    add r6, r1, ip                  @ r6 = dst + dststride
    sub sp, sp, #64                 @ sp -= 64 (save area for q4-q7)
    sub r3, r3, r2                  @ r3 = srcstride - width
    vst1.64 {d8-d11}, [sp:256]      @ save q4,q5
    add r7, sp, #32                 @ r7 = sp + 32
    sub ip, ip, r2                  @ ip = dststride - width
    vst1.64 {d12-d15}, [r7:256]     @ save q6,q7
    lsl ip, #1                      @ ip = 2 * dststride - 2 * width
    mov r7, r2                      @ r7 = width
    sub r9, r9, #2                  @ r9 = height - 2 = number of middle lines


@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = counter
@ r8 = tmpreg
@ r9 = middle lines left (height - 2)
@ r10 = oldsp
@ ip = dstdiff (2 * dststride - 2 * width)

    @ first line
    neon_eagle2x_8_8_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0

    add r0, r0, r3
    add r4, r4, r3
    add r5, r5, r3
    add r1, r1, ip
    add r6, r6, ip

    @ The loop below tests its counter only after the body. Without this
    @ check a two-line image (r9 == 0) would wrap the counter and run the
    @ middle-line body about 2^32 times.
    cmp r9, #0
    beq .Lneon_eagle2x_8_8_last

    @ middle lines
    101:
    mov r7, r2

    neon_eagle2x_8_8_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0

    subS r9, r9, #1
    add r0, r0, r3
    add r4, r4, r3
    add r5, r5, r3
    add r1, r1, ip
    add r6, r6, ip
    bne 101b

    @ last line
.Lneon_eagle2x_8_8_last:
    mov r7, r2

    neon_eagle2x_8_8_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0

    @ Reload both halves of the save area while sp still covers it; once sp
    @ is restored the area lies below sp and may be overwritten by anything
    @ asynchronous that runs on this stack.
    add ip, sp, #32                 @ ip = sp + 32
    vld1.64 {d8-d11}, [sp:256]      @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7
    mov sp, r10                     @ sp = oldsp
    pop {r4-r10}
    bx lr

@ end procedure neon_eagle2x_8_8
| 111 | \r |
| 112 | \r |
neon_eagle2x_16_16:

@ 16-bit source -> 16-bit destination, 2x in both directions.
@ Every source line is handed to the neon_eagle2x_16_16_line macro (defined
@ in an included file), which writes two destination lines: one at dst and
@ one at dst + dststride.
@
@ r0 = const uint16_t *src
@ r1 = uint16_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address
@
@ height must be at least 2 (one "first" and one "last" line are always
@ processed); smaller heights are not handled.
@ d8-d15 (q4-q7) are saved on entry and restored on exit.

    ldr ip, [sp]                    @ ip = dststride
    push {r4-r10}                   @ 7 words, so the stack args move up by 28 bytes
    ldr r9, [sp, #(8*4)]            @ r9 = height (28 bytes of pushes + 4 for dststride)
    sub r4, r0, r3                  @ r4 = src - srcstride
    mov r10, sp                     @ oldsp = sp
    add r5, r0, r3                  @ r5 = src + srcstride
    bic sp, sp, #31                 @ align sp to 32 bytes
    add r6, r1, ip                  @ r6 = dst + dststride
    sub sp, sp, #64                 @ sp -= 64 (save area for q4-q7)
    sub r3, r3, r2, lsl #1          @ r3 = srcstride - 2 * width
    vst1.64 {d8-d11}, [sp:256]      @ save q4,q5
    add r7, sp, #32                 @ r7 = sp + 32
    sub ip, ip, r2, lsl #1          @ ip = dststride - 2 * width
    vst1.64 {d12-d15}, [r7:256]     @ save q6,q7
    lsl ip, #1                      @ ip = 2 * dststride - 4 * width
    mov r7, r2                      @ r7 = width
    sub r9, r9, #2                  @ r9 = height - 2 = number of middle lines

@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - 2 * width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = counter
@ r8 = tmpreg
@ r9 = middle lines left (height - 2)
@ r10 = oldsp
@ ip = dstdiff (2 * dststride - 4 * width)

    @ first line
    neon_eagle2x_16_16_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0

    add r0, r0, r3
    add r4, r4, r3
    add r5, r5, r3
    add r1, r1, ip
    add r6, r6, ip

    @ The loop below tests its counter only after the body. Without this
    @ check a two-line image (r9 == 0) would wrap the counter and run the
    @ middle-line body about 2^32 times.
    cmp r9, #0
    beq .Lneon_eagle2x_16_16_last

    @ middle lines
    101:
    mov r7, r2

    neon_eagle2x_16_16_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0

    subS r9, r9, #1
    add r0, r0, r3
    add r4, r4, r3
    add r5, r5, r3
    add r1, r1, ip
    add r6, r6, ip
    bne 101b

    @ last line
.Lneon_eagle2x_16_16_last:
    mov r7, r2

    neon_eagle2x_16_16_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0

    @ Reload both halves of the save area while sp still covers it; once sp
    @ is restored the area lies below sp and may be overwritten by anything
    @ asynchronous that runs on this stack.
    add ip, sp, #32                 @ ip = sp + 32
    vld1.64 {d8-d11}, [sp:256]      @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7
    mov sp, r10                     @ sp = oldsp
    pop {r4-r10}
    bx lr

@ end procedure neon_eagle2x_16_16
| 190 | \r |
| 191 | \r |
neon_eagle2x_8_16:

@ 8-bit paletted source -> 16-bit destination, 2x in both directions.
@ Each source line is first expanded through the palette into one of three
@ temporary lines on the stack (neon_normal1x_8_16_line); the temporary
@ lines are then fed as previous/current/next to neon_eagle2x_16_16_line,
@ which writes two destination lines (dst and dst + dststride). Both macros
@ are defined in included files.
@
@ r0 = const uint8_t *src
@ r1 = uint16_t *dst
@ r2 = const uint32_t *palette
@ r3 = unsigned int width (pixels)
@ [sp] = unsigned int srcstride (bytes)
@ [sp+4] = unsigned int dststride (bytes)
@ [sp+8] = unsigned int height
@ lr = return address
@
@ height must be at least 2 (two source lines are always converted before
@ any output is produced); smaller heights are not handled.
@ d8-d15 (q4-q7) are saved on entry and restored on exit.

@ three temporary lines

    ldr ip, [sp]                    @ ip = srcstride
    push {r4-r11,lr}                @ 9 words, so the stack args move up by 36 bytes
    ldr r4, [sp, #(4*10)]           @ r4 = dststride
    ldr r5, [sp, #(4*11)]           @ r5 = height
    mov r6, sp                      @ r6 = sp
    sub ip, ip, r3                  @ ip = srcstride - width
    bic sp, sp, #31                 @ align sp to 32 bytes
    sub r7, r4, r3, lsl #1          @ r7 = dststride - 2 * width
    sub sp, sp, r3, lsl #1          @ sp -= 2 * width
    sub r5, r5, #2                  @ height -= 2
    mov r10, sp                     @ tmpline3 = sp
    lsl r7, #1                      @ r7 = 2 * dststride - 4 * width
    bic sp, sp, #31                 @ align sp to 32 bytes
    sub sp, sp, r3, lsl #1          @ sp -= 2 * width
    mov r11, sp                     @ tmpline2 = sp
    bic sp, sp, #31                 @ align sp to 32 bytes
    sub sp, sp, r3, lsl #1          @ sp -= 2 * width
    mov lr, sp                      @ tmpline1 = sp
    bic sp, sp, #31                 @ align sp to 32 bytes

    @ Lower sp over the whole frame (36 bytes of locals + 64 bytes for
    @ q4-q7) before writing to it, so nothing is ever stored below sp.
    sub sp, sp, #(36 + 64)          @ sp -= (36 + 64)
    add r8, sp, #36                 @ r8 = sp + 36 (32-byte aligned)
    vst1.64 {d8-d11}, [r8:256]      @ save q4,q5
    add r9, sp, #(36 + 32)          @ r9 = sp + 68 (32-byte aligned)
    vst1.64 {d12-d15}, [r9:256]     @ save q6,q7
    str r6, [sp]                    @ oldsp = r6
    str r5, [sp, #4]                @ height = r5
    str ip, [sp, #8]                @ srcdiff = ip
    str r7, [sp, #12]               @ dstdiff = r7
    str r4, [sp, #16]               @ dststride = r4
    str lr, [sp, #20]               @ tmpline1 = lr
    str r11, [sp, #24]              @ tmpline2 = r11
    str r10, [sp, #28]              @ tmpline3 = r10
    str r3, [sp, #32]               @ width = r3

@ r0 = src
@ r1 = dst
@ r2 = palette
@ r3 = counter
@ r4 = dst2

@ r11 = bufptr1
@ ip = bufptr2
@ lr = bufptr3

@ [sp] = oldsp
@ [sp, #4] = middle lines left (height - 2)
@ [sp, #8] = srcdiff (srcstride - width)
@ [sp, #12] = dstdiff (2 * dststride - 4 * width)
@ [sp, #16] = dststride
@ [sp, #20] = tmpline1
@ [sp, #24] = tmpline2
@ [sp, #28] = tmpline3
@ [sp, #32] = width
@ [sp, #36] .. [sp, #99] = saved q4-q7

@ lr = tmpline1
@ r3 = counter

    @ first line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr r7, [sp, #8]                @ r7 = srcdiff
    ldr r3, [sp, #32]               @ counter = width
    ldr lr, [sp, #24]               @ bufptr3 = tmpline2
    add r0, r0, r7                  @ src += srcdiff

    @ second line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr r9, [sp, #16]               @ r9 = dststride
    ldr r3, [sp, #32]               @ counter = width
    ldr ip, [sp, #20]               @ bufptr2 = tmpline1
    ldr lr, [sp, #24]               @ bufptr3 = tmpline2
    add r4, r1, r9                  @ dst2 = dst + dststride

    @ first temporary line
    neon_eagle2x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, 1, 0

    ldr r7, [sp, #8]                @ r7 = srcdiff
    ldr r8, [sp, #12]               @ r8 = dstdiff
    ldr r3, [sp, #32]               @ counter = width
    ldr lr, [sp, #28]               @ bufptr3 = tmpline3
    add r0, r0, r7                  @ src += srcdiff
    add r1, r1, r8                  @ dst += dstdiff

    @ The loop below tests its counter only after the body. Without this
    @ check a two-line image (stored height - 2 == 0) would read a third
    @ source line that does not exist and wrap the counter.
    ldr r6, [sp, #4]                @ r6 = middle lines left
    cmp r6, #0
    beq .Lneon_eagle2x_8_16_last

    100:

    @ line n+1
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr r9, [sp, #16]               @ r9 = dststride
    ldr r11, [sp, #20]              @ bufptr1 = tmpline1
    ldr ip, [sp, #24]               @ bufptr2 = tmpline2
    ldr lr, [sp, #28]               @ bufptr3 = tmpline3
    add r4, r1, r9                  @ dst2 = dst + dststride
    ldr r3, [sp, #32]               @ counter = width
    @ rotate the three buffers for the next iteration
    str r11, [sp, #28]              @ tmpline3 = bufptr1
    str ip, [sp, #20]               @ tmpline1 = bufptr2
    str lr, [sp, #24]               @ tmpline2 = bufptr3

    @ temporary line n
    neon_eagle2x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, 1, 0

    ldr r6, [sp, #4]                @ r6 = height
    ldr r7, [sp, #8]                @ r7 = srcdiff
    ldr r8, [sp, #12]               @ r8 = dstdiff
    ldr r3, [sp, #32]               @ counter = width
    subS r6, r6, #1                 @ height--
    ldr lr, [sp, #28]               @ bufptr3 = tmpline3
    add r0, r0, r7                  @ src += srcdiff
    add r1, r1, r8                  @ dst += dstdiff
    str r6, [sp, #4]                @ height = r6
    bne 100b


.Lneon_eagle2x_8_16_last:
    ldr r9, [sp, #16]               @ r9 = dststride
    ldr r11, [sp, #20]              @ bufptr1 = tmpline1
    ldr ip, [sp, #24]               @ bufptr2 = tmpline2
    add r4, r1, r9                  @ dst2 = dst + dststride

    @ last temporary line
    neon_eagle2x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, 1, 0


    @ Reload q4-q7 while sp still covers the frame; once sp is restored the
    @ save area lies below sp and may be overwritten by anything
    @ asynchronous that runs on this stack.
    add r6, sp, #36                 @ r6 = sp + 36
    vld1.64 {d8-d11}, [r6:256]      @ restore q4,q5
    add ip, r6, #32                 @ ip = r6 + 32
    vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7
    ldr sp, [sp]                    @ sp = oldsp
    pop {r4-r11,lr}
    bx lr

@ end procedure neon_eagle2x_8_16
| 337 | \r |