| 1 | @@\r |
| 2 | @@ Copyright (C) 2012 Roman Pauer\r |
| 3 | @@\r |
| 4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r |
| 5 | @@ this software and associated documentation files (the "Software"), to deal in\r |
| 6 | @@ the Software without restriction, including without limitation the rights to\r |
| 7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r |
| 8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r |
| 9 | @@ so, subject to the following conditions:\r |
| 10 | @@\r |
| 11 | @@ The above copyright notice and this permission notice shall be included in all\r |
| 12 | @@ copies or substantial portions of the Software.\r |
| 13 | @@\r |
| 14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r |
| 15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r |
| 16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r |
| 17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r |
| 18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r |
| 19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r |
| 20 | @@ SOFTWARE.\r |
| 21 | @@\r |
| 22 | \r |
| 23 | \r |
| 24 | \r |
@ ---------------------------------------------------------------------------
@ _neon_normalxx_8_16_line_middle
@
@ Shared body of the 16-pixel inner loops: converts sixteen 8-bit palette
@ indices into sixteen 16-bit colours and leaves them in NEON registers.
@ The macro performs no store and no branch - the invoking macro stores the
@ result and branches on the flags left by the final cmp.
@
@ Parameters
@   src        pointer to the indices, read as four words, advanced by 16
@   dst        not referenced by this macro
@   pal        palette base, indexed as pal + index * 4, only the low
@              halfword of each entry ends up in the output
@   counter    remaining pixel count, decremented by 16
@   reg1-reg9  scratch core registers, all overwritten
@   dststride  not referenced by this macro
@   dA, dB     D registers receiving colours 8-11 and 12-15
@
@ Results
@   d16 = colours 0-3, d17 = colours 4-7, dA = colours 8-11,
@   dB  = colours 12-15
@   flags = result of cmp counter, #16 taken after the decrement, so HS
@           means at least 16 more pixels remain
@
@ The scalar lookups and the NEON transfers are interleaved by hand, so the
@ instruction order below is deliberate.
@ ---------------------------------------------------------------------------
.macro _neon_normalxx_8_16_line_middle src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride, dA, dB
    @ ---- fetch sixteen indices as four words ----
    ldr     \reg1, [\src]                   @ indices  0-3
    ldr     \reg2, [\src, #4]               @ indices  4-7
    ldr     \reg3, [\src, #8]               @ indices  8-11
    ldr     \reg4, [\src, #12]              @ indices 12-15

    @ ---- look up colours 0-5 ----
    ubfx    \reg5, \reg1, #0, #8            @ index 0
    ldr     \reg5, [\pal, \reg5, lsl #2]    @ colour 0
    ubfx    \reg6, \reg1, #8, #8            @ index 1
    ldr     \reg6, [\pal, \reg6, lsl #2]    @ colour 1
    ubfx    \reg7, \reg1, #16, #8           @ index 2
    ldr     \reg7, [\pal, \reg7, lsl #2]    @ colour 2
    lsr     \reg1, \reg1, #24               @ index 3
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour 3
    ubfx    \reg8, \reg2, #0, #8            @ index 4
    ldr     \reg8, [\pal, \reg8, lsl #2]    @ colour 4
    ubfx    \reg9, \reg2, #8, #8            @ index 5
    ldr     \reg9, [\pal, \reg9, lsl #2]    @ colour 5

    @ ---- pack colours 0-3 into d16 while fetching colours 6-10 ----
    bfi     \reg5, \reg6, #16, #16          @ reg5 = colour 1 : colour 0
    bfi     \reg7, \reg1, #16, #16          @ reg7 = colour 3 : colour 2
    ubfx    \reg6, \reg2, #16, #8           @ index 6
    vmov    d16, \reg5, \reg7               @ d16 = colours 0-3
    lsr     \reg2, \reg2, #24               @ index 7
    ldr     \reg6, [\pal, \reg6, lsl #2]    @ colour 6
    bfi     \reg8, \reg9, #16, #16          @ reg8 = colour 5 : colour 4
    ldr     \reg2, [\pal, \reg2, lsl #2]    @ colour 7
    ubfx    \reg1, \reg3, #0, #8            @ index 8
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour 8
    ubfx    \reg5, \reg3, #8, #8            @ index 9
    ldr     \reg5, [\pal, \reg5, lsl #2]    @ colour 9
    ubfx    \reg7, \reg3, #16, #8           @ index 10
    ldr     \reg7, [\pal, \reg7, lsl #2]    @ colour 10

    @ ---- pack colours 4-7 into d17 while fetching colours 11-15 ----
    bfi     \reg6, \reg2, #16, #16          @ reg6 = colour 7 : colour 6
    vmov    d17, \reg8, \reg6               @ d17 = colours 4-7
    lsr     \reg3, \reg3, #24               @ index 11
    ldr     \reg3, [\pal, \reg3, lsl #2]    @ colour 11
    ubfx    \reg2, \reg4, #0, #8            @ index 12
    ldr     \reg2, [\pal, \reg2, lsl #2]    @ colour 12
    ubfx    \reg6, \reg4, #8, #8            @ index 13
    ldr     \reg6, [\pal, \reg6, lsl #2]    @ colour 13
    ubfx    \reg8, \reg4, #16, #8           @ index 14
    ldr     \reg8, [\pal, \reg8, lsl #2]    @ colour 14
    lsr     \reg4, \reg4, #24               @ index 15
    ldr     \reg4, [\pal, \reg4, lsl #2]    @ colour 15

    @ ---- pack colours 8-15 into dA and dB, update loop state ----
    bfi     \reg1, \reg5, #16, #16          @ reg1 = colour 9 : colour 8
    bfi     \reg7, \reg3, #16, #16          @ reg7 = colour 11 : colour 10
    bfi     \reg2, \reg6, #16, #16          @ reg2 = colour 13 : colour 12
    vmov    \dA, \reg1, \reg7               @ dA = colours 8-11
    sub     \counter, \counter, #16         @ sixteen pixels consumed
    bfi     \reg8, \reg4, #16, #16          @ reg8 = colour 15 : colour 14
    add     \src, \src, #16                 @ step over the sixteen indices
    vmov    \dB, \reg2, \reg8               @ dB = colours 12-15
    cmp     \counter, #16                   @ HS when another full block remains
.endm
| 104 | \r |
@ ---------------------------------------------------------------------------
@ neon_normal1x_8_16_line
@
@ Converts one row of 8-bit palette indices to 16-bit pixels at 1:1 scale.
@
@ Parameters
@   src        pointer to the indices, advanced past the row
@   dst        pointer to the 16-bit output, advanced past the row
@   pal        palette base, indexed as pal + index * 4, only the low
@              halfword of each entry is stored
@   counter    number of pixels in the row, consumed (its final value is
@              not meaningful)
@   reg1-reg9  scratch core registers, all may be overwritten
@
@ Also overwrites d16-d19 and the condition flags.
@
@ Stages: 1-3 head pixels until src is word aligned, blocks of 16 through
@ _neon_normalxx_8_16_line_middle, blocks of 4, then 1-3 trailing pixels.
@ The head issues a word load from the still unaligned src, so unaligned
@ word loads must be permitted.
@
@ Short rows: the 16-pixel body used to run before counter was tested, and
@ the head could consume more pixels than counter held. Either case wrapped
@ counter below zero, after which the unsigned loop tests kept looping.
@ Both stages are now guarded. The guards reuse label numbers this macro
@ already owns (numeric local labels may be defined more than once, Nf
@ binds to the next definition and Nb to the previous one), so no new label
@ number is exposed to the invoking code.
@ ---------------------------------------------------------------------------
.macro neon_normal1x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9
    @ rows shorter than 4 pixels go straight to the tail dispatch, this also
    @ keeps the word load in the head inside the row
    cmp     \counter, #4
    blo     20f

    @ ---- head: 1-3 pixels until src is word aligned ----
    ands    \reg5, \src, #3                 @ reg5 = src & 3
    beq     10f                             @ already aligned

    ldr     \reg1, [\src]                   @ word holding the head indices
    rsb     \reg5, \reg5, #4                @ reg5 = head pixel count (1-3)
    add     \src, \src, \reg5               @ src is word aligned from here on
    sub     \counter, \counter, \reg5
    subs    \reg5, \reg5, #1                @ NE when a second pixel exists
    ubfx    \reg2, \reg1, #0, #8            @ index 0
    ubfxne  \reg3, \reg1, #8, #8            @ index 1
    ldr     \reg2, [\pal, \reg2, lsl #2]    @ colour 0
    ldrne   \reg3, [\pal, \reg3, lsl #2]    @ colour 1
    strh    \reg2, [\dst]                   @ pixel 0
    strneh  \reg3, [\dst, #2]!              @ pixel 1, dst steps one pixel
    subnes  \reg5, \reg5, #1                @ NE when a third pixel exists
    ubfxne  \reg4, \reg1, #16, #8           @ index 2
    add     \dst, \dst, #2                  @ step past pixel 0
    ldrne   \reg4, [\pal, \reg4, lsl #2]    @ colour 2
    strneh  \reg4, [\dst], #2               @ pixel 2, dst steps one pixel

    @ ---- middle: 16 pixels per iteration ----
10:
    cmp     \counter, #16                   @ guard: need one full block
    blo     20f
10:
    _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, , d18, d19
    vst1.16 {d16-d19}, [\dst]!              @ 16 pixels out, dst += 32 bytes
    bhs     10b                             @ flags come from the middle macro

    @ ---- tail dispatch: 0-15 pixels remain ----
20:
    cmp     \counter, #0
    beq     40f
    cmp     \counter, #4
    blo     30f

    @ ---- 4 pixels per iteration ----
20:
    ldr     \reg1, [\src]                   @ indices 0-3
    sub     \counter, \counter, #4
    add     \src, \src, #4
    add     \dst, \dst, #(2*4)              @ dst steps four pixels up front
    ubfx    \reg2, \reg1, #0, #8            @ index 0
    cmp     \counter, #4                    @ flags for the loop branch below
    ldr     \reg2, [\pal, \reg2, lsl #2]    @ colour 0
    ubfx    \reg3, \reg1, #8, #8            @ index 1
    ldr     \reg3, [\pal, \reg3, lsl #2]    @ colour 1
    ubfx    \reg4, \reg1, #16, #8           @ index 2
    ldr     \reg4, [\pal, \reg4, lsl #2]    @ colour 2
    lsr     \reg1, \reg1, #24               @ index 3
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour 3
    strh    \reg2, [\dst, #-8]              @ pixel 0
    strh    \reg3, [\dst, #-6]              @ pixel 1
    strh    \reg4, [\dst, #-4]              @ pixel 2
    strh    \reg1, [\dst, #-2]              @ pixel 3
    bhs     20b

    cmp     \counter, #0
    beq     40f

    @ ---- last 1-3 pixels ----
30:
    ldrb    \reg1, [\src]                   @ index 0
    subs    \counter, \counter, #1          @ NE when a second pixel exists
    ldrneb  \reg2, [\src, #1]!              @ index 1, src steps one byte
    add     \src, \src, #1                  @ step past index 0
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour 0
    ldrne   \reg2, [\pal, \reg2, lsl #2]    @ colour 1
    strh    \reg1, [\dst]                   @ pixel 0
    strneh  \reg2, [\dst, #2]!              @ pixel 1, dst steps one pixel
    subnes  \counter, \counter, #1          @ NE when a third pixel exists
    ldrneb  \reg3, [\src], #1               @ index 2, src steps one byte
    add     \dst, \dst, #2                  @ step past pixel 0
    ldrne   \reg3, [\pal, \reg3, lsl #2]    @ colour 2
    strneh  \reg3, [\dst], #2               @ pixel 2, dst steps one pixel

40:
.endm
| 214 | \r |
@ ---------------------------------------------------------------------------
@ neon_normal2x_8_16_line
@
@ Converts one row of 8-bit palette indices to 16-bit pixels at 2x scale:
@ every source pixel becomes a 2x2 block spread over two output rows.
@
@ Parameters
@   src        pointer to the indices, advanced past the row
@   dst        pointer to the first output row, advanced by 4 bytes per
@              source pixel
@   pal        palette base, indexed as pal + index * 4, only the low
@              halfword of each entry is stored
@   counter    number of source pixels in the row, consumed
@   reg1-reg9  scratch core registers, all may be overwritten
@   dststride  byte offset from one output row to the next
@
@ Also overwrites q8-q11 and the condition flags.
@
@ Stages: 1-3 head pixels until src is word aligned, blocks of 16 through
@ _neon_normalxx_8_16_line_middle, blocks of 4, then 1-3 trailing pixels.
@
@ Short rows: the 16-pixel body used to run before counter was tested, and
@ the head could consume more pixels than counter held. Either case wrapped
@ counter below zero, after which the unsigned loop tests kept looping.
@ Both stages are now guarded. The guards reuse label numbers this macro
@ already owns (numeric local labels may be defined more than once, Nf
@ binds to the next definition and Nb to the previous one), so no new label
@ number is exposed to the invoking code.
@ ---------------------------------------------------------------------------
.macro neon_normal2x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
    @ rows shorter than 4 pixels go straight to the tail dispatch
    cmp     \counter, #4
    blo     20f

    @ ---- head: 1-3 pixels until src is word aligned ----
    ands    \reg5, \src, #3                 @ reg5 = src & 3
    beq     10f                             @ already aligned

    rsb     \reg5, \reg5, #4                @ reg5 = head pixel count (1-3)
1:
    ldrb    \reg1, [\src], #1               @ next index, src steps one byte
    add     \reg2, \dst, \dststride         @ reg2 = same column, second row
    add     \dst, \dst, #4                  @ dst steps two output pixels
    sub     \counter, \counter, #1
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour
    subs    \reg5, \reg5, #1                @ flags for the loop branch below
    strh    \reg1, [\dst, #-4]              @ row 0, left
    strh    \reg1, [\dst, #-2]              @ row 0, right
    strh    \reg1, [\reg2]                  @ row 1, left
    strh    \reg1, [\reg2, #2]              @ row 1, right
    bne     1b

    @ ---- middle: 16 source pixels per iteration ----
10:
    cmp     \counter, #16                   @ guard: need one full block
    blo     20f
10:
    _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d20, d21

    @ q8 = colours 0-7 and q10 = colours 8-15. Each is duplicated so that
    @ the interleaving vst2 emits every pixel twice.
    vmov    q9, q8
    add     \reg1, \dst, \dststride         @ reg1 = second output row
    vmov    q11, q10
    vst2.16 {q8,q9}, [\dst]!                @ row 0, source pixels 0-7
    vst2.16 {q10,q11}, [\dst]!              @ row 0, source pixels 8-15
    vst2.16 {q8,q9}, [\reg1]!               @ row 1, source pixels 0-7
    vst2.16 {q10,q11}, [\reg1]!             @ row 1, source pixels 8-15
    bhs     10b                             @ flags come from the middle macro

    @ ---- tail dispatch: 0-15 pixels remain ----
20:
    cmp     \counter, #0
    beq     40f
    cmp     \counter, #4
    blo     30f

    @ ---- 4 source pixels per iteration ----
20:
    ldr     \reg1, [\src]                   @ indices 0-3
    sub     \counter, \counter, #4
    add     \src, \src, #4
    ubfx    \reg2, \reg1, #0, #8            @ index 0
    cmp     \counter, #4                    @ flags for the loop branch below
    ldr     \reg2, [\pal, \reg2, lsl #2]    @ colour 0
    ubfx    \reg3, \reg1, #8, #8            @ index 1
    ldr     \reg3, [\pal, \reg3, lsl #2]    @ colour 1
    ubfx    \reg4, \reg1, #16, #8           @ index 2
    ldr     \reg4, [\pal, \reg4, lsl #2]    @ colour 2
    lsr     \reg1, \reg1, #24               @ index 3
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour 3
    add     \reg5, \dst, \dststride         @ reg5 = second output row
    bfi     \reg2, \reg3, #16, #16          @ reg2 = colour 1 : colour 0
    vmov.32 d16[0], \reg2
    bfi     \reg4, \reg1, #16, #16          @ reg4 = colour 3 : colour 2
    vmov.32 d16[1], \reg4                   @ d16 = colours 0-3
    vmov    d17, d16                        @ duplicate for the interleave
    vst2.16 {d16,d17}, [\dst]!              @ row 0, dst += 16 bytes
    vst2.16 {d16,d17}, [\reg5]              @ row 1
    bhs     20b

    cmp     \counter, #0
    beq     40f

    @ ---- last 1-3 pixels ----
30:
    ldrb    \reg1, [\src], #1               @ next index, src steps one byte
    add     \reg2, \dst, \dststride         @ reg2 = same column, second row
    add     \dst, \dst, #4                  @ dst steps two output pixels
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour
    subs    \counter, \counter, #1          @ flags for the loop branch below
    strh    \reg1, [\dst, #-4]              @ row 0, left
    strh    \reg1, [\dst, #-2]              @ row 0, right
    strh    \reg1, [\reg2]                  @ row 1, left
    strh    \reg1, [\reg2, #2]              @ row 1, right
    bne     30b

40:
.endm
| 327 | \r |
@ ---------------------------------------------------------------------------
@ neon_normal3x_8_16_line
@
@ Converts one row of 8-bit palette indices to 16-bit pixels at 3x scale:
@ every source pixel becomes a 3x3 block spread over three output rows.
@
@ Parameters
@   src        pointer to the indices, advanced past the row
@   dst        pointer to the first output row, advanced by 6 bytes per
@              source pixel
@   pal        palette base, indexed as pal + index * 4, only the low
@              halfword of each entry is stored
@   counter    number of source pixels in the row, consumed
@   reg1-reg9  scratch core registers, all may be overwritten
@   dststride  byte offset from one output row to the next
@
@ Also overwrites q8-q13 and the condition flags.
@
@ Stages: 1-3 head pixels until src is word aligned, blocks of 16 through
@ _neon_normalxx_8_16_line_middle, blocks of 4, then 1-3 trailing pixels.
@ The single-pixel paths store a word at row + 2, which is word aligned
@ only for some columns - presumably unaligned word stores are permitted
@ on the target (TODO confirm).
@
@ Short rows: the 16-pixel body used to run before counter was tested, and
@ the head could consume more pixels than counter held. Either case wrapped
@ counter below zero, after which the unsigned loop tests kept looping.
@ Both stages are now guarded. The guards reuse label numbers this macro
@ already owns (numeric local labels may be defined more than once, Nf
@ binds to the next definition and Nb to the previous one), so no new label
@ number is exposed to the invoking code.
@ ---------------------------------------------------------------------------
.macro neon_normal3x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
    @ rows shorter than 4 pixels go straight to the tail dispatch
    cmp     \counter, #4
    blo     20f

    @ ---- head: 1-3 pixels until src is word aligned ----
    ands    \reg5, \src, #3                 @ reg5 = src & 3
    beq     10f                             @ already aligned

    rsb     \reg5, \reg5, #4                @ reg5 = head pixel count (1-3)
1:
    ldrb    \reg1, [\src], #1               @ next index, src steps one byte
    add     \reg2, \dst, \dststride         @ reg2 = same column, second row
    add     \reg3, \reg2, \dststride        @ reg3 = same column, third row
    add     \dst, \dst, #6                  @ dst steps three output pixels
    sub     \counter, \counter, #1
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour
    subs    \reg5, \reg5, #1                @ flags for the loop branch below
    strh    \reg1, [\dst, #-6]              @ row 0, pixel 0
    strh    \reg1, [\dst, #-4]              @ row 0, pixel 1
    strh    \reg1, [\dst, #-2]              @ row 0, pixel 2
    bfi     \reg1, \reg1, #16, #16          @ reg1 = colour : colour
    strh    \reg1, [\reg2]                  @ row 1, pixel 0
    str     \reg1, [\reg2, #2]              @ row 1, pixels 1-2
    strh    \reg1, [\reg3]                  @ row 2, pixel 0
    str     \reg1, [\reg3, #2]              @ row 2, pixels 1-2
    bne     1b

    @ ---- middle: 16 source pixels per iteration ----
10:
    cmp     \counter, #16                   @ guard: need one full block
    blo     20f
10:
    _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d22, d23

    @ q8 = colours 0-7 and q11 = colours 8-15. Each is copied twice so that
    @ the interleaving vst3 emits every pixel three times.
    vmov    q9, q8
    add     \reg1, \dst, \dststride         @ reg1 = dst + dststride
    vmov    q10, q8
    add     \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride
    vmov    q12, q11
    vst3.16 {d16,d18,d20}, [\dst]!          @ row 0, source pixels 0-3
    vmov    q13, q11
    vst3.16 {d17,d19,d21}, [\dst]!          @ row 0, source pixels 4-7
    vst3.16 {d22,d24,d26}, [\dst]!          @ row 0, source pixels 8-11
    vst3.16 {d23,d25,d27}, [\dst]!          @ row 0, source pixels 12-15

    vst3.16 {d16,d18,d20}, [\reg1]!         @ row 1, source pixels 0-3
    vst3.16 {d17,d19,d21}, [\reg1]!         @ row 1, source pixels 4-7
    vst3.16 {d22,d24,d26}, [\reg1]!         @ row 1, source pixels 8-11
    vst3.16 {d23,d25,d27}, [\reg1]!         @ row 1, source pixels 12-15

    vst3.16 {d16,d18,d20}, [\reg2]!         @ row 2, source pixels 0-3
    vst3.16 {d17,d19,d21}, [\reg2]!         @ row 2, source pixels 4-7
    vst3.16 {d22,d24,d26}, [\reg2]!         @ row 2, source pixels 8-11
    vst3.16 {d23,d25,d27}, [\reg2]!         @ row 2, source pixels 12-15
    bhs     10b                             @ flags come from the middle macro

    @ ---- tail dispatch: 0-15 pixels remain ----
20:
    cmp     \counter, #0
    beq     40f
    cmp     \counter, #4
    blo     30f

    @ ---- 4 source pixels per iteration ----
20:
    ldr     \reg1, [\src]                   @ indices 0-3
    sub     \counter, \counter, #4
    add     \src, \src, #4
    ubfx    \reg2, \reg1, #0, #8            @ index 0
    cmp     \counter, #4                    @ flags for the loop branch below
    ldr     \reg2, [\pal, \reg2, lsl #2]    @ colour 0
    ubfx    \reg3, \reg1, #8, #8            @ index 1
    ldr     \reg3, [\pal, \reg3, lsl #2]    @ colour 1
    ubfx    \reg4, \reg1, #16, #8           @ index 2
    ldr     \reg4, [\pal, \reg4, lsl #2]    @ colour 2
    lsr     \reg1, \reg1, #24               @ index 3
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour 3
    add     \reg5, \dst, \dststride         @ reg5 = second output row
    bfi     \reg2, \reg3, #16, #16          @ reg2 = colour 1 : colour 0
    vmov.32 d16[0], \reg2
    add     \reg6, \reg5, \dststride        @ reg6 = third output row
    bfi     \reg4, \reg1, #16, #16          @ reg4 = colour 3 : colour 2
    vmov.32 d16[1], \reg4                   @ d16 = colours 0-3
    vmov    d17, d16                        @ two copies for the interleave
    vmov    d18, d16
    vst3.16 {d16,d17,d18}, [\dst]!          @ row 0, dst += 24 bytes
    vst3.16 {d16,d17,d18}, [\reg5]          @ row 1
    vst3.16 {d16,d17,d18}, [\reg6]          @ row 2
    bhs     20b

    cmp     \counter, #0
    beq     40f

    @ ---- last 1-3 pixels ----
30:
    ldrb    \reg1, [\src], #1               @ next index, src steps one byte
    add     \reg2, \dst, \dststride         @ reg2 = same column, second row
    add     \reg3, \reg2, \dststride        @ reg3 = same column, third row
    add     \dst, \dst, #6                  @ dst steps three output pixels
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour
    subs    \counter, \counter, #1          @ flags for the loop branch below
    strh    \reg1, [\dst, #-6]              @ row 0, pixel 0
    strh    \reg1, [\dst, #-4]              @ row 0, pixel 1
    strh    \reg1, [\dst, #-2]              @ row 0, pixel 2
    bfi     \reg1, \reg1, #16, #16          @ reg1 = colour : colour
    strh    \reg1, [\reg2]                  @ row 1, pixel 0
    str     \reg1, [\reg2, #2]              @ row 1, pixels 1-2
    strh    \reg1, [\reg3]                  @ row 2, pixel 0
    str     \reg1, [\reg3, #2]              @ row 2, pixels 1-2
    bne     30b

40:
.endm
| 482 | \r |
@ ---------------------------------------------------------------------------
@ neon_normal4x_8_16_line
@
@ Converts one row of 8-bit palette indices to 16-bit pixels at 4x scale:
@ every source pixel becomes a 4x4 block spread over four output rows.
@
@ Parameters
@   src        pointer to the indices, advanced past the row
@   dst        pointer to the first output row, advanced by 8 bytes per
@              source pixel
@   pal        palette base, indexed as pal + index * 4, only the low
@              halfword of each entry is stored
@   counter    number of source pixels in the row, consumed
@   reg1-reg9  scratch core registers, all may be overwritten
@   dststride  byte offset from one output row to the next
@
@ Also overwrites q8-q15 and the condition flags.
@
@ Stages: 1-3 head pixels until src is word aligned, blocks of 16 through
@ _neon_normalxx_8_16_line_middle, blocks of 4, then 1-3 trailing pixels.
@
@ Short rows: the 16-pixel body used to run before counter was tested, and
@ the head could consume more pixels than counter held. Either case wrapped
@ counter below zero, after which the unsigned loop tests kept looping.
@ Both stages are now guarded. The guards reuse label numbers this macro
@ already owns (numeric local labels may be defined more than once, Nf
@ binds to the next definition and Nb to the previous one), so no new label
@ number is exposed to the invoking code.
@ ---------------------------------------------------------------------------
.macro neon_normal4x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
    @ rows shorter than 4 pixels go straight to the tail dispatch
    cmp     \counter, #4
    blo     20f

    @ ---- head: 1-3 pixels until src is word aligned ----
    ands    \reg5, \src, #3                 @ reg5 = src & 3
    beq     10f                             @ already aligned

    rsb     \reg5, \reg5, #4                @ reg5 = head pixel count (1-3)
1:
    ldrb    \reg1, [\src], #1               @ next index, src steps one byte
    add     \reg2, \dst, \dststride         @ reg2 = same column, second row
    add     \reg3, \reg2, \dststride        @ reg3 = same column, third row
    add     \dst, \dst, #8                  @ dst steps four output pixels
    sub     \counter, \counter, #1
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour
    add     \reg4, \reg3, \dststride        @ reg4 = same column, fourth row
    strh    \reg1, [\dst, #-8]              @ row 0, pixel 0
    subs    \reg5, \reg5, #1                @ flags for the loop branch below
    strh    \reg1, [\dst, #-6]              @ row 0, pixel 1
    bfi     \reg1, \reg1, #16, #16          @ reg1 = colour : colour
    str     \reg1, [\dst, #-4]              @ row 0, pixels 2-3
    str     \reg1, [\reg2]                  @ row 1, pixels 0-1
    str     \reg1, [\reg2, #4]              @ row 1, pixels 2-3
    str     \reg1, [\reg3]                  @ row 2, pixels 0-1
    str     \reg1, [\reg3, #4]              @ row 2, pixels 2-3
    str     \reg1, [\reg4]                  @ row 3, pixels 0-1
    str     \reg1, [\reg4, #4]              @ row 3, pixels 2-3
    bne     1b

    @ ---- middle: 16 source pixels per iteration ----
10:
    cmp     \counter, #16                   @ guard: need one full block
    blo     20f
10:
    _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d24, d25

    @ q8 = colours 0-7 and q12 = colours 8-15. Each is copied three times
    @ so that the interleaving vst4 emits every pixel four times.
    vmov    q9, q8
    add     \reg1, \dst, \dststride         @ reg1 = dst + dststride
    vmov    q10, q8
    add     \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride
    vmov    q11, q8
    add     \reg3, \reg1, \dststride,lsl #1 @ reg3 = dst + 3 * dststride
    vmov    q13, q12
    vst4.16 {d16,d18,d20,d22}, [\dst]!      @ row 0, source pixels 0-3
    vmov    q14, q12
    vmov    q15, q12
    vst4.16 {d17,d19,d21,d23}, [\dst]!      @ row 0, source pixels 4-7
    vst4.16 {d24,d26,d28,d30}, [\dst]!      @ row 0, source pixels 8-11
    vst4.16 {d25,d27,d29,d31}, [\dst]!      @ row 0, source pixels 12-15

    vst4.16 {d16,d18,d20,d22}, [\reg1]!     @ row 1, source pixels 0-3
    vst4.16 {d17,d19,d21,d23}, [\reg1]!     @ row 1, source pixels 4-7
    vst4.16 {d24,d26,d28,d30}, [\reg1]!     @ row 1, source pixels 8-11
    vst4.16 {d25,d27,d29,d31}, [\reg1]!     @ row 1, source pixels 12-15

    vst4.16 {d16,d18,d20,d22}, [\reg2]!     @ row 2, source pixels 0-3
    vst4.16 {d17,d19,d21,d23}, [\reg2]!     @ row 2, source pixels 4-7
    vst4.16 {d24,d26,d28,d30}, [\reg2]!     @ row 2, source pixels 8-11
    vst4.16 {d25,d27,d29,d31}, [\reg2]!     @ row 2, source pixels 12-15

    vst4.16 {d16,d18,d20,d22}, [\reg3]!     @ row 3, source pixels 0-3
    vst4.16 {d17,d19,d21,d23}, [\reg3]!     @ row 3, source pixels 4-7
    vst4.16 {d24,d26,d28,d30}, [\reg3]!     @ row 3, source pixels 8-11
    vst4.16 {d25,d27,d29,d31}, [\reg3]!     @ row 3, source pixels 12-15
    bhs     10b                             @ flags come from the middle macro

    @ ---- tail dispatch: 0-15 pixels remain ----
20:
    cmp     \counter, #0
    beq     40f
    cmp     \counter, #4
    blo     30f

    @ ---- 4 source pixels per iteration ----
20:
    ldr     \reg1, [\src]                   @ indices 0-3
    sub     \counter, \counter, #4
    add     \src, \src, #4
    ubfx    \reg2, \reg1, #0, #8            @ index 0
    cmp     \counter, #4                    @ flags for the loop branch below
    ldr     \reg2, [\pal, \reg2, lsl #2]    @ colour 0
    ubfx    \reg3, \reg1, #8, #8            @ index 1
    ldr     \reg3, [\pal, \reg3, lsl #2]    @ colour 1
    ubfx    \reg4, \reg1, #16, #8           @ index 2
    ldr     \reg4, [\pal, \reg4, lsl #2]    @ colour 2
    lsr     \reg1, \reg1, #24               @ index 3
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour 3
    add     \reg5, \dst, \dststride         @ reg5 = second output row
    bfi     \reg2, \reg3, #16, #16          @ reg2 = colour 1 : colour 0
    vmov.32 d16[0], \reg2
    add     \reg6, \reg5, \dststride        @ reg6 = third output row
    bfi     \reg4, \reg1, #16, #16          @ reg4 = colour 3 : colour 2
    add     \reg7, \reg6, \dststride        @ reg7 = fourth output row
    vmov.32 d16[1], \reg4                   @ d16 = colours 0-3
    vmov    d17, d16                        @ three copies for the interleave
    vmov    d18, d16
    vmov    d19, d16
    vst4.16 {d16,d17,d18,d19}, [\dst]!      @ row 0, dst += 32 bytes
    vst4.16 {d16,d17,d18,d19}, [\reg5]      @ row 1
    vst4.16 {d16,d17,d18,d19}, [\reg6]      @ row 2
    vst4.16 {d16,d17,d18,d19}, [\reg7]      @ row 3
    bhs     20b

    cmp     \counter, #0
    beq     40f

    @ ---- last 1-3 pixels ----
30:
    ldrb    \reg1, [\src], #1               @ next index, src steps one byte
    add     \reg2, \dst, \dststride         @ reg2 = same column, second row
    add     \reg3, \reg2, \dststride        @ reg3 = same column, third row
    add     \dst, \dst, #8                  @ dst steps four output pixels
    ldr     \reg1, [\pal, \reg1, lsl #2]    @ colour
    add     \reg4, \reg3, \dststride        @ reg4 = same column, fourth row
    strh    \reg1, [\dst, #-8]              @ row 0, pixel 0
    subs    \counter, \counter, #1          @ flags for the loop branch below
    strh    \reg1, [\dst, #-6]              @ row 0, pixel 1
    bfi     \reg1, \reg1, #16, #16          @ reg1 = colour : colour
    str     \reg1, [\dst, #-4]              @ row 0, pixels 2-3
    str     \reg1, [\reg2]                  @ row 1, pixels 0-1
    str     \reg1, [\reg2, #4]              @ row 1, pixels 2-3
    str     \reg1, [\reg3]                  @ row 2, pixels 0-1
    str     \reg1, [\reg3, #4]              @ row 2, pixels 2-3
    str     \reg1, [\reg4]                  @ row 3, pixels 0-1
    str     \reg1, [\reg4, #4]              @ row 3, pixels 2-3
    bne     30b

40:
.endm
| 665 | \r |