| 1 | @@\r |
| 2 | @@ Copyright (C) 2012 Roman Pauer\r |
| 3 | @@\r |
| 4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r |
| 5 | @@ this software and associated documentation files (the "Software"), to deal in\r |
| 6 | @@ the Software without restriction, including without limitation the rights to\r |
| 7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r |
| 8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r |
| 9 | @@ so, subject to the following conditions:\r |
| 10 | @@\r |
| 11 | @@ The above copyright notice and this permission notice shall be included in all\r |
| 12 | @@ copies or substantial portions of the Software.\r |
| 13 | @@\r |
| 14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r |
| 15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r |
| 16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r |
| 17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r |
| 18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r |
| 19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r |
| 20 | @@ SOFTWARE.\r |
| 21 | @@\r |
| 22 | \r |
| 23 | \r |
/*
 * A128 / A256 expand to the ":128" / ":256" suffixes when the file is
 * preprocessed by GCC 4.4 or newer, and to nothing on older compilers,
 * whose gas wants ',' before ':' and therefore cannot take the suffix.
 */
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
#define A128 :128
#define A256 :256
#else
#define A128
#define A256
#endif
| 32 | \r |
@ bgr1555_to_rgb565 dr0, dr1, t0, t1, t2, ar
@
@ Rearranges every 16-bit lane of dr0 and dr1 in place:
@     out[15:11] = in[4:0]
@     out[10:6]  = in[9:5]
@     out[5]     = 0
@     out[4:0]   = in[14:10]
@ (in[15] is discarded.)
@
@   dr0, dr1    NEON registers holding the pixels; overwritten with the result
@   t0, t1, t2  NEON scratch registers (t2 ends up holding the 0x07c0 mask)
@   ar          core scratch register (ends up holding 0x07c0)
.macro bgr1555_to_rgb565 dr0, dr1, t0, t1, t2, ar
    mov         \ar, #(0x1f << 6)       @ 0x07c0: bits 10-6 of each lane
    vshl.u16    \t0, \dr0, #11          @ t0[15:11] = in[4:0]
    vshl.u16    \t1, \dr1, #11
    vshl.u16    \dr0, \dr0, #1          @ dr0 = in << 1 (drops in[15])
    vshl.u16    \dr1, \dr1, #1
    vdup.16     \t2, \ar                @ broadcast the mask to every lane
    vsri.u16    \t0, \dr0, #11          @ t0[4:0] = in[14:10]
    vsri.u16    \t1, \dr1, #11
    vbif        \dr0, \t0, \t2          @ keep bits 10-6 of dr0, take the rest from t0
    vbif        \dr1, \t1, \t2
.endm
| 45 | \r |
@ _neon_normalxx_8_16_line_middle
@
@ Shared inner step of the neon_normalNx_8_16_line macros: reads 16 source
@ bytes, looks each one up in a table of 32-bit entries and packs the low
@ 16 bits of every entry into NEON registers.
@
@   src         pointer to the 8-bit indices; read with word loads,
@               advanced by 16
@   dst         not used by this macro
@   pal         table base; entry i is loaded from pal + 4*i
@   counter     decremented by 16
@   reg1-reg9   core scratch registers, all clobbered
@   dststride   not used by this macro
@   dA, dB      D registers receiving pixels 8-11 and 12-15
@
@ Results: d16 = pixels 0-3, d17 = pixels 4-7, dA = pixels 8-11,
@ dB = pixels 12-15.
@ The macro ends with "cmp counter, #16"; callers branch on those flags
@ (bhs) after their NEON stores, so nothing placed in between may alter
@ the condition flags.
@ The instruction order is kept exactly as scheduled by the original
@ author; only the spelling of the byte extractions differs.
.macro _neon_normalxx_8_16_line_middle src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride, dA, dB
    @ fetch the sixteen indices as four words
    ldr     \reg1, [\src]                   @ reg1 = src[0-3]

    ldr     \reg2, [\src, #4]               @ reg2 = src[4-7]

    ldr     \reg3, [\src, #8]               @ reg3 = src[8-11]

    ldr     \reg4, [\src, #12]              @ reg4 = src[12-15]
    uxtb    \reg5, \reg1                    @ reg5 = src[0]

    @ pixels 0-5
    ldr     \reg5, [\pal, \reg5, lsl #2]    @ reg5 = pal[src[0]]
    uxtb    \reg6, \reg1, ror #8            @ reg6 = src[1]

    ldr     \reg6, [\pal, \reg6, lsl #2]    @ reg6 = pal[src[1]]
    uxtb    \reg7, \reg1, ror #16           @ reg7 = src[2]

    ldr     \reg7, [\pal, \reg7, lsl #2]    @ reg7 = pal[src[2]]
    uxtb    \reg1, \reg1, ror #24           @ reg1 = src[3]

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[3]]
    uxtb    \reg8, \reg2                    @ reg8 = src[4]

    ldr     \reg8, [\pal, \reg8, lsl #2]    @ reg8 = pal[src[4]]
    uxtb    \reg9, \reg2, ror #8            @ reg9 = src[5]

    ldr     \reg9, [\pal, \reg9, lsl #2]    @ reg9 = pal[src[5]]
    bfi     \reg5, \reg6, #16, #16          @ reg5 = pixel 0 | pixel 1 << 16

    bfi     \reg7, \reg1, #16, #16          @ reg7 = pixel 2 | pixel 3 << 16
    uxtb    \reg6, \reg2, ror #16           @ reg6 = src[6]

    vmov    d16, \reg5, \reg7               @ d16 = pixels 0-3
    uxtb    \reg2, \reg2, ror #24           @ reg2 = src[7]

    @ pixels 6-10
    ldr     \reg6, [\pal, \reg6, lsl #2]    @ reg6 = pal[src[6]]
    bfi     \reg8, \reg9, #16, #16          @ reg8 = pixel 4 | pixel 5 << 16

    ldr     \reg2, [\pal, \reg2, lsl #2]    @ reg2 = pal[src[7]]
    uxtb    \reg1, \reg3                    @ reg1 = src[8]

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[8]]
    uxtb    \reg5, \reg3, ror #8            @ reg5 = src[9]

    ldr     \reg5, [\pal, \reg5, lsl #2]    @ reg5 = pal[src[9]]
    uxtb    \reg7, \reg3, ror #16           @ reg7 = src[10]

    ldr     \reg7, [\pal, \reg7, lsl #2]    @ reg7 = pal[src[10]]
    bfi     \reg6, \reg2, #16, #16          @ reg6 = pixel 6 | pixel 7 << 16

    vmov    d17, \reg8, \reg6               @ d17 = pixels 4-7
    uxtb    \reg3, \reg3, ror #24           @ reg3 = src[11]

    @ pixels 11-15
    ldr     \reg3, [\pal, \reg3, lsl #2]    @ reg3 = pal[src[11]]
    uxtb    \reg2, \reg4                    @ reg2 = src[12]

    ldr     \reg2, [\pal, \reg2, lsl #2]    @ reg2 = pal[src[12]]
    uxtb    \reg6, \reg4, ror #8            @ reg6 = src[13]

    ldr     \reg6, [\pal, \reg6, lsl #2]    @ reg6 = pal[src[13]]
    uxtb    \reg8, \reg4, ror #16           @ reg8 = src[14]

    ldr     \reg8, [\pal, \reg8, lsl #2]    @ reg8 = pal[src[14]]
    uxtb    \reg4, \reg4, ror #24           @ reg4 = src[15]

    ldr     \reg4, [\pal, \reg4, lsl #2]    @ reg4 = pal[src[15]]
    bfi     \reg1, \reg5, #16, #16          @ reg1 = pixel 8 | pixel 9 << 16

    bfi     \reg7, \reg3, #16, #16          @ reg7 = pixel 10 | pixel 11 << 16
    bfi     \reg2, \reg6, #16, #16          @ reg2 = pixel 12 | pixel 13 << 16

    vmov    \dA, \reg1, \reg7               @ dA = pixels 8-11
    sub     \counter, \counter, #16         @ 16 pixels consumed

    bfi     \reg8, \reg4, #16, #16          @ reg8 = pixel 14 | pixel 15 << 16
    add     \src, \src, #16                 @ step past the 16 indices

    vmov    \dB, \reg2, \reg8               @ dB = pixels 12-15
    cmp     \counter, #16                   @ flags for the caller's "bhs"
.endm
| 125 | \r |
@ neon_normal1x_8_16_line
@
@ Converts one line of 8-bit indices to 16-bit pixels (1:1 size) through a
@ table of 32-bit entries; the low 16 bits of each entry are stored.
@
@   src         pointer to the indices; advanced past the line
@   dst         pointer to the 16-bit output; advanced past the written pixels
@   pal         table base; entry i is loaded from pal + 4*i
@   counter     number of pixels in the line; 0 on exit
@   reg1-reg9   core scratch registers, all clobbered
@
@ Also clobbers d16-d19 and the condition flags.
@ Uses numeric local labels 10, 15, 20, 30 and 40.
@ Word loads from src may be unaligned (alignment prefix, and the tail
@ loop when a short line starts unaligned), so unaligned "ldr" must be
@ permitted.
.macro neon_normal1x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9
    @ The 16-pixel loop below always runs at least once and the alignment
    @ prefix may consume up to 3 pixels first.  A line shorter than 19
    @ pixels could therefore underflow the counter, so such lines skip
    @ both and are handled entirely by the tail code.
    cmp     \counter, #19
    blo     15f

    @ align src to 4 bytes
    ands    \reg5, \src, #3                 @ reg5 = src & 3
    beq     10f

    @ first 1-3 pixels
    ldr     \reg1, [\src]                   @ reg1 = src[0-3] (unaligned load)
    rsb     \reg5, \reg5, #4                @ reg5 = 4 - (src & 3)

    add     \src, \src, \reg5               @ src += reg5
    sub     \counter, \counter, \reg5       @ counter -= reg5

    subs    \reg5, \reg5, #1                @ reg5--; Z set if only one pixel

    ubfx    \reg2, \reg1, #0, #8            @ reg2 = src[0]
    ubfxne  \reg3, \reg1, #8, #8            @ reg3 = src[1]

    ldr     \reg2, [\pal, \reg2, lsl #2]    @ reg2 = pal[reg2]

    ldrne   \reg3, [\pal, \reg3, lsl #2]    @ reg3 = pal[reg3]

    strh    \reg2, [\dst]                   @ dst[0] = reg2

    strneh  \reg3, [\dst, #2]!              @ dst[1] = reg3; dst++
    subnes  \reg5, \reg5, #1                @ reg5--; Z set if only two pixels

    ubfxne  \reg4, \reg1, #16, #8           @ reg4 = src[2]
    add     \dst, \dst, #2                  @ dst++

    ldrne   \reg4, [\pal, \reg4, lsl #2]    @ reg4 = pal[reg4]

    strneh  \reg4, [\dst], #2               @ dst[2] = reg4; dst++

    @ middle pixels (16 per iteration); the macro leaves the flags of
    @ "cmp counter, #16" for the bhs below
10:
    _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, , d18, d19

    vst1.16 {d16-d19}, [\dst]!              @ dst[0-15] = d16-d19; dst += 2*16
    bhs     10b

    @ last 0-15 pixels (or the whole of a short line)
15:
    cmp     \counter, #0
    beq     40f

    cmp     \counter, #4
    blo     30f

    @ 4 pixels per iteration
20:
    ldr     \reg1, [\src]                   @ reg1 = src[0-3]
    sub     \counter, \counter, #4          @ counter -= 4

    add     \src, \src, #4                  @ src += 4
    add     \dst, \dst, #(2*4)              @ dst += 4

    ubfx    \reg2, \reg1, #0, #8            @ reg2 = src[0]
    cmp     \counter, #4                    @ flags for the bhs below

    ldr     \reg2, [\pal, \reg2, lsl #2]    @ reg2 = pal[src[0]]
    ubfx    \reg3, \reg1, #8, #8            @ reg3 = src[1]

    ldr     \reg3, [\pal, \reg3, lsl #2]    @ reg3 = pal[src[1]]
    ubfx    \reg4, \reg1, #16, #8           @ reg4 = src[2]

    ldr     \reg4, [\pal, \reg4, lsl #2]    @ reg4 = pal[src[2]]
    lsr     \reg1, \reg1, #24               @ reg1 = src[3]

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[3]]

    strh    \reg2, [\dst, #-8]              @ dst[0] = reg2

    strh    \reg3, [\dst, #-6]              @ dst[1] = reg3

    strh    \reg4, [\dst, #-4]              @ dst[2] = reg4

    strh    \reg1, [\dst, #-2]              @ dst[3] = reg1
    bhs     20b

    cmp     \counter, #0
    beq     40f

    @ last 1-3 pixels
30:
    ldrb    \reg1, [\src]                   @ reg1 = src[0]
    subs    \counter, \counter, #1          @ counter--

    ldrneb  \reg2, [\src, #1]!              @ reg2 = src[1]; src++

    add     \src, \src, #1                  @ src++

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[0]]

    ldrne   \reg2, [\pal, \reg2, lsl #2]    @ reg2 = pal[src[1]]

    strh    \reg1, [\dst]                   @ dst[0] = reg1

    strneh  \reg2, [\dst, #2]!              @ dst[1] = reg2; dst++
    subnes  \counter, \counter, #1          @ counter--

    ldrneb  \reg3, [\src], #1               @ reg3 = src[2]; src++
    add     \dst, \dst, #2                  @ dst++

    ldrne   \reg3, [\pal, \reg3, lsl #2]    @ reg3 = pal[src[2]]

    strneh  \reg3, [\dst], #2               @ dst[2] = reg3; dst++

40:
.endm
| 235 | \r |
@ neon_normal2x_8_16_line
@
@ Converts one line of 8-bit indices to 16-bit pixels through a table of
@ 32-bit entries (low 16 bits used) and writes every pixel as a 2x2 block:
@ twice in the row at dst and twice in the row at dst + dststride.
@
@   src         pointer to the indices; advanced past the line
@   dst         pointer to the first output row; advanced by 4 bytes per
@               source pixel
@   pal         table base; entry i is loaded from pal + 4*i
@   counter     number of source pixels in the line; 0 on exit
@   reg1-reg9   core scratch registers, all clobbered
@   dststride   byte offset from one output row to the next (preserved)
@
@ Also clobbers q8-q11 and the condition flags.
@ Uses numeric local labels 1, 5, 10, 20, 30 and 40.
.macro neon_normal2x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
    @ The 16-pixel loop below always runs at least once and the alignment
    @ prefix may consume up to 3 pixels first.  A line shorter than 19
    @ pixels could therefore underflow the counter, so such lines are
    @ handled pixel by pixel in the loop at 30 (or skipped when empty).
    cmp     \counter, #19
    bhs     5f
    cmp     \counter, #0
    bne     30f
    b       40f

5:
    @ align src to 4 bytes
    ands    \reg5, \src, #3                 @ reg5 = src & 3
    beq     10f

    @ first 1-3 pixels
    rsb     \reg5, \reg5, #4                @ reg5 = 4 - (src & 3)
1:
    ldrb    \reg1, [\src], #1               @ reg1 = src[0]; src++
    add     \reg2, \dst, \dststride         @ reg2 = second output row

    add     \dst, \dst, #4                  @ dst += 2*2
    sub     \counter, \counter, #1          @ counter--

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[0]]
    subs    \reg5, \reg5, #1                @ reg5--

    strh    \reg1, [\dst, #-4]              @ dst[0] = reg1

    strh    \reg1, [\dst, #-2]              @ dst[1] = reg1

    strh    \reg1, [\reg2]                  @ dst1[0] = reg1

    strh    \reg1, [\reg2, #2]              @ dst1[1] = reg1
    bne     1b

    @ middle pixels (16 per iteration); the macro leaves the flags of
    @ "cmp counter, #16" for the bhs below
10:
    _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d20, d21

    vmov    q9, q8                          @ duplicate pixels 0-7
    add     \reg1, \dst, \dststride         @ reg1 = dst + dststride

    vmov    q11, q10                        @ duplicate pixels 8-15
    vst2.16 {q8,q9}, [\dst]!                @ dst[0-7] = q8-q9; dst += 2*2*8

    vst2.16 {q10,q11}, [\dst]!              @ dst[8-15] = q10-q11; dst += 2*2*8

    vst2.16 {q8,q9}, [\reg1]!               @ dst1[0-7] = q8-q9; dst1 += 2*2*8

    vst2.16 {q10,q11}, [\reg1]!             @ dst1[8-15] = q10-q11; dst1 += 2*2*8
    bhs     10b

    @ last 0-15 pixels

    cmp     \counter, #0
    beq     40f

    cmp     \counter, #4
    blo     30f

    @ 4 pixels per iteration
20:
    ldr     \reg1, [\src]                   @ reg1 = src[0-3]
    sub     \counter, \counter, #4          @ counter -= 4

    add     \src, \src, #4                  @ src += 4

    ubfx    \reg2, \reg1, #0, #8            @ reg2 = src[0]
    cmp     \counter, #4                    @ flags for the bhs below

    ldr     \reg2, [\pal, \reg2, lsl #2]    @ reg2 = pal[src[0]]
    ubfx    \reg3, \reg1, #8, #8            @ reg3 = src[1]

    ldr     \reg3, [\pal, \reg3, lsl #2]    @ reg3 = pal[src[1]]
    ubfx    \reg4, \reg1, #16, #8           @ reg4 = src[2]

    ldr     \reg4, [\pal, \reg4, lsl #2]    @ reg4 = pal[src[2]]
    lsr     \reg1, \reg1, #24               @ reg1 = src[3]

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[3]]

    add     \reg5, \dst, \dststride         @ reg5 = second output row
    bfi     \reg2, \reg3, #16, #16          @ reg2 = reg2 | reg3 << 16

    vmov.32 d16[0], \reg2

    bfi     \reg4, \reg1, #16, #16          @ reg4 = reg4 | reg1 << 16

    vmov.32 d16[1], \reg4

    vmov    d17, d16

    vst2.16 {d16,d17}, [\dst]!              @ dst[0-7] = d16-d17; dst += 2*2*4

    vst2.16 {d16,d17}, [\reg5]              @ dst1[0-7] = d16-d17
    bhs     20b

    cmp     \counter, #0
    beq     40f

    @ remaining pixels, one per iteration (also the whole of a short line)
30:
    ldrb    \reg1, [\src], #1               @ reg1 = src[0]; src++
    add     \reg2, \dst, \dststride         @ reg2 = second output row

    add     \dst, \dst, #4                  @ dst += 2*2

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[0]]
    subs    \counter, \counter, #1          @ counter--

    strh    \reg1, [\dst, #-4]              @ dst[0] = reg1

    strh    \reg1, [\dst, #-2]              @ dst[1] = reg1

    strh    \reg1, [\reg2]                  @ dst1[0] = reg1

    strh    \reg1, [\reg2, #2]              @ dst1[1] = reg1
    bne     30b

40:
.endm
| 348 | \r |
@ neon_normal3x_8_16_line
@
@ Converts one line of 8-bit indices to 16-bit pixels through a table of
@ 32-bit entries (low 16 bits used) and writes every pixel as a 3x3 block:
@ three times in each of the rows at dst, dst + dststride and
@ dst + 2*dststride.
@
@   src         pointer to the indices; advanced past the line
@   dst         pointer to the first output row; advanced by 6 bytes per
@               source pixel
@   pal         table base; entry i is loaded from pal + 4*i
@   counter     number of source pixels in the line; 0 on exit
@   reg1-reg9   core scratch registers, all clobbered
@   dststride   byte offset from one output row to the next (preserved)
@
@ Also clobbers q8-q13 and the condition flags.
@ Uses numeric local labels 1, 5, 10, 20, 30 and 40.
@ The single-pixel paths store words at a halfword offset from the row
@ pointers, so unaligned "str" must be permitted.
.macro neon_normal3x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
    @ The 16-pixel loop below always runs at least once and the alignment
    @ prefix may consume up to 3 pixels first.  A line shorter than 19
    @ pixels could therefore underflow the counter, so such lines are
    @ handled pixel by pixel in the loop at 30 (or skipped when empty).
    cmp     \counter, #19
    bhs     5f
    cmp     \counter, #0
    bne     30f
    b       40f

5:
    @ align src to 4 bytes
    ands    \reg5, \src, #3                 @ reg5 = src & 3
    beq     10f

    @ first 1-3 pixels
    rsb     \reg5, \reg5, #4                @ reg5 = 4 - (src & 3)
1:
    ldrb    \reg1, [\src], #1               @ reg1 = src[0]; src++
    add     \reg2, \dst, \dststride         @ reg2 = second output row

    add     \reg3, \reg2, \dststride        @ reg3 = third output row
    add     \dst, \dst, #6                  @ dst += 3*2

    sub     \counter, \counter, #1          @ counter--

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[0]]
    subs    \reg5, \reg5, #1                @ reg5--

    strh    \reg1, [\dst, #-6]              @ dst[0] = reg1

    strh    \reg1, [\dst, #-4]              @ dst[1] = reg1

    strh    \reg1, [\dst, #-2]              @ dst[2] = reg1
    bfi     \reg1, \reg1, #16, #16          @ reg1 = reg1 | reg1 << 16

    strh    \reg1, [\reg2]                  @ dst1[0] = reg1

    str     \reg1, [\reg2, #2]              @ dst1[1-2] = reg1

    strh    \reg1, [\reg3]                  @ dst2[0] = reg1

    str     \reg1, [\reg3, #2]              @ dst2[1-2] = reg1
    bne     1b

    @ middle pixels (16 per iteration); the macro leaves the flags of
    @ "cmp counter, #16" for the bhs below
10:
    _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d22, d23

    vmov    q9, q8                          @ q9 = q10 = pixels 0-7
    add     \reg1, \dst, \dststride         @ reg1 = dst + dststride

    vmov    q10, q8
    add     \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride

    vmov    q12, q11                        @ q12 = q13 = pixels 8-15
    vst3.16 {d16,d18,d20}, [\dst]!          @ dst[0-3] = q8-q10[0]; dst += 3*2*4

    vmov    q13, q11
    vst3.16 {d17,d19,d21}, [\dst]!          @ dst[4-7] = q8-q10[1]; dst += 3*2*4

    vst3.16 {d22,d24,d26}, [\dst]!          @ dst[8-11] = q11-q13[0]; dst += 3*2*4

    vst3.16 {d23,d25,d27}, [\dst]!          @ dst[12-15] = q11-q13[1]; dst += 3*2*4

    vst3.16 {d16,d18,d20}, [\reg1]!         @ dst1[0-3] = q8-q10[0]; dst1 += 3*2*4

    vst3.16 {d17,d19,d21}, [\reg1]!         @ dst1[4-7] = q8-q10[1]; dst1 += 3*2*4

    vst3.16 {d22,d24,d26}, [\reg1]!         @ dst1[8-11] = q11-q13[0]; dst1 += 3*2*4

    vst3.16 {d23,d25,d27}, [\reg1]!         @ dst1[12-15] = q11-q13[1]; dst1 += 3*2*4

    vst3.16 {d16,d18,d20}, [\reg2]!         @ dst2[0-3] = q8-q10[0]; dst2 += 3*2*4

    vst3.16 {d17,d19,d21}, [\reg2]!         @ dst2[4-7] = q8-q10[1]; dst2 += 3*2*4

    vst3.16 {d22,d24,d26}, [\reg2]!         @ dst2[8-11] = q11-q13[0]; dst2 += 3*2*4

    vst3.16 {d23,d25,d27}, [\reg2]!         @ dst2[12-15] = q11-q13[1]; dst2 += 3*2*4
    bhs     10b

    @ last 0-15 pixels

    cmp     \counter, #0
    beq     40f

    cmp     \counter, #4
    blo     30f

    @ 4 pixels per iteration
20:
    ldr     \reg1, [\src]                   @ reg1 = src[0-3]
    sub     \counter, \counter, #4          @ counter -= 4

    add     \src, \src, #4                  @ src += 4

    ubfx    \reg2, \reg1, #0, #8            @ reg2 = src[0]
    cmp     \counter, #4                    @ flags for the bhs below

    ldr     \reg2, [\pal, \reg2, lsl #2]    @ reg2 = pal[src[0]]
    ubfx    \reg3, \reg1, #8, #8            @ reg3 = src[1]

    ldr     \reg3, [\pal, \reg3, lsl #2]    @ reg3 = pal[src[1]]
    ubfx    \reg4, \reg1, #16, #8           @ reg4 = src[2]

    ldr     \reg4, [\pal, \reg4, lsl #2]    @ reg4 = pal[src[2]]
    lsr     \reg1, \reg1, #24               @ reg1 = src[3]

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[3]]

    add     \reg5, \dst, \dststride         @ reg5 = second output row
    bfi     \reg2, \reg3, #16, #16          @ reg2 = reg2 | reg3 << 16

    vmov.32 d16[0], \reg2
    add     \reg6, \reg5, \dststride        @ reg6 = third output row

    bfi     \reg4, \reg1, #16, #16          @ reg4 = reg4 | reg1 << 16

    vmov.32 d16[1], \reg4

    vmov    d17, d16

    vmov    d18, d16

    vst3.16 {d16,d17,d18}, [\dst]!          @ dst[0-11] = d16-d18; dst += 3*2*4

    vst3.16 {d16,d17,d18}, [\reg5]          @ dst1[0-11] = d16-d18

    vst3.16 {d16,d17,d18}, [\reg6]          @ dst2[0-11] = d16-d18
    bhs     20b

    cmp     \counter, #0
    beq     40f

    @ remaining pixels, one per iteration (also the whole of a short line)
30:
    ldrb    \reg1, [\src], #1               @ reg1 = src[0]; src++
    add     \reg2, \dst, \dststride         @ reg2 = second output row

    add     \reg3, \reg2, \dststride        @ reg3 = third output row
    add     \dst, \dst, #6                  @ dst += 3*2

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[0]]
    subs    \counter, \counter, #1          @ counter--

    strh    \reg1, [\dst, #-6]              @ dst[0] = reg1

    strh    \reg1, [\dst, #-4]              @ dst[1] = reg1

    strh    \reg1, [\dst, #-2]              @ dst[2] = reg1
    bfi     \reg1, \reg1, #16, #16          @ reg1 = reg1 | reg1 << 16

    strh    \reg1, [\reg2]                  @ dst1[0] = reg1

    str     \reg1, [\reg2, #2]              @ dst1[1-2] = reg1

    strh    \reg1, [\reg3]                  @ dst2[0] = reg1

    str     \reg1, [\reg3, #2]              @ dst2[1-2] = reg1
    bne     30b

40:
.endm
| 503 | \r |
@ neon_normal4x_8_16_line
@
@ Converts one line of 8-bit indices to 16-bit pixels through a table of
@ 32-bit entries (low 16 bits used) and writes every pixel as a 4x4 block:
@ four times in each of the rows at dst, dst + dststride,
@ dst + 2*dststride and dst + 3*dststride.
@
@   src         pointer to the indices; advanced past the line
@   dst         pointer to the first output row; advanced by 8 bytes per
@               source pixel
@   pal         table base; entry i is loaded from pal + 4*i
@   counter     number of source pixels in the line; 0 on exit
@   reg1-reg9   core scratch registers, all clobbered
@   dststride   byte offset from one output row to the next (preserved)
@
@ Also clobbers q8-q15 and the condition flags.
@ Uses numeric local labels 1, 5, 10, 20, 30 and 40.
.macro neon_normal4x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
    @ The 16-pixel loop below always runs at least once and the alignment
    @ prefix may consume up to 3 pixels first.  A line shorter than 19
    @ pixels could therefore underflow the counter, so such lines are
    @ handled pixel by pixel in the loop at 30 (or skipped when empty).
    cmp     \counter, #19
    bhs     5f
    cmp     \counter, #0
    bne     30f
    b       40f

5:
    @ align src to 4 bytes
    ands    \reg5, \src, #3                 @ reg5 = src & 3
    beq     10f

    @ first 1-3 pixels
    rsb     \reg5, \reg5, #4                @ reg5 = 4 - (src & 3)
1:
    ldrb    \reg1, [\src], #1               @ reg1 = src[0]; src++
    add     \reg2, \dst, \dststride         @ reg2 = second output row

    add     \reg3, \reg2, \dststride        @ reg3 = third output row
    add     \dst, \dst, #8                  @ dst += 4*2

    sub     \counter, \counter, #1          @ counter--

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[0]]
    add     \reg4, \reg3, \dststride        @ reg4 = fourth output row

    strh    \reg1, [\dst, #-8]              @ dst[0] = reg1
    subs    \reg5, \reg5, #1                @ reg5--

    strh    \reg1, [\dst, #-6]              @ dst[1] = reg1

    bfi     \reg1, \reg1, #16, #16          @ reg1 = reg1 | reg1 << 16
    str     \reg1, [\dst, #-4]              @ dst[2-3] = reg1

    str     \reg1, [\reg2]                  @ dst1[0-1] = reg1

    str     \reg1, [\reg2, #4]              @ dst1[2-3] = reg1

    str     \reg1, [\reg3]                  @ dst2[0-1] = reg1

    str     \reg1, [\reg3, #4]              @ dst2[2-3] = reg1

    str     \reg1, [\reg4]                  @ dst3[0-1] = reg1

    str     \reg1, [\reg4, #4]              @ dst3[2-3] = reg1
    bne     1b

    @ middle pixels (16 per iteration); the macro leaves the flags of
    @ "cmp counter, #16" for the bhs below
10:
    _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d24, d25

    vmov    q9, q8                          @ q9 = q10 = q11 = pixels 0-7
    add     \reg1, \dst, \dststride         @ reg1 = dst + dststride

    vmov    q10, q8
    add     \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride

    vmov    q11, q8
    add     \reg3, \reg1, \dststride,lsl #1 @ reg3 = dst + 3 * dststride

    vmov    q13, q12                        @ q13 = q14 = q15 = pixels 8-15
    vst4.16 {d16,d18,d20,d22}, [\dst]!      @ dst[0-3] = q8-q11[0]; dst += 4*2*4

    vmov    q14, q12

    vmov    q15, q12
    vst4.16 {d17,d19,d21,d23}, [\dst]!      @ dst[4-7] = q8-q11[1]; dst += 4*2*4

    vst4.16 {d24,d26,d28,d30}, [\dst]!      @ dst[8-11] = q12-q15[0]; dst += 4*2*4

    vst4.16 {d25,d27,d29,d31}, [\dst]!      @ dst[12-15] = q12-q15[1]; dst += 4*2*4

    vst4.16 {d16,d18,d20,d22}, [\reg1]!     @ dst1[0-3] = q8-q11[0]; dst1 += 4*2*4

    vst4.16 {d17,d19,d21,d23}, [\reg1]!     @ dst1[4-7] = q8-q11[1]; dst1 += 4*2*4

    vst4.16 {d24,d26,d28,d30}, [\reg1]!     @ dst1[8-11] = q12-q15[0]; dst1 += 4*2*4

    vst4.16 {d25,d27,d29,d31}, [\reg1]!     @ dst1[12-15] = q12-q15[1]; dst1 += 4*2*4

    vst4.16 {d16,d18,d20,d22}, [\reg2]!     @ dst2[0-3] = q8-q11[0]; dst2 += 4*2*4

    vst4.16 {d17,d19,d21,d23}, [\reg2]!     @ dst2[4-7] = q8-q11[1]; dst2 += 4*2*4

    vst4.16 {d24,d26,d28,d30}, [\reg2]!     @ dst2[8-11] = q12-q15[0]; dst2 += 4*2*4

    vst4.16 {d25,d27,d29,d31}, [\reg2]!     @ dst2[12-15] = q12-q15[1]; dst2 += 4*2*4

    vst4.16 {d16,d18,d20,d22}, [\reg3]!     @ dst3[0-3] = q8-q11[0]; dst3 += 4*2*4

    vst4.16 {d17,d19,d21,d23}, [\reg3]!     @ dst3[4-7] = q8-q11[1]; dst3 += 4*2*4

    vst4.16 {d24,d26,d28,d30}, [\reg3]!     @ dst3[8-11] = q12-q15[0]; dst3 += 4*2*4

    vst4.16 {d25,d27,d29,d31}, [\reg3]!     @ dst3[12-15] = q12-q15[1]; dst3 += 4*2*4
    bhs     10b

    @ last 0-15 pixels

    cmp     \counter, #0
    beq     40f

    cmp     \counter, #4
    blo     30f

    @ 4 pixels per iteration
20:
    ldr     \reg1, [\src]                   @ reg1 = src[0-3]
    sub     \counter, \counter, #4          @ counter -= 4

    add     \src, \src, #4                  @ src += 4

    ubfx    \reg2, \reg1, #0, #8            @ reg2 = src[0]
    cmp     \counter, #4                    @ flags for the bhs below

    ldr     \reg2, [\pal, \reg2, lsl #2]    @ reg2 = pal[src[0]]
    ubfx    \reg3, \reg1, #8, #8            @ reg3 = src[1]

    ldr     \reg3, [\pal, \reg3, lsl #2]    @ reg3 = pal[src[1]]
    ubfx    \reg4, \reg1, #16, #8           @ reg4 = src[2]

    ldr     \reg4, [\pal, \reg4, lsl #2]    @ reg4 = pal[src[2]]
    lsr     \reg1, \reg1, #24               @ reg1 = src[3]

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[3]]

    add     \reg5, \dst, \dststride         @ reg5 = second output row
    bfi     \reg2, \reg3, #16, #16          @ reg2 = reg2 | reg3 << 16

    vmov.32 d16[0], \reg2
    add     \reg6, \reg5, \dststride        @ reg6 = third output row

    bfi     \reg4, \reg1, #16, #16          @ reg4 = reg4 | reg1 << 16
    add     \reg7, \reg6, \dststride        @ reg7 = fourth output row

    vmov.32 d16[1], \reg4

    vmov    d17, d16

    vmov    d18, d16

    vmov    d19, d16

    vst4.16 {d16,d17,d18,d19}, [\dst]!      @ dst[0-15] = d16-d19; dst += 4*2*4

    vst4.16 {d16,d17,d18,d19}, [\reg5]      @ dst1[0-15] = d16-d19

    vst4.16 {d16,d17,d18,d19}, [\reg6]      @ dst2[0-15] = d16-d19

    vst4.16 {d16,d17,d18,d19}, [\reg7]      @ dst3[0-15] = d16-d19
    bhs     20b

    cmp     \counter, #0
    beq     40f

    @ remaining pixels, one per iteration (also the whole of a short line)
30:
    ldrb    \reg1, [\src], #1               @ reg1 = src[0]; src++
    add     \reg2, \dst, \dststride         @ reg2 = second output row

    add     \reg3, \reg2, \dststride        @ reg3 = third output row
    add     \dst, \dst, #8                  @ dst += 4*2

    ldr     \reg1, [\pal, \reg1, lsl #2]    @ reg1 = pal[src[0]]
    add     \reg4, \reg3, \dststride        @ reg4 = fourth output row

    strh    \reg1, [\dst, #-8]              @ dst[0] = reg1
    subs    \counter, \counter, #1          @ counter--

    strh    \reg1, [\dst, #-6]              @ dst[1] = reg1

    bfi     \reg1, \reg1, #16, #16          @ reg1 = reg1 | reg1 << 16
    str     \reg1, [\dst, #-4]              @ dst[2-3] = reg1

    str     \reg1, [\reg2]                  @ dst1[0-1] = reg1

    str     \reg1, [\reg2, #4]              @ dst1[2-3] = reg1

    str     \reg1, [\reg3]                  @ dst2[0-1] = reg1

    str     \reg1, [\reg3, #4]              @ dst2[2-3] = reg1

    str     \reg1, [\reg4]                  @ dst3[0-1] = reg1

    str     \reg1, [\reg4, #4]              @ dst3[2-3] = reg1
    bne     30b

40:
.endm
| 686 | \r |