| 1 | @@\r |
| 2 | @@ Copyright (C) 2012 Roman Pauer\r |
| 3 | @@\r |
| 4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r |
| 5 | @@ this software and associated documentation files (the "Software"), to deal in\r |
| 6 | @@ the Software without restriction, including without limitation the rights to\r |
| 7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r |
| 8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r |
| 9 | @@ so, subject to the following conditions:\r |
| 10 | @@\r |
| 11 | @@ The above copyright notice and this permission notice shall be included in all\r |
| 12 | @@ copies or substantial portions of the Software.\r |
| 13 | @@\r |
| 14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r |
| 15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r |
| 16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r |
| 17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r |
| 18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r |
| 19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r |
| 20 | @@ SOFTWARE.\r |
| 21 | @@\r |
| 22 | \r |
| 23 | \r |
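@ Eagle2x neighbourhood: every source pixel C is expanded to the 2x2 block
@ E1..E4, chosen from C and its eight neighbours.
@
@   S T U  --\  E1 E2
@   V C W  --/  E3 E4
@   X Y Z
@
@   E1 = (S == T && S == V) ? T : C      E2 = (U == T && U == W) ? T : C
@   E3 = (X == Y && X == V) ? Y : C      E4 = (Z == Y && Z == W) ? Y : C

@ NEON register allocation used by the line macros below.
@ S1 / S2 / S3 are the rows at src - srcstride, src and src + srcstride.
@ The sl / sr suffixes mark a row shifted by one pixel so that every lane
@ holds its left / right neighbour; prev is the previously loaded chunk of
@ the row, which becomes the T / C / Y column once the current chunk has
@ been copied into it.
@
@   q0  = S1sl    < S >
@   q1  = S2sl    < V >
@   q2  = S3sl    < X >
@   q3  = S1sr    < U >
@   q4  = S2sr    < W >
@   q5  = S3sr    < Z >
@   q6  = E3
@   q7  = E4
@   q8  = S1
@   q9  = S2
@   q10 = S3
@   q11 = S1prev  < T >
@   q12 = S2prev  < C >
@   q13 = S3prev  < Y >
@   q14 = E1
@   q15 = E2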
| 44 | \r |
| 45 | \r |
@ ----------------------------------------------------------------------------
@ Private helpers of __neon_eagle2x_8_8_line (8-bit pixels, 16 per q register).
@
@ All three work on the fixed register allocation of that macro:
@   q8 / q9 / q10    chunk just loaded from the row above / at / below
@   q11 / q12 / q13  previous chunk of those rows
@ qT and qY name the registers that supply the T and Y columns.  Whenever qT
@ is not q11 (or qY is not q13) that row is treated as absent: nothing is
@ computed from it and the centre-row vectors are used in its place.
@ ----------------------------------------------------------------------------

@ Left neighbours: q0 = S, q1 = V, q2 = X.  Each is the last pixel of the
@ previous chunk followed by the first 15 pixels of the current chunk.
@ Afterwards the previous-chunk registers are rolled forward, so that
@ qT / q12 / qY hold the current chunk (columns T / C / Y).
.macro __neon_eagle2x_8_8_left qT, qY
    .ifeqs "\qT", "q11"
        vext.8      q0, \qT, q8, #15        @ S = S1prev[15] : S1[0..14]
        vmov        \qT, q8                 @ T = S1
    .endif
        vext.8      q1, q12, q9, #15        @ V = S2prev[15] : S2[0..14]
        vmov        q12, q9                 @ C = S2
    .ifeqs "\qY", "q13"
        vext.8      q2, \qY, q10, #15       @ X = S3prev[15] : S3[0..14]
        vmov        \qY, q10                @ Y = S3
    .endif
.endm

@ Right neighbours: q3 = U, q4 = W, q5 = Z.  Each is pixels 1..15 of the
@ current chunk followed by lane 0 of q8 / q9 / q10; the caller decides what
@ that lane contains (next chunk, or a replicated edge pixel).
.macro __neon_eagle2x_8_8_right qT, qY
    .ifeqs "\qT", "q11"
        vext.8      q3, \qT, q8, #1         @ U = T[1..15] : S1[0]
    .endif
        vext.8      q4, q12, q9, #1         @ W = C[1..15] : S2[0]
    .ifeqs "\qY", "q13"
        vext.8      q5, \qY, q10, #1        @ Z = Y[1..15] : S3[0]
    .endif
.endm

@ Selection step: from S..Z produce q14 = E1, q15 = E2, q6 = E3, q7 = E4.
@ q0, q2, q7 and q15 double as temporaries, so S and X are destroyed.
.macro __neon_eagle2x_8_8_blend qT, qY
    .ifnes "\qY", "q13"
        vmov        q2, q1                  @ no row below: X = V
        vmov        q5, q4                  @               Z = W
    .endif
    .ifnes "\qT", "q11"
        vmov        q0, q1                  @ no row above: S = V
        vmov        q3, q4                  @               U = W
    .endif

        vceq.i8     q14, q0, \qT            @ q14 = (S == T)
        vceq.i8     q15, q0, q1             @ q15 = (S == V)
        vceq.i8     q6, q2, \qY             @ q6  = (X == Y)
        vceq.i8     q7, q2, q1              @ q7  = (X == V)
        vand        q14, q14, q15           @ q14 = mask for E1
        vceq.i8     q15, q3, \qT            @ q15 = (U == T)
        vceq.i8     q0, q3, q4              @ q0  = (U == W)
        vand        q6, q6, q7              @ q6  = mask for E3
        vceq.i8     q7, q5, \qY             @ q7  = (Z == Y)
        vceq.i8     q2, q5, q4              @ q2  = (Z == W)
        vand        q15, q15, q0            @ q15 = mask for E2
        vbsl        q14, \qT, q12           @ E1 = mask ? T : C
        vbsl        q15, \qT, q12           @ E2 = mask ? T : C
        vand        q7, q7, q2              @ q7  = mask for E4
        vbsl        q6, \qY, q12            @ E3 = mask ? Y : C
        vbsl        q7, \qY, q12            @ E4 = mask ? Y : C
.endm

@ ----------------------------------------------------------------------------
@ __neon_eagle2x_8_8_line
@
@ Expands one line of 8-bit source pixels into two output lines (Eagle2x).
@
@   src1, src2, src3  pointer registers: row above, current row, row below
@   counter           register holding the number of source pixels
@   dst1, dst2        pointer registers for the upper / lower output line
@   reg1              scratch core register
@   qT, qY            registers supplying columns T and Y: q11 / q13 when
@                     the row above / below is read; with any other value
@                     the row is not read, its pointer is only advanced by
@                     counter, and the centre row stands in for it
@   alsrc1..alsrc3,   address operands placed inside [ ] for the loads and
@   aldst1, aldst2    stores of the 16-pixel aligned part
@
@ The line is processed as an optional head of (counter & 15) pixels, a loop
@ over full 16-pixel chunks, and a tail for the final chunk whose right
@ neighbour is the replicated last pixel.  The left neighbour of pixel 0 is
@ pixel 0 itself.
@
@ Changes q0-q10, q12, q14, q15 (plus q11 / q13 when used), reg1, counter,
@ the condition flags and all pointer registers.  Uses local labels 1 and 2.
@
@ NOTE(review): the loop body runs at least once and the tail always emits a
@ further 16 pixels, so at least 32 pixels seem to be required once the head
@ has been handled - presumably guaranteed by the caller; confirm.
@ ----------------------------------------------------------------------------
.macro __neon_eagle2x_8_8_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2

    @ Left edge: put pixel 0 of every row that is read into lane 15 of its
    @ previous-chunk register; rows that are not read are skipped as a whole.
    .ifeqs "\qT", "q11"
        vld1.8      {d23[7]}, [\src1]       @ S1prev[15] = src[-srcstride]
    .else
        add         \src1, \src1, \counter  @ src1 += counter
    .endif
        vld1.8      {d25[7]}, [\src2]       @ S2prev[15] = src[0]
    .ifeqs "\qY", "q13"
        vld1.8      {d27[7]}, [\src3]       @ S3prev[15] = src[srcstride]
    .else
        add         \src3, \src3, \counter  @ src3 += counter
    .endif

        ands        \reg1, \counter, #15    @ reg1 = size of the head
        beq         1f                      @ nothing to do when counter % 16 == 0

    @ ---- head: first 1-15 pixels, leaves counter a multiple of 16 ----------
    @ A full chunk is loaded and computed, but the pointers advance by the
    @ head size only, so the surplus output is rewritten by the next stores.

    .ifeqs "\qT", "q11"
        vld1.8      {q8}, [\src1], \reg1    @ S1 = row above;  src1 += head
    .endif
        vld1.8      {q9}, [\src2], \reg1    @ S2 = row;        src2 += head
    .ifeqs "\qY", "q13"
        vld1.8      {q10}, [\src3], \reg1   @ S3 = row below;  src3 += head
    .endif

        __neon_eagle2x_8_8_left  \qT, \qY
        __neon_eagle2x_8_8_right \qT, \qY   @ lane 15 wraps to lane 0 here
        __neon_eagle2x_8_8_blend \qT, \qY

    @ Re-seed lane 15 of the previous-chunk registers with the last pixel of
    @ the head, i.e. the left neighbour of the first aligned pixel.
    .ifeqs "\qT", "q11"
        sub         \reg1, \src1, #1
        vld1.8      {d23[7]}, [\reg1]       @ S1prev[15] = src1[-1]
    .endif
        sub         \reg1, \src2, #1
        vld1.8      {d25[7]}, [\reg1]       @ S2prev[15] = src2[-1]
    .ifeqs "\qY", "q13"
        sub         \reg1, \src3, #1
        vld1.8      {d27[7]}, [\reg1]       @ S3prev[15] = src3[-1]
    .endif

        and         \reg1, \counter, #15
        lsl         \reg1, \reg1, #1        @ reg1 = 2 * head (output bytes)
        vst2.8      {q14-q15}, [\dst1], \reg1   @ upper line: E1,E2 interleaved
        vst2.8      {q6-q7}, [\dst2], \reg1     @ lower line: E3,E4 interleaved
        bic         \counter, \counter, #15     @ counter -= head

    @ ---- aligned part: counter is a multiple of 16 --------------------------
    1:
    .ifeqs "\qT", "q11"
        vld1.8      {q8}, [\alsrc1]!        @ S1 = first chunk; src1 += 16
    .endif
        vld1.8      {q9}, [\alsrc2]!        @ S2 = first chunk; src2 += 16
    .ifeqs "\qY", "q13"
        vld1.8      {q10}, [\alsrc3]!       @ S3 = first chunk; src3 += 16
    .endif

    @ Loop: 16 pixels per pass.  The following chunk is fetched before the
    @ right neighbours are formed because its lane 0 is needed for them.
    2:
        __neon_eagle2x_8_8_left  \qT, \qY

    .ifeqs "\qT", "q11"
        vld1.8      {q8}, [\alsrc1]!        @ S1 = next chunk; src1 += 16
    .endif
        vld1.8      {q9}, [\alsrc2]!        @ S2 = next chunk; src2 += 16
    .ifeqs "\qY", "q13"
        vld1.8      {q10}, [\alsrc3]!       @ S3 = next chunk; src3 += 16
    .endif

        __neon_eagle2x_8_8_right \qT, \qY

        sub         \counter, \counter, #16
        __neon_eagle2x_8_8_blend \qT, \qY

        vst2.8      {q14-q15}, [\aldst1]!   @ upper line; dst1 += 32
        cmp         \counter, #16
        vst2.8      {q6-q7}, [\aldst2]!     @ lower line; dst2 += 32
        bhi         2b                      @ repeat while more than 16 remain

    @ ---- tail: last 16 pixels ------------------------------------------------
    @ No further chunk is read; lane 0 of q8 / q9 / q10 is replaced by the last
    @ pixel so that the right neighbour of the final pixel is that pixel.
        __neon_eagle2x_8_8_left  \qT, \qY

    .ifeqs "\qT", "q11"
        vshr.u64    d16, d17, #(64-8)       @ S1[0] = S1[15]
    .endif
        vshr.u64    d18, d19, #(64-8)       @ S2[0] = S2[15]
    .ifeqs "\qY", "q13"
        vshr.u64    d20, d21, #(64-8)       @ S3[0] = S3[15]
    .endif

        __neon_eagle2x_8_8_right \qT, \qY
        __neon_eagle2x_8_8_blend \qT, \qY

        vst2.8      {q14-q15}, [\aldst1]!   @ upper line; dst1 += 32
        vst2.8      {q6-q7}, [\aldst2]!     @ lower line; dst2 += 32

.endm
| 369 | \r |
@ 8-bit Eagle2x, first line of the image.
@ Passes q12 (the centre row) as the T register, so the line macro does not
@ read the row above and uses the current row in its place; the row below is
@ taken from q13 as usual.  All other arguments are forwarded unchanged.
.macro _neon_eagle2x_8_8_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_eagle2x_8_8_line     \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 373 | \r |
@ 8-bit Eagle2x, any line that has both a row above and a row below.
@ T comes from q11 and Y from q13, so the line macro reads all three source
@ rows.  All other arguments are forwarded unchanged.
.macro _neon_eagle2x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_eagle2x_8_8_line     \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 377 | \r |
@ 8-bit Eagle2x, last line of the image.
@ Passes q12 (the centre row) as the Y register, so the line macro does not
@ read the row below and uses the current row in its place; the row above is
@ taken from q11 as usual.  All other arguments are forwarded unchanged.
.macro _neon_eagle2x_8_8_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_eagle2x_8_8_line     \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 381 | \r |
@ Entry point for one line of 8-bit Eagle2x.
@
@   part          first, middle or last: picks _neon_eagle2x_8_8_line_<part>
@   src1..reg1    forwarded unchanged to the selected macro
@   srcalign16    assemble-time flag; when non-zero the source operands used
@                 for the aligned loads carry a :128 alignment qualifier
@   dstalign32    assemble-time flag; when non-zero the destination operands
@                 used for the aligned stores carry a :256 alignment qualifier
@
@ With a flag equal to zero the plain register is passed for that side.
.macro neon_eagle2x_8_8_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
    .ifne \srcalign16

        .ifne \dstalign32
            _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
        .else
            _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
        .endif

    .else

        .ifne \dstalign32
            _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
        .else
            _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
        .endif

    .endif
.endm
| 401 | \r |
| 402 | \r |
@ ----------------------------------------------------------------------------
@ Private helpers of __neon_eagle2x_16_16_line (16-bit pixels, 8 per q
@ register).
@
@ All three work on the fixed register allocation of that macro:
@   q8 / q9 / q10    chunk just loaded from the row above / at / below
@   q11 / q12 / q13  previous chunk of those rows
@ qT and qY name the registers that supply the T and Y columns.  Whenever qT
@ is not q11 (or qY is not q13) that row is treated as absent: nothing is
@ computed from it and the centre-row vectors are used in its place.
@ ----------------------------------------------------------------------------

@ Left neighbours: q0 = S, q1 = V, q2 = X.  Each is the last pixel of the
@ previous chunk followed by the first 7 pixels of the current chunk.
@ Afterwards the previous-chunk registers are rolled forward, so that
@ qT / q12 / qY hold the current chunk (columns T / C / Y).
.macro __neon_eagle2x_16_16_left qT, qY
    .ifeqs "\qT", "q11"
        vext.8      q0, \qT, q8, #14        @ S = S1prev[7] : S1[0..6]
        vmov        \qT, q8                 @ T = S1
    .endif
        vext.8      q1, q12, q9, #14        @ V = S2prev[7] : S2[0..6]
        vmov        q12, q9                 @ C = S2
    .ifeqs "\qY", "q13"
        vext.8      q2, \qY, q10, #14       @ X = S3prev[7] : S3[0..6]
        vmov        \qY, q10                @ Y = S3
    .endif
.endm

@ Right neighbours: q3 = U, q4 = W, q5 = Z.  Each is pixels 1..7 of the
@ current chunk followed by lane 0 of q8 / q9 / q10; the caller decides what
@ that lane contains (next chunk, or a replicated edge pixel).
.macro __neon_eagle2x_16_16_right qT, qY
    .ifeqs "\qT", "q11"
        vext.8      q3, \qT, q8, #2         @ U = T[1..7] : S1[0]
    .endif
        vext.8      q4, q12, q9, #2         @ W = C[1..7] : S2[0]
    .ifeqs "\qY", "q13"
        vext.8      q5, \qY, q10, #2        @ Z = Y[1..7] : S3[0]
    .endif
.endm

@ Selection step: from S..Z produce q14 = E1, q15 = E2, q6 = E3, q7 = E4.
@ q0, q2, q7 and q15 double as temporaries, so S and X are destroyed.
.macro __neon_eagle2x_16_16_blend qT, qY
    .ifnes "\qY", "q13"
        vmov        q2, q1                  @ no row below: X = V
        vmov        q5, q4                  @               Z = W
    .endif
    .ifnes "\qT", "q11"
        vmov        q0, q1                  @ no row above: S = V
        vmov        q3, q4                  @               U = W
    .endif

        vceq.i16    q14, q0, \qT            @ q14 = (S == T)
        vceq.i16    q15, q0, q1             @ q15 = (S == V)
        vceq.i16    q6, q2, \qY             @ q6  = (X == Y)
        vceq.i16    q7, q2, q1              @ q7  = (X == V)
        vand        q14, q14, q15           @ q14 = mask for E1
        vceq.i16    q15, q3, \qT            @ q15 = (U == T)
        vceq.i16    q0, q3, q4              @ q0  = (U == W)
        vand        q6, q6, q7              @ q6  = mask for E3
        vceq.i16    q7, q5, \qY             @ q7  = (Z == Y)
        vceq.i16    q2, q5, q4              @ q2  = (Z == W)
        vand        q15, q15, q0            @ q15 = mask for E2
        vbsl        q14, \qT, q12           @ E1 = mask ? T : C
        vbsl        q15, \qT, q12           @ E2 = mask ? T : C
        vand        q7, q7, q2              @ q7  = mask for E4
        vbsl        q6, \qY, q12            @ E3 = mask ? Y : C
        vbsl        q7, \qY, q12            @ E4 = mask ? Y : C
.endm

@ ----------------------------------------------------------------------------
@ __neon_eagle2x_16_16_line
@
@ Expands one line of 16-bit source pixels into two output lines (Eagle2x).
@
@   src1, src2, src3  pointer registers: row above, current row, row below
@   counter           register holding the number of source pixels
@   dst1, dst2        pointer registers for the upper / lower output line
@   reg1              scratch core register
@   qT, qY            registers supplying columns T and Y: q11 / q13 when
@                     the row above / below is read; with any other value
@                     the row is not read, its pointer is only advanced by
@                     2 * counter, and the centre row stands in for it
@   alsrc1..alsrc3,   address operands placed inside [ ] for the loads and
@   aldst1, aldst2    stores of the 8-pixel (16-byte) aligned part
@
@ The line is processed as an optional head of (counter & 7) pixels, a loop
@ over full 8-pixel chunks, and a tail for the final chunk whose right
@ neighbour is the replicated last pixel.  The left neighbour of pixel 0 is
@ pixel 0 itself.
@
@ When DO_BGR_TO_RGB is defined for the preprocessor, every result pair is
@ passed through bgr1555_to_rgb565 (defined outside this macro) before it is
@ stored.
@
@ Changes q0-q10, q12, q14, q15 (plus q11 / q13 when used), reg1, counter,
@ the condition flags and all pointer registers.  Uses local labels 1 and 2.
@
@ NOTE(review): the loop body runs at least once and the tail always emits a
@ further 8 pixels, so at least 16 pixels seem to be required once the head
@ has been handled - presumably guaranteed by the caller; confirm.
@ ----------------------------------------------------------------------------
.macro __neon_eagle2x_16_16_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2

    @ Left edge: put pixel 0 of every row that is read into lane 7 of its
    @ previous-chunk register; rows that are not read are skipped as a whole.
    .ifeqs "\qT", "q11"
        vld1.16     {d23[3]}, [\src1]               @ S1prev[7] = src[-srcstride]
    .else
        add         \src1, \src1, \counter, lsl #1  @ src1 += 2 * counter
    .endif
        vld1.16     {d25[3]}, [\src2]               @ S2prev[7] = src[0]
    .ifeqs "\qY", "q13"
        vld1.16     {d27[3]}, [\src3]               @ S3prev[7] = src[srcstride]
    .else
        add         \src3, \src3, \counter, lsl #1  @ src3 += 2 * counter
    .endif

        ands        \reg1, \counter, #7     @ reg1 = size of the head
        beq         1f                      @ nothing to do when counter % 8 == 0

    @ ---- head: first 1-7 pixels, leaves counter a multiple of 8 -------------
    @ A full chunk is loaded and computed, but the pointers advance by the
    @ head size only, so the surplus output is rewritten by the next stores.

    .ifeqs "\qT", "q11"
        vld1.16     {q8}, [\src1]                   @ S1 = row above
        add         \src1, \src1, \reg1, lsl #1     @ src1 += 2 * head
    .endif
        vld1.16     {q9}, [\src2]                   @ S2 = row
        add         \src2, \src2, \reg1, lsl #1     @ src2 += 2 * head
    .ifeqs "\qY", "q13"
        vld1.16     {q10}, [\src3]                  @ S3 = row below
        add         \src3, \src3, \reg1, lsl #1     @ src3 += 2 * head
    .endif

        __neon_eagle2x_16_16_left  \qT, \qY
        __neon_eagle2x_16_16_right \qT, \qY @ lane 7 wraps to lane 0 here
        __neon_eagle2x_16_16_blend \qT, \qY

    @ Re-seed lane 7 of the previous-chunk registers with the last pixel of
    @ the head, i.e. the left neighbour of the first aligned pixel.
    .ifeqs "\qT", "q11"
        sub         \reg1, \src1, #2
        vld1.16     {d23[3]}, [\reg1]       @ S1prev[7] = src1[-1]
    .endif
        sub         \reg1, \src2, #2
        vld1.16     {d25[3]}, [\reg1]       @ S2prev[7] = src2[-1]
    .ifeqs "\qY", "q13"
        sub         \reg1, \src3, #2
        vld1.16     {d27[3]}, [\reg1]       @ S3prev[7] = src3[-1]
    .endif

    @ q8-q10 are reloaded at label 1 before they are read again, so they can
    @ serve as scratch for the optional colour conversion.
#ifdef DO_BGR_TO_RGB
        bgr1555_to_rgb565 q14, q15, q8, q9, q10, \reg1
        bgr1555_to_rgb565 q6, q7, q8, q9, q10, \reg1
#endif

        and         \reg1, \counter, #7
        lsl         \reg1, \reg1, #2        @ reg1 = 4 * head (output bytes)
        vst2.16     {q14-q15}, [\dst1], \reg1   @ upper line: E1,E2 interleaved
        vst2.16     {q6-q7}, [\dst2], \reg1     @ lower line: E3,E4 interleaved
        bic         \counter, \counter, #7      @ counter -= head

    @ ---- aligned part: counter is a multiple of 8 ---------------------------
    1:
    .ifeqs "\qT", "q11"
        vld1.16     {q8}, [\alsrc1]!        @ S1 = first chunk; src1 += 16
    .endif
        vld1.16     {q9}, [\alsrc2]!        @ S2 = first chunk; src2 += 16
    .ifeqs "\qY", "q13"
        vld1.16     {q10}, [\alsrc3]!       @ S3 = first chunk; src3 += 16
    .endif

    @ Loop: 8 pixels per pass.  The following chunk is fetched before the
    @ right neighbours are formed because its lane 0 is needed for them.
    2:
        __neon_eagle2x_16_16_left  \qT, \qY

    .ifeqs "\qT", "q11"
        vld1.16     {q8}, [\alsrc1]!        @ S1 = next chunk; src1 += 16
    .endif
        vld1.16     {q9}, [\alsrc2]!        @ S2 = next chunk; src2 += 16
    .ifeqs "\qY", "q13"
        vld1.16     {q10}, [\alsrc3]!       @ S3 = next chunk; src3 += 16
    .endif

        __neon_eagle2x_16_16_right \qT, \qY

        sub         \counter, \counter, #8
        __neon_eagle2x_16_16_blend \qT, \qY

    @ q0-q2 are dead after the selection step and serve as scratch here.
#ifdef DO_BGR_TO_RGB
        bgr1555_to_rgb565 q14, q15, q0, q1, q2, \reg1
        bgr1555_to_rgb565 q6, q7, q0, q1, q2, \reg1
#endif

        vst2.16     {q14-q15}, [\aldst1]!   @ upper line; dst1 += 32
        cmp         \counter, #8
        vst2.16     {q6-q7}, [\aldst2]!     @ lower line; dst2 += 32
        bhi         2b                      @ repeat while more than 8 remain

    @ ---- tail: last 8 pixels -------------------------------------------------
    @ No further chunk is read; lane 0 of q8 / q9 / q10 is replaced by the last
    @ pixel so that the right neighbour of the final pixel is that pixel.
        __neon_eagle2x_16_16_left  \qT, \qY

    .ifeqs "\qT", "q11"
        vshr.u64    d16, d17, #(64-16)      @ S1[0] = S1[7]
    .endif
        vshr.u64    d18, d19, #(64-16)      @ S2[0] = S2[7]
    .ifeqs "\qY", "q13"
        vshr.u64    d20, d21, #(64-16)      @ S3[0] = S3[7]
    .endif

        __neon_eagle2x_16_16_right \qT, \qY
        __neon_eagle2x_16_16_blend \qT, \qY

#ifdef DO_BGR_TO_RGB
        bgr1555_to_rgb565 q14, q15, q8, q9, q10, \reg1
        bgr1555_to_rgb565 q6, q7, q8, q9, q10, \reg1
#endif

        vst2.16     {q14-q15}, [\aldst1]!   @ upper line; dst1 += 32
        vst2.16     {q6-q7}, [\aldst2]!     @ lower line; dst2 += 32

.endm
| 746 | \r |
@ 16-bit Eagle2x, first line of the image.
@ Passes q12 (the centre row) as the T register, so the line macro does not
@ read the row above and uses the current row in its place; the row below is
@ taken from q13 as usual.  All other arguments are forwarded unchanged.
.macro _neon_eagle2x_16_16_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_eagle2x_16_16_line   \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 750 | \r |
@ 16-bit Eagle2x, any line that has both a row above and a row below.
@ T comes from q11 and Y from q13, so the line macro reads all three source
@ rows.  All other arguments are forwarded unchanged.
.macro _neon_eagle2x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_eagle2x_16_16_line   \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 754 | \r |
@ 16-bit Eagle2x, last line of the image.
@ Passes q12 (the centre row) as the Y register, so the line macro does not
@ read the row below and uses the current row in its place; the row above is
@ taken from q11 as usual.  All other arguments are forwarded unchanged.
.macro _neon_eagle2x_16_16_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_eagle2x_16_16_line   \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 758 | \r |
@ Entry point for one line of 16-bit Eagle2x.
@
@   part          first, middle or last: picks _neon_eagle2x_16_16_line_<part>
@   src1..reg1    forwarded unchanged to the selected macro
@   srcalign16    assemble-time flag; when non-zero the source operands used
@                 for the aligned loads are passed with an alignment marker
@   dstalign32    assemble-time flag; when non-zero the destination operands
@                 used for the aligned stores are passed with an alignment
@                 marker
@
@ With a flag equal to zero the plain register is passed for that side.
@
@ NOTE(review): the source-aligned cases spell the markers as the separate
@ tokens A128 and A256, while the destination-only case here and the 8-bit
@ entry point use the :128 / :256 qualifier directly.  A128 and A256 are not
@ defined in the visible part of this file - presumably preprocessor macros
@ that expand to the qualifier; confirm before unifying the spelling.
.macro neon_eagle2x_16_16_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
    .ifne \srcalign16

        .ifne \dstalign32
            _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1 A128, \src2 A128, \src3 A128, \dst1 A256, \dst2 A256
        .else
            _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1 A128, \src2 A128, \src3 A128, \dst1, \dst2
        .endif

    .else

        .ifne \dstalign32
            _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
        .else
            _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
        .endif

    .endif
.endm
| 778 | \r |