| 1 | @@\r |
| 2 | @@ Copyright (C) 2012 Roman Pauer\r |
| 3 | @@\r |
| 4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r |
| 5 | @@ this software and associated documentation files (the "Software"), to deal in\r |
| 6 | @@ the Software without restriction, including without limitation the rights to\r |
| 7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r |
| 8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r |
| 9 | @@ so, subject to the following conditions:\r |
| 10 | @@\r |
| 11 | @@ The above copyright notice and this permission notice shall be included in all\r |
| 12 | @@ copies or substantial portions of the Software.\r |
| 13 | @@\r |
| 14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r |
| 15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r |
| 16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r |
| 17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r |
| 18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r |
| 19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r |
| 20 | @@ SOFTWARE.\r |
| 21 | @@\r |
| 22 | \r |
| 23 | \r |
| 24 | @ A B C --\ E0 E1\r |
| 25 | @ D E F --/ E2 E3\r |
| 26 | @ G H I\r |
| 27 | \r |
| 28 | @ q0 = E0 (tmp0)\r |
| 29 | @ q1 = E1 (tmp1)\r |
| 30 | @ q2 = E2 (tmp2)\r |
| 31 | @ q3 = E3 (tmp3)\r |
| 32 | @ q8 = S2prev\r |
| 33 | @ q9 = S2next\r |
| 34 | @ q10 = C0 < B == H || D == F >\r |
| 35 | @ q11 = S1 < B >\r |
| 36 | @ q12 = S2 < E >\r |
| 37 | @ q13 = S3 < H >\r |
| 38 | @ q14 = S2sl < D >\r |
| 39 | @ q15 = S2sr < F >\r |
| 40 | \r |
| 41 | \r |
@-----------------------------------------------------------------------
@ __neon_scale2x_8_8_line
@
@ Scale one horizontal line of 8bpp pixels by 2x (scale2x / AdvMAME2x)
@ into two destination lines, 16 pixels per inner-loop iteration.
@
@ For each pixel E with neighbours B (above), D (left), F (right),
@ H (below), scale2x computes:
@   if (B != H && D != F) {            @ i.e. !C0, C0 = (B==H || D==F)
@       E0 = (D == B) ? D : E;  E1 = (B == F) ? F : E;
@       E2 = (D == H) ? D : E;  E3 = (H == F) ? F : E;
@   } else E0 = E1 = E2 = E3 = E;
@ The code below evaluates this as E0 = (C0 || !(D==B)) ? E : D, etc.
@
@ Arguments:
@   src1,src2,src3 - previous / current / next source line pointers
@   counter        - pixel count; NOTE(review): must be >= 16 (the
@                    aligned tail always emits 16 pixels) - confirm
@                    against callers
@   dst1, dst2     - upper / lower destination line pointers
@   reg1           - scratch ARM register
@   qB, qH         - q11/q13 to load B/H from src1/src3; pass q12
@                    instead to replicate E (top/bottom image edge)
@   al*            - same pointers, optionally carrying :128/:256
@                    alignment qualifiers for the aligned main loop
@
@ Clobbers: q0-q3, q8-q15 (see register map above), reg1, counter,
@ condition flags. Uses numeric local labels 1 (aligned entry) and
@ 2 (inner loop).
@-----------------------------------------------------------------------
.macro __neon_scale2x_8_8_line src1, src2, src3, counter, dst1, dst2, reg1, qB, qH, alsrc1, alsrc2, alsrc3, aldst1, aldst2

        vld1.8 {d17[7]}, [\src2]        @ S2prev[15] = src[0] (left edge: D of pixel 0 is pixel 0)
        andS \reg1, \counter, #15       @ reg1 = counter & 15; sets Z flag

    .ifnes "\qB", "q11"
        @ B is replicated from E (qB == q12); still advance src1 so the
        @ caller's per-line pointer bookkeeping stays consistent
        add \src1, \src1, \counter      @ src1 += counter
    .endif
    .ifnes "\qH", "q13"
        add \src3, \src3, \counter      @ src3 += counter
    .endif
        beq 1f                          @ counter already a multiple of 16

@ first 1-15 pixels - align counter to 16 bytes
        vld1.8 {q12}, [\src2], \reg1    @ S2 = [src] < E >; src2 += counter & 15

    .ifeqs "\qB", "q11"
        vld1.8 {\qB}, [\src1], \reg1    @ S1 = [src - srcstride] < B >; src1 += counter & 15
    .endif

    .ifeqs "\qH", "q13"
        vld1.8 {\qH}, [\src3], \reg1    @ S3 = [src + srcstride] < H >; src3 += counter & 15
    .endif
        vext.8 q14, q8, q12, #15        @ S2sl = S2prev[15] | (S2 << 8) < D >

        vceq.i8 q2, \qB, \qH            @ tmp2 = < B == H >

        vmov.8 d17[7], \reg1            @ S2prev[15] = reg1 (vtbl index used below)
        vext.8 q15, q12, q9, #1         @ S2sr = (S2 >> 8) | ... < F >
                                        @ (lanes >= reg1 hold stale q9 bytes; the output
                                        @ they produce is overwritten by the aligned loop)

        vceq.i8 q0, q14, \qB            @ tmp0 = < D == B >

        vceq.i8 q3, q14, q15            @ tmp3 = < D == F >

        vceq.i8 q1, \qB, q15            @ tmp1 = < B == F >
        vtbl.8 d17, {d28, d29}, d17     @ S2prev[15] = S2sl[reg1] = src[reg1 - 1]
                                        @ (seed D for the first aligned iteration)

        lsl \reg1, #1                   @ reg1 *= 2: output bytes per input pixel
        vorr q10, q2, q3                @ C0 = < B == H || D == F >

        vceq.i8 q2, q14, \qH            @ tmp2 = < D == H >

        vceq.i8 q3, \qH, q15            @ tmp3 = < H == F >

        vorn q0, q10, q0                @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14               @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15               @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14               @ E2 = < (C0 || !(D == H)) ? E : D >
        vst2.8 {q0-q1}, [\dst1], \reg1  @ [dst] = E0,E1 interleaved; dst1 += reg1

        vbsl q3, q12, q15               @ E3 = < (C0 || !(H == F)) ? E : F >
        bic \counter, \counter, #15     @ counter rounded down to a multiple of 16

        vst2.8 {q2-q3}, [\dst2], \reg1  @ [dst + dststride] = E2,E3; dst2 += reg1

@ counter is aligned to 16 bytes

1:
        vld1.8 {q9}, [\alsrc2]!         @ S2next = [src]; src2 += 16

@ inner loop (16 pixels per iteration)
2:

        vmov q12, q9                    @ S2 = S2next < E >
    .ifeqs "\qB", "q11"
        vld1.8 {\qB}, [\alsrc1]!        @ S1 = [src - srcstride] < B >; src1 += 16
    .endif

    .ifeqs "\qH", "q13"
        vld1.8 {\qH}, [\alsrc3]!        @ S3 = [src + srcstride] < H >; src3 += 16
    .endif

        vext.8 q14, q8, q12, #15        @ S2sl = S2prev[15] | (S2 << 8) < D >
        vld1.8 {q9}, [\alsrc2]!         @ S2next = [src]; src2 += 16

        vceq.i8 q2, \qB, \qH            @ tmp2 = < B == H >

        vmov q8, q12                    @ S2prev = S2 (carries D across iterations)
        vext.8 q15, q12, q9, #1         @ S2sr = (S2 >> 8) | S2next[0] < F >

        vceq.i8 q0, q14, \qB            @ tmp0 = < D == B >

        vceq.i8 q3, q14, q15            @ tmp3 = < D == F >

        vceq.i8 q1, \qB, q15            @ tmp1 = < B == F >

        sub \counter, \counter, #16     @ counter -= 16

        vorr q10, q2, q3                @ C0 = < B == H || D == F >

        vceq.i8 q2, q14, \qH            @ tmp2 = < D == H >

        vceq.i8 q3, \qH, q15            @ tmp3 = < H == F >

        vorn q0, q10, q0                @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14               @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15               @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14               @ E2 = < (C0 || !(D == H)) ? E : D >
        vst2.8 {q0-q1}, [\aldst1]!      @ [dst] = E0,E1; dst1 += 2*16

        cmp \counter, #16               @ more than one 16-pixel group left?

        vbsl q3, q12, q15               @ E3 = < (C0 || !(H == F)) ? E : F >

        vst2.8 {q2-q3}, [\aldst2]!      @ [dst + dststride] = E2,E3; dst2 += 2*16

        bhi 2b

@ last 16 pixels (right edge: F of the final pixel is the pixel itself)

        vmov q12, q9                    @ S2 = S2next < E >

        vshr.u64 d18, d19, #(64-8)      @ S2next[0] = S2[15], so S2sr[15] = S2[15]
    .ifeqs "\qB", "q11"
        vld1.8 {\qB}, [\alsrc1]!        @ S1 = [src - srcstride] < B >; src1 += 16
    .endif

        vext.8 q14, q8, q12, #15        @ S2sl = S2prev[15] | (S2 << 8) < D >

        vext.8 q15, q12, q9, #1         @ S2sr = (S2 >> 8) | S2next[0] < F >
    .ifeqs "\qH", "q13"
        vld1.8 {\qH}, [\alsrc3]!        @ S3 = [src + srcstride] < H >; src3 += 16
    .endif

        vceq.i8 q0, q14, \qB            @ tmp0 = < D == B >

        vceq.i8 q2, \qB, \qH            @ tmp2 = < B == H >

        vceq.i8 q3, q14, q15            @ tmp3 = < D == F >

        vceq.i8 q1, \qB, q15            @ tmp1 = < B == F >

        vorr q10, q2, q3                @ C0 = < B == H || D == F >

        vceq.i8 q2, q14, \qH            @ tmp2 = < D == H >

        vceq.i8 q3, \qH, q15            @ tmp3 = < H == F >

        vorn q0, q10, q0                @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14               @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15               @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14               @ E2 = < (C0 || !(D == H)) ? E : D >
        vst2.8 {q0-q1}, [\aldst1]!      @ [dst] = E0,E1; dst1 += 2*16

        vbsl q3, q12, q15               @ E3 = < (C0 || !(H == F)) ? E : F >

        vst2.8 {q2-q3}, [\aldst2]!      @ [dst + dststride] = E2,E3; dst2 += 2*16

.endm
| 218 | \r |
@ First image line: no line above exists, so B is replicated from E by
@ passing q12 (= S2) as qB; H (q13) is loaded normally from src3.
.macro _neon_scale2x_8_8_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 222 | \r |
@ Interior image line: both B (q11, from src1) and H (q13, from src3)
@ are real neighbour lines.
.macro _neon_scale2x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 226 | \r |
@ Last image line: no line below exists, so H is replicated from E by
@ passing q12 (= S2) as qH; B (q11) is loaded normally from src1.
.macro _neon_scale2x_8_8_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 230 | \r |
@ neon_scale2x_8_8_line
@ Public entry: dispatch to _neon_scale2x_8_8_line_<part>
@ (part = first/middle/last), attaching NEON address-alignment
@ qualifiers to the pointers used in the aligned main loop when the
@ caller guarantees them: ":128" on sources (16-byte aligned),
@ ":256" on destinations (32-byte aligned).
@ srcalign16 / dstalign32 are assemble-time constants (0 = unaligned).
.macro neon_scale2x_8_8_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
    .ifeq \srcalign16

        .ifeq \dstalign32
            _neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
        .else
            _neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
        .endif

    .else

        .ifeq \dstalign32
            _neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
        .else
            _neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
        .endif

    .endif
.endm
| 250 | \r |
| 251 | \r |
@-----------------------------------------------------------------------
@ __neon_scale2x_16_16_line
@
@ 16bpp variant of __neon_scale2x_8_8_line: scale one line of 16bpp
@ pixels by 2x (scale2x / AdvMAME2x) into two destination lines,
@ 8 pixels per inner-loop iteration. Same E0..E3 selection logic as
@ the 8bpp macro, using .i16 compares and 2-byte vext shifts.
@
@ If DO_BGR_TO_RGB is defined (C preprocessor - this file must be run
@ through cpp), every result is additionally converted with the
@ bgr1555_to_rgb565 macro, which is defined elsewhere - NOTE(review):
@ confirm it only clobbers the registers passed to it.
@
@ Arguments:
@   src1,src2,src3 - previous / current / next source line pointers
@   counter        - pixel count; NOTE(review): must be >= 8 (the
@                    aligned tail always emits 8 pixels) - confirm
@                    against callers
@   dst1, dst2     - upper / lower destination line pointers
@   reg1           - scratch ARM register
@   qB, qH         - q11/q13 to load B/H from src1/src3; pass q12
@                    instead to replicate E (top/bottom image edge)
@   al*            - same pointers, optionally carrying alignment
@                    qualifiers for the aligned main loop
@
@ Clobbers: q0-q3, q8-q15, reg1, counter, condition flags.
@ Uses numeric local labels 1 (aligned entry) and 2 (inner loop).
@-----------------------------------------------------------------------
.macro __neon_scale2x_16_16_line src1, src2, src3, counter, dst1, dst2, reg1, qB, qH, alsrc1, alsrc2, alsrc3, aldst1, aldst2

        vld1.16 {d17[3]}, [\src2]       @ S2prev[7] = src[0] (left edge: D of pixel 0 is pixel 0)
        andS \reg1, \counter, #7        @ reg1 = counter & 7; sets Z flag

    .ifnes "\qB", "q11"
        @ B is replicated from E (qB == q12); still advance src1 so the
        @ caller's per-line pointer bookkeeping stays consistent
        add \src1, \src1, \counter, lsl #1  @ src1 += 2 * counter
    .endif
    .ifnes "\qH", "q13"
        add \src3, \src3, \counter, lsl #1  @ src3 += 2 * counter
    .endif
        beq 1f                          @ counter already a multiple of 8

@ first 1-7 pixels - align counter to 16 bytes
        vld1.16 {q12}, [\src2]          @ S2 = [src] < E >
        lsl \reg1, #1                   @ reg1 = 2*(counter & 7): byte index of pixel counter&7

    .ifeqs "\qB", "q11"
        vld1.16 {\qB}, [\src1]          @ S1 = [src - srcstride] < B >
    .endif
        bfi \reg1, \reg1, #8, #8        @ duplicate low byte into bits 8-15

    .ifeqs "\qH", "q13"
        vld1.16 {\qH}, [\src3]          @ S3 = [src + srcstride] < H >
    .endif
        vext.8 q14, q8, q12, #14        @ S2sl = S2prev[7] | (S2 << 16) < D >

        add \reg1, \reg1, #256          @ reg1 bytes now (2c, 2c+1): vtbl indices
                                        @ of the two bytes of pixel src[c - 1]
        vceq.i16 q2, \qB, \qH           @ tmp2 = < B == H >

        vmov.16 d17[3], \reg1           @ S2prev[7] = packed table indices
        vext.8 q15, q12, q9, #2         @ S2sr = (S2 >> 16) | ... < F >
                                        @ (upper lanes hold stale q9 bytes; their output
                                        @ is overwritten by the aligned loop)

        vceq.i16 q0, q14, \qB           @ tmp0 = < D == B >

        vceq.i16 q3, q14, q15           @ tmp3 = < D == F >

        vceq.i16 q1, \qB, q15           @ tmp1 = < B == F >
        vtbl.8 d17, {d28, d29}, d17     @ S2prev[7] = S2sl[pixel c] = src[c - 1]
                                        @ (seed D for the first aligned iteration)

        vorr q10, q2, q3                @ C0 = < B == H || D == F >

        vceq.i16 q2, q14, \qH           @ tmp2 = < D == H >

        vceq.i16 q3, \qH, q15           @ tmp3 = < H == F >

        vorn q0, q10, q0                @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14               @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15               @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14               @ E2 = < (C0 || !(D == H)) ? E : D >

        vbsl q3, q12, q15               @ E3 = < (C0 || !(H == F)) ? E : F >

#ifdef DO_BGR_TO_RGB
        bgr1555_to_rgb565 q0, q1, q12, q14, q15, \reg1
        bgr1555_to_rgb565 q2, q3, q12, q14, q15, \reg1
#endif

        and \reg1, \counter, #7         @ recompute counter & 7 (reg1 was consumed above)

        vst2.16 {q0-q1}, [\dst1]        @ [dst] = E0,E1 interleaved

        bic \counter, \counter, #7      @ counter rounded down to a multiple of 8
    .ifeqs "\qB", "q11"
        add \src1, \src1, \reg1, lsl #1 @ advance pointers past the unaligned pixels:
    .endif
        add \src2, \src2, \reg1, lsl #1 @ 2 bytes per source pixel
    .ifeqs "\qH", "q13"
        add \src3, \src3, \reg1, lsl #1
    .endif

        vst2.16 {q2-q3}, [\dst2]        @ [dst + dststride] = E2,E3

        add \dst1, \dst1, \reg1, lsl #2 @ 4 output bytes per source pixel
        add \dst2, \dst2, \reg1, lsl #2

@ counter is aligned to 16 bytes

1:
        vld1.16 {q9}, [\alsrc2]!        @ S2next = [src]; src2 += 2*8

@ inner loop (8 pixels per iteration)
2:

        vmov q12, q9                    @ S2 = S2next < E >
    .ifeqs "\qB", "q11"
        vld1.16 {\qB}, [\alsrc1]!       @ S1 = [src - srcstride] < B >; src1 += 2*8
    .endif

    .ifeqs "\qH", "q13"
        vld1.16 {\qH}, [\alsrc3]!       @ S3 = [src + srcstride] < H >; src3 += 2*8
    .endif

        vext.8 q14, q8, q12, #14        @ S2sl = S2prev[7] | (S2 << 16) < D >
        vld1.16 {q9}, [\alsrc2]!        @ S2next = [src]; src2 += 2*8

        vceq.i16 q2, \qB, \qH           @ tmp2 = < B == H >

        vmov q8, q12                    @ S2prev = S2 (carries D across iterations)
        vext.8 q15, q12, q9, #2         @ S2sr = (S2 >> 16) | S2next[0] < F >

        vceq.i16 q0, q14, \qB           @ tmp0 = < D == B >

        vceq.i16 q3, q14, q15           @ tmp3 = < D == F >

        vceq.i16 q1, \qB, q15           @ tmp1 = < B == F >

        sub \counter, \counter, #8      @ counter -= 8

        vorr q10, q2, q3                @ C0 = < B == H || D == F >

        vceq.i16 q2, q14, \qH           @ tmp2 = < D == H >

        vceq.i16 q3, \qH, q15           @ tmp3 = < H == F >

        vorn q0, q10, q0                @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14               @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15               @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14               @ E2 = < (C0 || !(D == H)) ? E : D >

        vbsl q3, q12, q15               @ E3 = < (C0 || !(H == F)) ? E : F >

#ifdef DO_BGR_TO_RGB
        bgr1555_to_rgb565 q0, q1, q12, q14, q15, \reg1
        bgr1555_to_rgb565 q2, q3, q12, q14, q15, \reg1
#endif

        vst2.16 {q0-q1}, [\aldst1]!     @ [dst] = E0,E1; dst1 += 2*2*8

        cmp \counter, #8                @ more than one 8-pixel group left?

        vst2.16 {q2-q3}, [\aldst2]!     @ [dst + dststride] = E2,E3; dst2 += 2*2*8

        bhi 2b

@ last 8 pixels (right edge: F of the final pixel is the pixel itself)

        vmov q12, q9                    @ S2 = S2next < E >

        vshr.u64 d18, d19, #(64-16)     @ S2next[0] = S2[7], so S2sr[7] = S2[7]
    .ifeqs "\qB", "q11"
        vld1.16 {\qB}, [\alsrc1]!       @ S1 = [src - srcstride] < B >; src1 += 2*8
    .endif

        vext.8 q14, q8, q12, #14        @ S2sl = S2prev[7] | (S2 << 16) < D >

        vext.8 q15, q12, q9, #2         @ S2sr = (S2 >> 16) | S2next[0] < F >
    .ifeqs "\qH", "q13"
        vld1.16 {\qH}, [\alsrc3]!       @ S3 = [src + srcstride] < H >; src3 += 2*8
    .endif

        vceq.i16 q0, q14, \qB           @ tmp0 = < D == B >

        vceq.i16 q2, \qB, \qH           @ tmp2 = < B == H >

        vceq.i16 q3, q14, q15           @ tmp3 = < D == F >

        vceq.i16 q1, \qB, q15           @ tmp1 = < B == F >

        vorr q10, q2, q3                @ C0 = < B == H || D == F >

        vceq.i16 q2, q14, \qH           @ tmp2 = < D == H >

        vceq.i16 q3, \qH, q15           @ tmp3 = < H == F >

        vorn q0, q10, q0                @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14               @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15               @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14               @ E2 = < (C0 || !(D == H)) ? E : D >

        vbsl q3, q12, q15               @ E3 = < (C0 || !(H == F)) ? E : F >

#ifdef DO_BGR_TO_RGB
        bgr1555_to_rgb565 q0, q1, q12, q14, q15, \reg1
        bgr1555_to_rgb565 q2, q3, q12, q14, q15, \reg1
#endif

        vst2.16 {q0-q1}, [\aldst1]!     @ [dst] = E0,E1; dst1 += 2*2*8

        vst2.16 {q2-q3}, [\aldst2]!     @ [dst + dststride] = E2,E3; dst2 += 2*2*8

.endm
| 461 | \r |
@ First image line (16bpp): no line above exists, so B is replicated
@ from E by passing q12 (= S2) as qB; H (q13) is loaded from src3.
.macro _neon_scale2x_16_16_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 465 | \r |
@ Interior image line (16bpp): both B (q11, from src1) and H (q13,
@ from src3) are real neighbour lines.
.macro _neon_scale2x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 469 | \r |
@ Last image line (16bpp): no line below exists, so H is replicated
@ from E by passing q12 (= S2) as qH; B (q11) is loaded from src1.
.macro _neon_scale2x_16_16_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
| 473 | \r |
@ neon_scale2x_16_16_line
@ Public entry: dispatch to _neon_scale2x_16_16_line_<part>
@ (part = first/middle/last), attaching NEON address-alignment
@ qualifiers to the pointers used in the aligned main loop when the
@ caller guarantees them: ":128" on sources (16-byte aligned),
@ ":256" on destinations (32-byte aligned).
@ srcalign16 / dstalign32 are assemble-time constants (0 = unaligned).
@
@ FIX(review): the aligned branches previously passed "\src1 A128" /
@ "\dst1 A256" (space + A128/A256), which is not valid GAS NEON
@ alignment syntax; restored the ":128"/":256" qualifiers to match the
@ 8bpp dispatcher neon_scale2x_8_8_line above.
.macro neon_scale2x_16_16_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
    .ifeq \srcalign16

        .ifeq \dstalign32
            _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
        .else
            _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
        .endif

    .else

        .ifeq \dstalign32
            _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
        .else
            _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
        .endif

    .endif
.endm
| 493 | \r |