e38fee1b |
1 | @@\r |
2 | @@ Copyright (C) 2012 Roman Pauer\r |
3 | @@\r |
4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r |
5 | @@ this software and associated documentation files (the "Software"), to deal in\r |
6 | @@ the Software without restriction, including without limitation the rights to\r |
7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r |
8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r |
9 | @@ so, subject to the following conditions:\r |
10 | @@\r |
11 | @@ The above copyright notice and this permission notice shall be included in all\r |
12 | @@ copies or substantial portions of the Software.\r |
13 | @@\r |
14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r |
15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r |
16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r |
17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r |
18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r |
19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r |
20 | @@ SOFTWARE.\r |
21 | @@\r |
22 | \r |
23 | \r |
24 | @ A B C --\ E0 E1\r |
25 | @ D E F --/ E2 E3\r |
26 | @ G H I\r |
27 | \r |
28 | @ q0 = E0 (tmp0)\r |
29 | @ q1 = E1 (tmp1)\r |
30 | @ q2 = E2 (tmp2)\r |
31 | @ q3 = E3 (tmp3)\r |
32 | @ q8 = S2prev\r |
33 | @ q9 = S2next\r |
34 | @ q10 = C0 < B == H || D == F >\r |
35 | @ q11 = S1 < B >\r |
36 | @ q12 = S2 < E >\r |
37 | @ q13 = S3 < H >\r |
38 | @ q14 = S2sl < D >\r |
39 | @ q15 = S2sr < F >\r |
40 | \r |
41 | \r |
@ Scale one line of 8bpp pixels to double width/height (Scale2x kernel).
@ Reads three source rows (B = above, E = current, H = below) and writes
@ two destination rows of 2*counter pixels each.
@
@ Arguments:
@   src1/src2/src3  - pointers to rows B / E / H (src2 is the row being scaled)
@   counter         - number of source pixels (= bytes at 8bpp); must be >= 16
@   dst1/dst2       - the two output rows
@   reg1            - scratch ARM register (clobbered)
@   qB, qH          - NEON registers holding rows B and H. Pass q11/q13 to
@                     load them from src1/src3; pass q12 instead to alias the
@                     row to E (used at the top/bottom image border where the
@                     neighbouring row does not exist).
@   alsrc*/aldst*   - same pointers, optionally carrying :128/:256 alignment
@                     qualifiers, used once the pointers are 16-byte aligned.
@
@ Register roles (see also the table at the top of the file):
@   q0-q3 = output pixels E0..E3, q8 = S2prev, q9 = S2next, q10 = C0,
@   q11-q13 = S1/S2/S3, q14 = S2 shifted left (D), q15 = S2 shifted right (F).
.macro __neon_scale2x_8_8_line src1, src2, src3, counter, dst1, dst2, reg1, qB, qH, alsrc1, alsrc2, alsrc3, aldst1, aldst2

vld1.8 {d17[7]}, [\src2] @ S2prev[15] = src[0] (left-border pixel replication)
andS \reg1, \counter, #15 @ reg1 = counter & 15; sets Z flag for beq below

@ When a border row is aliased to E (qB/qH != q11/q13) its pointer is not
@ advanced by the loads below, so pre-advance it past the whole line here.
@ These adds do not touch the flags set by andS.
.ifnes "\qB", "q11"
add \src1, \src1, \counter @ src1 += counter
.endif
.ifnes "\qH", "q13"
add \src3, \src3, \counter @ src3 += counter
.endif
beq 1f @ counter already a multiple of 16 -> skip the unaligned head

@ first 1-15 pixels - align counter to 16 bytes
vld1.8 {q12}, [\src2], \reg1 @ S2 = [src] < E >; src2 += counter & 15

.ifeqs "\qB", "q11"
vld1.8 {\qB}, [\src1], \reg1 @ S1 = [src - srcstride] < B >; src1 += counter & 15
.endif

.ifeqs "\qH", "q13"
vld1.8 {\qH}, [\src3], \reg1 @ S3 = [src + srcstride] < H >; src3 += counter & 15
.endif
vext.8 q14, q8, q12, #15 @ S2sl = S2prev[15] | (S2 << 8) < D >

vceq.i8 q2, \qB, \qH @ tmp2 = < B == H >

vmov.8 d17[7], \reg1 @ S2prev[15] = reg1 (vtbl index of last head pixel + 1)
vext.8 q15, q12, q9, #1 @ S2sr = (S2 >> 8) | ... < F >

vceq.i8 q0, q14, \qB @ tmp0 = < D == B >

vceq.i8 q3, q14, q15 @ tmp3 = < D == F >

vceq.i8 q1, \qB, q15 @ tmp1 = < B == F >
vtbl.8 d17, {d28, d29}, d17 @ S2prev[15] = src[reg1 - 1] (S2sl[i] = src[i-1])

lsl \reg1, #1 @ reg1 = 2*(counter & 15) = output bytes per dst row
vorr q10, q2, q3 @ C0 = < B == H || D == F >

vceq.i8 q2, q14, \qH @ tmp2 = < D == H >

vceq.i8 q3, \qH, q15 @ tmp3 = < H == F >

vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >

vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >

vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >

vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >

vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >

vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >

vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.8 {q0-q1}, [\dst1], \reg1 @ [dst] = E0,E1; dst1 += reg1

vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >
bic \counter, \counter, #15 @ remaining pixels, now a multiple of 16

vst2.8 {q2-q3}, [\dst2], \reg1 @ [dst + dststride] = E2,E3; dst2 += reg1

@ counter is aligned to 16 bytes

1:
vld1.8 {q9}, [\alsrc2]! @ S2next = [src]; src2 += 16

@ inner loop (16 pixels per iteration)
2:

vmov q12, q9 @ S2 = S2next < E >
.ifeqs "\qB", "q11"
vld1.8 {\qB}, [\alsrc1]! @ S1 = [src - srcstride] < B >; src1 += 16
.endif

.ifeqs "\qH", "q13"
vld1.8 {\qH}, [\alsrc3]! @ S3 = [src + srcstride] < H >; src3 += 16
.endif

vext.8 q14, q8, q12, #15 @ S2sl = S2prev[15] | (S2 << 8) < D >
vld1.8 {q9}, [\alsrc2]! @ S2next = [src]; src2 += 16

vceq.i8 q2, \qB, \qH @ tmp2 = < B == H >

vmov q8, q12 @ S2prev = S2
vext.8 q15, q12, q9, #1 @ S2sr = (S2 >> 8) | S2next[0] < F >

vceq.i8 q0, q14, \qB @ tmp0 = < D == B >

vceq.i8 q3, q14, q15 @ tmp3 = < D == F >

vceq.i8 q1, \qB, q15 @ tmp1 = < B == F >

sub \counter, \counter, #16 @ counter -= 16

vorr q10, q2, q3 @ C0 = < B == H || D == F >

vceq.i8 q2, q14, \qH @ tmp2 = < D == H >

vceq.i8 q3, \qH, q15 @ tmp3 = < H == F >

vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >

vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >

vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >

vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >

vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >

vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >

vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.8 {q0-q1}, [\aldst1]! @ [dst] = E0,E1; dst1 += 2*16

cmp \counter, #16 @ more than one 16-pixel chunk left?

vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >

vst2.8 {q2-q3}, [\aldst2]! @ [dst + dststride] = E2,E3; dst2 += 2*16

bhi 2b @ loop while counter > 16; final chunk handled below

@ last 16 pixels (no S2next read past the end of the line)

vmov q12, q9 @ S2 = S2next < E >

vshr.u64 d18, d19, #(64-8) @ S2next[0] = S2[15] | ... (replicate right border)
.ifeqs "\qB", "q11"
vld1.8 {\qB}, [\alsrc1]! @ S1 = [src - srcstride] < B >; src1 += 16
.endif

vext.8 q14, q8, q12, #15 @ S2sl = S2prev[15] | (S2 << 8) < D >

vext.8 q15, q12, q9, #1 @ S2sr = (S2 >> 8) | S2next[0] < F >
.ifeqs "\qH", "q13"
vld1.8 {\qH}, [\alsrc3]! @ S3 = [src + srcstride] < H >; src3 += 16
.endif

vceq.i8 q0, q14, \qB @ tmp0 = < D == B >

vceq.i8 q2, \qB, \qH @ tmp2 = < B == H >

vceq.i8 q3, q14, q15 @ tmp3 = < D == F >

vceq.i8 q1, \qB, q15 @ tmp1 = < B == F >

vorr q10, q2, q3 @ C0 = < B == H || D == F >

vceq.i8 q2, q14, \qH @ tmp2 = < D == H >

vceq.i8 q3, \qH, q15 @ tmp3 = < H == F >

vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >

vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >

vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >

vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >

vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >

vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >

vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.8 {q0-q1}, [\aldst1]! @ [dst] = E0,E1; dst1 += 2*16

vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >

vst2.8 {q2-q3}, [\aldst2]! @ [dst + dststride] = E2,E3; dst2 += 2*16

.endm
218 | \r |
@ 8bpp, first image line: there is no row above, so row B is aliased to
@ row E by passing q12 (S2) as qB; qH stays q13 so row H loads from src3.
.macro _neon_scale2x_8_8_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
__neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
222 | \r |
@ 8bpp, interior image line: both neighbour rows exist, so row B loads into
@ q11 (from src1) and row H loads into q13 (from src3).
.macro _neon_scale2x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
__neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
226 | \r |
@ 8bpp, last image line: there is no row below, so row H is aliased to
@ row E by passing q12 (S2) as qH; qB stays q11 so row B loads from src1.
.macro _neon_scale2x_8_8_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
__neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
230 | \r |
@ 8bpp line dispatcher. srcalign16 / dstalign32 are assembly-time constants;
@ when non-zero, the aligned-pointer copies passed to the worker macro carry
@ :128 (source) / :256 (destination) alignment qualifiers so the inner-loop
@ vld1/vst2 instructions can use the faster aligned forms.
@ part selects the border variant: first / middle / last.
.macro neon_scale2x_8_8_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
.ifeq \dstalign32

@ destination not 32-byte aligned
.ifeq \srcalign16
_neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
.else
_neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
.endif

.else

@ destination 32-byte aligned
.ifeq \srcalign16
_neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
.else
_neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
.endif

.endif
.endm
250 | \r |
251 | \r |
@ Scale one line of 16bpp pixels to double width/height (Scale2x kernel).
@ Same structure as the 8bpp variant above, but operates on 16-bit pixels:
@ 8 pixels (16 bytes) per NEON iteration, vext shifts by 2 bytes, and all
@ comparisons use .i16 element size.
@
@ Arguments:
@   src1/src2/src3  - pointers to rows B / E / H (src2 is the row being scaled)
@   counter         - number of source pixels (2 bytes each); must be >= 8
@   dst1/dst2       - the two output rows
@   reg1            - scratch ARM register (clobbered)
@   qB, qH          - NEON registers holding rows B and H. Pass q11/q13 to
@                     load them from src1/src3; pass q12 to alias the row to
@                     E at the top/bottom image border.
@   alsrc*/aldst*   - same pointers, optionally carrying :128/:256 alignment
@                     qualifiers, used once the pointers are 16-byte aligned.
.macro __neon_scale2x_16_16_line src1, src2, src3, counter, dst1, dst2, reg1, qB, qH, alsrc1, alsrc2, alsrc3, aldst1, aldst2

vld1.16 {d17[3]}, [\src2] @ S2prev[7] = src[0] (left-border pixel replication)
andS \reg1, \counter, #7 @ reg1 = counter & 7; sets Z flag for beq below

@ Border rows aliased to E are not advanced by the loads below, so
@ pre-advance their pointers past the whole line (2 bytes per pixel).
@ These adds do not touch the flags set by andS.
.ifnes "\qB", "q11"
add \src1, \src1, \counter, lsl #1 @ src1 += 2 * counter
.endif
.ifnes "\qH", "q13"
add \src3, \src3, \counter, lsl #1 @ src3 += 2 * counter
.endif
beq 1f @ counter already a multiple of 8 -> skip the unaligned head

@ first 1-7 pixels - align counter to 16 bytes
vld1.16 {q12}, [\src2] @ S2 = [src] < E >
lsl \reg1, #1 @ reg1 = 2*(counter & 7) = byte offset of first aligned pixel

.ifeqs "\qB", "q11"
vld1.16 {\qB}, [\src1] @ S1 = [src - srcstride] < B >
.endif
bfi \reg1, \reg1, #8, #8 @ reg1 bytes 0,1 = {2k, 2k}  (vtbl byte indices)

.ifeqs "\qH", "q13"
vld1.16 {\qH}, [\src3] @ S3 = [src + srcstride] < H >
.endif
vext.8 q14, q8, q12, #14 @ S2sl = S2prev[7] | (S2 << 16) < D >

add \reg1, \reg1, #256 @ reg1 bytes 0,1 = {2k, 2k+1} -> selects pixel k-1
vceq.i16 q2, \qB, \qH @ tmp2 = < B == H >

vmov.16 d17[3], \reg1 @ S2prev[7] = reg1 (pair of vtbl byte indices)
vext.8 q15, q12, q9, #2 @ S2sr = (S2 >> 16) | ... < F >

vceq.i16 q0, q14, \qB @ tmp0 = < D == B >

vceq.i16 q3, q14, q15 @ tmp3 = < D == F >

vceq.i16 q1, \qB, q15 @ tmp1 = < B == F >
vtbl.8 d17, {d28, d29}, d17 @ S2prev[7] = src[reg1 - 1] (S2sl[i] = src[i-1])

vorr q10, q2, q3 @ C0 = < B == H || D == F >
and \reg1, \counter, #7 @ reg1 = head pixel count again (flags now free)

vceq.i16 q2, q14, \qH @ tmp2 = < D == H >

vceq.i16 q3, \qH, q15 @ tmp3 = < H == F >

vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >

vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >

vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >

vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >

vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >

vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >

vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.16 {q0-q1}, [\dst1] @ [dst] = E0,E1

vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >

@ advance all pointers past the head: 2 bytes per source pixel,
@ 4 bytes per source pixel on the doubled destination rows
bic \counter, \counter, #7 @ remaining pixels, now a multiple of 8
.ifeqs "\qB", "q11"
add \src1, \src1, \reg1, lsl #1
.endif
add \src2, \src2, \reg1, lsl #1
.ifeqs "\qH", "q13"
add \src3, \src3, \reg1, lsl #1
.endif

vst2.16 {q2-q3}, [\dst2] @ [dst + dststride] = E2,E3

add \dst1, \dst1, \reg1, lsl #2
add \dst2, \dst2, \reg1, lsl #2

@ counter is aligned to 16 bytes

1:
vld1.16 {q9}, [\alsrc2]! @ S2next = [src]; src2 += 2*8

@ inner loop (8 pixels per iteration)
2:

vmov q12, q9 @ S2 = S2next < E >
.ifeqs "\qB", "q11"
vld1.16 {\qB}, [\alsrc1]! @ S1 = [src - srcstride] < B >; src1 += 2*8
.endif

.ifeqs "\qH", "q13"
vld1.16 {\qH}, [\alsrc3]! @ S3 = [src + srcstride] < H >; src3 += 2*8
.endif

vext.8 q14, q8, q12, #14 @ S2sl = S2prev[7] | (S2 << 16) < D >
vld1.16 {q9}, [\alsrc2]! @ S2next = [src]; src2 += 2*8

vceq.i16 q2, \qB, \qH @ tmp2 = < B == H >

vmov q8, q12 @ S2prev = S2
vext.8 q15, q12, q9, #2 @ S2sr = (S2 >> 16) | S2next[0] < F >

vceq.i16 q0, q14, \qB @ tmp0 = < D == B >

vceq.i16 q3, q14, q15 @ tmp3 = < D == F >

vceq.i16 q1, \qB, q15 @ tmp1 = < B == F >

sub \counter, \counter, #8 @ counter -= 8

vorr q10, q2, q3 @ C0 = < B == H || D == F >

vceq.i16 q2, q14, \qH @ tmp2 = < D == H >

vceq.i16 q3, \qH, q15 @ tmp3 = < H == F >

vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >

vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >

vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >

vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >

vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >

vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >

vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.16 {q0-q1}, [\aldst1]! @ [dst] = E0,E1; dst1 += 2*2*8

cmp \counter, #8 @ more than one 8-pixel chunk left?

vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >

vst2.16 {q2-q3}, [\aldst2]! @ [dst + dststride] = E2,E3; dst2 += 2*2*8

bhi 2b @ loop while counter > 8; final chunk handled below

@ last 8 pixels (no S2next read past the end of the line)

vmov q12, q9 @ S2 = S2next < E >

vshr.u64 d18, d19, #(64-16) @ S2next[0] = S2[7] | ... (replicate right border)
.ifeqs "\qB", "q11"
vld1.16 {\qB}, [\alsrc1]! @ S1 = [src - srcstride] < B >; src1 += 2*8
.endif

vext.8 q14, q8, q12, #14 @ S2sl = S2prev[7] | (S2 << 16) < D >

vext.8 q15, q12, q9, #2 @ S2sr = (S2 >> 16) | S2next[0] < F >
.ifeqs "\qH", "q13"
vld1.16 {\qH}, [\alsrc3]! @ S3 = [src + srcstride] < H >; src3 += 2*8
.endif

vceq.i16 q0, q14, \qB @ tmp0 = < D == B >

vceq.i16 q2, \qB, \qH @ tmp2 = < B == H >

vceq.i16 q3, q14, q15 @ tmp3 = < D == F >

vceq.i16 q1, \qB, q15 @ tmp1 = < B == F >

vorr q10, q2, q3 @ C0 = < B == H || D == F >

vceq.i16 q2, q14, \qH @ tmp2 = < D == H >

vceq.i16 q3, \qH, q15 @ tmp3 = < H == F >

vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >

vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >

vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >

vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >

vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >

vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >

vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.16 {q0-q1}, [\aldst1]! @ [dst] = E0,E1; dst1 += 2*2*8

vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >

vst2.16 {q2-q3}, [\aldst2]! @ [dst + dststride] = E2,E3; dst2 += 2*2*8

.endm
442 | \r |
@ 16bpp, first image line: there is no row above, so row B is aliased to
@ row E by passing q12 (S2) as qB; qH stays q13 so row H loads from src3.
.macro _neon_scale2x_16_16_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
__neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
446 | \r |
@ 16bpp, interior image line: both neighbour rows exist, so row B loads into
@ q11 (from src1) and row H loads into q13 (from src3).
.macro _neon_scale2x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
__neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
450 | \r |
@ 16bpp, last image line: there is no row below, so row H is aliased to
@ row E by passing q12 (S2) as qH; qB stays q11 so row B loads from src1.
.macro _neon_scale2x_16_16_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
__neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
454 | \r |
@ 16bpp line dispatcher. srcalign16 / dstalign32 are assembly-time constants;
@ when non-zero, the aligned-pointer copies passed to the worker macro carry
@ :128 (source) / :256 (destination) alignment qualifiers so the inner-loop
@ vld1/vst2 instructions can use the faster aligned forms.
@ part selects the border variant: first / middle / last.
.macro neon_scale2x_16_16_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
.ifeq \dstalign32

@ destination not 32-byte aligned
.ifeq \srcalign16
_neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
.else
_neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
.endif

.else

@ destination 32-byte aligned
.ifeq \srcalign16
_neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
.else
_neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
.endif

.endif
.endm
474 | \r |