neon filters: support optional color space conversion
[libpicofe.git] / arm / neon_eagle2x.Sinc
CommitLineData
7fc3ac8a
H
1@@\r
2@@ Copyright (C) 2012 Roman Pauer\r
3@@\r
4@@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r
5@@ this software and associated documentation files (the "Software"), to deal in\r
6@@ the Software without restriction, including without limitation the rights to\r
7@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r
8@@ of the Software, and to permit persons to whom the Software is furnished to do\r
9@@ so, subject to the following conditions:\r
10@@\r
11@@ The above copyright notice and this permission notice shall be included in all\r
12@@ copies or substantial portions of the Software.\r
13@@\r
14@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
15@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
16@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
17@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
18@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
19@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
20@@ SOFTWARE.\r
21@@\r
22\r
23\r
24@ S T U --\ E1 E2\r
25@ V C W --/ E3 E4\r
26@ X Y Z\r
27\r
28@ q0 = S1sl < S >\r
29@ q1 = S2sl < V >\r
30@ q2 = S3sl < X >\r
31@ q3 = S1sr < U >\r
32@ q4 = S2sr < W >\r
33@ q5 = S3sr < Z >\r
34@ q6 = E3\r
35@ q7 = E4\r
36@ q8 = S1\r
37@ q9 = S2\r
38@ q10 = S3\r
39@ q11 = S1prev < T >\r
40@ q12 = S2prev < C >\r
41@ q13 = S3prev < Y >\r
42@ q14 = E1\r
43@ q15 = E2\r
44\r
45\r
@ __neon_eagle2x_8_8_line: Eagle2x-scale one row of 8-bit pixels.\r
@ For every source pixel C the macro emits E1,E2 (interleaved by vst2.8) to\r
@ dst1 and E3,E4 to dst2, using the neighbour diagram and register map given\r
@ at the top of this file.\r
@   src1, src2, src3 - core registers addressing the rows above / at / below C\r
@   counter          - pixel count for the row; modified by the macro\r
@   dst1, dst2       - core registers addressing the two output rows\r
@   reg1             - scratch core register\r
@   qT, qY           - q registers used as the T and Y rows. When qT is not q11\r
@                      (or qY is not q13) that row is never loaded; the\r
@                      .ifeqs / .ifnes blocks copy the current-row shifted\r
@                      registers into its place instead.\r
@   alsrc*, aldst*   - address operands used by the post-incrementing block\r
@                      loads and stores in the aligned part.\r
@ Writes q0-q7, q9, q12, q14, q15 (plus q8/q11 and q10/q13 when those rows are\r
@ loaded), reg1 and the condition flags; uses numeric local labels 1 and 2.\r
@ NOTE(review): the loop at 2: is entered without a check and is always\r
@ followed by the final 16-pixel section, so at least 32 pixels are processed\r
@ after the unaligned prefix - presumably callers guarantee a wide enough row;\r
@ confirm against the callers.\r
46.macro __neon_eagle2x_8_8_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
47\r
@ Seed lane 15 of each "prev" register with the first pixel of its row, so the\r
@ left neighbour of pixel 0 is pixel 0 itself.\r
48 .ifeqs "\qT", "q11"\r
49 vld1.8 {d23[7]}, [\src1] @ S1prev[15] = src[-srcstride]\r
50 .endif\r
51 vld1.8 {d25[7]}, [\src2] @ S2prev[15] = src[0]\r
52 .ifeqs "\qY", "q13"\r
53 vld1.8 {d27[7]}, [\src3] @ S3prev[15] = src[srcstride]\r
54 .endif\r
@ andS sets the Z flag consumed by the beq below; the add instructions in\r
@ between do not update the flags.\r
55 andS \reg1, \counter, #15 @ reg1 = counter & 15\r
56\r
@ A row that is not loaded still has its pointer advanced by the whole counter.\r
57 .ifnes "\qT", "q11"\r
58 add \src1, \src1, \counter @ src1 += counter\r
59 .endif\r
60 .ifnes "\qY", "q13"\r
61 add \src3, \src3, \counter @ src3 += counter\r
62 .endif\r
63 beq 1f\r
64\r
65 @ first 1-15 pixels - align counter to 16 bytes\r
66\r
67@ q0 = S1sl < S >\r
68@ q2 = S3sl < X >\r
69@ q7 = tmp2\r
70@ q15 = tmp1\r
71\r
@ A full 16 bytes are loaded and processed here, but the pointers only advance\r
@ by counter & 15.\r
72 .ifeqs "\qT", "q11"\r
73 vld1.8 {q8}, [\src1], \reg1 @ S1 = [src - srcstride]; src1 += counter & 15\r
74 .endif\r
75\r
76 vld1.8 {q9}, [\src2], \reg1 @ S2 = [src ]; src2 += counter & 15\r
77\r
78 .ifeqs "\qY", "q13"\r
79 vld1.8 {q10}, [\src3], \reg1 @ S3 = [src + srcstride]; src3 += counter & 15\r
80 .endif\r
81 .ifeqs "\qT", "q11"\r
82 vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r
83\r
84 vmov \qT, q8 @ S1prev = S1 < T >\r
85 .endif\r
86 vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r
87\r
88 vmov q12, q9 @ S2prev = S2 < C >\r
89 .ifeqs "\qY", "q13"\r
90 vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r
91\r
92 vmov \qY, q10 @ S3prev = S3 < Y >\r
93 .endif\r
94 .ifeqs "\qT", "q11"\r
95 vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | ... < U >\r
96 .endif\r
97\r
98 vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | ... < W >\r
99\r
100 .ifeqs "\qY", "q13"\r
101 vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | ... < Z >\r
102 .else\r
103 vmov q2, q1 @ S3sl = S2sl < X >\r
104\r
105 vmov q5, q4 @ S3sr = S2sr < Z >\r
106 .endif\r
107\r
108 .ifnes "\qT", "q11"\r
109 vmov q0, q1 @ S1sl = S2sl < S >\r
110\r
111 vmov q3, q4 @ S1sr = S2sr < U >\r
112 .endif\r
113\r
114 vceq.i8 q14, q0, \qT @ E1 = < S == T >\r
115\r
116 vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r
117\r
118 vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r
119\r
120 vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r
121\r
122 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
123\r
124@ q0 = tmp3\r
125@ q15 = E2\r
126\r
127 vceq.i8 q15, q3, \qT @ E2 = < U == T >\r
128\r
129 vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r
130\r
131 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
132\r
133@ q2 = tmp4\r
134@ q7 = E4\r
135 vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r
136\r
137 vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r
138\r
139 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
140\r
141 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
142\r
143 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
144\r
145 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
146\r
147 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
148\r
@ Re-seed lane 15 of the prev registers from (pointer - 1), i.e. the last\r
@ pixel of the prefix, so the aligned part sees the correct left neighbour.\r
149 .ifeqs "\qT", "q11"\r
150 sub \reg1, \src1, #1\r
151 .else\r
152 sub \reg1, \src2, #1\r
153 .endif\r
154\r
155 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
156 .ifeqs "\qT", "q11"\r
157 vld1.8 {d23[7]}, [\reg1] @ S1prev[15] = src[counter & 15 - 1 - srcstride]\r
158\r
159 sub \reg1, \src2, #1\r
160 .endif\r
161\r
162 vld1.8 {d25[7]}, [\reg1] @ S2prev[15] = src[counter & 15 - 1]\r
163\r
164 .ifeqs "\qY", "q13"\r
165 sub \reg1, \src3, #1\r
166\r
167 vld1.8 {d27[7]}, [\reg1] @ S3prev[15] = src[counter & 15 - 1 + srcstride]\r
168 .endif\r
169\r
@ reg1 was reused as a pointer above, so counter & 15 is recomputed; doubled it\r
@ is the destination advance (two output bytes per source pixel).\r
170 ubfx \reg1, \counter, #0, #4 @ reg1 = counter & 15\r
171\r
172 lsl \reg1, #1\r
173\r
174 vst2.8 {q14-q15}, [\dst1],\reg1 @ [dst] = E1,E2; dst1 += reg1\r
175\r
176 bic \counter, \counter, #15\r
177\r
178 vst2.8 {q6-q7}, [\dst2], \reg1 @ [dst + dststride] = E3,E4; dst2 += reg1\r
179\r
180 @ counter is aligned to 16 bytes\r
181\r
182 1:\r
183 .ifeqs "\qT", "q11"\r
184 vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16\r
185 .endif\r
186 vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16\r
187 .ifeqs "\qY", "q13"\r
188 vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16\r
189 .endif\r
190\r
191 @ inner loop (16 pixels per iteration)\r
192 2:\r
193\r
194@ q0 = S1sl < S >\r
195@ q2 = S3sl < X >\r
196@ q7 = tmp2\r
197@ q15 = tmp1\r
198\r
199 .ifeqs "\qT", "q11"\r
200 vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r
201 vmov \qT, q8 @ S1prev = S1 < T >\r
202 .endif\r
203\r
204 vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r
205 vmov q12, q9 @ S2prev = S2 < C >\r
206\r
207 .ifeqs "\qY", "q13"\r
208 vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r
209 vmov \qY, q10 @ S3prev = S3 < Y >\r
210 .endif\r
211\r
@ The next block is loaded before the current one is compared: its lane 0\r
@ supplies the right neighbour of the current block's last pixel.\r
212 .ifeqs "\qT", "q11"\r
213 vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16\r
214 vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < U >\r
215 .endif\r
216\r
217 vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16\r
218 vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < W >\r
219\r
220 .ifeqs "\qY", "q13"\r
221 vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16\r
222 vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < Z >\r
223 .else\r
224 vmov q2, q1 @ S3sl = S2sl < X >\r
225\r
226 vmov q5, q4 @ S3sr = S2sr < Z >\r
227 .endif\r
228\r
229 .ifnes "\qT", "q11"\r
230 vmov q0, q1 @ S1sl = S2sl < S >\r
231\r
232 vmov q3, q4 @ S1sr = S2sr < U >\r
233 .endif\r
234\r
235 sub \counter, \counter, #16 @ counter -= 16\r
236 vceq.i8 q14, q0, \qT @ E1 = < S == T >\r
237\r
238 vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r
239\r
240 vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r
241\r
242 vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r
243\r
244 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
245\r
246@ q0 = tmp3\r
247@ q15 = E2\r
248\r
249 vceq.i8 q15, q3, \qT @ E2 = < U == T >\r
250\r
251 vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r
252\r
253 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
254\r
255@ q2 = tmp4\r
256@ q7 = E4\r
257 vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r
258\r
259 vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r
260\r
261 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
262\r
263 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
264\r
265 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
266\r
267 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
268\r
269 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
270\r
271 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
272 vst2.8 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*16\r
273\r
@ Loop while more than 16 pixels remain; the final 16 are handled below\r
@ without loading a further block.\r
274 cmp \counter, #16\r
275\r
276 vst2.8 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*16\r
277 bhi 2b\r
278\r
279 @ last 16 pixels\r
280\r
281@ q0 = S1sl < S >\r
282@ q2 = S3sl < X >\r
283@ q7 = tmp2\r
284@ q15 = tmp1\r
285\r
286 .ifeqs "\qT", "q11"\r
287 vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r
288 vmov \qT, q8 @ S1prev = S1 < T >\r
289 .endif\r
290\r
291 vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r
292 vmov q12, q9 @ S2prev = S2 < C >\r
293\r
294 .ifeqs "\qY", "q13"\r
295 vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r
296 vmov \qY, q10 @ S3prev = S3 < Y >\r
297 .endif\r
298\r
@ No further block is loaded here: the top byte of each row register is\r
@ shifted down into lane 0, so the right neighbour of the final pixel is that\r
@ pixel itself.\r
299 .ifeqs "\qT", "q11"\r
300 vshr.u64 d16, d17, #(64-8) @ S1[0] = S1[15] | ...\r
301 .endif\r
302\r
303 vshr.u64 d18, d19, #(64-8) @ S2[0] = S2[15] | ...\r
304\r
305 .ifeqs "\qY", "q13"\r
306 vshr.u64 d20, d21, #(64-8) @ S3[0] = S3[15] | ...\r
307 .endif\r
308 .ifeqs "\qT", "q11"\r
309 vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < U >\r
310 .endif\r
311\r
312 vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < W >\r
313\r
314 .ifeqs "\qY", "q13"\r
315 vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < Z >\r
316 .else\r
317 vmov q2, q1 @ S3sl = S2sl < X >\r
318\r
319 vmov q5, q4 @ S3sr = S2sr < Z >\r
320 .endif\r
321\r
322 .ifnes "\qT", "q11"\r
323 vmov q0, q1 @ S1sl = S2sl < S >\r
324\r
325 vmov q3, q4 @ S1sr = S2sr < U >\r
326 .endif\r
327\r
328 vceq.i8 q14, q0, \qT @ E1 = < S == T >\r
329\r
330 vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r
331\r
332 vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r
333\r
334 vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r
335\r
336 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
337\r
338@ q0 = tmp3\r
339@ q15 = E2\r
340\r
341 vceq.i8 q15, q3, \qT @ E2 = < U == T >\r
342\r
343 vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r
344\r
345 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
346\r
347@ q2 = tmp4\r
348@ q7 = E4\r
349 vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r
350\r
351 vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r
352\r
353 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
354\r
355 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
356\r
357 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
358\r
359 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
360\r
361 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
362\r
363 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
364 vst2.8 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*16\r
365\r
366 vst2.8 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*16\r
367\r
368.endm\r
369\r
@ 8-bit row variant with qT = q12 and qY = q13: the row above is not loaded and\r
@ the current row (q12) stands in for T. Presumably used for the top image row.\r
370.macro _neon_eagle2x_8_8_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
371 __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
372.endm\r
373\r
@ 8-bit row variant with qT = q11 and qY = q13: all three source rows are\r
@ loaded (rows above, at and below the current one).\r
374.macro _neon_eagle2x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
375 __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
376.endm\r
377\r
@ 8-bit row variant with qT = q11 and qY = q12: the row below is not loaded and\r
@ the current row (q12) stands in for Y. Presumably used for the bottom image row.\r
378.macro _neon_eagle2x_8_8_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
379 __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
380.endm\r
381\r
@ neon_eagle2x_8_8_line: entry macro for one row of 8-bit pixels.\r
@   part        - first, middle or last; pasted onto the helper macro name\r
@   src1..reg1  - forwarded unchanged to the selected helper\r
@   srcalign16  - zero: plain source addresses; non-zero: ":128" is appended to\r
@                 the source address operands used in the aligned part\r
@   dstalign32  - zero: plain destination addresses; non-zero: ":256" is\r
@                 appended to the destination address operands\r
@ (.ifeq assembles its branch when the expression is zero.)\r
382.macro neon_eagle2x_8_8_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32\r
383 .ifeq \srcalign16\r
384\r
385 .ifeq \dstalign32\r
386 _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2\r
387 .else\r
388 _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256\r
389 .endif\r
390\r
391 .else\r
392\r
393 .ifeq \dstalign32\r
394 _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2\r
395 .else\r
396 _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256\r
397 .endif\r
398\r
399 .endif\r
400.endm\r
401\r
402\r
@ __neon_eagle2x_16_16_line: Eagle2x-scale one row of 16-bit pixels.\r
@ For every source pixel C the macro emits E1,E2 (interleaved by vst2.16) to\r
@ dst1 and E3,E4 to dst2, using the neighbour diagram and register map given\r
@ at the top of this file. Eight pixels are handled per block.\r
@   src1, src2, src3 - core registers addressing the rows above / at / below C\r
@   counter          - pixel count for the row; modified by the macro\r
@   dst1, dst2       - core registers addressing the two output rows\r
@   reg1             - scratch core register\r
@   qT, qY           - q registers used as the T and Y rows. When qT is not q11\r
@                      (or qY is not q13) that row is never loaded; the\r
@                      .ifeqs / .ifnes blocks copy the current-row shifted\r
@                      registers into its place instead.\r
@   alsrc*, aldst*   - address operands used by the post-incrementing block\r
@                      loads and stores in the aligned part.\r
@ When DO_BGR_TO_RGB is defined, each result pair is passed through the\r
@ bgr1555_to_rgb565 macro before it is stored. That macro is not defined in\r
@ this file; presumably it converts its first two registers in place and uses\r
@ the last three as scratch - confirm against its definition.\r
@ Writes q0-q7, q9, q12, q14, q15 (plus q8/q11 and q10/q13 when those rows are\r
@ loaded), reg1 and the condition flags; uses numeric local labels 1 and 2.\r
@ NOTE(review): the loop at 2: is entered without a check and is always\r
@ followed by the final 8-pixel section, so at least 16 pixels are processed\r
@ after the unaligned prefix - presumably callers guarantee a wide enough row;\r
@ confirm against the callers.\r
403.macro __neon_eagle2x_16_16_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
404\r
@ Seed lane 7 of each "prev" register with the first pixel of its row, so the\r
@ left neighbour of pixel 0 is pixel 0 itself.\r
405 .ifeqs "\qT", "q11"\r
406 vld1.16 {d23[3]}, [\src1] @ S1prev[7] = src[-srcstride]\r
407 .endif\r
408 vld1.16 {d25[3]}, [\src2] @ S2prev[7] = src[0]\r
409 .ifeqs "\qY", "q13"\r
410 vld1.16 {d27[3]}, [\src3] @ S3prev[7] = src[srcstride]\r
411 .endif\r
@ andS sets the Z flag consumed by the beq below; the add instructions in\r
@ between do not update the flags.\r
412 andS \reg1, \counter, #7 @ reg1 = counter & 7\r
413\r
@ A row that is not loaded still has its pointer advanced by the whole row.\r
414 .ifnes "\qT", "q11"\r
415 add \src1, \src1, \counter, lsl #1 @ src1 += 2 * counter\r
416 .endif\r
417 .ifnes "\qY", "q13"\r
418 add \src3, \src3, \counter, lsl #1 @ src3 += 2 * counter\r
419 .endif\r
420 beq 1f\r
421\r
422 @ first 1-7 pixels - align counter to 16 bytes\r
423\r
424@ q0 = S1sl < S >\r
425@ q2 = S3sl < X >\r
426@ q7 = tmp2\r
427@ q15 = tmp1\r
428\r
@ A full 8 pixels are loaded and processed here, but the pointers only advance\r
@ by counter & 7 pixels.\r
429 .ifeqs "\qT", "q11"\r
430 vld1.16 {q8}, [\src1] @ S1 = [src - srcstride]\r
431 add \src1, \src1, \reg1, lsl #1 @ src1 += 2 * (counter & 7)\r
432 .endif\r
433\r
434 vld1.16 {q9}, [\src2] @ S2 = [src ]\r
435 add \src2, \src2, \reg1, lsl #1 @ src2 += 2 * (counter & 7)\r
436\r
437 .ifeqs "\qY", "q13"\r
438 vld1.16 {q10}, [\src3] @ S3 = [src + srcstride]\r
439 add \src3, \src3, \reg1, lsl #1 @ src3 += 2 * (counter & 7)\r
440 .endif\r
441 .ifeqs "\qT", "q11"\r
442 vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r
443\r
444 vmov \qT, q8 @ S1prev = S1 < T >\r
445 .endif\r
446 vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r
447\r
448 vmov q12, q9 @ S2prev = S2 < C >\r
449 .ifeqs "\qY", "q13"\r
450 vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r
451\r
452 vmov \qY, q10 @ S3prev = S3 < Y >\r
453 .endif\r
454 .ifeqs "\qT", "q11"\r
455 vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | ... < U >\r
456 .endif\r
457\r
458 vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | ... < W >\r
459\r
460 .ifeqs "\qY", "q13"\r
461 vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | ... < Z >\r
462 .else\r
463 vmov q2, q1 @ S3sl = S2sl < X >\r
464\r
465 vmov q5, q4 @ S3sr = S2sr < Z >\r
466 .endif\r
467\r
468 .ifnes "\qT", "q11"\r
469 vmov q0, q1 @ S1sl = S2sl < S >\r
470\r
471 vmov q3, q4 @ S1sr = S2sr < U >\r
472 .endif\r
473\r
474 vceq.i16 q14, q0, \qT @ E1 = < S == T >\r
475\r
476 vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r
477\r
478 vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r
479\r
480 vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r
481\r
482 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
483\r
484@ q0 = tmp3\r
485@ q15 = E2\r
486\r
487 vceq.i16 q15, q3, \qT @ E2 = < U == T >\r
488\r
489 vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r
490\r
491 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
492\r
493@ q2 = tmp4\r
494@ q7 = E4\r
495 vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r
496\r
497 vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r
498\r
499 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
500\r
501 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
502\r
503 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
504\r
505 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
506\r
507 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
508\r
@ Re-seed lane 7 of the prev registers from (pointer - 2 bytes), i.e. the last\r
@ pixel of the prefix, so the aligned part sees the correct left neighbour.\r
509 .ifeqs "\qT", "q11"\r
510 sub \reg1, \src1, #2\r
511 .else\r
512 sub \reg1, \src2, #2\r
513 .endif\r
514\r
515 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
516 .ifeqs "\qT", "q11"\r
517 vld1.16 {d23[3]}, [\reg1] @ S1prev[7] = src[2 * (counter & 7) - 2 - srcstride]\r
518\r
519 sub \reg1, \src2, #2\r
520 .endif\r
521\r
522 vld1.16 {d25[3]}, [\reg1] @ S2prev[7] = src[2 * (counter & 7) - 2]\r
523\r
524 .ifeqs "\qY", "q13"\r
525 sub \reg1, \src3, #2\r
526\r
527 vld1.16 {d27[3]}, [\reg1] @ S3prev[7] = src[2 * (counter & 7) - 2 + srcstride]\r
528 .endif\r
529\r
@ q8-q10 are passed as the trailing arguments here: the row data in them has\r
@ already been copied to the prev registers and they are reloaded at 1: below.\r
6ce097ba 530 #ifdef DO_BGR_TO_RGB\r
531 bgr1555_to_rgb565 q14, q15, q8, q9, q10\r
532 bgr1555_to_rgb565 q6, q7, q8, q9, q10\r
533 #endif\r
534\r
7fc3ac8a
H
@ reg1 was reused as a pointer above, so counter & 7 is recomputed; times four\r
@ it is the destination advance in bytes (two 16-bit output pixels per source pixel).\r
535 ubfx \reg1, \counter, #0, #3 @ reg1 = counter & 7\r
536\r
537 lsl \reg1, #2\r
538\r
539 vst2.16 {q14-q15}, [\dst1], \reg1 @ [dst] = E1,E2; dst1 += reg1\r
540\r
541 bic \counter, \counter, #7\r
542\r
543 vst2.16 {q6-q7}, [\dst2], \reg1 @ [dst + dststride] = E3,E4; dst2 += reg1\r
544\r
545 @ counter is aligned to 16 bytes\r
546\r
547 1:\r
548 .ifeqs "\qT", "q11"\r
549 vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8\r
550 .endif\r
551 vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8\r
552 .ifeqs "\qY", "q13"\r
553 vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8\r
554 .endif\r
555\r
556 @ inner loop (8 pixels per iteration)\r
557 2:\r
558\r
559@ q0 = S1sl < S >\r
560@ q2 = S3sl < X >\r
561@ q7 = tmp2\r
562@ q15 = tmp1\r
563\r
564 .ifeqs "\qT", "q11"\r
565 vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r
566 vmov \qT, q8 @ S1prev = S1 < T >\r
567 .endif\r
568\r
569 vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r
570 vmov q12, q9 @ S2prev = S2 < C >\r
571\r
572 .ifeqs "\qY", "q13"\r
573 vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r
574 vmov \qY, q10 @ S3prev = S3 < Y >\r
575 .endif\r
576\r
@ The next block is loaded before the current one is compared: its lane 0\r
@ supplies the right neighbour of the current block's last pixel.\r
577 .ifeqs "\qT", "q11"\r
578 vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8\r
579 vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < U >\r
580 .endif\r
581\r
582 vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8\r
583 vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < W >\r
584\r
585 .ifeqs "\qY", "q13"\r
586 vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8\r
587 vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < Z >\r
588 .else\r
589 vmov q2, q1 @ S3sl = S2sl < X >\r
590\r
591 vmov q5, q4 @ S3sr = S2sr < Z >\r
592 .endif\r
593\r
594 .ifnes "\qT", "q11"\r
595 vmov q0, q1 @ S1sl = S2sl < S >\r
596\r
597 vmov q3, q4 @ S1sr = S2sr < U >\r
598 .endif\r
599\r
600 sub \counter, \counter, #8 @ counter -= 8\r
601 vceq.i16 q14, q0, \qT @ E1 = < S == T >\r
602\r
603 vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r
604\r
605 vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r
606\r
607 vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r
608\r
609 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
610\r
611@ q0 = tmp3\r
612@ q15 = E2\r
613\r
614 vceq.i16 q15, q3, \qT @ E2 = < U == T >\r
615\r
616 vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r
617\r
618 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
619\r
620@ q2 = tmp4\r
621@ q7 = E4\r
622 vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r
623\r
624 vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r
625\r
626 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
627\r
628 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
629\r
630 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
631\r
632 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
633\r
634 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
635\r
636 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
6ce097ba 637\r
@ Inside the loop q8-q10 already hold the next block, so q0-q2 (only used as\r
@ temporaries by this point) are passed as the trailing arguments instead.\r
638 #ifdef DO_BGR_TO_RGB\r
639 bgr1555_to_rgb565 q14, q15, q0, q1, q2\r
640 bgr1555_to_rgb565 q6, q7, q0, q1, q2\r
641 #endif\r
642\r
7fc3ac8a
H
643 vst2.16 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*2*8\r
644\r
@ Loop while more than 8 pixels remain; the final 8 are handled below without\r
@ loading a further block.\r
645 cmp \counter, #8\r
646\r
647 vst2.16 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*2*8\r
648 bhi 2b\r
649\r
650 @ last 8 pixels\r
651\r
652@ q0 = S1sl < S >\r
653@ q2 = S3sl < X >\r
654@ q7 = tmp2\r
655@ q15 = tmp1\r
656\r
657 .ifeqs "\qT", "q11"\r
658 vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r
659 vmov \qT, q8 @ S1prev = S1 < T >\r
660 .endif\r
661\r
662 vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r
663 vmov q12, q9 @ S2prev = S2 < C >\r
664\r
665 .ifeqs "\qY", "q13"\r
666 vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r
667 vmov \qY, q10 @ S3prev = S3 < Y >\r
668 .endif\r
669\r
@ No further block is loaded here: the top halfword of each row register is\r
@ shifted down into lane 0, so the right neighbour of the final pixel is that\r
@ pixel itself.\r
670 .ifeqs "\qT", "q11"\r
671 vshr.u64 d16, d17, #(64-16) @ S1[0] = S1[7] | ...\r
672 .endif\r
673\r
674 vshr.u64 d18, d19, #(64-16) @ S2[0] = S2[7] | ...\r
675\r
676 .ifeqs "\qY", "q13"\r
677 vshr.u64 d20, d21, #(64-16) @ S3[0] = S3[7] | ...\r
678 .endif\r
679 .ifeqs "\qT", "q11"\r
680 vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < U >\r
681 .endif\r
682\r
683 vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < W >\r
684\r
685 .ifeqs "\qY", "q13"\r
686 vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < Z >\r
687 .else\r
688 vmov q2, q1 @ S3sl = S2sl < X >\r
689\r
690 vmov q5, q4 @ S3sr = S2sr < Z >\r
691 .endif\r
692\r
693 .ifnes "\qT", "q11"\r
694 vmov q0, q1 @ S1sl = S2sl < S >\r
695\r
696 vmov q3, q4 @ S1sr = S2sr < U >\r
697 .endif\r
698\r
699 vceq.i16 q14, q0, \qT @ E1 = < S == T >\r
700\r
701 vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r
702\r
703 vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r
704\r
705 vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r
706\r
707 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
708\r
709@ q0 = tmp3\r
710@ q15 = E2\r
711\r
712 vceq.i16 q15, q3, \qT @ E2 = < U == T >\r
713\r
714 vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r
715\r
716 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
717\r
718@ q2 = tmp4\r
719@ q7 = E4\r
720 vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r
721\r
722 vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r
723\r
724 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
725\r
726 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
727\r
728 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
729\r
730 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
731\r
732 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
733\r
734 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
6ce097ba 735\r
@ q8-q10 are not read again after this point, so they are passed as the\r
@ trailing arguments here.\r
736 #ifdef DO_BGR_TO_RGB\r
737 bgr1555_to_rgb565 q14, q15, q8, q9, q10\r
738 bgr1555_to_rgb565 q6, q7, q8, q9, q10\r
739 #endif\r
740\r
7fc3ac8a
H
741 vst2.16 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*2*8\r
742\r
743 vst2.16 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*2*8\r
744\r
745.endm\r
746\r
@ 16-bit row variant with qT = q12 and qY = q13: the row above is not loaded and\r
@ the current row (q12) stands in for T. Presumably used for the top image row.\r
747.macro _neon_eagle2x_16_16_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
748 __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
749.endm\r
750\r
@ 16-bit row variant with qT = q11 and qY = q13: all three source rows are\r
@ loaded (rows above, at and below the current one).\r
751.macro _neon_eagle2x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
752 __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
753.endm\r
754\r
@ 16-bit row variant with qT = q11 and qY = q12: the row below is not loaded and\r
@ the current row (q12) stands in for Y. Presumably used for the bottom image row.\r
755.macro _neon_eagle2x_16_16_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
756 __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
757.endm\r
758\r
@ neon_eagle2x_16_16_line: entry macro for one row of 16-bit pixels.\r
@   part        - first, middle or last; pasted onto the helper macro name\r
@   src1..reg1  - forwarded unchanged to the selected helper\r
@   srcalign16  - zero: plain source addresses; non-zero: an alignment\r
@                 qualifier is appended to the source address operands\r
@   dstalign32  - zero: plain destination addresses; non-zero: an alignment\r
@                 qualifier is appended to the destination address operands\r
@ (.ifeq assembles its branch when the expression is zero.)\r
@ A128 / A256 are not defined in this file; presumably they are preprocessor\r
@ defines supplied by the including source that expand to the 128-bit / 256-bit\r
@ alignment qualifiers - confirm.\r
@ NOTE(review): the srcalign16 == 0, dstalign32 != 0 branch still spells the\r
@ qualifier as ":256" while the other branches use A128 / A256 - check whether\r
@ that difference is intentional.\r
759.macro neon_eagle2x_16_16_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32\r
760 .ifeq \srcalign16\r
761\r
762 .ifeq \dstalign32\r
763 _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2\r
764 .else\r
765 _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256\r
766 .endif\r
767\r
768 .else\r
769\r
770 .ifeq \dstalign32\r
67381db0 771 _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1 A128, \src2 A128, \src3 A128, \dst1, \dst2\r
7fc3ac8a 772 .else\r
67381db0 773 _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1 A128, \src2 A128, \src3 A128, \dst1 A256, \dst2 A256\r
7fc3ac8a
H
774 .endif\r
775\r
776 .endif\r
777.endm\r
778\r