add M-HT's neon scalers
[fceu.git] / drivers / arm / neon_eagle2x.Sinc
CommitLineData
7127faf3 1@@\r
2@@ Copyright (C) 2012 Roman Pauer\r
3@@\r
4@@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r
5@@ this software and associated documentation files (the "Software"), to deal in\r
6@@ the Software without restriction, including without limitation the rights to\r
7@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r
8@@ of the Software, and to permit persons to whom the Software is furnished to do\r
9@@ so, subject to the following conditions:\r
10@@\r
11@@ The above copyright notice and this permission notice shall be included in all\r
12@@ copies or substantial portions of the Software.\r
13@@\r
14@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
15@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
16@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
17@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
18@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
19@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
20@@ SOFTWARE.\r
21@@\r
22\r
23\r
24@ S T U --\ E1 E2\r
25@ V C W --/ E3 E4\r
26@ X Y Z\r
27\r
28@ q0 = S1sl < S >\r
29@ q1 = S2sl < V >\r
30@ q2 = S3sl < X >\r
31@ q3 = S1sr < U >\r
32@ q4 = S2sr < W >\r
33@ q5 = S3sr < Z >\r
34@ q6 = E3\r
35@ q7 = E4\r
36@ q8 = S1\r
37@ q9 = S2\r
38@ q10 = S3\r
39@ q11 = S1prev < T >\r
40@ q12 = S2prev < C >\r
41@ q13 = S3prev < Y >\r
42@ q14 = E1\r
43@ q15 = E2\r
44\r
45\r
@ __neon_eagle2x_8_8_line: emits the code that scales one row of 8-bit pixels
@ into two output rows of twice the width, applying the E1..E4 selection
@ rules written next to the vbsl instructions below.
@
@ Arguments:
@   src1, src2, src3   - core registers addressing the row above, the current
@                        row and the row below (see the load comments)
@   counter            - core register holding the pixel count; modified
@   dst1, dst2         - core registers addressing the two output rows
@   reg1               - scratch core register
@   qT, qY             - NEON registers used as T (above) and Y (below).
@                        q11 / q13 select a real neighbouring row; any other
@                        name (the wrappers in this file pass q12, i.e. C)
@                        makes that row alias the current one: its loads are
@                        skipped and the matching src pointer is only
@                        advanced by counter
@   alsrc1..alsrc3,
@   aldst1, aldst2     - address operands used with writeback by the
@                        main-loop and tail loads/stores (the dispatcher in
@                        this file passes the same registers, optionally
@                        with an alignment qualifier)
@
@ Writes q0-q7, q9, q12, q14, q15, plus q8/q11 when qT is q11 and q10/q13 when
@ qY is q13; also writes reg1 and the condition flags.
@ NOTE(review): after the optional head block, the main-loop body and the
@ tail block each run once unconditionally, so at least 32 pixels are
@ processed from that point - presumably callers guarantee a minimum width;
@ confirm at the call sites.
46.macro __neon_eagle2x_8_8_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
47\r
48 .ifeqs "\qT", "q11"\r
49 vld1.8 {d23[7]}, [\src1] @ S1prev[15] = src[-srcstride]\r
50 .endif\r
51 vld1.8 {d25[7]}, [\src2] @ S2prev[15] = src[0]\r
52 .ifeqs "\qY", "q13"\r
53 vld1.8 {d27[7]}, [\src3] @ S3prev[15] = src[srcstride]\r
54 .endif\r
55 andS \reg1, \counter, #15 @ reg1 = counter & 15\r
56\r
57 .ifnes "\qT", "q11"\r
58 add \src1, \src1, \counter @ src1 += counter\r
59 .endif\r
60 .ifnes "\qY", "q13"\r
61 add \src3, \src3, \counter @ src3 += counter\r
62 .endif\r
@ the flags tested here were set by the andS above (the add instructions in
@ between do not update them): skip the head block when counter & 15 == 0
63 beq 1f\r
64\r
65 @ first 1-15 pixels - align counter to 16 bytes\r
66\r
67@ q0 = S1sl < S >\r
68@ q2 = S3sl < X >\r
69@ q7 = tmp2\r
70@ q15 = tmp1\r
71\r
72 .ifeqs "\qT", "q11"\r
73 vld1.8 {q8}, [\src1], \reg1 @ S1 = [src - srcstride]; src1 += counter & 15\r
74 .endif\r
75\r
76 vld1.8 {q9}, [\src2], \reg1 @ S2 = [src ]; src2 += counter & 15\r
77\r
78 .ifeqs "\qY", "q13"\r
79 vld1.8 {q10}, [\src3], \reg1 @ S3 = [src + srcstride]; src3 += counter & 15\r
80 .endif\r
81 .ifeqs "\qT", "q11"\r
82 vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r
83\r
84 vmov \qT, q8 @ S1prev = S1 < T >\r
85 .endif\r
86 vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r
87\r
88 vmov q12, q9 @ S2prev = S2 < C >\r
89 .ifeqs "\qY", "q13"\r
90 vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r
91\r
92 vmov \qY, q10 @ S3prev = S3 < Y >\r
93 .endif\r
94 .ifeqs "\qT", "q11"\r
95 vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | ... < U >\r
96 .endif\r
97\r
98 vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | ... < W >\r
99\r
100 .ifeqs "\qY", "q13"\r
101 vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | ... < Z >\r
102 .else\r
103 vmov q2, q1 @ S3sl = S2sl < X >\r
104\r
105 vmov q5, q4 @ S3sr = S2sr < Z >\r
106 .endif\r
107\r
108 .ifnes "\qT", "q11"\r
109 vmov q0, q1 @ S1sl = S2sl < S >\r
110\r
111 vmov q3, q4 @ S1sr = S2sr < U >\r
112 .endif\r
113\r
114 vceq.i8 q14, q0, \qT @ E1 = < S == T >\r
115\r
116 vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r
117\r
118 vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r
119\r
120 vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r
121\r
122 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
123\r
124@ q0 = tmp3\r
125@ q15 = E2\r
126\r
127 vceq.i8 q15, q3, \qT @ E2 = < U == T >\r
128\r
129 vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r
130\r
131 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
132\r
133@ q2 = tmp4\r
134@ q7 = E4\r
135 vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r
136\r
137 vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r
138\r
139 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
140\r
141 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
142\r
143 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
144\r
145 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
146\r
147 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
148\r
@ reload lane 15 of the "prev" registers from the pixel just before the
@ advanced source pointers, ready for the first block of the main loop
149 .ifeqs "\qT", "q11"\r
150 sub \reg1, \src1, #1\r
151 .else\r
152 sub \reg1, \src2, #1\r
153 .endif\r
154\r
155 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
156 .ifeqs "\qT", "q11"\r
157 vld1.8 {d23[7]}, [\reg1] @ S1prev[15] = src[counter & 15 - 1 - srcstride]\r
158\r
159 sub \reg1, \src2, #1\r
160 .endif\r
161\r
162 vld1.8 {d25[7]}, [\reg1] @ S2prev[15] = src[counter & 15 - 1]\r
163\r
164 .ifeqs "\qY", "q13"\r
165 sub \reg1, \src3, #1\r
166\r
167 vld1.8 {d27[7]}, [\reg1] @ S3prev[15] = src[counter & 15 - 1 + srcstride]\r
168 .endif\r
169\r
170 ubfx \reg1, \counter, #0, #4 @ reg1 = counter & 15\r
171\r
172 lsl \reg1, #1\r
173\r
@ each head store writes two full q registers (32 bytes) but the destination
@ pointer only advances by 2 * (counter & 15) bytes, so the stores that
@ follow overwrite the surplus output
174 vst2.8 {q14-q15}, [\dst1],\reg1 @ [dst] = E1,E2; dst1 += reg1\r
175\r
176 bic \counter, \counter, #15\r
177\r
178 vst2.8 {q6-q7}, [\dst2], \reg1 @ [dst + dststride] = E3,E4; dst2 += reg1\r
179\r
180 @ counter is aligned to 16 bytes\r
181\r
182 1:\r
183 .ifeqs "\qT", "q11"\r
184 vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16\r
185 .endif\r
186 vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16\r
187 .ifeqs "\qY", "q13"\r
188 vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16\r
189 .endif\r
190\r
191 @ inner loop (16 pixels per iteration)\r
@ each pass emits the block loaded on the previous pass and loads the next
@ block, whose first pixel supplies the right-hand neighbour of pixel 15
192 2:\r
193\r
194@ q0 = S1sl < S >\r
195@ q2 = S3sl < X >\r
196@ q7 = tmp2\r
197@ q15 = tmp1\r
198\r
199 .ifeqs "\qT", "q11"\r
200 vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r
201 vmov \qT, q8 @ S1prev = S1 < T >\r
202 .endif\r
203\r
204 vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r
205 vmov q12, q9 @ S2prev = S2 < C >\r
206\r
207 .ifeqs "\qY", "q13"\r
208 vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r
209 vmov \qY, q10 @ S3prev = S3 < Y >\r
210 .endif\r
211\r
212 .ifeqs "\qT", "q11"\r
213 vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16\r
214 vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < U >\r
215 .endif\r
216\r
217 vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16\r
218 vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < W >\r
219\r
220 .ifeqs "\qY", "q13"\r
221 vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16\r
222 vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < Z >\r
223 .else\r
224 vmov q2, q1 @ S3sl = S2sl < X >\r
225\r
226 vmov q5, q4 @ S3sr = S2sr < Z >\r
227 .endif\r
228\r
229 .ifnes "\qT", "q11"\r
230 vmov q0, q1 @ S1sl = S2sl < S >\r
231\r
232 vmov q3, q4 @ S1sr = S2sr < U >\r
233 .endif\r
234\r
235 sub \counter, \counter, #16 @ counter -= 16\r
236 vceq.i8 q14, q0, \qT @ E1 = < S == T >\r
237\r
238 vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r
239\r
240 vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r
241\r
242 vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r
243\r
244 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
245\r
246@ q0 = tmp3\r
247@ q15 = E2\r
248\r
249 vceq.i8 q15, q3, \qT @ E2 = < U == T >\r
250\r
251 vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r
252\r
253 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
254\r
255@ q2 = tmp4\r
256@ q7 = E4\r
257 vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r
258\r
259 vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r
260\r
261 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
262\r
263 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
264\r
265 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
266\r
267 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
268\r
269 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
270\r
271 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
272 vst2.8 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*16\r
273\r
@ loop again only while more than 16 pixels remain; the final 16 are left
@ for the tail block below
274 cmp \counter, #16\r
275\r
276 vst2.8 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*16\r
277 bhi 2b\r
278\r
279 @ last 16 pixels\r
280\r
281@ q0 = S1sl < S >\r
282@ q2 = S3sl < X >\r
283@ q7 = tmp2\r
284@ q15 = tmp1\r
285\r
286 .ifeqs "\qT", "q11"\r
287 vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r
288 vmov \qT, q8 @ S1prev = S1 < T >\r
289 .endif\r
290\r
291 vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r
292 vmov q12, q9 @ S2prev = S2 < C >\r
293\r
294 .ifeqs "\qY", "q13"\r
295 vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r
296 vmov \qY, q10 @ S3prev = S3 < Y >\r
297 .endif\r
298\r
@ no further block is loaded here: the last pixel of each row is moved into
@ element 0 so that the vext below uses it as the right-hand neighbour of
@ the final pixel
299 .ifeqs "\qT", "q11"\r
300 vshr.u64 d16, d17, #(64-8) @ S1[0] = S1[15] | ...\r
301 .endif\r
302\r
303 vshr.u64 d18, d19, #(64-8) @ S2[0] = S2[15] | ...\r
304\r
305 .ifeqs "\qY", "q13"\r
306 vshr.u64 d20, d21, #(64-8) @ S3[0] = S3[15] | ...\r
307 .endif\r
308 .ifeqs "\qT", "q11"\r
309 vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < U >\r
310 .endif\r
311\r
312 vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < W >\r
313\r
314 .ifeqs "\qY", "q13"\r
315 vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < Z >\r
316 .else\r
317 vmov q2, q1 @ S3sl = S2sl < X >\r
318\r
319 vmov q5, q4 @ S3sr = S2sr < Z >\r
320 .endif\r
321\r
322 .ifnes "\qT", "q11"\r
323 vmov q0, q1 @ S1sl = S2sl < S >\r
324\r
325 vmov q3, q4 @ S1sr = S2sr < U >\r
326 .endif\r
327\r
328 vceq.i8 q14, q0, \qT @ E1 = < S == T >\r
329\r
330 vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r
331\r
332 vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r
333\r
334 vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r
335\r
336 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
337\r
338@ q0 = tmp3\r
339@ q15 = E2\r
340\r
341 vceq.i8 q15, q3, \qT @ E2 = < U == T >\r
342\r
343 vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r
344\r
345 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
346\r
347@ q2 = tmp4\r
348@ q7 = E4\r
349 vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r
350\r
351 vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r
352\r
353 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
354\r
355 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
356\r
357 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
358\r
359 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
360\r
361 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
362\r
363 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
364 vst2.8 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*16\r
365\r
366 vst2.8 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*16\r
367\r
368.endm\r
369\r
@ "first" row variant of the 8-bit line macro: passes q12 (the current-row
@ register C) as qT, so the row above aliases the current row, and q13 as qY
@ so the row below is loaded from src3. All other arguments are forwarded.
370.macro _neon_eagle2x_8_8_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
371 __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
372.endm\r
373\r
@ "middle" row variant of the 8-bit line macro: passes q11 as qT and q13 as
@ qY, so both the row above (src1) and the row below (src3) are loaded.
@ All other arguments are forwarded unchanged.
374.macro _neon_eagle2x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
375 __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
376.endm\r
377\r
@ "last" row variant of the 8-bit line macro: passes q11 as qT so the row
@ above is loaded from src1, and q12 (the current-row register C) as qY, so
@ the row below aliases the current row. All other arguments are forwarded.
378.macro _neon_eagle2x_8_8_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
379 __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
380.endm\r
381\r
@ Dispatcher for the 8-bit line macros.
@   part        - suffix pasted onto _neon_eagle2x_8_8_line_ to pick the row
@                 variant (this file defines first, middle and last)
@   src1..src3, counter, dst1, dst2, reg1 - forwarded unchanged
@   srcalign16  - absolute expression; when zero the source address operands
@                 are the plain registers, otherwise they carry a :128
@                 alignment qualifier
@   dstalign32  - absolute expression; when zero the destination address
@                 operands are the plain registers, otherwise they carry a
@                 :256 alignment qualifier
382.macro neon_eagle2x_8_8_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32\r
383 .ifeq \srcalign16\r
384\r
385 .ifeq \dstalign32\r
386 _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2\r
387 .else\r
388 _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256\r
389 .endif\r
390\r
391 .else\r
392\r
393 .ifeq \dstalign32\r
394 _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2\r
395 .else\r
396 _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256\r
397 .endif\r
398\r
399 .endif\r
400.endm\r
401\r
402\r
@ __neon_eagle2x_16_16_line: emits the code that scales one row of 16-bit
@ pixels into two output rows of twice the width, applying the E1..E4
@ selection rules written next to the vbsl instructions below.
@
@ Arguments:
@   src1, src2, src3   - core registers addressing the row above, the current
@                        row and the row below (see the load comments)
@   counter            - core register holding the pixel count; modified
@   dst1, dst2         - core registers addressing the two output rows
@   reg1               - scratch core register
@   qT, qY             - NEON registers used as T (above) and Y (below).
@                        q11 / q13 select a real neighbouring row; any other
@                        name (the wrappers in this file pass q12, i.e. C)
@                        makes that row alias the current one: its loads are
@                        skipped and the matching src pointer is only
@                        advanced by 2 * counter bytes
@   alsrc1..alsrc3,
@   aldst1, aldst2     - address operands used with writeback by the
@                        main-loop and tail loads/stores (the dispatcher in
@                        this file passes the same registers, optionally
@                        with an alignment qualifier)
@
@ Writes q0-q7, q9, q12, q14, q15, plus q8/q11 when qT is q11 and q10/q13 when
@ qY is q13; also writes reg1 and the condition flags.
@ NOTE(review): after the optional head block, the main-loop body and the
@ tail block each run once unconditionally, so at least 16 pixels are
@ processed from that point - presumably callers guarantee a minimum width;
@ confirm at the call sites.
403.macro __neon_eagle2x_16_16_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
404\r
405 .ifeqs "\qT", "q11"\r
406 vld1.16 {d23[3]}, [\src1] @ S1prev[7] = src[-srcstride]\r
407 .endif\r
408 vld1.16 {d25[3]}, [\src2] @ S2prev[7] = src[0]\r
409 .ifeqs "\qY", "q13"\r
410 vld1.16 {d27[3]}, [\src3] @ S3prev[7] = src[srcstride]\r
411 .endif\r
412 andS \reg1, \counter, #7 @ reg1 = counter & 7\r
413\r
414 .ifnes "\qT", "q11"\r
415 add \src1, \src1, \counter, lsl #1 @ src1 += 2 * counter\r
416 .endif\r
417 .ifnes "\qY", "q13"\r
418 add \src3, \src3, \counter, lsl #1 @ src3 += 2 * counter\r
419 .endif\r
@ the flags tested here were set by the andS above (the add instructions in
@ between do not update them): skip the head block when counter & 7 == 0
420 beq 1f\r
421\r
422 @ first 1-7 pixels - align counter to 16 bytes\r
423\r
424@ q0 = S1sl < S >\r
425@ q2 = S3sl < X >\r
426@ q7 = tmp2\r
427@ q15 = tmp1\r
428\r
429 .ifeqs "\qT", "q11"\r
430 vld1.16 {q8}, [\src1] @ S1 = [src - srcstride]\r
431 add \src1, \src1, \reg1, lsl #1 @ src1 += 2 * (counter & 7)\r
432 .endif\r
433\r
434 vld1.16 {q9}, [\src2] @ S2 = [src ]\r
435 add \src2, \src2, \reg1, lsl #1 @ src2 += 2 * (counter & 7)\r
436\r
437 .ifeqs "\qY", "q13"\r
438 vld1.16 {q10}, [\src3] @ S3 = [src + srcstride]\r
439 add \src3, \src3, \reg1, lsl #1 @ src3 += 2 * (counter & 7)\r
440 .endif\r
441 .ifeqs "\qT", "q11"\r
442 vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r
443\r
444 vmov \qT, q8 @ S1prev = S1 < T >\r
445 .endif\r
446 vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r
447\r
448 vmov q12, q9 @ S2prev = S2 < C >\r
449 .ifeqs "\qY", "q13"\r
450 vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r
451\r
452 vmov \qY, q10 @ S3prev = S3 < Y >\r
453 .endif\r
454 .ifeqs "\qT", "q11"\r
455 vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | ... < U >\r
456 .endif\r
457\r
458 vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | ... < W >\r
459\r
460 .ifeqs "\qY", "q13"\r
461 vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | ... < Z >\r
462 .else\r
463 vmov q2, q1 @ S3sl = S2sl < X >\r
464\r
465 vmov q5, q4 @ S3sr = S2sr < Z >\r
466 .endif\r
467\r
468 .ifnes "\qT", "q11"\r
469 vmov q0, q1 @ S1sl = S2sl < S >\r
470\r
471 vmov q3, q4 @ S1sr = S2sr < U >\r
472 .endif\r
473\r
474 vceq.i16 q14, q0, \qT @ E1 = < S == T >\r
475\r
476 vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r
477\r
478 vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r
479\r
480 vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r
481\r
482 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
483\r
484@ q0 = tmp3\r
485@ q15 = E2\r
486\r
487 vceq.i16 q15, q3, \qT @ E2 = < U == T >\r
488\r
489 vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r
490\r
491 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
492\r
493@ q2 = tmp4\r
494@ q7 = E4\r
495 vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r
496\r
497 vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r
498\r
499 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
500\r
501 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
502\r
503 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
504\r
505 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
506\r
507 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
508\r
@ reload lane 7 of the "prev" registers from the pixel just before the
@ advanced source pointers, ready for the first block of the main loop
509 .ifeqs "\qT", "q11"\r
510 sub \reg1, \src1, #2\r
511 .else\r
512 sub \reg1, \src2, #2\r
513 .endif\r
514\r
515 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
516 .ifeqs "\qT", "q11"\r
517 vld1.16 {d23[3]}, [\reg1] @ S1prev[7] = src[2 * (counter & 7) - 2 - srcstride]\r
518\r
519 sub \reg1, \src2, #2\r
520 .endif\r
521\r
522 vld1.16 {d25[3]}, [\reg1] @ S2prev[7] = src[2 * (counter & 7) - 2]\r
523\r
524 .ifeqs "\qY", "q13"\r
525 sub \reg1, \src3, #2\r
526\r
527 vld1.16 {d27[3]}, [\reg1] @ S3prev[7] = src[2 * (counter & 7) - 2 + srcstride]\r
528 .endif\r
529\r
530 ubfx \reg1, \counter, #0, #3 @ reg1 = counter & 7\r
531\r
532 lsl \reg1, #2\r
533\r
@ each head store writes two full q registers (32 bytes) but the destination
@ pointer only advances by 4 * (counter & 7) bytes, so the stores that
@ follow overwrite the surplus output
534 vst2.16 {q14-q15}, [\dst1], \reg1 @ [dst] = E1,E2; dst1 += reg1\r
535\r
536 bic \counter, \counter, #7\r
537\r
538 vst2.16 {q6-q7}, [\dst2], \reg1 @ [dst + dststride] = E3,E4; dst2 += reg1\r
539\r
540 @ counter is aligned to 16 bytes\r
541\r
542 1:\r
543 .ifeqs "\qT", "q11"\r
544 vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8\r
545 .endif\r
546 vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8\r
547 .ifeqs "\qY", "q13"\r
548 vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8\r
549 .endif\r
550\r
551 @ inner loop (8 pixels per iteration)\r
@ each pass emits the block loaded on the previous pass and loads the next
@ block, whose first pixel supplies the right-hand neighbour of pixel 7
552 2:\r
553\r
554@ q0 = S1sl < S >\r
555@ q2 = S3sl < X >\r
556@ q7 = tmp2\r
557@ q15 = tmp1\r
558\r
559 .ifeqs "\qT", "q11"\r
560 vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r
561 vmov \qT, q8 @ S1prev = S1 < T >\r
562 .endif\r
563\r
564 vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r
565 vmov q12, q9 @ S2prev = S2 < C >\r
566\r
567 .ifeqs "\qY", "q13"\r
568 vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r
569 vmov \qY, q10 @ S3prev = S3 < Y >\r
570 .endif\r
571\r
572 .ifeqs "\qT", "q11"\r
573 vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8\r
574 vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < U >\r
575 .endif\r
576\r
577 vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8\r
578 vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < W >\r
579\r
580 .ifeqs "\qY", "q13"\r
581 vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8\r
582 vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < Z >\r
583 .else\r
584 vmov q2, q1 @ S3sl = S2sl < X >\r
585\r
586 vmov q5, q4 @ S3sr = S2sr < Z >\r
587 .endif\r
588\r
589 .ifnes "\qT", "q11"\r
590 vmov q0, q1 @ S1sl = S2sl < S >\r
591\r
592 vmov q3, q4 @ S1sr = S2sr < U >\r
593 .endif\r
594\r
595 sub \counter, \counter, #8 @ counter -= 8\r
596 vceq.i16 q14, q0, \qT @ E1 = < S == T >\r
597\r
598 vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r
599\r
600 vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r
601\r
602 vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r
603\r
604 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
605\r
606@ q0 = tmp3\r
607@ q15 = E2\r
608\r
609 vceq.i16 q15, q3, \qT @ E2 = < U == T >\r
610\r
611 vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r
612\r
613 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
614\r
615@ q2 = tmp4\r
616@ q7 = E4\r
617 vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r
618\r
619 vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r
620\r
621 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
622\r
623 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
624\r
625 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
626\r
627 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
628\r
629 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
630\r
631 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
632 vst2.16 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*2*8\r
633\r
@ loop again only while more than 8 pixels remain; the final 8 are left for
@ the tail block below
634 cmp \counter, #8\r
635\r
636 vst2.16 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*2*8\r
637 bhi 2b\r
638\r
639 @ last 8 pixels\r
640\r
641@ q0 = S1sl < S >\r
642@ q2 = S3sl < X >\r
643@ q7 = tmp2\r
644@ q15 = tmp1\r
645\r
646 .ifeqs "\qT", "q11"\r
647 vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r
648 vmov \qT, q8 @ S1prev = S1 < T >\r
649 .endif\r
650\r
651 vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r
652 vmov q12, q9 @ S2prev = S2 < C >\r
653\r
654 .ifeqs "\qY", "q13"\r
655 vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r
656 vmov \qY, q10 @ S3prev = S3 < Y >\r
657 .endif\r
658\r
@ no further block is loaded here: the last pixel of each row is moved into
@ element 0 so that the vext below uses it as the right-hand neighbour of
@ the final pixel
659 .ifeqs "\qT", "q11"\r
660 vshr.u64 d16, d17, #(64-16) @ S1[0] = S1[7] | ...\r
661 .endif\r
662\r
663 vshr.u64 d18, d19, #(64-16) @ S2[0] = S2[7] | ...\r
664\r
665 .ifeqs "\qY", "q13"\r
666 vshr.u64 d20, d21, #(64-16) @ S3[0] = S3[7] | ...\r
667 .endif\r
668 .ifeqs "\qT", "q11"\r
669 vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < U >\r
670 .endif\r
671\r
672 vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < W >\r
673\r
674 .ifeqs "\qY", "q13"\r
675 vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < Z >\r
676 .else\r
677 vmov q2, q1 @ S3sl = S2sl < X >\r
678\r
679 vmov q5, q4 @ S3sr = S2sr < Z >\r
680 .endif\r
681\r
682 .ifnes "\qT", "q11"\r
683 vmov q0, q1 @ S1sl = S2sl < S >\r
684\r
685 vmov q3, q4 @ S1sr = S2sr < U >\r
686 .endif\r
687\r
688 vceq.i16 q14, q0, \qT @ E1 = < S == T >\r
689\r
690 vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r
691\r
692 vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r
693\r
694 vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r
695\r
696 vand q14, q14, q15 @ E1 = < S == T && S == V >\r
697\r
698@ q0 = tmp3\r
699@ q15 = E2\r
700\r
701 vceq.i16 q15, q3, \qT @ E2 = < U == T >\r
702\r
703 vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r
704\r
705 vand q6, q6, q7 @ E3 = < X == Y && X == V >\r
706\r
707@ q2 = tmp4\r
708@ q7 = E4\r
709 vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r
710\r
711 vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r
712\r
713 vand q15, q15, q0 @ E2 = < U == T && U == W >\r
714\r
715 vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r
716\r
717 vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r
718\r
719 vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r
720\r
721 vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r
722\r
723 vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r
724 vst2.16 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*2*8\r
725\r
726 vst2.16 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*2*8\r
727\r
728.endm\r
729\r
@ "first" row variant of the 16-bit line macro: passes q12 (the current-row
@ register C) as qT, so the row above aliases the current row, and q13 as qY
@ so the row below is loaded from src3. All other arguments are forwarded.
730.macro _neon_eagle2x_16_16_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
731 __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
732.endm\r
733\r
@ "middle" row variant of the 16-bit line macro: passes q11 as qT and q13 as
@ qY, so both the row above (src1) and the row below (src3) are loaded.
@ All other arguments are forwarded unchanged.
734.macro _neon_eagle2x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
735 __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
736.endm\r
737\r
@ "last" row variant of the 16-bit line macro: passes q11 as qT so the row
@ above is loaded from src1, and q12 (the current-row register C) as qY, so
@ the row below aliases the current row. All other arguments are forwarded.
738.macro _neon_eagle2x_16_16_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r
739 __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r
740.endm\r
741\r
@ Dispatcher for the 16-bit line macros.
@   part        - suffix pasted onto _neon_eagle2x_16_16_line_ to pick the row
@                 variant (this file defines first, middle and last)
@   src1..src3, counter, dst1, dst2, reg1 - forwarded unchanged
@   srcalign16  - absolute expression; when zero the source address operands
@                 are the plain registers, otherwise they carry a :128
@                 alignment qualifier
@   dstalign32  - absolute expression; when zero the destination address
@                 operands are the plain registers, otherwise they carry a
@                 :256 alignment qualifier
742.macro neon_eagle2x_16_16_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32\r
743 .ifeq \srcalign16\r
744\r
745 .ifeq \dstalign32\r
746 _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2\r
747 .else\r
748 _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256\r
749 .endif\r
750\r
751 .else\r
752\r
753 .ifeq \dstalign32\r
754 _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2\r
755 .else\r
756 _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256\r
757 .endif\r
758\r
759 .endif\r
760.endm\r
761\r