e38fee1b |
1 | @@\r |
2 | @@ Copyright (C) 2012 Roman Pauer\r |
3 | @@\r |
4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r |
5 | @@ this software and associated documentation files (the "Software"), to deal in\r |
6 | @@ the Software without restriction, including without limitation the rights to\r |
7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r |
8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r |
9 | @@ so, subject to the following conditions:\r |
10 | @@\r |
11 | @@ The above copyright notice and this permission notice shall be included in all\r |
12 | @@ copies or substantial portions of the Software.\r |
13 | @@\r |
14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r |
15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r |
16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r |
17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r |
18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r |
19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r |
20 | @@ SOFTWARE.\r |
21 | @@\r |
22 | \r |
23 | \r |
24 | @ S T U --\ E1 E2\r |
25 | @ V C W --/ E3 E4\r |
26 | @ X Y Z\r |
27 | \r |
28 | @ q0 = S1sl < S >\r |
29 | @ q1 = S2sl < V >\r |
30 | @ q2 = S3sl < X >\r |
31 | @ q3 = S1sr < U >\r |
32 | @ q4 = S2sr < W >\r |
33 | @ q5 = S3sr < Z >\r |
34 | @ q6 = E3\r |
35 | @ q7 = E4\r |
36 | @ q8 = S1\r |
37 | @ q9 = S2\r |
38 | @ q10 = S3\r |
39 | @ q11 = S1prev < T >\r |
40 | @ q12 = S2prev < C >\r |
41 | @ q13 = S3prev < Y >\r |
42 | @ q14 = E1\r |
43 | @ q15 = E2\r |
44 | \r |
45 | \r |
@ __neon_eagle2x_8_8_line - emit the code that scales one source row of 8-bit
@ pixels to two destination rows (E1/E2 interleaved to dst1, E3/E4 to dst2).
@
@ Parameters (all are assembler operands substituted textually):
@   src1, src2, src3 - core registers addressing the row above, the current
@                      row and the row below (see the "src[-srcstride]" etc.
@                      comments on the loads)
@   counter          - core register holding the number of pixels to process
@   dst1, dst2       - core registers addressing the two output rows
@   reg1             - scratch core register
@   qT, qY           - NEON registers used as T and Y. "q11" / "q13" select
@                      the real neighbouring rows; any other name disables
@                      the loads for that row and S/U (or X/Z) are copied
@                      from V/W of the current row instead
@   alsrc1..alsrc3   - address operands used for loads once counter is a
@                      multiple of 16 (may carry an alignment qualifier)
@   aldst1, aldst2   - address operands used for stores in that same part
@
@ Layout: optional head of (counter & 15) pixels, a 16-pixel loop (label 2)
@ and a final 16-pixel block that replicates the last pixel to the right.
@ Uses q0-q15, numeric local labels 1 and 2, and modifies the flags.
@
@ NOTE(review): the loop body at label 2 is entered by fall-through and the
@ tail block always runs, so after the head this appears to require at least
@ 32 remaining pixels - confirm against the callers.
46 | .macro __neon_eagle2x_8_8_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r |
47 | \r |
@ Seed lane 15 of the "prev" vectors with the first pixel of each row, so the
@ left neighbour of pixel 0 is pixel 0 itself.
48 | .ifeqs "\qT", "q11"\r |
49 | vld1.8 {d23[7]}, [\src1] @ S1prev[15] = src[-srcstride]\r |
50 | .endif\r |
51 | vld1.8 {d25[7]}, [\src2] @ S2prev[15] = src[0]\r |
52 | .ifeqs "\qY", "q13"\r |
53 | vld1.8 {d27[7]}, [\src3] @ S3prev[15] = src[srcstride]\r |
54 | .endif\r |
@ andS sets Z for the beq below; the plain add instructions in between do not
@ change the flags.
55 | andS \reg1, \counter, #15 @ reg1 = counter & 15\r |
56 | \r |
@ A row that is not read (qT/qY substituted) still has its pointer advanced
@ by the whole row length here.
57 | .ifnes "\qT", "q11"\r |
58 | add \src1, \src1, \counter @ src1 += counter\r |
59 | .endif\r |
60 | .ifnes "\qY", "q13"\r |
61 | add \src3, \src3, \counter @ src3 += counter\r |
62 | .endif\r |
63 | beq 1f\r |
64 | \r |
65 | @ first 1-15 pixels - align counter to 16 bytes\r |
@ A full 16-byte vector is loaded and a full 32-byte result is stored per
@ output row, but the pointers advance only by the head length.
66 | \r |
67 | @ q0 = S1sl < S >\r |
68 | @ q2 = S3sl < X >\r |
69 | @ q7 = tmp2\r |
70 | @ q15 = tmp1\r |
71 | \r |
72 | .ifeqs "\qT", "q11"\r |
73 | vld1.8 {q8}, [\src1], \reg1 @ S1 = [src - srcstride]; src1 += counter & 15\r |
74 | .endif\r |
75 | \r |
76 | vld1.8 {q9}, [\src2], \reg1 @ S2 = [src ]; src2 += counter & 15\r |
77 | \r |
78 | .ifeqs "\qY", "q13"\r |
79 | vld1.8 {q10}, [\src3], \reg1 @ S3 = [src + srcstride]; src3 += counter & 15\r |
80 | .endif\r |
81 | .ifeqs "\qT", "q11"\r |
82 | vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r |
83 | \r |
84 | vmov \qT, q8 @ S1prev = S1 < T >\r |
85 | .endif\r |
86 | vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r |
87 | \r |
88 | vmov q12, q9 @ S2prev = S2 < C >\r |
89 | .ifeqs "\qY", "q13"\r |
90 | vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r |
91 | \r |
92 | vmov \qY, q10 @ S3prev = S3 < Y >\r |
93 | .endif\r |
@ In the head the "prev" and current vectors are equal at this point, so the
@ last lane of the shifted-right vectors wraps around to lane 0.
94 | .ifeqs "\qT", "q11"\r |
95 | vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | ... < U >\r |
96 | .endif\r |
97 | \r |
98 | vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | ... < W >\r |
99 | \r |
100 | .ifeqs "\qY", "q13"\r |
101 | vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | ... < Z >\r |
102 | .else\r |
103 | vmov q2, q1 @ S3sl = S2sl < X >\r |
104 | \r |
105 | vmov q5, q4 @ S3sr = S2sr < Z >\r |
106 | .endif\r |
107 | \r |
108 | .ifnes "\qT", "q11"\r |
109 | vmov q0, q1 @ S1sl = S2sl < S >\r |
110 | \r |
111 | vmov q3, q4 @ S1sr = S2sr < U >\r |
112 | .endif\r |
113 | \r |
114 | vceq.i8 q14, q0, \qT @ E1 = < S == T >\r |
115 | \r |
116 | vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r |
117 | \r |
118 | vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r |
119 | \r |
120 | vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r |
121 | \r |
122 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r |
123 | \r |
124 | @ q0 = tmp3\r |
125 | @ q15 = E2\r |
126 | \r |
127 | vceq.i8 q15, q3, \qT @ E2 = < U == T >\r |
128 | \r |
129 | vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r |
130 | \r |
131 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r |
132 | \r |
133 | @ q2 = tmp4\r |
134 | @ q7 = E4\r |
135 | vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r |
136 | \r |
137 | vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r |
138 | \r |
139 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r |
140 | \r |
141 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r |
142 | \r |
143 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r |
144 | \r |
145 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r |
146 | \r |
147 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r |
148 | \r |
@ Reload lane 15 of the "prev" vectors with the pixel just before the new
@ source position: the vmov above copied all 16 lanes, including lanes past
@ the end of the head.
149 | .ifeqs "\qT", "q11"\r |
150 | sub \reg1, \src1, #1\r |
151 | .else\r |
152 | sub \reg1, \src2, #1\r |
153 | .endif\r |
154 | \r |
155 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r |
156 | .ifeqs "\qT", "q11"\r |
157 | vld1.8 {d23[7]}, [\reg1] @ S1prev[15] = src[counter & 15 - 1 - srcstride]\r |
158 | \r |
159 | sub \reg1, \src2, #1\r |
160 | .endif\r |
161 | \r |
162 | vld1.8 {d25[7]}, [\reg1] @ S2prev[15] = src[counter & 15 - 1]\r |
163 | \r |
164 | .ifeqs "\qY", "q13"\r |
165 | sub \reg1, \src3, #1\r |
166 | \r |
167 | vld1.8 {d27[7]}, [\reg1] @ S3prev[15] = src[counter & 15 - 1 + srcstride]\r |
168 | .endif\r |
169 | \r |
@ Each source pixel yields two output bytes per row, hence the shift by 1.
170 | ubfx \reg1, \counter, #0, #4 @ reg1 = counter & 15\r |
171 | \r |
172 | lsl \reg1, #1\r |
173 | \r |
174 | vst2.8 {q14-q15}, [\dst1],\reg1 @ [dst] = E1,E2; dst1 += reg1\r |
175 | \r |
176 | bic \counter, \counter, #15\r |
177 | \r |
178 | vst2.8 {q6-q7}, [\dst2], \reg1 @ [dst + dststride] = E3,E4; dst2 += reg1\r |
179 | \r |
180 | @ counter is aligned to 16 bytes\r |
181 | \r |
182 | 1:\r |
@ Preload the first 16-pixel vectors; every loop iteration below loads the
@ vectors for the following iteration (or for the tail block).
183 | .ifeqs "\qT", "q11"\r |
184 | vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16\r |
185 | .endif\r |
186 | vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16\r |
187 | .ifeqs "\qY", "q13"\r |
188 | vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16\r |
189 | .endif\r |
190 | \r |
191 | @ inner loop (16 pixels per iteration)\r |
192 | 2:\r |
193 | \r |
194 | @ q0 = S1sl < S >\r |
195 | @ q2 = S3sl < X >\r |
196 | @ q7 = tmp2\r |
197 | @ q15 = tmp1\r |
198 | \r |
199 | .ifeqs "\qT", "q11"\r |
200 | vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r |
201 | vmov \qT, q8 @ S1prev = S1 < T >\r |
202 | .endif\r |
203 | \r |
204 | vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r |
205 | vmov q12, q9 @ S2prev = S2 < C >\r |
206 | \r |
207 | .ifeqs "\qY", "q13"\r |
208 | vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r |
209 | vmov \qY, q10 @ S3prev = S3 < Y >\r |
210 | .endif\r |
211 | \r |
@ The next block is loaded before the shifted-right vectors are built because
@ its lane 0 is the right neighbour of this block's lane 15.
212 | .ifeqs "\qT", "q11"\r |
213 | vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16\r |
214 | vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < U >\r |
215 | .endif\r |
216 | \r |
217 | vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16\r |
218 | vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < W >\r |
219 | \r |
220 | .ifeqs "\qY", "q13"\r |
221 | vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16\r |
222 | vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < Z >\r |
223 | .else\r |
224 | vmov q2, q1 @ S3sl = S2sl < X >\r |
225 | \r |
226 | vmov q5, q4 @ S3sr = S2sr < Z >\r |
227 | .endif\r |
228 | \r |
229 | .ifnes "\qT", "q11"\r |
230 | vmov q0, q1 @ S1sl = S2sl < S >\r |
231 | \r |
232 | vmov q3, q4 @ S1sr = S2sr < U >\r |
233 | .endif\r |
234 | \r |
235 | sub \counter, \counter, #16 @ counter -= 16\r |
236 | vceq.i8 q14, q0, \qT @ E1 = < S == T >\r |
237 | \r |
238 | vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r |
239 | \r |
240 | vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r |
241 | \r |
242 | vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r |
243 | \r |
244 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r |
245 | \r |
246 | @ q0 = tmp3\r |
247 | @ q15 = E2\r |
248 | \r |
249 | vceq.i8 q15, q3, \qT @ E2 = < U == T >\r |
250 | \r |
251 | vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r |
252 | \r |
253 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r |
254 | \r |
255 | @ q2 = tmp4\r |
256 | @ q7 = E4\r |
257 | vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r |
258 | \r |
259 | vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r |
260 | \r |
261 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r |
262 | \r |
263 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r |
264 | \r |
265 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r |
266 | \r |
267 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r |
268 | \r |
269 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r |
270 | \r |
271 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r |
272 | vst2.8 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*16\r |
273 | \r |
@ Loop while more than 16 pixels remain; the final 16 are left for the tail.
274 | cmp \counter, #16\r |
275 | \r |
276 | vst2.8 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*16\r |
277 | bhi 2b\r |
278 | \r |
279 | @ last 16 pixels\r |
280 | \r |
281 | @ q0 = S1sl < S >\r |
282 | @ q2 = S3sl < X >\r |
283 | @ q7 = tmp2\r |
284 | @ q15 = tmp1\r |
285 | \r |
286 | .ifeqs "\qT", "q11"\r |
287 | vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r |
288 | vmov \qT, q8 @ S1prev = S1 < T >\r |
289 | .endif\r |
290 | \r |
291 | vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r |
292 | vmov q12, q9 @ S2prev = S2 < C >\r |
293 | \r |
294 | .ifeqs "\qY", "q13"\r |
295 | vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r |
296 | vmov \qY, q10 @ S3prev = S3 < Y >\r |
297 | .endif\r |
298 | \r |
@ No further block is loaded: lane 15 is moved into lane 0 so that the vext
@ below uses the last pixel as its own right neighbour.
299 | .ifeqs "\qT", "q11"\r |
300 | vshr.u64 d16, d17, #(64-8) @ S1[0] = S1[15] | ...\r |
301 | .endif\r |
302 | \r |
303 | vshr.u64 d18, d19, #(64-8) @ S2[0] = S2[15] | ...\r |
304 | \r |
305 | .ifeqs "\qY", "q13"\r |
306 | vshr.u64 d20, d21, #(64-8) @ S3[0] = S3[15] | ...\r |
307 | .endif\r |
308 | .ifeqs "\qT", "q11"\r |
309 | vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < U >\r |
310 | .endif\r |
311 | \r |
312 | vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < W >\r |
313 | \r |
314 | .ifeqs "\qY", "q13"\r |
315 | vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < Z >\r |
316 | .else\r |
317 | vmov q2, q1 @ S3sl = S2sl < X >\r |
318 | \r |
319 | vmov q5, q4 @ S3sr = S2sr < Z >\r |
320 | .endif\r |
321 | \r |
322 | .ifnes "\qT", "q11"\r |
323 | vmov q0, q1 @ S1sl = S2sl < S >\r |
324 | \r |
325 | vmov q3, q4 @ S1sr = S2sr < U >\r |
326 | .endif\r |
327 | \r |
328 | vceq.i8 q14, q0, \qT @ E1 = < S == T >\r |
329 | \r |
330 | vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r |
331 | \r |
332 | vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r |
333 | \r |
334 | vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r |
335 | \r |
336 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r |
337 | \r |
338 | @ q0 = tmp3\r |
339 | @ q15 = E2\r |
340 | \r |
341 | vceq.i8 q15, q3, \qT @ E2 = < U == T >\r |
342 | \r |
343 | vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r |
344 | \r |
345 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r |
346 | \r |
347 | @ q2 = tmp4\r |
348 | @ q7 = E4\r |
349 | vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r |
350 | \r |
351 | vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r |
352 | \r |
353 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r |
354 | \r |
355 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r |
356 | \r |
357 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r |
358 | \r |
359 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r |
360 | \r |
361 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r |
362 | \r |
363 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r |
364 | vst2.8 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*16\r |
365 | \r |
366 | vst2.8 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*16\r |
367 | \r |
368 | .endm\r |
369 | \r |
@ 8-bit row variant that passes q12 (the current row, C) as T and q13 as Y:
@ the row above is not read and the current row stands in for it.
370 | .macro _neon_eagle2x_8_8_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r |
371 | __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r |
372 | .endm\r |
373 | \r |
@ 8-bit row variant that passes q11 as T and q13 as Y: both the row above and
@ the row below are read.
374 | .macro _neon_eagle2x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r |
375 | __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r |
376 | .endm\r |
377 | \r |
@ 8-bit row variant that passes q11 as T and q12 (the current row, C) as Y:
@ the row below is not read and the current row stands in for it.
378 | .macro _neon_eagle2x_8_8_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r |
379 | __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r |
380 | .endm\r |
381 | \r |
@ neon_eagle2x_8_8_line - entry macro for one row of 8-bit pixels.
@   part       - suffix pasted onto "_neon_eagle2x_8_8_line_" to pick the row
@                variant (first / middle / last are defined in this file)
@   src1..reg1 - forwarded unchanged to the selected variant
@   srcalign16 - when zero the plain source registers are used for the
@                16-pixel loads; otherwise ":128" is appended to them
@   dstalign32 - when zero the plain destination registers are used for the
@                16-pixel stores; otherwise ":256" is appended to them
382 | .macro neon_eagle2x_8_8_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32\r |
383 | .ifeq \srcalign16\r |
384 | \r |
385 | .ifeq \dstalign32\r |
386 | _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2\r |
387 | .else\r |
388 | _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256\r |
389 | .endif\r |
390 | \r |
391 | .else\r |
392 | \r |
393 | .ifeq \dstalign32\r |
394 | _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2\r |
395 | .else\r |
396 | _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256\r |
397 | .endif\r |
398 | \r |
399 | .endif\r |
400 | .endm\r |
401 | \r |
402 | \r |
@ __neon_eagle2x_16_16_line - emit the code that scales one source row of
@ 16-bit pixels to two destination rows (E1/E2 interleaved to dst1, E3/E4 to
@ dst2).
@
@ Parameters (all are assembler operands substituted textually):
@   src1, src2, src3 - core registers addressing the row above, the current
@                      row and the row below (see the "src[-srcstride]" etc.
@                      comments on the loads)
@   counter          - core register holding the number of pixels to process
@   dst1, dst2       - core registers addressing the two output rows
@   reg1             - scratch core register
@   qT, qY           - NEON registers used as T and Y. "q11" / "q13" select
@                      the real neighbouring rows; any other name disables
@                      the loads for that row and S/U (or X/Z) are copied
@                      from V/W of the current row instead
@   alsrc1..alsrc3   - address operands used for loads once counter is a
@                      multiple of 8 (may carry an alignment qualifier)
@   aldst1, aldst2   - address operands used for stores in that same part
@
@ Layout: optional head of (counter & 7) pixels, an 8-pixel loop (label 2)
@ and a final 8-pixel block that replicates the last pixel to the right.
@ Uses q0-q15, numeric local labels 1 and 2, and modifies the flags.
@
@ NOTE(review): the loop body at label 2 is entered by fall-through and the
@ tail block always runs, so after the head this appears to require at least
@ 16 remaining pixels - confirm against the callers.
403 | .macro __neon_eagle2x_16_16_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r |
404 | \r |
@ Seed lane 7 of the "prev" vectors with the first pixel of each row, so the
@ left neighbour of pixel 0 is pixel 0 itself.
405 | .ifeqs "\qT", "q11"\r |
406 | vld1.16 {d23[3]}, [\src1] @ S1prev[7] = src[-srcstride]\r |
407 | .endif\r |
408 | vld1.16 {d25[3]}, [\src2] @ S2prev[7] = src[0]\r |
409 | .ifeqs "\qY", "q13"\r |
410 | vld1.16 {d27[3]}, [\src3] @ S3prev[7] = src[srcstride]\r |
411 | .endif\r |
@ andS sets Z for the beq below; the plain add instructions in between do not
@ change the flags.
412 | andS \reg1, \counter, #7 @ reg1 = counter & 7\r |
413 | \r |
@ A row that is not read (qT/qY substituted) still has its pointer advanced
@ by the whole row length (2 bytes per pixel) here.
414 | .ifnes "\qT", "q11"\r |
415 | add \src1, \src1, \counter, lsl #1 @ src1 += 2 * counter\r |
416 | .endif\r |
417 | .ifnes "\qY", "q13"\r |
418 | add \src3, \src3, \counter, lsl #1 @ src3 += 2 * counter\r |
419 | .endif\r |
420 | beq 1f\r |
421 | \r |
422 | @ first 1-7 pixels - align counter to 16 bytes\r |
@ A full 8-pixel vector is loaded and a full 32-byte result is stored per
@ output row, but the pointers advance only by the head length.
423 | \r |
424 | @ q0 = S1sl < S >\r |
425 | @ q2 = S3sl < X >\r |
426 | @ q7 = tmp2\r |
427 | @ q15 = tmp1\r |
428 | \r |
429 | .ifeqs "\qT", "q11"\r |
430 | vld1.16 {q8}, [\src1] @ S1 = [src - srcstride]\r |
431 | add \src1, \src1, \reg1, lsl #1 @ src1 += 2 * (counter & 7)\r |
432 | .endif\r |
433 | \r |
434 | vld1.16 {q9}, [\src2] @ S2 = [src ]\r |
435 | add \src2, \src2, \reg1, lsl #1 @ src2 += 2 * (counter & 7)\r |
436 | \r |
437 | .ifeqs "\qY", "q13"\r |
438 | vld1.16 {q10}, [\src3] @ S3 = [src + srcstride]\r |
439 | add \src3, \src3, \reg1, lsl #1 @ src3 += 2 * (counter & 7)\r |
440 | .endif\r |
441 | .ifeqs "\qT", "q11"\r |
442 | vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r |
443 | \r |
444 | vmov \qT, q8 @ S1prev = S1 < T >\r |
445 | .endif\r |
446 | vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r |
447 | \r |
448 | vmov q12, q9 @ S2prev = S2 < C >\r |
449 | .ifeqs "\qY", "q13"\r |
450 | vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r |
451 | \r |
452 | vmov \qY, q10 @ S3prev = S3 < Y >\r |
453 | .endif\r |
@ In the head the "prev" and current vectors are equal at this point, so the
@ last lane of the shifted-right vectors wraps around to lane 0.
454 | .ifeqs "\qT", "q11"\r |
455 | vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | ... < U >\r |
456 | .endif\r |
457 | \r |
458 | vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | ... < W >\r |
459 | \r |
460 | .ifeqs "\qY", "q13"\r |
461 | vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | ... < Z >\r |
462 | .else\r |
463 | vmov q2, q1 @ S3sl = S2sl < X >\r |
464 | \r |
465 | vmov q5, q4 @ S3sr = S2sr < Z >\r |
466 | .endif\r |
467 | \r |
468 | .ifnes "\qT", "q11"\r |
469 | vmov q0, q1 @ S1sl = S2sl < S >\r |
470 | \r |
471 | vmov q3, q4 @ S1sr = S2sr < U >\r |
472 | .endif\r |
473 | \r |
474 | vceq.i16 q14, q0, \qT @ E1 = < S == T >\r |
475 | \r |
476 | vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r |
477 | \r |
478 | vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r |
479 | \r |
480 | vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r |
481 | \r |
482 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r |
483 | \r |
484 | @ q0 = tmp3\r |
485 | @ q15 = E2\r |
486 | \r |
487 | vceq.i16 q15, q3, \qT @ E2 = < U == T >\r |
488 | \r |
489 | vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r |
490 | \r |
491 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r |
492 | \r |
493 | @ q2 = tmp4\r |
494 | @ q7 = E4\r |
495 | vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r |
496 | \r |
497 | vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r |
498 | \r |
499 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r |
500 | \r |
501 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r |
502 | \r |
503 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r |
504 | \r |
505 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r |
506 | \r |
507 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r |
508 | \r |
@ Reload lane 7 of the "prev" vectors with the pixel just before the new
@ source position: the vmov above copied all 8 lanes, including lanes past
@ the end of the head.
509 | .ifeqs "\qT", "q11"\r |
510 | sub \reg1, \src1, #2\r |
511 | .else\r |
512 | sub \reg1, \src2, #2\r |
513 | .endif\r |
514 | \r |
515 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r |
516 | .ifeqs "\qT", "q11"\r |
517 | vld1.16 {d23[3]}, [\reg1] @ S1prev[7] = src[2 * (counter & 7) - 2 - srcstride]\r |
518 | \r |
519 | sub \reg1, \src2, #2\r |
520 | .endif\r |
521 | \r |
522 | vld1.16 {d25[3]}, [\reg1] @ S2prev[7] = src[2 * (counter & 7) - 2]\r |
523 | \r |
524 | .ifeqs "\qY", "q13"\r |
525 | sub \reg1, \src3, #2\r |
526 | \r |
527 | vld1.16 {d27[3]}, [\reg1] @ S3prev[7] = src[2 * (counter & 7) - 2 + srcstride]\r |
528 | .endif\r |
529 | \r |
@ Each source pixel yields two 2-byte output pixels per row, hence the shift
@ by 2 (4 bytes per head pixel).
530 | ubfx \reg1, \counter, #0, #3 @ reg1 = counter & 7\r |
531 | \r |
532 | lsl \reg1, #2\r |
533 | \r |
534 | vst2.16 {q14-q15}, [\dst1], \reg1 @ [dst] = E1,E2; dst1 += reg1\r |
535 | \r |
536 | bic \counter, \counter, #7\r |
537 | \r |
538 | vst2.16 {q6-q7}, [\dst2], \reg1 @ [dst + dststride] = E3,E4; dst2 += reg1\r |
539 | \r |
540 | @ counter is aligned to 16 bytes\r |
541 | \r |
542 | 1:\r |
@ Preload the first 8-pixel vectors; every loop iteration below loads the
@ vectors for the following iteration (or for the tail block).
543 | .ifeqs "\qT", "q11"\r |
544 | vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8\r |
545 | .endif\r |
546 | vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8\r |
547 | .ifeqs "\qY", "q13"\r |
548 | vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8\r |
549 | .endif\r |
550 | \r |
551 | @ inner loop (8 pixels per iteration)\r |
552 | 2:\r |
553 | \r |
554 | @ q0 = S1sl < S >\r |
555 | @ q2 = S3sl < X >\r |
556 | @ q7 = tmp2\r |
557 | @ q15 = tmp1\r |
558 | \r |
559 | .ifeqs "\qT", "q11"\r |
560 | vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r |
561 | vmov \qT, q8 @ S1prev = S1 < T >\r |
562 | .endif\r |
563 | \r |
564 | vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r |
565 | vmov q12, q9 @ S2prev = S2 < C >\r |
566 | \r |
567 | .ifeqs "\qY", "q13"\r |
568 | vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r |
569 | vmov \qY, q10 @ S3prev = S3 < Y >\r |
570 | .endif\r |
571 | \r |
@ The next block is loaded before the shifted-right vectors are built because
@ its lane 0 is the right neighbour of this block's lane 7.
572 | .ifeqs "\qT", "q11"\r |
573 | vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8\r |
574 | vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < U >\r |
575 | .endif\r |
576 | \r |
577 | vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8\r |
578 | vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < W >\r |
579 | \r |
580 | .ifeqs "\qY", "q13"\r |
581 | vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8\r |
582 | vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < Z >\r |
583 | .else\r |
584 | vmov q2, q1 @ S3sl = S2sl < X >\r |
585 | \r |
586 | vmov q5, q4 @ S3sr = S2sr < Z >\r |
587 | .endif\r |
588 | \r |
589 | .ifnes "\qT", "q11"\r |
590 | vmov q0, q1 @ S1sl = S2sl < S >\r |
591 | \r |
592 | vmov q3, q4 @ S1sr = S2sr < U >\r |
593 | .endif\r |
594 | \r |
595 | sub \counter, \counter, #8 @ counter -= 8\r |
596 | vceq.i16 q14, q0, \qT @ E1 = < S == T >\r |
597 | \r |
598 | vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r |
599 | \r |
600 | vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r |
601 | \r |
602 | vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r |
603 | \r |
604 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r |
605 | \r |
606 | @ q0 = tmp3\r |
607 | @ q15 = E2\r |
608 | \r |
609 | vceq.i16 q15, q3, \qT @ E2 = < U == T >\r |
610 | \r |
611 | vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r |
612 | \r |
613 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r |
614 | \r |
615 | @ q2 = tmp4\r |
616 | @ q7 = E4\r |
617 | vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r |
618 | \r |
619 | vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r |
620 | \r |
621 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r |
622 | \r |
623 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r |
624 | \r |
625 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r |
626 | \r |
627 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r |
628 | \r |
629 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r |
630 | \r |
631 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r |
632 | vst2.16 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*2*8\r |
633 | \r |
@ Loop while more than 8 pixels remain; the final 8 are left for the tail.
634 | cmp \counter, #8\r |
635 | \r |
636 | vst2.16 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*2*8\r |
637 | bhi 2b\r |
638 | \r |
639 | @ last 8 pixels\r |
640 | \r |
641 | @ q0 = S1sl < S >\r |
642 | @ q2 = S3sl < X >\r |
643 | @ q7 = tmp2\r |
644 | @ q15 = tmp1\r |
645 | \r |
646 | .ifeqs "\qT", "q11"\r |
647 | vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r |
648 | vmov \qT, q8 @ S1prev = S1 < T >\r |
649 | .endif\r |
650 | \r |
651 | vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r |
652 | vmov q12, q9 @ S2prev = S2 < C >\r |
653 | \r |
654 | .ifeqs "\qY", "q13"\r |
655 | vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r |
656 | vmov \qY, q10 @ S3prev = S3 < Y >\r |
657 | .endif\r |
658 | \r |
@ No further block is loaded: lane 7 is moved into lane 0 so that the vext
@ below uses the last pixel as its own right neighbour.
659 | .ifeqs "\qT", "q11"\r |
660 | vshr.u64 d16, d17, #(64-16) @ S1[0] = S1[7] | ...\r |
661 | .endif\r |
662 | \r |
663 | vshr.u64 d18, d19, #(64-16) @ S2[0] = S2[7] | ...\r |
664 | \r |
665 | .ifeqs "\qY", "q13"\r |
666 | vshr.u64 d20, d21, #(64-16) @ S3[0] = S3[7] | ...\r |
667 | .endif\r |
668 | .ifeqs "\qT", "q11"\r |
669 | vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < U >\r |
670 | .endif\r |
671 | \r |
672 | vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < W >\r |
673 | \r |
674 | .ifeqs "\qY", "q13"\r |
675 | vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < Z >\r |
676 | .else\r |
677 | vmov q2, q1 @ S3sl = S2sl < X >\r |
678 | \r |
679 | vmov q5, q4 @ S3sr = S2sr < Z >\r |
680 | .endif\r |
681 | \r |
682 | .ifnes "\qT", "q11"\r |
683 | vmov q0, q1 @ S1sl = S2sl < S >\r |
684 | \r |
685 | vmov q3, q4 @ S1sr = S2sr < U >\r |
686 | .endif\r |
687 | \r |
688 | vceq.i16 q14, q0, \qT @ E1 = < S == T >\r |
689 | \r |
690 | vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r |
691 | \r |
692 | vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r |
693 | \r |
694 | vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r |
695 | \r |
696 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r |
697 | \r |
698 | @ q0 = tmp3\r |
699 | @ q15 = E2\r |
700 | \r |
701 | vceq.i16 q15, q3, \qT @ E2 = < U == T >\r |
702 | \r |
703 | vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r |
704 | \r |
705 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r |
706 | \r |
707 | @ q2 = tmp4\r |
708 | @ q7 = E4\r |
709 | vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r |
710 | \r |
711 | vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r |
712 | \r |
713 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r |
714 | \r |
715 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r |
716 | \r |
717 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r |
718 | \r |
719 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r |
720 | \r |
721 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r |
722 | \r |
723 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r |
724 | vst2.16 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*2*8\r |
725 | \r |
726 | vst2.16 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*2*8\r |
727 | \r |
728 | .endm\r |
729 | \r |
@ 16-bit row variant that passes q12 (the current row, C) as T and q13 as Y:
@ the row above is not read and the current row stands in for it.
730 | .macro _neon_eagle2x_16_16_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r |
731 | __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r |
732 | .endm\r |
733 | \r |
@ 16-bit row variant that passes q11 as T and q13 as Y: both the row above
@ and the row below are read.
734 | .macro _neon_eagle2x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r |
735 | __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r |
736 | .endm\r |
737 | \r |
@ 16-bit row variant that passes q11 as T and q12 (the current row, C) as Y:
@ the row below is not read and the current row stands in for it.
738 | .macro _neon_eagle2x_16_16_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r |
739 | __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r |
740 | .endm\r |
741 | \r |
@ neon_eagle2x_16_16_line - entry macro for one row of 16-bit pixels.
@   part       - suffix pasted onto "_neon_eagle2x_16_16_line_" to pick the
@                row variant (first / middle / last are defined in this file)
@   src1..reg1 - forwarded unchanged to the selected variant
@   srcalign16 - when zero the plain source registers are used for the
@                8-pixel loads; otherwise ":128" is appended to them
@   dstalign32 - when zero the plain destination registers are used for the
@                8-pixel stores; otherwise ":256" is appended to them
742 | .macro neon_eagle2x_16_16_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32\r |
743 | .ifeq \srcalign16\r |
744 | \r |
745 | .ifeq \dstalign32\r |
746 | _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2\r |
747 | .else\r |
748 | _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256\r |
749 | .endif\r |
750 | \r |
751 | .else\r |
752 | \r |
753 | .ifeq \dstalign32\r |
754 | _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2\r |
755 | .else\r |
756 | _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256\r |
757 | .endif\r |
758 | \r |
759 | .endif\r |
760 | .endm\r |
761 | \r |