Commit | Line | Data |
---|---|---|
7fc3ac8a H |
1 | @@\r |
2 | @@ Copyright (C) 2012 Roman Pauer\r | |
3 | @@\r | |
4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r | |
5 | @@ this software and associated documentation files (the "Software"), to deal in\r | |
6 | @@ the Software without restriction, including without limitation the rights to\r | |
7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r | |
8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r | |
9 | @@ so, subject to the following conditions:\r | |
10 | @@\r | |
11 | @@ The above copyright notice and this permission notice shall be included in all\r | |
12 | @@ copies or substantial portions of the Software.\r | |
13 | @@\r | |
14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r | |
15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r | |
16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r | |
17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r | |
18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r | |
19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r | |
20 | @@ SOFTWARE.\r | |
21 | @@\r | |
22 | \r | |
23 | \r | |
24 | @ S T U --\ E1 E2\r | |
25 | @ V C W --/ E3 E4\r | |
26 | @ X Y Z\r | |
27 | \r | |
28 | @ q0 = S1sl < S >\r | |
29 | @ q1 = S2sl < V >\r | |
30 | @ q2 = S3sl < X >\r | |
31 | @ q3 = S1sr < U >\r | |
32 | @ q4 = S2sr < W >\r | |
33 | @ q5 = S3sr < Z >\r | |
34 | @ q6 = E3\r | |
35 | @ q7 = E4\r | |
36 | @ q8 = S1\r | |
37 | @ q9 = S2\r | |
38 | @ q10 = S3\r | |
39 | @ q11 = S1prev < T >\r | |
40 | @ q12 = S2prev < C >\r | |
41 | @ q13 = S3prev < Y >\r | |
42 | @ q14 = E1\r | |
43 | @ q15 = E2\r | |
44 | \r | |
45 | @ __neon_eagle2x_8_8_line: one row of 8-bit source pixels in, two rows of doubled width out (E1,E2 interleaved via dst1; E3,E4 interleaved via dst2).\r | |
46 | .macro __neon_eagle2x_8_8_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r | |
47 | @ qT / qY name the registers used as T and Y: q11 / q13 when src1 / src3 are loaded, anything else means that row is not loaded and S,U / X,Z are copied from V,W.\r | |
48 | .ifeqs "\qT", "q11"\r | |
49 | vld1.8 {d23[7]}, [\src1] @ S1prev[15] = src[-srcstride]\r | |
50 | .endif\r | |
51 | vld1.8 {d25[7]}, [\src2] @ S2prev[15] = src[0]\r | |
52 | .ifeqs "\qY", "q13"\r | |
53 | vld1.8 {d27[7]}, [\src3] @ S3prev[15] = src[srcstride]\r | |
54 | .endif\r | |
55 | andS \reg1, \counter, #15 @ reg1 = counter & 15\r | |
56 | @ the flags set by andS are consumed by the beq below; the plain add instructions in between leave them untouched\r | |
57 | .ifnes "\qT", "q11"\r | |
58 | add \src1, \src1, \counter @ src1 += counter\r | |
59 | .endif\r | |
60 | .ifnes "\qY", "q13"\r | |
61 | add \src3, \src3, \counter @ src3 += counter\r | |
62 | .endif\r | |
63 | beq 1f\r | |
64 | \r | |
65 | @ first 1-15 pixels - align counter to 16 bytes\r | |
66 | \r | |
67 | @ q0 = S1sl < S >\r | |
68 | @ q2 = S3sl < X >\r | |
69 | @ q7 = tmp2\r | |
70 | @ q15 = tmp1\r | |
71 | \r | |
72 | .ifeqs "\qT", "q11"\r | |
73 | vld1.8 {q8}, [\src1], \reg1 @ S1 = [src - srcstride]; src1 += counter & 15\r | |
74 | .endif\r | |
75 | \r | |
76 | vld1.8 {q9}, [\src2], \reg1 @ S2 = [src ]; src2 += counter & 15\r | |
77 | \r | |
78 | .ifeqs "\qY", "q13"\r | |
79 | vld1.8 {q10}, [\src3], \reg1 @ S3 = [src + srcstride]; src3 += counter & 15\r | |
80 | .endif\r | |
81 | .ifeqs "\qT", "q11"\r | |
82 | vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r | |
83 | \r | |
84 | vmov \qT, q8 @ S1prev = S1 < T >\r | |
85 | .endif\r | |
86 | vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r | |
87 | \r | |
88 | vmov q12, q9 @ S2prev = S2 < C >\r | |
89 | .ifeqs "\qY", "q13"\r | |
90 | vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r | |
91 | \r | |
92 | vmov \qY, q10 @ S3prev = S3 < Y >\r | |
93 | .endif\r | |
94 | .ifeqs "\qT", "q11"\r | |
95 | vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | ... < U >\r | |
96 | .endif\r | |
97 | \r | |
98 | vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | ... < W >\r | |
99 | \r | |
100 | .ifeqs "\qY", "q13"\r | |
101 | vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | ... < Z >\r | |
102 | .else\r | |
103 | vmov q2, q1 @ S3sl = S2sl < X >\r | |
104 | \r | |
105 | vmov q5, q4 @ S3sr = S2sr < Z >\r | |
106 | .endif\r | |
107 | @ qT is not q11: the row above was not loaded, so S and U are copies of V and W\r | |
108 | .ifnes "\qT", "q11"\r | |
109 | vmov q0, q1 @ S1sl = S2sl < S >\r | |
110 | \r | |
111 | vmov q3, q4 @ S1sr = S2sr < U >\r | |
112 | .endif\r | |
113 | \r | |
114 | vceq.i8 q14, q0, \qT @ E1 = < S == T >\r | |
115 | \r | |
116 | vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r | |
117 | \r | |
118 | vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r | |
119 | \r | |
120 | vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r | |
121 | \r | |
122 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r | |
123 | \r | |
124 | @ q0 = tmp3\r | |
125 | @ q15 = E2\r | |
126 | \r | |
127 | vceq.i8 q15, q3, \qT @ E2 = < U == T >\r | |
128 | \r | |
129 | vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r | |
130 | \r | |
131 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r | |
132 | \r | |
133 | @ q2 = tmp4\r | |
134 | @ q7 = E4\r | |
135 | vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r | |
136 | \r | |
137 | vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r | |
138 | \r | |
139 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r | |
140 | \r | |
141 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r | |
142 | \r | |
143 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r | |
144 | \r | |
145 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r | |
146 | \r | |
147 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r | |
148 | \r | |
149 | .ifeqs "\qT", "q11"\r | |
150 | sub \reg1, \src1, #1\r | |
151 | .else\r | |
152 | sub \reg1, \src2, #1\r | |
153 | .endif\r | |
154 | \r | |
155 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r | |
156 | .ifeqs "\qT", "q11"\r | |
157 | vld1.8 {d23[7]}, [\reg1] @ S1prev[15] = src[counter & 15 - 1 - srcstride]\r | |
158 | \r | |
159 | sub \reg1, \src2, #1\r | |
160 | .endif\r | |
161 | \r | |
162 | vld1.8 {d25[7]}, [\reg1] @ S2prev[15] = src[counter & 15 - 1]\r | |
163 | \r | |
164 | .ifeqs "\qY", "q13"\r | |
165 | sub \reg1, \src3, #1\r | |
166 | \r | |
167 | vld1.8 {d27[7]}, [\reg1] @ S3prev[15] = src[counter & 15 - 1 + srcstride]\r | |
168 | .endif\r | |
169 | \r | |
170 | ubfx \reg1, \counter, #0, #4 @ reg1 = counter & 15\r | |
171 | \r | |
172 | lsl \reg1, #1\r | |
173 | @ each store below writes a full 32 bytes but post-increments its pointer by only 2 * (counter & 15)\r | |
174 | vst2.8 {q14-q15}, [\dst1],\reg1 @ [dst] = E1,E2; dst1 += reg1\r | |
175 | \r | |
176 | bic \counter, \counter, #15\r | |
177 | \r | |
178 | vst2.8 {q6-q7}, [\dst2], \reg1 @ [dst + dststride] = E3,E4; dst2 += reg1\r | |
179 | \r | |
180 | @ counter is aligned to 16 bytes\r | |
181 | \r | |
182 | 1:\r | |
183 | .ifeqs "\qT", "q11"\r | |
184 | vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16\r | |
185 | .endif\r | |
186 | vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16\r | |
187 | .ifeqs "\qY", "q13"\r | |
188 | vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16\r | |
189 | .endif\r | |
190 | @ NOTE(review): the loop body below runs once before counter is tested and the tail after it handles one more vector, so this path always emits 32 pixels or more - presumably callers guarantee that many remain; confirm\r | |
191 | @ inner loop (16 pixels per iteration)\r | |
192 | 2:\r | |
193 | \r | |
194 | @ q0 = S1sl < S >\r | |
195 | @ q2 = S3sl < X >\r | |
196 | @ q7 = tmp2\r | |
197 | @ q15 = tmp1\r | |
198 | \r | |
199 | .ifeqs "\qT", "q11"\r | |
200 | vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r | |
201 | vmov \qT, q8 @ S1prev = S1 < T >\r | |
202 | .endif\r | |
203 | \r | |
204 | vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r | |
205 | vmov q12, q9 @ S2prev = S2 < C >\r | |
206 | \r | |
207 | .ifeqs "\qY", "q13"\r | |
208 | vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r | |
209 | vmov \qY, q10 @ S3prev = S3 < Y >\r | |
210 | .endif\r | |
211 | \r | |
212 | .ifeqs "\qT", "q11"\r | |
213 | vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16\r | |
214 | vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < U >\r | |
215 | .endif\r | |
216 | \r | |
217 | vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16\r | |
218 | vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < W >\r | |
219 | \r | |
220 | .ifeqs "\qY", "q13"\r | |
221 | vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16\r | |
222 | vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < Z >\r | |
223 | .else\r | |
224 | vmov q2, q1 @ S3sl = S2sl < X >\r | |
225 | \r | |
226 | vmov q5, q4 @ S3sr = S2sr < Z >\r | |
227 | .endif\r | |
228 | \r | |
229 | .ifnes "\qT", "q11"\r | |
230 | vmov q0, q1 @ S1sl = S2sl < S >\r | |
231 | \r | |
232 | vmov q3, q4 @ S1sr = S2sr < U >\r | |
233 | .endif\r | |
234 | \r | |
235 | sub \counter, \counter, #16 @ counter -= 16\r | |
236 | vceq.i8 q14, q0, \qT @ E1 = < S == T >\r | |
237 | \r | |
238 | vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r | |
239 | \r | |
240 | vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r | |
241 | \r | |
242 | vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r | |
243 | \r | |
244 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r | |
245 | \r | |
246 | @ q0 = tmp3\r | |
247 | @ q15 = E2\r | |
248 | \r | |
249 | vceq.i8 q15, q3, \qT @ E2 = < U == T >\r | |
250 | \r | |
251 | vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r | |
252 | \r | |
253 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r | |
254 | \r | |
255 | @ q2 = tmp4\r | |
256 | @ q7 = E4\r | |
257 | vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r | |
258 | \r | |
259 | vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r | |
260 | \r | |
261 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r | |
262 | \r | |
263 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r | |
264 | \r | |
265 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r | |
266 | \r | |
267 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r | |
268 | \r | |
269 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r | |
270 | \r | |
271 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r | |
272 | vst2.8 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*16\r | |
273 | \r | |
274 | cmp \counter, #16\r | |
275 | \r | |
276 | vst2.8 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*16\r | |
277 | bhi 2b\r | |
278 | \r | |
279 | @ last 16 pixels\r | |
280 | \r | |
281 | @ q0 = S1sl < S >\r | |
282 | @ q2 = S3sl < X >\r | |
283 | @ q7 = tmp2\r | |
284 | @ q15 = tmp1\r | |
285 | \r | |
286 | .ifeqs "\qT", "q11"\r | |
287 | vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >\r | |
288 | vmov \qT, q8 @ S1prev = S1 < T >\r | |
289 | .endif\r | |
290 | \r | |
291 | vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >\r | |
292 | vmov q12, q9 @ S2prev = S2 < C >\r | |
293 | \r | |
294 | .ifeqs "\qY", "q13"\r | |
295 | vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >\r | |
296 | vmov \qY, q10 @ S3prev = S3 < Y >\r | |
297 | .endif\r | |
298 | @ nothing more is loaded here: pixel 15 is copied into lane 0 so that the vext.8 #1 below uses the last pixel as its own right-hand neighbour\r | |
299 | .ifeqs "\qT", "q11"\r | |
300 | vshr.u64 d16, d17, #(64-8) @ S1[0] = S1[15] | ...\r | |
301 | .endif\r | |
302 | \r | |
303 | vshr.u64 d18, d19, #(64-8) @ S2[0] = S2[15] | ...\r | |
304 | \r | |
305 | .ifeqs "\qY", "q13"\r | |
306 | vshr.u64 d20, d21, #(64-8) @ S3[0] = S3[15] | ...\r | |
307 | .endif\r | |
308 | .ifeqs "\qT", "q11"\r | |
309 | vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < U >\r | |
310 | .endif\r | |
311 | \r | |
312 | vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < W >\r | |
313 | \r | |
314 | .ifeqs "\qY", "q13"\r | |
315 | vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < Z >\r | |
316 | .else\r | |
317 | vmov q2, q1 @ S3sl = S2sl < X >\r | |
318 | \r | |
319 | vmov q5, q4 @ S3sr = S2sr < Z >\r | |
320 | .endif\r | |
321 | \r | |
322 | .ifnes "\qT", "q11"\r | |
323 | vmov q0, q1 @ S1sl = S2sl < S >\r | |
324 | \r | |
325 | vmov q3, q4 @ S1sr = S2sr < U >\r | |
326 | .endif\r | |
327 | \r | |
328 | vceq.i8 q14, q0, \qT @ E1 = < S == T >\r | |
329 | \r | |
330 | vceq.i8 q15, q0, q1 @ tmp1 = < S == V >\r | |
331 | \r | |
332 | vceq.i8 q6, q2, \qY @ E3 = < X == Y >\r | |
333 | \r | |
334 | vceq.i8 q7, q2, q1 @ tmp2 = < X == V >\r | |
335 | \r | |
336 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r | |
337 | \r | |
338 | @ q0 = tmp3\r | |
339 | @ q15 = E2\r | |
340 | \r | |
341 | vceq.i8 q15, q3, \qT @ E2 = < U == T >\r | |
342 | \r | |
343 | vceq.i8 q0, q3, q4 @ tmp3 = < U == W >\r | |
344 | \r | |
345 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r | |
346 | \r | |
347 | @ q2 = tmp4\r | |
348 | @ q7 = E4\r | |
349 | vceq.i8 q7, q5, \qY @ E4 = < Z == Y >\r | |
350 | \r | |
351 | vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >\r | |
352 | \r | |
353 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r | |
354 | \r | |
355 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r | |
356 | \r | |
357 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r | |
358 | \r | |
359 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r | |
360 | \r | |
361 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r | |
362 | \r | |
363 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r | |
364 | vst2.8 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*16\r | |
365 | \r | |
366 | vst2.8 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*16\r | |
367 | \r | |
368 | .endm\r | |
369 | @ _first variant: passes qT = q12 and qY = q13, so src1 is never loaded (the current row stands in for the row above) while src3 is loaded normally.\r | |
370 | .macro _neon_eagle2x_8_8_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r | |
371 | __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r | |
372 | .endm\r | |
373 | @ _middle variant: passes qT = q11 and qY = q13, so both the row above (src1) and the row below (src3) are loaded.\r | |
374 | .macro _neon_eagle2x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r | |
375 | __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r | |
376 | .endm\r | |
377 | @ _last variant: passes qT = q11 and qY = q12, so src1 is loaded normally while src3 is never loaded (the current row stands in for the row below).\r | |
378 | .macro _neon_eagle2x_8_8_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r | |
379 | __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r | |
380 | .endm\r | |
381 | @ neon_eagle2x_8_8_line: expands the first / middle / last variant named by part; srcalign16 and dstalign32 (zero or non-zero) choose whether the main-loop addresses carry :128 / :256 alignment qualifiers.\r | |
382 | .macro neon_eagle2x_8_8_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32\r | |
383 | .ifeq \srcalign16\r | |
384 | @ srcalign16 == 0: main-loop source addresses are passed without an alignment qualifier\r | |
385 | .ifeq \dstalign32\r | |
386 | _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2\r | |
387 | .else\r | |
388 | _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256\r | |
389 | .endif\r | |
390 | @ NOTE(review): the 16_16 wrapper spells part of its qualifiers as A128 / A256 instead of :128 / :256 - confirm whether this macro needs the same treatment\r | |
391 | .else\r | |
392 | @ srcalign16 != 0: main-loop source addresses carry the :128 qualifier\r | |
393 | .ifeq \dstalign32\r | |
394 | _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2\r | |
395 | .else\r | |
396 | _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256\r | |
397 | .endif\r | |
398 | \r | |
399 | .endif\r | |
400 | .endm\r | |
401 | \r | |
402 | @ __neon_eagle2x_16_16_line: one row of 16-bit source pixels in (8 per q register), two rows of doubled width out; counter is in pixels, so pointers move 2 bytes per pixel.\r | |
403 | .macro __neon_eagle2x_16_16_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r | |
404 | @ qT / qY name the registers used as T and Y: q11 / q13 when src1 / src3 are loaded, anything else means that row is not loaded and S,U / X,Z are copied from V,W.\r | |
405 | .ifeqs "\qT", "q11"\r | |
406 | vld1.16 {d23[3]}, [\src1] @ S1prev[7] = src[-srcstride]\r | |
407 | .endif\r | |
408 | vld1.16 {d25[3]}, [\src2] @ S2prev[7] = src[0]\r | |
409 | .ifeqs "\qY", "q13"\r | |
410 | vld1.16 {d27[3]}, [\src3] @ S3prev[7] = src[srcstride]\r | |
411 | .endif\r | |
412 | andS \reg1, \counter, #7 @ reg1 = counter & 7\r | |
413 | @ the flags set by andS are consumed by the beq below; the plain add instructions in between leave them untouched\r | |
414 | .ifnes "\qT", "q11"\r | |
415 | add \src1, \src1, \counter, lsl #1 @ src1 += 2 * counter\r | |
416 | .endif\r | |
417 | .ifnes "\qY", "q13"\r | |
418 | add \src3, \src3, \counter, lsl #1 @ src3 += 2 * counter\r | |
419 | .endif\r | |
420 | beq 1f\r | |
421 | \r | |
422 | @ first 1-7 pixels - round counter down to a multiple of 8 pixels (16 bytes)\r | |
423 | \r | |
424 | @ q0 = S1sl < S >\r | |
425 | @ q2 = S3sl < X >\r | |
426 | @ q7 = tmp2\r | |
427 | @ q15 = tmp1\r | |
428 | \r | |
429 | .ifeqs "\qT", "q11"\r | |
430 | vld1.16 {q8}, [\src1] @ S1 = [src - srcstride]\r | |
431 | add \src1, \src1, \reg1, lsl #1 @ src1 += 2 * (counter & 7)\r | |
432 | .endif\r | |
433 | \r | |
434 | vld1.16 {q9}, [\src2] @ S2 = [src ]\r | |
435 | add \src2, \src2, \reg1, lsl #1 @ src2 += 2 * (counter & 7)\r | |
436 | \r | |
437 | .ifeqs "\qY", "q13"\r | |
438 | vld1.16 {q10}, [\src3] @ S3 = [src + srcstride]\r | |
439 | add \src3, \src3, \reg1, lsl #1 @ src3 += 2 * (counter & 7)\r | |
440 | .endif\r | |
441 | .ifeqs "\qT", "q11"\r | |
442 | vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r | |
443 | \r | |
444 | vmov \qT, q8 @ S1prev = S1 < T >\r | |
445 | .endif\r | |
446 | vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r | |
447 | \r | |
448 | vmov q12, q9 @ S2prev = S2 < C >\r | |
449 | .ifeqs "\qY", "q13"\r | |
450 | vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r | |
451 | \r | |
452 | vmov \qY, q10 @ S3prev = S3 < Y >\r | |
453 | .endif\r | |
454 | .ifeqs "\qT", "q11"\r | |
455 | vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | ... < U >\r | |
456 | .endif\r | |
457 | \r | |
458 | vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | ... < W >\r | |
459 | \r | |
460 | .ifeqs "\qY", "q13"\r | |
461 | vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | ... < Z >\r | |
462 | .else\r | |
463 | vmov q2, q1 @ S3sl = S2sl < X >\r | |
464 | \r | |
465 | vmov q5, q4 @ S3sr = S2sr < Z >\r | |
466 | .endif\r | |
467 | @ qT is not q11: the row above was not loaded, so S and U are copies of V and W\r | |
468 | .ifnes "\qT", "q11"\r | |
469 | vmov q0, q1 @ S1sl = S2sl < S >\r | |
470 | \r | |
471 | vmov q3, q4 @ S1sr = S2sr < U >\r | |
472 | .endif\r | |
473 | \r | |
474 | vceq.i16 q14, q0, \qT @ E1 = < S == T >\r | |
475 | \r | |
476 | vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r | |
477 | \r | |
478 | vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r | |
479 | \r | |
480 | vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r | |
481 | \r | |
482 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r | |
483 | \r | |
484 | @ q0 = tmp3\r | |
485 | @ q15 = E2\r | |
486 | \r | |
487 | vceq.i16 q15, q3, \qT @ E2 = < U == T >\r | |
488 | \r | |
489 | vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r | |
490 | \r | |
491 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r | |
492 | \r | |
493 | @ q2 = tmp4\r | |
494 | @ q7 = E4\r | |
495 | vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r | |
496 | \r | |
497 | vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r | |
498 | \r | |
499 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r | |
500 | \r | |
501 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r | |
502 | \r | |
503 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r | |
504 | \r | |
505 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r | |
506 | \r | |
507 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r | |
508 | \r | |
509 | .ifeqs "\qT", "q11"\r | |
510 | sub \reg1, \src1, #2\r | |
511 | .else\r | |
512 | sub \reg1, \src2, #2\r | |
513 | .endif\r | |
514 | \r | |
515 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r | |
516 | .ifeqs "\qT", "q11"\r | |
517 | vld1.16 {d23[3]}, [\reg1] @ S1prev[7] = src[2 * (counter & 7) - 2 - srcstride]\r | |
518 | \r | |
519 | sub \reg1, \src2, #2\r | |
520 | .endif\r | |
521 | \r | |
522 | vld1.16 {d25[3]}, [\reg1] @ S2prev[7] = src[2 * (counter & 7) - 2]\r | |
523 | \r | |
524 | .ifeqs "\qY", "q13"\r | |
525 | sub \reg1, \src3, #2\r | |
526 | \r | |
527 | vld1.16 {d27[3]}, [\reg1] @ S3prev[7] = src[2 * (counter & 7) - 2 + srcstride]\r | |
528 | .endif\r | |
529 | @ optional conversion of the four result vectors by bgr1555_to_rgb565 (defined elsewhere); q8-q10 and reg1 are presumably its scratch - reg1 and q9 are rewritten right after\r | |
6ce097ba | 530 | #ifdef DO_BGR_TO_RGB\r |
c688b90f | 531 | bgr1555_to_rgb565 q14, q15, q8, q9, q10, \reg1\r |
532 | bgr1555_to_rgb565 q6, q7, q8, q9, q10, \reg1\r | |
6ce097ba | 533 | #endif\r |
534 | \r | |
7fc3ac8a H |
535 | ubfx \reg1, \counter, #0, #3 @ reg1 = counter & 7\r |
536 | \r | |
537 | lsl \reg1, #2\r | |
538 | @ reg1 = 4 * (counter & 7) bytes; each store below writes a full 32 bytes but post-increments its pointer by only reg1\r | |
539 | vst2.16 {q14-q15}, [\dst1], \reg1 @ [dst] = E1,E2; dst1 += reg1\r | |
540 | \r | |
541 | bic \counter, \counter, #7\r | |
542 | \r | |
543 | vst2.16 {q6-q7}, [\dst2], \reg1 @ [dst + dststride] = E3,E4; dst2 += reg1\r | |
544 | \r | |
545 | @ counter is now a multiple of 8 pixels (16 bytes)\r | |
546 | \r | |
547 | 1:\r | |
548 | .ifeqs "\qT", "q11"\r | |
549 | vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8\r | |
550 | .endif\r | |
551 | vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8\r | |
552 | .ifeqs "\qY", "q13"\r | |
553 | vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8\r | |
554 | .endif\r | |
555 | @ NOTE(review): the loop body below runs once before counter is tested and the tail after it handles one more vector, so this path always emits 16 pixels or more - presumably callers guarantee that many remain; confirm\r | |
556 | @ inner loop (8 pixels per iteration)\r | |
557 | 2:\r | |
558 | \r | |
559 | @ q0 = S1sl < S >\r | |
560 | @ q2 = S3sl < X >\r | |
561 | @ q7 = tmp2\r | |
562 | @ q15 = tmp1\r | |
563 | \r | |
564 | .ifeqs "\qT", "q11"\r | |
565 | vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r | |
566 | vmov \qT, q8 @ S1prev = S1 < T >\r | |
567 | .endif\r | |
568 | \r | |
569 | vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r | |
570 | vmov q12, q9 @ S2prev = S2 < C >\r | |
571 | \r | |
572 | .ifeqs "\qY", "q13"\r | |
573 | vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r | |
574 | vmov \qY, q10 @ S3prev = S3 < Y >\r | |
575 | .endif\r | |
576 | \r | |
577 | .ifeqs "\qT", "q11"\r | |
578 | vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8\r | |
579 | vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < U >\r | |
580 | .endif\r | |
581 | \r | |
582 | vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8\r | |
583 | vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < W >\r | |
584 | \r | |
585 | .ifeqs "\qY", "q13"\r | |
586 | vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8\r | |
587 | vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < Z >\r | |
588 | .else\r | |
589 | vmov q2, q1 @ S3sl = S2sl < X >\r | |
590 | \r | |
591 | vmov q5, q4 @ S3sr = S2sr < Z >\r | |
592 | .endif\r | |
593 | \r | |
594 | .ifnes "\qT", "q11"\r | |
595 | vmov q0, q1 @ S1sl = S2sl < S >\r | |
596 | \r | |
597 | vmov q3, q4 @ S1sr = S2sr < U >\r | |
598 | .endif\r | |
599 | \r | |
600 | sub \counter, \counter, #8 @ counter -= 8\r | |
601 | vceq.i16 q14, q0, \qT @ E1 = < S == T >\r | |
602 | \r | |
603 | vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r | |
604 | \r | |
605 | vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r | |
606 | \r | |
607 | vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r | |
608 | \r | |
609 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r | |
610 | \r | |
611 | @ q0 = tmp3\r | |
612 | @ q15 = E2\r | |
613 | \r | |
614 | vceq.i16 q15, q3, \qT @ E2 = < U == T >\r | |
615 | \r | |
616 | vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r | |
617 | \r | |
618 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r | |
619 | \r | |
620 | @ q2 = tmp4\r | |
621 | @ q7 = E4\r | |
622 | vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r | |
623 | \r | |
624 | vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r | |
625 | \r | |
626 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r | |
627 | \r | |
628 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r | |
629 | \r | |
630 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r | |
631 | \r | |
632 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r | |
633 | \r | |
634 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r | |
635 | \r | |
636 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r | |
6ce097ba | 637 | \r |
638 | #ifdef DO_BGR_TO_RGB\r | |
c688b90f | 639 | bgr1555_to_rgb565 q14, q15, q0, q1, q2, \reg1\r |
640 | bgr1555_to_rgb565 q6, q7, q0, q1, q2, \reg1\r | |
6ce097ba | 641 | #endif\r |
642 | \r | |
7fc3ac8a H |
643 | vst2.16 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*2*8\r |
644 | \r | |
645 | cmp \counter, #8\r | |
646 | \r | |
647 | vst2.16 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*2*8\r | |
648 | bhi 2b\r | |
649 | \r | |
650 | @ last 8 pixels\r | |
651 | \r | |
652 | @ q0 = S1sl < S >\r | |
653 | @ q2 = S3sl < X >\r | |
654 | @ q7 = tmp2\r | |
655 | @ q15 = tmp1\r | |
656 | \r | |
657 | .ifeqs "\qT", "q11"\r | |
658 | vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >\r | |
659 | vmov \qT, q8 @ S1prev = S1 < T >\r | |
660 | .endif\r | |
661 | \r | |
662 | vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >\r | |
663 | vmov q12, q9 @ S2prev = S2 < C >\r | |
664 | \r | |
665 | .ifeqs "\qY", "q13"\r | |
666 | vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >\r | |
667 | vmov \qY, q10 @ S3prev = S3 < Y >\r | |
668 | .endif\r | |
669 | @ nothing more is loaded here: pixel 7 is copied into lane 0 so that the vext.8 #2 below uses the last pixel as its own right-hand neighbour\r | |
670 | .ifeqs "\qT", "q11"\r | |
671 | vshr.u64 d16, d17, #(64-16) @ S1[0] = S1[7] | ...\r | |
672 | .endif\r | |
673 | \r | |
674 | vshr.u64 d18, d19, #(64-16) @ S2[0] = S2[7] | ...\r | |
675 | \r | |
676 | .ifeqs "\qY", "q13"\r | |
677 | vshr.u64 d20, d21, #(64-16) @ S3[0] = S3[7] | ...\r | |
678 | .endif\r | |
679 | .ifeqs "\qT", "q11"\r | |
680 | vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < U >\r | |
681 | .endif\r | |
682 | \r | |
683 | vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < W >\r | |
684 | \r | |
685 | .ifeqs "\qY", "q13"\r | |
686 | vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < Z >\r | |
687 | .else\r | |
688 | vmov q2, q1 @ S3sl = S2sl < X >\r | |
689 | \r | |
690 | vmov q5, q4 @ S3sr = S2sr < Z >\r | |
691 | .endif\r | |
692 | \r | |
693 | .ifnes "\qT", "q11"\r | |
694 | vmov q0, q1 @ S1sl = S2sl < S >\r | |
695 | \r | |
696 | vmov q3, q4 @ S1sr = S2sr < U >\r | |
697 | .endif\r | |
698 | \r | |
699 | vceq.i16 q14, q0, \qT @ E1 = < S == T >\r | |
700 | \r | |
701 | vceq.i16 q15, q0, q1 @ tmp1 = < S == V >\r | |
702 | \r | |
703 | vceq.i16 q6, q2, \qY @ E3 = < X == Y >\r | |
704 | \r | |
705 | vceq.i16 q7, q2, q1 @ tmp2 = < X == V >\r | |
706 | \r | |
707 | vand q14, q14, q15 @ E1 = < S == T && S == V >\r | |
708 | \r | |
709 | @ q0 = tmp3\r | |
710 | @ q15 = E2\r | |
711 | \r | |
712 | vceq.i16 q15, q3, \qT @ E2 = < U == T >\r | |
713 | \r | |
714 | vceq.i16 q0, q3, q4 @ tmp3 = < U == W >\r | |
715 | \r | |
716 | vand q6, q6, q7 @ E3 = < X == Y && X == V >\r | |
717 | \r | |
718 | @ q2 = tmp4\r | |
719 | @ q7 = E4\r | |
720 | vceq.i16 q7, q5, \qY @ E4 = < Z == Y >\r | |
721 | \r | |
722 | vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >\r | |
723 | \r | |
724 | vand q15, q15, q0 @ E2 = < U == T && U == W >\r | |
725 | \r | |
726 | vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >\r | |
727 | \r | |
728 | vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >\r | |
729 | \r | |
730 | vand q7, q7, q2 @ E4 = < Z == Y && Z == W >\r | |
731 | \r | |
732 | vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >\r | |
733 | \r | |
734 | vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >\r | |
6ce097ba | 735 | \r |
736 | #ifdef DO_BGR_TO_RGB\r | |
c688b90f | 737 | bgr1555_to_rgb565 q14, q15, q8, q9, q10, \reg1\r |
738 | bgr1555_to_rgb565 q6, q7, q8, q9, q10, \reg1\r | |
6ce097ba | 739 | #endif\r |
740 | \r | |
7fc3ac8a H |
741 | vst2.16 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*2*8\r |
742 | \r | |
743 | vst2.16 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*2*8\r | |
744 | \r | |
745 | .endm\r | |
746 | @ _first variant: passes qT = q12 and qY = q13, so src1 is never loaded (the current row stands in for the row above) while src3 is loaded normally.\r | |
747 | .macro _neon_eagle2x_16_16_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r | |
748 | __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r | |
749 | .endm\r | |
750 | @ _middle variant: passes qT = q11 and qY = q13, so both the row above (src1) and the row below (src3) are loaded.\r | |
751 | .macro _neon_eagle2x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r | |
752 | __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r | |
753 | .endm\r | |
754 | @ _last variant: passes qT = q11 and qY = q12, so src1 is loaded normally while src3 is never loaded (the current row stands in for the row below).\r | |
755 | .macro _neon_eagle2x_16_16_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2\r | |
756 | __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2\r | |
757 | .endm\r | |
758 | @ neon_eagle2x_16_16_line: expands the first / middle / last variant named by part; srcalign16 and dstalign32 (zero or non-zero) choose whether the main-loop addresses carry the A128 / A256 alignment qualifiers (preprocessor names defined outside this file).\r | |
759 | .macro neon_eagle2x_16_16_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32\r | |
760 | .ifeq \srcalign16\r | |
761 | @ srcalign16 == 0: main-loop source addresses are passed without an alignment qualifier\r | |
762 | .ifeq \dstalign32\r | |
763 | _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2\r | |
764 | .else\r | |
765 | _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1 A256, \dst2 A256\r | |
766 | .endif\r | |
767 | \r | |
768 | .else\r | |
769 | @ srcalign16 != 0: main-loop source addresses carry the 128-bit qualifier\r | |
770 | .ifeq \dstalign32\r | |
67381db0 | 771 | _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1 A128, \src2 A128, \src3 A128, \dst1, \dst2\r |
7fc3ac8a | 772 | .else\r |
67381db0 | 773 | _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1 A128, \src2 A128, \src3 A128, \dst1 A256, \dst2 A256\r |
7fc3ac8a H |
774 | .endif\r |
775 | \r | |
776 | .endif\r | |
777 | .endm\r | |
778 | \r |