@@
@@ Copyright (C) 2012 Roman Pauer
@@
@@ Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ this software and associated documentation files (the "Software"), to deal in
@@ the Software without restriction, including without limitation the rights to
@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
@@ of the Software, and to permit persons to whom the Software is furnished to do
@@ so, subject to the following conditions:
@@
@@ The above copyright notice and this permission notice shall be included in all
@@ copies or substantial portions of the Software.
@@
@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
@@ SOFTWARE.
@@


@ Scale2x pixel neighbourhood (source) and the four output pixels it expands to:
@
@   A B C   --\   E0 E1
@   D E F   --/   E2 E3
@   G H I
@
@ NEON register allocation used by the line macros below:
@ q0  = E0 (tmp0)
@ q1  = E1 (tmp1)
@ q2  = E2 (tmp2)
@ q3  = E3 (tmp3)
@ q8  = S2prev
@ q9  = S2next
@ q10 = C0   < B == H || D == F >
@ q11 = S1   < B >
@ q12 = S2   < E >
@ q13 = S3   < H >
@ q14 = S2sl < D >
@ q15 = S2sr < F >
@-----------------------------------------------------------------------
@ __neon_scale2x_8_8_line — scale one line of 8bpp pixels 2x horizontally
@ and emit two destination lines (scale2x kernel).
@
@ In:   src1/src2/src3 = previous/current/next source line pointers
@       counter        = number of source pixels (bytes) on the line
@       dst1/dst2      = the two output line pointers (2*counter bytes each)
@       reg1           = scratch ARM register (clobbered)
@       qB/qH          = q register holding row B / row H; the first/last
@                        line wrappers pass q12 or q13 here so the missing
@                        neighbour row aliases E (see .ifeqs guards below)
@       al*            = same pointers, optionally carrying :128/:256
@                        NEON alignment qualifiers for the aligned loop
@ Out:  pointers advanced past the processed line; counter clobbered
@ Clobbers: q0-q3, q8-q15, reg1, flags
@-----------------------------------------------------------------------
.macro __neon_scale2x_8_8_line src1, src2, src3, counter, dst1, dst2, reg1, qB, qH, alsrc1, alsrc2, alsrc3, aldst1, aldst2

        vld1.8 {d17[7]}, [\src2]            @ S2prev[15] = src[0]
        andS \reg1, \counter, #15           @ reg1 = counter & 15

.ifnes "\qB", "q11"
        add \src1, \src1, \counter          @ src1 += counter (row B aliased, not loaded)
.endif
.ifnes "\qH", "q13"
        add \src3, \src3, \counter          @ src3 += counter (row H aliased, not loaded)
.endif
        beq 1f                              @ counter already 16-aligned

    @ first 1-15 pixels - align counter to 16 bytes
        vld1.8 {q12}, [\src2], \reg1        @ S2 = [src] < E >; src2 += counter & 15

.ifeqs "\qB", "q11"
        vld1.8 {\qB}, [\src1], \reg1        @ S1 = [src - srcstride] < B >; src1 += counter & 15
.endif

.ifeqs "\qH", "q13"
        vld1.8 {\qH}, [\src3], \reg1        @ S3 = [src + srcstride] < H >; src3 += counter & 15
.endif
        vext.8 q14, q8, q12, #15            @ S2sl = S2prev[15] | (S2 << 8) < D >

        vceq.i8 q2, \qB, \qH                @ tmp2 = < B == H >

        vmov.8 d17[7], \reg1                @ S2prev[15] = reg1
        vext.8 q15, q12, q9, #1             @ S2sr = (S2 >> 8) | ... < F >

        vceq.i8 q0, q14, \qB                @ tmp0 = < D == B >

        vceq.i8 q3, q14, q15                @ tmp3 = < D == F >

        vceq.i8 q1, \qB, q15                @ tmp1 = < B == F >
        vtbl.8 d17, {d28, d29}, d17         @ S2prev[15] = src[reg1 - 1]

        lsl \reg1, #1                       @ reg1 = 2 * (counter & 15) output bytes
        vorr q10, q2, q3                    @ C0 = < B == H || D == F >

        vceq.i8 q2, q14, \qH                @ tmp2 = < D == H >

        vceq.i8 q3, \qH, q15                @ tmp3 = < H == F >

        vorn q0, q10, q0                    @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                    @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14                   @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15                   @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                    @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                    @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14                   @ E2 = < (C0 || !(D == H)) ? E : D >
        vst2.8 {q0-q1}, [\dst1], \reg1      @ [dst] = E0,E1; dst1 += reg1

        vbsl q3, q12, q15                   @ E3 = < (C0 || !(H == F)) ? E : F >
        bic \counter, \counter, #15

        vst2.8 {q2-q3}, [\dst2], \reg1      @ [dst + dststride] = E2,E3; dst2 += reg1

    @ counter is aligned to 16 bytes

1:
        vld1.8 {q9}, [\alsrc2]!             @ S2next = [src]; src2 += 16

    @ inner loop (16 pixels per iteration)
2:

        vmov q12, q9                        @ S2 = S2next < E >
.ifeqs "\qB", "q11"
        vld1.8 {\qB}, [\alsrc1]!            @ S1 = [src - srcstride] < B >; src1 += 16
.endif

.ifeqs "\qH", "q13"
        vld1.8 {\qH}, [\alsrc3]!            @ S3 = [src + srcstride] < H >; src3 += 16
.endif

        vext.8 q14, q8, q12, #15            @ S2sl = S2prev[15] | (S2 << 8) < D >
        vld1.8 {q9}, [\alsrc2]!             @ S2next = [src]; src2 += 16

        vceq.i8 q2, \qB, \qH                @ tmp2 = < B == H >

        vmov q8, q12                        @ S2prev = S2
        vext.8 q15, q12, q9, #1             @ S2sr = (S2 >> 8) | S2next[0] < F >

        vceq.i8 q0, q14, \qB                @ tmp0 = < D == B >

        vceq.i8 q3, q14, q15                @ tmp3 = < D == F >

        vceq.i8 q1, \qB, q15                @ tmp1 = < B == F >

        sub \counter, \counter, #16         @ counter -= 16

        vorr q10, q2, q3                    @ C0 = < B == H || D == F >

        vceq.i8 q2, q14, \qH                @ tmp2 = < D == H >

        vceq.i8 q3, \qH, q15                @ tmp3 = < H == F >

        vorn q0, q10, q0                    @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                    @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14                   @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15                   @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                    @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                    @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14                   @ E2 = < (C0 || !(D == H)) ? E : D >
        vst2.8 {q0-q1}, [\aldst1]!          @ [dst] = E0,E1; dst1 += 2*16

        cmp \counter, #16

        vbsl q3, q12, q15                   @ E3 = < (C0 || !(H == F)) ? E : F >

        vst2.8 {q2-q3}, [\aldst2]!          @ [dst + dststride] = E2,E3; dst2 += 2*16

        bhi 2b

    @ last 16 pixels (F of the final pixel duplicates E — no pixel to the right)

        vmov q12, q9                        @ S2 = S2next < E >

        vshr.u64 d18, d19, #(64-8)          @ S2next[0] = S2[15] | ...
.ifeqs "\qB", "q11"
        vld1.8 {\qB}, [\alsrc1]!            @ S1 = [src - srcstride] < B >; src1 += 16
.endif

        vext.8 q14, q8, q12, #15            @ S2sl = S2prev[15] | (S2 << 8) < D >

        vext.8 q15, q12, q9, #1             @ S2sr = (S2 >> 8) | S2next[0] < F >
.ifeqs "\qH", "q13"
        vld1.8 {\qH}, [\alsrc3]!            @ S3 = [src + srcstride] < H >; src3 += 16
.endif

        vceq.i8 q0, q14, \qB                @ tmp0 = < D == B >

        vceq.i8 q2, \qB, \qH                @ tmp2 = < B == H >

        vceq.i8 q3, q14, q15                @ tmp3 = < D == F >

        vceq.i8 q1, \qB, q15                @ tmp1 = < B == F >

        vorr q10, q2, q3                    @ C0 = < B == H || D == F >

        vceq.i8 q2, q14, \qH                @ tmp2 = < D == H >

        vceq.i8 q3, \qH, q15                @ tmp3 = < H == F >

        vorn q0, q10, q0                    @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                    @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14                   @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15                   @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                    @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                    @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14                   @ E2 = < (C0 || !(D == H)) ? E : D >
        vst2.8 {q0-q1}, [\aldst1]!          @ [dst] = E0,E1; dst1 += 2*16

        vbsl q3, q12, q15                   @ E3 = < (C0 || !(H == F)) ? E : F >

        vst2.8 {q2-q3}, [\aldst2]!          @ [dst + dststride] = E2,E3; dst2 += 2*16

.endm
218 | \r | |
@ First line of the image: there is no row above, so B aliases E (qB = q12)
@ and the B-row load is skipped inside __neon_scale2x_8_8_line.
.macro _neon_scale2x_8_8_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
222 | \r | |
@ Interior line: both neighbour rows exist, so B and H are loaded normally
@ (qB = q11 / S1, qH = q13 / S3).
.macro _neon_scale2x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
226 | \r | |
@ Last line of the image: there is no row below, so H aliases E (qH = q12)
@ and the H-row load is skipped inside __neon_scale2x_8_8_line.
.macro _neon_scale2x_8_8_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
230 | \r | |
@ Public 8bpp entry: dispatch to the first/middle/last variant, attaching
@ NEON address-alignment qualifiers (:128 for 16-byte-aligned sources,
@ :256 for 32-byte-aligned destinations) when the caller guarantees them.
@ srcalign16 / dstalign32 are assemble-time flags (0 = unaligned).
.macro neon_scale2x_8_8_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
.ifeq \srcalign16

    .ifeq \dstalign32
        _neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
    .else
        _neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
    .endif

.else

    .ifeq \dstalign32
        _neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
    .else
        _neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
    .endif

.endif
.endm
250 | \r | |
251 | \r | |
@-----------------------------------------------------------------------
@ __neon_scale2x_16_16_line — scale one line of 16bpp pixels 2x
@ horizontally and emit two destination lines (scale2x kernel).
@
@ Same structure as the 8bpp variant, operating on 16-bit pixels
@ (8 pixels per q register). counter is in pixels; byte offsets are
@ therefore counter << 1 for sources and << 2 for the doubled output.
@ If DO_BGR_TO_RGB is defined, each output block is additionally run
@ through the bgr1555_to_rgb565 conversion macro (defined elsewhere)
@ before being stored.
@ Clobbers: q0-q3, q8-q15, reg1, flags
@-----------------------------------------------------------------------
.macro __neon_scale2x_16_16_line src1, src2, src3, counter, dst1, dst2, reg1, qB, qH, alsrc1, alsrc2, alsrc3, aldst1, aldst2

        vld1.16 {d17[3]}, [\src2]           @ S2prev[7] = src[0]
        andS \reg1, \counter, #7            @ reg1 = counter & 7

.ifnes "\qB", "q11"
        add \src1, \src1, \counter, lsl #1  @ src1 += 2 * counter (row B aliased, not loaded)
.endif
.ifnes "\qH", "q13"
        add \src3, \src3, \counter, lsl #1  @ src3 += 2 * counter (row H aliased, not loaded)
.endif
        beq 1f                              @ counter already 8-pixel aligned

    @ first 1-7 pixels - align counter to 16 bytes
        vld1.16 {q12}, [\src2]              @ S2 = [src] < E >
        lsl \reg1, #1                       @ reg1 = byte count of leading pixels

.ifeqs "\qB", "q11"
        vld1.16 {\qB}, [\src1]              @ S1 = [src - srcstride] < B >
.endif
        bfi \reg1, \reg1, #8, #8            @ build vtbl index pair for the last 16-bit pixel

.ifeqs "\qH", "q13"
        vld1.16 {\qH}, [\src3]              @ S3 = [src + srcstride] < H >
.endif
        vext.8 q14, q8, q12, #14            @ S2sl = S2prev[7] | (S2 << 16) < D >

        add \reg1, \reg1, #256              @ second index byte selects the high byte
        vceq.i16 q2, \qB, \qH               @ tmp2 = < B == H >

        vmov.16 d17[3], \reg1               @ S2prev[7] = reg1
        vext.8 q15, q12, q9, #2             @ S2sr = (S2 >> 16) | ... < F >

        vceq.i16 q0, q14, \qB               @ tmp0 = < D == B >

        vceq.i16 q3, q14, q15               @ tmp3 = < D == F >

        vceq.i16 q1, \qB, q15               @ tmp1 = < B == F >
        vtbl.8 d17, {d28, d29}, d17         @ S2prev[7] = src[reg1 - 1]

        vorr q10, q2, q3                    @ C0 = < B == H || D == F >
        and \reg1, \counter, #7             @ restore leading pixel count

        vceq.i16 q2, q14, \qH               @ tmp2 = < D == H >

        vceq.i16 q3, \qH, q15               @ tmp3 = < H == F >

        vorn q0, q10, q0                    @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                    @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14                   @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15                   @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                    @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                    @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14                   @ E2 = < (C0 || !(D == H)) ? E : D >

        vbsl q3, q12, q15                   @ E3 = < (C0 || !(H == F)) ? E : F >

#ifdef DO_BGR_TO_RGB
        bgr1555_to_rgb565 q0, q1, q12, q14, q15
        bgr1555_to_rgb565 q2, q3, q12, q14, q15
#endif

        vst2.16 {q0-q1}, [\dst1]            @ [dst] = E0,E1

        bic \counter, \counter, #7
.ifeqs "\qB", "q11"
        add \src1, \src1, \reg1, lsl #1
.endif
        add \src2, \src2, \reg1, lsl #1
.ifeqs "\qH", "q13"
        add \src3, \src3, \reg1, lsl #1
.endif

        vst2.16 {q2-q3}, [\dst2]            @ [dst + dststride] = E2,E3

        add \dst1, \dst1, \reg1, lsl #2     @ advance past 2*2*reg1 output bytes
        add \dst2, \dst2, \reg1, lsl #2

    @ counter is aligned to 16 bytes

1:
        vld1.16 {q9}, [\alsrc2]!            @ S2next = [src]; src2 += 2*8

    @ inner loop (8 pixels per iteration)
2:

        vmov q12, q9                        @ S2 = S2next < E >
.ifeqs "\qB", "q11"
        vld1.16 {\qB}, [\alsrc1]!           @ S1 = [src - srcstride] < B >; src1 += 2*8
.endif

.ifeqs "\qH", "q13"
        vld1.16 {\qH}, [\alsrc3]!           @ S3 = [src + srcstride] < H >; src3 += 2*8
.endif

        vext.8 q14, q8, q12, #14            @ S2sl = S2prev[7] | (S2 << 16) < D >
        vld1.16 {q9}, [\alsrc2]!            @ S2next = [src]; src2 += 2*8

        vceq.i16 q2, \qB, \qH               @ tmp2 = < B == H >

        vmov q8, q12                        @ S2prev = S2
        vext.8 q15, q12, q9, #2             @ S2sr = (S2 >> 16) | S2next[0] < F >

        vceq.i16 q0, q14, \qB               @ tmp0 = < D == B >

        vceq.i16 q3, q14, q15               @ tmp3 = < D == F >

        vceq.i16 q1, \qB, q15               @ tmp1 = < B == F >

        sub \counter, \counter, #8          @ counter -= 8

        vorr q10, q2, q3                    @ C0 = < B == H || D == F >

        vceq.i16 q2, q14, \qH               @ tmp2 = < D == H >

        vceq.i16 q3, \qH, q15               @ tmp3 = < H == F >

        vorn q0, q10, q0                    @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                    @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14                   @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15                   @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                    @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                    @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14                   @ E2 = < (C0 || !(D == H)) ? E : D >

        vbsl q3, q12, q15                   @ E3 = < (C0 || !(H == F)) ? E : F >

#ifdef DO_BGR_TO_RGB
        bgr1555_to_rgb565 q0, q1, q12, q14, q15
        bgr1555_to_rgb565 q2, q3, q12, q14, q15
#endif

        vst2.16 {q0-q1}, [\aldst1]!         @ [dst] = E0,E1; dst1 += 2*2*8

        cmp \counter, #8

        vst2.16 {q2-q3}, [\aldst2]!         @ [dst + dststride] = E2,E3; dst2 += 2*2*8

        bhi 2b

    @ last 8 pixels (F of the final pixel duplicates E — no pixel to the right)

        vmov q12, q9                        @ S2 = S2next < E >

        vshr.u64 d18, d19, #(64-16)         @ S2next[0] = S2[7] | ...
.ifeqs "\qB", "q11"
        vld1.16 {\qB}, [\alsrc1]!           @ S1 = [src - srcstride] < B >; src1 += 2*8
.endif

        vext.8 q14, q8, q12, #14            @ S2sl = S2prev[7] | (S2 << 16) < D >

        vext.8 q15, q12, q9, #2             @ S2sr = (S2 >> 16) | S2next[0] < F >
.ifeqs "\qH", "q13"
        vld1.16 {\qH}, [\alsrc3]!           @ S3 = [src + srcstride] < H >; src3 += 2*8
.endif

        vceq.i16 q0, q14, \qB               @ tmp0 = < D == B >

        vceq.i16 q2, \qB, \qH               @ tmp2 = < B == H >

        vceq.i16 q3, q14, q15               @ tmp3 = < D == F >

        vceq.i16 q1, \qB, q15               @ tmp1 = < B == F >

        vorr q10, q2, q3                    @ C0 = < B == H || D == F >

        vceq.i16 q2, q14, \qH               @ tmp2 = < D == H >

        vceq.i16 q3, \qH, q15               @ tmp3 = < H == F >

        vorn q0, q10, q0                    @ tmp0 = < C0 || !(D == B) >

        vorn q1, q10, q1                    @ tmp1 = < C0 || !(B == F) >

        vbsl q0, q12, q14                   @ E0 = < (C0 || !(D == B)) ? E : D >

        vbsl q1, q12, q15                   @ E1 = < (C0 || !(B == F)) ? E : F >

        vorn q2, q10, q2                    @ tmp2 = < C0 || !(D == H) >

        vorn q3, q10, q3                    @ tmp3 = < C0 || !(H == F) >

        vbsl q2, q12, q14                   @ E2 = < (C0 || !(D == H)) ? E : D >

        vbsl q3, q12, q15                   @ E3 = < (C0 || !(H == F)) ? E : F >

#ifdef DO_BGR_TO_RGB
        bgr1555_to_rgb565 q0, q1, q12, q14, q15
        bgr1555_to_rgb565 q2, q3, q12, q14, q15
#endif

        vst2.16 {q0-q1}, [\aldst1]!         @ [dst] = E0,E1; dst1 += 2*2*8

        vst2.16 {q2-q3}, [\aldst2]!         @ [dst + dststride] = E2,E3; dst2 += 2*2*8

.endm
460 | \r | |
@ First line of the image: there is no row above, so B aliases E (qB = q12)
@ and the B-row load is skipped inside __neon_scale2x_16_16_line.
.macro _neon_scale2x_16_16_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
464 | \r | |
@ Interior line: both neighbour rows exist, so B and H are loaded normally
@ (qB = q11 / S1, qH = q13 / S3).
.macro _neon_scale2x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
468 | \r | |
@ Last line of the image: there is no row below, so H aliases E (qH = q12)
@ and the H-row load is skipped inside __neon_scale2x_16_16_line.
.macro _neon_scale2x_16_16_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
        __neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
472 | \r | |
@ Public 16bpp entry: dispatch to the first/middle/last variant, attaching
@ NEON address-alignment qualifiers (:128 for 16-byte-aligned sources,
@ :256 for 32-byte-aligned destinations) when the caller guarantees them.
@ srcalign16 / dstalign32 are assemble-time flags (0 = unaligned).
@ NOTE(review): the source here read "\src1 A128" / "\dst1 A256"; normalized
@ to the ":128"/":256" qualifier syntax used by the 8bpp dispatcher above.
@ If the build defines helper macros named A128/A256 elsewhere, confirm and
@ restore them instead.
.macro neon_scale2x_16_16_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
.ifeq \srcalign16

    .ifeq \dstalign32
        _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
    .else
        _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
    .endif

.else

    .ifeq \dstalign32
        _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
    .else
        _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
    .endif

.endif
.endm
492 | \r |