e38fee1b |
1 | @@\r |
2 | @@ Copyright (C) 2012 Roman Pauer\r |
3 | @@\r |
4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r |
5 | @@ this software and associated documentation files (the "Software"), to deal in\r |
6 | @@ the Software without restriction, including without limitation the rights to\r |
7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r |
8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r |
9 | @@ so, subject to the following conditions:\r |
10 | @@\r |
11 | @@ The above copyright notice and this permission notice shall be included in all\r |
12 | @@ copies or substantial portions of the Software.\r |
13 | @@\r |
14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r |
15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r |
16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r |
17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r |
18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r |
19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r |
20 | @@ SOFTWARE.\r |
21 | @@\r |
22 | \r |
23 | \r |
24 | \r |
@ _neon_normalxx_8_16_line_middle
@ Shared 16-pixel step used by the neon_normalNx_8_16_line macros.
@  - Loads 16 source bytes as four words from [src] (the calling macros align
@    src to 4 bytes before reaching this step).
@  - Uses every byte as an index into pal (address = pal + index * 4). Only the
@    low 16 bits of each loaded entry survive the bfi packing.
@  - Leaves the 16 converted pixels, in source order, in d16, d17, dA, dB.
@  - Then counter -= 16 and src += 16.
@ The closing "cmp counter, #16" is the flag output of the macro: the caller
@ branches on it (bhs) and must not change the flags before doing so.
@ Clobbers reg1-reg9 and the flags. The dst and dststride arguments are
@ accepted but never referenced in this macro.
@ NOTE(review): lookups and packing are interleaved rather than grouped,
@ presumably for instruction scheduling - keep the order unless measured.
25 | .macro _neon_normalxx_8_16_line_middle src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride, dA, dB\r |
26 | ldr \reg1, [\src] @ reg1 = src[0-3]\r |
27 | \r |
28 | ldr \reg2, [\src, #4] @ reg2 = src[4-7]\r |
29 | \r |
30 | ldr \reg3, [\src, #8] @ reg3 = src[8-11]\r |
31 | \r |
32 | ldr \reg4, [\src, #12] @ reg4 = src[12-15]\r |
33 | ubfx \reg5, \reg1, #0, #8 @ reg5 = src[0]\r |
34 | \r |
35 | ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[0]]\r |
36 | ubfx \reg6, \reg1, #8, #8 @ reg6 = src[1]\r |
37 | \r |
38 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[1]]\r |
39 | ubfx \reg7, \reg1, #16, #8 @ reg7 = src[2]\r |
40 | \r |
41 | ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[2]]\r |
42 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r |
43 | \r |
44 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r |
45 | ubfx \reg8, \reg2, #0, #8 @ reg8 = src[4]\r |
46 | \r |
47 | ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[4]]\r |
48 | ubfx \reg9, \reg2, #8, #8 @ reg9 = src[5]\r |
49 | \r |
50 | ldr \reg9, [\pal, \reg9, lsl #2] @ reg9 = pal[src[5]]\r |
51 | bfi \reg5, \reg6, #16, #16 @ reg5 = pal[src[0]] | pal[src[1]] << 16\r |
52 | \r |
53 | bfi \reg7, \reg1, #16, #16 @ reg7 = pal[src[2]] | pal[src[3]] << 16\r |
54 | ubfx \reg6, \reg2, #16, #8 @ reg6 = src[6]\r |
55 | \r |
56 | vmov d16, \reg5, \reg7 @ d16 = pal[src[0-3]]\r |
57 | lsr \reg2, \reg2, #24 @ reg2 = src[7]\r |
58 | \r |
59 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[6]]\r |
60 | bfi \reg8, \reg9, #16, #16 @ reg8 = pal[src[4]] | pal[src[5]] << 16\r |
61 | \r |
62 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[7]]\r |
63 | ubfx \reg1, \reg3, #0, #8 @ reg1 = src[8]\r |
64 | \r |
65 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[8]]\r |
66 | ubfx \reg5, \reg3, #8, #8 @ reg5 = src[9]\r |
67 | \r |
68 | ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[9]]\r |
69 | ubfx \reg7, \reg3, #16, #8 @ reg7 = src[10]\r |
70 | \r |
71 | ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[10]]\r |
72 | bfi \reg6, \reg2, #16, #16 @ reg6 = pal[src[6]] | pal[src[7]] << 16\r |
73 | \r |
74 | vmov d17, \reg8, \reg6 @ d17 = pal[src[4-7]]\r |
75 | lsr \reg3, \reg3, #24 @ reg3 = src[11]\r |
76 | \r |
77 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[11]]\r |
78 | ubfx \reg2, \reg4, #0, #8 @ reg2 = src[12]\r |
79 | \r |
80 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[12]]\r |
81 | ubfx \reg6, \reg4, #8, #8 @ reg6 = src[13]\r |
82 | \r |
83 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[13]]\r |
84 | ubfx \reg8, \reg4, #16, #8 @ reg8 = src[14]\r |
85 | \r |
86 | ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[14]]\r |
87 | lsr \reg4, \reg4, #24 @ reg4 = src[15]\r |
88 | \r |
89 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[15]]\r |
90 | bfi \reg1, \reg5, #16, #16 @ reg1 = pal[src[8]] | pal[src[9]] << 16\r |
91 | \r |
92 | bfi \reg7, \reg3, #16, #16 @ reg7 = pal[src[10]] | pal[src[11]] << 16\r |
93 | bfi \reg2, \reg6, #16, #16 @ reg2 = pal[src[12]] | pal[src[13]] << 16\r |
94 | \r |
95 | vmov \dA, \reg1, \reg7 @ dA = pal[src[8-11]]\r |
96 | sub \counter, \counter, #16 @ counter -= 16\r |
97 | \r |
98 | bfi \reg8, \reg4, #16, #16 @ reg8 = pal[src[14]] | pal[src[15]] << 16\r |
99 | add \src, \src, #16 @ src += 16\r |
100 | \r |
101 | vmov \dB, \reg2, \reg8 @ dB = pal[src[12-15]]\r |
@ flag output for the caller: hs while at least 16 more pixels remain
102 | cmp \counter, #16\r |
103 | .endm\r |
104 | \r |
@ neon_normal1x_8_16_line
@ Converts one line of 8-bit palette-indexed pixels to 16-bit pixels, 1:1.
@   src       - source bytes, one per pixel (advanced as pixels are consumed)
@   dst       - output halfwords (advanced as pixels are written)
@   pal       - lookup table addressed as pal + index * 4; only the low
@               halfword of each entry is stored
@   counter   - pixel count for the line (modified)
@   reg1-reg9 - scratch core registers (clobbered)
@ Also clobbers d16-d19 and the flags, and defines local labels 10/20/30/40.
@ Stages: up to 3 leading pixels until src is 4-byte aligned, then 16-pixel
@ blocks, then 4-pixel groups, then up to 3 trailing pixels.
@ NOTE(review): the 16-pixel block at label 10 is entered without testing
@ counter, so it always runs at least once; callers presumably guarantee that
@ at least 16 pixels remain after alignment - confirm.
@ NOTE(review): the leading-pixel path does a word load from the unaligned
@ src; this presumably relies on unaligned word access being allowed - confirm.
105 | .macro neon_normal1x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9\r |
106 | @ align src to 4 bytes\r |
107 | andS \reg5, \src, #3 @ reg5 = src & 3\r |
108 | beq 10f\r |
109 | \r |
110 | @ first 1-3 pixels\r |
111 | ldr \reg1, [\src] @ reg1 = src[0-3]\r |
112 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r |
113 | \r |
114 | add \src, \src, \reg5 @ src += reg5\r |
115 | sub \counter, \counter, \reg5 @ counter -= reg5\r |
116 | \r |
@ pixel 0 is always written; the ne-predicated instructions below handle
@ pixels 1 and 2 only while reg5 has not yet counted down to zero
117 | subS \reg5, \reg5, #1 @ reg5--\r |
118 | \r |
119 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r |
120 | ubfxne \reg3, \reg1, #8, #8 @ reg3 = src[1]\r |
121 | \r |
122 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[reg2]\r |
123 | \r |
124 | ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[reg3]\r |
125 | \r |
126 | strh \reg2, [\dst] @ dst[0] = reg2\r |
127 | \r |
128 | strneh \reg3, [\dst, #2]! @ dst[1] = reg3; dst++\r |
129 | subneS \reg5, \reg5, #1 @ reg5--\r |
130 | \r |
131 | ubfxne \reg4, \reg1, #16, #8 @ reg4 = src[2]\r |
132 | add \dst, \dst, #2 @ dst++\r |
133 | \r |
134 | ldrne \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[reg4]\r |
135 | \r |
136 | strneh \reg4, [\dst], #2 @ dst[2] = reg4; dst++\r |
137 | \r |
138 | @ middle pixels (16 per iteration)\r |
139 | 10:\r |
140 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, , d18, d19\r |
141 | \r |
142 | vst1.16 {d16-d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 2*16\r |
@ flags come from the cmp counter, #16 that ends the middle macro
143 | bhs 10b\r |
144 | \r |
145 | @ last 0-15 pixels (one source byte each)\r |
146 | \r |
147 | cmp \counter, #0\r |
148 | beq 40f\r |
149 | \r |
150 | cmp \counter, #4\r |
151 | blo 30f\r |
152 | \r |
153 | @ 4-12 pixels (4 per iteration)\r |
154 | 20:\r |
155 | ldr \reg1, [\src] @ reg1 = src[0-3]\r |
156 | sub \counter, \counter, #4 @ counter -= 4\r |
157 | \r |
158 | add \src, \src, #4 @ src += 4\r |
159 | add \dst, \dst, #(2*4) @ dst += 4 pixels (8 bytes); stores below use negative offsets\r |
160 | \r |
161 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r |
162 | cmp \counter, #4\r |
163 | \r |
164 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r |
165 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r |
166 | \r |
167 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r |
168 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r |
169 | \r |
170 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r |
171 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r |
172 | \r |
173 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r |
174 | \r |
175 | strh \reg2, [\dst, #-8] @ dst[0] = reg2\r |
176 | \r |
177 | strh \reg3, [\dst, #-6] @ dst[1] = reg3\r |
178 | \r |
179 | strh \reg4, [\dst, #-4] @ dst[2] = reg4\r |
180 | \r |
181 | strh \reg1, [\dst, #-2] @ dst[3] = reg1\r |
@ flags come from the cmp counter, #4 earlier in this iteration
182 | bhs 20b\r |
183 | \r |
184 | cmp \counter, #0\r |
185 | beq 40f\r |
186 | \r |
187 | @ last 1-3 pixels\r |
188 | 30:\r |
189 | ldrb \reg1, [\src] @ reg1 = src[0]\r |
190 | subS \counter, \counter, #1 @ counter--\r |
191 | \r |
192 | ldrneb \reg2, [\src, #1]! @ reg2 = src[1]; src++\r |
193 | \r |
194 | add \src, \src, #1 @ src++\r |
195 | \r |
196 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r |
197 | \r |
198 | ldrne \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[1]]\r |
199 | \r |
200 | strh \reg1, [\dst] @ dst[0] = reg1\r |
201 | \r |
202 | strneh \reg2, [\dst, #2]! @ dst[1] = reg2; dst++\r |
203 | subneS \counter, \counter, #1 @ counter--\r |
204 | \r |
205 | ldrneb \reg3, [\src], #1 @ reg3 = src[2]; src++\r |
206 | add \dst, \dst, #2 @ dst++\r |
207 | \r |
208 | ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[2]]\r |
209 | \r |
210 | strneh \reg3, [\dst], #2 @ dst[2] = reg3; dst++\r |
211 | \r |
212 | 40:\r |
213 | .endm\r |
214 | \r |
@ neon_normal2x_8_16_line
@ Converts one line of 8-bit palette-indexed pixels to 16-bit pixels and
@ writes every pixel as a 2x2 block: two halfwords on the row at dst and two
@ on the row at dst + dststride.
@   src       - source bytes, one per pixel (advanced)
@   dst       - first output row (advanced by 4 bytes per source pixel)
@   pal       - lookup table addressed as pal + index * 4; only the low
@               halfword of each entry is stored
@   counter   - pixel count for the line; zero on every path that reaches 40
@   reg1-reg9 - scratch core registers (clobbered)
@   dststride - byte offset added to dst to reach the second output row
@ Also clobbers d16-d23 and the flags, and defines local labels
@ 1/10/20/30/40.
@ Stages: leading pixels one at a time until src is 4-byte aligned, then
@ 16-pixel blocks, then 4-pixel groups, then trailing pixels one at a time.
@ NOTE(review): the 16-pixel block at label 10 is entered without testing
@ counter, so it always runs at least once; callers presumably guarantee that
@ at least 16 pixels remain after alignment - confirm.
215 | .macro neon_normal2x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride\r |
216 | @ align src to 4 bytes\r |
217 | andS \reg5, \src, #3 @ reg5 = src & 3\r |
218 | beq 10f\r |
219 | \r |
220 | @ first 1-3 pixels\r |
221 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r |
222 | 1:\r |
223 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r |
@ reg2 = dst1 = same position on the second output row
224 | add \reg2, \dst, \dststride\r |
225 | \r |
226 | add \dst, \dst, #4 @ dst += 2*2\r |
227 | sub \counter, \counter, #1 @ counter--\r |
228 | \r |
229 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r |
230 | subS \reg5, \reg5, #1 @ reg5--\r |
231 | \r |
232 | strh \reg1, [\dst, #-4] @ dst[0] = reg1\r |
233 | \r |
234 | strh \reg1, [\dst, #-2] @ dst[1] = reg1\r |
235 | \r |
236 | strh \reg1, [\reg2] @ dst1[0] = reg1\r |
237 | \r |
238 | strh \reg1, [\reg2, #2] @ dst1[1] = reg1\r |
239 | bne 1b\r |
240 | \r |
241 | @ middle pixels (16 per iteration)\r |
242 | 10:\r |
243 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d20, d21\r |
244 | \r |
@ q8 = pixels 0-7 and q10 = pixels 8-15 from the middle macro; each is copied
@ so that the interleaving vst2.16 writes every pixel twice in a row
245 | vmov q9, q8\r |
246 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r |
247 | \r |
248 | vmov q11, q10\r |
249 | vst2.16 {q8,q9}, [\dst]! @ dst[0-7] = q8-q9; dst += 2*2*8\r |
250 | \r |
251 | vst2.16 {q10,q11}, [\dst]! @ dst[8-15] = q10-q11; dst += 2*2*8\r |
252 | \r |
253 | vst2.16 {q8,q9}, [\reg1]! @ dst1[0-7] = q8-q9; dst1 += 2*2*8\r |
254 | \r |
255 | vst2.16 {q10,q11}, [\reg1]! @ dst1[8-15] = q10-q11; dst1 += 2*2*8\r |
@ flags come from the cmp counter, #16 that ends the middle macro
256 | bhs 10b\r |
257 | \r |
258 | @ last 0-15 pixels (one source byte each)\r |
259 | \r |
260 | cmp \counter, #0\r |
261 | beq 40f\r |
262 | \r |
263 | cmp \counter, #4\r |
264 | blo 30f\r |
265 | \r |
266 | @ 4-12 pixels (4 per iteration)\r |
267 | 20:\r |
268 | ldr \reg1, [\src] @ reg1 = src[0-3]\r |
269 | sub \counter, \counter, #4 @ counter -= 4\r |
270 | \r |
271 | add \src, \src, #4 @ src += 4\r |
272 | \r |
273 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r |
274 | cmp \counter, #4\r |
275 | \r |
276 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r |
277 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r |
278 | \r |
279 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r |
280 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r |
281 | \r |
282 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r |
283 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r |
284 | \r |
285 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r |
286 | \r |
@ reg5 = dst1, taken before dst is advanced by the store below
287 | add \reg5, \dst, \dststride\r |
288 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r |
289 | \r |
290 | vmov.32 d16[0], \reg2\r |
291 | \r |
292 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r |
293 | \r |
294 | vmov.32 d16[1], \reg4\r |
295 | \r |
296 | vmov d17, d16\r |
297 | \r |
298 | vst2.16 {d16,d17}, [\dst]! @ dst[0-7] = d16-d17; dst += 2*2*4\r |
299 | \r |
300 | vst2.16 {d16,d17}, [\reg5] @ dst1[0-7] = d16-d17\r |
@ flags come from the cmp counter, #4 earlier in this iteration
301 | bhs 20b\r |
302 | \r |
303 | cmp \counter, #0\r |
304 | beq 40f\r |
305 | \r |
306 | @ last 1-3 pixels\r |
307 | 30:\r |
308 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r |
@ reg2 = dst1 = same position on the second output row
309 | add \reg2, \dst, \dststride\r |
310 | \r |
311 | add \dst, \dst, #4 @ dst += 2*2\r |
312 | \r |
313 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r |
314 | subS \counter, \counter, #1 @ counter--\r |
315 | \r |
316 | strh \reg1, [\dst, #-4] @ dst[0] = reg1\r |
317 | \r |
318 | strh \reg1, [\dst, #-2] @ dst[1] = reg1\r |
319 | \r |
320 | strh \reg1, [\reg2] @ dst1[0] = reg1\r |
321 | \r |
322 | strh \reg1, [\reg2, #2] @ dst1[1] = reg1\r |
323 | bne 30b\r |
324 | \r |
325 | 40:\r |
326 | .endm\r |
327 | \r |
@ neon_normal3x_8_16_line
@ Converts one line of 8-bit palette-indexed pixels to 16-bit pixels and
@ writes every pixel as a 3x3 block on the rows at dst, dst + dststride and
@ dst + 2 * dststride.
@   src       - source bytes, one per pixel (advanced)
@   dst       - first output row (advanced by 6 bytes per source pixel)
@   pal       - lookup table addressed as pal + index * 4; only the low
@               halfword of each entry is used
@   counter   - pixel count for the line; zero on every path that reaches 40
@   reg1-reg9 - scratch core registers (clobbered)
@   dststride - byte offset between consecutive output rows
@ Also clobbers d16-d27 and the flags, and defines local labels
@ 1/10/20/30/40.
@ Stages: leading pixels one at a time until src is 4-byte aligned, then
@ 16-pixel blocks, then 4-pixel groups, then trailing pixels one at a time.
@ NOTE(review): the 16-pixel block at label 10 is entered without testing
@ counter, so it always runs at least once; callers presumably guarantee that
@ at least 16 pixels remain after alignment - confirm.
@ NOTE(review): the single-pixel paths write rows 1 and 2 as a halfword store
@ followed by a word store at offset 2; this presumably relies on unaligned
@ word stores being allowed - confirm.
328 | .macro neon_normal3x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride\r |
329 | @ align src to 4 bytes\r |
330 | andS \reg5, \src, #3 @ reg5 = src & 3\r |
331 | beq 10f\r |
332 | \r |
333 | @ first 1-3 pixels\r |
334 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r |
335 | 1:\r |
336 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r |
@ reg2 = dst1 and reg3 = dst2: same position on the second and third rows
337 | add \reg2, \dst, \dststride\r |
338 | \r |
339 | add \reg3, \reg2, \dststride\r |
340 | add \dst, \dst, #6 @ dst += 3*2\r |
341 | \r |
342 | sub \counter, \counter, #1 @ counter--\r |
343 | \r |
344 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r |
345 | subS \reg5, \reg5, #1 @ reg5--\r |
346 | \r |
347 | strh \reg1, [\dst, #-6] @ dst[0] = reg1\r |
348 | \r |
349 | strh \reg1, [\dst, #-4] @ dst[1] = reg1\r |
350 | \r |
351 | strh \reg1, [\dst, #-2] @ dst[2] = reg1\r |
352 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r |
353 | \r |
354 | strh \reg1, [\reg2] @ dst1[0] = reg1\r |
355 | \r |
356 | str \reg1, [\reg2, #2] @ dst1[1-2] = reg1\r |
357 | \r |
358 | strh \reg1, [\reg3] @ dst2[0] = reg1\r |
359 | \r |
360 | str \reg1, [\reg3, #2] @ dst2[1-2] = reg1\r |
361 | bne 1b\r |
362 | \r |
363 | @ middle pixels (16 per iteration)\r |
364 | 10:\r |
365 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d22, d23\r |
366 | \r |
@ q8 = pixels 0-7 and q11 = pixels 8-15 from the middle macro; each is copied
@ twice so that the interleaving vst3.16 writes every pixel three times
367 | vmov q9, q8\r |
368 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r |
369 | \r |
370 | vmov q10, q8\r |
371 | add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride\r |
372 | \r |
373 | vmov q12, q11\r |
374 | vst3.16 {d16,d18,d20}, [\dst]! @ dst[0-3] = q8-q10[0]; dst += 3*2*4\r |
375 | \r |
376 | vmov q13, q11\r |
377 | vst3.16 {d17,d19,d21}, [\dst]! @ dst[4-7] = q8-q10[1]; dst += 3*2*4\r |
378 | \r |
379 | vst3.16 {d22,d24,d26}, [\dst]! @ dst[8-11] = q11-q13[0]; dst += 3*2*4\r |
380 | \r |
381 | vst3.16 {d23,d25,d27}, [\dst]! @ dst[12-15] = q11-q13[1]; dst += 3*2*4\r |
382 | \r |
383 | vst3.16 {d16,d18,d20}, [\reg1]! @ dst1[0-3] = q8-q10[0]; dst1 += 3*2*4\r |
384 | \r |
385 | vst3.16 {d17,d19,d21}, [\reg1]! @ dst1[4-7] = q8-q10[1]; dst1 += 3*2*4\r |
386 | \r |
387 | vst3.16 {d22,d24,d26}, [\reg1]! @ dst1[8-11] = q11-q13[0]; dst1 += 3*2*4\r |
388 | \r |
389 | vst3.16 {d23,d25,d27}, [\reg1]! @ dst1[12-15] = q11-q13[1]; dst1 += 3*2*4\r |
390 | \r |
391 | vst3.16 {d16,d18,d20}, [\reg2]! @ dst2[0-3] = q8-q10[0]; dst2 += 3*2*4\r |
392 | \r |
393 | vst3.16 {d17,d19,d21}, [\reg2]! @ dst2[4-7] = q8-q10[1]; dst2 += 3*2*4\r |
394 | \r |
395 | vst3.16 {d22,d24,d26}, [\reg2]! @ dst2[8-11] = q11-q13[0]; dst2 += 3*2*4\r |
396 | \r |
397 | vst3.16 {d23,d25,d27}, [\reg2]! @ dst2[12-15] = q11-q13[1]; dst2 += 3*2*4\r |
@ flags come from the cmp counter, #16 that ends the middle macro
398 | bhs 10b\r |
399 | \r |
400 | @ last 0-15 pixels (one source byte each)\r |
401 | \r |
402 | cmp \counter, #0\r |
403 | beq 40f\r |
404 | \r |
405 | cmp \counter, #4\r |
406 | blo 30f\r |
407 | \r |
408 | @ 4-12 pixels (4 per iteration)\r |
409 | 20:\r |
410 | ldr \reg1, [\src] @ reg1 = src[0-3]\r |
411 | sub \counter, \counter, #4 @ counter -= 4\r |
412 | \r |
413 | add \src, \src, #4 @ src += 4\r |
414 | \r |
415 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r |
416 | cmp \counter, #4\r |
417 | \r |
418 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r |
419 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r |
420 | \r |
421 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r |
422 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r |
423 | \r |
424 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r |
425 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r |
426 | \r |
427 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r |
428 | \r |
@ reg5 = dst1 and (below) reg6 = dst2, taken before dst is advanced
429 | add \reg5, \dst, \dststride\r |
430 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r |
431 | \r |
432 | vmov.32 d16[0], \reg2\r |
433 | add \reg6, \reg5, \dststride\r |
434 | \r |
435 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r |
436 | \r |
437 | vmov.32 d16[1], \reg4\r |
438 | \r |
439 | vmov d17, d16\r |
440 | \r |
441 | vmov d18, d16\r |
442 | \r |
443 | vst3.16 {d16,d17,d18}, [\dst]! @ dst[0-11] = d16-d18; dst += 3*2*4\r |
444 | \r |
445 | vst3.16 {d16,d17,d18}, [\reg5] @ dst1[0-11] = d16-d18\r |
446 | \r |
447 | vst3.16 {d16,d17,d18}, [\reg6] @ dst2[0-11] = d16-d18\r |
@ flags come from the cmp counter, #4 earlier in this iteration
448 | bhs 20b\r |
449 | \r |
450 | cmp \counter, #0\r |
451 | beq 40f\r |
452 | \r |
453 | @ last 1-3 pixels\r |
454 | 30:\r |
455 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r |
@ reg2 = dst1 and reg3 = dst2: same position on the second and third rows
456 | add \reg2, \dst, \dststride\r |
457 | \r |
458 | add \reg3, \reg2, \dststride\r |
459 | add \dst, \dst, #6 @ dst += 3*2\r |
460 | \r |
461 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r |
462 | subS \counter, \counter, #1 @ counter--\r |
463 | \r |
464 | strh \reg1, [\dst, #-6] @ dst[0] = reg1\r |
465 | \r |
466 | strh \reg1, [\dst, #-4] @ dst[1] = reg1\r |
467 | \r |
468 | strh \reg1, [\dst, #-2] @ dst[2] = reg1\r |
469 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r |
470 | \r |
471 | strh \reg1, [\reg2] @ dst1[0] = reg1\r |
472 | \r |
473 | str \reg1, [\reg2, #2] @ dst1[1-2] = reg1\r |
474 | \r |
475 | strh \reg1, [\reg3] @ dst2[0] = reg1\r |
476 | \r |
477 | str \reg1, [\reg3, #2] @ dst2[1-2] = reg1\r |
478 | bne 30b\r |
479 | \r |
480 | 40:\r |
481 | .endm\r |
482 | \r |
@ neon_normal4x_8_16_line
@ Converts one line of 8-bit palette-indexed pixels to 16-bit pixels and
@ writes every pixel as a 4x4 block on the rows at dst, dst + dststride,
@ dst + 2 * dststride and dst + 3 * dststride.
@   src       - source bytes, one per pixel (advanced)
@   dst       - first output row (advanced by 8 bytes per source pixel)
@   pal       - lookup table addressed as pal + index * 4; only the low
@               halfword of each entry is used
@   counter   - pixel count for the line; zero on every path that reaches 40
@   reg1-reg9 - scratch core registers (clobbered)
@   dststride - byte offset between consecutive output rows
@ Also clobbers d16-d31 and the flags, and defines local labels
@ 1/10/20/30/40.
@ Stages: leading pixels one at a time until src is 4-byte aligned, then
@ 16-pixel blocks, then 4-pixel groups, then trailing pixels one at a time.
@ NOTE(review): the 16-pixel block at label 10 is entered without testing
@ counter, so it always runs at least once; callers presumably guarantee that
@ at least 16 pixels remain after alignment - confirm.
483 | .macro neon_normal4x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride\r |
484 | @ align src to 4 bytes\r |
485 | andS \reg5, \src, #3 @ reg5 = src & 3\r |
486 | beq 10f\r |
487 | \r |
488 | @ first 1-3 pixels\r |
489 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r |
490 | 1:\r |
491 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r |
@ reg2 = dst1, reg3 = dst2 and (below) reg4 = dst3: rows two to four
492 | add \reg2, \dst, \dststride\r |
493 | \r |
494 | add \reg3, \reg2, \dststride\r |
495 | add \dst, \dst, #8 @ dst += 4*2\r |
496 | \r |
497 | sub \counter, \counter, #1 @ counter--\r |
498 | \r |
499 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r |
500 | add \reg4, \reg3, \dststride\r |
501 | \r |
502 | strh \reg1, [\dst, #-8] @ dst[0] = reg1\r |
503 | subS \reg5, \reg5, #1 @ reg5--\r |
504 | \r |
505 | strh \reg1, [\dst, #-6] @ dst[1] = reg1\r |
506 | \r |
507 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r |
508 | str \reg1, [\dst, #-4] @ dst[2-3] = reg1\r |
509 | \r |
510 | str \reg1, [\reg2] @ dst1[0-1] = reg1\r |
511 | \r |
512 | str \reg1, [\reg2, #4] @ dst1[2-3] = reg1\r |
513 | \r |
514 | str \reg1, [\reg3] @ dst2[0-1] = reg1\r |
515 | \r |
516 | str \reg1, [\reg3, #4] @ dst2[2-3] = reg1\r |
517 | \r |
518 | str \reg1, [\reg4] @ dst3[0-1] = reg1\r |
519 | \r |
520 | str \reg1, [\reg4, #4] @ dst3[2-3] = reg1\r |
521 | bne 1b\r |
522 | \r |
523 | @ middle pixels (16 per iteration)\r |
524 | 10:\r |
525 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d24, d25\r |
526 | \r |
@ q8 = pixels 0-7 and q12 = pixels 8-15 from the middle macro; each is copied
@ three times so that the interleaving vst4.16 writes every pixel four times
527 | vmov q9, q8\r |
528 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r |
529 | \r |
530 | vmov q10, q8\r |
531 | add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride\r |
532 | \r |
533 | vmov q11, q8\r |
534 | add \reg3, \reg1, \dststride,lsl #1 @ reg3 = dst + 3 * dststride\r |
535 | \r |
536 | vmov q13, q12\r |
537 | vst4.16 {d16,d18,d20,d22}, [\dst]! @ dst[0-3] = q8-q11[0]; dst += 4*2*4\r |
538 | \r |
539 | vmov q14, q12\r |
540 | \r |
541 | vmov q15, q12\r |
542 | vst4.16 {d17,d19,d21,d23}, [\dst]! @ dst[4-7] = q8-q11[1]; dst += 4*2*4\r |
543 | \r |
544 | vst4.16 {d24,d26,d28,d30}, [\dst]! @ dst[8-11] = q12-q15[0]; dst += 4*2*4\r |
545 | \r |
546 | vst4.16 {d25,d27,d29,d31}, [\dst]! @ dst[12-15] = q12-q15[1]; dst += 4*2*4\r |
547 | \r |
548 | vst4.16 {d16,d18,d20,d22}, [\reg1]! @ dst1[0-3] = q8-q11[0]; dst1 += 4*2*4\r |
549 | \r |
550 | vst4.16 {d17,d19,d21,d23}, [\reg1]! @ dst1[4-7] = q8-q11[1]; dst1 += 4*2*4\r |
551 | \r |
552 | vst4.16 {d24,d26,d28,d30}, [\reg1]! @ dst1[8-11] = q12-q15[0]; dst1 += 4*2*4\r |
553 | \r |
554 | vst4.16 {d25,d27,d29,d31}, [\reg1]! @ dst1[12-15] = q12-q15[1]; dst1 += 4*2*4\r |
555 | \r |
556 | vst4.16 {d16,d18,d20,d22}, [\reg2]! @ dst2[0-3] = q8-q11[0]; dst2 += 4*2*4\r |
557 | \r |
558 | vst4.16 {d17,d19,d21,d23}, [\reg2]! @ dst2[4-7] = q8-q11[1]; dst2 += 4*2*4\r |
559 | \r |
560 | vst4.16 {d24,d26,d28,d30}, [\reg2]! @ dst2[8-11] = q12-q15[0]; dst2 += 4*2*4\r |
561 | \r |
562 | vst4.16 {d25,d27,d29,d31}, [\reg2]! @ dst2[12-15] = q12-q15[1]; dst2 += 4*2*4\r |
563 | \r |
564 | vst4.16 {d16,d18,d20,d22}, [\reg3]! @ dst3[0-3] = q8-q11[0]; dst3 += 4*2*4\r |
565 | \r |
566 | vst4.16 {d17,d19,d21,d23}, [\reg3]! @ dst3[4-7] = q8-q11[1]; dst3 += 4*2*4\r |
567 | \r |
568 | vst4.16 {d24,d26,d28,d30}, [\reg3]! @ dst3[8-11] = q12-q15[0]; dst3 += 4*2*4\r |
569 | \r |
570 | vst4.16 {d25,d27,d29,d31}, [\reg3]! @ dst3[12-15] = q12-q15[1]; dst3 += 4*2*4\r |
@ flags come from the cmp counter, #16 that ends the middle macro
571 | bhs 10b\r |
572 | \r |
573 | @ last 0-15 pixels (one source byte each)\r |
574 | \r |
575 | cmp \counter, #0\r |
576 | beq 40f\r |
577 | \r |
578 | cmp \counter, #4\r |
579 | blo 30f\r |
580 | \r |
581 | @ 4-12 pixels (4 per iteration)\r |
582 | 20:\r |
583 | ldr \reg1, [\src] @ reg1 = src[0-3]\r |
584 | sub \counter, \counter, #4 @ counter -= 4\r |
585 | \r |
586 | add \src, \src, #4 @ src += 4\r |
587 | \r |
588 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r |
589 | cmp \counter, #4\r |
590 | \r |
591 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r |
592 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r |
593 | \r |
594 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r |
595 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r |
596 | \r |
597 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r |
598 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r |
599 | \r |
600 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r |
601 | \r |
@ reg5 = dst1, reg6 = dst2 and reg7 = dst3, taken before dst is advanced
602 | add \reg5, \dst, \dststride\r |
603 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r |
604 | \r |
605 | vmov.32 d16[0], \reg2\r |
606 | add \reg6, \reg5, \dststride\r |
607 | \r |
608 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r |
609 | add \reg7, \reg6, \dststride\r |
610 | \r |
611 | vmov.32 d16[1], \reg4\r |
612 | \r |
613 | vmov d17, d16\r |
614 | \r |
615 | vmov d18, d16\r |
616 | \r |
617 | vmov d19, d16\r |
618 | \r |
619 | vst4.16 {d16,d17,d18,d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 4*2*4\r |
620 | \r |
621 | vst4.16 {d16,d17,d18,d19}, [\reg5] @ dst1[0-15] = d16-d19\r |
622 | \r |
623 | vst4.16 {d16,d17,d18,d19}, [\reg6] @ dst2[0-15] = d16-d19\r |
624 | \r |
625 | vst4.16 {d16,d17,d18,d19}, [\reg7] @ dst3[0-15] = d16-d19\r |
@ flags come from the cmp counter, #4 earlier in this iteration
626 | bhs 20b\r |
627 | \r |
628 | cmp \counter, #0\r |
629 | beq 40f\r |
630 | \r |
631 | @ last 1-3 pixels\r |
632 | 30:\r |
633 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r |
@ reg2 = dst1, reg3 = dst2 and (below) reg4 = dst3: rows two to four
634 | add \reg2, \dst, \dststride\r |
635 | \r |
636 | add \reg3, \reg2, \dststride\r |
637 | add \dst, \dst, #8 @ dst += 4*2\r |
638 | \r |
639 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r |
640 | add \reg4, \reg3, \dststride\r |
641 | \r |
642 | strh \reg1, [\dst, #-8] @ dst[0] = reg1\r |
643 | subS \counter, \counter, #1 @ counter--\r |
644 | \r |
645 | strh \reg1, [\dst, #-6] @ dst[1] = reg1\r |
646 | \r |
647 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r |
648 | str \reg1, [\dst, #-4] @ dst[2-3] = reg1\r |
649 | \r |
650 | str \reg1, [\reg2] @ dst1[0-1] = reg1\r |
651 | \r |
652 | str \reg1, [\reg2, #4] @ dst1[2-3] = reg1\r |
653 | \r |
654 | str \reg1, [\reg3] @ dst2[0-1] = reg1\r |
655 | \r |
656 | str \reg1, [\reg3, #4] @ dst2[2-3] = reg1\r |
657 | \r |
658 | str \reg1, [\reg4] @ dst3[0-1] = reg1\r |
659 | \r |
660 | str \reg1, [\reg4, #4] @ dst3[2-3] = reg1\r |
661 | bne 30b\r |
662 | \r |
663 | 40:\r |
664 | .endm\r |
665 | \r |