Commit | Line | Data |
---|---|---|
7fc3ac8a H |
1 | @@\r |
2 | @@ Copyright (C) 2012 Roman Pauer\r | |
3 | @@\r | |
4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r | |
5 | @@ this software and associated documentation files (the "Software"), to deal in\r | |
6 | @@ the Software without restriction, including without limitation the rights to\r | |
7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r | |
8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r | |
9 | @@ so, subject to the following conditions:\r | |
10 | @@\r | |
11 | @@ The above copyright notice and this permission notice shall be included in all\r | |
12 | @@ copies or substantial portions of the Software.\r | |
13 | @@\r | |
14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r | |
15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r | |
16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r | |
17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r | |
18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r | |
19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r | |
20 | @@ SOFTWARE.\r | |
21 | @@\r | |
22 | \r | |
23 | \r | |
67381db0 | 24 | #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4)\r |
25 | /* GCC older than 4.4: A128/A256 expand to nothing, because the ":128"/":256" suffix can't be used when gas wants ',' before ':'. Presumably these are address alignment qualifiers; they are not used by the macros visible in this part of the file - TODO confirm. */\r | |
26 | #define A128\r | |
27 | #define A256\r | |
28 | #else\r | |
29 | #define A128 :128\r | |
30 | #define A256 :256\r | |
31 | #endif\r | |
32 | \r | |
7fc3ac8a H |
33 | \r |
34 | .macro _neon_normalxx_8_16_line_middle src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride, dA, dB @ look up 16 source bytes in pal and pack the results into d16, d17, dA, dB; dst and dststride are not referenced here\r | |
35 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
36 | @ pal is indexed as 32-bit words (index lsl 2); the bfi steps below keep only the low 16 bits of each entry\r | |
37 | ldr \reg2, [\src, #4] @ reg2 = src[4-7]\r | |
38 | \r | |
39 | ldr \reg3, [\src, #8] @ reg3 = src[8-11]\r | |
40 | \r | |
41 | ldr \reg4, [\src, #12] @ reg4 = src[12-15]\r | |
42 | ubfx \reg5, \reg1, #0, #8 @ reg5 = src[0]\r | |
43 | \r | |
44 | ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[0]]\r | |
45 | ubfx \reg6, \reg1, #8, #8 @ reg6 = src[1]\r | |
46 | \r | |
47 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[1]]\r | |
48 | ubfx \reg7, \reg1, #16, #8 @ reg7 = src[2]\r | |
49 | \r | |
50 | ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[2]]\r | |
51 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
52 | \r | |
53 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
54 | ubfx \reg8, \reg2, #0, #8 @ reg8 = src[4]\r | |
55 | \r | |
56 | ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[4]]\r | |
57 | ubfx \reg9, \reg2, #8, #8 @ reg9 = src[5]\r | |
58 | \r | |
59 | ldr \reg9, [\pal, \reg9, lsl #2] @ reg9 = pal[src[5]]\r | |
60 | bfi \reg5, \reg6, #16, #16 @ reg5 = pal[src[0]] | pal[src[1]] << 16\r | |
61 | \r | |
62 | bfi \reg7, \reg1, #16, #16 @ reg7 = pal[src[2]] | pal[src[3]] << 16\r | |
63 | ubfx \reg6, \reg2, #16, #8 @ reg6 = src[6]\r | |
64 | \r | |
65 | vmov d16, \reg5, \reg7 @ d16 = pal[src[0-3]]\r | |
66 | lsr \reg2, \reg2, #24 @ reg2 = src[7]\r | |
67 | \r | |
68 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[6]]\r | |
69 | bfi \reg8, \reg9, #16, #16 @ reg8 = pal[src[4]] | pal[src[5]] << 16\r | |
70 | \r | |
71 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[7]]\r | |
72 | ubfx \reg1, \reg3, #0, #8 @ reg1 = src[8]\r | |
73 | \r | |
74 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[8]]\r | |
75 | ubfx \reg5, \reg3, #8, #8 @ reg5 = src[9]\r | |
76 | \r | |
77 | ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[9]]\r | |
78 | ubfx \reg7, \reg3, #16, #8 @ reg7 = src[10]\r | |
79 | \r | |
80 | ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[10]]\r | |
81 | bfi \reg6, \reg2, #16, #16 @ reg6 = pal[src[6]] | pal[src[7]] << 16\r | |
82 | \r | |
83 | vmov d17, \reg8, \reg6 @ d17 = pal[src[4-7]]\r | |
84 | lsr \reg3, \reg3, #24 @ reg3 = src[11]\r | |
85 | \r | |
86 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[11]]\r | |
87 | ubfx \reg2, \reg4, #0, #8 @ reg2 = src[12]\r | |
88 | \r | |
89 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[12]]\r | |
90 | ubfx \reg6, \reg4, #8, #8 @ reg6 = src[13]\r | |
91 | \r | |
92 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[13]]\r | |
93 | ubfx \reg8, \reg4, #16, #8 @ reg8 = src[14]\r | |
94 | \r | |
95 | ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[14]]\r | |
96 | lsr \reg4, \reg4, #24 @ reg4 = src[15]\r | |
97 | \r | |
98 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[15]]\r | |
99 | bfi \reg1, \reg5, #16, #16 @ reg1 = pal[src[8]] | pal[src[9]] << 16\r | |
100 | \r | |
101 | bfi \reg7, \reg3, #16, #16 @ reg7 = pal[src[10]] | pal[src[11]] << 16\r | |
102 | bfi \reg2, \reg6, #16, #16 @ reg2 = pal[src[12]] | pal[src[13]] << 16\r | |
103 | \r | |
104 | vmov \dA, \reg1, \reg7 @ dA = pal[src[8-11]]\r | |
105 | sub \counter, \counter, #16 @ counter -= 16\r | |
106 | \r | |
107 | bfi \reg8, \reg4, #16, #16 @ reg8 = pal[src[14]] | pal[src[15]] << 16\r | |
108 | add \src, \src, #16 @ src += 16\r | |
109 | \r | |
110 | vmov \dB, \reg2, \reg8 @ dB = pal[src[12-15]]\r | |
111 | cmp \counter, #16 @ flags left for the invoking macro: hs when at least 16 more pixels remain\r | |
112 | .endm\r | |
113 | \r | |
114 | .macro neon_normal1x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9 @ one line at 1x: each source byte is looked up in pal and stored as one 16-bit pixel at dst\r | |
115 | @ align src to 4 bytes\r | |
116 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
117 | beq 10f @ src already word aligned: no head pixels\r | |
118 | \r | |
119 | @ first 1-3 pixels (conditional execution on ne handles the 2nd and 3rd pixel)\r | |
120 | ldr \reg1, [\src] @ reg1 = src[0-3]; src is not word aligned here, and all 4 bytes are read even if fewer are used\r | |
121 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
122 | \r | |
123 | add \src, \src, \reg5 @ src += reg5\r | |
124 | sub \counter, \counter, \reg5 @ counter -= reg5\r | |
125 | \r | |
126 | subS \reg5, \reg5, #1 @ reg5--; Z set when only one head pixel is needed\r | |
127 | \r | |
128 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
129 | ubfxne \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
130 | \r | |
131 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[reg2]\r | |
132 | \r | |
133 | ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[reg3]\r | |
134 | \r | |
135 | strh \reg2, [\dst] @ dst[0] = reg2\r | |
136 | \r | |
137 | strneh \reg3, [\dst, #2]! @ dst[1] = reg3; dst++\r | |
138 | subneS \reg5, \reg5, #1 @ reg5--; Z set when only two head pixels are needed (skipped, Z stays set, when there was one)\r | |
139 | \r | |
140 | ubfxne \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
141 | add \dst, \dst, #2 @ dst++ (unconditional: accounts for the first pixel)\r | |
142 | \r | |
143 | ldrne \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[reg4]\r | |
144 | \r | |
145 | strneh \reg4, [\dst], #2 @ dst[2] = reg4; dst++\r | |
146 | @ NOTE(review): the 16-pixel body below runs once before counter is tested, so counter is presumably at least 16 here - confirm against callers\r | |
147 | @ middle pixels (16 per iteration)\r | |
148 | 10:\r | |
149 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, , d18, d19 @ d16-d19 = 16 converted pixels; flags = cmp counter, 16\r | |
150 | \r | |
151 | vst1.16 {d16-d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 2*16\r | |
152 | bhs 10b @ repeat while counter >= 16 (flags come from the cmp inside the middle macro)\r | |
153 | \r | |
154 | @ last 0-15 pixels (one source byte per pixel)\r | |
155 | \r | |
156 | cmp \counter, #0\r | |
157 | beq 40f @ nothing left\r | |
158 | \r | |
159 | cmp \counter, #4\r | |
160 | blo 30f @ fewer than 4 left: go straight to the byte-wise tail\r | |
161 | \r | |
162 | @ 4-12 pixels (4 per iteration)\r | |
163 | 20:\r | |
164 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
165 | sub \counter, \counter, #4 @ counter -= 4\r | |
166 | \r | |
167 | add \src, \src, #4 @ src += 4\r | |
168 | add \dst, \dst, #(2*4) @ dst += 4 pixels; the stores below use negative offsets\r | |
169 | \r | |
170 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
171 | cmp \counter, #4 @ flags for the bhs at the end of this loop\r | |
172 | \r | |
173 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
174 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
175 | \r | |
176 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
177 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
178 | \r | |
179 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
180 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
181 | \r | |
182 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
183 | \r | |
184 | strh \reg2, [\dst, #-8] @ dst[0] = reg2\r | |
185 | \r | |
186 | strh \reg3, [\dst, #-6] @ dst[1] = reg3\r | |
187 | \r | |
188 | strh \reg4, [\dst, #-4] @ dst[2] = reg4\r | |
189 | \r | |
190 | strh \reg1, [\dst, #-2] @ dst[3] = reg1\r | |
191 | bhs 20b @ repeat while counter >= 4\r | |
192 | \r | |
193 | cmp \counter, #0\r | |
194 | beq 40f\r | |
195 | \r | |
196 | @ last 1-3 pixels (same conditional-execution scheme as the head pixels)\r | |
197 | 30:\r | |
198 | ldrb \reg1, [\src] @ reg1 = src[0]\r | |
199 | subS \counter, \counter, #1 @ counter--; Z set when this was the last pixel\r | |
200 | \r | |
201 | ldrneb \reg2, [\src, #1]! @ reg2 = src[1]; src++\r | |
202 | \r | |
203 | add \src, \src, #1 @ src++\r | |
204 | \r | |
205 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
206 | \r | |
207 | ldrne \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[1]]\r | |
208 | \r | |
209 | strh \reg1, [\dst] @ dst[0] = reg1\r | |
210 | \r | |
211 | strneh \reg2, [\dst, #2]! @ dst[1] = reg2; dst++\r | |
212 | subneS \counter, \counter, #1 @ counter--; Z set when the second pixel was the last\r | |
213 | \r | |
214 | ldrneb \reg3, [\src], #1 @ reg3 = src[2]; src++\r | |
215 | add \dst, \dst, #2 @ dst++\r | |
216 | \r | |
217 | ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[2]]\r | |
218 | \r | |
219 | strneh \reg3, [\dst], #2 @ dst[2] = reg3; dst++\r | |
220 | \r | |
221 | 40:\r | |
222 | .endm\r | |
223 | \r | |
224 | .macro neon_normal2x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride @ one line at 2x: each looked-up pixel is written twice to the row at dst and twice to the row at dst + dststride\r | |
225 | @ align src to 4 bytes\r | |
226 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
227 | beq 10f @ src already word aligned: no head pixels\r | |
228 | \r | |
229 | @ first 1-3 pixels (one per iteration)\r | |
230 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
231 | 1:\r | |
232 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
233 | add \reg2, \dst, \dststride @ reg2 = dst1 = dst + dststride (second output row)\r | |
234 | \r | |
235 | add \dst, \dst, #4 @ dst += 2*2\r | |
236 | sub \counter, \counter, #1 @ counter--\r | |
237 | \r | |
238 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
239 | subS \reg5, \reg5, #1 @ reg5--; sets the flags for the bne below\r | |
240 | \r | |
241 | strh \reg1, [\dst, #-4] @ dst[0] = reg1\r | |
242 | \r | |
243 | strh \reg1, [\dst, #-2] @ dst[1] = reg1\r | |
244 | \r | |
245 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
246 | \r | |
247 | strh \reg1, [\reg2, #2] @ dst1[1] = reg1\r | |
248 | bne 1b\r | |
249 | @ NOTE(review): the 16-pixel body below runs once before counter is tested, so counter is presumably at least 16 here - confirm against callers\r | |
250 | @ middle pixels (16 per iteration)\r | |
251 | 10:\r | |
252 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d20, d21 @ q8 = pixels 0-7, q10 = pixels 8-15; flags = cmp counter, 16\r | |
253 | \r | |
254 | vmov q9, q8 @ q9 = q8, so the interleaving vst2 emits every pixel twice\r | |
255 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r | |
256 | \r | |
257 | vmov q11, q10 @ q11 = q10, same duplication for pixels 8-15\r | |
258 | vst2.16 {q8,q9}, [\dst]! @ dst[0-7] = q8-q9; dst += 2*2*8\r | |
259 | \r | |
260 | vst2.16 {q10,q11}, [\dst]! @ dst[8-15] = q10-q11; dst += 2*2*8\r | |
261 | \r | |
262 | vst2.16 {q8,q9}, [\reg1]! @ dst1[0-7] = q8-q9; dst1 += 2*2*8\r | |
263 | \r | |
264 | vst2.16 {q10,q11}, [\reg1]! @ dst1[8-15] = q10-q11; dst1 += 2*2*8\r | |
265 | bhs 10b @ repeat while counter >= 16 (flags come from the cmp inside the middle macro)\r | |
266 | \r | |
267 | @ last 0-15 pixels (one source byte per pixel)\r | |
268 | \r | |
269 | cmp \counter, #0\r | |
270 | beq 40f @ nothing left\r | |
271 | \r | |
272 | cmp \counter, #4\r | |
273 | blo 30f @ fewer than 4 left: go straight to the byte-wise tail\r | |
274 | \r | |
275 | @ 4-12 pixels (4 per iteration)\r | |
276 | 20:\r | |
277 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
278 | sub \counter, \counter, #4 @ counter -= 4\r | |
279 | \r | |
280 | add \src, \src, #4 @ src += 4\r | |
281 | \r | |
282 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
283 | cmp \counter, #4 @ flags for the bhs at the end of this loop\r | |
284 | \r | |
285 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
286 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
287 | \r | |
288 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
289 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
290 | \r | |
291 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
292 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
293 | \r | |
294 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
295 | \r | |
296 | add \reg5, \dst, \dststride @ reg5 = dst1 = dst + dststride, taken before dst is advanced\r | |
297 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r | |
298 | \r | |
299 | vmov.32 d16[0], \reg2 @ d16 low word = pixels 0-1\r | |
300 | \r | |
301 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r | |
302 | \r | |
303 | vmov.32 d16[1], \reg4 @ d16 high word = pixels 2-3\r | |
304 | \r | |
305 | vmov d17, d16 @ d17 = d16, so vst2 emits every pixel twice\r | |
306 | \r | |
307 | vst2.16 {d16,d17}, [\dst]! @ dst[0-7] = d16-d17; dst += 2*2*4\r | |
308 | \r | |
309 | vst2.16 {d16,d17}, [\reg5] @ dst1[0-7] = d16-d17\r | |
310 | bhs 20b @ repeat while counter >= 4\r | |
311 | \r | |
312 | cmp \counter, #0\r | |
313 | beq 40f\r | |
314 | \r | |
315 | @ last 1-3 pixels (one per iteration)\r | |
316 | 30:\r | |
317 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
318 | add \reg2, \dst, \dststride @ reg2 = dst1 = dst + dststride\r | |
319 | \r | |
320 | add \dst, \dst, #4 @ dst += 2*2\r | |
321 | \r | |
322 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
323 | subS \counter, \counter, #1 @ counter--; sets the flags for the bne below\r | |
324 | \r | |
325 | strh \reg1, [\dst, #-4] @ dst[0] = reg1\r | |
326 | \r | |
327 | strh \reg1, [\dst, #-2] @ dst[1] = reg1\r | |
328 | \r | |
329 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
330 | \r | |
331 | strh \reg1, [\reg2, #2] @ dst1[1] = reg1\r | |
332 | bne 30b\r | |
333 | \r | |
334 | 40:\r | |
335 | .endm\r | |
336 | \r | |
337 | .macro neon_normal3x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride @ one line at 3x: each looked-up pixel is written three times to each of the rows at dst, dst + dststride and dst + 2 * dststride\r | |
338 | @ align src to 4 bytes\r | |
339 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
340 | beq 10f @ src already word aligned: no head pixels\r | |
341 | \r | |
342 | @ first 1-3 pixels (one per iteration)\r | |
343 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
344 | 1:\r | |
345 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
346 | add \reg2, \dst, \dststride @ reg2 = dst1 = dst + dststride\r | |
347 | \r | |
348 | add \reg3, \reg2, \dststride @ reg3 = dst2 = dst + 2 * dststride\r | |
349 | add \dst, \dst, #6 @ dst += 3*2\r | |
350 | \r | |
351 | sub \counter, \counter, #1 @ counter--\r | |
352 | \r | |
353 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
354 | subS \reg5, \reg5, #1 @ reg5--; sets the flags for the bne below\r | |
355 | \r | |
356 | strh \reg1, [\dst, #-6] @ dst[0] = reg1\r | |
357 | \r | |
358 | strh \reg1, [\dst, #-4] @ dst[1] = reg1\r | |
359 | \r | |
360 | strh \reg1, [\dst, #-2] @ dst[2] = reg1\r | |
361 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
362 | @ NOTE(review): dst advances by 6 bytes per pixel, so the word stores at offset 2 below are not always 4-byte aligned; presumably unaligned stores are permitted on the target - confirm\r | |
363 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
364 | \r | |
365 | str \reg1, [\reg2, #2] @ dst1[1-2] = reg1\r | |
366 | \r | |
367 | strh \reg1, [\reg3] @ dst2[0] = reg1\r | |
368 | \r | |
369 | str \reg1, [\reg3, #2] @ dst2[1-2] = reg1\r | |
370 | bne 1b\r | |
371 | @ NOTE(review): the 16-pixel body below runs once before counter is tested, so counter is presumably at least 16 here - confirm against callers\r | |
372 | @ middle pixels (16 per iteration)\r | |
373 | 10:\r | |
374 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d22, d23 @ q8 = pixels 0-7, q11 = pixels 8-15; flags = cmp counter, 16\r | |
375 | \r | |
376 | vmov q9, q8 @ q9 = q8 (second copy of pixels 0-7)\r | |
377 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r | |
378 | \r | |
379 | vmov q10, q8 @ q10 = q8 (third copy), so the interleaving vst3 emits every pixel three times\r | |
380 | add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride\r | |
381 | \r | |
382 | vmov q12, q11 @ q12 = q11 (second copy of pixels 8-15)\r | |
383 | vst3.16 {d16,d18,d20}, [\dst]! @ dst[0-3] = q8-q10[0]; dst += 3*2*4\r | |
384 | \r | |
385 | vmov q13, q11 @ q13 = q11 (third copy of pixels 8-15)\r | |
386 | vst3.16 {d17,d19,d21}, [\dst]! @ dst[4-7] = q8-q10[1]; dst += 3*2*4\r | |
387 | \r | |
388 | vst3.16 {d22,d24,d26}, [\dst]! @ dst[8-11] = q11-q13[0]; dst += 3*2*4\r | |
389 | \r | |
390 | vst3.16 {d23,d25,d27}, [\dst]! @ dst[12-15] = q11-q13[1]; dst += 3*2*4\r | |
391 | \r | |
392 | vst3.16 {d16,d18,d20}, [\reg1]! @ dst1[0-3] = q8-q10[0]; dst1 += 3*2*4\r | |
393 | \r | |
394 | vst3.16 {d17,d19,d21}, [\reg1]! @ dst1[4-7] = q8-q10[1]; dst1 += 3*2*4\r | |
395 | \r | |
396 | vst3.16 {d22,d24,d26}, [\reg1]! @ dst1[8-11] = q11-q13[0]; dst1 += 3*2*4\r | |
397 | \r | |
398 | vst3.16 {d23,d25,d27}, [\reg1]! @ dst1[12-15] = q11-q13[1]; dst1 += 3*2*4\r | |
399 | \r | |
400 | vst3.16 {d16,d18,d20}, [\reg2]! @ dst2[0-3] = q8-q10[0]; dst2 += 3*2*4\r | |
401 | \r | |
402 | vst3.16 {d17,d19,d21}, [\reg2]! @ dst2[4-7] = q8-q10[1]; dst2 += 3*2*4\r | |
403 | \r | |
404 | vst3.16 {d22,d24,d26}, [\reg2]! @ dst2[8-11] = q11-q13[0]; dst2 += 3*2*4\r | |
405 | \r | |
406 | vst3.16 {d23,d25,d27}, [\reg2]! @ dst2[12-15] = q11-q13[1]; dst2 += 3*2*4\r | |
407 | bhs 10b @ repeat while counter >= 16 (flags come from the cmp inside the middle macro)\r | |
408 | \r | |
409 | @ last 0-15 pixels (one source byte per pixel)\r | |
410 | \r | |
411 | cmp \counter, #0\r | |
412 | beq 40f @ nothing left\r | |
413 | \r | |
414 | cmp \counter, #4\r | |
415 | blo 30f @ fewer than 4 left: go straight to the byte-wise tail\r | |
416 | \r | |
417 | @ 4-12 pixels (4 per iteration)\r | |
418 | 20:\r | |
419 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
420 | sub \counter, \counter, #4 @ counter -= 4\r | |
421 | \r | |
422 | add \src, \src, #4 @ src += 4\r | |
423 | \r | |
424 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
425 | cmp \counter, #4 @ flags for the bhs at the end of this loop\r | |
426 | \r | |
427 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
428 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
429 | \r | |
430 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
431 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
432 | \r | |
433 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
434 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
435 | \r | |
436 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
437 | \r | |
438 | add \reg5, \dst, \dststride @ reg5 = dst1 = dst + dststride, taken before dst is advanced\r | |
439 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r | |
440 | \r | |
441 | vmov.32 d16[0], \reg2 @ d16 low word = pixels 0-1\r | |
442 | add \reg6, \reg5, \dststride @ reg6 = dst2 = dst + 2 * dststride\r | |
443 | \r | |
444 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r | |
445 | \r | |
446 | vmov.32 d16[1], \reg4 @ d16 high word = pixels 2-3\r | |
447 | \r | |
448 | vmov d17, d16 @ d17 = d16 (second copy)\r | |
449 | \r | |
450 | vmov d18, d16 @ d18 = d16 (third copy), so vst3 emits every pixel three times\r | |
451 | \r | |
452 | vst3.16 {d16,d17,d18}, [\dst]! @ dst[0-11] = d16-d18; dst += 3*2*4\r | |
453 | \r | |
454 | vst3.16 {d16,d17,d18}, [\reg5] @ dst1[0-11] = d16-d18\r | |
455 | \r | |
456 | vst3.16 {d16,d17,d18}, [\reg6] @ dst2[0-11] = d16-d18\r | |
457 | bhs 20b @ repeat while counter >= 4\r | |
458 | \r | |
459 | cmp \counter, #0\r | |
460 | beq 40f\r | |
461 | \r | |
462 | @ last 1-3 pixels (one per iteration)\r | |
463 | 30:\r | |
464 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
465 | add \reg2, \dst, \dststride @ reg2 = dst1 = dst + dststride\r | |
466 | \r | |
467 | add \reg3, \reg2, \dststride @ reg3 = dst2 = dst + 2 * dststride\r | |
468 | add \dst, \dst, #6 @ dst += 3*2\r | |
469 | \r | |
470 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
471 | subS \counter, \counter, #1 @ counter--; sets the flags for the bne below\r | |
472 | \r | |
473 | strh \reg1, [\dst, #-6] @ dst[0] = reg1\r | |
474 | \r | |
475 | strh \reg1, [\dst, #-4] @ dst[1] = reg1\r | |
476 | \r | |
477 | strh \reg1, [\dst, #-2] @ dst[2] = reg1\r | |
478 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
479 | \r | |
480 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
481 | \r | |
482 | str \reg1, [\reg2, #2] @ dst1[1-2] = reg1\r | |
483 | \r | |
484 | strh \reg1, [\reg3] @ dst2[0] = reg1\r | |
485 | \r | |
486 | str \reg1, [\reg3, #2] @ dst2[1-2] = reg1\r | |
487 | bne 30b\r | |
488 | \r | |
489 | 40:\r | |
490 | .endm\r | |
491 | \r | |
492 | .macro neon_normal4x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride @ one line at 4x: each looked-up pixel is written four times to each of the four rows starting at dst and spaced dststride bytes apart\r | |
493 | @ align src to 4 bytes\r | |
494 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
495 | beq 10f @ src already word aligned: no head pixels\r | |
496 | \r | |
497 | @ first 1-3 pixels (one per iteration)\r | |
498 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
499 | 1:\r | |
500 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
501 | add \reg2, \dst, \dststride @ reg2 = dst1 = dst + dststride\r | |
502 | \r | |
503 | add \reg3, \reg2, \dststride @ reg3 = dst2 = dst + 2 * dststride\r | |
504 | add \dst, \dst, #8 @ dst += 4*2\r | |
505 | \r | |
506 | sub \counter, \counter, #1 @ counter--\r | |
507 | \r | |
508 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
509 | add \reg4, \reg3, \dststride @ reg4 = dst3 = dst + 3 * dststride\r | |
510 | \r | |
511 | strh \reg1, [\dst, #-8] @ dst[0] = reg1\r | |
512 | subS \reg5, \reg5, #1 @ reg5--; sets the flags for the bne below\r | |
513 | \r | |
514 | strh \reg1, [\dst, #-6] @ dst[1] = reg1\r | |
515 | \r | |
516 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
517 | str \reg1, [\dst, #-4] @ dst[2-3] = reg1\r | |
518 | \r | |
519 | str \reg1, [\reg2] @ dst1[0-1] = reg1\r | |
520 | \r | |
521 | str \reg1, [\reg2, #4] @ dst1[2-3] = reg1\r | |
522 | \r | |
523 | str \reg1, [\reg3] @ dst2[0-1] = reg1\r | |
524 | \r | |
525 | str \reg1, [\reg3, #4] @ dst2[2-3] = reg1\r | |
526 | \r | |
527 | str \reg1, [\reg4] @ dst3[0-1] = reg1\r | |
528 | \r | |
529 | str \reg1, [\reg4, #4] @ dst3[2-3] = reg1\r | |
530 | bne 1b\r | |
531 | @ NOTE(review): the 16-pixel body below runs once before counter is tested, so counter is presumably at least 16 here - confirm against callers\r | |
532 | @ middle pixels (16 per iteration)\r | |
533 | 10:\r | |
534 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d24, d25 @ q8 = pixels 0-7, q12 = pixels 8-15; flags = cmp counter, 16\r | |
535 | \r | |
536 | vmov q9, q8 @ q9 = q8 (second copy of pixels 0-7)\r | |
537 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r | |
538 | \r | |
539 | vmov q10, q8 @ q10 = q8 (third copy)\r | |
540 | add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride\r | |
541 | \r | |
542 | vmov q11, q8 @ q11 = q8 (fourth copy), so the interleaving vst4 emits every pixel four times\r | |
543 | add \reg3, \reg1, \dststride,lsl #1 @ reg3 = dst + 3 * dststride\r | |
544 | \r | |
545 | vmov q13, q12 @ q13 = q12 (second copy of pixels 8-15)\r | |
546 | vst4.16 {d16,d18,d20,d22}, [\dst]! @ dst[0-3] = q8-q11[0]; dst += 4*2*4\r | |
547 | \r | |
548 | vmov q14, q12 @ q14 = q12 (third copy)\r | |
549 | \r | |
550 | vmov q15, q12 @ q15 = q12 (fourth copy)\r | |
551 | vst4.16 {d17,d19,d21,d23}, [\dst]! @ dst[4-7] = q8-q11[1]; dst += 4*2*4\r | |
552 | \r | |
553 | vst4.16 {d24,d26,d28,d30}, [\dst]! @ dst[8-11] = q12-q15[0]; dst += 4*2*4\r | |
554 | \r | |
555 | vst4.16 {d25,d27,d29,d31}, [\dst]! @ dst[12-15] = q12-q15[1]; dst += 4*2*4\r | |
556 | \r | |
557 | vst4.16 {d16,d18,d20,d22}, [\reg1]! @ dst1[0-3] = q8-q11[0]; dst1 += 4*2*4\r | |
558 | \r | |
559 | vst4.16 {d17,d19,d21,d23}, [\reg1]! @ dst1[4-7] = q8-q11[1]; dst1 += 4*2*4\r | |
560 | \r | |
561 | vst4.16 {d24,d26,d28,d30}, [\reg1]! @ dst1[8-11] = q12-q15[0]; dst1 += 4*2*4\r | |
562 | \r | |
563 | vst4.16 {d25,d27,d29,d31}, [\reg1]! @ dst1[12-15] = q12-q15[1]; dst1 += 4*2*4\r | |
564 | \r | |
565 | vst4.16 {d16,d18,d20,d22}, [\reg2]! @ dst2[0-3] = q8-q11[0]; dst2 += 4*2*4\r | |
566 | \r | |
567 | vst4.16 {d17,d19,d21,d23}, [\reg2]! @ dst2[4-7] = q8-q11[1]; dst2 += 4*2*4\r | |
568 | \r | |
569 | vst4.16 {d24,d26,d28,d30}, [\reg2]! @ dst2[8-11] = q12-q15[0]; dst2 += 4*2*4\r | |
570 | \r | |
571 | vst4.16 {d25,d27,d29,d31}, [\reg2]! @ dst2[12-15] = q12-q15[1]; dst2 += 4*2*4\r | |
572 | \r | |
573 | vst4.16 {d16,d18,d20,d22}, [\reg3]! @ dst3[0-3] = q8-q11[0]; dst3 += 4*2*4\r | |
574 | \r | |
575 | vst4.16 {d17,d19,d21,d23}, [\reg3]! @ dst3[4-7] = q8-q11[1]; dst3 += 4*2*4\r | |
576 | \r | |
577 | vst4.16 {d24,d26,d28,d30}, [\reg3]! @ dst3[8-11] = q12-q15[0]; dst3 += 4*2*4\r | |
578 | \r | |
579 | vst4.16 {d25,d27,d29,d31}, [\reg3]! @ dst3[12-15] = q12-q15[1]; dst3 += 4*2*4\r | |
580 | bhs 10b @ repeat while counter >= 16 (flags come from the cmp inside the middle macro)\r | |
581 | \r | |
582 | @ last 0-15 pixels (one source byte per pixel)\r | |
583 | \r | |
584 | cmp \counter, #0\r | |
585 | beq 40f @ nothing left\r | |
586 | \r | |
587 | cmp \counter, #4\r | |
588 | blo 30f @ fewer than 4 left: go straight to the byte-wise tail\r | |
589 | \r | |
590 | @ 4-12 pixels (4 per iteration)\r | |
591 | 20:\r | |
592 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
593 | sub \counter, \counter, #4 @ counter -= 4\r | |
594 | \r | |
595 | add \src, \src, #4 @ src += 4\r | |
596 | \r | |
597 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
598 | cmp \counter, #4 @ flags for the bhs at the end of this loop\r | |
599 | \r | |
600 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
601 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
602 | \r | |
603 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
604 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
605 | \r | |
606 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
607 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
608 | \r | |
609 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
610 | \r | |
611 | add \reg5, \dst, \dststride @ reg5 = dst1 = dst + dststride, taken before dst is advanced\r | |
612 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r | |
613 | \r | |
614 | vmov.32 d16[0], \reg2 @ d16 low word = pixels 0-1\r | |
615 | add \reg6, \reg5, \dststride @ reg6 = dst2 = dst + 2 * dststride\r | |
616 | \r | |
617 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r | |
618 | add \reg7, \reg6, \dststride @ reg7 = dst3 = dst + 3 * dststride\r | |
619 | \r | |
620 | vmov.32 d16[1], \reg4 @ d16 high word = pixels 2-3\r | |
621 | \r | |
622 | vmov d17, d16 @ d17 = d16 (second copy)\r | |
623 | \r | |
624 | vmov d18, d16 @ d18 = d16 (third copy)\r | |
625 | \r | |
626 | vmov d19, d16 @ d19 = d16 (fourth copy), so vst4 emits every pixel four times\r | |
627 | \r | |
628 | vst4.16 {d16,d17,d18,d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 4*2*4\r | |
629 | \r | |
630 | vst4.16 {d16,d17,d18,d19}, [\reg5] @ dst1[0-15] = d16-d19\r | |
631 | \r | |
632 | vst4.16 {d16,d17,d18,d19}, [\reg6] @ dst2[0-15] = d16-d19\r | |
633 | \r | |
634 | vst4.16 {d16,d17,d18,d19}, [\reg7] @ dst3[0-15] = d16-d19\r | |
635 | bhs 20b @ repeat while counter >= 4\r | |
636 | \r | |
637 | cmp \counter, #0\r | |
638 | beq 40f\r | |
639 | \r | |
640 | @ last 1-3 pixels (one per iteration)\r | |
641 | 30:\r | |
642 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
643 | add \reg2, \dst, \dststride @ reg2 = dst1 = dst + dststride\r | |
644 | \r | |
645 | add \reg3, \reg2, \dststride @ reg3 = dst2 = dst + 2 * dststride\r | |
646 | add \dst, \dst, #8 @ dst += 4*2\r | |
647 | \r | |
648 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
649 | add \reg4, \reg3, \dststride @ reg4 = dst3 = dst + 3 * dststride\r | |
650 | \r | |
651 | strh \reg1, [\dst, #-8] @ dst[0] = reg1\r | |
652 | subS \counter, \counter, #1 @ counter--; sets the flags for the bne below\r | |
653 | \r | |
654 | strh \reg1, [\dst, #-6] @ dst[1] = reg1\r | |
655 | \r | |
656 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
657 | str \reg1, [\dst, #-4] @ dst[2-3] = reg1\r | |
658 | \r | |
659 | str \reg1, [\reg2] @ dst1[0-1] = reg1\r | |
660 | \r | |
661 | str \reg1, [\reg2, #4] @ dst1[2-3] = reg1\r | |
662 | \r | |
663 | str \reg1, [\reg3] @ dst2[0-1] = reg1\r | |
664 | \r | |
665 | str \reg1, [\reg3, #4] @ dst2[2-3] = reg1\r | |
666 | \r | |
667 | str \reg1, [\reg4] @ dst3[0-1] = reg1\r | |
668 | \r | |
669 | str \reg1, [\reg4, #4] @ dst3[2-3] = reg1\r | |
670 | bne 30b\r | |
671 | \r | |
672 | 40:\r | |
673 | .endm\r | |
674 | \r |