Commit | Line | Data |
---|---|---|
7fc3ac8a H |
1 | @@\r |
2 | @@ Copyright (C) 2012 Roman Pauer\r | |
3 | @@\r | |
4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r | |
5 | @@ this software and associated documentation files (the "Software"), to deal in\r | |
6 | @@ the Software without restriction, including without limitation the rights to\r | |
7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r | |
8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r | |
9 | @@ so, subject to the following conditions:\r | |
10 | @@\r | |
11 | @@ The above copyright notice and this permission notice shall be included in all\r | |
12 | @@ copies or substantial portions of the Software.\r | |
13 | @@\r | |
14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r | |
15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r | |
16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r | |
17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r | |
18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r | |
19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r | |
20 | @@ SOFTWARE.\r | |
21 | @@\r | |
22 | \r | |
23 | \r | |
/* A128 / A256 expand to the text :128 / :256, or to nothing when built with GCC older than 4.4. */
/* NOTE(review): presumably used as NEON address alignment qualifiers elsewhere in the file - confirm. */
67381db0 | 24 | #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4)\r |
25 | /* the qualifiers cannot be used with this toolchain because its gas wants ',' before ':' */\r | |
26 | #define A128\r | |
27 | #define A256\r | |
28 | #else\r | |
29 | #define A128 :128\r | |
30 | #define A256 :256\r | |
31 | #endif\r | |
32 | \r | |
@ bgr1555_to_rgb565 dr0, dr1, t0, t1, t2, ar
@ Converts the 16-bit lanes of two NEON registers (dr0, dr1) in place.
@ Per lane: result bits 15-11 = input bits 4-0, result bits 10-6 = input bits 9-5,
@ result bit 5 = 0, result bits 4-0 = input bits 14-10; input bit 15 is discarded.
@ How: t = input << 11 puts the low field on top, vsri inserts (input << 1) >> 11 into the
@ low five bits of t, and vbif copies t into dr wherever the 0x07c0 mask (bits 10-6) is clear,
@ leaving the middle field from input << 1 in place.
@ dr0, dr1: NEON registers, converted in place.
@ t0, t1, t2: NEON scratch registers (t2 receives the 0x07c0 mask in every 16-bit lane).
@ ar: core scratch register (receives the constant 0x07c0).
c688b90f | 33 | .macro bgr1555_to_rgb565 dr0 dr1 t0 t1 t2 ar\r |
34 | mov \ar, #0x07c0\r | |
6ce097ba | 35 | vshl.u16 \t0, \dr0, #11\r |
36 | vshl.u16 \t1, \dr1, #11\r | |
37 | vshl.u16 \dr0, \dr0, #1\r | |
38 | vshl.u16 \dr1, \dr1, #1\r | |
c688b90f | 39 | vdup.16 \t2, \ar\r |
6ce097ba | 40 | vsri.u16 \t0, \dr0, #11\r |
41 | vsri.u16 \t1, \dr1, #11\r | |
6ce097ba | 42 | vbif \dr0, \t0, \t2\r |
43 | vbif \dr1, \t1, \t2\r | |
44 | .endm\r | |
7fc3ac8a H |
45 | \r |
@ _neon_normalxx_8_16_line_middle: shared 16-pixel step used by the line scalers in this file.
@ Reads 16 one-byte palette indices from src with four word loads, loads the 32-bit word
@ pal[index] for each of them (index scaled by 4) and packs the low halfwords of the entries
@ into NEON registers:
@   d16 = pixels 0-3, d17 = pixels 4-7, dA = pixels 8-11, dB = pixels 12-15
@ On exit: src has advanced by 16, counter has been reduced by 16, and the flags hold the
@ result of the final cmp of counter against 16, so the caller can loop with bhs as long as
@ the instructions it places in between leave the flags alone.
@ dst and dststride are accepted but not referenced in the body.
@ reg1-reg9 are core scratch registers and are all overwritten.
@ The loads and the bit-field operations are interleaved on purpose; keep the order.
46 | .macro _neon_normalxx_8_16_line_middle src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride, dA, dB\r | |
47 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
48 | \r | |
49 | ldr \reg2, [\src, #4] @ reg2 = src[4-7]\r | |
50 | \r | |
51 | ldr \reg3, [\src, #8] @ reg3 = src[8-11]\r | |
52 | \r | |
53 | ldr \reg4, [\src, #12] @ reg4 = src[12-15]\r | |
54 | ubfx \reg5, \reg1, #0, #8 @ reg5 = src[0]\r | |
55 | \r | |
56 | ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[0]]\r | |
57 | ubfx \reg6, \reg1, #8, #8 @ reg6 = src[1]\r | |
58 | \r | |
59 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[1]]\r | |
60 | ubfx \reg7, \reg1, #16, #8 @ reg7 = src[2]\r | |
61 | \r | |
62 | ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[2]]\r | |
63 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
64 | \r | |
65 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
66 | ubfx \reg8, \reg2, #0, #8 @ reg8 = src[4]\r | |
67 | \r | |
68 | ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[4]]\r | |
69 | ubfx \reg9, \reg2, #8, #8 @ reg9 = src[5]\r | |
70 | \r | |
71 | ldr \reg9, [\pal, \reg9, lsl #2] @ reg9 = pal[src[5]]\r | |
72 | bfi \reg5, \reg6, #16, #16 @ reg5 = pal[src[0]] | pal[src[1]] << 16\r | |
73 | \r | |
74 | bfi \reg7, \reg1, #16, #16 @ reg7 = pal[src[2]] | pal[src[3]] << 16\r | |
75 | ubfx \reg6, \reg2, #16, #8 @ reg6 = src[6]\r | |
76 | \r | |
77 | vmov d16, \reg5, \reg7 @ d16 = pal[src[0-3]]\r | |
78 | lsr \reg2, \reg2, #24 @ reg2 = src[7]\r | |
79 | \r | |
80 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[6]]\r | |
81 | bfi \reg8, \reg9, #16, #16 @ reg8 = pal[src[4]] | pal[src[5]] << 16\r | |
82 | \r | |
83 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[7]]\r | |
84 | ubfx \reg1, \reg3, #0, #8 @ reg1 = src[8]\r | |
85 | \r | |
86 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[8]]\r | |
87 | ubfx \reg5, \reg3, #8, #8 @ reg5 = src[9]\r | |
88 | \r | |
89 | ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[9]]\r | |
90 | ubfx \reg7, \reg3, #16, #8 @ reg7 = src[10]\r | |
91 | \r | |
92 | ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[10]]\r | |
93 | bfi \reg6, \reg2, #16, #16 @ reg6 = pal[src[6]] | pal[src[7]] << 16\r | |
94 | \r | |
95 | vmov d17, \reg8, \reg6 @ d17 = pal[src[4-7]]\r | |
96 | lsr \reg3, \reg3, #24 @ reg3 = src[11]\r | |
97 | \r | |
98 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[11]]\r | |
99 | ubfx \reg2, \reg4, #0, #8 @ reg2 = src[12]\r | |
100 | \r | |
101 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[12]]\r | |
102 | ubfx \reg6, \reg4, #8, #8 @ reg6 = src[13]\r | |
103 | \r | |
104 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[13]]\r | |
105 | ubfx \reg8, \reg4, #16, #8 @ reg8 = src[14]\r | |
106 | \r | |
107 | ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[14]]\r | |
108 | lsr \reg4, \reg4, #24 @ reg4 = src[15]\r | |
109 | \r | |
110 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[15]]\r | |
111 | bfi \reg1, \reg5, #16, #16 @ reg1 = pal[src[8]] | pal[src[9]] << 16\r | |
112 | \r | |
113 | bfi \reg7, \reg3, #16, #16 @ reg7 = pal[src[10]] | pal[src[11]] << 16\r | |
114 | bfi \reg2, \reg6, #16, #16 @ reg2 = pal[src[12]] | pal[src[13]] << 16\r | |
115 | \r | |
116 | vmov \dA, \reg1, \reg7 @ dA = pal[src[8-11]]\r | |
117 | sub \counter, \counter, #16 @ counter -= 16\r | |
118 | \r | |
119 | bfi \reg8, \reg4, #16, #16 @ reg8 = pal[src[14]] | pal[src[15]] << 16\r | |
120 | add \src, \src, #16 @ src += 16\r | |
121 | \r | |
122 | vmov \dB, \reg2, \reg8 @ dB = pal[src[12-15]]\r | |
@ flags for the bhs in the caller: taken while at least 16 pixels remain
123 | cmp \counter, #16\r | |
124 | .endm\r | |
125 | \r | |
@ neon_normal1x_8_16_line src, dst, pal, counter, reg1-reg9
@ Converts a run of 8-bit palette indices at src into 16-bit pixels at dst (1x scale).
@ Each output pixel is the low halfword of the 32-bit word pal[index]; counter is the
@ number of source pixels.
@ Stages: 1-3 leading pixels until src is 4-byte aligned, then 16 pixels per iteration via
@ _neon_normalxx_8_16_line_middle, then 4 pixels per iteration, then 1-3 trailing pixels.
@ The 16-pixel stage runs once before counter is tested, so at least 16 pixels must remain
@ after the alignment stage.
@ The alignment stage does one word load from the still unaligned src.
@ NOTE(review): this presumably relies on unaligned ldr being permitted on the target - confirm.
@ src and dst are advanced past the processed pixels. counter is decremented on the way, but
@ the 1-3 pixel tail decrements it at most twice, so it is not guaranteed to be 0 on exit.
@ Overwrites reg1-reg9, d16-d19 and the flags. Uses numeric local labels 10, 20, 30, 40.
126 | .macro neon_normal1x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9\r | |
127 | @ align src to 4 bytes\r | |
128 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
129 | beq 10f\r | |
130 | \r | |
131 | @ first 1-3 pixels\r | |
132 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
133 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
134 | \r | |
135 | add \src, \src, \reg5 @ src += reg5\r | |
136 | sub \counter, \counter, \reg5 @ counter -= reg5\r | |
137 | \r | |
@ the flags from this subS gate the ne-conditional second pixel; the subneS further down
@ gates the third one
138 | subS \reg5, \reg5, #1 @ reg5--\r | |
139 | \r | |
140 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
141 | ubfxne \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
142 | \r | |
143 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[reg2]\r | |
144 | \r | |
145 | ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[reg3]\r | |
146 | \r | |
147 | strh \reg2, [\dst] @ dst[0] = reg2\r | |
148 | \r | |
149 | strneh \reg3, [\dst, #2]! @ dst[1] = reg3; dst++\r | |
150 | subneS \reg5, \reg5, #1 @ reg5--\r | |
151 | \r | |
152 | ubfxne \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
153 | add \dst, \dst, #2 @ dst++\r | |
154 | \r | |
155 | ldrne \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[reg4]\r | |
156 | \r | |
157 | strneh \reg4, [\dst], #2 @ dst[2] = reg4; dst++\r | |
158 | \r | |
159 | @ middle pixels (16 per iteration)\r | |
@ the body below runs once before counter is compared against 16
160 | 10:\r | |
161 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, , d18, d19\r | |
162 | \r | |
163 | vst1.16 {d16-d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 2*16\r | |
164 | bhs 10b\r | |
165 | \r | |
166 | @ last 0-15 bytes\r | |
167 | \r | |
168 | cmp \counter, #0\r | |
169 | beq 40f\r | |
170 | \r | |
171 | cmp \counter, #4\r | |
172 | blo 30f\r | |
173 | \r | |
174 | @ 4-12 pixels (4 per iteration)\r | |
175 | 20:\r | |
176 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
177 | sub \counter, \counter, #4 @ counter -= 4\r | |
178 | \r | |
179 | add \src, \src, #4 @ src += 4\r | |
180 | add \dst, \dst, #(2*4) @ dst += 4\r | |
181 | \r | |
182 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
@ this cmp feeds the bhs at the end of the iteration
183 | cmp \counter, #4\r | |
184 | \r | |
185 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
186 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
187 | \r | |
188 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
189 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
190 | \r | |
191 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
192 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
193 | \r | |
194 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
195 | \r | |
196 | strh \reg2, [\dst, #-8] @ dst[0] = reg2\r | |
197 | \r | |
198 | strh \reg3, [\dst, #-6] @ dst[1] = reg3\r | |
199 | \r | |
200 | strh \reg4, [\dst, #-4] @ dst[2] = reg4\r | |
201 | \r | |
202 | strh \reg1, [\dst, #-2] @ dst[3] = reg1\r | |
203 | bhs 20b\r | |
204 | \r | |
205 | cmp \counter, #0\r | |
206 | beq 40f\r | |
207 | \r | |
208 | @ last 1-3 pixels\r | |
209 | 30:\r | |
210 | ldrb \reg1, [\src] @ reg1 = src[0]\r | |
211 | subS \counter, \counter, #1 @ counter--\r | |
212 | \r | |
213 | ldrneb \reg2, [\src, #1]! @ reg2 = src[1]; src++\r | |
214 | \r | |
215 | add \src, \src, #1 @ src++\r | |
216 | \r | |
217 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
218 | \r | |
219 | ldrne \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[1]]\r | |
220 | \r | |
221 | strh \reg1, [\dst] @ dst[0] = reg1\r | |
222 | \r | |
223 | strneh \reg2, [\dst, #2]! @ dst[1] = reg2; dst++\r | |
224 | subneS \counter, \counter, #1 @ counter--\r | |
225 | \r | |
226 | ldrneb \reg3, [\src], #1 @ reg3 = src[2]; src++\r | |
227 | add \dst, \dst, #2 @ dst++\r | |
228 | \r | |
229 | ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[2]]\r | |
230 | \r | |
231 | strneh \reg3, [\dst], #2 @ dst[2] = reg3; dst++\r | |
232 | \r | |
233 | 40:\r | |
234 | .endm\r | |
235 | \r | |
@ neon_normal2x_8_16_line src, dst, pal, counter, reg1-reg9, dststride
@ 2x scale of a run of 8-bit palette indices: every source pixel becomes a 2x2 block of the
@ 16-bit value taken from the low halfword of the 32-bit word pal[index].
@ The block is written to the row at dst and to the row at dst + dststride; dststride is added
@ directly to the byte pointer, i.e. it is a byte count.
@ Stages: 1-3 leading pixels (one per iteration) until src is 4-byte aligned, then 16 pixels
@ per iteration via _neon_normalxx_8_16_line_middle, then 4 pixels per iteration, then the
@ trailing pixels one per iteration.
@ The 16-pixel stage runs once before counter is tested, so at least 16 pixels must remain
@ after the alignment stage.
@ On exit src and dst have advanced past the processed pixels and counter is 0.
@ Overwrites reg1-reg9, d16-d23 and the flags. Uses numeric local labels 1, 10, 20, 30, 40.
236 | .macro neon_normal2x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride\r | |
237 | @ align src to 4 bytes\r | |
238 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
239 | beq 10f\r | |
240 | \r | |
241 | @ first 1-3 pixels\r | |
242 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
243 | 1:\r | |
244 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
@ reg2 = address of the second output row
245 | add \reg2, \dst, \dststride\r | |
246 | \r | |
247 | add \dst, \dst, #4 @ dst += 2*2\r | |
248 | sub \counter, \counter, #1 @ counter--\r | |
249 | \r | |
250 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
251 | subS \reg5, \reg5, #1 @ reg5--\r | |
252 | \r | |
253 | strh \reg1, [\dst, #-4] @ dst[0] = reg1\r | |
254 | \r | |
255 | strh \reg1, [\dst, #-2] @ dst[1] = reg1\r | |
256 | \r | |
257 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
258 | \r | |
259 | strh \reg1, [\reg2, #2] @ dst1[1] = reg1\r | |
260 | bne 1b\r | |
261 | \r | |
262 | @ middle pixels (16 per iteration)\r | |
@ the body below runs once before counter is compared against 16
263 | 10:\r | |
264 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d20, d21\r | |
265 | \r | |
@ q8 holds pixels 0-7 and q10 pixels 8-15; storing a register pair of identical contents
@ with vst2.16 interleaves them, which writes every pixel twice in a row
266 | vmov q9, q8\r | |
267 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r | |
268 | \r | |
269 | vmov q11, q10\r | |
270 | vst2.16 {q8,q9}, [\dst]! @ dst[0-7] = q8-q9; dst += 2*2*8\r | |
271 | \r | |
272 | vst2.16 {q10,q11}, [\dst]! @ dst[8-15] = q10-q11; dst += 2*2*8\r | |
273 | \r | |
274 | vst2.16 {q8,q9}, [\reg1]! @ dst1[0-7] = q8-q9; dst1 += 2*2*8\r | |
275 | \r | |
276 | vst2.16 {q10,q11}, [\reg1]! @ dst1[8-15] = q10-q11; dst1 += 2*2*8\r | |
277 | bhs 10b\r | |
278 | \r | |
279 | @ last 0-15 bytes\r | |
280 | \r | |
281 | cmp \counter, #0\r | |
282 | beq 40f\r | |
283 | \r | |
284 | cmp \counter, #4\r | |
285 | blo 30f\r | |
286 | \r | |
287 | @ 4-12 pixels (4 per iteration)\r | |
288 | 20:\r | |
289 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
290 | sub \counter, \counter, #4 @ counter -= 4\r | |
291 | \r | |
292 | add \src, \src, #4 @ src += 4\r | |
293 | \r | |
294 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
@ this cmp feeds the bhs at the end of the iteration
295 | cmp \counter, #4\r | |
296 | \r | |
297 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
298 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
299 | \r | |
300 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
301 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
302 | \r | |
303 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
304 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
305 | \r | |
306 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
307 | \r | |
@ reg5 = address of the second output row (taken before dst is advanced)
308 | add \reg5, \dst, \dststride\r | |
309 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r | |
310 | \r | |
311 | vmov.32 d16[0], \reg2\r | |
312 | \r | |
313 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r | |
314 | \r | |
315 | vmov.32 d16[1], \reg4\r | |
316 | \r | |
317 | vmov d17, d16\r | |
318 | \r | |
319 | vst2.16 {d16,d17}, [\dst]! @ dst[0-7] = d16-d17; dst += 2*2*4\r | |
320 | \r | |
321 | vst2.16 {d16,d17}, [\reg5] @ dst1[0-7] = d16-d17\r | |
322 | bhs 20b\r | |
323 | \r | |
324 | cmp \counter, #0\r | |
325 | beq 40f\r | |
326 | \r | |
327 | @ last 1-3 pixels\r | |
328 | 30:\r | |
329 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
330 | add \reg2, \dst, \dststride\r | |
331 | \r | |
332 | add \dst, \dst, #4 @ dst += 2*2\r | |
333 | \r | |
334 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
335 | subS \counter, \counter, #1 @ counter--\r | |
336 | \r | |
337 | strh \reg1, [\dst, #-4] @ dst[0] = reg1\r | |
338 | \r | |
339 | strh \reg1, [\dst, #-2] @ dst[1] = reg1\r | |
340 | \r | |
341 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
342 | \r | |
343 | strh \reg1, [\reg2, #2] @ dst1[1] = reg1\r | |
344 | bne 30b\r | |
345 | \r | |
346 | 40:\r | |
347 | .endm\r | |
348 | \r | |
@ neon_normal3x_8_16_line src, dst, pal, counter, reg1-reg9, dststride
@ 3x scale of a run of 8-bit palette indices: every source pixel becomes a 3x3 block of the
@ 16-bit value taken from the low halfword of the 32-bit word pal[index].
@ The block is written to the rows at dst, dst + dststride and dst + 2 * dststride; dststride
@ is added directly to the byte pointer, i.e. it is a byte count.
@ Stages: 1-3 leading pixels (one per iteration) until src is 4-byte aligned, then 16 pixels
@ per iteration via _neon_normalxx_8_16_line_middle, then 4 pixels per iteration, then the
@ trailing pixels one per iteration.
@ The 16-pixel stage runs once before counter is tested, so at least 16 pixels must remain
@ after the alignment stage.
@ The single-pixel stages write rows 1 and 2 with a halfword store followed by a word store
@ at row + 2.
@ NOTE(review): that word store presumably relies on unaligned str being permitted - confirm.
@ On exit src and dst have advanced past the processed pixels and counter is 0.
@ Overwrites reg1-reg9, d16-d27 and the flags. Uses numeric local labels 1, 10, 20, 30, 40.
349 | .macro neon_normal3x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride\r | |
350 | @ align src to 4 bytes\r | |
351 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
352 | beq 10f\r | |
353 | \r | |
354 | @ first 1-3 pixels\r | |
355 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
356 | 1:\r | |
357 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
@ reg2 = second output row, reg3 = third output row
358 | add \reg2, \dst, \dststride\r | |
359 | \r | |
360 | add \reg3, \reg2, \dststride\r | |
361 | add \dst, \dst, #6 @ dst += 3*2\r | |
362 | \r | |
363 | sub \counter, \counter, #1 @ counter--\r | |
364 | \r | |
365 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
366 | subS \reg5, \reg5, #1 @ reg5--\r | |
367 | \r | |
368 | strh \reg1, [\dst, #-6] @ dst[0] = reg1\r | |
369 | \r | |
370 | strh \reg1, [\dst, #-4] @ dst[1] = reg1\r | |
371 | \r | |
372 | strh \reg1, [\dst, #-2] @ dst[2] = reg1\r | |
373 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
374 | \r | |
375 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
376 | \r | |
377 | str \reg1, [\reg2, #2] @ dst1[1-2] = reg1\r | |
378 | \r | |
379 | strh \reg1, [\reg3] @ dst2[0] = reg1\r | |
380 | \r | |
381 | str \reg1, [\reg3, #2] @ dst2[1-2] = reg1\r | |
382 | bne 1b\r | |
383 | \r | |
384 | @ middle pixels (16 per iteration)\r | |
@ the body below runs once before counter is compared against 16
385 | 10:\r | |
386 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d22, d23\r | |
387 | \r | |
@ q8 holds pixels 0-7 and q11 pixels 8-15; each is copied twice so that vst3.16 on three
@ identical registers writes every pixel three times in a row
388 | vmov q9, q8\r | |
389 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r | |
390 | \r | |
391 | vmov q10, q8\r | |
392 | add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride\r | |
393 | \r | |
394 | vmov q12, q11\r | |
395 | vst3.16 {d16,d18,d20}, [\dst]! @ dst[0-3] = q8-q10[0]; dst += 3*2*4\r | |
396 | \r | |
397 | vmov q13, q11\r | |
398 | vst3.16 {d17,d19,d21}, [\dst]! @ dst[4-7] = q8-q10[1]; dst += 3*2*4\r | |
399 | \r | |
400 | vst3.16 {d22,d24,d26}, [\dst]! @ dst[8-11] = q11-q13[0]; dst += 3*2*4\r | |
401 | \r | |
402 | vst3.16 {d23,d25,d27}, [\dst]! @ dst[12-15] = q11-q13[1]; dst += 3*2*4\r | |
403 | \r | |
404 | vst3.16 {d16,d18,d20}, [\reg1]! @ dst1[0-3] = q8-q10[0]; dst1 += 3*2*4\r | |
405 | \r | |
406 | vst3.16 {d17,d19,d21}, [\reg1]! @ dst1[4-7] = q8-q10[1]; dst1 += 3*2*4\r | |
407 | \r | |
408 | vst3.16 {d22,d24,d26}, [\reg1]! @ dst1[8-11] = q11-q13[0]; dst1 += 3*2*4\r | |
409 | \r | |
410 | vst3.16 {d23,d25,d27}, [\reg1]! @ dst1[12-15] = q11-q13[1]; dst1 += 3*2*4\r | |
411 | \r | |
412 | vst3.16 {d16,d18,d20}, [\reg2]! @ dst2[0-3] = q8-q10[0]; dst2 += 3*2*4\r | |
413 | \r | |
414 | vst3.16 {d17,d19,d21}, [\reg2]! @ dst2[4-7] = q8-q10[1]; dst2 += 3*2*4\r | |
415 | \r | |
416 | vst3.16 {d22,d24,d26}, [\reg2]! @ dst2[8-11] = q11-q13[0]; dst2 += 3*2*4\r | |
417 | \r | |
418 | vst3.16 {d23,d25,d27}, [\reg2]! @ dst2[12-15] = q11-q13[1]; dst2 += 3*2*4\r | |
419 | bhs 10b\r | |
420 | \r | |
421 | @ last 0-15 bytes\r | |
422 | \r | |
423 | cmp \counter, #0\r | |
424 | beq 40f\r | |
425 | \r | |
426 | cmp \counter, #4\r | |
427 | blo 30f\r | |
428 | \r | |
429 | @ 4-12 pixels (4 per iteration)\r | |
430 | 20:\r | |
431 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
432 | sub \counter, \counter, #4 @ counter -= 4\r | |
433 | \r | |
434 | add \src, \src, #4 @ src += 4\r | |
435 | \r | |
436 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
@ this cmp feeds the bhs at the end of the iteration
437 | cmp \counter, #4\r | |
438 | \r | |
439 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
440 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
441 | \r | |
442 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
443 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
444 | \r | |
445 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
446 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
447 | \r | |
448 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
449 | \r | |
@ reg5 = second output row, reg6 = third output row (taken before dst is advanced)
450 | add \reg5, \dst, \dststride\r | |
451 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r | |
452 | \r | |
453 | vmov.32 d16[0], \reg2\r | |
454 | add \reg6, \reg5, \dststride\r | |
455 | \r | |
456 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r | |
457 | \r | |
458 | vmov.32 d16[1], \reg4\r | |
459 | \r | |
460 | vmov d17, d16\r | |
461 | \r | |
462 | vmov d18, d16\r | |
463 | \r | |
464 | vst3.16 {d16,d17,d18}, [\dst]! @ dst[0-11] = d16-d18; dst += 3*2*4\r | |
465 | \r | |
466 | vst3.16 {d16,d17,d18}, [\reg5] @ dst1[0-11] = d16-d18\r | |
467 | \r | |
468 | vst3.16 {d16,d17,d18}, [\reg6] @ dst2[0-11] = d16-d18\r | |
469 | bhs 20b\r | |
470 | \r | |
471 | cmp \counter, #0\r | |
472 | beq 40f\r | |
473 | \r | |
474 | @ last 1-3 pixels\r | |
475 | 30:\r | |
476 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
477 | add \reg2, \dst, \dststride\r | |
478 | \r | |
479 | add \reg3, \reg2, \dststride\r | |
480 | add \dst, \dst, #6 @ dst += 3*2\r | |
481 | \r | |
482 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
483 | subS \counter, \counter, #1 @ counter--\r | |
484 | \r | |
485 | strh \reg1, [\dst, #-6] @ dst[0] = reg1\r | |
486 | \r | |
487 | strh \reg1, [\dst, #-4] @ dst[1] = reg1\r | |
488 | \r | |
489 | strh \reg1, [\dst, #-2] @ dst[2] = reg1\r | |
490 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
491 | \r | |
492 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
493 | \r | |
494 | str \reg1, [\reg2, #2] @ dst1[1-2] = reg1\r | |
495 | \r | |
496 | strh \reg1, [\reg3] @ dst2[0] = reg1\r | |
497 | \r | |
498 | str \reg1, [\reg3, #2] @ dst2[1-2] = reg1\r | |
499 | bne 30b\r | |
500 | \r | |
501 | 40:\r | |
502 | .endm\r | |
503 | \r | |
@ neon_normal4x_8_16_line src, dst, pal, counter, reg1-reg9, dststride
@ 4x scale of a run of 8-bit palette indices: every source pixel becomes a 4x4 block of the
@ 16-bit value taken from the low halfword of the 32-bit word pal[index].
@ The block is written to the rows at dst, dst + dststride, dst + 2 * dststride and
@ dst + 3 * dststride; dststride is added directly to the byte pointer, i.e. it is a byte count.
@ Stages: 1-3 leading pixels (one per iteration) until src is 4-byte aligned, then 16 pixels
@ per iteration via _neon_normalxx_8_16_line_middle, then 4 pixels per iteration, then the
@ trailing pixels one per iteration.
@ The 16-pixel stage runs once before counter is tested, so at least 16 pixels must remain
@ after the alignment stage.
@ The single-pixel stages use word stores relative to dst and to the row pointers.
@ NOTE(review): whether those addresses are 4-byte aligned depends on dst and dststride
@ supplied by the caller - confirm.
@ On exit src and dst have advanced past the processed pixels and counter is 0.
@ Overwrites reg1-reg9, d16-d31 and the flags. Uses numeric local labels 1, 10, 20, 30, 40.
504 | .macro neon_normal4x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride\r | |
505 | @ align src to 4 bytes\r | |
506 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
507 | beq 10f\r | |
508 | \r | |
509 | @ first 1-3 pixels\r | |
510 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
511 | 1:\r | |
512 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
@ reg2, reg3, reg4 = second, third and fourth output row
513 | add \reg2, \dst, \dststride\r | |
514 | \r | |
515 | add \reg3, \reg2, \dststride\r | |
516 | add \dst, \dst, #8 @ dst += 4*2\r | |
517 | \r | |
518 | sub \counter, \counter, #1 @ counter--\r | |
519 | \r | |
520 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
521 | add \reg4, \reg3, \dststride\r | |
522 | \r | |
523 | strh \reg1, [\dst, #-8] @ dst[0] = reg1\r | |
524 | subS \reg5, \reg5, #1 @ reg5--\r | |
525 | \r | |
526 | strh \reg1, [\dst, #-6] @ dst[1] = reg1\r | |
527 | \r | |
528 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
529 | str \reg1, [\dst, #-4] @ dst[2-3] = reg1\r | |
530 | \r | |
531 | str \reg1, [\reg2] @ dst1[0-1] = reg1\r | |
532 | \r | |
533 | str \reg1, [\reg2, #4] @ dst1[2-3] = reg1\r | |
534 | \r | |
535 | str \reg1, [\reg3] @ dst2[0-1] = reg1\r | |
536 | \r | |
537 | str \reg1, [\reg3, #4] @ dst2[2-3] = reg1\r | |
538 | \r | |
539 | str \reg1, [\reg4] @ dst3[0-1] = reg1\r | |
540 | \r | |
541 | str \reg1, [\reg4, #4] @ dst3[2-3] = reg1\r | |
542 | bne 1b\r | |
543 | \r | |
544 | @ middle pixels (16 per iteration)\r | |
@ the body below runs once before counter is compared against 16
545 | 10:\r | |
546 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d24, d25\r | |
547 | \r | |
@ q8 holds pixels 0-7 and q12 pixels 8-15; each is copied three times so that vst4.16 on
@ four identical registers writes every pixel four times in a row
548 | vmov q9, q8\r | |
549 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r | |
550 | \r | |
551 | vmov q10, q8\r | |
552 | add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride\r | |
553 | \r | |
554 | vmov q11, q8\r | |
555 | add \reg3, \reg1, \dststride,lsl #1 @ reg3 = dst + 3 * dststride\r | |
556 | \r | |
557 | vmov q13, q12\r | |
558 | vst4.16 {d16,d18,d20,d22}, [\dst]! @ dst[0-3] = q8-q11[0]; dst += 4*2*4\r | |
559 | \r | |
560 | vmov q14, q12\r | |
561 | \r | |
562 | vmov q15, q12\r | |
563 | vst4.16 {d17,d19,d21,d23}, [\dst]! @ dst[4-7] = q8-q11[1]; dst += 4*2*4\r | |
564 | \r | |
565 | vst4.16 {d24,d26,d28,d30}, [\dst]! @ dst[8-11] = q12-q15[0]; dst += 4*2*4\r | |
566 | \r | |
567 | vst4.16 {d25,d27,d29,d31}, [\dst]! @ dst[12-15] = q12-q15[1]; dst += 4*2*4\r | |
568 | \r | |
569 | vst4.16 {d16,d18,d20,d22}, [\reg1]! @ dst1[0-3] = q8-q11[0]; dst1 += 4*2*4\r | |
570 | \r | |
571 | vst4.16 {d17,d19,d21,d23}, [\reg1]! @ dst1[4-7] = q8-q11[1]; dst1 += 4*2*4\r | |
572 | \r | |
573 | vst4.16 {d24,d26,d28,d30}, [\reg1]! @ dst1[8-11] = q12-q15[0]; dst1 += 4*2*4\r | |
574 | \r | |
575 | vst4.16 {d25,d27,d29,d31}, [\reg1]! @ dst1[12-15] = q12-q15[1]; dst1 += 4*2*4\r | |
576 | \r | |
577 | vst4.16 {d16,d18,d20,d22}, [\reg2]! @ dst2[0-3] = q8-q11[0]; dst2 += 4*2*4\r | |
578 | \r | |
579 | vst4.16 {d17,d19,d21,d23}, [\reg2]! @ dst2[4-7] = q8-q11[1]; dst2 += 4*2*4\r | |
580 | \r | |
581 | vst4.16 {d24,d26,d28,d30}, [\reg2]! @ dst2[8-11] = q12-q15[0]; dst2 += 4*2*4\r | |
582 | \r | |
583 | vst4.16 {d25,d27,d29,d31}, [\reg2]! @ dst2[12-15] = q12-q15[1]; dst2 += 4*2*4\r | |
584 | \r | |
585 | vst4.16 {d16,d18,d20,d22}, [\reg3]! @ dst3[0-3] = q8-q11[0]; dst3 += 4*2*4\r | |
586 | \r | |
587 | vst4.16 {d17,d19,d21,d23}, [\reg3]! @ dst3[4-7] = q8-q11[1]; dst3 += 4*2*4\r | |
588 | \r | |
589 | vst4.16 {d24,d26,d28,d30}, [\reg3]! @ dst3[8-11] = q12-q15[0]; dst3 += 4*2*4\r | |
590 | \r | |
591 | vst4.16 {d25,d27,d29,d31}, [\reg3]! @ dst3[12-15] = q12-q15[1]; dst3 += 4*2*4\r | |
592 | bhs 10b\r | |
593 | \r | |
594 | @ last 0-15 bytes\r | |
595 | \r | |
596 | cmp \counter, #0\r | |
597 | beq 40f\r | |
598 | \r | |
599 | cmp \counter, #4\r | |
600 | blo 30f\r | |
601 | \r | |
602 | @ 4-12 pixels (4 per iteration)\r | |
603 | 20:\r | |
604 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
605 | sub \counter, \counter, #4 @ counter -= 4\r | |
606 | \r | |
607 | add \src, \src, #4 @ src += 4\r | |
608 | \r | |
609 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
@ this cmp feeds the bhs at the end of the iteration
610 | cmp \counter, #4\r | |
611 | \r | |
612 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
613 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
614 | \r | |
615 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
616 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
617 | \r | |
618 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
619 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
620 | \r | |
621 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
622 | \r | |
@ reg5, reg6, reg7 = second, third and fourth output row (taken before dst is advanced)
623 | add \reg5, \dst, \dststride\r | |
624 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r | |
625 | \r | |
626 | vmov.32 d16[0], \reg2\r | |
627 | add \reg6, \reg5, \dststride\r | |
628 | \r | |
629 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r | |
630 | add \reg7, \reg6, \dststride\r | |
631 | \r | |
632 | vmov.32 d16[1], \reg4\r | |
633 | \r | |
634 | vmov d17, d16\r | |
635 | \r | |
636 | vmov d18, d16\r | |
637 | \r | |
638 | vmov d19, d16\r | |
639 | \r | |
640 | vst4.16 {d16,d17,d18,d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 4*2*4\r | |
641 | \r | |
642 | vst4.16 {d16,d17,d18,d19}, [\reg5] @ dst1[0-15] = d16-d19\r | |
643 | \r | |
644 | vst4.16 {d16,d17,d18,d19}, [\reg6] @ dst2[0-15] = d16-d19\r | |
645 | \r | |
646 | vst4.16 {d16,d17,d18,d19}, [\reg7] @ dst3[0-15] = d16-d19\r | |
647 | bhs 20b\r | |
648 | \r | |
649 | cmp \counter, #0\r | |
650 | beq 40f\r | |
651 | \r | |
652 | @ last 1-3 pixels\r | |
653 | 30:\r | |
654 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
655 | add \reg2, \dst, \dststride\r | |
656 | \r | |
657 | add \reg3, \reg2, \dststride\r | |
658 | add \dst, \dst, #8 @ dst += 4*2\r | |
659 | \r | |
660 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
661 | add \reg4, \reg3, \dststride\r | |
662 | \r | |
663 | strh \reg1, [\dst, #-8] @ dst[0] = reg1\r | |
664 | subS \counter, \counter, #1 @ counter--\r | |
665 | \r | |
666 | strh \reg1, [\dst, #-6] @ dst[1] = reg1\r | |
667 | \r | |
668 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
669 | str \reg1, [\dst, #-4] @ dst[2-3] = reg1\r | |
670 | \r | |
671 | str \reg1, [\reg2] @ dst1[0-1] = reg1\r | |
672 | \r | |
673 | str \reg1, [\reg2, #4] @ dst1[2-3] = reg1\r | |
674 | \r | |
675 | str \reg1, [\reg3] @ dst2[0-1] = reg1\r | |
676 | \r | |
677 | str \reg1, [\reg3, #4] @ dst2[2-3] = reg1\r | |
678 | \r | |
679 | str \reg1, [\reg4] @ dst3[0-1] = reg1\r | |
680 | \r | |
681 | str \reg1, [\reg4, #4] @ dst3[2-3] = reg1\r | |
682 | bne 30b\r | |
683 | \r | |
684 | 40:\r | |
685 | .endm\r | |
686 | \r |