Commit | Line | Data |
---|---|---|
7fc3ac8a H |
1 | @@\r |
2 | @@ Copyright (C) 2012 Roman Pauer\r | |
3 | @@\r | |
4 | @@ Permission is hereby granted, free of charge, to any person obtaining a copy of\r | |
5 | @@ this software and associated documentation files (the "Software"), to deal in\r | |
6 | @@ the Software without restriction, including without limitation the rights to\r | |
7 | @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\r | |
8 | @@ of the Software, and to permit persons to whom the Software is furnished to do\r | |
9 | @@ so, subject to the following conditions:\r | |
10 | @@\r | |
11 | @@ The above copyright notice and this permission notice shall be included in all\r | |
12 | @@ copies or substantial portions of the Software.\r | |
13 | @@\r | |
14 | @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r | |
15 | @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r | |
16 | @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r | |
17 | @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r | |
18 | @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r | |
19 | @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r | |
20 | @@ SOFTWARE.\r | |
21 | @@\r | |
22 | \r | |
23 | \r | |
24 | \r | |
25 | .macro _neon_normalxx_8_16_line_middle src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride, dA, dB\r | |
@ Shared inner step of the neon_normalNx_8_16_line macros: turns 16 8-bit
@ source pixels into 16 16-bit pixels held in NEON registers.
@
@ - src[0-15] is fetched with four word loads and split into byte indices.
@ - Each index selects one word of pal (index scaled by 4); bfi keeps the
@   low halfword of each looked-up word and packs two pixels per register.
@ - Results: d16 = pixels 0-3, d17 = pixels 4-7, dA = pixels 8-11,
@   dB = pixels 12-15.
@ - Side effects: src += 16, counter -= 16, reg1-reg9 are overwritten.
@ - Nothing is stored to memory here; the dst and dststride arguments are
@   not referenced in this body.
@ - The closing cmp compares the updated counter with 16 and leaves the
@   flags for the loop branch that the invoking macro places after its
@   stores.
26 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
27 | \r | |
28 | ldr \reg2, [\src, #4] @ reg2 = src[4-7]\r | |
29 | \r | |
30 | ldr \reg3, [\src, #8] @ reg3 = src[8-11]\r | |
31 | \r | |
32 | ldr \reg4, [\src, #12] @ reg4 = src[12-15]\r | |
33 | ubfx \reg5, \reg1, #0, #8 @ reg5 = src[0]\r | |
34 | \r | |
35 | ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[0]]\r | |
36 | ubfx \reg6, \reg1, #8, #8 @ reg6 = src[1]\r | |
37 | \r | |
38 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[1]]\r | |
39 | ubfx \reg7, \reg1, #16, #8 @ reg7 = src[2]\r | |
40 | \r | |
41 | ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[2]]\r | |
42 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
43 | \r | |
44 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
45 | ubfx \reg8, \reg2, #0, #8 @ reg8 = src[4]\r | |
46 | \r | |
47 | ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[4]]\r | |
48 | ubfx \reg9, \reg2, #8, #8 @ reg9 = src[5]\r | |
49 | \r | |
50 | ldr \reg9, [\pal, \reg9, lsl #2] @ reg9 = pal[src[5]]\r | |
51 | bfi \reg5, \reg6, #16, #16 @ reg5 = pal[src[0]] | pal[src[1]] << 16\r | |
52 | \r | |
53 | bfi \reg7, \reg1, #16, #16 @ reg7 = pal[src[2]] | pal[src[3]] << 16\r | |
54 | ubfx \reg6, \reg2, #16, #8 @ reg6 = src[6]\r | |
55 | \r | |
56 | vmov d16, \reg5, \reg7 @ d16 = pal[src[0-3]]\r | |
57 | lsr \reg2, \reg2, #24 @ reg2 = src[7]\r | |
58 | \r | |
59 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[6]]\r | |
60 | bfi \reg8, \reg9, #16, #16 @ reg8 = pal[src[4]] | pal[src[5]] << 16\r | |
61 | \r | |
62 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[7]]\r | |
63 | ubfx \reg1, \reg3, #0, #8 @ reg1 = src[8]\r | |
64 | \r | |
65 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[8]]\r | |
66 | ubfx \reg5, \reg3, #8, #8 @ reg5 = src[9]\r | |
67 | \r | |
68 | ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[9]]\r | |
69 | ubfx \reg7, \reg3, #16, #8 @ reg7 = src[10]\r | |
70 | \r | |
71 | ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[10]]\r | |
72 | bfi \reg6, \reg2, #16, #16 @ reg6 = pal[src[6]] | pal[src[7]] << 16\r | |
73 | \r | |
74 | vmov d17, \reg8, \reg6 @ d17 = pal[src[4-7]]\r | |
75 | lsr \reg3, \reg3, #24 @ reg3 = src[11]\r | |
76 | \r | |
77 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[11]]\r | |
78 | ubfx \reg2, \reg4, #0, #8 @ reg2 = src[12]\r | |
79 | \r | |
80 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[12]]\r | |
81 | ubfx \reg6, \reg4, #8, #8 @ reg6 = src[13]\r | |
82 | \r | |
83 | ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[13]]\r | |
84 | ubfx \reg8, \reg4, #16, #8 @ reg8 = src[14]\r | |
85 | \r | |
86 | ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[14]]\r | |
87 | lsr \reg4, \reg4, #24 @ reg4 = src[15]\r | |
88 | \r | |
89 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[15]]\r | |
90 | bfi \reg1, \reg5, #16, #16 @ reg1 = pal[src[8]] | pal[src[9]] << 16\r | |
91 | \r | |
92 | bfi \reg7, \reg3, #16, #16 @ reg7 = pal[src[10]] | pal[src[11]] << 16\r | |
93 | bfi \reg2, \reg6, #16, #16 @ reg2 = pal[src[12]] | pal[src[13]] << 16\r | |
94 | \r | |
95 | vmov \dA, \reg1, \reg7 @ dA = pal[src[8-11]]\r | |
96 | sub \counter, \counter, #16 @ counter -= 16\r | |
97 | \r | |
98 | bfi \reg8, \reg4, #16, #16 @ reg8 = pal[src[14]] | pal[src[15]] << 16\r | |
99 | add \src, \src, #16 @ src += 16\r | |
100 | \r | |
101 | vmov \dB, \reg2, \reg8 @ dB = pal[src[12-15]]\r | |
@ flags for the invoking macro's loop branch: hs when at least 16 pixels remain
102 | cmp \counter, #16\r | |
103 | .endm\r | |
104 | \r | |
105 | .macro neon_normal1x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9\r | |
@ Converts one line of counter 8-bit pixels at src into 16-bit pixels at
@ dst (1:1 size). Every source byte indexes a word in pal and the low
@ halfword of that word is stored.
@
@ Phases:
@   (prologue) 1-3 pixels until src is a multiple of 4
@   10:        16 pixels per pass through _neon_normalxx_8_16_line_middle
@   20:        4 pixels per pass
@   30:        final 1-3 pixels
@   40:        end of the macro
@
@ src, dst and counter are updated as pixels are consumed. reg1-reg9,
@ d16-d19 and the condition flags are overwritten.
@
@ NOTE(review): label 10 is reached without testing counter, so one
@ 16-pixel pass always runs; presumably callers guarantee that at least 16
@ pixels remain once src is aligned - confirm.
106 | @ align src to 4 bytes\r | |
107 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
108 | beq 10f\r | |
109 | \r | |
110 | @ first 1-3 pixels\r | |
@ word load from the not yet aligned src; only the first 4 - (src & 3)
@ bytes of it are consumed here
111 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
112 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
113 | \r | |
114 | add \src, \src, \reg5 @ src += reg5\r | |
115 | sub \counter, \counter, \reg5 @ counter -= reg5\r | |
116 | \r | |
@ ne from here on means a second pixel has to be converted
117 | subS \reg5, \reg5, #1 @ reg5--\r | |
118 | \r | |
119 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
120 | ubfxne \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
121 | \r | |
122 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[reg2]\r | |
123 | \r | |
124 | ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[reg3]\r | |
125 | \r | |
126 | strh \reg2, [\dst] @ dst[0] = reg2\r | |
127 | \r | |
128 | strneh \reg3, [\dst, #2]! @ dst[1] = reg3; dst++\r | |
@ only executed when a second pixel existed; ne afterwards means a third one follows
129 | subneS \reg5, \reg5, #1 @ reg5--\r | |
130 | \r | |
131 | ubfxne \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
132 | add \dst, \dst, #2 @ dst++\r | |
133 | \r | |
134 | ldrne \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[reg4]\r | |
135 | \r | |
136 | strneh \reg4, [\dst], #2 @ dst[2] = reg4; dst++\r | |
137 | \r | |
138 | @ middle pixels (16 per iteration)\r | |
139 | 10:\r | |
140 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, , d18, d19\r | |
141 | \r | |
142 | vst1.16 {d16-d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 2*16\r | |
@ flags still come from the cmp that ends the middle macro
143 | bhs 10b\r | |
144 | \r | |
145 | @ last 0-15 bytes\r | |
146 | \r | |
147 | cmp \counter, #0\r | |
148 | beq 40f\r | |
149 | \r | |
150 | cmp \counter, #4\r | |
151 | blo 30f\r | |
152 | \r | |
153 | @ 4-12 pixels (4 per iteration)\r | |
154 | 20:\r | |
155 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
156 | sub \counter, \counter, #4 @ counter -= 4\r | |
157 | \r | |
158 | add \src, \src, #4 @ src += 4\r | |
159 | add \dst, \dst, #(2*4) @ dst += 4\r | |
160 | \r | |
161 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
@ loop condition for the bhs below; the instructions in between leave the flags alone
162 | cmp \counter, #4\r | |
163 | \r | |
164 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
165 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
166 | \r | |
167 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
168 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
169 | \r | |
170 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
171 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
172 | \r | |
173 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
174 | \r | |
@ dst was already advanced by 8 bytes, hence the negative offsets
175 | strh \reg2, [\dst, #-8] @ dst[0] = reg2\r | |
176 | \r | |
177 | strh \reg3, [\dst, #-6] @ dst[1] = reg3\r | |
178 | \r | |
179 | strh \reg4, [\dst, #-4] @ dst[2] = reg4\r | |
180 | \r | |
181 | strh \reg1, [\dst, #-2] @ dst[3] = reg1\r | |
182 | bhs 20b\r | |
183 | \r | |
184 | cmp \counter, #0\r | |
185 | beq 40f\r | |
186 | \r | |
187 | @ last 1-3 pixels\r | |
188 | 30:\r | |
189 | ldrb \reg1, [\src] @ reg1 = src[0]\r | |
@ ne from here on means a second pixel has to be converted
190 | subS \counter, \counter, #1 @ counter--\r | |
191 | \r | |
192 | ldrneb \reg2, [\src, #1]! @ reg2 = src[1]; src++\r | |
193 | \r | |
194 | add \src, \src, #1 @ src++\r | |
195 | \r | |
196 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
197 | \r | |
198 | ldrne \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[1]]\r | |
199 | \r | |
200 | strh \reg1, [\dst] @ dst[0] = reg1\r | |
201 | \r | |
202 | strneh \reg2, [\dst, #2]! @ dst[1] = reg2; dst++\r | |
@ only executed when a second pixel existed; ne afterwards means a third one follows
203 | subneS \counter, \counter, #1 @ counter--\r | |
204 | \r | |
205 | ldrneb \reg3, [\src], #1 @ reg3 = src[2]; src++\r | |
206 | add \dst, \dst, #2 @ dst++\r | |
207 | \r | |
208 | ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[2]]\r | |
209 | \r | |
210 | strneh \reg3, [\dst], #2 @ dst[2] = reg3; dst++\r | |
211 | \r | |
212 | 40:\r | |
213 | .endm\r | |
214 | \r | |
215 | .macro neon_normal2x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride\r | |
@ Converts one line of counter 8-bit pixels at src into 16-bit pixels and
@ writes every pixel as a 2x2 block: twice in the row at dst and twice in
@ the row at dst + dststride (dststride is added to the byte address).
@ Every source byte indexes a word in pal; the low halfword is stored.
@
@ Phases:
@   1:   1-3 single pixels until src is a multiple of 4
@   10:  16 pixels per pass through _neon_normalxx_8_16_line_middle
@   20:  4 pixels per pass
@   30:  final 1-3 single pixels
@   40:  end of the macro
@
@ src, dst and counter are updated as pixels are consumed. reg1-reg9,
@ q8-q11 (d16-d23) and the condition flags are overwritten.
@
@ NOTE(review): label 10 is reached without testing counter, so one
@ 16-pixel pass always runs; presumably callers guarantee that at least 16
@ pixels remain once src is aligned - confirm.
216 | @ align src to 4 bytes\r | |
217 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
218 | beq 10f\r | |
219 | \r | |
220 | @ first 1-3 pixels\r | |
221 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
222 | 1:\r | |
223 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
@ reg2 = address of the same pixel in the second output row
224 | add \reg2, \dst, \dststride\r | |
225 | \r | |
226 | add \dst, \dst, #4 @ dst += 2*2\r | |
227 | sub \counter, \counter, #1 @ counter--\r | |
228 | \r | |
229 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
230 | subS \reg5, \reg5, #1 @ reg5--\r | |
231 | \r | |
232 | strh \reg1, [\dst, #-4] @ dst[0] = reg1\r | |
233 | \r | |
234 | strh \reg1, [\dst, #-2] @ dst[1] = reg1\r | |
235 | \r | |
236 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
237 | \r | |
238 | strh \reg1, [\reg2, #2] @ dst1[1] = reg1\r | |
239 | bne 1b\r | |
240 | \r | |
241 | @ middle pixels (16 per iteration)\r | |
242 | 10:\r | |
243 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d20, d21\r | |
244 | \r | |
@ q8 = pixels 0-7, q10 = pixels 8-15; each is duplicated so that the
@ interleaving vst2 stores write every pixel twice in a row
245 | vmov q9, q8\r | |
246 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r | |
247 | \r | |
248 | vmov q11, q10\r | |
249 | vst2.16 {q8,q9}, [\dst]! @ dst[0-7] = q8-q9; dst += 2*2*8\r | |
250 | \r | |
251 | vst2.16 {q10,q11}, [\dst]! @ dst[8-15] = q10-q11; dst += 2*2*8\r | |
252 | \r | |
253 | vst2.16 {q8,q9}, [\reg1]! @ dst1[0-7] = q8-q9; dst1 += 2*2*8\r | |
254 | \r | |
255 | vst2.16 {q10,q11}, [\reg1]! @ dst1[8-15] = q10-q11; dst1 += 2*2*8\r | |
@ flags still come from the cmp that ends the middle macro
256 | bhs 10b\r | |
257 | \r | |
258 | @ last 0-15 bytes\r | |
259 | \r | |
260 | cmp \counter, #0\r | |
261 | beq 40f\r | |
262 | \r | |
263 | cmp \counter, #4\r | |
264 | blo 30f\r | |
265 | \r | |
266 | @ 4-12 pixels (4 per iteration)\r | |
267 | 20:\r | |
268 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
269 | sub \counter, \counter, #4 @ counter -= 4\r | |
270 | \r | |
271 | add \src, \src, #4 @ src += 4\r | |
272 | \r | |
273 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
@ loop condition for the bhs below; the instructions in between leave the flags alone
274 | cmp \counter, #4\r | |
275 | \r | |
276 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
277 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
278 | \r | |
279 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
280 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
281 | \r | |
282 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
283 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
284 | \r | |
285 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
286 | \r | |
@ reg5 = second output row, taken before dst is advanced by the store below
287 | add \reg5, \dst, \dststride\r | |
288 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r | |
289 | \r | |
290 | vmov.32 d16[0], \reg2\r | |
291 | \r | |
292 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r | |
293 | \r | |
294 | vmov.32 d16[1], \reg4\r | |
295 | \r | |
296 | vmov d17, d16\r | |
297 | \r | |
298 | vst2.16 {d16,d17}, [\dst]! @ dst[0-7] = d16-d17; dst += 2*2*4\r | |
299 | \r | |
300 | vst2.16 {d16,d17}, [\reg5] @ dst1[0-7] = d16-d17\r | |
301 | bhs 20b\r | |
302 | \r | |
303 | cmp \counter, #0\r | |
304 | beq 40f\r | |
305 | \r | |
306 | @ last 1-3 pixels\r | |
307 | 30:\r | |
308 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
@ reg2 = address of the same pixel in the second output row
309 | add \reg2, \dst, \dststride\r | |
310 | \r | |
311 | add \dst, \dst, #4 @ dst += 2*2\r | |
312 | \r | |
313 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
314 | subS \counter, \counter, #1 @ counter--\r | |
315 | \r | |
316 | strh \reg1, [\dst, #-4] @ dst[0] = reg1\r | |
317 | \r | |
318 | strh \reg1, [\dst, #-2] @ dst[1] = reg1\r | |
319 | \r | |
320 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
321 | \r | |
322 | strh \reg1, [\reg2, #2] @ dst1[1] = reg1\r | |
323 | bne 30b\r | |
324 | \r | |
325 | 40:\r | |
326 | .endm\r | |
327 | \r | |
328 | .macro neon_normal3x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride\r | |
@ Converts one line of counter 8-bit pixels at src into 16-bit pixels and
@ writes every pixel as a 3x3 block: three times in each of the rows at
@ dst, dst + dststride and dst + 2 * dststride (dststride is added to the
@ byte address). Every source byte indexes a word in pal; the low halfword
@ is stored.
@
@ Phases:
@   1:   1-3 single pixels until src is a multiple of 4
@   10:  16 pixels per pass through _neon_normalxx_8_16_line_middle
@   20:  4 pixels per pass
@   30:  final 1-3 single pixels
@   40:  end of the macro
@
@ src, dst and counter are updated as pixels are consumed. reg1-reg9,
@ q8-q13 (d16-d27) and the condition flags are overwritten.
@
@ NOTE(review): label 10 is reached without testing counter, so one
@ 16-pixel pass always runs; presumably callers guarantee that at least 16
@ pixels remain once src is aligned - confirm.
329 | @ align src to 4 bytes\r | |
330 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
331 | beq 10f\r | |
332 | \r | |
333 | @ first 1-3 pixels\r | |
334 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
335 | 1:\r | |
336 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
@ reg2 / reg3 = address of the same pixel in the second / third output row
337 | add \reg2, \dst, \dststride\r | |
338 | \r | |
339 | add \reg3, \reg2, \dststride\r | |
340 | add \dst, \dst, #6 @ dst += 3*2\r | |
341 | \r | |
342 | sub \counter, \counter, #1 @ counter--\r | |
343 | \r | |
344 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
345 | subS \reg5, \reg5, #1 @ reg5--\r | |
346 | \r | |
347 | strh \reg1, [\dst, #-6] @ dst[0] = reg1\r | |
348 | \r | |
349 | strh \reg1, [\dst, #-4] @ dst[1] = reg1\r | |
350 | \r | |
351 | strh \reg1, [\dst, #-2] @ dst[2] = reg1\r | |
352 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
353 | \r | |
@ rows 1 and 2: one halfword store plus one word store at byte offset 2
354 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
355 | \r | |
356 | str \reg1, [\reg2, #2] @ dst1[1-2] = reg1\r | |
357 | \r | |
358 | strh \reg1, [\reg3] @ dst2[0] = reg1\r | |
359 | \r | |
360 | str \reg1, [\reg3, #2] @ dst2[1-2] = reg1\r | |
361 | bne 1b\r | |
362 | \r | |
363 | @ middle pixels (16 per iteration)\r | |
364 | 10:\r | |
365 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d22, d23\r | |
366 | \r | |
@ q8 = pixels 0-7, q11 = pixels 8-15; each gets two copies so that the
@ interleaving vst3 stores write every pixel three times in a row
367 | vmov q9, q8\r | |
368 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r | |
369 | \r | |
370 | vmov q10, q8\r | |
371 | add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride\r | |
372 | \r | |
373 | vmov q12, q11\r | |
374 | vst3.16 {d16,d18,d20}, [\dst]! @ dst[0-3] = q8-q10[0]; dst += 3*2*4\r | |
375 | \r | |
376 | vmov q13, q11\r | |
377 | vst3.16 {d17,d19,d21}, [\dst]! @ dst[4-7] = q8-q10[1]; dst += 3*2*4\r | |
378 | \r | |
379 | vst3.16 {d22,d24,d26}, [\dst]! @ dst[8-11] = q11-q13[0]; dst += 3*2*4\r | |
380 | \r | |
381 | vst3.16 {d23,d25,d27}, [\dst]! @ dst[12-15] = q11-q13[1]; dst += 3*2*4\r | |
382 | \r | |
383 | vst3.16 {d16,d18,d20}, [\reg1]! @ dst1[0-3] = q8-q10[0]; dst1 += 3*2*4\r | |
384 | \r | |
385 | vst3.16 {d17,d19,d21}, [\reg1]! @ dst1[4-7] = q8-q10[1]; dst1 += 3*2*4\r | |
386 | \r | |
387 | vst3.16 {d22,d24,d26}, [\reg1]! @ dst1[8-11] = q11-q13[0]; dst1 += 3*2*4\r | |
388 | \r | |
389 | vst3.16 {d23,d25,d27}, [\reg1]! @ dst1[12-15] = q11-q13[1]; dst1 += 3*2*4\r | |
390 | \r | |
391 | vst3.16 {d16,d18,d20}, [\reg2]! @ dst2[0-3] = q8-q10[0]; dst2 += 3*2*4\r | |
392 | \r | |
393 | vst3.16 {d17,d19,d21}, [\reg2]! @ dst2[4-7] = q8-q10[1]; dst2 += 3*2*4\r | |
394 | \r | |
395 | vst3.16 {d22,d24,d26}, [\reg2]! @ dst2[8-11] = q11-q13[0]; dst2 += 3*2*4\r | |
396 | \r | |
397 | vst3.16 {d23,d25,d27}, [\reg2]! @ dst2[12-15] = q11-q13[1]; dst2 += 3*2*4\r | |
@ flags still come from the cmp that ends the middle macro
398 | bhs 10b\r | |
399 | \r | |
400 | @ last 0-15 bytes\r | |
401 | \r | |
402 | cmp \counter, #0\r | |
403 | beq 40f\r | |
404 | \r | |
405 | cmp \counter, #4\r | |
406 | blo 30f\r | |
407 | \r | |
408 | @ 4-12 pixels (4 per iteration)\r | |
409 | 20:\r | |
410 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
411 | sub \counter, \counter, #4 @ counter -= 4\r | |
412 | \r | |
413 | add \src, \src, #4 @ src += 4\r | |
414 | \r | |
415 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
@ loop condition for the bhs below; the instructions in between leave the flags alone
416 | cmp \counter, #4\r | |
417 | \r | |
418 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
419 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
420 | \r | |
421 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
422 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
423 | \r | |
424 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
425 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
426 | \r | |
427 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
428 | \r | |
@ reg5 / reg6 = second / third output row, taken before dst is advanced
429 | add \reg5, \dst, \dststride\r | |
430 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r | |
431 | \r | |
432 | vmov.32 d16[0], \reg2\r | |
433 | add \reg6, \reg5, \dststride\r | |
434 | \r | |
435 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r | |
436 | \r | |
437 | vmov.32 d16[1], \reg4\r | |
438 | \r | |
439 | vmov d17, d16\r | |
440 | \r | |
441 | vmov d18, d16\r | |
442 | \r | |
443 | vst3.16 {d16,d17,d18}, [\dst]! @ dst[0-11] = d16-d18; dst += 3*2*4\r | |
444 | \r | |
445 | vst3.16 {d16,d17,d18}, [\reg5] @ dst1[0-11] = d16-d18\r | |
446 | \r | |
447 | vst3.16 {d16,d17,d18}, [\reg6] @ dst2[0-11] = d16-d18\r | |
448 | bhs 20b\r | |
449 | \r | |
450 | cmp \counter, #0\r | |
451 | beq 40f\r | |
452 | \r | |
453 | @ last 1-3 pixels\r | |
454 | 30:\r | |
455 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
@ reg2 / reg3 = address of the same pixel in the second / third output row
456 | add \reg2, \dst, \dststride\r | |
457 | \r | |
458 | add \reg3, \reg2, \dststride\r | |
459 | add \dst, \dst, #6 @ dst += 3*2\r | |
460 | \r | |
461 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
462 | subS \counter, \counter, #1 @ counter--\r | |
463 | \r | |
464 | strh \reg1, [\dst, #-6] @ dst[0] = reg1\r | |
465 | \r | |
466 | strh \reg1, [\dst, #-4] @ dst[1] = reg1\r | |
467 | \r | |
468 | strh \reg1, [\dst, #-2] @ dst[2] = reg1\r | |
469 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
470 | \r | |
471 | strh \reg1, [\reg2] @ dst1[0] = reg1\r | |
472 | \r | |
473 | str \reg1, [\reg2, #2] @ dst1[1-2] = reg1\r | |
474 | \r | |
475 | strh \reg1, [\reg3] @ dst2[0] = reg1\r | |
476 | \r | |
477 | str \reg1, [\reg3, #2] @ dst2[1-2] = reg1\r | |
478 | bne 30b\r | |
479 | \r | |
480 | 40:\r | |
481 | .endm\r | |
482 | \r | |
483 | .macro neon_normal4x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride\r | |
@ Converts one line of counter 8-bit pixels at src into 16-bit pixels and
@ writes every pixel as a 4x4 block: four times in each of the rows at
@ dst, dst + dststride, dst + 2 * dststride and dst + 3 * dststride
@ (dststride is added to the byte address). Every source byte indexes a
@ word in pal; the low halfword is stored.
@
@ Phases:
@   1:   1-3 single pixels until src is a multiple of 4
@   10:  16 pixels per pass through _neon_normalxx_8_16_line_middle
@   20:  4 pixels per pass
@   30:  final 1-3 single pixels
@   40:  end of the macro
@
@ src, dst and counter are updated as pixels are consumed. reg1-reg9,
@ q8-q15 (d16-d31) and the condition flags are overwritten.
@
@ NOTE(review): label 10 is reached without testing counter, so one
@ 16-pixel pass always runs; presumably callers guarantee that at least 16
@ pixels remain once src is aligned - confirm.
484 | @ align src to 4 bytes\r | |
485 | andS \reg5, \src, #3 @ reg5 = src & 3\r | |
486 | beq 10f\r | |
487 | \r | |
488 | @ first 1-3 pixels\r | |
489 | rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)\r | |
490 | 1:\r | |
491 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
@ reg2 / reg3 / reg4 = address of the same pixel in output rows 1 / 2 / 3
492 | add \reg2, \dst, \dststride\r | |
493 | \r | |
494 | add \reg3, \reg2, \dststride\r | |
495 | add \dst, \dst, #8 @ dst += 4*2\r | |
496 | \r | |
497 | sub \counter, \counter, #1 @ counter--\r | |
498 | \r | |
499 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
500 | add \reg4, \reg3, \dststride\r | |
501 | \r | |
502 | strh \reg1, [\dst, #-8] @ dst[0] = reg1\r | |
503 | subS \reg5, \reg5, #1 @ reg5--\r | |
504 | \r | |
505 | strh \reg1, [\dst, #-6] @ dst[1] = reg1\r | |
506 | \r | |
@ with the pixel in both halfwords, every remaining store covers two output pixels
507 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
508 | str \reg1, [\dst, #-4] @ dst[2-3] = reg1\r | |
509 | \r | |
510 | str \reg1, [\reg2] @ dst1[0-1] = reg1\r | |
511 | \r | |
512 | str \reg1, [\reg2, #4] @ dst1[2-3] = reg1\r | |
513 | \r | |
514 | str \reg1, [\reg3] @ dst2[0-1] = reg1\r | |
515 | \r | |
516 | str \reg1, [\reg3, #4] @ dst2[2-3] = reg1\r | |
517 | \r | |
518 | str \reg1, [\reg4] @ dst3[0-1] = reg1\r | |
519 | \r | |
520 | str \reg1, [\reg4, #4] @ dst3[2-3] = reg1\r | |
521 | bne 1b\r | |
522 | \r | |
523 | @ middle pixels (16 per iteration)\r | |
524 | 10:\r | |
525 | _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d24, d25\r | |
526 | \r | |
@ q8 = pixels 0-7, q12 = pixels 8-15; each gets three copies so that the
@ interleaving vst4 stores write every pixel four times in a row
527 | vmov q9, q8\r | |
528 | add \reg1, \dst, \dststride @ reg1 = dst + dststride\r | |
529 | \r | |
530 | vmov q10, q8\r | |
531 | add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride\r | |
532 | \r | |
533 | vmov q11, q8\r | |
534 | add \reg3, \reg1, \dststride,lsl #1 @ reg3 = dst + 3 * dststride\r | |
535 | \r | |
536 | vmov q13, q12\r | |
537 | vst4.16 {d16,d18,d20,d22}, [\dst]! @ dst[0-3] = q8-q11[0]; dst += 4*2*4\r | |
538 | \r | |
539 | vmov q14, q12\r | |
540 | \r | |
541 | vmov q15, q12\r | |
542 | vst4.16 {d17,d19,d21,d23}, [\dst]! @ dst[4-7] = q8-q11[1]; dst += 4*2*4\r | |
543 | \r | |
544 | vst4.16 {d24,d26,d28,d30}, [\dst]! @ dst[8-11] = q12-q15[0]; dst += 4*2*4\r | |
545 | \r | |
546 | vst4.16 {d25,d27,d29,d31}, [\dst]! @ dst[12-15] = q12-q15[1]; dst += 4*2*4\r | |
547 | \r | |
548 | vst4.16 {d16,d18,d20,d22}, [\reg1]! @ dst1[0-3] = q8-q11[0]; dst1 += 4*2*4\r | |
549 | \r | |
550 | vst4.16 {d17,d19,d21,d23}, [\reg1]! @ dst1[4-7] = q8-q11[1]; dst1 += 4*2*4\r | |
551 | \r | |
552 | vst4.16 {d24,d26,d28,d30}, [\reg1]! @ dst1[8-11] = q12-q15[0]; dst1 += 4*2*4\r | |
553 | \r | |
554 | vst4.16 {d25,d27,d29,d31}, [\reg1]! @ dst1[12-15] = q12-q15[1]; dst1 += 4*2*4\r | |
555 | \r | |
556 | vst4.16 {d16,d18,d20,d22}, [\reg2]! @ dst2[0-3] = q8-q11[0]; dst2 += 4*2*4\r | |
557 | \r | |
558 | vst4.16 {d17,d19,d21,d23}, [\reg2]! @ dst2[4-7] = q8-q11[1]; dst2 += 4*2*4\r | |
559 | \r | |
560 | vst4.16 {d24,d26,d28,d30}, [\reg2]! @ dst2[8-11] = q12-q15[0]; dst2 += 4*2*4\r | |
561 | \r | |
562 | vst4.16 {d25,d27,d29,d31}, [\reg2]! @ dst2[12-15] = q12-q15[1]; dst2 += 4*2*4\r | |
563 | \r | |
564 | vst4.16 {d16,d18,d20,d22}, [\reg3]! @ dst3[0-3] = q8-q11[0]; dst3 += 4*2*4\r | |
565 | \r | |
566 | vst4.16 {d17,d19,d21,d23}, [\reg3]! @ dst3[4-7] = q8-q11[1]; dst3 += 4*2*4\r | |
567 | \r | |
568 | vst4.16 {d24,d26,d28,d30}, [\reg3]! @ dst3[8-11] = q12-q15[0]; dst3 += 4*2*4\r | |
569 | \r | |
570 | vst4.16 {d25,d27,d29,d31}, [\reg3]! @ dst3[12-15] = q12-q15[1]; dst3 += 4*2*4\r | |
@ flags still come from the cmp that ends the middle macro
571 | bhs 10b\r | |
572 | \r | |
573 | @ last 0-15 bytes\r | |
574 | \r | |
575 | cmp \counter, #0\r | |
576 | beq 40f\r | |
577 | \r | |
578 | cmp \counter, #4\r | |
579 | blo 30f\r | |
580 | \r | |
581 | @ 4-12 pixels (4 per iteration)\r | |
582 | 20:\r | |
583 | ldr \reg1, [\src] @ reg1 = src[0-3]\r | |
584 | sub \counter, \counter, #4 @ counter -= 4\r | |
585 | \r | |
586 | add \src, \src, #4 @ src += 4\r | |
587 | \r | |
588 | ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]\r | |
@ loop condition for the bhs below; the instructions in between leave the flags alone
589 | cmp \counter, #4\r | |
590 | \r | |
591 | ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]\r | |
592 | ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]\r | |
593 | \r | |
594 | ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]\r | |
595 | ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]\r | |
596 | \r | |
597 | ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]\r | |
598 | lsr \reg1, \reg1, #24 @ reg1 = src[3]\r | |
599 | \r | |
600 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]\r | |
601 | \r | |
@ reg5 / reg6 / reg7 = output rows 1 / 2 / 3, taken before dst is advanced
602 | add \reg5, \dst, \dststride\r | |
603 | bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16\r | |
604 | \r | |
605 | vmov.32 d16[0], \reg2\r | |
606 | add \reg6, \reg5, \dststride\r | |
607 | \r | |
608 | bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16\r | |
609 | add \reg7, \reg6, \dststride\r | |
610 | \r | |
611 | vmov.32 d16[1], \reg4\r | |
612 | \r | |
613 | vmov d17, d16\r | |
614 | \r | |
615 | vmov d18, d16\r | |
616 | \r | |
617 | vmov d19, d16\r | |
618 | \r | |
619 | vst4.16 {d16,d17,d18,d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 4*2*4\r | |
620 | \r | |
621 | vst4.16 {d16,d17,d18,d19}, [\reg5] @ dst1[0-15] = d16-d19\r | |
622 | \r | |
623 | vst4.16 {d16,d17,d18,d19}, [\reg6] @ dst2[0-15] = d16-d19\r | |
624 | \r | |
625 | vst4.16 {d16,d17,d18,d19}, [\reg7] @ dst3[0-15] = d16-d19\r | |
626 | bhs 20b\r | |
627 | \r | |
628 | cmp \counter, #0\r | |
629 | beq 40f\r | |
630 | \r | |
631 | @ last 1-3 pixels\r | |
632 | 30:\r | |
633 | ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++\r | |
@ reg2 / reg3 / reg4 = address of the same pixel in output rows 1 / 2 / 3
634 | add \reg2, \dst, \dststride\r | |
635 | \r | |
636 | add \reg3, \reg2, \dststride\r | |
637 | add \dst, \dst, #8 @ dst += 4*2\r | |
638 | \r | |
639 | ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]\r | |
640 | add \reg4, \reg3, \dststride\r | |
641 | \r | |
642 | strh \reg1, [\dst, #-8] @ dst[0] = reg1\r | |
643 | subS \counter, \counter, #1 @ counter--\r | |
644 | \r | |
645 | strh \reg1, [\dst, #-6] @ dst[1] = reg1\r | |
646 | \r | |
647 | bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16\r | |
648 | str \reg1, [\dst, #-4] @ dst[2-3] = reg1\r | |
649 | \r | |
650 | str \reg1, [\reg2] @ dst1[0-1] = reg1\r | |
651 | \r | |
652 | str \reg1, [\reg2, #4] @ dst1[2-3] = reg1\r | |
653 | \r | |
654 | str \reg1, [\reg3] @ dst2[0-1] = reg1\r | |
655 | \r | |
656 | str \reg1, [\reg3, #4] @ dst2[2-3] = reg1\r | |
657 | \r | |
658 | str \reg1, [\reg4] @ dst3[0-1] = reg1\r | |
659 | \r | |
660 | str \reg1, [\reg4, #4] @ dst3[2-3] = reg1\r | |
661 | bne 30b\r | |
662 | \r | |
663 | 40:\r | |
664 | .endm\r | |
665 | \r |