2 @@ Copyright (C) 2012 Roman Pauer
\r
4 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of
\r
5 @@ this software and associated documentation files (the "Software"), to deal in
\r
6 @@ the Software without restriction, including without limitation the rights to
\r
7 @@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
\r
8 @@ of the Software, and to permit persons to whom the Software is furnished to do
\r
9 @@ so, subject to the following conditions:
\r
11 @@ The above copyright notice and this permission notice shall be included in all
\r
12 @@ copies or substantial portions of the Software.
\r
14 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
\r
15 @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
\r
16 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
\r
17 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
\r
18 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
\r
19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
\r
/* Preprocessor guard: this branch is taken when the compiler reports a GCC
 * version older than 4.4. NOTE(review): the matching #else / #endif and the
 * code this branch selects are not visible in this excerpt. */
24 #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4)
\r
25 /* can't use because gas wants ',' before ':' */
\r
@ bgr1555_to_rgb565: shift/insert sequence applied to every 16-bit lane of two
@ NEON registers.
@   dr0, dr1 - inputs; both are overwritten (left shifted by 1)
@   t0, t1   - results of the visible sequence: bits 15:11 receive input bits
@              4:0 and bits 4:0 receive input bits 14:10
@   t2, ar   - not referenced by the instructions shown here
@ NOTE(review): nothing visible fills bits 10:5 of t0/t1 (the first shift
@ leaves them zero) and the original line numbering has gaps, so part of the
@ macro body is presumably missing from this excerpt - confirm against the
@ full file.
33 .macro bgr1555_to_rgb565 dr0 dr1 t0 t1 t2 ar
\r
@ move input bits 4:0 up to bits 15:11, clearing everything below
35 vshl.u16 \t0, \dr0, #11
\r
36 vshl.u16 \t1, \dr1, #11
\r
@ shift the inputs left by one so that input bits 14:10 sit in bits 15:11
37 vshl.u16 \dr0, \dr0, #1
\r
38 vshl.u16 \dr1, \dr1, #1
\r
@ insert those five bits into bits 4:0 of t0/t1, keeping bits 15:5 of t0/t1
40 vsri.u16 \t0, \dr0, #11
\r
41 vsri.u16 \t1, \dr1, #11
\r
@ _neon_normalxx_8_16_line_middle: look up 16 source bytes in a table and
@ pack the 16 resulting halfwords into NEON registers. Nothing is stored to
@ memory by the instructions shown here.
@   src       - source byte pointer; read as four words, then advanced by 16
@   pal       - table base; every lookup is a word load from pal + index * 4,
@               of which only the low 16 bits reach the packed result
@   counter   - decremented by 16
@   reg1-reg9 - scratch core registers, all overwritten
@   dA, dB    - D registers receiving pixels 8-11 and 12-15
@   dst, dststride - accepted but not referenced by the instructions shown
@ Pixels 0-3 always land in d16 and pixels 4-7 in d17.
@ The visible instructions do not update the condition flags.
@ Byte extraction, table loads and packing are interleaved; the statement
@ order is presumably deliberate scheduling and is left untouched.
46 .macro _neon_normalxx_8_16_line_middle src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride, dA, dB
\r
47 ldr \reg1, [\src] @ reg1 = src[0-3]
\r
49 ldr \reg2, [\src, #4] @ reg2 = src[4-7]
\r
51 ldr \reg3, [\src, #8] @ reg3 = src[8-11]
\r
53 ldr \reg4, [\src, #12] @ reg4 = src[12-15]
\r
54 ubfx \reg5, \reg1, #0, #8 @ reg5 = src[0]
\r
56 ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[0]]
\r
57 ubfx \reg6, \reg1, #8, #8 @ reg6 = src[1]
\r
59 ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[1]]
\r
60 ubfx \reg7, \reg1, #16, #8 @ reg7 = src[2]
\r
62 ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[2]]
\r
63 lsr \reg1, \reg1, #24 @ reg1 = src[3]
\r
65 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]
\r
66 ubfx \reg8, \reg2, #0, #8 @ reg8 = src[4]
\r
68 ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[4]]
\r
69 ubfx \reg9, \reg2, #8, #8 @ reg9 = src[5]
\r
71 ldr \reg9, [\pal, \reg9, lsl #2] @ reg9 = pal[src[5]]
\r
@ bfi puts the low halfword of the second operand into bits 31:16 of the first
72 bfi \reg5, \reg6, #16, #16 @ reg5 = pal[src[0]] | pal[src[1]] << 16
\r
74 bfi \reg7, \reg1, #16, #16 @ reg7 = pal[src[2]] | pal[src[3]] << 16
\r
75 ubfx \reg6, \reg2, #16, #8 @ reg6 = src[6]
\r
77 vmov d16, \reg5, \reg7 @ d16 = pal[src[0-3]]
\r
78 lsr \reg2, \reg2, #24 @ reg2 = src[7]
\r
80 ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[6]]
\r
81 bfi \reg8, \reg9, #16, #16 @ reg8 = pal[src[4]] | pal[src[5]] << 16
\r
83 ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[7]]
\r
84 ubfx \reg1, \reg3, #0, #8 @ reg1 = src[8]
\r
86 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[8]]
\r
87 ubfx \reg5, \reg3, #8, #8 @ reg5 = src[9]
\r
89 ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[9]]
\r
90 ubfx \reg7, \reg3, #16, #8 @ reg7 = src[10]
\r
92 ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[10]]
\r
93 bfi \reg6, \reg2, #16, #16 @ reg6 = pal[src[6]] | pal[src[7]] << 16
\r
95 vmov d17, \reg8, \reg6 @ d17 = pal[src[4-7]]
\r
96 lsr \reg3, \reg3, #24 @ reg3 = src[11]
\r
98 ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[11]]
\r
99 ubfx \reg2, \reg4, #0, #8 @ reg2 = src[12]
\r
101 ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[12]]
\r
102 ubfx \reg6, \reg4, #8, #8 @ reg6 = src[13]
\r
104 ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[13]]
\r
105 ubfx \reg8, \reg4, #16, #8 @ reg8 = src[14]
\r
107 ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[14]]
\r
108 lsr \reg4, \reg4, #24 @ reg4 = src[15]
\r
110 ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[15]]
\r
111 bfi \reg1, \reg5, #16, #16 @ reg1 = pal[src[8]] | pal[src[9]] << 16
\r
113 bfi \reg7, \reg3, #16, #16 @ reg7 = pal[src[10]] | pal[src[11]] << 16
\r
114 bfi \reg2, \reg6, #16, #16 @ reg2 = pal[src[12]] | pal[src[13]] << 16
\r
116 vmov \dA, \reg1, \reg7 @ dA = pal[src[8-11]]
\r
117 sub \counter, \counter, #16 @ counter -= 16
\r
119 bfi \reg8, \reg4, #16, #16 @ reg8 = pal[src[14]] | pal[src[15]] << 16
\r
120 add \src, \src, #16 @ src += 16
\r
122 vmov \dB, \reg2, \reg8 @ dB = pal[src[12-15]]
\r
@ neon_normal1x_8_16_line: convert 8-bit table indices to 16-bit pixels at 1x
@ (one output halfword per source byte).
@   src       - source byte pointer, advanced as bytes are consumed
@   dst       - destination halfword pointer, advanced by 2 bytes per pixel
@   pal       - table base, indexed as pal + index * 4; the low halfword of
@               each loaded entry is what gets stored
@   counter   - remaining pixel count, decremented as pixels are consumed
@   reg1-reg9 - scratch core registers
@ Uses d16-d19 in the 16-pixel section.
@ The macro has four sections: leading pixels up to a 4-byte source boundary,
@ 16 pixels at a time, 4 pixels at a time, and trailing pixels.
@ NOTE(review): no label, compare or branch lines are visible in this excerpt
@ (the original numbering has gaps), so the loop and skip logic that ties the
@ sections together is presumably missing here - confirm against the full file.
126 .macro neon_normal1x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9
\r
127 @ align src to 4 bytes
\r
128 andS \reg5, \src, #3 @ reg5 = src & 3
\r
@ leading pixels: one word is loaded from the current src and its first
@ 4 - (src & 3) bytes are converted; the first pixel is unconditional, the
@ second and third are predicated on the flags from the reg5 countdown
132 ldr \reg1, [\src] @ reg1 = src[0-3]
\r
133 rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)
\r
135 add \src, \src, \reg5 @ src += reg5
\r
136 sub \counter, \counter, \reg5 @ counter -= reg5
\r
138 subS \reg5, \reg5, #1 @ reg5--
\r
140 ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]
\r
141 ubfxne \reg3, \reg1, #8, #8 @ reg3 = src[1]
\r
143 ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[reg2]
\r
145 ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[reg3]
\r
147 strh \reg2, [\dst] @ dst[0] = reg2
\r
149 strneh \reg3, [\dst, #2]! @ dst[1] = reg3; dst++
\r
150 subneS \reg5, \reg5, #1 @ reg5--
\r
152 ubfxne \reg4, \reg1, #16, #8 @ reg4 = src[2]
\r
153 add \dst, \dst, #2 @ dst++
\r
155 ldrne \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[reg4]
\r
157 strneh \reg4, [\dst], #2 @ dst[2] = reg4; dst++
\r
159 @ middle pixels (16 per iteration)
\r
@ dststride is passed empty; the helper leaves pixels 0-7 in d16-d17 and
@ pixels 8-15 in d18-d19
161 _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, , d18, d19
\r
163 vst1.16 {d16-d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 2*16
\r
174 @ 4-12 pixels (4 per iteration)
\r
176 ldr \reg1, [\src] @ reg1 = src[0-3]
\r
177 sub \counter, \counter, #4 @ counter -= 4
\r
179 add \src, \src, #4 @ src += 4
\r
@ dst is advanced first; the four stores below use negative offsets
180 add \dst, \dst, #(2*4) @ dst += 2*4
\r
182 ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]
\r
185 ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]
\r
186 ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]
\r
188 ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]
\r
189 ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]
\r
191 ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]
\r
192 lsr \reg1, \reg1, #24 @ reg1 = src[3]
\r
194 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]
\r
196 strh \reg2, [\dst, #-8] @ dst[0] = reg2
\r
198 strh \reg3, [\dst, #-6] @ dst[1] = reg3
\r
200 strh \reg4, [\dst, #-4] @ dst[2] = reg4
\r
202 strh \reg1, [\dst, #-2] @ dst[3] = reg1
\r
@ trailing pixels: the first is unconditional, the second and third are
@ predicated on the flags from the counter countdown
210 ldrb \reg1, [\src] @ reg1 = src[0]
\r
211 subS \counter, \counter, #1 @ counter--
\r
213 ldrneb \reg2, [\src, #1]! @ reg2 = src[1]; src++
\r
215 add \src, \src, #1 @ src++
\r
217 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
\r
219 ldrne \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[1]]
\r
221 strh \reg1, [\dst] @ dst[0] = reg1
\r
223 strneh \reg2, [\dst, #2]! @ dst[1] = reg2; dst++
\r
224 subneS \counter, \counter, #1 @ counter--
\r
226 ldrneb \reg3, [\src], #1 @ reg3 = src[2]; src++
\r
227 add \dst, \dst, #2 @ dst++
\r
229 ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[2]]
\r
231 strneh \reg3, [\dst], #2 @ dst[2] = reg3; dst++
\r
@ neon_normal2x_8_16_line: convert 8-bit table indices to 16-bit pixels at 2x.
@ In the single-pixel sections each looked-up halfword is written twice
@ side by side on two rows: dst and dst + dststride.
@   src        - source byte pointer, advanced as bytes are consumed
@   dst        - first destination row pointer, advanced by 4 bytes per pixel
@   pal        - table base, indexed as pal + index * 4
@   counter    - remaining pixel count, decremented as pixels are consumed
@   reg1-reg9  - scratch core registers
@   dststride  - byte offset added to dst to reach the second row
@ Uses q8-q11 in the 16-pixel section and d16-d17 in the 4-pixel section.
@ NOTE(review): no label, compare or branch lines are visible in this excerpt
@ (the original numbering has gaps), so the loop control is presumably
@ missing here - confirm against the full file.
236 .macro neon_normal2x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
\r
237 @ align src to 4 bytes
\r
238 andS \reg5, \src, #3 @ reg5 = src & 3
\r
242 rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)
\r
@ one leading pixel; the flags set by the reg5 countdown are presumably
@ consumed by a loop branch that is not visible here
244 ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
\r
245 add \reg2, \dst, \dststride
\r
247 add \dst, \dst, #4 @ dst += 2*2
\r
248 sub \counter, \counter, #1 @ counter--
\r
250 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
\r
251 subS \reg5, \reg5, #1 @ reg5--
\r
253 strh \reg1, [\dst, #-4] @ dst[0] = reg1
\r
255 strh \reg1, [\dst, #-2] @ dst[1] = reg1
\r
257 strh \reg1, [\reg2] @ dst1[0] = reg1
\r
259 strh \reg1, [\reg2, #2] @ dst1[1] = reg1
\r
262 @ middle pixels (16 per iteration)
\r
@ the helper leaves pixels 0-7 in d16-d17 (q8) and pixels 8-15 in d20-d21 (q10)
264 _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d20, d21
\r
267 add \reg1, \dst, \dststride @ reg1 = dst + dststride
\r
@ vst2 interleaves the elements of its two register groups.
@ NOTE(review): q9 and q11 are stored below but are not written by any
@ visible instruction; presumably they are copies of q8 and q10 made on
@ lines missing from this excerpt - confirm.
270 vst2.16 {q8,q9}, [\dst]! @ dst[0-7] = q8-q9; dst += 2*2*8
\r
272 vst2.16 {q10,q11}, [\dst]! @ dst[8-15] = q10-q11; dst += 2*2*8
\r
274 vst2.16 {q8,q9}, [\reg1]! @ dst1[0-7] = q8-q9; dst1 += 2*2*8
\r
276 vst2.16 {q10,q11}, [\reg1]! @ dst1[8-15] = q10-q11; dst1 += 2*2*8
\r
287 @ 4-12 pixels (4 per iteration)
\r
289 ldr \reg1, [\src] @ reg1 = src[0-3]
\r
290 sub \counter, \counter, #4 @ counter -= 4
\r
292 add \src, \src, #4 @ src += 4
\r
294 ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]
\r
297 ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]
\r
298 ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]
\r
300 ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]
\r
301 ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]
\r
303 ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]
\r
304 lsr \reg1, \reg1, #24 @ reg1 = src[3]
\r
306 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]
\r
@ reg5 = second row pointer (dst + dststride)
308 add \reg5, \dst, \dststride
\r
309 bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16
\r
311 vmov.32 d16[0], \reg2
\r
313 bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16
\r
315 vmov.32 d16[1], \reg4
\r
@ NOTE(review): d17 is stored below but is not written by any visible
@ instruction; presumably a copy of d16 is made on a missing line - confirm.
@ Only the first store writes back, so only dst advances.
319 vst2.16 {d16,d17}, [\dst]! @ dst[0-7] = d16-d17; dst += 2*2*4
\r
321 vst2.16 {d16,d17}, [\reg5] @ dst1[0-7] = d16-d17
\r
@ one trailing pixel; the flags set by the counter countdown are presumably
@ consumed by a loop branch that is not visible here
329 ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
\r
330 add \reg2, \dst, \dststride
\r
332 add \dst, \dst, #4 @ dst += 2*2
\r
334 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
\r
335 subS \counter, \counter, #1 @ counter--
\r
337 strh \reg1, [\dst, #-4] @ dst[0] = reg1
\r
339 strh \reg1, [\dst, #-2] @ dst[1] = reg1
\r
341 strh \reg1, [\reg2] @ dst1[0] = reg1
\r
343 strh \reg1, [\reg2, #2] @ dst1[1] = reg1
\r
@ neon_normal3x_8_16_line: convert 8-bit table indices to 16-bit pixels at 3x.
@ In the single-pixel sections each looked-up halfword is written three times
@ side by side on three rows: dst, dst + dststride and dst + 2 * dststride.
@   src        - source byte pointer, advanced as bytes are consumed
@   dst        - first destination row pointer, advanced by 6 bytes per pixel
@   pal        - table base, indexed as pal + index * 4
@   counter    - remaining pixel count, decremented as pixels are consumed
@   reg1-reg9  - scratch core registers
@   dststride  - byte offset between consecutive destination rows
@ Uses d16-d27 in the 16-pixel section and d16-d18 in the 4-pixel section.
@ NOTE(review): no label, compare or branch lines are visible in this excerpt
@ (the original numbering has gaps), so the loop control is presumably
@ missing here - confirm against the full file.
349 .macro neon_normal3x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
\r
350 @ align src to 4 bytes
\r
351 andS \reg5, \src, #3 @ reg5 = src & 3
\r
355 rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)
\r
@ one leading pixel; reg2 and reg3 point at the second and third rows
357 ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
\r
358 add \reg2, \dst, \dststride
\r
360 add \reg3, \reg2, \dststride
\r
361 add \dst, \dst, #6 @ dst += 3*2
\r
363 sub \counter, \counter, #1 @ counter--
\r
365 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
\r
366 subS \reg5, \reg5, #1 @ reg5--
\r
368 strh \reg1, [\dst, #-6] @ dst[0] = reg1
\r
370 strh \reg1, [\dst, #-4] @ dst[1] = reg1
\r
372 strh \reg1, [\dst, #-2] @ dst[2] = reg1
\r
@ duplicate the low halfword into the high one so that a single word store
@ at offset 2 writes two copies of the pixel
373 bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16
\r
375 strh \reg1, [\reg2] @ dst1[0] = reg1
\r
377 str \reg1, [\reg2, #2] @ dst1[1-2] = reg1
\r
379 strh \reg1, [\reg3] @ dst2[0] = reg1
\r
381 str \reg1, [\reg3, #2] @ dst2[1-2] = reg1
\r
384 @ middle pixels (16 per iteration)
\r
@ the helper leaves pixels 0-7 in d16-d17 and pixels 8-15 in d22-d23
386 _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d22, d23
\r
389 add \reg1, \dst, \dststride @ reg1 = dst + dststride
\r
392 add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride
\r
@ vst3 interleaves the elements of its three registers.
@ NOTE(review): d18-d21 and d24-d27 are stored below but are not written by
@ any visible instruction; presumably they are copies of d16-d17 and d22-d23
@ made on lines missing from this excerpt - confirm.
395 vst3.16 {d16,d18,d20}, [\dst]! @ dst[0-3] = q8-q10[0]; dst += 3*2*4
\r
398 vst3.16 {d17,d19,d21}, [\dst]! @ dst[4-7] = q8-q10[1]; dst += 3*2*4
\r
400 vst3.16 {d22,d24,d26}, [\dst]! @ dst[8-11] = q11-q13[0]; dst += 3*2*4
\r
402 vst3.16 {d23,d25,d27}, [\dst]! @ dst[12-15] = q11-q13[1]; dst += 3*2*4
\r
404 vst3.16 {d16,d18,d20}, [\reg1]! @ dst1[0-3] = q8-q10[0]; dst1 += 3*2*4
\r
406 vst3.16 {d17,d19,d21}, [\reg1]! @ dst1[4-7] = q8-q10[1]; dst1 += 3*2*4
\r
408 vst3.16 {d22,d24,d26}, [\reg1]! @ dst1[8-11] = q11-q13[0]; dst1 += 3*2*4
\r
410 vst3.16 {d23,d25,d27}, [\reg1]! @ dst1[12-15] = q11-q13[1]; dst1 += 3*2*4
\r
412 vst3.16 {d16,d18,d20}, [\reg2]! @ dst2[0-3] = q8-q10[0]; dst2 += 3*2*4
\r
414 vst3.16 {d17,d19,d21}, [\reg2]! @ dst2[4-7] = q8-q10[1]; dst2 += 3*2*4
\r
416 vst3.16 {d22,d24,d26}, [\reg2]! @ dst2[8-11] = q11-q13[0]; dst2 += 3*2*4
\r
418 vst3.16 {d23,d25,d27}, [\reg2]! @ dst2[12-15] = q11-q13[1]; dst2 += 3*2*4
\r
429 @ 4-12 pixels (4 per iteration)
\r
431 ldr \reg1, [\src] @ reg1 = src[0-3]
\r
432 sub \counter, \counter, #4 @ counter -= 4
\r
434 add \src, \src, #4 @ src += 4
\r
436 ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]
\r
439 ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]
\r
440 ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]
\r
442 ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]
\r
443 ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]
\r
445 ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]
\r
446 lsr \reg1, \reg1, #24 @ reg1 = src[3]
\r
448 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]
\r
@ reg5 and reg6 = second and third row pointers
450 add \reg5, \dst, \dststride
\r
451 bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16
\r
453 vmov.32 d16[0], \reg2
\r
454 add \reg6, \reg5, \dststride
\r
456 bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16
\r
458 vmov.32 d16[1], \reg4
\r
@ NOTE(review): d17 and d18 are stored below but are not written by any
@ visible instruction; presumably copies of d16 are made on missing lines -
@ confirm. Only the first store writes back, so only dst advances.
464 vst3.16 {d16,d17,d18}, [\dst]! @ dst[0-11] = d16-d18; dst += 3*2*4
\r
466 vst3.16 {d16,d17,d18}, [\reg5] @ dst1[0-11] = d16-d18
\r
468 vst3.16 {d16,d17,d18}, [\reg6] @ dst2[0-11] = d16-d18
\r
@ one trailing pixel, written the same way as the leading pixel above
476 ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
\r
477 add \reg2, \dst, \dststride
\r
479 add \reg3, \reg2, \dststride
\r
480 add \dst, \dst, #6 @ dst += 3*2
\r
482 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
\r
483 subS \counter, \counter, #1 @ counter--
\r
485 strh \reg1, [\dst, #-6] @ dst[0] = reg1
\r
487 strh \reg1, [\dst, #-4] @ dst[1] = reg1
\r
489 strh \reg1, [\dst, #-2] @ dst[2] = reg1
\r
490 bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16
\r
492 strh \reg1, [\reg2] @ dst1[0] = reg1
\r
494 str \reg1, [\reg2, #2] @ dst1[1-2] = reg1
\r
496 strh \reg1, [\reg3] @ dst2[0] = reg1
\r
498 str \reg1, [\reg3, #2] @ dst2[1-2] = reg1
\r
@ neon_normal4x_8_16_line: convert 8-bit table indices to 16-bit pixels at 4x.
@ In the single-pixel sections each looked-up halfword is written four times
@ side by side on four rows: dst, dst + dststride, dst + 2 * dststride and
@ dst + 3 * dststride.
@   src        - source byte pointer, advanced as bytes are consumed
@   dst        - first destination row pointer, advanced by 8 bytes per pixel
@   pal        - table base, indexed as pal + index * 4
@   counter    - remaining pixel count, decremented as pixels are consumed
@   reg1-reg9  - scratch core registers
@   dststride  - byte offset between consecutive destination rows
@ Uses d16-d31 in the 16-pixel section and d16-d19 in the 4-pixel section.
@ NOTE(review): no label, compare, branch or macro-end lines are visible in
@ this excerpt (the original numbering has gaps), so the loop control is
@ presumably missing here - confirm against the full file.
504 .macro neon_normal4x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
\r
505 @ align src to 4 bytes
\r
506 andS \reg5, \src, #3 @ reg5 = src & 3
\r
510 rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)
\r
@ one leading pixel; reg2, reg3 and reg4 point at rows two to four
512 ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
\r
513 add \reg2, \dst, \dststride
\r
515 add \reg3, \reg2, \dststride
\r
516 add \dst, \dst, #8 @ dst += 4*2
\r
518 sub \counter, \counter, #1 @ counter--
\r
520 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
\r
521 add \reg4, \reg3, \dststride
\r
523 strh \reg1, [\dst, #-8] @ dst[0] = reg1
\r
524 subS \reg5, \reg5, #1 @ reg5--
\r
526 strh \reg1, [\dst, #-6] @ dst[1] = reg1
\r
@ duplicate the low halfword into the high one so that each word store
@ below writes two copies of the pixel
528 bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16
\r
529 str \reg1, [\dst, #-4] @ dst[2-3] = reg1
\r
531 str \reg1, [\reg2] @ dst1[0-1] = reg1
\r
533 str \reg1, [\reg2, #4] @ dst1[2-3] = reg1
\r
535 str \reg1, [\reg3] @ dst2[0-1] = reg1
\r
537 str \reg1, [\reg3, #4] @ dst2[2-3] = reg1
\r
539 str \reg1, [\reg4] @ dst3[0-1] = reg1
\r
541 str \reg1, [\reg4, #4] @ dst3[2-3] = reg1
\r
544 @ middle pixels (16 per iteration)
\r
@ the helper leaves pixels 0-7 in d16-d17 and pixels 8-15 in d24-d25
546 _neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d24, d25
\r
549 add \reg1, \dst, \dststride @ reg1 = dst + dststride
\r
552 add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride
\r
555 add \reg3, \reg1, \dststride,lsl #1 @ reg3 = dst + 3 * dststride
\r
@ vst4 interleaves the elements of its four registers.
@ NOTE(review): d18-d23 and d26-d31 are stored below but are not written by
@ any visible instruction; presumably they are copies of d16-d17 and d24-d25
@ made on lines missing from this excerpt - confirm.
558 vst4.16 {d16,d18,d20,d22}, [\dst]! @ dst[0-3] = q8-q11[0]; dst += 4*2*4
\r
563 vst4.16 {d17,d19,d21,d23}, [\dst]! @ dst[4-7] = q8-q11[1]; dst += 4*2*4
\r
565 vst4.16 {d24,d26,d28,d30}, [\dst]! @ dst[8-11] = q12-q15[0]; dst += 4*2*4
\r
567 vst4.16 {d25,d27,d29,d31}, [\dst]! @ dst[12-15] = q12-q15[1]; dst += 4*2*4
\r
569 vst4.16 {d16,d18,d20,d22}, [\reg1]! @ dst1[0-3] = q8-q11[0]; dst1 += 4*2*4
\r
571 vst4.16 {d17,d19,d21,d23}, [\reg1]! @ dst1[4-7] = q8-q11[1]; dst1 += 4*2*4
\r
573 vst4.16 {d24,d26,d28,d30}, [\reg1]! @ dst1[8-11] = q12-q15[0]; dst1 += 4*2*4
\r
575 vst4.16 {d25,d27,d29,d31}, [\reg1]! @ dst1[12-15] = q12-q15[1]; dst1 += 4*2*4
\r
577 vst4.16 {d16,d18,d20,d22}, [\reg2]! @ dst2[0-3] = q8-q11[0]; dst2 += 4*2*4
\r
579 vst4.16 {d17,d19,d21,d23}, [\reg2]! @ dst2[4-7] = q8-q11[1]; dst2 += 4*2*4
\r
581 vst4.16 {d24,d26,d28,d30}, [\reg2]! @ dst2[8-11] = q12-q15[0]; dst2 += 4*2*4
\r
583 vst4.16 {d25,d27,d29,d31}, [\reg2]! @ dst2[12-15] = q12-q15[1]; dst2 += 4*2*4
\r
585 vst4.16 {d16,d18,d20,d22}, [\reg3]! @ dst3[0-3] = q8-q11[0]; dst3 += 4*2*4
\r
587 vst4.16 {d17,d19,d21,d23}, [\reg3]! @ dst3[4-7] = q8-q11[1]; dst3 += 4*2*4
\r
589 vst4.16 {d24,d26,d28,d30}, [\reg3]! @ dst3[8-11] = q12-q15[0]; dst3 += 4*2*4
\r
591 vst4.16 {d25,d27,d29,d31}, [\reg3]! @ dst3[12-15] = q12-q15[1]; dst3 += 4*2*4
\r
602 @ 4-12 pixels (4 per iteration)
\r
604 ldr \reg1, [\src] @ reg1 = src[0-3]
\r
605 sub \counter, \counter, #4 @ counter -= 4
\r
607 add \src, \src, #4 @ src += 4
\r
609 ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]
\r
612 ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]
\r
613 ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]
\r
615 ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]
\r
616 ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]
\r
618 ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]
\r
619 lsr \reg1, \reg1, #24 @ reg1 = src[3]
\r
621 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]
\r
@ reg5, reg6 and reg7 = pointers to rows two to four
623 add \reg5, \dst, \dststride
\r
624 bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16
\r
626 vmov.32 d16[0], \reg2
\r
627 add \reg6, \reg5, \dststride
\r
629 bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16
\r
630 add \reg7, \reg6, \dststride
\r
632 vmov.32 d16[1], \reg4
\r
@ NOTE(review): d17-d19 are stored below but are not written by any visible
@ instruction; presumably copies of d16 are made on missing lines - confirm.
@ Only the first store writes back, so only dst advances.
640 vst4.16 {d16,d17,d18,d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 4*2*4
\r
642 vst4.16 {d16,d17,d18,d19}, [\reg5] @ dst1[0-15] = d16-d19
\r
644 vst4.16 {d16,d17,d18,d19}, [\reg6] @ dst2[0-15] = d16-d19
\r
646 vst4.16 {d16,d17,d18,d19}, [\reg7] @ dst3[0-15] = d16-d19
\r
@ one trailing pixel, written the same way as the leading pixel above
654 ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
\r
655 add \reg2, \dst, \dststride
\r
657 add \reg3, \reg2, \dststride
\r
658 add \dst, \dst, #8 @ dst += 4*2
\r
660 ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
\r
661 add \reg4, \reg3, \dststride
\r
663 strh \reg1, [\dst, #-8] @ dst[0] = reg1
\r
664 subS \counter, \counter, #1 @ counter--
\r
666 strh \reg1, [\dst, #-6] @ dst[1] = reg1
\r
668 bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16
\r
669 str \reg1, [\dst, #-4] @ dst[2-3] = reg1
\r
671 str \reg1, [\reg2] @ dst1[0-1] = reg1
\r
673 str \reg1, [\reg2, #4] @ dst1[2-3] = reg1
\r
675 str \reg1, [\reg3] @ dst2[0-1] = reg1
\r
677 str \reg1, [\reg3, #4] @ dst2[2-3] = reg1
\r
679 str \reg1, [\reg4] @ dst3[0-1] = reg1
\r
681 str \reg1, [\reg4, #4] @ dst3[2-3] = reg1
\r