a1f34081 |
1 | /* |
a1eff5db |
2 | * (C) Gražvydas "notaz" Ignotas, 2011,2012 |
a1f34081 |
3 | * |
4 | * This work is licensed under the terms of any of these licenses |
5 | * (at your option): |
6 | * - GNU GPL, version 2 or later. |
7 | * - GNU LGPL, version 2.1 or later. |
8 | * See the COPYING file in the top-level directory. |
9 | */ |
10 | |
/* All routines in this file are emitted into the code section. */
.text
.align 2

/*
 * Declares an exported entry point.  Usage: write the macro call followed
 * by a colon, which turns the trailing symbol into a label.
 *
 * On ELF targets the symbol is additionally typed as a function so that
 * the linker can apply ARM/Thumb interworking to calls that reach it.
 * Other object formats do not accept the .type directive, so they keep
 * the plain export.
 */
#ifdef __ELF__
#define func(name) \
  .global name; \
  .type name, %function; \
  name
#else
#define func(name) \
  .global name; \
  name
#endif
17 | |
a1f34081 |
@ void *dst, const void *src, int count, uint abits
@
@ Copies count 32bit pixels from src to dst, replacing the 4th byte of
@ every pixel with the low byte of abits and, when bgr2rgb is nonzero,
@ exchanging the 1st and 3rd byte of every pixel.
@
@ in:  r0 = dst, r1 = src, r2 = count (pixels), r3 = abits
@ The macro body is a complete function: it returns through lr, either
@ directly or by branching to do_argb_finish for a partial last group.
@ Working registers: r12, d0, d4-d7 (plus whatever memcpy uses on the
@ slow path).
.macro do_argb bgr2rgb
    vdup.i8     d0, r3                  @ d0 = abits byte in all 8 lanes
0:
    cmp         r2, #8
    pld         [r1, #64*2]
    blt         3f                      @ fewer than 8 pixels left
1:
    vld4.8      {d4-d7}, [r1]!          @ 8 pixels, split into byte planes
2:
.if \bgr2rgb
    vswp        d4, d6                  @ BGR->RGB
.endif
    vmov.i8     d7, d0                  @ 4th byte plane := abits
    subs        r2, r2, #8
    blt         do_argb_finish          @ partial group, store r2+8 pixels
    vst4.8      {d4-d7}, [r0]!
    bxeq        lr                      @ exactly done
    nop
    b           0b

3:
    @ unaligned ending nastiness :(
    @ A full 32 byte load would read past the end of src.  That is only
    @ harmless while the read stays inside the current 4K page.
    add         r12, r1, #8*4
    lsr         r12, #12
    cmp         r12, r1, lsr #12        @ crossing page?
    beq         1b                      @ nope, overreading is safe

    @ Slow path: bounce the remaining pixels through a stack buffer.
    @ d0 is live here and is not preserved across calls, so it has to
    @ be saved around memcpy, otherwise the tail pixels could get a
    @ garbage 4th byte.
    push        {r0-r2,lr}
    vpush       {d0}
    sub         sp, #8*4
    mov         r0, sp
    lsl         r2, #2                  @ bytes = pixels * 4
    bl          memcpy
    vld4.8      {d4-d7}, [sp]!          @ also releases the bounce buffer
    vpop        {d0}
    pop         {r0-r2,lr}
    b           2b
.endm
56 | |
a1eff5db |
@ void *dst, const void *src, int count, uint global_alpha
@
@ Alpha-blends count 32bit source pixels over the pixels already in dst.
@   bgr2rgb      nonzero: exchange the 1st and 3rd source byte planes
@   global_alpha nonzero: the blend factor is r3 for every pixel,
@                zero:    the blend factor is the 4th byte of each
@                         source pixel
@ The 4th byte of every destination pixel is written back unchanged.
@
@ in:  r0 = dst, r1 = src, r2 = count (pixels), r3 = global alpha
@ Working registers: r3, r12, q0-q3, q8-q12.
@ The macro body is a complete function that returns through lr.
.macro do_argb_alpha bgr2rgb global_alpha
.if \global_alpha
    vdup.16     q11, r3                 @ constant blend factor
.endif
    mov         r12, #255
    vdup.16     q12, r12                @ rounding term for the narrowing add

10:
    cmp         r2, #8
    pld         [r1, #128]
    pld         [r0, #128]
    blt         40f                     @ fewer than 8 pixels left
20:
    vld4.8      {d4-d7}, [r1]!          @ source byte planes
    vld4.8      {d0-d3}, [r0]           @ destination byte planes
30:
.if \bgr2rgb
    vswp        d4, d6                  @ exchange 1st and 3rd plane
.endif
.ifeq \global_alpha
    vmovl.u8    q11, d7                 @ blend factor = 4th source byte
.endif
    @ per channel: d = (((s - d) * a + 255) >> 8) + d
    vsubl.u8    q8,  d4, d0
    vsubl.u8    q9,  d5, d1
    vsubl.u8    q10, d6, d2
    vmul.s16    q8,  q8,  q11
    vmul.s16    q9,  q9,  q11
    vmul.s16    q10, q10, q11
    vaddhn.i16  d4, q8,  q12
    vaddhn.i16  d5, q9,  q12
    vaddhn.i16  d6, q10, q12
    vadd.i8     q2, q2, q0              @ d4 += d0, d5 += d1
    vadd.i8     d6, d6, d2
    vmov        d7, d3                  @ keep the destination 4th byte
    subs        r2, r2, #8
    blt         do_argb_finish          @ partial group, store r2+8 pixels
    vst4.8      {d4-d7}, [r0]!
    bxeq        lr                      @ exactly done
    nop
    b           10b

40:
    @ Tail shorter than 8 pixels.  Loading a whole group reads beyond
    @ both buffers, which is tolerated only when neither 32 byte read
    @ leaves its current 4K page.
    add         r12, r1, #32
    add         r3,  r0, #32
    lsr         r12, r12, #12
    lsr         r3,  r3,  #12
    cmp         r3,  r0, lsr #12        @ dst stays in its page?
    cmpeq       r12, r1, lsr #12        @ ... and src too?
    beq         20b                     @ yes, just overread

    @ Otherwise copy both tails into a 64 byte stack buffer
    @ (dst tail at sp+0, src tail at sp+32) and load from there.
    stmfd       sp!, {r0-r2, lr}
    vpush       {d22-d25}               @ q11, q12 must survive memcpy
    sub         sp, sp, #64
    mov         r1, r0
    lsl         r2, r2, #2              @ bytes = pixels * 4
    mov         r0, sp
    bl          memcpy
    ldr         r1, [sp, #100]          @ stacked r1 (src)
    ldr         r2, [sp, #104]          @ stacked r2 (count)
    add         r0, sp, #32
    lsl         r2, r2, #2
    bl          memcpy
    vld4.8      {d0-d3}, [sp]!
    vld4.8      {d4-d7}, [sp]!          @ sp is now back at the saved q regs
    vpop        {d22-d25}
    ldmfd       sp!, {r0-r2, lr}
    b           30b
.endm
129 | |
130 | |
a1f34081 |
@ Shared tail for do_argb and do_argb_alpha.  It is reached by a branch,
@ not a call, and returns to the original caller through lr.
@
@ in:  r0      = dst
@      r2      = remaining pixel count minus 8 (negative)
@      d4-d7   = the four byte planes of a group of 8 pixels
@ Stores the first r2+8 pixels of the group (at most 7) to dst.
@ A remaining count of zero or less stores nothing.
do_argb_finish:
    add         r2, r2, #8              @ r2 = pixels still to store
    cmp         r2, #1
    bxlt        lr                      @ nothing to store (count <= 0)

    @ Re-interleave the byte planes into whole pixels:
    @ q2 = pixels 0-3, q3 = pixels 4-7
    vzip.8      d4, d5                  @ RRR..|GGG.. -> RGRG..
    vzip.8      d6, d7                  @ BBB..|000.. -> B0B0..
    vzip.16     q2, q3

    @ Store one pixel at a time and leave once the count is reached.
    vst1.32     {d4[0]}, [r0]!
    cmp         r2, #1
    bxle        lr
    vst1.32     {d4[1]}, [r0]!
    cmp         r2, #2
    bxle        lr
    vst1.32     {d5[0]}, [r0]!
    cmp         r2, #3
    bxle        lr
    vst1.32     {d5[1]}, [r0]!
    cmp         r2, #4
    bxle        lr
    vst1.32     {d6[0]}, [r0]!
    cmp         r2, #5
    bxle        lr
    vst1.32     {d6[1]}, [r0]!
    cmp         r2, #6
    bxle        lr
    vst1.32     {d7[0]}, [r0]!
    bx          lr
157 | |
158 | |
2c4e54dd |
@ void *dst, const void *src, int count, uint global_alpha
@
@ Alpha-blends count 32bit source pixels over the 16bit RGB565 pixels
@ already in dst.
@   bgr2rgb      nonzero: exchange the 1st and 3rd source byte planes
@   global_alpha nonzero: the blend factor is r3 for every pixel,
@                zero:    the blend factor is the 4th byte of each
@                         source pixel
@
@ in:  r0 = dst, r1 = src, r2 = count (pixels), r3 = global alpha
@ Working registers: r3, r12, q0-q3, q8-q12.
@ The macro body is a complete function that returns through lr.
.macro do_argb_to_rgb565_alpha bgr2rgb global_alpha
    mov         r12, #0xff
.if \global_alpha
    vdup.16     q11, r3                 @ constant blend factor
.endif
    vdup.i16    q12, r12                @ rounding term for the narrowing add
0:
    pld         [r1, #64*2]
    cmp         r2, #8
    pld         [r0, #64*2]
    blt         3f                      @ fewer than 8 pixels left
1:
    vld4.8      {d4-d7}, [r1]!          @ source byte planes
    vld2.8      {d1-d2}, [r0]           @ dst: d1 = low bytes, d2 = high bytes
2:
.if \bgr2rgb
    vswp        d4, d6                  @ BGR->RGB
.endif
.if !\global_alpha
    vmovl.u8    q11, d7                 @ blend factor = 4th source byte
.endif
    @ expand the 565 destination to 8 bits per channel
    vshl.i8     d0, d1, #3
    vshr.u8     d1, d1, #3
    vsri.i8     d0, d0, #5              @ B
    vsli.i8     d1, d2, #5
    vsri.i8     d2, d2, #5              @ R
    vsri.i8     d1, d1, #6              @ G
    @ d = (((s-d)*a+255)>>8)+d
    vsubl.u8    q8, d4, d0
    vsubl.u8    q9, d5, d1
    vsubl.u8    q10,d6, d2
    vmul.s16    q8, q8, q11
    vmul.s16    q9, q9, q11
    vmul.s16    q10,q10,q11
    vaddhn.i16  d4, q8, q12
    vaddhn.i16  d5, q9, q12
    vaddhn.i16  d6, q10,q12
    vadd.i8     q2, q0
    vadd.i8     d2, d6                  @ rrrr rrrr
    @ pack back to 565
    vshr.u8     d0, d5, #2
    vshr.u8     d1, d4, #3              @ 000b bbbb
    vsri.i8     d2, d5, #5              @ rrrr rggg
    vsli.i8     d1, d0, #5              @ gggb bbbb
    subs        r2, r2, #8
    blt         do_rgb565_finish        @ partial group, store r2+8 pixels
    vst2.8      {d1-d2}, [r0]!
    bxeq        lr                      @ exactly done
    nop
    b           0b

3:
    @ unaligned ending nastiness :(
    @ A full group load reads 16 bytes of dst and 32 bytes of src,
    @ which may run past the end of either buffer.
    add         r3, r0, #8*2
    add         r12, r1, #8*4
    lsr         r3, #12
    lsr         r12, #12
    cmp         r3, r0, lsr #12         @ are we crossing
    cmpeq       r12, r1, lsr #12        @ the page boundary?
    beq         1b                      @ nope, overreading is safe

    @ Slow path: instead of dropping the tail, bounce both tails through
    @ a stack buffer, the same way do_argb_alpha does.
    @ Layout: dst tail at sp+0 (16 bytes), src tail at sp+16 (32 bytes).
    push        {r0-r2, lr}
    vpush       {q11, q12}              @ must survive memcpy
    sub         sp, #8*2 + 8*4
    lsl         r2, #1                  @ dst bytes = pixels * 2
    mov         r1, r0
    mov         r0, sp
    bl          memcpy
    ldr         r2, [sp, #8*2 + 8*4 + 16*2 + 8] @ stacked r2
    add         r0, sp, #8*2
    ldr         r1, [sp, #8*2 + 8*4 + 16*2 + 4] @ stacked r1
    lsl         r2, #2                  @ src bytes = pixels * 4
    bl          memcpy
    vld2.8      {d1-d2}, [sp]!
    vld4.8      {d4-d7}, [sp]!          @ sp is now back at the saved q regs
    vpop        {q11, q12}
    pop         {r0-r2, lr}
    b           2b
.endm
222 | |
223 | |
@ Shared tail for do_argb_to_rgb565_alpha.  It is reached by a branch,
@ not a call, and returns to the original caller through lr.
@
@ in:  r0      = dst
@      r2      = remaining pixel count minus 8 (negative)
@      d1, d2  = low and high byte planes of a group of 8 packed pixels
@ Stores the first r2+8 pixels of the group (at most 7) to dst.
@ A remaining count of zero or less stores nothing.
do_rgb565_finish:
    add         r2, r2, #8              @ r2 = pixels still to store
    cmp         r2, #1
    bxlt        lr                      @ nothing to store (count <= 0)

    @ Interleave low/high bytes: d1 = pixels 0-3, d2 = pixels 4-7
    vzip.8      d1, d2

    @ Store one pixel at a time and leave once the count is reached.
    vst1.16     {d1[0]}, [r0]!
    cmp         r2, #1
    bxle        lr
    vst1.16     {d1[1]}, [r0]!
    cmp         r2, #2
    bxle        lr
    vst1.16     {d1[2]}, [r0]!
    cmp         r2, #3
    bxle        lr
    vst1.16     {d1[3]}, [r0]!
    cmp         r2, #4
    bxle        lr
    vst1.16     {d2[0]}, [r0]!
    cmp         r2, #5
    bxle        lr
    vst1.16     {d2[1]}, [r0]!
    cmp         r2, #6
    bxle        lr
    vst1.16     {d2[2]}, [r0]!
    bx          lr
248 | |
249 | |
@ Exported entry points.  Each one is a single instantiation of one of
@ the macros above, and every macro body ends in its own return or
@ backward branch, so no entry falls through into the next.

@ plain copy, 4th byte replaced by abits
func(neon_ARGBtoXRGB):
    do_argb bgr2rgb=0

@ same, with the 1st and 3rd byte of each pixel exchanged
func(neon_ABGRtoXRGB):
    do_argb bgr2rgb=1

@ blend over a 32bit destination, blend factor taken from each pixel
func(neon_ARGBtoXRGBalpha):
    do_argb_alpha bgr2rgb=0, global_alpha=0

func(neon_ABGRtoXRGBalpha):
    do_argb_alpha bgr2rgb=1, global_alpha=0

@ blend over a 32bit destination, one blend factor for all pixels
func(neon_ARGBtoXRGBalphaS):
    do_argb_alpha bgr2rgb=0, global_alpha=1

func(neon_ABGRtoXRGBalphaS):
    do_argb_alpha bgr2rgb=1, global_alpha=1

@ blend over a 16bit RGB565 destination, blend factor from each pixel
func(neon_ARGBtoRGB565alpha):
    do_argb_to_rgb565_alpha bgr2rgb=0, global_alpha=0

func(neon_ABGRtoRGB565alpha):
    do_argb_to_rgb565_alpha bgr2rgb=1, global_alpha=0
273 | |
a1f34081 |
274 | @ vim:filetype=armasm |