tune the preloads a bit
[sdl_omap.git] / src / video / SDL_blit_neon.S
... / ...
CommitLineData
/*
 * (C) Gražvydas "notaz" Ignotas, 2011,2012
 *
 * This work is licensed under the terms of any of these licenses
 * (at your option):
 *  - GNU GPL, version 2 or later.
 *  - GNU LGPL, version 2.1 or later.
 * See the COPYING file in the top-level directory.
 */
10
11.text
12.align 2
13
/*
 * func(name): declare `name` as a global symbol and emit the symbol name
 * itself, so that `func(name):` at an entry point expands to
 * `.global name; name:`.
 */
#define func(name) .global name; name
17
@ do_argb bgr2rgb
@
@ Copy 32bpp pixels from src to dst, overwriting byte 3 of every pixel
@ with a constant and optionally swapping byte lanes 0 and 2.
@
@ args: void *dst, const void *src, int count, uint abits
@   r0 = dst      (advanced as pixels are stored)
@   r1 = src      (advanced in the 8-pixel loop)
@   r2 = count    in pixels
@   r3 = abits    low byte is replicated into byte 3 of each output pixel
@ bgr2rgb: nonzero swaps byte lanes 0 and 2 (BGR->RGB)
@
@ Pixels are handled 8 at a time.  A final group of fewer than 8 is still
@ loaded as a full 8-pixel vector and handed to do_argb_finish, which
@ stores only the remaining pixels one by one.
@ count is expected to be >= 1: with count == 0 this code still falls into
@ do_argb_finish, which stores one pixel before checking the count.
@ Clobbers: r12, d0, d4-d7, flags (plus whatever memcpy clobbers when the
@ slow path is taken).
.macro do_argb bgr2rgb
    vdup.i8     d0, r3                  @ d0 = abits in every byte lane
0:
    cmp         r2, #8
    pld         [r1, #64*2]
    blt         3f                      @ fewer than 8 pixels left
1:
    vld4.8      {d4-d7}, [r1]!          @ 8 pixels, split into byte lanes
2:
.if \bgr2rgb
    vswp        d4, d6                  @ BGR->RGB
.endif
    vmov.i8     d7, d0                  @ byte 3 of each pixel = abits
    subs        r2, r2, #8
    blt         do_argb_finish          @ partial group: store per pixel
    vst4.8      {d4-d7}, [r0]!
    bxeq        lr                      @ that was the last full group
    nop
    b           0b

3:
    @ Tail of fewer than 8 pixels: the 32-byte load at 1: reads past the
    @ end of the source.  That is only safe when the over-read stays
    @ inside the same 4K page as r1.
    add         r12, r1, #8*4
    lsr         r12, #12
    cmp         r12, r1, lsr #12        @ crossing page?
    beq         1b                      @ nope, overreading is safe

    @ Slow path: copy the remaining count*4 source bytes into a 32-byte
    @ stack buffer and load the vector from there.
    @ d0 still holds the replicated abits and is used again at 2:, but
    @ the AAPCS lets memcpy clobber d0-d7, so it is saved across the
    @ call.  16 (push) + 8 (vpush) + 32 (buffer) keeps sp 8-byte aligned.
    push        {r0-r2, lr}
    vpush       {d0}
    sub         sp, #8*4
    mov         r0, sp
    lsl         r2, #2                  @ bytes = pixels * 4
    bl          memcpy
    vld4.8      {d4-d7}, [sp]!          @ writeback releases the buffer
    vpop        {d0}
    pop         {r0-r2, lr}
    b           2b
.endm
56
@ do_argb_alpha bgr2rgb global_alpha
@
@ Alpha-blend 32bpp source pixels onto a 32bpp destination.
@
@ args: void *dst, const void *src, int count, uint global_alpha
@   r0 = dst      (read, blended, written back and advanced)
@   r1 = src      (advanced in the 8-pixel loop)
@   r2 = count    in pixels
@   r3 = global_alpha (only read when the global_alpha macro arg is set)
@ bgr2rgb:      nonzero swaps source byte lanes 0 and 2 before blending
@ global_alpha: nonzero blends every pixel with r3; zero blends each
@               pixel with byte 3 of its own source pixel
@
@ Byte 3 of every destination pixel is kept unchanged.
@ A final group of fewer than 8 pixels is blended as a full vector and
@ stored pixel by pixel through do_argb_finish.
@ Clobbers: r3, r12, q0-q3, q8-q12, flags.
.macro do_argb_alpha bgr2rgb global_alpha
    mov         r12, #0xff
.if \global_alpha
    vdup.16     q11, r3                 @ q11 = alpha, same for all pixels
.endif
    vdup.i16    q12, r12                @ q12 = 255 rounding term

0:
    pld         [r1, #64*2]
    cmp         r2, #8
    pld         [r0, #64*2]
    blt         3f                      @ fewer than 8 pixels left
1:
    vld4.8      {d4-d7}, [r1]!          @ source, split into byte lanes
    vld4.8      {d0-d3}, [r0]           @ destination, same layout
2:
.if \bgr2rgb
    vswp        d4, d6                  @ BGR->RGB
.endif
.if !\global_alpha
    vmovl.u8    q11, d7                 @ q11 = per-pixel source byte 3
.endif
    @ d = (((s-d)*a+255)>>8)+d
    vsubl.u8    q8,  d4, d0
    vsubl.u8    q9,  d5, d1
    vsubl.u8    q10, d6, d2
    vmul.s16    q8,  q8,  q11
    vmul.s16    q9,  q9,  q11
    vmul.s16    q10, q10, q11
    vaddhn.i16  d4, q8,  q12            @ +255, keep the high byte
    vaddhn.i16  d5, q9,  q12
    vaddhn.i16  d6, q10, q12
    vadd.i8     q2, q0                  @ d4 += d0, d5 += d1
    vadd.i8     d6, d2
    vmov.i8     d7, d3                  @ keep destination byte 3
    subs        r2, r2, #8
    blt         do_argb_finish          @ partial group: store per pixel
    vst4.8      {d4-d7}, [r0]!
    bxeq        lr                      @ that was the last full group
    nop
    b           0b

3:
    @ Tail of fewer than 8 pixels: both 32-byte loads at 1: read past the
    @ end of their buffers.  That is only safe when neither over-read
    @ leaves the 4K page it starts in.
    add         r3,  r0, #8*4
    add         r12, r1, #8*4
    lsr         r3,  #12
    lsr         r12, #12
    cmp         r3,  r0, lsr #12        @ are we crossing
    cmpeq       r12, r1, lsr #12        @ the page boundary?
    beq         1b                      @ nope, overreading is safe

    @ Slow path: bounce both tails through a 64-byte stack buffer,
    @ destination bytes at sp+0 and source bytes at sp+32.
    @ q11/q12 are saved because they are live across the memcpy calls.
    push        {r0-r2, lr}
    vpush       {q11, q12}
    sub         sp, #8*4*2
    lsl         r2, #2                  @ bytes = pixels * 4
    mov         r1, r0
    mov         r0, sp
    bl          memcpy                  @ destination tail -> sp+0
    ldr         r2, [sp, #8*4*2 + 16*2 + 8] @ stacked r2
    add         r0, sp, #8*4
    ldr         r1, [sp, #8*4*2 + 16*2 + 4] @ stacked r1 (src)
    lsl         r2, #2
    bl          memcpy                  @ source tail -> sp+32
    vld4.8      {d0-d3}, [sp]!
    vld4.8      {d4-d7}, [sp]!          @ writebacks release the buffer
    vpop        {q11, q12}
    pop         {r0-r2, lr}
    b           2b
.endm
129
130
@ do_argb_finish: store the last 1..7 pixels of a 32bpp row.
@
@ Branched to (not called) from the do_argb / do_argb_alpha loops when
@ `subs r2, r2, #8` went negative.
@   r0    = dst, advanced by 4 bytes per stored pixel
@   r2    = remaining pixel count minus 8
@   d4-d7 = 8 output pixels, one byte lane per register
@ Returns to lr directly.  The first pixel is stored before any count
@ check, so at least one pixel is always written.
do_argb_finish:
    add         r2, r2, #8              @ r2 = pixels still to store
    vzip.8      d4, d5                  @ lanes 0|1 -> byte pairs 01 01 ..
    vzip.8      d6, d7                  @ lanes 2|3 -> byte pairs 23 23 ..
    vzip.16     q2, q3                  @ -> whole pixels, q2 = 0-3, q3 = 4-7

    vst1.32     d4[0], [r0]!            @ pixel 0
    cmp         r2, #1
    bxle        lr
    vst1.32     d4[1], [r0]!            @ pixel 1
    cmp         r2, #2
    bxle        lr
    vst1.32     d5[0], [r0]!            @ pixel 2
    cmp         r2, #3
    bxle        lr
    vst1.32     d5[1], [r0]!            @ pixel 3
    cmp         r2, #4
    bxle        lr
    vst1.32     d6[0], [r0]!            @ pixel 4
    cmp         r2, #5
    bxle        lr
    vst1.32     d6[1], [r0]!            @ pixel 5
    cmp         r2, #6
    bxle        lr
    vst1.32     d7[0], [r0]!            @ pixel 6
    bx          lr
157
158
@ do_argb_to_rgb565_alpha bgr2rgb global_alpha
@
@ Alpha-blend 32bpp source pixels onto a 16bpp RGB565 destination.
@
@ args: void *dst, const void *src, int count, uint global_alpha
@   r0 = dst      (2 bytes per pixel; read, blended, written, advanced)
@   r1 = src      (4 bytes per pixel; advanced in the 8-pixel loop)
@   r2 = count    in pixels
@   r3 = global_alpha (only read when the global_alpha macro arg is set)
@ bgr2rgb:      nonzero swaps source byte lanes 0 and 2 before blending
@ global_alpha: nonzero blends every pixel with r3; zero blends each
@               pixel with byte 3 of its own source pixel
@
@ The destination is expanded to 8 bits per channel, blended, and packed
@ back to 565.  A final group of fewer than 8 pixels is blended as a full
@ vector and stored pixel by pixel through do_rgb565_finish.
@ Clobbers: r3, r12, d0-d2, d4-d7, q8-q12, flags.
.macro do_argb_to_rgb565_alpha bgr2rgb global_alpha
    mov         r12, #0xff
.if \global_alpha
    vdup.16     q11, r3                 @ q11 = alpha, same for all pixels
.endif
    vdup.i16    q12, r12                @ q12 = 255 rounding term
0:
    pld         [r1, #64*2]
    cmp         r2, #8
    pld         [r0, #64*2]
    blt         3f                      @ fewer than 8 pixels left
1:
    vld4.8      {d4-d7}, [r1]!          @ source, split into byte lanes
    vld2.8      {d1-d2}, [r0]           @ d1 = low bytes, d2 = high bytes
2:
.if \bgr2rgb
    vswp        d4, d6                  @ BGR->RGB
.endif
.if !\global_alpha
    vmovl.u8    q11, d7                 @ q11 = per-pixel source byte 3
.endif
    @ expand destination 565 to 8 bits per channel
    vshl.i8     d0, d1, #3              @ bbbb b000
    vshr.u8     d1, d1, #3              @ 000g ggbb
    vsri.i8     d0, d0, #5              @ B
    vsli.i8     d1, d2, #5              @ gggg ggbb
    vsri.i8     d2, d2, #5              @ R
    vsri.i8     d1, d1, #6              @ G
    @ d = (((s-d)*a+255)>>8)+d
    vsubl.u8    q8,  d4, d0
    vsubl.u8    q9,  d5, d1
    vsubl.u8    q10, d6, d2
    vmul.s16    q8,  q8,  q11
    vmul.s16    q9,  q9,  q11
    vmul.s16    q10, q10, q11
    vaddhn.i16  d4, q8,  q12
    vaddhn.i16  d5, q9,  q12
    vaddhn.i16  d6, q10, q12
    vadd.i8     q2, q0                  @ d4 += d0, d5 += d1
    vadd.i8     d2, d6                  @ rrrr rrrr
    @ pack back to 565
    vshr.u8     d0, d5, #2
    vshr.u8     d1, d4, #3              @ 000b bbbb
    vsri.i8     d2, d5, #5              @ rrrr rggg
    vsli.i8     d1, d0, #5              @ gggb bbbb
    subs        r2, r2, #8
    blt         do_rgb565_finish        @ partial group: store per pixel
    vst2.8      {d1-d2}, [r0]!
    bxeq        lr                      @ that was the last full group
    nop
    b           0b

3:
    @ Tail of fewer than 8 pixels: the loads at 1: read 16 destination
    @ bytes and 32 source bytes, i.e. past the end of both buffers.  That
    @ is only safe when neither over-read leaves the 4K page it starts in.
    add         r3,  r0, #8*2
    add         r12, r1, #8*4
    lsr         r3,  #12
    lsr         r12, #12
    cmp         r3,  r0, lsr #12        @ are we crossing
    cmpeq       r12, r1, lsr #12        @ the page boundary?
    beq         1b                      @ nope, overreading is safe

    @ Slow path (previously the tail was simply dropped here): bounce both
    @ tails through a 48-byte stack buffer, destination bytes at sp+0 and
    @ source bytes at sp+16, then rejoin the blend at 2:.
    cmp         r2, #0
    bxle        lr                      @ nothing to blend
    @ q11/q12 are live across the memcpy calls, so they are saved.
    @ 16 (push) + 32 (vpush) + 48 (buffer) keeps sp 8-byte aligned.
    push        {r0-r2, lr}
    vpush       {q11, q12}
    sub         sp, #8*2 + 8*4
    lsl         r2, #1                  @ dst bytes = pixels * 2
    mov         r1, r0
    mov         r0, sp
    bl          memcpy                  @ destination tail -> sp+0
    ldr         r2, [sp, #8*2 + 8*4 + 16*2 + 8] @ stacked r2
    add         r0, sp, #8*2
    ldr         r1, [sp, #8*2 + 8*4 + 16*2 + 4] @ stacked r1 (src)
    lsl         r2, #2                  @ src bytes = pixels * 4
    bl          memcpy                  @ source tail -> sp+16
    vld2.8      {d1-d2}, [sp]!
    vld4.8      {d4-d7}, [sp]!          @ writebacks release the buffer
    vpop        {q11, q12}
    pop         {r0-r2, lr}
    b           2b
.endm
222
223
@ do_rgb565_finish: store the last 1..7 pixels of a 16bpp row.
@
@ Branched to (not called) from the do_argb_to_rgb565_alpha loop when
@ `subs r2, r2, #8` went negative.
@   r0 = dst, advanced by 2 bytes per stored pixel
@   r2 = remaining pixel count minus 8
@   d1 = low bytes, d2 = high bytes of 8 output pixels
@ Returns to lr directly.  The first pixel is stored before any count
@ check, so at least one pixel is always written.
do_rgb565_finish:
    vzip.8      d1, d2                  @ -> whole pixels, d1 = 0-3, d2 = 4-7
    add         r2, r2, #8              @ r2 = pixels still to store

    vst1.16     d1[0], [r0]!            @ pixel 0
    cmp         r2, #1
    bxle        lr
    vst1.16     d1[1], [r0]!            @ pixel 1
    cmp         r2, #2
    bxle        lr
    vst1.16     d1[2], [r0]!            @ pixel 2
    cmp         r2, #3
    bxle        lr
    vst1.16     d1[3], [r0]!            @ pixel 3
    cmp         r2, #4
    bxle        lr
    vst1.16     d2[0], [r0]!            @ pixel 4
    cmp         r2, #5
    bxle        lr
    vst1.16     d2[1], [r0]!            @ pixel 5
    cmp         r2, #6
    bxle        lr
    vst1.16     d2[2], [r0]!            @ pixel 6
    bx          lr
248
249
@ Exported entry points.  Each one is a global label followed directly by
@ one expansion of a blit macro; the macro arguments are noted per entry.

func(neon_ARGBtoXRGB):                  @ do_argb, bgr2rgb=0
    do_argb 0

func(neon_ABGRtoXRGB):                  @ do_argb, bgr2rgb=1
    do_argb 1

func(neon_ARGBtoXRGBalpha):             @ bgr2rgb=0, global_alpha=0
    do_argb_alpha 0, 0

func(neon_ABGRtoXRGBalpha):             @ bgr2rgb=1, global_alpha=0
    do_argb_alpha 1, 0

func(neon_ARGBtoXRGBalphaS):            @ bgr2rgb=0, global_alpha=1
    do_argb_alpha 0, 1

func(neon_ABGRtoXRGBalphaS):            @ bgr2rgb=1, global_alpha=1
    do_argb_alpha 1, 1

func(neon_ARGBtoRGB565alpha):           @ bgr2rgb=0, global_alpha=0
    do_argb_to_rgb565_alpha 0, 0

func(neon_ABGRtoRGB565alpha):           @ bgr2rgb=1, global_alpha=0
    do_argb_to_rgb565_alpha 1, 0
273
274@ vim:filetype=armasm