tune the preloads a bit
[sdl_omap.git] / src / video / SDL_blit_neon.S
CommitLineData
a1f34081 1/*
a1eff5db 2 * (C) Gražvydas "notaz" Ignotas, 2011,2012
a1f34081 3 *
4 * This work is licensed under the terms of any of these licenses
5 * (at your option):
6 * - GNU GPL, version 2 or later.
7 * - GNU LGPL, version 2.1 or later.
8 * See the COPYING file in the top-level directory.
9 */
10
/*
 * Everything in this file is code.  On ARM the operand of .align is a
 * power of two, so ".align 2" requests 4-byte alignment.
 */
.text
.align 2

/*
 * func(name) - open an exported routine; used as "func(symbol):".
 *
 * Expands to ".global name", then (on ELF targets) marks the symbol as a
 * function, and finally leaves "name" in front of the caller-supplied
 * colon so that it becomes the label.
 *
 * The .type directive matters when the callers are built as Thumb code:
 * the linker only applies ARM/Thumb interworking fix-ups to symbols that
 * are typed as functions.  It is limited to ELF because other object
 * formats do not accept .type.
 */
#ifdef __ELF__
#define func(name) \
    .global name; \
    .type name, %function; \
    name
#else
#define func(name) \
    .global name; \
    name
#endif
17
@ do_argb bgr2rgb
@
@ Body of a routine with the C signature
@     void f(void *dst, const void *src, int count, uint abits)
@ i.e. r0 = dst, r1 = src, r2 = count in 4-byte pixels, r3 = abits.
@
@ Copies pixels from src to dst, replacing byte 3 of every pixel with the
@ low byte of abits.  If bgr2rgb is non-zero, bytes 0 and 2 of every pixel
@ are exchanged as well.
@
@ Pixels are handled 8 at a time.  A final group of 1..7 pixels is still
@ loaded as a full 8-pixel vector and handed to do_argb_finish, which
@ stores only the pixels that are really wanted.
@
@ Uses r12, d0 and d4-d7 as scratch; the slow path calls memcpy.
.macro do_argb bgr2rgb
    vdup.i8     d0, r3                  @ d0 = abits byte in all 8 lanes
0:
    cmp         r2, #8
    pld         [r1, #64*2]             @ pld leaves the flags alone
    blt         3f                      @ less than a full group left
1:
    vld4.8      {d4-d7}, [r1]!          @ de-interleave 8 pixels by byte
2:
.if \bgr2rgb
    vswp        d4, d6                  @ BGR->RGB
.endif
    vmov.i8     d7, d0                  @ byte 3 := abits
    subs        r2, r2, #8
    blt         do_argb_finish          @ partial group: store r2+8 pixels
    vst4.8      {d4-d7}, [r0]!
    bxeq        lr                      @ count was a multiple of 8: done
    nop
    b           0b

3:
    @ Fewer than 8 pixels remain.  Reading a whole 32-byte group anyway is
    @ harmless as long as the read stays inside the current 4 KiB page.
    add         r12, r1, #8*4
    lsr         r12, #12
    cmp         r12, r1, lsr #12        @ crossing page?
    beq         1b                      @ nope, overreading is safe

    @ The over-read would touch the next page.  Copy just the valid bytes
    @ into a stack buffer and load the vector from there instead.
    push        {r0-r2, lr}
    vpush       {d0}                    @ d0 is caller-saved under AAPCS and
                                        @ is still needed at 2: after memcpy
    sub         sp, #8*4                @ 32-byte bounce buffer
    mov         r0, sp
    lsl         r2, #2                  @ pixels -> bytes
    bl          memcpy                  @ memcpy(buffer, src, count * 4)
    vld4.8      {d4-d7}, [sp]!          @ load and release the buffer
    vpop        {d0}
    pop         {r0-r2, lr}
    b           2b
.endm
56
@ do_argb_alpha bgr2rgb global_alpha
@
@ Body of a routine with the C signature
@     void f(void *dst, const void *src, int count, uint global_alpha)
@ i.e. r0 = dst, r1 = src, r2 = count in 4-byte pixels, r3 = global_alpha.
@
@ Blends src over dst, 8 pixels per iteration.  For bytes 0..2 of every
@ pixel the result is
@     d = (((s - d) * a + 255) >> 8) + d
@ and byte 3 of the destination pixel is written back unchanged.
@
@   bgr2rgb      != 0: exchange bytes 0 and 2 of the source pixels first.
@   global_alpha != 0: a is the value passed in r3 for every pixel;
@                 == 0: a is byte 3 of each source pixel.
@
@ A final group of 1..7 pixels is completed by do_argb_finish.
@ Uses r3, r12, q0-q3 and q8-q12 as scratch; the slow path calls memcpy.
.macro do_argb_alpha bgr2rgb global_alpha
    mov         r12, #0xff
.if \global_alpha
    vdup.16     q11, r3                 @ q11 = a, identical in all lanes
.endif
    vdup.i16    q12, r12                @ q12 = 255, the rounding term

0:
    cmp         r2, #8
    blt         3f                      @ less than a full group left
1:
    vld4.8      {d4-d7}, [r1]!          @ source pixels, split by byte
    pld         [r1, #64*2]
    vld4.8      {d0-d3}, [r0]           @ destination pixels, split by byte
    pld         [r0, #64+32]
2:
.if \bgr2rgb
    vswp        d4, d6                  @ BGR->RGB
.endif
.if !\global_alpha
    vmovl.u8    q11, d7                 @ a = per-pixel source byte 3
.endif
    @ d = (((s-d)*a+255)>>8)+d
    vsubl.u8    q8,  d4, d0             @ s - d, widened to 16 bits
    vsubl.u8    q9,  d5, d1
    vsubl.u8    q10, d6, d2
    vmul.s16    q8,  q8,  q11           @ * a
    vmul.s16    q9,  q9,  q11
    vmul.s16    q10, q10, q11
    vaddhn.i16  d4, q8,  q12            @ (+ 255) >> 8, narrowed to bytes
    vaddhn.i16  d5, q9,  q12
    vaddhn.i16  d6, q10, q12
    vadd.i8     q2, q0                  @ + d for bytes 0 and 1 (d4, d5)
    vadd.i8     d6, d2                  @ + d for byte 2
    vmov.i8     d7, d3                  @ byte 3 comes from the destination
    subs        r2, r2, #8
    blt         do_argb_finish          @ partial group: store r2+8 pixels
    vst4.8      {d4-d7}, [r0]!
    bxeq        lr                      @ count was a multiple of 8: done
    nop
    b           0b

3:
    @ Fewer than 8 pixels remain.  Both buffers are still read as a whole
    @ 32-byte group, which is harmless while neither read leaves its
    @ current 4 KiB page.
    add         r3, r0, #8*4
    lsr         r3, #12
    add         r12, r1, #8*4
    lsr         r12, #12
    cmp         r3, r0, lsr #12         @ dst stays within its page?
    cmpeq       r12, r1, lsr #12        @ ... and src as well?
    beq         1b                      @ yes, overreading is safe

    @ Slow path: copy only the valid bytes of dst and src into two
    @ 32-byte stack buffers and load the vectors from those.
    @
    @ Stack layout after the three adjustments below:
    @   sp + 0   dst copy (32 bytes)
    @   sp + 32  src copy (32 bytes)
    @   sp + 64  q11, q12 (32 bytes)
    @   sp + 96  r0, r1, r2, lr
    push        {r0-r2, lr}
    vpush       {q11, q12}              @ keep a and 255 alive over memcpy
    sub         sp, #8*4*2
    mov         r1, r0
    mov         r0, sp
    lsl         r2, #2                  @ pixels -> bytes
    bl          memcpy                  @ memcpy(sp, dst, count * 4)
    ldr         r1, [sp, #8*4*2 + 16*2 + 4]     @ stacked r1 (src)
    ldr         r2, [sp, #8*4*2 + 16*2 + 8]     @ stacked r2 (count)
    lsl         r2, #2
    add         r0, sp, #8*4
    bl          memcpy                  @ memcpy(sp + 32, src, count * 4)
    vld4.8      {d0-d3}, [sp]!          @ destination copy
    vld4.8      {d4-d7}, [sp]!          @ source copy; buffers released
    vpop        {q11, q12}
    pop         {r0-r2, lr}
    b           2b
.endm
129
130
@ do_argb_finish - store the last, incomplete group of 32-bit pixels.
@
@ Shared tail of do_argb and do_argb_alpha; reached with a conditional
@ branch (not a call) right after "subs r2, r2, #8" went negative.
@
@ In:  r0    = destination pointer
@      r2    = remaining pixel count - 8
@      d4-d7 = 8 finished pixels, one byte position per register
@      lr    = return address of the exported routine
@
@ Stores min(remaining, 7) pixels, advancing r0, and returns through lr.
@ Nothing is stored when the remaining count is zero or negative, so a
@ call with count == 0 no longer writes a pixel past the destination.
do_argb_finish:
    add         r2, r2, #8              @ r2 = pixels still to store
    vzip.8      d4, d5                  @ RRR..|GGG.. -> RGRG..
    vzip.8      d6, d7                  @ BBB..|000.. -> B0B0..
    vzip.16     q2, q3                  @ q2 = pixels 0-3, q3 = pixels 4-7

    cmp         r2, #0
    bxle        lr
    vst1.32     d4[0], [r0]!
    cmp         r2, #1
    bxle        lr
    vst1.32     d4[1], [r0]!
    cmp         r2, #2
    bxle        lr
    vst1.32     d5[0], [r0]!
    cmp         r2, #3
    bxle        lr
    vst1.32     d5[1], [r0]!
    cmp         r2, #4
    bxle        lr
    vst1.32     d6[0], [r0]!
    cmp         r2, #5
    bxle        lr
    vst1.32     d6[1], [r0]!
    cmp         r2, #6
    bxle        lr
    vst1.32     d7[0], [r0]!
    bx          lr
157
158
@ do_argb_to_rgb565_alpha bgr2rgb global_alpha
@
@ Body of a routine with the C signature
@     void f(void *dst, const void *src, int count, uint global_alpha)
@ i.e. r0 = dst (2 bytes per pixel, RGB565), r1 = src (4 bytes per pixel),
@ r2 = count in pixels, r3 = global_alpha.
@
@ Blends src over dst, 8 pixels per iteration: the destination is expanded
@ to 8 bits per channel, every channel is blended with
@     d = (((s - d) * a + 255) >> 8) + d
@ and the result is packed back to RGB565.
@
@   bgr2rgb      != 0: exchange bytes 0 and 2 of the source pixels first.
@   global_alpha != 0: a is the value passed in r3 for every pixel;
@                 == 0: a is byte 3 of each source pixel.
@
@ A final group of 1..7 pixels is completed by do_rgb565_finish.  When
@ reading a whole group for that tail would cross a page boundary, the
@ valid bytes are first copied to the stack (same scheme as in
@ do_argb_alpha) instead of giving up and leaving the tail unblended.
@
@ Uses r3, r12, q0-q3 and q8-q12 as scratch; the slow path calls memcpy.
.macro do_argb_to_rgb565_alpha bgr2rgb global_alpha
    mov         r12, #0xff
.if \global_alpha
    vdup.16     q11, r3                 @ q11 = a, identical in all lanes
.endif
    vdup.i16    q12, r12                @ q12 = 255, the rounding term
0:
    cmp         r2, #8
    blt         3f                      @ less than a full group left
1:
    vld4.8      {d4-d7}, [r1]!          @ source pixels, split by byte
    pld         [r1, #64*2]
    vld2.8      {d1-d2}, [r0]           @ d1 = low bytes, d2 = high bytes
    pld         [r0, #64+32]
2:
.if \bgr2rgb
    vswp        d4, d6                  @ BGR->RGB
.endif
.if !\global_alpha
    vmovl.u8    q11, d7                 @ a = per-pixel source byte 3
.endif
    @ expand RGB565 to 8 bits per channel, replicating the top bits
    vshl.i8     d0, d1, #3
    vshr.u8     d1, d1, #3
    vsri.i8     d0, d0, #5              @ B
    vsli.i8     d1, d2, #5
    vsri.i8     d2, d2, #5              @ R
    vsri.i8     d1, d1, #6              @ G
    @ d = (((s-d)*a+255)>>8)+d
    vsubl.u8    q8,  d4, d0
    vsubl.u8    q9,  d5, d1
    vsubl.u8    q10, d6, d2
    vmul.s16    q8,  q8,  q11
    vmul.s16    q9,  q9,  q11
    vmul.s16    q10, q10, q11
    vaddhn.i16  d4, q8,  q12
    vaddhn.i16  d5, q9,  q12
    vaddhn.i16  d6, q10, q12
    vadd.i8     q2, q0                  @ d4 = B, d5 = G
    vadd.i8     d2, d6                  @ rrrr rrrr
    @ pack back to RGB565
    vshr.u8     d0, d5, #2
    vshr.u8     d1, d4, #3              @ 000b bbbb
    vsri.i8     d2, d5, #5              @ rrrr rggg
    vsli.i8     d1, d0, #5              @ gggb bbbb
    subs        r2, r2, #8
    blt         do_rgb565_finish        @ partial group: store r2+8 pixels
    vst2.8      {d1-d2}, [r0]!
    bxeq        lr                      @ count was a multiple of 8: done
    nop
    b           0b

3:
    @ Fewer than 8 pixels remain.  A whole group (16 bytes of dst, 32 bytes
    @ of src) is still read, which is harmless while neither read leaves
    @ its current 4 KiB page.
    add         r3, r0, #8*2
    add         r12, r1, #8*4
    lsr         r3, #12
    lsr         r12, #12
    cmp         r3, r0, lsr #12         @ are we crossing
    cmpeq       r12, r1, lsr #12        @ the page boundary?
    beq         1b                      @ nope, overreading is safe

    @ Slow path: copy only the valid bytes of dst and src into stack
    @ buffers and load the vectors from those.
    @
    @ Stack layout after the three adjustments below:
    @   sp + 0   dst copy (16 bytes)
    @   sp + 16  src copy (32 bytes)
    @   sp + 48  q11, q12 (32 bytes)
    @   sp + 80  r0, r1, r2, lr
    push        {r0-r2, lr}
    vpush       {q11, q12}              @ keep a and 255 alive over memcpy
    sub         sp, #8*2 + 8*4
    lsl         r2, #1                  @ pixels -> dst bytes
    mov         r1, r0
    mov         r0, sp
    bl          memcpy                  @ memcpy(sp, dst, count * 2)
    ldr         r2, [sp, #8*2 + 8*4 + 16*2 + 8] @ stacked r2 (count)
    add         r0, sp, #8*2
    ldr         r1, [sp, #8*2 + 8*4 + 16*2 + 4] @ stacked r1 (src)
    lsl         r2, #2                  @ pixels -> src bytes
    bl          memcpy                  @ memcpy(sp + 16, src, count * 4)
    vld2.8      {d1-d2}, [sp]!          @ destination copy
    vld4.8      {d4-d7}, [sp]!          @ source copy; buffers released
    vpop        {q11, q12}
    pop         {r0-r2, lr}
    b           2b
.endm
222
223
@ do_rgb565_finish - store the last, incomplete group of 16-bit pixels.
@
@ Tail of do_argb_to_rgb565_alpha; reached with a conditional branch
@ (not a call) right after "subs r2, r2, #8" went negative.
@
@ In:  r0 = destination pointer
@      r2 = remaining pixel count - 8
@      d1 = low bytes, d2 = high bytes of 8 finished pixels
@      lr = return address of the exported routine
@
@ Stores min(remaining, 7) pixels, advancing r0, and returns through lr.
@ Nothing is stored when the remaining count is zero or negative, so a
@ call with count == 0 no longer writes a pixel past the destination.
do_rgb565_finish:
    vzip.8      d1, d2                  @ d1 = pixels 0-3, d2 = pixels 4-7
    add         r2, r2, #8              @ r2 = pixels still to store

    cmp         r2, #0
    bxle        lr
    vst1.16     d1[0], [r0]!
    cmp         r2, #1
    bxle        lr
    vst1.16     d1[1], [r0]!
    cmp         r2, #2
    bxle        lr
    vst1.16     d1[2], [r0]!
    cmp         r2, #3
    bxle        lr
    vst1.16     d1[3], [r0]!
    cmp         r2, #4
    bxle        lr
    vst1.16     d2[0], [r0]!
    cmp         r2, #5
    bxle        lr
    vst1.16     d2[1], [r0]!
    cmp         r2, #6
    bxle        lr
    vst1.16     d2[2], [r0]!
    bx          lr
248
249
@ Exported entry points.  Each one is a single macro expansion; all of
@ them take r0 = dst, r1 = src, r2 = count and a fourth argument in r3.
@ Every expansion ends in an unconditional branch, so no routine falls
@ through into the next one.

@ Copy, forcing byte 3 of each pixel to the low byte of r3 (abits).
func(neon_ARGBtoXRGB):
    do_argb                 0

@ Same, with bytes 0 and 2 of each pixel exchanged.
func(neon_ABGRtoXRGB):
    do_argb                 1

@ Blend onto a 32-bit destination using per-pixel source alpha.
func(neon_ARGBtoXRGBalpha):
    do_argb_alpha           0, 0

func(neon_ABGRtoXRGBalpha):
    do_argb_alpha           1, 0

@ Blend onto a 32-bit destination using the global alpha passed in r3.
func(neon_ARGBtoXRGBalphaS):
    do_argb_alpha           0, 1

func(neon_ABGRtoXRGBalphaS):
    do_argb_alpha           1, 1

@ Blend onto an RGB565 destination using per-pixel source alpha.
func(neon_ARGBtoRGB565alpha):
    do_argb_to_rgb565_alpha 0, 0

func(neon_ABGRtoRGB565alpha):
    do_argb_to_rgb565_alpha 1, 0

@ vim:filetype=armasm