int height = info->d_height; \
Uint8 *src = info->s_pixels; \
Uint8 *dst = info->d_pixels; \
- int srcskip = info->s_skip; \
- int dstskip = info->d_skip; \
+ int dstBpp = info->dst->BytesPerPixel; \
+ int srcstride = width * 4 + info->s_skip; \
+ int dststride = width * dstBpp + info->d_skip; \
\
while ( height-- ) { \
- neon_name(dst, src, width); \
- src += width * 4 + srcskip; \
- dst += width * 4 + dstskip; \
+ neon_name(dst, src, width); \
+ src += srcstride; \
+ dst += dststride; \
} \
}
make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
+make_neon_caller(BlitABGRtoRGB565alpha_neon, neon_ABGRtoRGB565alpha) /* wraps the asm entry built with bgr2rgb=1, global_alpha=0 */
+make_neon_caller(BlitARGBtoRGB565alpha_neon, neon_ARGBtoRGB565alpha) /* wraps the asm entry built with bgr2rgb=0, global_alpha=0 */
make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)
df->Bmask == 0x1f && SDL_HasAltiVec())
return Blit32to565PixelAlphaAltivec;
else
+#endif
+#ifdef __ARM_NEON__
+ if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
+ && sf->Gmask == 0xff00 && df->Gmask == 0x7e0) {
+ if((sf->Bmask >> 3) == df->Bmask || (sf->Rmask >> 3) == df->Rmask)
+ return BlitARGBtoRGB565alpha_neon;
+ else
+ return BlitABGRtoRGB565alpha_neon;
+ }
+ else
#endif
if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
&& sf->Gmask == 0xff00
.text
.align 2
+#define func(name) \
+ .global name; \
+ name
+@ The func macro above emits a .global directive for its argument followed by the bare name. The colon written after each use turns that name into the label.
@ void *dst, const void *src, int count, uint abits
.macro do_argb bgr2rgb
vdup.i8 d0, r3
bx lr
-.global neon_ARGBtoXRGB
-neon_ARGBtoXRGB:
+@ Args: r0 = void *dst (2-byte pixels), r1 = const void *src (4-byte pixels), r2 = int count, r3 = uint global_alpha. Blends 8 pixels per pass.
+.macro do_argb_to_rgb565_alpha bgr2rgb global_alpha
+ mov r12, #0xff @ rounding term 255 of the blend formula below
+.if \global_alpha
+ vdup.16 q11, r3 @ q11 = 4th argument replicated into every 16-bit lane
+.endif
+ vdup.i16 q12, r12 @ q12 = 0x00ff in every 16-bit lane
+0:
+ pld [r1, #64*2] @ prefetch source 128 bytes ahead
+ pld [r0, #64*2] @ prefetch destination 128 bytes ahead
+ vld4.8 {d4-d7}, [r1]! @ 8 source pixels split by byte position, d7 = byte 3 (the alpha when global_alpha is 0)
+ vld2.8 {d1-d2}, [r0] @ 8 destination pixels: d1 = low bytes, d2 = high bytes (r0 not advanced yet)
+.if \bgr2rgb
+ vswp d4, d6 @ BGR->RGB
+.endif
+.if !\global_alpha
+ vmovl.u8 q11, d7 @ per-pixel alpha widened to 16 bits
+.endif
+ vshl.i8 d0, d1, #3 @ low 5 bits of the low byte moved to the top of d0
+ vshr.u8 d1, d1, #3 @ remaining 3 bits of the low byte now sit at bits 4..2
+ vsri.i8 d0, d0, #5 @ B, filled out to 8 bits by repeating its top 3 bits
+ vsli.i8 d1, d2, #5 @ low 3 bits of the high byte inserted above them: top 6 bits of d1 = 6-bit field
+ vsri.i8 d2, d2, #5 @ R, top 5 bits kept and top 3 bits repeated below
+ vsri.i8 d1, d1, #6 @ G, top 6 bits kept and top 2 bits repeated below
+ @ d = (((s-d)*a+255)>>8)+d
+ vsubl.u8 q8, d4, d0 @ s - d for the first channel, widened to 16 bits
+ vsubl.u8 q9, d5, d1 @ s - d for the second channel
+ vsubl.u8 q10,d6, d2 @ s - d for the third channel
+ vmul.s16 q8, q8, q11 @ times alpha
+ vmul.s16 q9, q9, q11
+ vmul.s16 q10,q10,q11
+ vaddhn.i16 d4, q8, q12 @ add 255 and keep the high byte of each lane
+ vaddhn.i16 d5, q9, q12
+ vaddhn.i16 d6, q10,q12
+ vadd.i8 q2, q0 @ q2 is d4:d5 and q0 is d0:d1, so this adds d back for the first two channels
+ vadd.i8 d2, d6 @ rrrr rrrr
+ vshr.u8 d0, d5, #2 @ 00gg gggg
+ vshr.u8 d1, d4, #3 @ 000b bbbb
+ vsri.i8 d2, d5, #5 @ rrrr rggg
+ vsli.i8 d1, d0, #5 @ gggb bbbb
+ subs r2, r2, #8 @ count -= 8, flags drive the three exits below
+ blt do_rgb565_finish @ fewer than 8 were left (a full 8 were still loaded above): partial store
+ vst2.8 {d1-d2}, [r0]! @ store 8 packed pixels and advance dst
+ bxeq lr @ count reached exactly zero (flags still come from subs)
+ nop @ NOTE(review): purpose not evident from this file, confirm before removing
+ b 0b @ more than 8 were left: next group
+.endm
+@ Tail reached from do_argb_to_rgb565_alpha when fewer than 8 pixels remained. Stores up to 7 of the 8 blended pixels and returns.
+@ In: r0 = dst, r2 = remaining count minus 8 (negative), d1 = low bytes and d2 = high bytes of the 8 packed pixels.
+do_rgb565_finish:
+ vzip.8 d1, d2 @ interleave low and high bytes: d1 = pixels 0-3, d2 = pixels 4-7
+ add r2, r2, #8 @ undo the subs done by the loop: r2 = pixels left to store
+
+ vst1.16 d1[0], [r0]! @ pixel 0 is written before any count check
+ cmp r2, #1
+ bxle lr @ done if at most 1 pixel was left
+ vst1.16 d1[1], [r0]!
+ cmp r2, #2
+ bxle lr
+ vst1.16 d1[2], [r0]!
+ cmp r2, #3
+ bxle lr
+ vst1.16 d1[3], [r0]!
+ cmp r2, #4
+ bxle lr
+ vst1.16 d2[0], [r0]! @ pixels 4 and up come from d2
+ cmp r2, #5
+ bxle lr
+ vst1.16 d2[1], [r0]!
+ cmp r2, #6
+ bxle lr
+ vst1.16 d2[2], [r0]! @ seventh pixel: the most this tail ever writes
+ bx lr
+
+
+func(neon_ARGBtoXRGB):
do_argb 0
-.global neon_ABGRtoXRGB
-neon_ABGRtoXRGB:
+func(neon_ABGRtoXRGB):
do_argb 1
-.global neon_ARGBtoXRGBalpha
-neon_ARGBtoXRGBalpha:
+func(neon_ARGBtoXRGBalpha):
do_argb_alpha 0, 0
-.global neon_ABGRtoXRGBalpha
-neon_ABGRtoXRGBalpha:
+func(neon_ABGRtoXRGBalpha):
do_argb_alpha 1, 0
-.global neon_ARGBtoXRGBalphaS
-neon_ARGBtoXRGBalphaS:
+func(neon_ARGBtoXRGBalphaS):
do_argb_alpha 0, 1
-.global neon_ABGRtoXRGBalphaS
-neon_ABGRtoXRGBalphaS:
+func(neon_ABGRtoXRGBalphaS):
do_argb_alpha 1, 1
+func(neon_ARGBtoRGB565alpha): @ bgr2rgb = 0 (no channel swap), global_alpha = 0 (alpha taken per pixel)
+ do_argb_to_rgb565_alpha 0, 0
+
+func(neon_ABGRtoRGB565alpha): @ bgr2rgb = 1 (source bytes 0 and 2 swapped first), global_alpha = 0 (alpha taken per pixel)
+ do_argb_to_rgb565_alpha 1, 0
+
@ vim:filetype=armasm