/* Functions to perform alpha blended blitting */
+#ifdef __ARM_NEON__
+
+/* NEON optimized blitter callers */
+/*
+ * make_neon_caller(name, neon_name)
+ *
+ * Declares the external row routine `neon_name(dst, src, count)` and defines
+ * a static blit function `name(SDL_BlitInfo *)` that calls it once per
+ * scanline.  Rows are advanced by (width * 4 + skip) bytes, i.e. the wrapper
+ * treats both surfaces as 4 bytes per pixel.
+ */
+#define make_neon_caller(name, neon_name) \
+extern void neon_name(void *dst, const void *src, int count); \
+static void name(SDL_BlitInfo *info) \
+{ \
+	const int row_pixels = info->d_width; \
+	const int src_stride = row_pixels * 4 + info->s_skip; \
+	const int dst_stride = row_pixels * 4 + info->d_skip; \
+	Uint8 *src_row = info->s_pixels; \
+	Uint8 *dst_row = info->d_pixels; \
+	int rows_left; \
+\
+	for ( rows_left = info->d_height; rows_left != 0; rows_left-- ) { \
+		neon_name(dst_row, src_row, row_pixels); \
+		src_row += src_stride; \
+		dst_row += dst_stride; \
+	} \
+}
+
+/*
+ * Instantiate the per-row wrappers: each line defines a static blit function
+ * (first argument) that forwards every scanline to the external NEON routine
+ * named by the second argument.
+ */
+make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
+make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
+
+#endif /* __ARM_NEON__ */
+
/* N->1 blending with per-surface alpha */
static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
{
if(!(surface->map->dst->flags & SDL_HWSURFACE)
&& SDL_HasAltiVec())
return BlitRGBtoRGBPixelAlphaAltivec;
+#endif
+#ifdef __ARM_NEON__
+ return BlitARGBtoXRGBalpha_neon;
#endif
return BlitRGBtoRGBPixelAlpha;
}
}
+#ifdef __ARM_NEON__
+ if (sf->Gmask == df->Gmask && sf->Amask == 0xff000000 &&
+ ((sf->Rmask == 0xff && df->Rmask == 0xff0000 && sf->Bmask == 0xff0000 && df->Bmask == 0xff) ||
+ (sf->Rmask == 0xff0000 && df->Rmask == 0xff && sf->Bmask == 0xff && df->Bmask == 0xff0000)))
+ {
+ return BlitABGRtoXRGBalpha_neon;
+ }
+#endif
#if SDL_ALTIVEC_BLITTERS
if (sf->Amask && sf->BytesPerPixel == 4 &&
!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
#endif /* SDL_HERMES_BLITTERS */
+#ifdef __ARM_NEON__
+
+/* NEON optimized blitter callers */
+/*
+ * make_neon_caller(name, neon_name)
+ *
+ * Declares the external row routine `neon_name(dst, src, count, abits)` and
+ * defines a static blit function `name(SDL_BlitInfo *)` that calls it once
+ * per scanline.  `abits` is 0xff when the destination format has an alpha
+ * mask and 0 otherwise.  Rows are advanced by (width * 4 + skip) bytes, i.e.
+ * the wrapper treats both surfaces as 4 bytes per pixel.
+ */
+#define make_neon_caller(name, neon_name) \
+extern void neon_name(void *dst, const void *src, int count, unsigned int abits); \
+static void name(SDL_BlitInfo *info) \
+{ \
+	const int row_pixels = info->d_width; \
+	const int src_stride = row_pixels * 4 + info->s_skip; \
+	const int dst_stride = row_pixels * 4 + info->d_skip; \
+	Uint8 *src_row = info->s_pixels; \
+	Uint8 *dst_row = info->d_pixels; \
+	unsigned int alpha_fill = 0; \
+	int rows_left; \
+\
+	if ( info->dst->Amask ) { \
+		alpha_fill = 0xff; \
+	} \
+\
+	for ( rows_left = info->d_height; rows_left != 0; rows_left-- ) { \
+		neon_name(dst_row, src_row, row_pixels, alpha_fill); \
+		src_row += src_stride; \
+		dst_row += dst_stride; \
+	} \
+}
+
+/*
+ * Instantiate the per-row wrappers: each line defines a static blit function
+ * (first argument) that forwards every scanline, plus the alpha fill byte,
+ * to the external NEON routine named by the second argument.
+ */
+make_neon_caller(BlitABGRtoXRGB_neon, neon_ABGRtoXRGB)
+make_neon_caller(BlitARGBtoXRGB_neon, neon_ARGBtoXRGB)
+
+#endif /* __ARM_NEON__ */
+
/* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */
#define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1])
0, NULL, Blit_RGB888_RGB565, NO_ALPHA },
{ 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x00007C00,0x000003E0,0x0000001F,
0, NULL, Blit_RGB888_RGB555, NO_ALPHA },
+#endif
+#ifdef __ARM_NEON__
+ { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x00FF0000,0x0000FF00,0x000000FF,
+ 0, NULL, BlitARGBtoXRGB_neon, NO_ALPHA | SET_ALPHA },
+ { 0x000000FF,0x0000FF00,0x00FF0000, 4, 0x00FF0000,0x0000FF00,0x000000FF,
+ 0, NULL, BlitABGRtoXRGB_neon, NO_ALPHA | SET_ALPHA },
+ /* RGB->BGR is same as BGR->RGB */
+ { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x000000FF,0x0000FF00,0x00FF0000,
+ 0, NULL, BlitABGRtoXRGB_neon, NO_ALPHA | SET_ALPHA },
#endif
/* Default for 32-bit RGB source, used if no other blitter matches */
{ 0,0,0, 0, 0,0,0, 0, NULL, BlitNtoN, 0 }
--- /dev/null
+/*
+ * (C) Gražvydas "notaz" Ignotas, 2011
+ *
+ * This work is licensed under the terms of any of these licenses
+ * (at your option):
+ * - GNU GPL, version 2 or later.
+ * - GNU LGPL, version 2.1 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+.text
+.align 2
+
+@ Body of a 32-bit pixel copy routine, 8 pixels per iteration.
+@ Arguments as used below: r0 = dst, r1 = src, r2 = pixel count,
+@ r3 = abits (byte written into the 4th byte of every output pixel).
+@ Macro argument bgr2rgb: non-zero swaps bytes 0 and 2 of each pixel.
+@ void *dst, const void *src, int count, uint abits
+.macro do_argb bgr2rgb
+ vdup.i8 d0, r3              @ d0 = abits replicated into all 8 byte lanes
+0:
+ @ Load 8 pixels (32 bytes), de-interleaved: byte 0 of each pixel -> d4,
+ @ byte 1 -> d5, byte 2 -> d6, byte 3 -> d7; src pointer is advanced.
+ @ Note: the load happens before the remaining count is checked, so a
+ @ full 32 bytes are read even when fewer than 8 pixels remain.
+ vld4.8 {d4-d7}, [r1]!
+.if \bgr2rgb
+ vswp d4, d6 @ BGR->RGB
+.endif
+ vmov.i8 d7, d0              @ overwrite byte 3 of every pixel with abits
+ subs r2, r2, #8             @ count -= 8, flags drive the branches below
+ blt do_argb_finish          @ fewer than 8 pixels were left: partial store
+ vst4.8 {d4-d7}, [r0]!       @ store 8 re-interleaved pixels, advance dst
+ bxeq lr                     @ count hit exactly 0 (flags from subs): done
+ nop                         @ NOTE(review): purpose not evident from here
+ b 0b
+.endm
+
+@ Body of a per-pixel alpha blend routine, 8 pixels per iteration.
+@ Arguments as used below: r0 = dst, r1 = src, r2 = pixel count.
+@ r3 is used as scratch.  Byte 3 of each source pixel is the blend factor;
+@ byte 3 of each destination pixel is kept unchanged.
+@ Macro argument bgr2rgb: non-zero swaps bytes 0 and 2 of each source pixel.
+@ void *dst, const void *src, int count
+.macro do_argb_alpha bgr2rgb
+ mov r3, #0xff
+ vdup.i16 q12, r3            @ q12 = 255 in every 16-bit lane (rounding term)
+0:
+ @ Both loads happen before the remaining count is checked, so 32 bytes are
+ @ read from src and from dst even when fewer than 8 pixels remain.
+ vld4.8 {d4-d7}, [r1]!       @ 8 source pixels, de-interleaved; src advanced
+ vld4.8 {d0-d3}, [r0]        @ 8 destination pixels; dst not advanced yet
+.if \bgr2rgb
+ vswp d4, d6 @ BGR->RGB
+.endif
+ vmovl.u8 q11, d7            @ widen source byte 3 (alpha) to 16 bits
+ @ d = (((s-d)*a+255)>>8)+d
+ vsubl.u8 q8, d4, d0         @ (s - d) per channel, widened to 16 bits
+ vsubl.u8 q9, d5, d1
+ vsubl.u8 q10,d6, d2
+ vmul.s16 q8, q8, q11        @ (s - d) * a
+ vmul.s16 q9, q9, q11
+ vmul.s16 q10,q10,q11
+ vaddhn.i16 d4, q8, q12      @ add 255 and keep the high byte: (x + 255) >> 8
+ vaddhn.i16 d5, q9, q12
+ vaddhn.i16 d6, q10,q12
+ vadd.i8 q2, q0              @ + d for the first two channels (d4:d5 += d0:d1)
+ vadd.i8 d6, d2              @ + d for the third channel
+ vmov.i8 d7, d3              @ byte 3 of the result = destination's byte 3
+ subs r2, r2, #8             @ count -= 8, flags drive the branches below
+ blt do_argb_finish          @ fewer than 8 pixels were left: partial store
+ vst4.8 {d4-d7}, [r0]!       @ store 8 blended pixels, advance dst
+ bxeq lr                     @ count hit exactly 0 (flags from subs): done
+ nop                         @ NOTE(review): purpose not evident from here
+ b 0b
+.endm
+
+@ Shared tail for the loops above: stores the last 1..7 pixels.
+@ On entry d4-d7 hold 8 de-interleaved result pixels, r0 = dst and
+@ r2 = (remaining count - 8), which is negative.
+@ NOTE(review): the first pixel is stored before any count check, so this
+@ relies on callers never passing count <= 0 - confirm against the C side.
+do_argb_finish:
+ add r2, r2, #8              @ r2 = number of pixels still to store
+ @ Re-interleave the channel registers back into whole 32-bit pixels:
+ @ afterwards d4 = pixels 0-1, d5 = pixels 2-3, d6 = pixels 4-5,
+ @ d7 = pixels 6-7.
+ vzip.8 d4, d5 @ RRR..|GGG.. -> RGRG..
+ vzip.8 d6, d7 @ BBB..|000.. -> B0B0..
+ vzip.16 q2, q3
+
+ @ Store one pixel at a time, returning as soon as r2 pixels are written.
+ vst1.32 d4[0], [r0]!
+ cmp r2, #1
+ bxle lr
+ vst1.32 d4[1], [r0]!
+ cmp r2, #2
+ bxle lr
+ vst1.32 d5[0], [r0]!
+ cmp r2, #3
+ bxle lr
+ vst1.32 d5[1], [r0]!
+ cmp r2, #4
+ bxle lr
+ vst1.32 d6[0], [r0]!
+ cmp r2, #5
+ bxle lr
+ vst1.32 d6[1], [r0]!
+ cmp r2, #6
+ bxle lr
+ vst1.32 d7[0], [r0]!        @ seventh and last possible tail pixel
+ bx lr
+
+
+@ Exported entry points.  Each label is followed by one macro expansion;
+@ every expansion leaves via bx lr or a branch, so control does not fall
+@ through into the next routine.
+
+@ Copy with byte 3 replaced by abits, channel order kept.
+.global neon_ARGBtoXRGB
+neon_ARGBtoXRGB:
+ do_argb 0
+
+@ Copy with byte 3 replaced by abits, bytes 0 and 2 swapped.
+.global neon_ABGRtoXRGB
+neon_ABGRtoXRGB:
+ do_argb 1
+
+@ Per-pixel alpha blend, channel order kept.
+.global neon_ARGBtoXRGBalpha
+neon_ARGBtoXRGBalpha:
+ do_argb_alpha 0
+
+@ Per-pixel alpha blend, source bytes 0 and 2 swapped before blending.
+.global neon_ABGRtoXRGBalpha
+neon_ABGRtoXRGBalpha:
+ do_argb_alpha 1
+
+@ vim:filetype=armasm