add some NEON 32bpp blitters
author notaz <notasas@gmail.com>
Sun, 18 Dec 2011 21:52:11 +0000 (23:52 +0200)
committer notaz <notasas@gmail.com>
Sun, 18 Dec 2011 22:49:05 +0000 (00:49 +0200)
configure.in
src/video/SDL_blit_A.c
src/video/SDL_blit_N.c
src/video/SDL_blit_neon.S [new file with mode: 0644]

index f6a708c..1a339a8 100644 (file)
@@ -1505,6 +1505,25 @@ AC_HELP_STRING([--enable-video-caca], [use libcaca video driver [[default=no]]])
     fi
 }
 
+dnl Check for ARM NEON: probe the compiler for __ARM_NEON__ and, when it is defined, add SDL_blit_neon.S to SOURCES
+CheckARM_NEON()
+{
+    AC_MSG_CHECKING(for ARM NEON support)
+    have_arm_neon=no
+    AC_TRY_COMPILE([
+    ],[
+     #ifndef __ARM_NEON__
+     #error NEON not available
+     #endif
+    ],[
+     have_arm_neon=yes
+    ])
+    AC_MSG_RESULT($have_arm_neon)
+    if test x$have_arm_neon = xyes; then
+        SOURCES="$SOURCES $srcdir/src/video/SDL_blit_neon.S"
+    fi
+}
+
 dnl Set up the QTopia video driver if enabled
 CheckQtopia()
 {
@@ -2361,6 +2380,7 @@ case "$host" in
         CheckLinuxVersion
         CheckRPATH
         CheckOMAPDSS
+        CheckARM_NEON
         # Set up files for the audio library
         if test x$enable_audio = xyes; then
           case $ARCH in
index 2c52209..5a4cff0 100644 (file)
 
 /* Functions to perform alpha blended blitting */
 
+#ifdef __ARM_NEON__
+
+/* Generate `name`, a static SDL blit function that calls `neon_name` once per row */
+#define make_neon_caller(name, neon_name) \
+extern void neon_name(void *dst, const void *src, int count); \
+static void name(SDL_BlitInfo *info) \
+{ \
+       /* row advance: 4 bytes per pixel plus the per-row skip */ \
+       const int row_pixels = info->d_width; \
+       const int src_stride = row_pixels * 4 + info->s_skip; \
+       const int dst_stride = row_pixels * 4 + info->d_skip; \
+       Uint8 *src_row = info->s_pixels; \
+       Uint8 *dst_row = info->d_pixels; \
+       int rows_left; \
+       for ( rows_left = info->d_height; rows_left; rows_left-- ) { \
+           neon_name(dst_row, src_row, row_pixels); \
+           src_row += src_stride; \
+           dst_row += dst_stride; \
+       } \
+}
+
+make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
+make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
+
+#endif /* __ARM_NEON__ */
+
 /* N->1 blending with per-surface alpha */
 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
 {
@@ -2879,10 +2905,21 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
                        if(!(surface->map->dst->flags & SDL_HWSURFACE)
                                && SDL_HasAltiVec())
                                return BlitRGBtoRGBPixelAlphaAltivec;
+#endif
+#ifdef __ARM_NEON__
+                       return BlitARGBtoXRGBalpha_neon;
 #endif
                        return BlitRGBtoRGBPixelAlpha;
                }
            }
+#ifdef __ARM_NEON__
+           if (sf->Gmask == df->Gmask && sf->Amask == 0xff000000 &&
+               ((sf->Rmask == 0xff && df->Rmask == 0xff0000 && sf->Bmask == 0xff0000 && df->Bmask == 0xff) ||
+                (sf->Rmask == 0xff0000 && df->Rmask == 0xff && sf->Bmask == 0xff && df->Bmask == 0xff0000)))
+           {
+               return BlitABGRtoXRGBalpha_neon;
+           }
+#endif
 #if SDL_ALTIVEC_BLITTERS
            if (sf->Amask && sf->BytesPerPixel == 4 &&
                !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
index e4f9589..0830190 100644 (file)
@@ -1210,6 +1210,33 @@ static void Blit_RGB888_RGB565(SDL_BlitInfo *info)
 
 #endif /* SDL_HERMES_BLITTERS */
 
+#ifdef __ARM_NEON__
+
+/* Generate `name`, a static SDL blit function that calls `neon_name` once per row */
+#define make_neon_caller(name, neon_name) \
+extern void neon_name(void *dst, const void *src, int count, unsigned int abits); \
+static void name(SDL_BlitInfo *info) \
+{ \
+       /* row advance: 4 bytes per pixel plus the per-row skip; alpha_fill is 0xff only if dst has an alpha mask */ \
+       const int row_pixels = info->d_width; \
+       const int src_stride = row_pixels * 4 + info->s_skip; \
+       const int dst_stride = row_pixels * 4 + info->d_skip; \
+       const unsigned int alpha_fill = (info->dst->Amask != 0) ? 0xff : 0; \
+       Uint8 *src_row = info->s_pixels; \
+       Uint8 *dst_row = info->d_pixels; \
+       int rows_left; \
+       for ( rows_left = info->d_height; rows_left; rows_left-- ) { \
+           neon_name(dst_row, src_row, row_pixels, alpha_fill); \
+           src_row += src_stride; \
+           dst_row += dst_stride; \
+       } \
+}
+
+make_neon_caller(BlitABGRtoXRGB_neon, neon_ABGRtoXRGB)
+make_neon_caller(BlitARGBtoXRGB_neon, neon_ARGBtoXRGB)
+
+#endif /* __ARM_NEON__ */
+
 
 /* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */
 #define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1])
@@ -2360,6 +2387,15 @@ static const struct blit_table normal_blit_4[] = {
       0, NULL, Blit_RGB888_RGB565, NO_ALPHA },
     { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x00007C00,0x000003E0,0x0000001F,
       0, NULL, Blit_RGB888_RGB555, NO_ALPHA },
+#endif
+#ifdef __ARM_NEON__
+    { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x00FF0000,0x0000FF00,0x000000FF,
+      0, NULL, BlitARGBtoXRGB_neon, NO_ALPHA | SET_ALPHA },
+    { 0x000000FF,0x0000FF00,0x00FF0000, 4, 0x00FF0000,0x0000FF00,0x000000FF,
+      0, NULL, BlitABGRtoXRGB_neon, NO_ALPHA | SET_ALPHA },
+    /* RGB->BGR is same as BGR->RGB */
+    { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x000000FF,0x0000FF00,0x00FF0000,
+      0, NULL, BlitABGRtoXRGB_neon, NO_ALPHA | SET_ALPHA },
 #endif
        /* Default for 32-bit RGB source, used if no other blitter matches */
        { 0,0,0, 0, 0,0,0, 0, NULL, BlitNtoN, 0 }
diff --git a/src/video/SDL_blit_neon.S b/src/video/SDL_blit_neon.S
new file mode 100644 (file)
index 0000000..438d9fc
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ * (C) Gražvydas "notaz" Ignotas, 2011
+ *
+ * This work is licensed under the terms of any of these licenses
+ * (at your option):
+ *  - GNU GPL, version 2 or later.
+ *  - GNU LGPL, version 2.1 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+.text
+.align 2
+
+@ do_argb: r0 = void *dst, r1 = const void *src, r2 = int count (pixels), r3 = uint abits. Nonzero bgr2rgb swaps byte planes 0 and 2
+.macro do_argb bgr2rgb
+    vdup.i8    d0, r3          @ d0 = low byte of abits replicated into all 8 lanes
+0:                             @ loop: up to 8 pixels per iteration
+    vld4.8     {d4-d7}, [r1]!  @ load 8 pixels (32 bytes) de-interleaved into 4 byte planes, advance src
+.if \bgr2rgb
+    vswp       d4, d6          @ BGR->RGB
+.endif
+    vmov.i8    d7, d0          @ replace byte plane 3 with the abits fill
+    subs       r2, r2, #8      @ count -= 8. NOTE: the 32-byte load above was already done, so a short tail reads past count pixels of src
+    blt        do_argb_finish  @ fewer than 8 pixels were left: store them one at a time
+    vst4.8     {d4-d7}, [r0]!  @ store 8 re-interleaved pixels, advance dst
+    bxeq       lr              @ flags still come from subs: count reached exactly 0, return
+    nop                        @ NOTE(review): purpose of this nop is not evident from this file
+    b          0b
+.endm
+
+@ do_argb_alpha: r0 = void *dst, r1 = const void *src, r2 = int count (pixels), r3 is overwritten. Nonzero bgr2rgb swaps src byte planes 0 and 2
+.macro do_argb_alpha bgr2rgb
+    mov        r3, #0xff
+    vdup.i16   q12, r3         @ q12 = 255 in every 16-bit lane (the +255 term of the formula below)
+0:                             @ loop: up to 8 pixels per iteration
+    vld4.8     {d4-d7}, [r1]!  @ 8 src pixels as byte planes (d7 = src byte 3, used as alpha), advance src
+    vld4.8     {d0-d3}, [r0]   @ 8 dst pixels as byte planes, dst pointer not advanced yet
+.if \bgr2rgb
+    vswp       d4, d6          @ BGR->RGB
+.endif
+    vmovl.u8   q11, d7         @ widen src alpha to 16 bits per lane
+    @ d = (((s-d)*a+255)>>8)+d
+    vsubl.u8   q8, d4, d0      @ s-d for plane 0, widened to 16 bits
+    vsubl.u8   q9, d5, d1      @ s-d for plane 1
+    vsubl.u8   q10,d6, d2      @ s-d for plane 2
+    vmul.s16   q8, q8, q11     @ (s-d)*a, low 16 bits kept
+    vmul.s16   q9, q9, q11
+    vmul.s16   q10,q10,q11
+    vaddhn.i16 d4, q8, q12     @ add 255 and keep the high byte of each lane, i.e. (x+255)>>8
+    vaddhn.i16 d5, q9, q12
+    vaddhn.i16 d6, q10,q12
+    vadd.i8    q2, q0          @ +d for planes 0 and 1 (d4 += d0, d5 += d1)
+    vadd.i8    d6, d2          @ +d for plane 2
+    vmov.i8    d7, d3          @ byte plane 3 of dst is written back unchanged
+    subs       r2, r2, #8      @ count -= 8. NOTE: both 32-byte loads above were already done, so a short tail reads past count pixels of src and dst
+    blt        do_argb_finish  @ fewer than 8 pixels were left: store them one at a time
+    vst4.8     {d4-d7}, [r0]!  @ store 8 blended pixels, advance dst
+    bxeq       lr              @ flags still come from subs: count reached exactly 0, return
+    nop                        @ NOTE(review): purpose of this nop is not evident from this file
+    b          0b
+.endm
+
+do_argb_finish:                @ shared tail: entered with r2 = remaining - 8 (negative), d4-d7 = 8 pixels as byte planes, r0 = dst
+    add        r2, r2, #8      @ r2 = number of pixels still to store
+    vzip.8     d4, d5          @ RRR..|GGG.. -> RGRG..
+    vzip.8     d6, d7          @ BBB..|000.. -> B0B0..
+    vzip.16    q2, q3          @ interleave 16-bit pairs: d4, d5, d6, d7 now hold pixels 0-1, 2-3, 4-5, 6-7
+    @ Store whole pixels one 32-bit lane at a time and return once r2 pixels are written.
+    vst1.32    d4[0], [r0]!    @ pixel 0 is stored unconditionally (a count of 0 would still write one pixel)
+    cmp        r2, #1
+    bxle       lr
+    vst1.32    d4[1], [r0]!    @ pixel 1
+    cmp        r2, #2
+    bxle       lr
+    vst1.32    d5[0], [r0]!    @ pixel 2
+    cmp        r2, #3
+    bxle       lr
+    vst1.32    d5[1], [r0]!    @ pixel 3
+    cmp        r2, #4
+    bxle       lr
+    vst1.32    d6[0], [r0]!    @ pixel 4
+    cmp        r2, #5
+    bxle       lr
+    vst1.32    d6[1], [r0]!    @ pixel 5
+    cmp        r2, #6
+    bxle       lr
+    vst1.32    d7[0], [r0]!    @ pixel 6, the last one a tail can contain
+    bx         lr
+
+
+.global neon_ARGBtoXRGB        @ (dst, src, count, abits): copy pixels, 4th byte of each pixel set to abits
+neon_ARGBtoXRGB:
+    do_argb 0
+
+.global neon_ABGRtoXRGB        @ (dst, src, count, abits): same, with byte planes 0 and 2 swapped
+neon_ABGRtoXRGB:
+    do_argb 1
+
+.global neon_ARGBtoXRGBalpha   @ (dst, src, count): blend src over dst using the src 4th byte as alpha
+neon_ARGBtoXRGBalpha:
+    do_argb_alpha 0
+
+.global neon_ABGRtoXRGBalpha   @ (dst, src, count): same blend, with src byte planes 0 and 2 swapped
+neon_ABGRtoXRGBalpha:
+    do_argb_alpha 1
+
+@ vim:filetype=armasm