From: notaz Date: Sun, 18 Dec 2011 21:52:11 +0000 (+0200) Subject: add some NEON 32bpp blitters X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a1f340810c4f9a4a617f1757d1ff0255e0e3fe58;p=sdl_omap.git add some NEON 32bpp blitters --- diff --git a/configure.in b/configure.in index f6a708c..1a339a8 100644 --- a/configure.in +++ b/configure.in @@ -1505,6 +1505,25 @@ AC_HELP_STRING([--enable-video-caca], [use libcaca video driver [[default=no]]]) fi } +dnl Check for ARM NEON +CheckARM_NEON() +{ + AC_MSG_CHECKING(for ARM NEON support) + have_arm_neon=no + AC_TRY_COMPILE([ + ],[ + #ifndef __ARM_NEON__ + #error NEON not available + #endif + ],[ + have_arm_neon=yes + ]) + AC_MSG_RESULT($have_arm_neon) + if test x$have_arm_neon = xyes; then + SOURCES="$SOURCES $srcdir/src/video/SDL_blit_neon.S" + fi +} + dnl Set up the QTopia video driver if enabled CheckQtopia() { @@ -2361,6 +2380,7 @@ case "$host" in CheckLinuxVersion CheckRPATH CheckOMAPDSS + CheckARM_NEON # Set up files for the audio library if test x$enable_audio = xyes; then case $ARCH in diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index 2c52209..5a4cff0 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -61,6 +61,32 @@ /* Functions to perform alpha blended blitting */ +#ifdef __ARM_NEON__ + +/* NEON optimized blitter callers: make_neon_caller(name, neon_name) declares the external routine neon_name(dst, src, count) and defines a static SDL_BlitInfo wrapper called name, which invokes that routine once per pass of the height loop and then advances both pointers */ +#define make_neon_caller(name, neon_name) \ +extern void neon_name(void *dst, const void *src, int count); \ +static void name(SDL_BlitInfo *info) \ +{ \ + int width = info->d_width; \ + int height = info->d_height; \ + Uint8 *src = info->s_pixels; \ + Uint8 *dst = info->d_pixels; \ + int srcskip = info->s_skip; \ + int dstskip = info->d_skip; \ +\ + while ( height-- ) { \ + neon_name(dst, src, width); /* third argument is info->d_width; presumably a pixel count, given the 4-byte stride used below (TODO confirm) */ \ + src += width * 4 + srcskip; /* hard-codes 4 bytes per pixel, then adds the s_skip value */ \ + dst += width * 4 + dstskip; /* same stride rule for the destination, using d_skip */ \ + } \ +} + +make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha) +make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha) + +#endif /* __ARM_NEON__ */ 
+ /* N->1 blending with per-surface alpha */ static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info) { @@ -2879,10 +2905,21 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index) if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) return BlitRGBtoRGBPixelAlphaAltivec; +#endif +#ifdef __ARM_NEON__ + return BlitARGBtoXRGBalpha_neon; /* unconditional when __ARM_NEON__ is defined, so the BlitRGBtoRGBPixelAlpha return that follows is then unreachable */ #endif return BlitRGBtoRGBPixelAlpha; } } +#ifdef __ARM_NEON__ + if (sf->Gmask == df->Gmask && sf->Amask == 0xff000000 && + ((sf->Rmask == 0xff && df->Rmask == 0xff0000 && sf->Bmask == 0xff0000 && df->Bmask == 0xff) || + (sf->Rmask == 0xff0000 && df->Rmask == 0xff && sf->Bmask == 0xff && df->Bmask == 0xff0000))) + { /* taken when the G masks match, source alpha is the top byte, and the R and B masks are swapped between source and destination. NOTE(review): BytesPerPixel is not tested in this condition; presumably the enclosing code, not visible in this hunk, guarantees 32bpp on both sides (confirm) */ + return BlitABGRtoXRGBalpha_neon; + } +#endif #if SDL_ALTIVEC_BLITTERS if (sf->Amask && sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c index e4f9589..0830190 100644 --- a/src/video/SDL_blit_N.c +++ b/src/video/SDL_blit_N.c @@ -1210,6 +1210,33 @@ static void Blit_RGB888_RGB565(SDL_BlitInfo *info) #endif /* SDL_HERMES_BLITTERS */ +#ifdef __ARM_NEON__ + +/* NEON optimized blitter callers: make_neon_caller(name, neon_name) declares the external routine neon_name(dst, src, count, abits) and defines a static SDL_BlitInfo wrapper called name, which invokes it once per pass of the height loop; abits is the value the routine writes into the fourth byte of every output pixel */ +#define make_neon_caller(name, neon_name) \ +extern void neon_name(void *dst, const void *src, int count, unsigned int abits); \ +static void name(SDL_BlitInfo *info) \ +{ \ + int width = info->d_width; \ + int height = info->d_height; \ + Uint8 *src = info->s_pixels; \ + Uint8 *dst = info->d_pixels; \ + int srcskip = info->s_skip; \ + int dstskip = info->d_skip; \ + unsigned int abits = info->dst->Amask ? 
0xff : 0; /* 0xff when the destination format has an alpha mask, otherwise 0 */ \ +\ + while ( height-- ) { \ + neon_name(dst, src, width, abits); \ + src += width * 4 + srcskip; /* hard-codes 4 bytes per pixel, then adds the s_skip value */ \ + dst += width * 4 + dstskip; /* same stride rule for the destination, using d_skip */ \ + } \ +} + +make_neon_caller(BlitABGRtoXRGB_neon, neon_ABGRtoXRGB) +make_neon_caller(BlitARGBtoXRGB_neon, neon_ARGBtoXRGB) + +#endif /* __ARM_NEON__ */ + /* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */ #define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1]) @@ -2360,6 +2387,15 @@ static const struct blit_table normal_blit_4[] = { 0, NULL, Blit_RGB888_RGB565, NO_ALPHA }, { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x00007C00,0x000003E0,0x0000001F, 0, NULL, Blit_RGB888_RGB555, NO_ALPHA }, +#endif +#ifdef __ARM_NEON__ + { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x00FF0000,0x0000FF00,0x000000FF, + 0, NULL, BlitARGBtoXRGB_neon, NO_ALPHA | SET_ALPHA }, + { 0x000000FF,0x0000FF00,0x00FF0000, 4, 0x00FF0000,0x0000FF00,0x000000FF, + 0, NULL, BlitABGRtoXRGB_neon, NO_ALPHA | SET_ALPHA }, + /* RGB->BGR is same as BGR->RGB */ + { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x000000FF,0x0000FF00,0x00FF0000, + 0, NULL, BlitABGRtoXRGB_neon, NO_ALPHA | SET_ALPHA }, #endif /* Default for 32-bit RGB source, used if no other blitter matches */ { 0,0,0, 0, 0,0,0, 0, NULL, BlitNtoN, 0 } diff --git a/src/video/SDL_blit_neon.S b/src/video/SDL_blit_neon.S new file mode 100644 index 0000000..438d9fc --- /dev/null +++ b/src/video/SDL_blit_neon.S @@ -0,0 +1,107 @@ +/* + * (C) Gražvydas "notaz" Ignotas, 2011 + * + * This work is licensed under the terms of any of these licenses + * (at your option): + * - GNU GPL, version 2 or later. + * - GNU LGPL, version 2.1 or later. + * See the COPYING file in the top-level directory. + */ + +.text +.align 2 + +@ void *dst, const void *src, int count, uint abits +.macro do_argb bgr2rgb + vdup.i8 d0, r3 /* d0 = low byte of abits replicated into all 8 byte lanes */ +0: + vld4.8 {d4-d7}, [r1]! /* de-interleaving load of 8 pixels (32 bytes) from src. NOTE(review): issued before the remaining count is checked, so the last pass reads up to 7 pixels beyond count */ +.if \bgr2rgb + vswp d4, d6 @ BGR->RGB +.endif + vmov.i8 d7, d0 /* overwrite the fourth byte of each pixel with abits */ + subs r2, r2, #8 + blt do_argb_finish /* fewer than 8 pixels were left: store them one at a time in the shared tail */ + vst4.8 {d4-d7}, [r0]! 
+ bxeq lr /* flags still come from the subs: exactly 0 pixels remain, so return */ + nop + b 0b +.endm + +@ void *dst, const void *src, int count +.macro do_argb_alpha bgr2rgb + mov r3, #0xff + vdup.i16 q12, r3 /* q12 = 255 in every 16-bit lane, the rounding term of the blend formula */ +0: + vld4.8 {d4-d7}, [r1]! /* 8 source pixels, de-interleaved; d7 holds the source alpha bytes. NOTE(review): loaded before the count check, so the last pass reads up to 7 pixels beyond count */ + vld4.8 {d0-d3}, [r0] /* the 8 destination pixels to blend against, no pointer write-back; same over-read caveat as the source load */ +.if \bgr2rgb + vswp d4, d6 @ BGR->RGB +.endif + vmovl.u8 q11, d7 /* widen source alpha to 16 bits for the multiplies */ + @ d = (((s-d)*a+255)>>8)+d + vsubl.u8 q8, d4, d0 + vsubl.u8 q9, d5, d1 + vsubl.u8 q10,d6, d2 + vmul.s16 q8, q8, q11 + vmul.s16 q9, q9, q11 + vmul.s16 q10,q10,q11 + vaddhn.i16 d4, q8, q12 /* add 255 and keep the high byte of each 16-bit sum, which is the shift right by 8 */ + vaddhn.i16 d5, q9, q12 + vaddhn.i16 d6, q10,q12 + vadd.i8 q2, q0 /* add the destination channels in d0 and d1 back onto d4 and d5 */ + vadd.i8 d6, d2 + vmov.i8 d7, d3 /* fourth byte of each pixel is copied from the destination unchanged */ + subs r2, r2, #8 + blt do_argb_finish /* fewer than 8 pixels were left: store them one at a time in the shared tail */ + vst4.8 {d4-d7}, [r0]! + bxeq lr /* flags still come from the subs: exactly 0 pixels remain, so return */ + nop + b 0b +.endm + +do_argb_finish: /* shared tail of both macros; entered with r2 = remaining count minus 8, which is negative */ + add r2, r2, #8 /* restore the remaining pixel count */ + vzip.8 d4, d5 @ RRR..|GGG.. -> RGRG.. + vzip.8 d6, d7 @ BBB..|000.. -> B0B0.. + vzip.16 q2, q3 /* re-interleave into whole 32-bit pixels: d4 = pixels 0 and 1, d5 = 2 and 3, d6 = 4 and 5, d7 = 6 and 7 */ + + vst1.32 d4[0], [r0]! /* NOTE(review): this first store is unconditional, so a call with count == 0 would still write one pixel */ + cmp r2, #1 + bxle lr + vst1.32 d4[1], [r0]! + cmp r2, #2 + bxle lr + vst1.32 d5[0], [r0]! + cmp r2, #3 + bxle lr + vst1.32 d5[1], [r0]! + cmp r2, #4 + bxle lr + vst1.32 d6[0], [r0]! + cmp r2, #5 + bxle lr + vst1.32 d6[1], [r0]! + cmp r2, #6 + bxle lr + vst1.32 d7[0], [r0]! + bx lr /* the tail is only entered with fewer than 8 pixels left, so d7[1] is never stored */ + + +.global neon_ARGBtoXRGB +neon_ARGBtoXRGB: + do_argb 0 + +.global neon_ABGRtoXRGB +neon_ABGRtoXRGB: + do_argb 1 + +.global neon_ARGBtoXRGBalpha +neon_ARGBtoXRGBalpha: + do_argb_alpha 0 + +.global neon_ABGRtoXRGBalpha +neon_ABGRtoXRGBalpha: + do_argb_alpha 1 + +@ vim:filetype=armasm