NEONize a few more blit types
author notaz <notasas@gmail.com>
Sat, 9 Feb 2013 21:01:18 +0000 (23:01 +0200)
committer notaz <notasas@gmail.com>
Sat, 9 Feb 2013 21:06:29 +0000 (23:06 +0200)
src/video/SDL_blit_A.c
src/video/SDL_blit_neon.S

index d5000b2..b013ed3 100644 (file)
@@ -72,13 +72,14 @@ static void name(SDL_BlitInfo *info) \
        int height = info->d_height; \
        Uint8 *src = info->s_pixels; \
        Uint8 *dst = info->d_pixels; \
-       int srcskip = info->s_skip; \
-       int dstskip = info->d_skip; \
+       int dstBpp = info->dst->BytesPerPixel; \
+       int srcstride = width * 4 + info->s_skip; \
+       int dststride = width * dstBpp + info->d_skip; \
 \
        while ( height-- ) { \
-            neon_name(dst, src, width); \
-           src += width * 4 + srcskip; \
-           dst += width * 4 + dstskip; \
+           neon_name(dst, src, width); \
+           src += srcstride; \
+           dst += dststride; \
        } \
 }
 
@@ -103,6 +104,8 @@ static void name(SDL_BlitInfo *info) \
 
 make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
 make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
+make_neon_caller(BlitABGRtoRGB565alpha_neon, neon_ABGRtoRGB565alpha)
+make_neon_caller(BlitARGBtoRGB565alpha_neon, neon_ARGBtoRGB565alpha)
 make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
 make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)
 
@@ -2904,6 +2907,16 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
           df->Bmask == 0x1f && SDL_HasAltiVec())
             return Blit32to565PixelAlphaAltivec;
         else
+#endif
+#ifdef __ARM_NEON__
+           if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
+              && sf->Gmask == 0xff00 && df->Gmask == 0x7e0) {
+               if((sf->Bmask >> 3) == df->Bmask || (sf->Rmask >> 3) == df->Rmask)
+                   return BlitARGBtoRGB565alpha_neon;
+               else
+                   return BlitABGRtoRGB565alpha_neon;
+           }
+           else
 #endif
            if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
               && sf->Gmask == 0xff00
index af9af36..344ae05 100644 (file)
 .text
 .align 2
 
+#define func(name) \
+    .global name; \
+    name
+
 @ void *dst, const void *src, int count, uint abits
 .macro do_argb bgr2rgb
     vdup.i8    d0, r3
@@ -96,28 +100,103 @@ do_argb_finish:
     bx         lr
 
 
-.global neon_ARGBtoXRGB
-neon_ARGBtoXRGB:
+@ alpha-blend 32bpp src onto 16bpp 565 dst, 8 pixels per pass; args in r0-r3: void *dst, const void *src, int count, uint global_alpha
+.macro do_argb_to_rgb565_alpha bgr2rgb global_alpha
+    mov        r12, #0xff      @ r12 = 255, the rounding term of the blend formula below
+.if \global_alpha
+    vdup.16    q11, r3         @ q11 = global alpha (r3) in every 16-bit lane
+.endif
+    vdup.i16   q12, r12        @ q12 = 255 in every 16-bit lane
+0:                             @ loop head, one pass per group of 8 pixels
+    pld        [r1, #64*2]     @ prefetch src 128 bytes ahead
+    pld        [r0, #64*2]     @ prefetch dst 128 bytes ahead
+    vld4.8     {d4-d7}, [r1]!  @ 8 src pixels split per byte: d4-d6 = colour bytes, d7 = alpha; r1 += 32. NOTE(review): always reads 8 pixels, even for a tail of fewer than 8 - confirm over-read is safe
+    vld2.8     {d1-d2}, [r0]   @ 8 dst pixels split per byte: d1 = low bytes, d2 = high bytes; r0 is not advanced here
+.if \bgr2rgb
+    vswp       d4, d6          @ BGR->RGB: exchange the byte 0 and byte 2 planes
+.endif
+.if !\global_alpha
+    vmovl.u8   q11, d7         @ q11 = per-pixel alpha widened to 16 bits
+.endif
+    vshl.i8    d0, d1, #3      @ d0 = bbbb b000
+    vshr.u8    d1, d1, #3      @ d1 = 000g ggbb (the three low G bits)
+    vsri.i8    d0, d0, #5       @ B expanded to 8 bits: top 3 bits copied into the low 3
+    vsli.i8    d1, d2, #5      @ d1 = gggg ggbb (three high G bits inserted from the high byte)
+    vsri.i8    d2, d2, #5       @ R expanded to 8 bits: top 3 bits copied into the low 3
+    vsri.i8    d1, d1, #6       @ G expanded to 8 bits: top 2 bits copied into the low 2
+    @ d = (((s-d)*a+255)>>8)+d, applied to each channel below
+    vsubl.u8   q8, d4, d0      @ q8  = s - d for the B plane, 16-bit lanes
+    vsubl.u8   q9, d5, d1      @ q9  = s - d for the G plane
+    vsubl.u8   q10,d6, d2      @ q10 = s - d for the R plane
+    vmul.s16   q8, q8, q11     @ (s - d) * a
+    vmul.s16   q9, q9, q11
+    vmul.s16   q10,q10,q11
+    vaddhn.i16 d4, q8, q12     @ add 255 and keep the high byte of each lane, i.e. (x + 255) >> 8
+    vaddhn.i16 d5, q9, q12
+    vaddhn.i16 d6, q10,q12
+    vadd.i8    q2, q0          @ d4 += d0 (B) and d5 += d1 (G) in one instruction
+    vadd.i8    d2, d6           @ rrrr rrrr (blended R = old R + delta)
+    vshr.u8    d0, d5, #2      @ 00gg gggg (blended G reduced to 6 bits)
+    vshr.u8    d1, d4, #3       @ 000b bbbb
+    vsri.i8    d2, d5, #5       @ rrrr rggg
+    vsli.i8    d1, d0, #5       @ gggb bbbb
+    subs       r2, r2, #8      @ count -= 8; the flags steer both conditional branches below
+    blt        do_rgb565_finish @ fewer than 8 pixels were left: store only those and return
+    vst2.8     {d1-d2}, [r0]!  @ store 8 dst pixels, low/high bytes re-interleaved; r0 += 16
+    bxeq       lr              @ count reached exactly 0: return to the caller
+    nop                        @ NOTE(review): purpose not evident from this file (padding?) - confirm
+    b          0b              @ more pixels remain: next group
+.endm
+
+
+do_rgb565_finish:              @ tail of the 565 blend loop: store the leftover pixels of a partial group, then return
+    vzip.8     d1, d2          @ interleave low/high bytes: d1 = pixels 0-3, d2 = pixels 4-7 as 16-bit values
+    add        r2, r2, #8      @ undo the last subs: r2 = number of pixels still to store
+
+    vst1.16    d1[0], [r0]!    @ pixel 0. NOTE(review): stored before any count check, so count == 0 would still write one pixel - confirm callers never pass 0
+    cmp        r2, #1
+    bxle       lr              @ only 1 pixel was left: done
+    vst1.16    d1[1], [r0]!    @ pixel 1
+    cmp        r2, #2
+    bxle       lr
+    vst1.16    d1[2], [r0]!    @ pixel 2
+    cmp        r2, #3
+    bxle       lr
+    vst1.16    d1[3], [r0]!    @ pixel 3
+    cmp        r2, #4
+    bxle       lr
+    vst1.16    d2[0], [r0]!    @ pixel 4
+    cmp        r2, #5
+    bxle       lr
+    vst1.16    d2[1], [r0]!    @ pixel 5
+    cmp        r2, #6
+    bxle       lr
+    vst1.16    d2[2], [r0]!    @ pixel 6, the last one a partial group can hold
+    bx         lr
+
+
+func(neon_ARGBtoXRGB):
     do_argb 0
 
-.global neon_ABGRtoXRGB
-neon_ABGRtoXRGB:
+func(neon_ABGRtoXRGB):
     do_argb 1
 
-.global neon_ARGBtoXRGBalpha
-neon_ARGBtoXRGBalpha:
+func(neon_ARGBtoXRGBalpha):
     do_argb_alpha 0, 0
 
-.global neon_ABGRtoXRGBalpha
-neon_ABGRtoXRGBalpha:
+func(neon_ABGRtoXRGBalpha):
     do_argb_alpha 1, 0
 
-.global neon_ARGBtoXRGBalphaS
-neon_ARGBtoXRGBalphaS:
+func(neon_ARGBtoXRGBalphaS):
     do_argb_alpha 0, 1
 
-.global neon_ABGRtoXRGBalphaS
-neon_ABGRtoXRGBalphaS:
+func(neon_ABGRtoXRGBalphaS):
     do_argb_alpha 1, 1
 
+func(neon_ARGBtoRGB565alpha):
+    do_argb_to_rgb565_alpha 0, 0
+
+func(neon_ABGRtoRGB565alpha):
+    do_argb_to_rgb565_alpha 1, 0
+
 @ vim:filetype=armasm