X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fvideo%2FSDL_blit_neon.S;h=dcbfd74310f9841043f45e7060b020922514f461;hb=199f36ec4a30dfc5a736a2d0c6e0840d83f7e9e7;hp=2823bcee38ce597f2d62f1899f94f28d3c5aa016;hpb=bdfa698900e2b6c8601b77004a8ba91f5b30dbb6;p=sdl_omap.git diff --git a/src/video/SDL_blit_neon.S b/src/video/SDL_blit_neon.S index 2823bce..dcbfd74 100644 --- a/src/video/SDL_blit_neon.S +++ b/src/video/SDL_blit_neon.S @@ -1,5 +1,5 @@ /* - * (C) Gražvydas "notaz" Ignotas, 2011 + * (C) Gražvydas "notaz" Ignotas, 2011,2012 * * This work is licensed under the terms of any of these licenses * (at your option): @@ -11,11 +11,20 @@ .text .align 2 +#define func(name) \ + .global name; \ + name + @ void *dst, const void *src, int count, uint abits .macro do_argb bgr2rgb vdup.i8 d0, r3 0: + cmp r2, #8 + pld [r1, #64*2] + blt 3f +1: vld4.8 {d4-d7}, [r1]! +2: .if \bgr2rgb vswp d4, d6 @ BGR->RGB .endif @@ -26,19 +35,48 @@ bxeq lr nop b 0b + +3: + @ unaligned ending nastiness :( + add r12, r1, #8*4 + lsr r12, #12 + cmp r12, r1, lsr #12 @ crossing page? + beq 1b @ nope, overreading is safe + + @ _wb_'s bad luck, do some slow stuff here + push {r0-r2,lr} + sub sp, #8*4 + mov r0, sp + lsl r2, #2 + bl memcpy + vld4.8 {d4-d7}, [sp]! + pop {r0-r2,lr} + b 2b .endm -@ void *dst, const void *src, int count -.macro do_argb_alpha bgr2rgb - mov r3, #0xff - vdup.i16 q12, r3 +@ void *dst, const void *src, int count, uint global_alpha +.macro do_argb_alpha bgr2rgb global_alpha + mov r12, #0xff +.if \global_alpha + vdup.16 q11, r3 +.endif + vdup.i16 q12, r12 + 0: + cmp r2, #8 + blt 3f +1: vld4.8 {d4-d7}, [r1]! + pld [r1, #64*2] vld4.8 {d0-d3}, [r0] + pld [r0, #64+32] +2: .if \bgr2rgb vswp d4, d6 @ BGR->RGB .endif +.if !\global_alpha vmovl.u8 q11, d7 +.endif @ d = (((s-d)*a+255)>>8)+d vsubl.u8 q8, d4, d0 vsubl.u8 q9, d5, d1 @@ -58,20 +96,93 @@ bxeq lr nop b 0b + +3: + @ unaligned ending nastiness :( + add r3, r0, #8*4 + add r12, r1, #8*4 + lsr r3, #12 + lsr r12, #12 + cmp r3, r0, lsr #12 @ are we crossing + cmpeq r12, r1, lsr #12 @ the page boundary? + beq 1b @ nope, overreading is safe + + @ _wb_'s bad luck, do some slow stuff here + push {r0-r2, lr} + vpush {q11, q12} + sub sp, #8*4*2 + lsl r2, #2 + mov r1, r0 + mov r0, sp + bl memcpy + ldr r2, [sp, #8*4*2 + 16*2 + 8] @ stacked r2 + add r0, sp, #8*4 + ldr r1, [sp, #8*4*2 + 16*2 + 4] + lsl r2, #2 + bl memcpy + vld4.8 {d0-d3}, [sp]! + vld4.8 {d4-d7}, [sp]! + vpop {q11, q12} + pop {r0-r2, lr} + b 2b .endm -@ void *dst, const void *src, int count, uint alpha -.macro do_argb_alphaS bgr2rgb +do_argb_finish: + add r2, r2, #8 + vzip.8 d4, d5 @ RRR..|GGG.. -> RGRG.. + vzip.8 d6, d7 @ BBB..|000.. -> B0B0.. + vzip.16 q2, q3 + + vst1.32 d4[0], [r0]! + cmp r2, #1 + bxle lr + vst1.32 d4[1], [r0]! + cmp r2, #2 + bxle lr + vst1.32 d5[0], [r0]! + cmp r2, #3 + bxle lr + vst1.32 d5[1], [r0]! + cmp r2, #4 + bxle lr + vst1.32 d6[0], [r0]! + cmp r2, #5 + bxle lr + vst1.32 d6[1], [r0]! + cmp r2, #6 + bxle lr + vst1.32 d7[0], [r0]! + bx lr + + +@ void *dst, const void *src, int count, uint global_alpha +.macro do_argb_to_rgb565_alpha bgr2rgb global_alpha mov r12, #0xff +.if \global_alpha vdup.16 q11, r3 - vdup.16 q12, r12 +.endif + vdup.i16 q12, r12 0: + cmp r2, #8 + blt 3f +1: vld4.8 {d4-d7}, [r1]! - vld4.8 {d0-d3}, [r0] + pld [r1, #64*2] + vld2.8 {d1-d2}, [r0] + pld [r0, #64+32] .if \bgr2rgb vswp d4, d6 @ BGR->RGB .endif +.if !\global_alpha + vmovl.u8 q11, d7 +.endif + vshl.i8 d0, d1, #3 + vshr.u8 d1, d1, #3 + vsri.i8 d0, d0, #5 @ B + vsli.i8 d1, d2, #5 + vsri.i8 d2, d2, #5 @ R + vsri.i8 d1, d1, #6 @ G @ d = (((s-d)*a+255)>>8)+d vsubl.u8 q8, d4, d0 vsubl.u8 q9, d5, d1 @@ -83,67 +194,81 @@ vaddhn.i16 d5, q9, q12 vaddhn.i16 d6, q10,q12 vadd.i8 q2, q0 - vadd.i8 d6, d2 - vmov.i8 d7, d3 + vadd.i8 d2, d6 @ rrrr rrrr + vshr.u8 d0, d5, #2 + vshr.u8 d1, d4, #3 @ 000b bbbb + vsri.i8 d2, d5, #5 @ rrrr rggg + vsli.i8 d1, d0, #5 @ gggb bbbb subs r2, r2, #8 - blt do_argb_finish - vst4.8 {d4-d7}, [r0]! + blt do_rgb565_finish + vst2.8 {d1-d2}, [r0]! bxeq lr nop b 0b + +3: + @ unaligned ending nastiness :( + add r3, r0, #8*2 + add r12, r1, #8*4 + lsr r3, #12 + lsr r12, #12 + cmp r3, r0, lsr #12 @ are we crossing + cmpeq r12, r1, lsr #12 @ the page boundary? + beq 1b @ nope, overreading is safe + + nop + bx lr @ abandon ship! (until someone complains) .endm -do_argb_finish: +do_rgb565_finish: + vzip.8 d1, d2 add r2, r2, #8 - vzip.8 d4, d5 @ RRR..|GGG.. -> RGRG.. - vzip.8 d6, d7 @ BBB..|000.. -> B0B0.. - vzip.16 q2, q3 - - vst1.32 d4[0], [r0]! + + vst1.16 d1[0], [r0]! cmp r2, #1 bxle lr - vst1.32 d4[1], [r0]! + vst1.16 d1[1], [r0]! cmp r2, #2 bxle lr - vst1.32 d5[0], [r0]! + vst1.16 d1[2], [r0]! cmp r2, #3 bxle lr - vst1.32 d5[1], [r0]! + vst1.16 d1[3], [r0]! cmp r2, #4 bxle lr - vst1.32 d6[0], [r0]! + vst1.16 d2[0], [r0]! cmp r2, #5 bxle lr - vst1.32 d6[1], [r0]! + vst1.16 d2[1], [r0]! cmp r2, #6 bxle lr - vst1.32 d7[0], [r0]! + vst1.16 d2[2], [r0]! bx lr -.global neon_ARGBtoXRGB -neon_ARGBtoXRGB: +func(neon_ARGBtoXRGB): do_argb 0 -.global neon_ABGRtoXRGB -neon_ABGRtoXRGB: +func(neon_ABGRtoXRGB): do_argb 1 -.global neon_ARGBtoXRGBalpha -neon_ARGBtoXRGBalpha: - do_argb_alpha 0 +func(neon_ARGBtoXRGBalpha): + do_argb_alpha 0, 0 + +func(neon_ABGRtoXRGBalpha): + do_argb_alpha 1, 0 + +func(neon_ARGBtoXRGBalphaS): + do_argb_alpha 0, 1 -.global neon_ABGRtoXRGBalpha -neon_ABGRtoXRGBalpha: - do_argb_alpha 1 +func(neon_ABGRtoXRGBalphaS): + do_argb_alpha 1, 1 -.global neon_ARGBtoXRGBalphaS -neon_ARGBtoXRGBalphaS: - do_argb_alphaS 0 +func(neon_ARGBtoRGB565alpha): + do_argb_to_rgb565_alpha 0, 0 -.global neon_ABGRtoXRGBalphaS -neon_ABGRtoXRGBalphaS: - do_argb_alphaS 1 +func(neon_ABGRtoRGB565alpha): + do_argb_to_rgb565_alpha 1, 0 @ vim:filetype=armasm