From: kub Date: Sun, 25 Aug 2019 15:33:13 +0000 (+0200) Subject: some drawing code C optimisations X-Git-Tag: v2.00~836 X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f740428b81340b35b0e12756e4876f8aac811478;p=picodrive.git some drawing code C optimisations --- diff --git a/Makefile b/Makefile index a3b2c96b..88b9238f 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,10 @@ CFLAGS += -I. ifeq "$(DEBUG)" "0" CFLAGS += -O3 -DNDEBUG endif +ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1")) +# very small caches, avoid optimization options making the binary much bigger +CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp-clone # -fno-ipa-cp +endif # This is actually needed, bevieve me. # If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere. diff --git a/README.md b/README.md index aa0466d1..d7798231 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,8 @@ assuming $TC points to the appropriate cross compile toolchain directory: platform|toolchain|configure command --------|---------|----------------- -gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x -gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x +gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="--sysroot $TC -L$TC/lib" ./configure --platform=opendingux opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0 diff --git a/pico/32x/draw.c b/pico/32x/draw.c index 372f27ef..4119f09d 100644 --- a/pico/32x/draw.c +++ b/pico/32x/draw.c @@ -42,16 +42,21 @@ static void convert_pal555(int invert_prio) const unsigned int m1 = 0x001f; \ const unsigned int m2 = 0x03e0; \ const unsigned int m3 = 0x7c00; \ - int i; \ + unsigned short t; \ + int i = 320; \ \ - for (i = 320; i > 0; i--, pd++, p32x++, pmd++) { \ - unsigned short t = *p32x; \ - if ((*pmd & 0x3f) != mdbg && !((t ^ inv) & 0x8000)) { \ - pmd_draw_code; \ - continue; \ + while (i > 0) { \ + for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) { \ + t = *p32x++; \ + *pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10); \ + } \ + for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) { \ + t = *p32x++; \ + if ((t ^ inv) & 0x8000) \ + *pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10); \ + else \ + pmd_draw_code; \ } \ - \ - *pd = ((t & m1) << 11) | ((t & m2) << 1) | ((t & m3) >> 10); \ } \ } @@ -59,15 +64,21 @@ static void convert_pal555(int invert_prio) #define do_line_pp(pd, p32x, pmd, pmd_draw_code) \ { \ unsigned short t; \ - int i; \ - for (i = 320; i > 0; i--, pd++, p32x++, pmd++) { \ - t = pal[*(unsigned char *)((uintptr_t)p32x ^ 1)]; \ - if ((t & 0x20) || (*pmd & 0x3f) == mdbg) \ + int i = 320; \ + while (i > 0) { \ + for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) { \ + t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)]; \ *pd = t; \ - else \ - pmd_draw_code; \ + } \ + for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) { \ + t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)]; \ + if (t & 0x20) \ + *pd = t; \ + else \ + pmd_draw_code; \ + } \ } \ -} +} // run length mode #define do_line_rl(pd, p32x, pmd, pmd_draw_code) \ diff --git a/pico/draw.c b/pico/draw.c index 4834d6bf..0bf7c3de 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -1341,8 +1341,14 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est) #if 1 int i; - for (i = 0; i < len; i++) - pd[i] = pal[ps[i]]; + for (i = len; i > 0; i-=4) { + *pd++ = pal[*ps++]; + *pd++ = pal[*ps++]; + *pd++ = pal[*ps++]; + *pd++ = pal[*ps++]; + } +// for (i = 0; i < len; i++) +// pd[i] = pal[ps[i]]; #else extern void amips_clut(unsigned short *dst, unsigned char *src, unsigned short *pal, int count); extern void amips_clut_6bit(unsigned short *dst, unsigned char *src, unsigned short *pal, int count); diff --git a/platform/common/plat_sdl.c b/platform/common/plat_sdl.c index ef99af2a..bce4b084 100644 --- a/platform/common/plat_sdl.c +++ b/platform/common/plat_sdl.c @@ -89,7 +89,8 @@ static const struct in_pdata in_sdl_platform_data = { /* YUV stuff */ static int yuv_ry[32], yuv_gy[32], yuv_by[32]; static unsigned char yuv_u[32 * 2], yuv_v[32 * 2]; -static int yuv_y[256]; +static unsigned char yuv_y[256]; +static struct uyvy { unsigned int y:8; unsigned int vyu:24; } yuv_uyvy[65536]; void bgr_to_uyvy_init(void) { @@ -124,34 +125,26 @@ void bgr_to_uyvy_init(void) for (i = 0; i < 256; i++) { yuv_y[i] = 16 + 219 * i / 32; } + // everything combined into one large array for speed + for (i = 0; i < 65536; i++) { + int r = (i >> 11) & 0x1f, g = (i >> 6) & 0x1f, b = (i >> 0) & 0x1f; + int y = (yuv_ry[r] + yuv_gy[g] + yuv_by[b]) >> 16; + yuv_uyvy[i].y = yuv_y[y]; + yuv_uyvy[i].vyu = (yuv_v[r-y + 32] << 16) | (yuv_y[y] << 8) | yuv_u[b-y + 32]; + } } void rgb565_to_uyvy(void *d, const void *s, int pixels) { unsigned int *dst = d; const unsigned short *src = s; - const unsigned char *yu = yuv_u + 32; - const unsigned char *yv = yuv_v + 32; - int r0, g0, b0, r1, g1, b1; - int y0, y1, u, v; - for (; pixels > 0; src += 2, dst++, pixels -= 2) + for (; pixels > 0; src += 4, dst += 2, pixels -= 4) { - r0 = (src[0] >> 11) & 0x1f; - g0 = (src[0] >> 6) & 0x1f; - b0 = src[0] & 0x1f; - r1 = (src[1] >> 11) & 0x1f; - g1 = (src[1] >> 6) & 0x1f; - b1 = src[1] & 0x1f; - y0 = (yuv_ry[r0] + yuv_gy[g0] + yuv_by[b0]) >> 16; - y1 = (yuv_ry[r1] + yuv_gy[g1] + yuv_by[b1]) >> 16; - u = yu[b0 - y0]; - v = yv[r0 - y0]; - // valid Y range seems to be 16..235 - y0 = yuv_y[y0]; - y1 = yuv_y[y1]; - - *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u; + struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1]; + struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3]; + dst[0] = (uyvy1->y << 24) | uyvy0->vyu; + dst[1] = (uyvy3->y << 24) | uyvy2->vyu; } }