some drawing code C optimisations
author kub <derkub@gmail.com>
Sun, 25 Aug 2019 15:33:13 +0000 (17:33 +0200)
committer kub <derkub@gmail.com>
Sun, 25 Aug 2019 15:33:13 +0000 (17:33 +0200)
Makefile
README.md
pico/32x/draw.c
pico/draw.c
platform/common/plat_sdl.c

index a3b2c96..88b9238 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -5,6 +5,10 @@ CFLAGS += -I.
 ifeq "$(DEBUG)" "0"
 CFLAGS += -O3 -DNDEBUG
 endif
+ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1"))
+# very small caches, avoid optimization options making the binary much bigger
+CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp-clone # -fno-ipa-cp
+endif
 
 # This is actually needed, bevieve me.
 # If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere.
index aa0466d..d779823 100644 (file)
--- a/README.md
+++ b/README.md
@@ -29,8 +29,8 @@ assuming $TC points to the appropriate cross compile toolchain directory:
 
 platform|toolchain|configure command
 --------|---------|-----------------
-gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
-gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
+gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
+gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
 opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="--sysroot $TC -L$TC/lib" ./configure --platform=opendingux
 opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux
 gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0
index 372f27e..4119f09 100644 (file)
@@ -42,16 +42,21 @@ static void convert_pal555(int invert_prio)
   const unsigned int m1 = 0x001f;                                 \
   const unsigned int m2 = 0x03e0;                                 \
   const unsigned int m3 = 0x7c00;                                 \
-  int i;                                                          \
+  unsigned short t;                                               \
+  int i = 320;                                                    \
                                                                   \
-  for (i = 320; i > 0; i--, pd++, p32x++, pmd++) {                \
-    unsigned short t = *p32x;                                     \
-    if ((*pmd & 0x3f) != mdbg && !((t ^ inv) & 0x8000)) {         \
-      pmd_draw_code;                                              \
-      continue;                                                   \
+  while (i > 0) {                                                 \
+    for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) {    \
+      t = *p32x++;                                                \
+      *pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10);      \
+    }                                                             \
+    for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) {    \
+      t = *p32x++;                                                \
+      if ((t ^ inv) & 0x8000)                                     \
+        *pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10);    \
+      else                                                        \
+        pmd_draw_code;                                            \
     }                                                             \
-                                                                  \
-    *pd = ((t & m1) << 11) | ((t & m2) << 1) | ((t & m3) >> 10);  \
   }                                                               \
 }
 
@@ -59,15 +64,21 @@ static void convert_pal555(int invert_prio)
 #define do_line_pp(pd, p32x, pmd, pmd_draw_code)                  \
 {                                                                 \
   unsigned short t;                                               \
-  int i;                                                          \
-  for (i = 320; i > 0; i--, pd++, p32x++, pmd++) {                \
-    t = pal[*(unsigned char *)((uintptr_t)p32x ^ 1)];             \
-    if ((t & 0x20) || (*pmd & 0x3f) == mdbg)                      \
+  int i = 320;                                                    \
+  while (i > 0) {                                                 \
+    for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) {    \
+      t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)];       \
       *pd = t;                                                    \
-    else                                                          \
-      pmd_draw_code;                                              \
+    }                                                             \
+    for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) {    \
+      t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)];       \
+      if (t & 0x20)                                               \
+        *pd = t;                                                  \
+      else                                                        \
+        pmd_draw_code;                                            \
+    }                                                             \
   }                                                               \
-} 
+}
 
 // run length mode
 #define do_line_rl(pd, p32x, pmd, pmd_draw_code)                  \
index 4834d6b..0bf7c3d 100644 (file)
@@ -1341,8 +1341,14 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est)
 #if 1\r
     int i;\r
 \r
-    for (i = 0; i < len; i++)\r
-      pd[i] = pal[ps[i]];\r
+    for (i = len; i > 0; i-=4) {\r
+      *pd++ = pal[*ps++];\r
+      *pd++ = pal[*ps++];\r
+      *pd++ = pal[*ps++];\r
+      *pd++ = pal[*ps++];\r
+    }\r
+//    for (i = 0; i < len; i++)\r
+//      pd[i] = pal[ps[i]];\r
 #else\r
     extern void amips_clut(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);\r
     extern void amips_clut_6bit(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);\r
index ef99af2..bce4b08 100644 (file)
@@ -89,7 +89,8 @@ static const struct in_pdata in_sdl_platform_data = {
 /* YUV stuff */
 static int yuv_ry[32], yuv_gy[32], yuv_by[32];
 static unsigned char yuv_u[32 * 2], yuv_v[32 * 2];
-static int yuv_y[256];
+static unsigned char yuv_y[256];
+static struct uyvy {  unsigned int y:8; unsigned int vyu:24; } yuv_uyvy[65536];
 
 void bgr_to_uyvy_init(void)
 {
@@ -124,34 +125,26 @@ void bgr_to_uyvy_init(void)
   for (i = 0; i < 256; i++) {
     yuv_y[i] = 16 + 219 * i / 32;
   }
+  // everything combined into one large array for speed
+  for (i = 0; i < 65536; i++) {
+     int r = (i >> 11) & 0x1f, g = (i >> 6) & 0x1f, b = (i >> 0) & 0x1f;
+     int y = (yuv_ry[r] + yuv_gy[g] + yuv_by[b]) >> 16;
+     yuv_uyvy[i].y = yuv_y[y];
+     yuv_uyvy[i].vyu = (yuv_v[r-y + 32] << 16) | (yuv_y[y] << 8) | yuv_u[b-y + 32];
+  }
 }
 
 void rgb565_to_uyvy(void *d, const void *s, int pixels)
 {
   unsigned int *dst = d;
   const unsigned short *src = s;
-  const unsigned char *yu = yuv_u + 32;
-  const unsigned char *yv = yuv_v + 32;
-  int r0, g0, b0, r1, g1, b1;
-  int y0, y1, u, v;
 
-  for (; pixels > 0; src += 2, dst++, pixels -= 2)
+  for (; pixels > 0; src += 4, dst += 2, pixels -= 4)
   {
-    r0 = (src[0] >> 11) & 0x1f;
-    g0 = (src[0] >> 6) & 0x1f;
-    b0 =  src[0] & 0x1f;
-    r1 = (src[1] >> 11) & 0x1f;
-    g1 = (src[1] >> 6) & 0x1f;
-    b1 =  src[1] & 0x1f;
-    y0 = (yuv_ry[r0] + yuv_gy[g0] + yuv_by[b0]) >> 16;
-    y1 = (yuv_ry[r1] + yuv_gy[g1] + yuv_by[b1]) >> 16;
-    u = yu[b0 - y0];
-    v = yv[r0 - y0];
-    // valid Y range seems to be 16..235
-    y0 = yuv_y[y0];
-    y1 = yuv_y[y1];
-
-    *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u;
+    struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1];
+    struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3];
+    dst[0] = (uyvy1->y << 24) | uyvy0->vyu;
+    dst[1] = (uyvy3->y << 24) | uyvy2->vyu;
   }
 }