ifeq "$(DEBUG)" "0"
CFLAGS += -O3 -DNDEBUG
endif
+ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1"))
+# very small caches, avoid optimization options making the binary much bigger
+CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp-clone # -fno-ipa-cp
+endif
# This is actually needed, believe me.
# If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere.
platform|toolchain|configure command
--------|---------|-----------------
-gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
-gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
+gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
+gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="--sysroot $TC -L$TC/lib" ./configure --platform=opendingux
opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux
gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0
const unsigned int m1 = 0x001f; \
const unsigned int m2 = 0x03e0; \
const unsigned int m3 = 0x7c00; \
- int i; \
+ unsigned short t; \
+ int i = 320; \
\
- for (i = 320; i > 0; i--, pd++, p32x++, pmd++) { \
- unsigned short t = *p32x; \
- if ((*pmd & 0x3f) != mdbg && !((t ^ inv) & 0x8000)) { \
- pmd_draw_code; \
- continue; \
+ while (i > 0) { \
+ for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) { \
+ t = *p32x++; \
+ *pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10); \
+ } \
+ for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) { \
+ t = *p32x++; \
+ if ((t ^ inv) & 0x8000) \
+ *pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10); \
+ else \
+ pmd_draw_code; \
} \
- \
- *pd = ((t & m1) << 11) | ((t & m2) << 1) | ((t & m3) >> 10); \
} \
}
// do_line_pp: expands to a block that walks 320 pixels of one line.
// For each pixel a 16-bit value t is fetched from pal[], indexed by the byte
// found at address (p32x ^ 1). t is stored to *pd when the low 6 bits of
// *pmd equal mdbg, or when t has bit 0x20 set; otherwise pmd_draw_code is
// expanded in place to produce the pixel.
// pd, p32x and pmd are advanced by the expansion; pal and mdbg must be
// visible in the scope where the macro is used.
// NOTE(review): the ^1 on the byte address looks like a byte-order swizzle
// within 16-bit words - confirm against the frame buffer layout.
#define do_line_pp(pd, p32x, pmd, pmd_draw_code) \
{ \
unsigned short t; \
- int i; \
- for (i = 320; i > 0; i--, pd++, p32x++, pmd++) { \
- t = pal[*(unsigned char *)((uintptr_t)p32x ^ 1)]; \
- if ((t & 0x20) || (*pmd & 0x3f) == mdbg) \
+ int i = 320; \
+ while (i > 0) { \
+ for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) { \
+ t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)]; \
*pd = t; \
- else \
- pmd_draw_code; \
+ } \
+ for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) { \
+ t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)]; \
+ if (t & 0x20) \
+ *pd = t; \
+ else \
+ pmd_draw_code; \
+ } \
} \
-}
+}
// run length mode
#define do_line_rl(pd, p32x, pmd, pmd_draw_code) \
#if 1\r
int i;\r
\r
- for (i = 0; i < len; i++)\r
- pd[i] = pal[ps[i]];\r
+ for (i = len; i > 0; i-=4) {\r
+ *pd++ = pal[*ps++];\r
+ *pd++ = pal[*ps++];\r
+ *pd++ = pal[*ps++];\r
+ *pd++ = pal[*ps++];\r
+ }\r
+// for (i = 0; i < len; i++)\r
+// pd[i] = pal[ps[i]];\r
#else\r
extern void amips_clut(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);\r
extern void amips_clut_6bit(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);\r
/* YUV stuff */
static int yuv_ry[32], yuv_gy[32], yuv_by[32];
static unsigned char yuv_u[32 * 2], yuv_v[32 * 2];
-static int yuv_y[256];
+static unsigned char yuv_y[256];
+static struct uyvy { unsigned int y:8; unsigned int vyu:24; } yuv_uyvy[65536];
void bgr_to_uyvy_init(void)
{
for (i = 0; i < 256; i++) {
yuv_y[i] = 16 + 219 * i / 32;
}
+ // everything combined into one large array for speed
+ for (i = 0; i < 65536; i++) {
+ int r = (i >> 11) & 0x1f, g = (i >> 6) & 0x1f, b = (i >> 0) & 0x1f;
+ int y = (yuv_ry[r] + yuv_gy[g] + yuv_by[b]) >> 16;
+ yuv_uyvy[i].y = yuv_y[y];
+ yuv_uyvy[i].vyu = (yuv_v[r-y + 32] << 16) | (yuv_y[y] << 8) | yuv_u[b-y + 32];
+ }
}
void rgb565_to_uyvy(void *d, const void *s, int pixels)
{
unsigned int *dst = d;
const unsigned short *src = s;
- const unsigned char *yu = yuv_u + 32;
- const unsigned char *yv = yuv_v + 32;
- int r0, g0, b0, r1, g1, b1;
- int y0, y1, u, v;
- for (; pixels > 0; src += 2, dst++, pixels -= 2)
+ for (; pixels > 0; src += 4, dst += 2, pixels -= 4)
{
- r0 = (src[0] >> 11) & 0x1f;
- g0 = (src[0] >> 6) & 0x1f;
- b0 = src[0] & 0x1f;
- r1 = (src[1] >> 11) & 0x1f;
- g1 = (src[1] >> 6) & 0x1f;
- b1 = src[1] & 0x1f;
- y0 = (yuv_ry[r0] + yuv_gy[g0] + yuv_by[b0]) >> 16;
- y1 = (yuv_ry[r1] + yuv_gy[g1] + yuv_by[b1]) >> 16;
- u = yu[b0 - y0];
- v = yv[r0 - y0];
- // valid Y range seems to be 16..235
- y0 = yuv_y[y0];
- y1 = yuv_y[y1];
-
- *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u;
+ struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1];
+ struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3];
+ dst[0] = (uyvy1->y << 24) | uyvy0->vyu;
+ dst[1] = (uyvy3->y << 24) | uyvy2->vyu;
}
}