From 32feba7458a9497f27d72e219cf177774c09ce45 Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 25 Mar 2019 19:31:32 +0100 Subject: [PATCH] minor changes --- pico/32x/32x.c | 4 +- pico/m68kif_cyclone.s | 8 +-- pico/pico_int.h | 3 +- pico/sms.c | 2 +- platform/common/helix/lib.c | 67 +------------------ platform/common/memcpy.c | 125 ++++++++++++++++++++++++++++++++++++ platform/common/plat_sdl.c | 9 ++- platform/common/version.h | 2 +- tools/mkoffsets.sh | 9 ++- 9 files changed, 151 insertions(+), 78 deletions(-) create mode 100644 platform/common/memcpy.c diff --git a/pico/32x/32x.c b/pico/32x/32x.c index 3ee8c2ea..a15cb112 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -194,11 +194,11 @@ void PicoPower32x(void) void PicoUnload32x(void) { + sh2_finish(&msh2); + sh2_finish(&ssh2); if (Pico32xMem != NULL) plat_munmap(Pico32xMem, sizeof(*Pico32xMem)); Pico32xMem = NULL; - sh2_finish(&msh2); - sh2_finish(&ssh2); PicoIn.AHW &= ~PAHW_32X; } diff --git a/pico/m68kif_cyclone.s b/pico/m68kif_cyclone.s index a0a508cd..3a9621dc 100644 --- a/pico/m68kif_cyclone.s +++ b/pico/m68kif_cyclone.s @@ -87,19 +87,19 @@ cyclone_fetch32: orrcc r0, r1, r0, lsl #16 bxcc lr - stmfd sp!,{r0,r1,lr} + stmfd sp!,{r0,r1,r2,lr} mov lr, pc bx r1 mov r2, r0, lsl #16 - ldmia sp, {r0,r1} + ldmfd sp!, {r0,r1} str r2, [sp] add r0, r0, #2 mov lr, pc bx r1 - ldr r1, [sp] + ldmfd sp!, {r1,lr} mov r0, r0, lsl #16 orr r0, r1, r0, lsr #16 - ldmfd sp!,{r1,r2,pc} + bx lr cyclone_write8: @ u32 a, u8 d diff --git a/pico/pico_int.h b/pico/pico_int.h index f6d8b37f..4d599ce8 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -241,7 +241,7 @@ extern SH2 sh2s[2]; # define sh2_pc(sh2) (sh2)->pc #endif -#define sh2_cycles_done(sh2) ((unsigned)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) +#define sh2_cycles_done(sh2) (unsigned)((int)(sh2)->cycles_timeslice - sh2_cycles_left(sh2)) #define sh2_cycles_done_t(sh2) \ (unsigned)(C_M68K_TO_SH2(sh2, (sh2)->m68krcycles_done) + sh2_cycles_done(sh2)) #define 
sh2_cycles_done_m68k(sh2) \ @@ -650,6 +650,7 @@ PICO_INTERNAL void PicoFrameStart(void); void PicoDrawSync(int to, int blank_last_line); void BackFill(int reg7, int sh, struct PicoEState *est); void FinalizeLine555(int sh, int line, struct PicoEState *est); +void PicoDrawSetOutBufMD(void *dest, int increment); extern int (*PicoScanBegin)(unsigned int num); extern int (*PicoScanEnd)(unsigned int num); #define MAX_LINE_SPRITES 29 diff --git a/pico/sms.c b/pico/sms.c index 286b8bf1..2800e209 100644 --- a/pico/sms.c +++ b/pico/sms.c @@ -46,8 +46,8 @@ static void vdp_data_write(unsigned char d) struct PicoVideo *pv = &Pico.video; if (pv->type == 3) { + if (PicoMem.cram[pv->addr & 0x1f] != d) Pico.m.dirtyPal = 1; PicoMem.cram[pv->addr & 0x1f] = d; - Pico.m.dirtyPal = 1; } else { PicoMem.vramb[pv->addr] = d; } diff --git a/platform/common/helix/lib.c b/platform/common/helix/lib.c index d7c511be..d2b05898 100644 --- a/platform/common/helix/lib.c +++ b/platform/common/helix/lib.c @@ -53,70 +53,5 @@ void *memmove (void *dest, const void *src, size_t n) return dest; } #else -/* memcpy/memmove in C with some simple optimizations. - * ATTN does dirty aliasing tricks with undefined behaviour by standard. - * (this works fine with gcc, though...) 
- */ -void *memcpy(void *dest, const void *src, size_t n) -{ - struct _16 { uint32_t a[4]; }; - union { const void *v; char *c; uint64_t *l; struct _16 *s; } - ss = { src }, ds = { dest }; - const int lm = sizeof(uint32_t)-1; - - if ((((unsigned)ss.c ^ (unsigned)ds.c) & lm) == 0) { - /* fast copy if pointers have the same aligment */ - while (((unsigned)ss.c & lm) && n > 0) /* align to word */ - *ds.c++ = *ss.c++, n--; - while (n >= sizeof(struct _16)) /* copy 16 bytes blocks */ - *ds.s++ = *ss.s++, n -= sizeof(struct _16); - if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */ - *ds.l++ = *ss.l++, n -= sizeof(uint64_t); - } else { - /* byte copy if pointers are unaligned */ - while (n >= 8) { /* copy 8 byte blocks */ - *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; - *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; - *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; - *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--; - } - } - /* copy max. 8 leftover bytes */ - while (n > 0) - *ds.c++ = *ss.c++, n--; - return dest; -} - -void *memmove (void *dest, const void *src, size_t n) -{ - struct _16 { uint32_t a[4]; }; - union { const void *v; char *c; uint64_t *l; struct _16 *s; } - ss = { src+n }, ds = { dest+n }; - const int lm = sizeof(uint32_t)-1; - - if (dest <= src || dest >= src+n) - return memcpy(dest, src, n); - - if ((((unsigned)ss.c ^ (unsigned)ds.c) & lm) == 0) { - /* fast copy if pointers have the same aligment */ - while (((unsigned)ss.c & lm) && n > 0) - *--ds.c = *--ss.c, n--; - while (n >= sizeof(struct _16)) - *--ds.s = *--ss.s, n -= sizeof(struct _16); - if (n >= sizeof(uint64_t)) - *--ds.l = *--ss.l, n -= sizeof(uint64_t); - } else { - /* byte copy if pointers are unaligned */ - while (n >= 8) { - *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; - *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; - *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; - *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; - } - } - /* copy max. 
/*
 * (C) 2018 Kai-Uwe Bloem
 *
 * 32bit ARM/MIPS optimized C implementation of memcpy and memmove, designed for
 * good performance with gcc.
 * - if src and dest have the same alignment, 4-word copy is used.
 * - if src and dest are unaligned to each other, still loads word data and
 *   stores correctly shifted word data (for all but the first and last bytes
 *   to avoid under/overstepping the src region).
 *
 * Every multi-byte access made here is a naturally aligned 32 bit access.
 * Block copies load all their words into locals before storing any of them,
 * so they stay correct for the overlapping regions memmove has to handle.
 *
 * ATTN does dirty aliasing tricks with undefined behaviour by standard: the
 * buffers are accessed through uint32_t pointers whatever their real type is.
 * NOTE(review): an optimizing compiler may still turn copy loops into library
 * calls; presumably the build prevents that for this file -- verify.
 */
#include <stdlib.h>
#include <stdint.h>

/* _L_ moves data towards lower addresses within a word, _U_ towards upper */
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
#define MEMCPY_LITTLE_ENDIAN (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#else
#include <endian.h>
#define MEMCPY_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN)
#endif
#if MEMCPY_LITTLE_ENDIAN
#define _L_ >>
#define _U_ <<
#else
#define _L_ <<
#define _U_ >>
#endif

/*
 * Copy n bytes from src to dest, walking upwards through memory.
 *
 * Returns dest. Also safe for overlapping regions as long as dest <= src,
 * because each piece is completely loaded before it is stored and loads never
 * fall behind the stores; memmove relies on this.
 */
void *memcpy(void *dest, const void *src, size_t n)
{
    union { const void *v; uint8_t *c; uint32_t *i; }
        ss = { src }, ds = { dest };
    const uintptr_t lm = sizeof(uint32_t)-1;

    /* align src to word */
    while (((uintptr_t)ss.c & lm) && n > 0)
        *ds.c++ = *ss.c++, n--;
    if (((uintptr_t)ds.c & lm) == 0) {
        /* fast copy if pointers have the same alignment */
        while (n >= 4*sizeof(uint32_t)) {   /* copy 16 byte blocks */
            uint32_t w0 = ss.i[0], w1 = ss.i[1], w2 = ss.i[2], w3 = ss.i[3];
            ds.i[0] = w0, ds.i[1] = w1, ds.i[2] = w2, ds.i[3] = w3;
            ss.i += 4, ds.i += 4, n -= 4*sizeof(uint32_t);
        }
        if (n >= 2*sizeof(uint32_t)) {      /* copy leftover 8 byte block */
            uint32_t w0 = ss.i[0], w1 = ss.i[1];
            ds.i[0] = w0, ds.i[1] = w1;
            ss.i += 2, ds.i += 2, n -= 2*sizeof(uint32_t);
        }
    } else if (n >= 2*sizeof(uint32_t)) {
        /* unaligned data big enough to avoid overstepping src */
        uint32_t v1, v2, b, s;
        /* align dest to word (src becomes misaligned by b = 1..3 bytes) */
        while (((uintptr_t)ds.c & lm) && n > 0)
            *ds.c++ = *ss.c++, n--;
        /* copy loop: load aligned words and store shifted words */
        b = (uint32_t)((uintptr_t)ss.c & lm), s = b*8; ss.c -= b;
        v1 = *ss.i++, v2 = *ss.i++;
        while (n >= 3*sizeof(uint32_t)) {
            *ds.i++ = (v1 _L_ s) | (v2 _U_ (32-s)); v1 = *ss.i++;
            *ds.i++ = (v2 _L_ s) | (v1 _U_ (32-s)); v2 = *ss.i++;
            n -= 2*sizeof(uint32_t);
        }
        /* data for one more store is already loaded */
        if (n >= sizeof(uint32_t)) {
            *ds.i++ = (v1 _L_ s) | (v2 _U_ (32-s));
            n -= sizeof(uint32_t);
            ss.c += sizeof(uint32_t);
        }
        /* src is 2 words minus b bytes ahead of the data really consumed */
        ss.c -= 2*sizeof(uint32_t) - b;
    }
    /* copy 0-7 leftover bytes */
    while (n >= 4) {
        *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--;
        *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--;
    }
    while (n > 0)
        *ds.c++ = *ss.c++, n--;
    return dest;
}

/*
 * Copy n bytes from src to dest; the regions may overlap.
 *
 * Returns dest. If dest lies below src, or the regions do not overlap, the
 * work is handed to memcpy. Otherwise the copy runs downwards from the end of
 * both regions, mirroring memcpy step by step.
 */
void *memmove (void *dest, const void *src, size_t n)
{
    const uint8_t *src_end = (const uint8_t *)src + n;
    union { const void *v; uint8_t *c; uint32_t *i; }
        ss = { src_end }, ds = { (uint8_t *)dest + n };
    const uintptr_t lm = sizeof(uint32_t)-1;

    if ((const uint8_t *)dest <= (const uint8_t *)src ||
        (const uint8_t *)dest >= src_end)
        return memcpy(dest, src, n);

    /* align src to word */
    while (((uintptr_t)ss.c & lm) && n > 0)
        *--ds.c = *--ss.c, n--;
    if (((uintptr_t)ds.c & lm) == 0) {
        /* fast copy if pointers have the same alignment */
        while (n >= 4*sizeof(uint32_t)) {   /* copy 16 byte blocks */
            uint32_t w0, w1, w2, w3;
            ss.i -= 4, ds.i -= 4, n -= 4*sizeof(uint32_t);
            w0 = ss.i[0], w1 = ss.i[1], w2 = ss.i[2], w3 = ss.i[3];
            ds.i[0] = w0, ds.i[1] = w1, ds.i[2] = w2, ds.i[3] = w3;
        }
        if (n >= 2*sizeof(uint32_t)) {      /* copy leftover 8 byte block */
            uint32_t w0, w1;
            ss.i -= 2, ds.i -= 2, n -= 2*sizeof(uint32_t);
            w0 = ss.i[0], w1 = ss.i[1];
            ds.i[0] = w0, ds.i[1] = w1;
        }
    } else if (n >= 2*sizeof(uint32_t)) {
        /* unaligned data big enough to avoid understepping src */
        uint32_t v1, v2, b, s;
        /* align dest to word (src becomes misaligned by 1..3 bytes) */
        while (((uintptr_t)ds.c & lm) && n > 0)
            *--ds.c = *--ss.c, n--;
        /* copy loop: load aligned words and store shifted words.
         * b is the distance UP to the next word boundary, so that src is
         * really word aligned after adding it. The bytes in that gap have
         * been copied already and are shifted out of v1 again. */
        b = (uint32_t)(sizeof(uint32_t) - ((uintptr_t)ss.c & lm)), s = b*8;
        ss.c += b;
        v1 = *--ss.i, v2 = *--ss.i;
        while (n >= 3*sizeof(uint32_t)) {
            *--ds.i = (v1 _U_ s) | (v2 _L_ (32-s)); v1 = *--ss.i;
            *--ds.i = (v2 _U_ s) | (v1 _L_ (32-s)); v2 = *--ss.i;
            n -= 2*sizeof(uint32_t);
        }
        /* data for one more store is already loaded */
        if (n >= sizeof(uint32_t)) {
            *--ds.i = (v1 _U_ s) | (v2 _L_ (32-s));
            n -= sizeof(uint32_t);
            ss.c -= sizeof(uint32_t);
        }
        /* src is 2 words minus b bytes below the data really consumed */
        ss.c += 2*sizeof(uint32_t) - b;
    }
    /* copy 0-7 leftover bytes */
    while (n >= 4) {
        *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--;
        *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--;
    }
    while (n > 0)
        *--ds.c = *--ss.c, n--;
    return dest;
}
+# +# usage: mkoffsets CC=${CC:-gcc} # endianess of target (automagically determined below) ENDIAN= +# compile with target C compiler and extract value from .rodata section compile_rodata () { $CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 + # find the name of the .rodata section (in case -fdata-sections is used) rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' | sed 's/^[^.]*././;s/ .*//') + # read out .rodata section as hex string (should be only 4 or 8 bytes) objcopy --dump-section $rosect=/tmp/getoffs.ro /tmp/getoffs.o || exit 1 ro=$(xxd -ps /tmp/getoffs.ro) if [ "$ENDIAN" = "le" ]; then @@ -22,9 +27,11 @@ compile_rodata () else hex=$ro fi + # extract decimal value from hex string rodata=$(printf "%d" 0x$hex) } +# determine member offset and create #define get_define () # prefix struct member member... { prefix=$1; shift -- 2.39.5