void PicoUnload32x(void)
{
+ sh2_finish(&msh2);
+ sh2_finish(&ssh2);
if (Pico32xMem != NULL)
plat_munmap(Pico32xMem, sizeof(*Pico32xMem));
Pico32xMem = NULL;
- sh2_finish(&msh2);
- sh2_finish(&ssh2);
PicoIn.AHW &= ~PAHW_32X;
}
orrcc r0, r1, r0, lsl #16
bxcc lr
- stmfd sp!,{r0,r1,lr}
+ stmfd sp!,{r0,r1,r2,lr}
mov lr, pc
bx r1
mov r2, r0, lsl #16
- ldmia sp, {r0,r1}
+ ldmfd sp!, {r0,r1}
str r2, [sp]
add r0, r0, #2
mov lr, pc
bx r1
- ldr r1, [sp]
+ ldmfd sp!, {r1,lr}
mov r0, r0, lsl #16
orr r0, r1, r0, lsr #16
- ldmfd sp!,{r1,r2,pc}
+ bx lr
cyclone_write8: @ u32 a, u8 d
# define sh2_pc(sh2) (sh2)->pc
#endif

-#define sh2_cycles_done(sh2) ((unsigned)(sh2)->cycles_timeslice - sh2_cycles_left(sh2))
+#define sh2_cycles_done(sh2) (unsigned)((int)(sh2)->cycles_timeslice - sh2_cycles_left(sh2))
#define sh2_cycles_done_t(sh2) \
  (unsigned)(C_M68K_TO_SH2(sh2, (sh2)->m68krcycles_done) + sh2_cycles_done(sh2))
#define sh2_cycles_done_m68k(sh2) \
void PicoDrawSync(int to, int blank_last_line);
void BackFill(int reg7, int sh, struct PicoEState *est);
void FinalizeLine555(int sh, int line, struct PicoEState *est);
+void PicoDrawSetOutBufMD(void *dest, int increment);
extern int (*PicoScanBegin)(unsigned int num);
extern int (*PicoScanEnd)(unsigned int num);
#define MAX_LINE_SPRITES 29
struct PicoVideo *pv = &Pico.video;
if (pv->type == 3) {
+ if (PicoMem.cram[pv->addr & 0x1f] != d) Pico.m.dirtyPal = 1;
PicoMem.cram[pv->addr & 0x1f] = d;
- Pico.m.dirtyPal = 1;
} else {
PicoMem.vramb[pv->addr] = d;
}
return dest;
}
#else
-/* memcpy/memmove in C with some simple optimizations.
- * ATTN does dirty aliasing tricks with undefined behaviour by standard.
- * (this works fine with gcc, though...)
- */
-void *memcpy(void *dest, const void *src, size_t n)
-{
- struct _16 { uint32_t a[4]; };
- union { const void *v; char *c; uint64_t *l; struct _16 *s; }
- ss = { src }, ds = { dest };
- const int lm = sizeof(uint32_t)-1;
-
- if ((((unsigned)ss.c ^ (unsigned)ds.c) & lm) == 0) {
- /* fast copy if pointers have the same aligment */
- while (((unsigned)ss.c & lm) && n > 0) /* align to word */
- *ds.c++ = *ss.c++, n--;
- while (n >= sizeof(struct _16)) /* copy 16 bytes blocks */
- *ds.s++ = *ss.s++, n -= sizeof(struct _16);
- if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */
- *ds.l++ = *ss.l++, n -= sizeof(uint64_t);
- } else {
- /* byte copy if pointers are unaligned */
- while (n >= 8) { /* copy 8 byte blocks */
- *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--;
- *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--;
- *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--;
- *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--;
- }
- }
- /* copy max. 8 leftover bytes */
- while (n > 0)
- *ds.c++ = *ss.c++, n--;
- return dest;
-}
-
-void *memmove (void *dest, const void *src, size_t n)
-{
- struct _16 { uint32_t a[4]; };
- union { const void *v; char *c; uint64_t *l; struct _16 *s; }
- ss = { src+n }, ds = { dest+n };
- const int lm = sizeof(uint32_t)-1;
-
- if (dest <= src || dest >= src+n)
- return memcpy(dest, src, n);
-
- if ((((unsigned)ss.c ^ (unsigned)ds.c) & lm) == 0) {
- /* fast copy if pointers have the same aligment */
- while (((unsigned)ss.c & lm) && n > 0)
- *--ds.c = *--ss.c, n--;
- while (n >= sizeof(struct _16))
- *--ds.s = *--ss.s, n -= sizeof(struct _16);
- if (n >= sizeof(uint64_t))
- *--ds.l = *--ss.l, n -= sizeof(uint64_t);
- } else {
- /* byte copy if pointers are unaligned */
- while (n >= 8) {
- *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--;
- *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--;
- *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--;
- *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--;
- }
- }
- /* copy max. 8 leftover bytes */
- while (n > 0)
- *--ds.c = *--ss.c, n--;
- return dest;
-}
+#include "../memcpy.c"
#endif
--- /dev/null
+/*
+ * (C) 2018 Kai-Uwe Bloem <derkub@gmail.com>
+ *
+ * 32-bit ARM/MIPS optimized C implementation of memcpy and memmove, designed
+ * for good performance with gcc.
+ * - if src and dest have the same alignment, a 4-word block copy is used.
+ * - if src and dest are unaligned to each other, words are still loaded from
+ * src and stored as correctly shifted words (all but the first and last few
+ * bytes, to avoid under/overstepping the src region).
+ *
+ * ATTN does dirty aliasing tricks with undefined behaviour by the standard
+ * (however, this was needed to improve the generated code).
+ * ATTN uses struct assignment for the block copy, which only works as intended
+ * if the compiler inlines it (else it would probably call memcpy itself :-)).
+ */
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <endian.h>
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define _L_ >>
+#define _U_ <<
+#else
+#define _L_ <<
+#define _U_ >>
+#endif
+
+void *memcpy(void *dest, const void *src, size_t n)
+{
+ struct _16 { uint32_t a[4]; };
+ union { const void *v; uint8_t *c; uint32_t *i; uint64_t *l; struct _16 *s; }
+ ss = { src }, ds = { dest };
+ const int lm = sizeof(uint32_t)-1;
+
+ /* align src to word */
+ while (((unsigned)ss.c & lm) && n > 0)
+ *ds.c++ = *ss.c++, n--;
+ if (((unsigned)ds.c & lm) == 0) {
+ /* fast copy if pointers have the same alignment */
+ while (n >= sizeof(struct _16)) /* copy 16-byte blocks */
+ *ds.s++ = *ss.s++, n -= sizeof(struct _16);
+ if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */
+ *ds.l++ = *ss.l++, n -= sizeof(uint64_t);
+ } else if (n >= 2*sizeof(uint32_t)) {
+ /* unaligned data big enough to avoid overstepping src */
+ uint32_t v1, v2, b, s;
+ /* align dest to word */
+ while (((unsigned)ds.c & lm) && n > 0)
+ *ds.c++ = *ss.c++, n--;
+ /* copy loop: load aligned words and store shifted words */
+ b = (unsigned)ss.c & lm, s = b*8; ss.c -= b;
+ v1 = *ss.i++, v2 = *ss.i++;
+ while (n >= 3*sizeof(uint32_t)) {
+ *ds.i++ = (v1 _L_ s) | (v2 _U_ (32-s)); v1 = *ss.i++;
+ *ds.i++ = (v2 _L_ s) | (v1 _U_ (32-s)); v2 = *ss.i++;
+ n -= 2*sizeof(uint32_t);
+ }
+ /* data for one more store is already loaded */
+ if (n >= sizeof(uint32_t)) {
+ *ds.i++ = (v1 _L_ s) | (v2 _U_ (32-s));
+ n -= sizeof(uint32_t);
+ ss.c += sizeof(uint32_t);
+ }
+ ss.c += b - 2*sizeof(uint32_t);
+ }
+ /* copy 0-7 leftover bytes */
+ while (n >= 4) {
+ *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--;
+ *ds.c++ = *ss.c++, n--; *ds.c++ = *ss.c++, n--;
+ }
+ while (n > 0)
+ *ds.c++ = *ss.c++, n--;
+ return dest;
+}
+
+void *memmove (void *dest, const void *src, size_t n)
+{
+ struct _16 { uint32_t a[4]; };
+ union { const void *v; uint8_t *c; uint32_t *i; uint64_t *l; struct _16 *s; }
+ ss = { src+n }, ds = { dest+n };
+ const int lm = sizeof(uint32_t)-1;
+
+ if (dest <= src || dest >= src+n)
+ return memcpy(dest, src, n);
+
+ /* align src to word */
+ while (((unsigned)ss.c & lm) && n > 0)
+ *--ds.c = *--ss.c, n--;
+ if (((unsigned)ds.c & lm) == 0) {
+ /* fast copy if pointers have the same alignment */
+ while (n >= sizeof(struct _16)) /* copy 16 byte blocks */
+ *--ds.s = *--ss.s, n -= sizeof(struct _16);
+ if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */
+ *--ds.l = *--ss.l, n -= sizeof(uint64_t);
+ } else if (n >= 2*sizeof(uint32_t)) {
+ /* unaligned data big enough to avoid understepping src */
+ uint32_t v1, v2, b, s;
+ /* align dest to word */
+ while (((unsigned)ds.c & lm) && n > 0)
+ *--ds.c = *--ss.c, n--;
+ /* copy loop: load aligned words and store shifted words */
+ b = (unsigned)ss.c & lm, s = b*8; ss.c += b;
+ v1 = *--ss.i, v2 = *--ss.i;
+ while (n >= 3*sizeof(uint32_t)) {
+ *--ds.i = (v1 _U_ s) | (v2 _L_ (32-s)); v1 = *--ss.i;
+ *--ds.i = (v2 _U_ s) | (v1 _L_ (32-s)); v2 = *--ss.i;
+ n -= 2*sizeof(uint32_t);
+ }
+ /* data for one more store is already loaded */
+ if (n >= sizeof(uint32_t)) {
+ *--ds.i = (v1 _U_ s) | (v2 _L_ (32-s));
+ n -= sizeof(uint32_t);
+ ss.c -= sizeof(uint32_t);
+ }
+ ss.c -= b - 2*sizeof(uint32_t);
+ }
+ /* copy 0-7 leftover bytes */
+ while (n >= 4) {
+ *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--;
+ *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--;
+ }
+ while (n > 0)
+ *--ds.c = *--ss.c, n--;
+ return dest;
+}
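
A minimal sanity-check sketch for the shifted-word path above (not part of memcpy.c; the name check_unaligned_copies is made up, and it assumes the test program is linked against this memcpy): sweep every src/dest misalignment combination and length and compare the result against a plain byte-wise copy.

/* hypothetical standalone test, illustrative only */
#include <stdio.h>
#include <string.h>

static int check_unaligned_copies(void)
{
	unsigned char src[64], dst[64], ref[64];
	int sa, da, len, i, errs = 0;

	for (i = 0; i < (int)sizeof(src); i++)
		src[i] = (unsigned char)i;
	for (sa = 0; sa < 4; sa++)              /* src misalignment 0..3 */
	for (da = 0; da < 4; da++)              /* dest misalignment 0..3 */
	for (len = 0; len <= 48; len++) {
		memset(dst, 0xaa, sizeof(dst));
		memset(ref, 0xaa, sizeof(ref));
		memcpy(dst + da, src + sa, len);  /* routine under test */
		for (i = 0; i < len; i++)         /* byte-wise reference */
			ref[da + i] = src[sa + i];
		if (memcmp(dst, ref, sizeof(dst)))  /* also catches overruns */
			errs++;
	}
	return errs;
}

int main(void)
{
	printf("mismatches: %d\n", check_unaligned_copies());
	return 0;
}
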
/* YUV stuff */
static int yuv_ry[32], yuv_gy[32], yuv_by[32];
static unsigned char yuv_u[32 * 2], yuv_v[32 * 2];
+static int yuv_y[256];
void bgr_to_uyvy_init(void)
{
v = 255;
yuv_v[i + 32] = v;
}
+ // valid Y range seems to be 16..235
+ for (i = 0; i < 256; i++) {
+ yuv_y[i] = 16 + 219 * i / 32;
+ }
}
void rgb565_to_uyvy(void *d, const void *s, int pixels)
u = yu[b0 - y0];
v = yv[r0 - y0];
// valid Y range seems to be 16..235
- y0 = 16 + 219 * y0 / 31;
- y1 = 16 + 219 * y1 / 31;
+ y0 = yuv_y[y0];
+ y1 = yuv_y[y1];
*dst = (y1 << 24) | (v << 16) | (y0 << 8) | u;
}
-#define VERSION "1.93"
+#define VERSION "1.93+"
-# usage: mkoffsets <output dir>
# automatically compute structure offsets for gcc targets in ELF format
+# (C) 2018 Kai-Uwe Bloem. This work is placed in the public domain.
+#
+# usage: mkoffsets <output dir>
CC=${CC:-gcc}
# endianness of target (automagically determined below)
ENDIAN=
+# compile with target C compiler and extract value from .rodata section
compile_rodata ()
{
$CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1
+ # find the name of the .rodata section (in case -fdata-sections is used)
rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' |
sed 's/^[^.]*././;s/ .*//')
+ # read out .rodata section as hex string (should be only 4 or 8 bytes)
objcopy --dump-section $rosect=/tmp/getoffs.ro /tmp/getoffs.o || exit 1
ro=$(xxd -ps /tmp/getoffs.ro)
if [ "$ENDIAN" = "le" ]; then
else
hex=$ro
fi
+ # extract decimal value from hex string
rodata=$(printf "%d" 0x$hex)
}
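
A hedged illustration of the idea behind compile_rodata (the struct, member and values below are made up, not what the script actually generates): the throwaway C file places the wanted value in .rodata as its only object, so the build above can read it back from the object file with readelf/objcopy instead of having to run code on the target.

/* hypothetical /tmp/getoffs.c contents, illustrative only */
#include <stddef.h>

struct example {
	int   a;
	short b;
	char  c;
};

/* the value to extract; it becomes the lone .rodata object */
const unsigned int val = offsetof(struct example, c);
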
+# determine member offset and create #define
get_define () # prefix struct member member...
{
prefix=$1; shift