From d40a5af495db6c91d9c4469ac650bc95e6b7a4d5 Mon Sep 17 00:00:00 2001 From: kub Date: Tue, 16 Apr 2019 20:37:52 +0200 Subject: [PATCH] various small improvements and fixes --- Makefile | 6 +- config.caanoo | 6 +- config.caanoo47 | 4 +- config.dingux | 6 +- config.dingux54 | 6 +- config.gp2x | 4 +- config.gp2x47 | 4 +- config.i386 | 14 +++ config.x86 | 8 +- cpu/cz80/cz80.c | 1 + cpu/drc/cmn.h | 6 - cpu/drc/emit_arm.c | 56 +++++++--- cpu/drc/emit_x86.c | 68 +++++++----- cpu/sh2/compiler.c | 194 ++++++++++++++++++++++++--------- cpu/sh2/mame/sh2pico.c | 2 +- cpu/sh2/sh2.h | 4 +- pico/32x/32x.c | 28 ++--- pico/32x/draw_arm.S | 20 ++-- pico/32x/memory.c | 8 +- pico/32x/memory_arm.S | 76 ++++++------- pico/cd/gfx_dma.c | 4 - pico/cd/memory_arm.S | 2 +- pico/draw2_arm.S | 2 +- pico/draw_arm.S | 2 +- pico/memory.h | 5 - pico/memory_amips.S | 2 +- pico/memory_arm.S | 2 +- pico/pico_int.h | 18 ++- platform/common/common.mak | 1 + platform/common/memcpy.c | 37 ++++--- platform/gp2x/code940/memcpy.s | 12 +- tools/mkoffsets.sh | 5 +- 32 files changed, 372 insertions(+), 241 deletions(-) create mode 100644 config.i386 diff --git a/Makefile b/Makefile index a0e63a47..d82961eb 100644 --- a/Makefile +++ b/Makefile @@ -195,10 +195,10 @@ LDFLAGS += -Wl,-Map=$(TARGET).map endif -target_: pico/pico_int_o32.h $(TARGET) +target_: pico/pico_int_offs.h $(TARGET) clean: - $(RM) $(TARGET) $(OBJS) pico/pico_int_o32.h + $(RM) $(TARGET) $(OBJS) pico/pico_int_offs.h $(RM) -r .opk_data $(TARGET): $(OBJS) @@ -211,7 +211,7 @@ endif pprof: platform/linux/pprof.c $(CC) $(CFLAGS) -O2 -ggdb -DPPROF -DPPROF_TOOL -I../../ -I. 
$^ -o $@ $(LDFLAGS) $(LDLIBS) -pico/pico_int_o32.h:: tools/mkoffsets.sh +pico/pico_int_offs.h:: tools/mkoffsets.sh make -C tools/ XCC="$(CC)" XCFLAGS="$(CFLAGS)" .s.o: diff --git a/config.caanoo b/config.caanoo index 39edb5db..dd053bc5 100644 --- a/config.caanoo +++ b/config.caanoo @@ -4,11 +4,11 @@ CC = arm-gph-linux-gnueabi-gcc CXX = arm-gph-linux-gnueabi-g++ AS = arm-gph-linux-gnueabi-as STRIP = arm-gph-linux-gnueabi-strip -CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -fno-stack-protector -D__GP2X__ -DGPERF +CFLAGS += -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -fno-stack-protector -D__GP2X__ CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers -CFLAGS += -I/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I/home/build/src/gp2x/armroot-eabi/include +CFLAGS += -I${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I${HOME}/src/gp2x/armroot-eabi/include ASFLAGS += -mfloat-abi=soft -mcpu=arm920t -LDFLAGS += -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/src/gp2x/armroot-eabi/lib -static +LDFLAGS += -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/src/gp2x/armroot-eabi/lib -static LDLIBS += -lpng -lm -ldl ARCH = arm diff --git a/config.caanoo47 b/config.caanoo47 index f3efde0f..2c0ee5af 100644 --- a/config.caanoo47 +++ b/config.caanoo47 @@ -6,9 +6,9 @@ AS = arm-linux-gnueabi-as STRIP = arm-linux-gnueabi-strip CFLAGS 
+= -mfloat-abi=soft -mcpu=arm920t -mtune=arm920t -Wno-unused-result -fno-stack-protector -D__GP2X__ CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers -CFLAGS += -I/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I/home/build/src/gp2x/armroot-eabi/include +CFLAGS += -I${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/include -I${HOME}/src/gp2x/armroot-eabi/include ASFLAGS += -mfloat-abi=soft -mcpu=arm920t -LDFLAGS += -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L/home/build/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -static +LDFLAGS += -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/lib/gcc/arm-gph-linux-gnueabi/4.2.4 -B${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -L${HOME}/opt/GPH_SDK/tools/gcc-4.2.4-glibc-2.7-eabi/arm-gph-linux-gnueabi/sys-root/usr/lib -static LDLIBS += -lpng -lm -ldl ARCH = arm diff --git a/config.dingux b/config.dingux index 6611991c..8aca06a6 100644 --- a/config.dingux +++ b/config.dingux @@ -4,12 +4,12 @@ CC = mipsel-linux-gcc CXX = mipsel-linux-g++ AS = mipsel-linux-as STRIP = mipsel-linux-strip -CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/ -CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/SDL +CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/ +CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/SDL CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector ASFLAGS += LDFLAGS += -LDLIBS += -B/home/build/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lm -lstdc++ -ldl 
+LDLIBS += -B${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lm -lstdc++ -ldl ARCH = mipsel PLATFORM = opendingux diff --git a/config.dingux54 b/config.dingux54 index 96e55014..5f292652 100644 --- a/config.dingux54 +++ b/config.dingux54 @@ -4,12 +4,12 @@ CC = mipsel-linux-gnu-gcc CXX = mipsel-linux-gnu-g++ AS = mipsel-linux-gnu-as STRIP = mipsel-linux-gnu-strip -CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/ -CFLAGS += -I/home/build/opt/opendingux-toolchain/usr/include/SDL +CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/ +CFLAGS += -I${HOME}/opt/opendingux-toolchain/usr/include/SDL CFLAGS += -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -fno-stack-protector ASFLAGS += LDFLAGS += -LDLIBS += -B/home/build/opt/opendingux-toolchain/usr/lib -B/home/build/opt/opendingux-toolchain/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=/home/build/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lz -lm -lstdc++ -ldl +LDLIBS += -B${HOME}/opt/opendingux-toolchain/usr/lib -B${HOME}/opt/opendingux-toolchain/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/usr/lib -Wl,-rpath-link=${HOME}/opt/opendingux-toolchain/lib -lSDL -lasound -lpng -lz -lm -lstdc++ -ldl ARCH = mipsel PLATFORM = opendingux diff --git a/config.gp2x b/config.gp2x index de3e47c4..248d73aa 100644 --- a/config.gp2x +++ b/config.gp2x @@ -5,10 +5,10 @@ CXX = arm-open2x-linux-g++ AS = arm-open2x-linux-as STRIP = arm-open2x-linux-strip CFLAGS += -msoft-float -mcpu=arm920t -mtune=arm920t -D__GP2X__ -CFLAGS += -I/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I/home/build/src/gp2x/armroot/include +CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution 
-frename-registers ASFLAGS += -mcpu=arm920t -mfloat-abi=soft -LDFLAGS += --sysroot /home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L/home/build/src/gp2x/armroot/lib -static +LDFLAGS += --sysroot ${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/src/gp2x/armroot/lib -static LDLIBS += -lpng -lm -ldl ARCH = arm diff --git a/config.gp2x47 b/config.gp2x47 index 1022166d..21769ada 100644 --- a/config.gp2x47 +++ b/config.gp2x47 @@ -5,10 +5,10 @@ CXX = arm-linux-gnueabi-g++ AS = arm-linux-gnueabi-as STRIP = arm-linux-gnueabi-strip CFLAGS += -mabi=apcs-gnu -mno-thumb-interwork -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -mtune=arm920t -Wno-unused-result -fno-stack-protector -D__GP2X__ -CFLAGS += -I/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I/home/build/src/gp2x/armroot/include +CFLAGS += -I${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -I${HOME}/src/gp2x/armroot/include CFLAGS += -fno-gcse -funswitch-loops -fweb -ftree-loop-im #-fpredictive-commoning -ftree-loop-distribution -frename-registers ASFLAGS += -mabi=apcs-gnu -mfloat-abi=soft -mfpu=fpa -mcpu=arm920t -LDFLAGS += -mabi=apcs-gnu -mfpu=fpa -B/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L/home/build/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L/home/build/src/gp2x/armroot/lib -static +LDFLAGS += -mabi=apcs-gnu -mfpu=fpa -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/opt/open2x/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L${HOME}/src/gp2x/armroot/lib -static LDLIBS += -lpng -lm -ldl ARCH = arm diff --git a/config.i386 b/config.i386 new file mode 100644 index 00000000..ce07b103 --- /dev/null +++ 
b/config.i386 @@ -0,0 +1,14 @@ +# Automatically generated by configure +# Configured with: './configure' '--platform=generic' +CC = gcc +CXX = g++ +AS = as +STRIP = strip +CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -m32 # -pg +ASFLAGS += +LDFLAGS += -m32 #-pg +LDLIBS += -L/usr/lib/i386-linux-gnu -L${HOME}/opt/lib32 -lSDL-1.2 -lasound -lpng -lz -lm -ldl + +ARCH = i386 +PLATFORM = generic +SOUND_DRIVERS = oss alsa sdl diff --git a/config.x86 b/config.x86 index d463157e..287b82d3 100644 --- a/config.x86 +++ b/config.x86 @@ -4,11 +4,11 @@ CC = gcc CXX = g++ AS = as STRIP = strip -CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result -m32 # -pg +CFLAGS += -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT -Wno-unused-result # -pg ASFLAGS += -LDFLAGS += -m32 #-pg -LDLIBS += -L/usr/lib/i386-linux-gnu/debug -L/home/build/opt/lib32 -lSDL-1.2 -lasound -lpng -lz -lm -ldl +LDFLAGS += #-pg +LDLIBS += -L/usr/lib/x86_64-linux-gnu -lSDL-1.2 -lasound -lpng -lz -lm -ldl -ARCH = x86 +ARCH = x86_64 PLATFORM = generic SOUND_DRIVERS = oss alsa sdl diff --git a/cpu/cz80/cz80.c b/cpu/cz80/cz80.c index 61ca5f84..0326b0b8 100644 --- a/cpu/cz80/cz80.c +++ b/cpu/cz80/cz80.c @@ -14,6 +14,7 @@ #include "cz80.h" #if PICODRIVE_HACKS +#include #include #endif diff --git a/cpu/drc/cmn.h b/cpu/drc/cmn.h index 7d50d33d..bad02a1b 100644 --- a/cpu/drc/cmn.h +++ b/cpu/drc/cmn.h @@ -1,9 +1,3 @@ -typedef unsigned char u8; -typedef signed char s8; -typedef unsigned short u16; -typedef signed short s16; -typedef unsigned int u32; -typedef signed int s32; #define DRC_TCACHE_SIZE (4*1024*1024) diff --git a/cpu/drc/emit_arm.c b/cpu/drc/emit_arm.c index 3f782bb6..4744b127 100644 --- a/cpu/drc/emit_arm.c +++ b/cpu/drc/emit_arm.c @@ -177,26 +177,25 @@ #define EOP_C_AM3_REG(cond,u,l,rn,rd,s,h,rm) EOP_C_AM3(cond,u,0,l,rn,rd,s,h,rm) /* ldr and str */ -#define EOP_LDR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,1,0,1,rn,rd,offset_12) -#define 
EOP_LDRB_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,1,1,1,rn,rd,offset_12) +#define EOP_LDR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,0,1,rn,rd,abs(offset_12)) +#define EOP_LDRB_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,1,1,rn,rd,abs(offset_12)) #define EOP_STR_IMM2(cond,rd,rn,offset_12) EOP_C_AM2_IMM(cond,(offset_12) >= 0,0,0,rn,rd,abs(offset_12)) -#define EOP_LDR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,1,0,1,rn,rd,offset_12) -#define EOP_LDR_NEGIMM(rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,0,0,1,rn,rd,offset_12) +#define EOP_LDR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,(offset_12) >= 0,0,1,rn,rd,abs(offset_12)) #define EOP_LDR_SIMPLE(rd,rn) EOP_C_AM2_IMM(A_COND_AL,1,0,1,rn,rd,0) -#define EOP_STR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,1,0,0,rn,rd,offset_12) +#define EOP_STR_IMM( rd,rn,offset_12) EOP_C_AM2_IMM(A_COND_AL,(offset_12) >= 0,0,0,rn,rd,abs(offset_12)) #define EOP_STR_SIMPLE(rd,rn) EOP_C_AM2_IMM(A_COND_AL,1,0,0,rn,rd,0) #define EOP_LDR_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,0,1,rn,rd,shift_imm,A_AM1_LSL,rm) #define EOP_LDRB_REG_LSL(cond,rd,rn,rm,shift_imm) EOP_C_AM2_REG(cond,1,1,1,rn,rd,shift_imm,A_AM1_LSL,rm); -#define EOP_LDRH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,1,1,rn,rd,0,1,offset_8) +#define EOP_LDRH_IMM2(cond,rd,rn,offset_8) EOP_C_AM3_IMM(cond,(offset_8) >= 0,1,rn,rd,0,1,abs(offset_8)) #define EOP_LDRH_REG2(cond,rd,rn,rm) EOP_C_AM3_REG(cond,1,1,rn,rd,0,1,rm) -#define EOP_LDRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,1,1,rn,rd,0,1,offset_8) +#define EOP_LDRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,(offset_8) >= 0,1,rn,rd,0,1,abs(offset_8)) #define EOP_LDRH_SIMPLE(rd,rn) EOP_C_AM3_IMM(A_COND_AL,1,1,rn,rd,0,1,0) #define EOP_LDRH_REG( rd,rn,rm) EOP_C_AM3_REG(A_COND_AL,1,1,rn,rd,0,1,rm) -#define EOP_STRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,1,0,rn,rd,0,1,offset_8) +#define EOP_STRH_IMM( rd,rn,offset_8) EOP_C_AM3_IMM(A_COND_AL,(offset_8) >= 
0,0,rn,rd,0,1,abs(offset_8)) #define EOP_STRH_SIMPLE(rd,rn) EOP_C_AM3_IMM(A_COND_AL,1,0,rn,rd,0,1,0) #define EOP_STRH_REG( rd,rn,rm) EOP_C_AM3_REG(A_COND_AL,1,0,rn,rd,0,1,rm) @@ -285,11 +284,29 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int imm = ~imm; op = A_OP_MVN; } +#ifdef HAVE_ARMV7 + for (v = imm, ror2 = 0; v && !(v & 3); v >>= 2) + ror2--; + if (v >> 8) { + /* 2+ insns needed - prefer movw/movt */ + if (op == A_OP_MVN) + imm = ~imm; + EOP_MOVW(rd, imm); + if (imm & 0xffff0000) + EOP_MOVT(rd, imm); + return; + } +#endif break; - case A_OP_EOR: case A_OP_SUB: case A_OP_ADD: + // count bits in imm and swap ADD and SUB if more bits 1 than 0 + if (s == 0 && count_bits(imm) > 16) { + imm = -imm; + op ^= (A_OP_ADD^A_OP_SUB); + } + case A_OP_EOR: case A_OP_ORR: case A_OP_BIC: if (s == 0 && imm == 0 && rd == rn) @@ -412,6 +429,8 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_add_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,0,d,s1,s2,A_AM1_LSL,lslimm) +#define emith_add_r_r_r_lsl_ptr(d, s1, s2, lslimm) \ + emith_add_r_r_r_lsl(d, s1, s2, lslimm) #define emith_addf_r_r_r_lsl(d, s1, s2, lslimm) \ EOP_ADD_REG(A_COND_AL,1,d,s1,s2,A_AM1_LSL,lslimm) @@ -483,7 +502,7 @@ static int emith_xbranch(int cond, void *target, int is_call) emith_add_r_r_r(d, d, s) #define emith_sub_r_r(d, s) \ - EOP_SUB_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) + emith_sub_r_r_r(d, d, s) #define emith_adc_r_r(d, s) \ EOP_ADC_REG(A_COND_AL,0,d,d,s,A_AM1_LSL,0) @@ -529,6 +548,9 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_move_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_MOV, r, imm) +#define emith_move_r_ptr_imm(r, imm) \ + emith_move_r_imm(r, (u32)(imm)) + #define emith_add_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_ADD, r, imm) @@ -536,7 +558,7 @@ static int emith_xbranch(int cond, void *target, int is_call) emith_op_imm(A_COND_AL, 0, A_OP_ADC, r, imm) #define emith_adcf_r_imm(r, imm) \ - 
emith_op_imm(A_COND_AL, 1, A_OP_ADC, r, (imm)) + emith_op_imm(A_COND_AL, 1, A_OP_ADC, r, imm) #define emith_sub_r_imm(r, imm) \ emith_op_imm(A_COND_AL, 0, A_OP_SUB, r, imm) @@ -610,13 +632,13 @@ static int emith_xbranch(int cond, void *target, int is_call) emith_op_imm2(A_COND_AL, 0, A_OP_SUB, d, s, imm) #define emith_subf_r_r_imm(d, s, imm) \ - emith_op_imm2(A_COND_AL, 1, A_OP_SUB, d, s, (imm)) + emith_op_imm2(A_COND_AL, 1, A_OP_SUB, d, s, imm) #define emith_or_r_r_imm(d, s, imm) \ - emith_op_imm2(A_COND_AL, 0, A_OP_ORR, d, s, (imm)) + emith_op_imm2(A_COND_AL, 0, A_OP_ORR, d, s, imm) #define emith_eor_r_r_imm(d, s, imm) \ - emith_op_imm2(A_COND_AL, 0, A_OP_EOR, d, s, (imm)) + emith_op_imm2(A_COND_AL, 0, A_OP_EOR, d, s, imm) #define emith_neg_r_r(d, s) \ EOP_RSB_IMM(d, s, 0, 0) @@ -758,7 +780,7 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_clear_msb_c(cond, d, s, count) { \ u32 t; \ if ((count) <= 8) { \ - t = (count) - 8; \ + t = 8 - (count); \ t = (0xff << t) & 0xff; \ EOP_C_DOP_IMM(cond,A_OP_BIC,0,s,d,8/2,t); \ } else if ((count) >= 24) { \ @@ -880,7 +902,9 @@ static int emith_xbranch(int cond, void *target, int is_call) #define emith_sh2_rcall(a, tab, func, mask) { \ emith_lsr(mask, a, SH2_READ_SHIFT); \ EOP_ADD_REG_LSL(tab, tab, mask, 3); \ - EOP_LDMIA(tab, (1<is_slave; -if (sh2 != &sh2s[0] && sh2 != &sh2s[1]) printf("sh2 %p?\n",sh2); if (!trace[0]) { truncate("pico.trace", 0); trace[0] = fopen("pico.trace0", "wb"); @@ -199,7 +199,8 @@ if (sh2 != &sh2s[0] && sh2 != &sh2s[1]) printf("sh2 %p?\n",sh2); if (csh2[idx][0].pc != sh2->pc) { fwrite(sh2, offsetof(SH2, read8_map), 1, trace[idx]); fwrite(&sh2->pdb_io_csum, sizeof(sh2->pdb_io_csum), 1, trace[idx]); - memcpy(&csh2[idx][0], sh2, offsetof(SH2, icount)); + memcpy(&csh2[idx][0], sh2, offsetof(SH2, poll_cnt)+4); + csh2[idx][0].is_slave = idx; } } #elif (DRC_DEBUG & 512) @@ -234,9 +235,10 @@ if (sh2 != &sh2s[0] && sh2 != &sh2s[1]) printf("sh2 %p?\n",sh2); #elif (DRC_DEBUG & 1024) 
{ int x = sh2->is_slave, i; - for (i = 0; i < ARRAY_SIZE(csh2[x]); i++) - memcpy(&csh2[x][i], &csh2[x][i+1], offsetof(SH2, icount)); - memcpy(&csh2[x][3], sh2, offsetof(SH2, icount)); + for (i = 0; i < ARRAY_SIZE(csh2[x])-1; i++) + memcpy(&csh2[x][i], &csh2[x][i+1], offsetof(SH2, poll_cnt)+4); + memcpy(&csh2[x][ARRAY_SIZE(csh2[x])-1], sh2, offsetof(SH2, poll_cnt)+4); + csh2[x][0].is_slave = x; } #endif } @@ -252,9 +254,9 @@ if (sh2 != &sh2s[0] && sh2 != &sh2s[1]) printf("sh2 %p?\n",sh2); // and can be discarded early // XXX: need to tune sizes static const int tcache_sizes[TCACHE_BUFFERS] = { - DRC_TCACHE_SIZE * 6 / 8, // ROM (rarely used), DRAM - DRC_TCACHE_SIZE / 8, // BIOS, data array in master sh2 - DRC_TCACHE_SIZE / 8, // ... slave + DRC_TCACHE_SIZE * 14 / 16, // ROM (rarely used), DRAM + DRC_TCACHE_SIZE / 16, // BIOS, data array in master sh2 + DRC_TCACHE_SIZE / 16, // ... slave }; static u8 *tcache_bases[TCACHE_BUFFERS]; @@ -287,6 +289,9 @@ struct block_entry { #if (DRC_DEBUG & 2) struct block_desc *block; #endif +#if (DRC_DEBUG & 32) + int entry_count; +#endif }; struct block_desc { @@ -698,6 +703,14 @@ static void add_to_hashlist(struct block_entry *be, int tcache_id) (*head)->prev = be; be->next = *head; *head = be; + +#if (DRC_DEBUG & 2) + if (be->next != NULL) { + printf(" %08x: entry hash collision with %08x\n", + be->pc, be->next->pc); + hash_collisions++; + } +#endif } static void rm_from_hashlist(struct block_entry *be, int tcache_id) @@ -727,6 +740,14 @@ static void add_to_hashlist_unresolved(struct block_link *bl, int tcache_id) u32 tcmask = hash_table_sizes[tcache_id] - 1; struct block_link **head = &HASH_FUNC(unresolved_links[tcache_id], bl->target_pc, tcmask); +#if DRC_DEBUG & 1 + struct block_link *current = *head; + while (current != NULL && current != bl) + current = current->next; + if (current == bl) + dbg(1, "add_to_hashlist_unresolved @%p: bl %p %p %08x already in?", head, bl, bl->target, bl->target_pc); +#endif + bl->target = NULL; // 
marker for not resolved bl->prev = NULL; if (*head) @@ -745,7 +766,7 @@ static void rm_from_hashlist_unresolved(struct block_link *bl, int tcache_id) while (current->prev != NULL) current = current->prev; if (current != *head) - dbg(1, "rm_from_hashlist unresolved @%p: bl %p %p %08x missing?", head, bl, bl->target, bl->target_pc); + dbg(1, "rm_from_hashlist_unresolved @%p: bl %p %p %08x missing?", head, bl, bl->target, bl->target_pc); #endif if (bl->prev != NULL) @@ -980,10 +1001,12 @@ static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_sla struct block_entry *be = NULL; int target_tcache_id; + // get the target block entry be = dr_get_entry(pc, is_slave, &target_tcache_id); if (target_tcache_id && target_tcache_id != tcache_id) return sh2_drc_dispatcher; + // get a block link if (blink_free[tcache_id] != NULL) { bl = blink_free[tcache_id]; blink_free[tcache_id] = bl->next; @@ -995,6 +1018,7 @@ static void *dr_prepare_ext_branch(struct block_entry *owner, u32 pc, int is_sla block_link_pool_counts[tcache_id] = cnt+1; } + // prepare link and add to ougoing list of owner bl->tcache_id = tcache_id; bl->target_pc = pc; bl->jump = tcache_ptr; @@ -1940,6 +1964,7 @@ static void rcache_invalidate(void) cache_regs[i].type = HR_FREE; cache_regs[i].gregs = 0; } + for (i = 0; i < ARRAY_SIZE(guest_regs); i++) { guest_regs[i].flags &= GRF_STATIC; if (!(guest_regs[i].flags & GRF_STATIC)) @@ -1953,7 +1978,8 @@ static void rcache_invalidate(void) cache_regs[guest_regs[i].sreg].gregs = 1 << i; guest_regs[i].vreg = guest_regs[i].sreg; } - }; + } + rcache_counter = 0; rcache_hint_soon = rcache_hint_late = 0; @@ -2005,6 +2031,7 @@ static int emit_get_rbase_and_offs(SH2 *sh2, u32 a, u32 *offs) u32 mask = 0; int poffs; int hr; + unsigned long la; poffs = dr_ctx_get_mem_ptr(a, &mask); if (poffs == -1) @@ -2014,15 +2041,16 @@ static int emit_get_rbase_and_offs(SH2 *sh2, u32 a, u32 *offs) if (mask < 0x1000) { // can't access data array or BIOS directly from ROM or 
SDRAM, // since code may run on both SH2s (tcache_id of translation block needed)) - emith_ctx_read(hr, poffs); + emith_ctx_read_ptr(hr, poffs); if (a & mask & ~omask) - emith_add_r_imm(hr, a & mask & ~omask); + emith_add_r_r_ptr_imm(hr, hr, a & mask & ~omask); + *offs = a & omask; } else { // known fixed host address - a = (a & mask) + *(u32 *)((char *)sh2 + poffs); - emith_move_r_imm(hr, (a & ~omask)); + la = (unsigned long)*(void **)((char *)sh2 + poffs) + (a & mask); + *offs = la & omask; + emith_move_r_ptr_imm(hr, la & ~omask); } - *offs = a & omask; return hr; } @@ -2392,8 +2420,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) void *branch_patch_ptr[MAX_LOCAL_BRANCHES]; u32 branch_patch_pc[MAX_LOCAL_BRANCHES]; int branch_patch_count = 0; - u32 literal_addr[MAX_LITERALS]; - int literal_addr_count = 0; u8 op_flags[BLOCK_INSN_LIMIT]; struct { u32 test_irq:1; @@ -2473,7 +2499,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) { u32 delay_dep_fw = 0, delay_dep_bk = 0; int tmp3, tmp4; - u32 sr; + int sr; opd = &ops[i]; op = FETCH_OP(pc); @@ -2487,7 +2513,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) pc, op, sh2dasm_buff); #endif - if ((op_flags[i] & OF_BTARGET) || pc == base_pc) + if (op_flags[i] & OF_BTARGET) { if (pc != base_pc) { @@ -2517,6 +2543,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) else { dbg(1, "too many entryp for block #%d,%d pc=%08x", tcache_id, blkid_main, pc); + break; } } else { entry = block->entryp; @@ -2537,10 +2564,10 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) #if (DRC_DEBUG & 0x10) rcache_get_reg_arg(0, SHR_PC, NULL); - tmp = emit_memhandler_read(2); + tmp = emit_memhandler_read(1); tmp2 = rcache_get_tmp(); tmp3 = rcache_get_tmp(); - emith_move_r_imm(tmp2, FETCH32(pc)); + emith_move_r_imm(tmp2, (s16)FETCH_OP(pc)); emith_move_r_imm(tmp3, 0); emith_cmp_r_r(tmp, tmp2); EMITH_SJMP_START(DCOND_EQ); @@ -2556,9 +2583,20 @@ static void REGPARM(2) 
*sh2_translate(SH2 *sh2, int tcache_id) emith_cmp_r_imm(sr, 0); emith_jump_cond(DCOND_LE, sh2_drc_exit); +#if (DRC_DEBUG & 32) + // block hit counter + tmp = rcache_get_tmp_arg(0); + tmp2 = rcache_get_tmp_arg(1); + emith_move_r_ptr_imm(tmp, (uptr)entry); + emith_read_r_r_offs(tmp2, tmp, offsetof(struct block_entry, entry_count)); + emith_add_r_imm(tmp2, 1); + emith_write_r_r_offs(tmp2, tmp, offsetof(struct block_entry, entry_count)); + rcache_free_tmp(tmp); + rcache_free_tmp(tmp2); +#endif + #if (DRC_DEBUG & (8|256|512|1024)) sr = rcache_get_reg(SHR_SR, RC_GR_RMW, NULL); - FLUSH_CYCLES(sr); rcache_clean(); tmp = rcache_used_hreg_mask(); emith_save_caller_regs(tmp); @@ -2566,7 +2604,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) rcache_get_reg_arg(2, SHR_SR, NULL); tmp2 = rcache_get_tmp_arg(0); tmp3 = rcache_get_tmp_arg(1); - emith_move_r_imm(tmp2, (u32)tcache_ptr); + emith_move_r_ptr_imm(tmp2, tcache_ptr); emith_move_r_r_ptr(tmp3,CONTEXT_REG); emith_call(sh2_drc_log_entry); emith_restore_caller_regs(tmp); @@ -2776,7 +2814,6 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) if ((opd->imm && opd->imm >= base_pc && opd->imm < end_literals) || dr_is_rom(opd->imm)) { - ADD_TO_ARRAY(literal_addr, literal_addr_count, opd->imm,); if (opd->size == 2) u = FETCH32(opd->imm); else @@ -2862,8 +2899,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x06: // MOV.L Rm,@(R0,Rn) 0000nnnnmmmm0110 emit_indirect_indexed_write(sh2, GET_Rm(), SHR_R0, GET_Rn(), op & 3); goto end_op; - case 0x07: - // MUL.L Rm,Rn 0000nnnnmmmm0111 + case 0x07: // MUL.L Rm,Rn 0000nnnnmmmm0111 tmp = rcache_get_reg(GET_Rn(), RC_GR_READ, NULL); tmp2 = rcache_get_reg(GET_Rm(), RC_GR_READ, NULL); tmp3 = rcache_get_reg(SHR_MACL, RC_GR_WRITE, NULL); @@ -2941,8 +2977,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; ///////////////////////////////////////////// - case 0x01: - // MOV.L Rm,@(disp,Rn) 0001nnnnmmmmdddd + case 
0x01: // MOV.L Rm,@(disp,Rn) 0001nnnnmmmmdddd emit_memhandler_write_rr(sh2, GET_Rm(), GET_Rn(), (op & 0x0f) * 4, 2); goto end_op; @@ -3346,19 +3381,16 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) case 0x09: switch (GET_Fx()) { - case 0: - // SHLL2 Rn 0100nnnn00001000 - // SHLR2 Rn 0100nnnn00001001 + case 0: // SHLL2 Rn 0100nnnn00001000 + // SHLR2 Rn 0100nnnn00001001 tmp = 2; break; - case 1: - // SHLL8 Rn 0100nnnn00011000 - // SHLR8 Rn 0100nnnn00011001 + case 1: // SHLL8 Rn 0100nnnn00011000 + // SHLR8 Rn 0100nnnn00011001 tmp = 8; break; - case 2: - // SHLL16 Rn 0100nnnn00101000 - // SHLR16 Rn 0100nnnn00101001 + case 2: // SHLL16 Rn 0100nnnn00101000 + // SHLR16 Rn 0100nnnn00101001 tmp = 16; break; default: @@ -3432,8 +3464,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) } else emit_move_r_r(tmp2, GET_Rn()); goto end_op; - case 0x0f: - // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 + case 0x0f: // MAC.W @Rm+,@Rn+ 0100nnnnmmmm1111 emit_indirect_read_double(sh2, &tmp, &tmp2, GET_Rn(), GET_Rm(), 1); sr = rcache_get_reg(SHR_SR, RC_GR_READ, NULL); tmp3 = rcache_get_reg(SHR_MACL, RC_GR_RMW, NULL); @@ -3446,8 +3477,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; ///////////////////////////////////////////// - case 0x05: - // MOV.L @(disp,Rm),Rn 0101nnnnmmmmdddd + case 0x05: // MOV.L @(disp,Rm),Rn 0101nnnnmmmmdddd emit_memhandler_read_rr(sh2, GET_Rn(), GET_Rm(), (op & 0x0f) * 4, 2); goto end_op; @@ -3519,8 +3549,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; ///////////////////////////////////////////// - case 0x07: - // ADD #imm,Rn 0111nnnniiiiiiii + case 0x07: // ADD #imm,Rn 0111nnnniiiiiiii tmp = rcache_get_reg(GET_Rn(), RC_GR_RMW, &tmp2); if (op & 0x80) { // adding negative emith_sub_r_r_imm(tmp, tmp2, -op & 0xff); @@ -3621,8 +3650,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id) goto default_; ///////////////////////////////////////////// - case 0x0e: 
- // MOV #imm,Rn 1110nnnniiiiiiii + case 0x0e: // MOV #imm,Rn 1110nnnniiiiiiii emit_move_r_imm32(GET_Rn(), (s8)op); goto end_op; @@ -3886,9 +3914,7 @@ static void sh2_generate_utils(void) #if BRANCH_CACHE // check if PC is in branch target cache emith_and_r_r_imm(arg1, arg0, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); - // TODO implement emith_add_r_r_r_lsl_ptr, saves one insn on 32bit ARM - emith_lsl(arg1, arg1, sizeof(void *) == 8 ? 2 : 1); - emith_add_r_r_ptr(arg1, CONTEXT_REG); + emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 2 : 1); emith_read_r_r_offs(arg2, arg1, offsetof(SH2, branch_cache)); emith_cmp_r_r(arg2, arg0); EMITH_SJMP_START(DCOND_NE); @@ -3905,8 +3931,7 @@ static void sh2_generate_utils(void) EMITH_SJMP_START(DCOND_EQ); emith_ctx_read_c(DCOND_NE, arg2, SHR_PC * 4); emith_and_r_r_imm(arg1, arg2, (ARRAY_SIZE(sh2s->branch_cache)-1)*4); - emith_lsl(arg1, arg1, sizeof(void *) == 8 ? 2 : 1); - emith_add_r_r_ptr(arg1, CONTEXT_REG); + emith_add_r_r_r_lsl_ptr(arg1, CONTEXT_REG, arg1, sizeof(void *) == 8 ? 
2 : 1); emith_write_r_r_offs_c(DCOND_NE, arg2, arg1, offsetof(SH2, branch_cache)); emith_write_r_r_offs_ptr_c(DCOND_NE, RET_REG, arg1, offsetof(SH2, branch_cache) + sizeof(void *)); EMITH_SJMP_END(DCOND_EQ); @@ -4174,7 +4199,8 @@ int sh2_execute_drc(SH2 *sh2c, int cycles) static void block_stats(void) { #if (DRC_DEBUG & 2) - int c, b, i, total = 0; + int c, b, i; + long total = 0; printf("block stats:\n"); for (b = 0; b < ARRAY_SIZE(block_tables); b++) { @@ -4185,8 +4211,9 @@ static void block_stats(void) if (block_tables[b][i].addr != 0) total += block_tables[b][i].refcount; } + printf("total: %ld\n",total); - for (c = 0; c < 10; c++) { + for (c = 0; c < 20; c++) { struct block_desc *blk, *maxb = NULL; int max = 0; for (b = 0; b < ARRAY_SIZE(block_tables); b++) { @@ -4221,6 +4248,63 @@ static void block_stats(void) #endif } +void entry_stats(void) +{ +#if (DRC_DEBUG & 32) + int c, b, i, j; + long total = 0; + + printf("block entry stats:\n"); + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { + for (i = 0; i < block_counts[b]; i++) + for (j = 0; j < block_tables[b][i].entry_count; j++) + total += block_tables[b][i].entryp[j].entry_count; + for (i = block_limit[b]; i < block_max_counts[b]; i++) + for (j = 0; j < block_tables[b][i].entry_count; j++) + total += block_tables[b][i].entryp[j].entry_count; + } + printf("total: %ld\n",total); + + for (c = 0; c < 20; c++) { + struct block_desc *blk; + struct block_entry *maxb = NULL; + int max = 0; + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { + for (i = 0; i < block_counts[b]; i++) { + blk = &block_tables[b][i]; + for (j = 0; j < blk->entry_count; j++) + if (blk->entryp[j].entry_count > max) { + max = blk->entryp[j].entry_count; + maxb = &blk->entryp[j]; + } + } + for (i = block_limit[b]; i < block_max_counts[b]; i++) { + blk = &block_tables[b][i]; + for (j = 0; j < blk->entry_count; j++) + if (blk->entryp[j].entry_count > max) { + max = blk->entryp[j].entry_count; + maxb = &blk->entryp[j]; + } + } + } + if (maxb == 
NULL) + break; + printf("%08x %p %9d %2.3f%%\n", maxb->pc, maxb->tcache_ptr, maxb->entry_count, + (double)100 * maxb->entry_count / total); + maxb->entry_count = 0; + } + + for (b = 0; b < ARRAY_SIZE(block_tables); b++) { + for (i = 0; i < block_counts[b]; i++) + for (j = 0; j < block_tables[b][i].entry_count; j++) + block_tables[b][i].entryp[j].entry_count = 0; + for (i = block_limit[b]; i < block_max_counts[b]; i++) + for (j = 0; j < block_tables[b][i].entry_count; j++) + block_tables[b][i].entryp[j].entry_count = 0; + } +#endif +} + static void backtrace(void) { #if (DRC_DEBUG & 1024) @@ -4279,6 +4363,7 @@ void sh2_drc_flush_all(void) backtrace(); state_dump(); block_stats(); + entry_stats(); flush_tcache(0); flush_tcache(1); flush_tcache(2); @@ -4364,6 +4449,7 @@ int sh2_drc_init(SH2 *sh2) hash_collisions = 0; #endif } + memset(sh2->branch_cache, -1, sizeof(sh2->branch_cache)); return 0; diff --git a/cpu/sh2/mame/sh2pico.c b/cpu/sh2/mame/sh2pico.c index 636ebc6f..f9d30d77 100644 --- a/cpu/sh2/mame/sh2pico.c +++ b/cpu/sh2/mame/sh2pico.c @@ -214,7 +214,7 @@ int sh2_execute_interpreter(SH2 *sh2, int cycles) if (sh2->pc < *base_pc || sh2->pc >= *end_pc) { *base_pc = sh2->pc; scan_block(*base_pc, sh2->is_slave, - op_flags, end_pc, NULL); + op_flags, end_pc, NULL, NULL); } if ((op_flags[(sh2->pc - *base_pc) / 2] & OF_BTARGET) || sh2->pc == *base_pc diff --git a/cpu/sh2/sh2.h b/cpu/sh2/sh2.h index e53bbf05..5a0661ea 100644 --- a/cpu/sh2/sh2.h +++ b/cpu/sh2/sh2.h @@ -81,9 +81,9 @@ typedef struct SH2_ #define CYCLE_MULT_SHIFT 10 #define C_M68K_TO_SH2(xsh2, c) \ - ((int)((long long)(c) * (xsh2)->mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) + (int)(((unsigned long long)(c) * (xsh2)->mult_m68k_to_sh2) >> CYCLE_MULT_SHIFT) #define C_SH2_TO_M68K(xsh2, c) \ - ((int)((long long)(c+3) * (xsh2)->mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) + (int)(((unsigned long long)(c+3U) * (xsh2)->mult_sh2_to_m68k) >> CYCLE_MULT_SHIFT) int sh2_init(SH2 *sh2, int is_slave, SH2 *other_sh2); void 
sh2_finish(SH2 *sh2); diff --git a/pico/32x/32x.c b/pico/32x/32x.c index a15cb112..4e8377eb 100644 --- a/pico/32x/32x.c +++ b/pico/32x/32x.c @@ -30,7 +30,7 @@ static int REGPARM(2) sh2_irq_cb(SH2 *sh2, int level) } // MUST specify active_sh2 when called from sh2 memhandlers -void p32x_update_irls(SH2 *active_sh2, int m68k_cycles) +void p32x_update_irls(SH2 *active_sh2, unsigned int m68k_cycles) { int irqs, mlvl = 0, slvl = 0; int mrun, srun; @@ -50,18 +50,18 @@ void p32x_update_irls(SH2 *active_sh2, int m68k_cycles) slvl++; slvl *= 2; - mrun = sh2_irl_irq(&msh2, mlvl, active_sh2 == &msh2); + mrun = sh2_irl_irq(&msh2, mlvl, msh2.state & SH2_STATE_RUN); if (mrun) { p32x_sh2_poll_event(&msh2, SH2_IDLE_STATES, m68k_cycles); - if (active_sh2 == &msh2) - sh2_end_run(active_sh2, 1); + if (msh2.state & SH2_STATE_RUN) + sh2_end_run(&msh2, 1); } - srun = sh2_irl_irq(&ssh2, slvl, active_sh2 == &ssh2); + srun = sh2_irl_irq(&ssh2, slvl, ssh2.state & SH2_STATE_RUN); if (srun) { p32x_sh2_poll_event(&ssh2, SH2_IDLE_STATES, m68k_cycles); - if (active_sh2 == &ssh2) - sh2_end_run(active_sh2, 1); + if (ssh2.state & SH2_STATE_RUN) + sh2_end_run(&ssh2, 1); } elprintf(EL_32X, "update_irls: m %d/%d, s %d/%d", mlvl, mrun, slvl, srun); @@ -70,7 +70,7 @@ void p32x_update_irls(SH2 *active_sh2, int m68k_cycles) // the mask register is inconsistent, CMD is supposed to be a mask, // while others are actually irq trigger enables? // TODO: test on hw.. 
-void p32x_trigger_irq(SH2 *sh2, int m68k_cycles, unsigned int mask) +void p32x_trigger_irq(SH2 *sh2, unsigned int m68k_cycles, unsigned int mask) { Pico32x.sh2irqs |= mask & P32XI_VRES; Pico32x.sh2irqi[0] |= mask & (Pico32x.sh2irq_mask[0] << 3); @@ -79,7 +79,7 @@ void p32x_trigger_irq(SH2 *sh2, int m68k_cycles, unsigned int mask) p32x_update_irls(sh2, m68k_cycles); } -void p32x_update_cmd_irq(SH2 *sh2, int m68k_cycles) +void p32x_update_cmd_irq(SH2 *sh2, unsigned int m68k_cycles) { if ((Pico32x.sh2irq_mask[0] & 2) && (Pico32x.regs[2 / 2] & 1)) Pico32x.sh2irqi[0] |= P32XI_CMD; @@ -207,8 +207,8 @@ void PicoReset32x(void) { if (PicoIn.AHW & PAHW_32X) { p32x_trigger_irq(NULL, SekCyclesDone(), P32XI_VRES); - p32x_sh2_poll_event(&msh2, SH2_IDLE_STATES, 0); - p32x_sh2_poll_event(&ssh2, SH2_IDLE_STATES, 0); + p32x_sh2_poll_event(&msh2, SH2_IDLE_STATES, SekCyclesDone()); + p32x_sh2_poll_event(&ssh2, SH2_IDLE_STATES, SekCyclesDone()); p32x_pwm_ctl_changed(); p32x_timers_recalc(); } @@ -258,7 +258,7 @@ static void p32x_start_blank(void) p32x_sh2_poll_event(&ssh2, SH2_STATE_VPOLL, SekCyclesDone()); } -void p32x_schedule_hint(SH2 *sh2, int m68k_cycles) +void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles) { // rather rough, 32x hint is useless in practice int after; @@ -370,9 +370,9 @@ static void p32x_run_events(unsigned int until) oldest, event_time_next); } -static void run_sh2(SH2 *sh2, int m68k_cycles) +static void run_sh2(SH2 *sh2, unsigned int m68k_cycles) { - int cycles, done; + unsigned int cycles, done; pevt_log_sh2_o(sh2, EVT_RUN_START); sh2->state |= SH2_STATE_RUN; diff --git a/pico/32x/draw_arm.S b/pico/32x/draw_arm.S index e91f9893..c59fa8f5 100644 --- a/pico/32x/draw_arm.S +++ b/pico/32x/draw_arm.S @@ -6,7 +6,7 @@ @* See COPYING file in the top-level directory. 
@* -#include "pico/pico_int_o32.h" +#include "pico/pico_int_offs.h" .extern Pico32x .extern Pico @@ -74,7 +74,7 @@ Pico32xNativePal: ldr lr,=Pico ldr r10,=Pico32x ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] - ldr r10,[r10, #0x40] @ Pico32x.vdp_regs[0] + ldrh r10,[r10, #0x40] @ Pico32x.vdp_regs[0] add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd and r4, r2, #0xff @@ -118,6 +118,8 @@ Pico32xNativePal: mov r7, r7, lsl #1 ldreqh r12,[r9, r7] streqh r12,[r0], #2 @ *dst++ = palmd[*pmd] +.else + addeq r0, r0, #2 .endif beq 2b @ loop_inner @@ -182,8 +184,8 @@ Pico32xNativePal: ldrneb r8, [r5, #2]! @ r7,r8 - pixel 0,1 index subs r6, r6, #1 blt 0b @ loop_outer -@ cmp r7, r8 @ is this really improving things? -@ beq 5f @ check_fill @ +8 + cmp r7, r8 @ is this really improving things? + beq 5f @ check_fill @ +8 3: @ no_fill: mov r12,r7, lsl #1 @@ -242,7 +244,7 @@ Pico32xNativePal: beq 6b 7: @ count_done - sub r5, r5, #4 @ undo readahead + sub r5, r5, #4 @ undo readahead @ fix alignment and check type sub r8, r5, lr @@ -268,14 +270,14 @@ Pico32xNativePal: b 2b @ loop_inner 9: @ bg_mode: - ldrb r12,[r11],#1 @ MD pixel + ldrb r12,[r11],#1 @ MD pixel 0,1 ldrb lr, [r11],#1 - cmp r3, lr, lsl #26 @ MD has bg pixel? + cmp r3, r12,lsl #26 @ MD pixel 0 has bg? .if \do_md mov r12,r12,lsl #1 ldrneh r12,[r9, r12] @ t = palmd[*pmd] moveq r12,r7 - cmp r3, lr, lsl #26 + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? mov lr, lr, lsl #1 ldrneh lr, [r9, lr] moveq lr, r7 @@ -283,7 +285,7 @@ Pico32xNativePal: strh lr, [r0], #2 .else streqh r7, [r0] - cmp r3, lr, lsl #26 + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? 
streqh r7, [r0, #2] add r0, r0, #4 .endif diff --git a/pico/32x/memory.c b/pico/32x/memory.c index 30d0e4d5..6a3b2222 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -398,9 +398,6 @@ static void p32x_reg_write8(u32 a, u32 d) p32x_sh2_poll_event(&sh2s[1], SH2_STATE_CPOLL, cycles); comreg = 1 << (a & 0x0f) / 2; Pico32x.comm_dirty |= comreg; - - if (cycles - (int)msh2.m68krcycles_done > 120) - p32x_sync_sh2s(cycles); return; } } @@ -453,6 +450,9 @@ static void p32x_reg_write16(u32 a, u32 d) int cycles = SekCyclesDone(); int comreg; + if (r[a / 2] == d) + return; + p32x_sync_sh2s(cycles); r[a / 2] = d; @@ -685,7 +685,7 @@ static void p32x_sh2reg_write8(u32 a, u32 d, SH2 *sh2) case 0x3f: return; pwm_write: - p32x_pwm_write16(a & ~1, d, sh2, 0); + p32x_pwm_write16(a & ~1, d, sh2, sh2_cycles_done_m68k(sh2)); return; } diff --git a/pico/32x/memory_arm.S b/pico/32x/memory_arm.S index 90c86ddf..1082c7b7 100644 --- a/pico/32x/memory_arm.S +++ b/pico/32x/memory_arm.S @@ -6,7 +6,7 @@ * See COPYING file in the top-level directory. */ -#include "../pico_int_o32.h" +#include "../pico_int_offs.h" @ 32X bank sizes... 
TODO this should somehow come from an include file .equ SH2_ROM_SHIFT, 10 @ 0x003fffff @@ -46,92 +46,92 @@ sh2_read8_rom: ldr ip, [r1, #OFS_SH2_p_rom] eor r0, r0, #1 - lsl r0, #SH2_ROM_SHIFT + mov r0, r0, lsl #SH2_ROM_SHIFT ldrb r0, [ip, r0, lsr #SH2_ROM_SHIFT] bx lr sh2_read8_sdram: ldr ip, [r1, #OFS_SH2_p_sdram] eor r0, r0, #1 - lsl r0, #SH2_RAM_SHIFT + mov r0, r0, lsl #SH2_RAM_SHIFT ldrb r0, [ip, r0, lsr #SH2_RAM_SHIFT] bx lr sh2_read8_da: ldr ip, [r1, #OFS_SH2_p_da] eor r0, r0, #1 - lsl r0, #SH2_DA_SHIFT + mov r0, r0, lsl #SH2_DA_SHIFT ldrb r0, [ip, r0, lsr #SH2_DA_SHIFT] bx lr sh2_read8_dram: ldr ip, [r1, #OFS_SH2_p_dram] eor r0, r0, #1 - lsl r0, #SH2_DRAM_SHIFT + mov r0, r0, lsl #SH2_DRAM_SHIFT ldrb r0, [ip, r0, lsr #SH2_DRAM_SHIFT] bx lr sh2_read16_rom: ldr ip, [r1, #OFS_SH2_p_rom] - lsl r0, #SH2_ROM_SHIFT - lsr r0, #SH2_ROM_SHIFT + mov r0, r0, lsl #SH2_ROM_SHIFT + mov r0, r0, lsr #SH2_ROM_SHIFT ldrh r0, [ip, r0] bx lr sh2_read16_sdram: ldr ip, [r1, #OFS_SH2_p_sdram] - lsl r0, #SH2_RAM_SHIFT - lsr r0, #SH2_RAM_SHIFT + mov r0, r0, lsl #SH2_RAM_SHIFT + mov r0, r0, lsr #SH2_RAM_SHIFT ldrh r0, [ip, r0] bx lr sh2_read16_da: ldr ip, [r1, #OFS_SH2_p_da] - lsl r0, #SH2_DA_SHIFT - lsr r0, #SH2_DA_SHIFT + mov r0, r0, lsl #SH2_DA_SHIFT + mov r0, r0, lsr #SH2_DA_SHIFT ldrh r0, [ip, r0] bx lr sh2_read16_dram: ldr ip, [r1, #OFS_SH2_p_dram] - lsl r0, #SH2_DRAM_SHIFT - lsr r0, #SH2_DRAM_SHIFT + mov r0, r0, lsl #SH2_DRAM_SHIFT + mov r0, r0, lsr #SH2_DRAM_SHIFT ldrh r0, [ip, r0] bx lr sh2_read32_rom: ldr ip, [r1, #OFS_SH2_p_rom] - lsl r0, #SH2_ROM_SHIFT + mov r0, r0, lsl #SH2_ROM_SHIFT ldr r0, [ip, r0, lsr #SH2_ROM_SHIFT] - ror r0, r0, #16 + mov r0, r0, ror #16 bx lr sh2_read32_sdram: ldr ip, [r1, #OFS_SH2_p_sdram] - lsl r0, #SH2_RAM_SHIFT + mov r0, r0, lsl #SH2_RAM_SHIFT ldr r0, [ip, r0, lsr #SH2_RAM_SHIFT] - ror r0, r0, #16 + mov r0, r0, ror #16 bx lr sh2_read32_da: ldr ip, [r1, #OFS_SH2_p_da] - lsl r0, #SH2_DA_SHIFT + mov r0, r0, lsl #SH2_DA_SHIFT ldr r0, [ip, r0, lsr 
#SH2_DA_SHIFT] - ror r0, r0, #16 + mov r0, r0, ror #16 bx lr sh2_read32_dram: ldr ip, [r1, #OFS_SH2_p_dram] - lsl r0, #SH2_DRAM_SHIFT + mov r0, r0, lsl #SH2_DRAM_SHIFT ldr r0, [ip, r0, lsr #SH2_DRAM_SHIFT] - ror r0, r0, #16 + mov r0, r0, ror #16 bx lr sh2_write8_sdram: @ preserve r0 and r2 for tail call ldr ip, [r2, #OFS_SH2_p_sdram] eor r3, r0, #1 - lsl r3, #SH2_RAM_SHIFT + mov r3, r3, lsl #SH2_RAM_SHIFT strb r1, [ip, r3, lsr #SH2_RAM_SHIFT] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_ram] @@ -148,7 +148,7 @@ sh2_write8_da: @ preserve r0 and r2 for tail call ldr ip, [r2, #OFS_SH2_p_da] eor r3, r0, #1 - lsl r3, #SH2_DA_SHIFT + mov r3, r3, lsl #SH2_DA_SHIFT strb r1, [ip, r3, lsr #SH2_DA_SHIFT] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_da] @@ -165,15 +165,15 @@ sh2_write8_dram: tst r1, #0xff ldrne ip, [r2, #OFS_SH2_p_dram] eorne r3, r0, #1 - lslne r3, #SH2_DRAM_SHIFT + movne r3, r3, lsl #SH2_DRAM_SHIFT strneb r1, [ip, r3, lsr #SH2_DRAM_SHIFT] bx lr sh2_write16_sdram: @ preserve r0 and r2 for tail call ldr ip, [r2, #OFS_SH2_p_sdram] - lsl r3, r0, #SH2_RAM_SHIFT - lsr r3, r3, #SH2_RAM_SHIFT + mov r3, r0, lsl #SH2_RAM_SHIFT + mov r3, r3, lsr #SH2_RAM_SHIFT strh r1, [ip, r3] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_ram] @@ -188,8 +188,8 @@ sh2_write16_sdram: sh2_write16_da: @ preserve r0 and r2 for tail call ldr ip, [r2, #OFS_SH2_p_da] - lsl r3, r0, #SH2_DA_SHIFT - lsr r3, r3, #SH2_DA_SHIFT + mov r3, r0, lsl #SH2_DA_SHIFT + mov r3, r3, lsr #SH2_DA_SHIFT strh r1, [ip, r3] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_da] @@ -204,23 +204,23 @@ sh2_write16_da: sh2_write16_dram: ldr ip, [r2, #OFS_SH2_p_dram] tst r0, #SH2_DRAM_OW - lsl r3, r0, #SH2_DRAM_SHIFT - lsr r3, r3, #SH2_DRAM_SHIFT + mov r3, r0, lsl #SH2_DRAM_SHIFT + mov r3, r3, lsr #SH2_DRAM_SHIFT streqh r1, [ip, r3] bxeq lr add ip, ip, r3 tst r1, #0xff strneb r1, [ip, #0] tst r1, #0xff00 - lsrne r1, r1, #8 + movne r1, r1, lsr #8 strneb r1, [ip, #1] bx lr sh2_write32_sdram: @ preserve r0 and r2 for tail 
call ldr ip, [r2, #OFS_SH2_p_sdram] - ror r1, r1, #16 - lsl r3, r0, #SH2_RAM_SHIFT + mov r1, r1, ror #16 + mov r3, r0, lsl #SH2_RAM_SHIFT str r1, [ip, r3, lsr #SH2_RAM_SHIFT] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_ram] @@ -242,8 +242,8 @@ sh2_write32_sdram: sh2_write32_da: @ preserve r0 and r2 for tail call ldr ip, [r2, #OFS_SH2_p_da] - ror r1, r1, #16 - lsl r3, r0, #SH2_DA_SHIFT + mov r1, r1, ror #16 + mov r3, r0, lsl #SH2_DA_SHIFT str r1, [ip, r3, lsr #SH2_DA_SHIFT] #ifdef DRC_SH2 ldr ip, [r2, #OFS_SH2_p_drcblk_da] @@ -265,13 +265,13 @@ sh2_write32_da: sh2_write32_dram: ldr ip, [r2, #OFS_SH2_p_dram] tst r0, #SH2_DRAM_OW - lsl r3, r0, #SH2_DRAM_SHIFT - roreq r1, r1, #16 + mov r3, r0, lsl #SH2_DRAM_SHIFT + moveq r1, r1, ror #16 streq r1, [ip, r3, lsr #SH2_DRAM_SHIFT] bxeq lr #if 1 ldr r0, [ip, r3, lsr #SH2_DRAM_SHIFT] - ror r1, r1, #16 + mov r1, r1, ror #16 mov r2, #0 tst r1, #0x00ff0000 orrne r2, r2, #0x00ff0000 diff --git a/pico/cd/gfx_dma.c b/pico/cd/gfx_dma.c index 7dfe4bc9..ff93a2dc 100644 --- a/pico/cd/gfx_dma.c +++ b/pico/cd/gfx_dma.c @@ -10,10 +10,6 @@ #include "cell_map.c" -#ifndef UTYPES_DEFINED -typedef unsigned short u16; -#endif - // check: Heart of the alien, jaguar xj 220 PICO_INTERNAL void DmaSlowCell(unsigned int source, unsigned int a, int len, unsigned char inc) { diff --git a/pico/cd/memory_arm.S b/pico/cd/memory_arm.S index 335f3624..04920b62 100644 --- a/pico/cd/memory_arm.S +++ b/pico/cd/memory_arm.S @@ -6,7 +6,7 @@ @* See COPYING file in the top-level directory. @* -#include "../pico_int_o32.h" +#include "../pico_int_offs.h" .equiv PCM_STEP_SHIFT, 11 diff --git a/pico/draw2_arm.S b/pico/draw2_arm.S index 6b110b32..6b094495 100644 --- a/pico/draw2_arm.S +++ b/pico/draw2_arm.S @@ -8,7 +8,7 @@ * this is highly specialized, be careful if changing related C code! 
*/ -#include "pico_int_o32.h" +#include "pico_int_offs.h" @ define these constants in your include file: @ .equiv START_ROW, 1 diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 3bc27033..2efc804c 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -8,7 +8,7 @@ * this is highly specialized, be careful if changing related C code! */ -#include "pico_int_o32.h" +#include "pico_int_offs.h" .extern DrawStripInterlace diff --git a/pico/memory.h b/pico/memory.h index eb440dd4..d55267ba 100644 --- a/pico/memory.h +++ b/pico/memory.h @@ -2,11 +2,6 @@ #include "pico_port.h" -typedef unsigned char u8; -typedef unsigned short u16; -typedef unsigned int u32; -typedef uintptr_t uptr; // unsigned pointer-sized int - #define M68K_MEM_SHIFT 16 // minimum size we can map #define M68K_BANK_SIZE (1 << M68K_MEM_SHIFT) diff --git a/pico/memory_amips.S b/pico/memory_amips.S index 7ae25922..7932c2c9 100644 --- a/pico/memory_amips.S +++ b/pico/memory_amips.S @@ -8,7 +8,7 @@ # OUT OF DATE -#include "pico_int_o32.h" +#include "pico_int_offs.h" .set noreorder .set noat diff --git a/pico/memory_arm.S b/pico/memory_arm.S index 117cea0b..07d6a128 100644 --- a/pico/memory_arm.S +++ b/pico/memory_arm.S @@ -6,7 +6,7 @@ * See COPYING file in the top-level directory. 
*/ -#include "pico_int_o32.h" +#include "pico_int_offs.h" .equ SRR_MAPPED, (1 << 0) .equ SRR_READONLY, (1 << 1) diff --git a/pico/pico_int.h b/pico/pico_int.h index 13338242..831bfc72 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -33,6 +33,14 @@ extern "C" { #endif +typedef unsigned char u8; +typedef signed char s8; +typedef unsigned short u16; +typedef signed short s16; +typedef unsigned int u32; +typedef signed int s32; +typedef uintptr_t uptr; // unsigned pointer-sized int + // ----------------------- 68000 CPU ----------------------- #ifdef EMU_C68K #include "../cpu/cyclone/Cyclone.h" @@ -427,7 +435,7 @@ struct PicoSound short psg_line; }; -// run tools/mkoffsets pico/pico_int_o32.h if you change these +// run tools/mkoffsets pico/pico_int_offs.h if you change these // careful with savestate compat struct Pico { @@ -905,13 +913,13 @@ void PicoFrame32x(void); void Pico32xStateLoaded(int is_early); void p32x_sync_sh2s(unsigned int m68k_target); void p32x_sync_other_sh2(SH2 *sh2, unsigned int m68k_target); -void p32x_update_irls(SH2 *active_sh2, int m68k_cycles); -void p32x_trigger_irq(SH2 *sh2, int m68k_cycles, unsigned int mask); -void p32x_update_cmd_irq(SH2 *sh2, int m68k_cycles); +void p32x_update_irls(SH2 *active_sh2, unsigned int m68k_cycles); +void p32x_trigger_irq(SH2 *sh2, unsigned int m68k_cycles, unsigned int mask); +void p32x_update_cmd_irq(SH2 *sh2, unsigned int m68k_cycles); void p32x_reset_sh2s(void); void p32x_event_schedule(unsigned int now, enum p32x_event event, int after); void p32x_event_schedule_sh2(SH2 *sh2, enum p32x_event event, int after); -void p32x_schedule_hint(SH2 *sh2, int m68k_cycles); +void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles); // 32x/memory.c extern struct Pico32xMem *Pico32xMem; diff --git a/platform/common/common.mak b/platform/common/common.mak index 2f676abc..b4a5759c 100644 --- a/platform/common/common.mak +++ b/platform/common/common.mak @@ -9,6 +9,7 @@ asm_render = 0 asm_ym2612 = 0 asm_misc = 0 
asm_cdmemory = 0 +asm_32xdraw = 0 asm_mix = 0 endif diff --git a/platform/common/memcpy.c b/platform/common/memcpy.c index b99de4ae..1cd74175 100644 --- a/platform/common/memcpy.c +++ b/platform/common/memcpy.c @@ -9,7 +9,7 @@ * to avoid under/overstepping the src region). * * ATTN does dirty aliasing tricks with undefined behaviour by standard. - * (however, this was needed to improve the generated code). + * (however, this improved the generated code). * ATTN uses struct assignment, which only works if the compiler is inlining * this (else it would probably call memcpy :-)). */ @@ -33,22 +33,24 @@ void *memcpy(void *dest, const void *src, size_t n) const int lm = sizeof(uint32_t)-1; /* align src to word */ - while (((unsigned)ss.c & lm) && n > 0) + while (((uintptr_t)ss.c & lm) && n > 0) *ds.c++ = *ss.c++, n--; - if (((unsigned)ds.c & lm) == 0) { + if (((uintptr_t)ds.c & lm) == 0) { /* fast copy if pointers have the same aligment */ - while (n >= sizeof(struct _16)) /* copy 16 bytes blocks */ + while (n >= sizeof(struct _16)) /* copy 16 byte blocks */ *ds.s++ = *ss.s++, n -= sizeof(struct _16); if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */ *ds.l++ = *ss.l++, n -= sizeof(uint64_t); +// if (n >= sizeof(uint32_t)) /* copy leftover 4 byte block */ +// *ds.i++ = *ss.i++, n -= sizeof(uint32_t); } else if (n >= 2*sizeof(uint32_t)) { /* unaligned data big enough to avoid overstepping src */ uint32_t v1, v2, b, s; /* align dest to word */ - while (((unsigned)ds.c & lm) && n > 0) + while (((uintptr_t)ds.c & lm) && n > 0) *ds.c++ = *ss.c++, n--; /* copy loop: load aligned words and store shifted words */ - b = (unsigned)ss.c & lm, s = b*8; ss.c -= b; + b = (uintptr_t)ss.c & lm, s = b*8; ss.c -= b; v1 = *ss.i++, v2 = *ss.i++; while (n >= 3*sizeof(uint32_t)) { *ds.i++ = (v1 _L_ s) | (v2 _U_ (32-s)); v1 = *ss.i++; @@ -78,28 +80,35 @@ void *memmove (void *dest, const void *src, size_t n) struct _16 { uint32_t a[4]; }; union { const void *v; uint8_t *c; uint32_t 
*i; uint64_t *l; struct _16 *s; } ss = { src+n }, ds = { dest+n }; + size_t pd = dest > src ? dest - src : src - dest; const int lm = sizeof(uint32_t)-1; if (dest <= src || dest >= src+n) return memcpy(dest, src, n); /* align src to word */ - while (((unsigned)ss.c & lm) && n > 0) + while (((uintptr_t)ss.c & lm) && n > 0) *--ds.c = *--ss.c, n--; - if (((unsigned)ds.c & lm) == 0) { + /* take care not to copy multi-byte data if it overlaps */ + if (((uintptr_t)ds.c & lm) == 0) { /* fast copy if pointers have the same aligment */ - while (n >= sizeof(struct _16)) /* copy 16 byte blocks */ + while (n >= sizeof(struct _16) && pd >= sizeof(struct _16)) + /* copy 16 byte blocks if no overlap */ *--ds.s = *--ss.s, n -= sizeof(struct _16); - if (n >= sizeof(uint64_t)) /* copy leftover 8 byte block */ + while (n >= sizeof(uint64_t) && pd >= sizeof(uint64_t)) + /* copy leftover 8 byte blocks if no overlap */ *--ds.l = *--ss.l, n -= sizeof(uint64_t); - } else if (n >= 2*sizeof(uint32_t)) { + while (n >= sizeof(uint32_t) && pd >= sizeof(uint32_t)) + /* copy leftover 4 byte blocks if no overlap */ *--ds.i = *--ss.i, n -= sizeof(uint32_t); + } else if (n >= 2*sizeof(uint32_t) && pd >= 2*sizeof(uint32_t)) { /* unaligned data big enough to avoid understepping src */ uint32_t v1, v2, b, s; /* align dest to word */ - while (((unsigned)ds.c & lm) && n > 0) + while (((uintptr_t)ds.c & lm) && n > 0) *--ds.c = *--ss.c, n--; /* copy loop: load aligned words and store shifted words */ - b = (unsigned)ss.c & lm, s = b*8; ss.c += b; + b = (uintptr_t)ss.c & lm, s = b*8; ss.c += b; v1 = *--ss.i, v2 = *--ss.i; while (n >= 3*sizeof(uint32_t)) { *--ds.i = (v1 _U_ s) | (v2 _L_ (32-s)); v1 = *--ss.i; @@ -114,7 +123,7 @@ void *memmove (void *dest, const void *src, size_t n) } ss.c -= b - 2*sizeof(uint32_t); } - /* copy 0-7 leftover bytes */ + /* copy 0-7 leftover bytes (or up to everything if ptrs are too close) */ while (n >= 4) { *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c, n--; *--ds.c = *--ss.c,
n--; *--ds.c = *--ss.c, n--; diff --git a/platform/gp2x/code940/memcpy.s b/platform/gp2x/code940/memcpy.s index 282762fd..1350639a 100644 --- a/platform/gp2x/code940/memcpy.s +++ b/platform/gp2x/code940/memcpy.s @@ -114,14 +114,12 @@ subs r2, r2, #0x14 blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */ stmdb sp!, {r4, r7, r8, r9, r10} /* borrow r4 */ -/* blat 64 bytes at a time */ +/* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ Lmemcpy_floop32: ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr} stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr} -stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -subs r2, r2, #0x40 +subs r2, r2, #0x20 bge Lmemcpy_floop32 cmn r2, #0x10 @@ -314,14 +312,12 @@ stmdb sp!, {r4, r7, r8, r9, r10, lr} subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */ blt Lmemcpy_bl32 -/* blat 64 bytes at a time */ +/* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ Lmemcpy_bloop32: ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr} stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr} -stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr} -subs r2, r2, #0x40 +subs r2, r2, #0x20 bge Lmemcpy_bloop32 Lmemcpy_bl32: diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 13e55495..6d68a1bc 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -49,11 +49,8 @@ get_define () # prefix struct member member... echo "const int one = 1;" >/tmp/getoffs.c compile_rodata ENDIAN=$(if [ "$rodata" -eq 1 ]; then echo be; else echo le; fi) -# determine output file -echo "const int vsz = sizeof(void *);" >/tmp/getoffs.c -compile_rodata -fn="${1:-.}/pico_int_o$((8*$rodata)).h" # output header +fn="${1:-.}/pico_int_offs.h" echo "/* autogenerated by mkoffset.sh, do not edit */" >$fn echo "/* target endianess: $ENDIAN, compiled with: $CC $CFLAGS */" >>$fn # output offsets -- 2.39.2