From: kub Date: Tue, 28 May 2019 21:16:45 +0000 (+0200) Subject: 32x DMA memory copy performance optimisation X-Git-Tag: v2.00~849 X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=346153e08ed482c2b0694541b582f8674a2bf8af;p=picodrive.git 32x DMA memory copy performance optimisation --- diff --git a/cpu/sh2/compiler.c b/cpu/sh2/compiler.c index f2a1f95b..2a147a15 100644 --- a/cpu/sh2/compiler.c +++ b/cpu/sh2/compiler.c @@ -2261,7 +2261,7 @@ static int emit_get_rom_data(SH2 *sh2, sh2_reg_e r, u32 offs, int size, u32 *val if (gconst_get(r, &a)) { a += offs; // check if rom is memory mapped (not bank switched), and address is in rom - if (dr_is_rom(a) && p32x_sh2_get_mem_ptr(a, &mask, sh2)) { + if (dr_is_rom(a) && p32x_sh2_get_mem_ptr(a, &mask, sh2) != (void *)-1) { switch (size & MF_SIZEMASK) { case 0: *val = (s8)p32x_sh2_read8(a, sh2s); break; // 8 case 1: *val = (s16)p32x_sh2_read16(a, sh2s); break; // 16 @@ -4896,12 +4896,7 @@ void sh2_drc_flush_all(void) void sh2_drc_mem_setup(SH2 *sh2) { - // fill the convenience pointers - sh2->p_bios = sh2->is_slave ? Pico32xMem->sh2_rom_s.w : Pico32xMem->sh2_rom_m.w; - sh2->p_da = sh2->data_array; - sh2->p_sdram = Pico32xMem->sdram; - sh2->p_rom = Pico.rom; - // sh2->p_dram filled in dram bank switching + // fill the DRC-only convenience pointers sh2->p_drcblk_da = Pico32xMem->drcblk_da[!!sh2->is_slave]; sh2->p_drcblk_ram = Pico32xMem->drcblk_ram; } diff --git a/pico/32x/memory.c b/pico/32x/memory.c index a1ef42c2..70287a2c 100644 --- a/pico/32x/memory.c +++ b/pico/32x/memory.c @@ -1855,17 +1855,15 @@ void *p32x_sh2_get_mem_ptr(u32 a, u32 *mask, SH2 *sh2) { const sh2_memmap *mm = sh2->read8_map; void *ret = (void *)-1; - u32 am; - mm += a >> SH2_READ_SHIFT; - am = a & ((1 << SH2_READ_SHIFT)-1); - if (!map_flag_set(mm->addr) && !(am & ~mm->mask)) { + mm += SH2MAP_ADDR2OFFS_R(a); + if (!map_flag_set(mm->addr)) { // directly mapped memory (SDRAM, ROM, data array) ret = (void *)(mm->addr << 1); *mask = mm->mask; } else if ((a & ~0x7ff) == 0) { // BIOS, has handler function since it shares its segment with I/O - ret = sh2->is_slave ? Pico32xMem->sh2_rom_s.w : Pico32xMem->sh2_rom_m.w; + ret = sh2->p_bios; *mask = 0x7ff; } else if ((a & 0xc6000000) == 0x02000000) { // banked ROM. Return bank address @@ -1877,6 +1875,75 @@ void *p32x_sh2_get_mem_ptr(u32 a, u32 *mask, SH2 *sh2) return ret; } +int p32x_sh2_memcpy(u32 dst, u32 src, int count, int size, SH2 *sh2) +{ + u32 mask; + void *ps, *pd; + int len, i; + + // check if src and dst points to memory (rom/sdram/dram/da) + if ((pd = p32x_sh2_get_mem_ptr(dst, &mask, sh2)) == (void *)-1) + return 0; + if ((ps = p32x_sh2_get_mem_ptr(src, &mask, sh2)) == (void *)-1) + return 0; + ps += src & mask; + len = count * size; + + // DRAM in byte access is always in overwrite mode + if (pd == sh2->p_dram && size == 1) + dst |= 0x20000; + + // align dst to halfword + if (dst & 1) { + p32x_sh2_write8(dst, *(u8 *)((uptr)ps ^ 1), sh2); + ps++, dst++, len --; + } + + // copy data + if ((uptr)ps & 1) { + // unaligned, use halfword copy mode to reduce memory bandwidth + u16 *sp = (u16 *)(ps - 1); + u16 dl, dh = *sp++; + for (i = 0; i < (len & ~1); i += 2, dst += 2, sp++) { + dl = dh, dh = *sp; + p32x_sh2_write16(dst, (dh >> 8) | (dl << 8), sh2); + } + if (len & 1) + p32x_sh2_write8(dst, dh, sh2); + } else { + // dst and src at least halfword aligned + u16 *sp = (u16 *)ps; + // align dst to word + if ((dst & 2) && len >= 2) { + p32x_sh2_write16(dst, *sp++, sh2); + dst += 2, len -= 2; + } + if ((uptr)sp & 2) { + // halfword copy, using word writes to reduce memory bandwidth + u16 dl, dh; + for (i = 0; i < (len & ~3); i += 4, dst += 4, sp += 2) { + dl = sp[0], dh = sp[1]; + p32x_sh2_write32(dst, (dl << 16) | dh, sh2); + } + } else { + // word copy + u32 d; + for (i = 0; i < (len & ~3); i += 4, dst += 4, sp += 2) { + d = *(u32 *)sp; + p32x_sh2_write32(dst, (d << 16) | (d >> 16), sh2); + } + } + if (len & 2) { + p32x_sh2_write16(dst, *sp++, sh2); + dst += 2; + } + if (len & 1) + p32x_sh2_write8(dst, *sp >> 8, sh2); + } + + return count; +} + // ----------------------------------------------------------------- static void z80_md_bank_write_32x(unsigned int a, unsigned char d) @@ -2107,8 +2174,12 @@ void Pico32xSwapDRAM(int b) ssh2_read16_map[0x04/2].addr = ssh2_read16_map[0x24/2].addr = ssh2_read32_map[0x04/2].addr = ssh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]); - msh2.p_dram = ssh2.p_dram = Pico32xMem->dram[b]; // DRC conveniance ptr - msh2.p_rom = ssh2.p_rom = Pico.rom; + // convenience ptrs + msh2.p_sdram = ssh2.p_sdram = Pico32xMem->sdram; + msh2.p_dram = ssh2.p_dram = Pico32xMem->dram[b]; + msh2.p_rom = ssh2.p_rom = Pico.rom; + msh2.p_bios = Pico32xMem->sh2_rom_m.w; msh2.p_da = msh2.data_array; + ssh2.p_bios = Pico32xMem->sh2_rom_s.w; ssh2.p_da = ssh2.data_array; } static void bank_switch_rom_sh2(void) diff --git a/pico/32x/sh2soc.c b/pico/32x/sh2soc.c index dd61a93b..66bdc478 100644 --- a/pico/32x/sh2soc.c +++ b/pico/32x/sh2soc.c @@ -129,6 +129,24 @@ static void dmac_transfer_one(SH2 *sh2, struct dma_chan *chan) chan->sar += size; } +// optimization for copying around memory with SH2 DMA +static void dmac_memcpy(struct dma_chan *chan, SH2 *sh2) +{ + u32 size = (chan->chcr >> 10) & 3, up = chan->chcr & (1 << 14); + int count; + + if (!up || chan->tcr < 4) + return; + if (size == 3) size = 2; // 4-word xfer mode still counts in words + // XXX check TCR being a multiple of 4 in 4-word xfer mode? + // XXX check alignment of sar/dar, generating a bus error if unaligned? + count = p32x_sh2_memcpy(chan->dar, chan->sar, chan->tcr, 1 << size, sh2); + + chan->sar += count << size; + chan->dar += count << size; + chan->tcr -= count; +} + // DMA trigger by SH2 register write static void dmac_trigger(SH2 *sh2, struct dma_chan *chan) { @@ -139,6 +157,11 @@ static void dmac_trigger(SH2 *sh2, struct dma_chan *chan) if (chan->chcr & DMA_AR) { // auto-request transfer sh2->state |= SH2_STATE_SLEEP; + if ((((chan->chcr >> 12) ^ (chan->chcr >> 14)) & 3) == 0 && + (((chan->chcr >> 14) ^ (chan->chcr >> 15)) & 1) == 1) { + // SM == DM and either DM0 or DM1 are set. check for mem to mem copy + dmac_memcpy(chan, sh2); + } while ((int)chan->tcr > 0) dmac_transfer_one(sh2, chan); dmac_transfer_complete(sh2, chan); diff --git a/pico/pico_int.h b/pico/pico_int.h index 31fc702c..36b36144 100644 --- a/pico/pico_int.h +++ b/pico/pico_int.h @@ -937,6 +937,7 @@ unsigned int REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, unsigned int d, S unsigned int REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, unsigned int d, SH2 *sh2); void *p32x_sh2_get_mem_ptr(unsigned int a, unsigned int *mask, SH2 *sh2); void p32x_sh2_poll_event(SH2 *sh2, unsigned int flags, unsigned int m68k_cycles); +int p32x_sh2_memcpy(unsigned int dst, unsigned int src, int count, int size, SH2 *sh2); // 32x/draw.c void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode); diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 461fbfa7..a573f7a4 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -89,7 +89,7 @@ get_define OFS_PMEM32x_ Pico32xMem pal_native ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ is_slave ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_bios ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_da ; echo "$line" >>$fn -get_define OFS_SH2_ SH2_ p_sdram ; echo "$line" >>$fn +get_define OFS_SH2_ SH2_ p_sdram ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_rom ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_dram ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_drcblk_da ; echo "$line" >>$fn