From de4a0279efefdd2e4595c8fc27f1564f4bff9341 Mon Sep 17 00:00:00 2001 From: notaz Date: Mon, 5 Jan 2015 04:27:03 +0200 Subject: [PATCH] spu: finish offload code to TI C64x DSP rather disappointing result so far, though :( --- plugins/dfsound/Makefile.c64p | 15 ++++ plugins/dfsound/externals.h | 7 +- plugins/dfsound/freeze.c | 10 +-- plugins/dfsound/registers.c | 3 +- plugins/dfsound/spu.c | 80 +++++++++++++-------- plugins/dfsound/spu_c64x.c | 89 +++++++++++++++++++++++- plugins/dfsound/spu_c64x.h | 16 +++++ plugins/dfsound/spu_c64x_dspcode.c | 108 +++++++++++++++++++++++++++++ plugins/dfsound/xa.c | 4 +- 9 files changed, 292 insertions(+), 40 deletions(-) create mode 100644 plugins/dfsound/Makefile.c64p create mode 100644 plugins/dfsound/spu_c64x_dspcode.c diff --git a/plugins/dfsound/Makefile.c64p b/plugins/dfsound/Makefile.c64p new file mode 100644 index 00000000..45fe76a4 --- /dev/null +++ b/plugins/dfsound/Makefile.c64p @@ -0,0 +1,15 @@ +ifndef C64_TOOLS_DSP_ROOT +$(error need C64_TOOLS_DSP_ROOT) +endif + +include $(C64_TOOLS_DSP_ROOT)/install.mk + +TARGET_BASENAME = pcsxr_spu +OPTFLAGS += -O2 +CFLAGS += -DNO_OS -DWANT_THREAD_CODE + +OBJ = \ + spu_c64x_dspcode.o64 + +include $(C64_TOOLS_DSP_ROOT)/build_area3.mk +include $(C64_TOOLS_DSP_ROOT)/build.mk diff --git a/plugins/dfsound/externals.h b/plugins/dfsound/externals.h index 3047afc8..d3bcbc6b 100644 --- a/plugins/dfsound/externals.h +++ b/plugins/dfsound/externals.h @@ -110,8 +110,6 @@ typedef struct int iRightVolume; // right volume ADSRInfoEx ADSRX; int iRawPitch; // raw pitch (0...3fff) - - int SB[32+4]; } SPUCHAN; /////////////////////////////////////////////////////////// @@ -175,6 +173,8 @@ typedef struct // psx buffers / addresses +#define SB_SIZE (32 + 4) + typedef struct { unsigned short spuCtrl; @@ -226,8 +226,9 @@ typedef struct int iRightXAVol; SPUCHAN * s_chan; + int * SB; - int pad[31]; + int pad[30]; unsigned short regArea[0x400]; } SPUInfo; diff --git a/plugins/dfsound/freeze.c b/plugins/dfsound/freeze.c index 83a7d522..5b4267bf 100644 --- a/plugins/dfsound/freeze.c +++ b/plugins/dfsound/freeze.c @@ -143,7 +143,7 @@ static void save_channel(SPUCHAN_orig *d, const SPUCHAN *s, int ch) d->iSBPos = s->iSBPos; d->spos = s->spos; d->sinc = s->sinc; - memcpy(d->SB, s->SB, sizeof(d->SB)); + memcpy(d->SB, spu.SB + ch * SB_SIZE, sizeof(d->SB[0]) * SB_SIZE); d->iStart = (regAreaGet(ch,6)&~1)<<3; d->iCurr = 0; // set by the caller d->iLoop = 0; // set by the caller @@ -157,8 +157,8 @@ static void save_channel(SPUCHAN_orig *d, const SPUCHAN *s, int ch) d->bIgnoreLoop = (s->prevflags ^ 2) << 1; d->iRightVolume = s->iRightVolume; d->iRawPitch = s->iRawPitch; - d->s_1 = s->SB[27]; // yes it's reversed - d->s_2 = s->SB[26]; + d->s_1 = spu.SB[ch * SB_SIZE + 27]; // yes it's reversed + d->s_2 = spu.SB[ch * SB_SIZE + 26]; d->bRVBActive = s->bRVBActive; d->bNoise = s->bNoise; d->bFMod = s->bFMod; @@ -185,7 +185,7 @@ static void load_channel(SPUCHAN *d, const SPUCHAN_orig *s, int ch) d->spos = s->spos; d->sinc = s->sinc; d->sinc_inv = 0; - memcpy(d->SB, s->SB, sizeof(d->SB)); + memcpy(spu.SB + ch * SB_SIZE, s->SB, sizeof(spu.SB[0]) * SB_SIZE); d->pCurr = (void *)((long)s->iCurr & 0x7fff0); d->pLoop = (void *)((long)s->iLoop & 0x7fff0); d->bReverb = s->bReverb; @@ -302,7 +302,7 @@ long CALLBACK SPUfreeze(uint32_t ulFreezeMode, SPUFreeze_t * pF, load_register(H_CDRight, cycles); // fix to prevent new interpolations from crashing - for(i=0;i freq change in simple interpolation mode: set flag + if (spu_config.iUseInterpolation == 1) + spu.SB[ch * SB_SIZE + 32] = 1; // -> freq change in simple interpolation mode: set flag } //////////////////////////////////////////////////////////////////////// diff --git a/plugins/dfsound/spu.c b/plugins/dfsound/spu.c index 96ef69c5..c6a06fd4 100644 --- a/plugins/dfsound/spu.c +++ b/plugins/dfsound/spu.c @@ -18,7 +18,7 @@ * * ***************************************************************************/ -#ifndef _WIN32 +#if !defined(_WIN32) && !defined(NO_OS) #include // gettimeofday in xa.c #define THREAD_ENABLED 1 #endif @@ -29,9 +29,12 @@ #include "externals.h" #include "registers.h" #include "out.h" -#include "arm_features.h" #include "spu_config.h" +#ifdef __arm__ +#include "arm_features.h" +#endif + #ifdef __ARM_ARCH_7A__ #define ssat32_to_16(v) \ asm("ssat %0,#16,%1" : "=r" (v) : "r" (v)) @@ -74,19 +77,18 @@ SPUConfig spu_config; REVERBInfo rvb; -#ifdef THREAD_ENABLED +#if defined(THREAD_ENABLED) || defined(WANT_THREAD_CODE) // worker thread state static struct spu_worker { unsigned int pending:1; unsigned int exit_thread:1; + unsigned int stale_cache:1; int ns_to; int ctrl; int decode_pos; int silentch; unsigned int chmask; - unsigned int r_chan_end; - unsigned int r_decode_dirty; struct { int spos; int sbpos; @@ -97,6 +99,14 @@ static struct spu_worker { ADSRInfoEx adsr; // might want to add vol and fmod flags.. } ch[24]; + struct { + struct { + int adsrState; + int adsrEnvelopeVol; + } ch[24]; + unsigned int chan_end; + unsigned int decode_dirty; + } r; } *worker; #else @@ -260,20 +270,21 @@ static int check_irq(int ch, unsigned char *pos) INLINE void StartSound(int ch) { SPUCHAN *s_chan = &spu.s_chan[ch]; + int *SB = spu.SB + ch * SB_SIZE; StartADSR(ch); StartREVERB(ch); s_chan->prevflags=2; - s_chan->SB[26]=0; // init mixing vars - s_chan->SB[27]=0; s_chan->iSBPos=27; + SB[26]=0; // init mixing vars + SB[27]=0; - s_chan->SB[28]=0; - s_chan->SB[29]=0; // init our interpolation helpers - s_chan->SB[30]=0; - s_chan->SB[31]=0; + SB[28]=0; + SB[29]=0; // init our interpolation helpers + SB[30]=0; + SB[31]=0; s_chan->spos=0; spu.dwNewChannel&=~(1<SB; + SB = spu.SB + ch * SB_SIZE; sinc = s_chan->sinc; if (s_chan->bNoise) @@ -867,10 +878,11 @@ static void do_samples_finish(int ns_to, int silentch, int decode_pos); // optional worker thread handling -#ifdef THREAD_ENABLED +#if defined(THREAD_ENABLED) || defined(WANT_THREAD_CODE) static void thread_work_start(void); static void thread_work_wait_sync(void); +static void thread_sync_caches(void); static void queue_channel_work(int ns_to, int silentch) { @@ -927,7 +939,7 @@ static void do_channel_work(void) sinc = worker->ch[ch].sinc; s_chan = &spu.s_chan[ch]; - SB = s_chan->SB; + SB = spu.SB + ch * SB_SIZE; if (s_chan->bNoise) do_lsfr_samples(d, worker->ctrl, &spu.dwNoiseCount, &spu.dwNoiseVal); @@ -945,6 +957,8 @@ static void do_channel_work(void) worker->ch[ch].adsr.EnvelopeVol = 0; memset(&ChanBuf[d], 0, (ns_to - d) * sizeof(ChanBuf[0])); } + worker->r.ch[ch].adsrState = worker->ch[ch].adsr.State; + worker->r.ch[ch].adsrEnvelopeVol = worker->ch[ch].adsr.EnvelopeVol; if (ch == 1 || ch == 3) { @@ -960,15 +974,17 @@ static void do_channel_work(void) mix_chan(0, ns_to, s_chan->iLeftVolume, s_chan->iRightVolume); } - worker->r_chan_end = endmask; - worker->r_decode_dirty = decode_dirty_ch; + worker->r.chan_end = endmask; + worker->r.decode_dirty = decode_dirty_ch; } -static void sync_worker_thread(void) +static void sync_worker_thread(int do_direct) { unsigned int mask; int ch; + if (do_direct) + thread_sync_caches(); if (!worker->pending) return; @@ -981,12 +997,12 @@ static void sync_worker_thread(void) // be sure there was no keyoff while thread was working if (spu.s_chan[ch].ADSRX.State != ADSR_RELEASE) - spu.s_chan[ch].ADSRX.State = worker->ch[ch].adsr.State; - spu.s_chan[ch].ADSRX.EnvelopeVol = worker->ch[ch].adsr.EnvelopeVol; + spu.s_chan[ch].ADSRX.State = worker->r.ch[ch].adsrState; + spu.s_chan[ch].ADSRX.EnvelopeVol = worker->r.ch[ch].adsrEnvelopeVol; } - spu.dwChannelOn &= ~worker->r_chan_end; - spu.decode_dirty_ch |= worker->r_decode_dirty; + spu.dwChannelOn &= ~worker->r.chan_end; + spu.decode_dirty_ch |= worker->r.decode_dirty; do_samples_finish(worker->ns_to, worker->silentch, worker->decode_pos); @@ -995,7 +1011,7 @@ static void sync_worker_thread(void) #else static void queue_channel_work(int ns_to, int silentch) {} -static void sync_worker_thread(void) {} +static void sync_worker_thread(int do_direct) {} #endif // THREAD_ENABLED @@ -1004,7 +1020,7 @@ static void sync_worker_thread(void) {} // here is the main job handler... //////////////////////////////////////////////////////////////////////// -void do_samples(unsigned int cycles_to, int do_sync) +void do_samples(unsigned int cycles_to, int do_direct) { unsigned int mask; int ch, ns_to; @@ -1019,6 +1035,10 @@ void do_samples(unsigned int cycles_to, int do_sync) return; } + do_direct |= (cycle_diff < 64 * 768); + if (worker != NULL) + sync_worker_thread(do_direct); + if (cycle_diff < 2 * 768) return; @@ -1058,9 +1078,6 @@ void do_samples(unsigned int cycles_to, int do_sync) } } - if (worker != NULL) - sync_worker_thread(); - mask = spu.dwNewChannel & 0xffffff; for (ch = 0; mask != 0; ch++, mask >>= 1) { if (mask & 1) @@ -1074,7 +1091,7 @@ void do_samples(unsigned int cycles_to, int do_sync) do_samples_finish(ns_to, silentch, spu.decode_pos); } else { - if (do_sync || worker == NULL || !spu_config.iUseThread) { + if (do_direct || worker == NULL || !spu_config.iUseThread) { do_channels(ns_to); do_samples_finish(ns_to, silentch, spu.decode_pos); } @@ -1320,6 +1337,10 @@ static void thread_work_wait_sync(void) sem_wait(&t.sem_done); } +static void thread_sync_caches(void) +{ +} + static void *spu_worker_thread(void *unused) { while (1) { @@ -1400,6 +1421,7 @@ long CALLBACK SPUinit(void) InitADSR(); spu.s_chan = calloc(MAXCHAN+1, sizeof(spu.s_chan[0])); // channel + 1 infos (1 is security for fmod handling) + spu.SB = calloc(MAXCHAN, sizeof(spu.SB[0]) * SB_SIZE); spu.spuAddr = 0; spu.decode_pos = 0; @@ -1448,6 +1470,8 @@ long CALLBACK SPUshutdown(void) free(spu.spuMemC); spu.spuMemC = NULL; + free(spu.SB); + spu.SB = NULL; free(spu.s_chan); spu.s_chan = NULL; diff --git a/plugins/dfsound/spu_c64x.c b/plugins/dfsound/spu_c64x.c index 86b21497..d829d297 100644 --- a/plugins/dfsound/spu_c64x.c +++ b/plugins/dfsound/spu_c64x.c @@ -22,10 +22,13 @@ */ #include +#include + #include #include "spu_c64x.h" static dsp_mem_region_t region; +static dsp_component_id_t compid; static struct { void *handle; @@ -37,20 +40,57 @@ static struct { int (*dsp_cache_inv_virt)(void *_virtAddr, sU32 _size); int (*dsp_rpc_send)(const dsp_msg_t *_msgTo); int (*dsp_rpc_recv)(dsp_msg_t *_msgFrom); + int (*dsp_rpc)(const dsp_msg_t *_msgTo, dsp_msg_t *_msgFrom); void (*dsp_logbuf_print)(void); } f; static void thread_work_start(void) { - do_channel_work(); + dsp_msg_t msg; + int ret; + + DSP_MSG_INIT(&msg, compid, CCMD_DOIT, 0, 0); + ret = f.dsp_rpc_send(&msg); + if (ret != 0) { + fprintf(stderr, "dsp_rpc_send failed: %d\n", ret); + f.dsp_logbuf_print(); + // maybe stop using the DSP? + } } static void thread_work_wait_sync(void) { + dsp_msg_t msg; + int ns_to; + int ret; + + ns_to = worker->ns_to; + f.dsp_cache_inv_virt(spu.sRVBStart, sizeof(spu.sRVBStart[0]) * 2 * ns_to); + f.dsp_cache_inv_virt(SSumLR, sizeof(SSumLR[0]) * 2 * ns_to); + f.dsp_cache_inv_virt(&worker->r, sizeof(worker->r)); + worker->stale_cache = 1; // SB, ram + + ret = f.dsp_rpc_recv(&msg); + if (ret != 0) { + fprintf(stderr, "dsp_rpc_recv failed: %d\n", ret); + f.dsp_logbuf_print(); + } + //f.dsp_logbuf_print(); +} + +// called before ARM decides to do SPU mixing itself +static void thread_sync_caches(void) +{ + if (worker->stale_cache) { + f.dsp_cache_inv_virt(spu.SB, sizeof(spu.SB[0]) * SB_SIZE * 24); + f.dsp_cache_inv_virt(spu.spuMemC + 0x800, 0x800); + worker->stale_cache = 0; + } } static void init_spu_thread(void) { + dsp_msg_t init_msg, msg_in; struct region_mem *mem; int ret; @@ -73,6 +113,7 @@ static void init_spu_thread(void) LDS(dsp_component_load); LDS(dsp_rpc_send); LDS(dsp_rpc_recv); + LDS(dsp_rpc); LDS(dsp_logbuf_print); #undef LDS if (failed) { @@ -89,6 +130,12 @@ static void init_spu_thread(void) return; } + ret = f.dsp_component_load(NULL, COMPONENT_NAME, &compid); + if (ret != 0) { + fprintf(stderr, "dsp_component_load failed: %d\n", ret); + goto fail_cload; + } + region = f.dsp_shm_alloc(DSP_CACHE_R, sizeof(*mem)); // writethrough if (region.size < sizeof(*mem) || region.virt_addr == 0) { fprintf(stderr, "dsp_shm_alloc failed\n"); @@ -96,6 +143,31 @@ static void init_spu_thread(void) } mem = (void *)region.virt_addr; + memcpy(&mem->spu_config, &spu_config, sizeof(mem->spu_config)); + + DSP_MSG_INIT(&init_msg, compid, CCMD_INIT, region.phys_addr, 0); + ret = f.dsp_rpc(&init_msg, &msg_in); + if (ret != 0) { + fprintf(stderr, "dsp_rpc failed: %d\n", ret); + goto fail_init; + } + + if (mem->sizeof_region_mem != sizeof(*mem)) { + fprintf(stderr, "error: size mismatch 1: %d vs %zd\n", + mem->sizeof_region_mem, sizeof(*mem)); + goto fail_init; + } + if (mem->offsetof_s_chan1 != offsetof(typeof(*mem), s_chan[1])) { + fprintf(stderr, "error: size mismatch 2: %d vs %zd\n", + mem->offsetof_s_chan1, offsetof(typeof(*mem), s_chan[1])); + goto fail_init; + } + if (mem->offsetof_worker_ram != offsetof(typeof(*mem), worker.ch[1])) { + fprintf(stderr, "error: size mismatch 3: %d vs %zd\n", + mem->offsetof_worker_ram, offsetof(typeof(*mem), worker.ch[1])); + goto fail_init; + } + // override default allocations free(spu.spuMemC); spu.spuMemC = mem->spu_ram; @@ -103,14 +175,26 @@ static void init_spu_thread(void) spu.sRVBStart = mem->RVB; free(SSumLR); SSumLR = mem->SSumLR; + free(spu.SB); + spu.SB = mem->SB; free(spu.s_chan); spu.s_chan = mem->s_chan; worker = &mem->worker; - printf("C64x DSP ready.\n"); + printf("spu: C64x DSP ready (id=%d).\n", (int)compid); + f.dsp_logbuf_print(); + +pcnt_init(); + (void)do_channel_work; // used by DSP instead return; +fail_init: + f.dsp_shm_free(region); fail_mem: + // no component unload func? +fail_cload: + printf("spu: C64x DSP init failed.\n"); + f.dsp_logbuf_print(); f.dsp_close(); worker = NULL; } @@ -128,6 +212,7 @@ static void exit_spu_thread(void) spu.spuMemC = NULL; spu.sRVBStart = NULL; SSumLR = NULL; + spu.SB = NULL; spu.s_chan = NULL; worker = NULL; } diff --git a/plugins/dfsound/spu_c64x.h b/plugins/dfsound/spu_c64x.h index ba2a4c37..d4e73e97 100644 --- a/plugins/dfsound/spu_c64x.h +++ b/plugins/dfsound/spu_c64x.h @@ -1,9 +1,25 @@ +#define COMPONENT_NAME "pcsxr_spu" + +enum { + CCMD_INIT = 0x101, + CCMD_DOIT = 0x102, +}; struct region_mem { unsigned char spu_ram[512 * 1024]; int RVB[NSSIZE * 2]; int SSumLR[NSSIZE * 2]; + int SB[SB_SIZE * 24]; + // careful not to lose ARM writes by DSP overwriting + // with old data when it's writing out neighbor cachelines + int _pad1[128/4 - ((NSSIZE * 4 + SB_SIZE * 24) & (128/4 - 1))]; SPUCHAN s_chan[24 + 1]; + int _pad2[128/4 - ((sizeof(SPUCHAN) * 25 / 4) & (128/4 - 1))]; struct spu_worker worker; + SPUConfig spu_config; + // init/debug + int sizeof_region_mem; + int offsetof_s_chan1; + int offsetof_worker_ram; }; diff --git a/plugins/dfsound/spu_c64x_dspcode.c b/plugins/dfsound/spu_c64x_dspcode.c new file mode 100644 index 00000000..117a2966 --- /dev/null +++ b/plugins/dfsound/spu_c64x_dspcode.c @@ -0,0 +1,108 @@ +/* + * SPU processing offload to TI C64x DSP using bsp's c64_tools + * (C) Gražvydas "notaz" Ignotas, 2015 + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define SYSCALLS_C +#include +#include + +#include "spu.c" +#include "spu_c64x.h" + +/* dummy deps, some bloat but avoids ifdef hell in SPU code.. */ +static void thread_work_start(void) {} +static void thread_work_wait_sync(void) {} +static void thread_sync_caches(void) {} +struct out_driver *out_current; +void SetupSound(void) {} + +#if 0 +// no use, c64_tools does BCACHE_wbInvAll.. +static void sync_caches(void) +{ + int ns_to = worker->ns_to; + + syscalls.cache_wb(spu.sRVBStart, sizeof(spu.sRVBStart[0]) * 2 * ns_to, 1); + syscalls.cache_wb(SSumLR, sizeof(SSumLR[0]) * 2 * ns_to, 1); + + syscalls.cache_wbInv(worker, sizeof(*worker), 1); +} +#endif + +static unsigned int exec(dsp_component_cmd_t cmd, + unsigned int arg1, unsigned int arg2, + unsigned int *ret1, unsigned int *ret2) +{ + struct region_mem *mem = (void *)arg1; + int i; + + switch (cmd) { + case CCMD_INIT: + InitADSR(); + + spu.spuMemC = mem->spu_ram; + spu.sRVBStart = mem->RVB; + SSumLR = mem->SSumLR; + spu.SB = mem->SB; + spu.s_chan = mem->s_chan; + worker = &mem->worker; + memcpy(&spu_config, &mem->spu_config, sizeof(spu_config)); + + mem->sizeof_region_mem = sizeof(*mem); + mem->offsetof_s_chan1 = offsetof(typeof(*mem), s_chan[1]); + mem->offsetof_worker_ram = offsetof(typeof(*mem), worker.ch[1]); + // seems to be unneeded, no write-alloc? but just in case.. + syscalls.cache_wb(&mem->sizeof_region_mem, 3 * 4, 1); + break; + + case CCMD_DOIT: + do_channel_work(); + // c64_tools lib does BCACHE_wbInvAll() when it receives mailbox irq, + // so there is no benefit of syncing only what's needed. + // But call wbInvAll() anyway in case c64_tools is ever fixed.. + //sync_caches(); + syscalls.cache_wbInvAll(); + break; + + default: + syscalls.printf("bad cmd: %x\n", cmd); + break; + } + + return 0; +} + +#pragma DATA_SECTION(component_test_dsp, ".sec_com"); +dsp_component_t component_test_dsp = { + { + NULL, /* init */ + exec, + NULL, /* exec fastcall RPC */ + NULL, /* exit */ + }, + + COMPONENT_NAME, +}; + +DSP_COMPONENT_MAIN + +// vim:shiftwidth=1:expandtab diff --git a/plugins/dfsound/xa.c b/plugins/dfsound/xa.c index 12413808..6c0ce4b2 100644 --- a/plugins/dfsound/xa.c +++ b/plugins/dfsound/xa.c @@ -90,7 +90,9 @@ INLINE void MixXA(int ns_to, int decode_pos) static unsigned long timeGetTime_spu() { -#ifdef _WIN32 +#if defined(NO_OS) + return 0; +#elif defined(_WIN32) return GetTickCount(); #else struct timeval tv; -- 2.39.2