From 2fbb05197d220bc87be96cec4eef753e6554c02e Mon Sep 17 00:00:00 2001 From: notaz Date: Sun, 4 Jan 2026 01:18:45 +0200 Subject: [PATCH] gpu_async: some dma support --- plugins/gpulib/gpu.c | 58 ++++++++-------- plugins/gpulib/gpu.h | 23 ++++++- plugins/gpulib/gpu_async.c | 137 ++++++++++++++++++++++++++++++++----- plugins/gpulib/gpu_async.h | 8 ++- 4 files changed, 175 insertions(+), 51 deletions(-) diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c index c9b764d5..04ffbf6e 100644 --- a/plugins/gpulib/gpu.c +++ b/plugins/gpulib/gpu.c @@ -11,7 +11,6 @@ #include #include #include -#include /* for calloc */ #include "gpu.h" #include "gpu_timing.h" @@ -31,7 +30,7 @@ struct psx_gpu gpu; static noinline int do_cmd_buffer(struct psx_gpu *gpu, uint32_t *data, int count, int *cycles_sum, int *cycles_last); -static noinline void finish_vram_transfer(struct psx_gpu *gpu, int is_read); +static noinline void finish_vram_transfer(struct psx_gpu *gpu, int is_read, int is_async); static void sync_renderer(struct psx_gpu *gpu) { @@ -51,7 +50,7 @@ static noinline void do_cmd_reset(struct psx_gpu *gpu) sync_renderer(gpu); if (unlikely(gpu->dma.h > 0)) - finish_vram_transfer(gpu, gpu->dma_start.is_read); + finish_vram_transfer(gpu, gpu->dma_start.is_read, 0); gpu->dma.h = 0; } @@ -428,10 +427,8 @@ const unsigned char cmd_lengths[256] = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -#define VRAM_MEM_XY(vram_, x, y) &vram_[(y) * 1024 + (x)] - // this isn't very useful so should be rare -static void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6) +void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6) { int i; if (r6 == 1) { @@ -447,18 +444,6 @@ static void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6) } } -static inline void do_vram_line(uint16_t *vram_, int x, int y, - uint16_t *mem, int l, int is_read, uint32_t r6) -{ - uint16_t *vram = VRAM_MEM_XY(vram_, x, y); - if (unlikely(is_read)) - memcpy(mem, vram, l * 2); - else if 
(unlikely(r6)) - cpy_mask(vram, mem, l, r6); - else - memcpy(vram, mem, l * 2); -} - static int do_vram_io(struct psx_gpu *gpu, uint32_t *data, int count, int is_read) { int count_initial = count; @@ -468,11 +453,20 @@ static int do_vram_io(struct psx_gpu *gpu, uint32_t *data, int count, int is_rea int x = gpu->dma.x, y = gpu->dma.y; int w = gpu->dma.w, h = gpu->dma.h; int o = gpu->dma.offset; - int l; - count *= 2; // operate in 16bpp pixels - - //sync_renderer(gpu); // done in start_vram_transfer() + int l, async_queued = 0; + + if (gpu_async_enabled(gpu) && !is_read && o == 0 && + count <= AGPU_DMA_MAX && w * h == count * 2) + async_queued = gpu_async_try_dma(gpu, data, count); + if (async_queued) { + gpu->dma.h = 0; + finish_vram_transfer(gpu, 0, 1); + return count; + } + if (o == 0) + sync_renderer(gpu); + count *= 2; // operate in 16bpp pixels if (gpu->dma.offset) { l = w - gpu->dma.offset; if (count < l) @@ -505,7 +499,7 @@ static int do_vram_io(struct psx_gpu *gpu, uint32_t *data, int count, int is_rea } } else - finish_vram_transfer(gpu, is_read); + finish_vram_transfer(gpu, is_read, 0); gpu->dma.y = y; gpu->dma.h = h; gpu->dma.offset = o; @@ -527,7 +521,8 @@ static noinline void start_vram_transfer(struct psx_gpu *gpu, uint32_t pos_word, gpu->dma.is_read = is_read; gpu->dma_start = gpu->dma; - sync_renderer(gpu); + // postponed until the actual transfer + //sync_renderer(gpu); if (is_read) { const uint16_t *mem = VRAM_MEM_XY(gpu->vram, gpu->dma.x, gpu->dma.y); @@ -537,13 +532,17 @@ static noinline void start_vram_transfer(struct psx_gpu *gpu, uint32_t pos_word, gpu->state.last_vram_read_frame = *gpu->state.frame_count; } - log_io(gpu, "start_vram_transfer %c (%d, %d) %dx%d\n", is_read ? 'r' : 'w', - gpu->dma.x, gpu->dma.y, gpu->dma.w, gpu->dma.h); + if (gpu->dma.x + gpu->dma.w > 1024) + log_anomaly(gpu, "vram tr xwrap: %c (%d, %d) %dx%d\n", is_read ? 
'r' : 'w', + gpu->dma.x, gpu->dma.y, gpu->dma.w, gpu->dma.h); + else + log_io(gpu, "start_vram_transfer %c (%d, %d) %dx%d\n", is_read ? 'r' : 'w', + gpu->dma.x, gpu->dma.y, gpu->dma.w, gpu->dma.h); if (gpu->gpu_state_change) gpu->gpu_state_change(PGS_VRAM_TRANSFER_START, 0); } -static void finish_vram_transfer(struct psx_gpu *gpu, int is_read) +static void finish_vram_transfer(struct psx_gpu *gpu, int is_read, int is_async) { if (is_read) gpu->status &= ~PSX_GPU_STATUS_IMG; @@ -562,8 +561,9 @@ static void finish_vram_transfer(struct psx_gpu *gpu, int is_read) gpu->dma_start.x, gpu->dma_start.y, gpu->dma_start.w, gpu->dma_start.h, gpu->screen.src_x, gpu->screen.src_y, gpu->screen.hres, gpu->screen.vres, !not_dirty); gpu->state.fb_dirty |= !not_dirty; - renderer_update_caches(gpu->dma_start.x, gpu->dma_start.y, - gpu->dma_start.w, gpu->dma_start.h, 0); + if (!is_async) + renderer_update_caches(gpu->dma_start.x, gpu->dma_start.y, + gpu->dma_start.w, gpu->dma_start.h, 0); } if (gpu->gpu_state_change) gpu->gpu_state_change(PGS_VRAM_TRANSFER_END, 0); diff --git a/plugins/gpulib/gpu.h b/plugins/gpulib/gpu.h index 7a2bcab0..d4b66e5a 100644 --- a/plugins/gpulib/gpu.h +++ b/plugins/gpulib/gpu.h @@ -12,14 +12,19 @@ #define __GPULIB_GPU_H__ #include +#include +#include "../../include/compiler_features.h" //#define RAW_FB_DISPLAY #define gpu_log(gpu, fmt, ...) \ printf("%d:%03d: " fmt, *(gpu)->state.frame_count, *(gpu)->state.hcnt, ##__VA_ARGS__) -//#define log_anomaly gpu_log +#ifdef LOG_UNHANDLED +#define log_anomaly gpu_log +#else #define log_anomaly(...) 
+#endif #ifdef __cplusplus extern "C" { @@ -150,12 +155,28 @@ void vout_blank(void); void vout_set_config(const struct rearmed_cbs *config); // helpers +#define VRAM_MEM_XY(vram_, x, y) &vram_[(y) * 1024 + (x)] + int do_vram_copy(uint16_t *vram, const uint32_t *ex_regs, const uint32_t *params, int *cpu_cycles); int prim_try_simplify_quad_t (void *simplified, const void *prim); int prim_try_simplify_quad_gt(void *simplified, const void *prim); +void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6); + +static inline void do_vram_line(uint16_t *vram_, int x, int y, + uint16_t *mem, int l, int is_read, uint32_t r6) +{ + uint16_t *vram = VRAM_MEM_XY(vram_, x, y); + if (unlikely(is_read)) + memcpy(mem, vram, l * 2); + else if (unlikely(r6)) + cpy_mask(vram, mem, l, r6); + else + memcpy(vram, mem, l * 2); +} + /* listing these here for correct linkage if rasterizer uses c++ */ struct GPUFreeze; diff --git a/plugins/gpulib/gpu_async.c b/plugins/gpulib/gpu_async.c index 385d78de..b7a085b6 100644 --- a/plugins/gpulib/gpu_async.c +++ b/plugins/gpulib/gpu_async.c @@ -35,7 +35,8 @@ // must be at least 3 words due to cmd_lengths[] #define FAKECMD_SCREEN_CHANGE 0xdfu #define FAKECMD_SET_INTERLACE 0xdeu -#define FAKECMD_BREAK 0xddu +#define FAKECMD_DMA_WRITE 0xddu +#define FAKECMD_BREAK 0xdcu #if defined(__aarch64__) || defined(HAVE_ARMV7) #define BARRIER() __asm__ __volatile__ ("dmb ishst" ::: "memory") @@ -99,6 +100,15 @@ union cmd_set_interlace }; }; +union cmd_dma_write +{ + uint32_t u32s[3]; + struct { + uint32_t cmd; + short x, y, w, h; + }; +}; + struct cmd_break { uint32_t u32s[3]; @@ -108,6 +118,8 @@ static int noinline do_notify_screen_change(struct psx_gpu *gpu, const union cmd_screen_change *cmd); static int do_set_interlace(struct psx_gpu *gpu, const union cmd_set_interlace *cmd); +static int do_dma_write(struct psx_gpu *gpu, + const union cmd_dma_write *cmd, uint32_t pos); static void run_thread_nolock(struct psx_gpu_async *agpu) { @@ -124,52 +136,62 
@@ static void run_thread(struct psx_gpu_async *agpu) slock_unlock(agpu->lock); } -static int calc_space_for_add(struct psx_gpu_async *agpu) +static int calc_space_for_add(struct psx_gpu_async *agpu, uint32_t pos_added) { - int space = AGPU_BUF_LEN - (agpu->pos_added - RDPOS(agpu->pos_used)); + int space = AGPU_BUF_LEN - (pos_added - RDPOS(agpu->pos_used)); assert(space >= 0); assert(space <= AGPU_BUF_LEN); return space; } // adds everything or nothing, else we may get incomplete cmd -static int do_add(struct psx_gpu_async *agpu, const uint32_t *list, int len) +static int do_add_pos(struct psx_gpu_async *agpu, const void *list, int list_words, + uint32_t *pos_added_) { int pos, space, left, retval = 0; - uint32_t pos_added = agpu->pos_added; + uint32_t pos_added = *pos_added_; - assert(len < AGPU_BUF_LEN); - space = calc_space_for_add(agpu); - if (space < len) + assert(list_words < AGPU_BUF_LEN); + space = calc_space_for_add(agpu, pos_added); + if (space < list_words) return 0; pos = pos_added & AGPU_BUF_MASK; left = AGPU_BUF_LEN - pos; - if (left < len) { + if (left < list_words) { memset(&agpu->cmd_buffer[pos], 0, left * 4); pos_added += left; pos = 0; - space = calc_space_for_add(agpu); + space = calc_space_for_add(agpu, pos_added); } - if (space >= len) { - memcpy(&agpu->cmd_buffer[pos], list, len * 4); - pos_added += len; - retval = len; + if (space >= list_words) { + memcpy(&agpu->cmd_buffer[pos], list, list_words * 4); + pos_added += list_words; + retval = list_words; } + *pos_added_ = pos_added; + return retval; +} + +static int do_add(struct psx_gpu_async *agpu, const void *list, int list_words) +{ + uint32_t pos_added = agpu->pos_added; + int ret = do_add_pos(agpu, list, list_words, &pos_added); BARRIER(); WRPOS(agpu->pos_added, pos_added); - return retval; + return ret; } -static void do_add_with_wait(struct psx_gpu_async *agpu, const uint32_t *list, int len) +static void do_add_with_wait(struct psx_gpu_async *agpu, + const void *list, int list_words) { 
for (;;) { - if (do_add(agpu, list, len)) + if (do_add(agpu, list, list_words)) break; slock_lock(agpu->lock); run_thread_nolock(agpu); - while (len > AGPU_BUF_LEN - (agpu->pos_added - RDPOS(agpu->pos_used))) { + while (list_words > AGPU_BUF_LEN - (agpu->pos_added - RDPOS(agpu->pos_used))) { assert(!agpu->idle); assert(agpu->wait_mode == waitmode_none); agpu->wait_mode = waitmode_progress; @@ -200,7 +222,7 @@ static void add_draw_area_e(struct psx_gpu_async *agpu, uint32_t pos, int force, (ex_regs[4] & 0x3ff) + 1, ((ex_regs[4] >> 10) & 0x1ff) + 1); } -int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len, +int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list_data, int list_len, int *cpu_cycles_sum_out, int *cpu_cycles_last, int *last_cmd) { uint32_t cyc_sum = 0, cyc = *cpu_cycles_last; @@ -368,6 +390,50 @@ breakloop: return pos; } +int gpu_async_try_dma(struct psx_gpu *gpu, const uint32_t *data, int words) +{ + struct psx_gpu_async *agpu = gpu->async; + int used, w = gpu->dma.w, h = gpu->dma.h; + uint32_t pos_added = agpu ? agpu->pos_added : 0; /* agpu is NULL-checked below; don't deref before that */ + union cmd_dma_write cmd; + int bad = 0; + + if (!agpu) + return 0; + // avoid double copying + used = agpu->pos_added - RDPOS(agpu->pos_used); + if (agpu->idle && used == 0) + return 0; + // only proceed if there is space to avoid messy sync + if (AGPU_BUF_LEN - used < sizeof(cmd) / 4 + ((w + 1) / 2) * (h + 1)) { + agpu_log(gpu, "agpu: dma: used %d\n", used); + return 0; + } + + cmd.cmd = HTOLE32(FAKECMD_DMA_WRITE << 24); + cmd.x = gpu->dma.x; cmd.y = gpu->dma.y; + cmd.w = gpu->dma.w; cmd.h = gpu->dma.h; + bad |= !do_add_pos(agpu, cmd.u32s, sizeof(cmd) / 4, &pos_added); + if (w & 1) { + // align lines to psx dma word units + const uint16_t *sdata = (const uint16_t *)data; + for (; h > 0; sdata += w, h--) + bad |= !do_add_pos(agpu, sdata, w / 2 + 1, &pos_added); + } + else { + for (; h > 0; data += w / 2, h--) + bad |= !do_add_pos(agpu, data, w / 2, &pos_added); + } + assert(!bad); (void)bad; 
+ + slock_lock(agpu->lock); + agpu->pos_added = pos_added; + run_thread_nolock(agpu); + slock_unlock(agpu->lock); + + return 1; +} + static STRHEAD_RET_TYPE gpu_async_thread(void *unused) { struct psx_gpu *gpup = &gpu; @@ -422,6 +488,9 @@ static STRHEAD_RET_TYPE gpu_async_thread(void *unused) case FAKECMD_SET_INTERLACE: done += do_set_interlace(gpup, list); break; + case FAKECMD_DMA_WRITE: + done += do_dma_write(gpup, list, pos + done); + break; case FAKECMD_BREAK: done += sizeof(struct cmd_break) / 4; break; @@ -504,6 +573,34 @@ static int do_set_interlace(struct psx_gpu *gpu, return sizeof(*cmd) / 4; } +static int do_dma_write(struct psx_gpu *gpu, + const union cmd_dma_write *cmd, uint32_t pos) +{ + int x = cmd->x, y = cmd->y, w = cmd->w, h = cmd->h; + struct psx_gpu_async *agpu = gpu->async; + uint32_t r6 = agpu->ex_regs[6] & 3; + uint16_t *vram = gpu->vram; + int stride = (w + 1) / 2; + int done = 0; + + pos += sizeof(*cmd) / 4u; + done += sizeof(*cmd) / 4u; + assert(pos <= AGPU_BUF_LEN); + for (; h > 0; h--, y++) { + if (stride > AGPU_BUF_LEN - pos) { + done += AGPU_BUF_LEN - pos; + pos = 0; + } + + y &= 511; + do_vram_line(vram, x, y, (uint16_t *)&agpu->cmd_buffer[pos], w, 0, r6); + pos += stride; + done += stride; + } + renderer_update_caches(x, cmd->y, w, cmd->h, 0); + return done; +} + void gpu_async_sync(struct psx_gpu *gpu) { struct psx_gpu_async *agpu = gpu->async; @@ -629,6 +726,8 @@ void gpu_async_start(struct psx_gpu *gpu) if (gpu->async) return; + assert(AGPU_DMA_MAX <= AGPU_BUF_LEN / 2); + agpu = calloc(1, sizeof(*agpu)); if (agpu) { agpu->lock = slock_new(); diff --git a/plugins/gpulib/gpu_async.h b/plugins/gpulib/gpu_async.h index cf429b7a..7edf0e10 100644 --- a/plugins/gpulib/gpu_async.h +++ b/plugins/gpulib/gpu_async.h @@ -6,12 +6,15 @@ struct psx_gpu; struct psx_gpu_async; +#define AGPU_DMA_MAX 4096 // words + #ifdef USE_ASYNC_GPU #define gpu_async_enabled(gpu) ((gpu)->async) -int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list, int 
list_len, - int *cycles_sum_out, int *cycles_last, int *last_cmd); +int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list, int list_len, + int *cycles_sum_out, int *cycles_last, int *last_cmd); +int gpu_async_try_dma(struct psx_gpu *gpu, const uint32_t *data, int words); void gpu_async_start(struct psx_gpu *gpu); void gpu_async_stop(struct psx_gpu *gpu); void gpu_async_sync(struct psx_gpu *gpu); @@ -24,6 +27,7 @@ void gpu_async_set_interlace(struct psx_gpu *gpu, int enable, int is_odd); #define gpu_async_enabled(gpu) 0 #define gpu_async_do_cmd_list(gpu, list, list_len, c0, c1, cmd) (list_len) +#define gpu_async_try_dma(gpu, data, words) 0 #define gpu_async_start(gpu) #define gpu_async_stop(gpu) #define gpu_async_sync(gpu) do {} while (0) -- 2.47.3