From 22da0dbda5f040845bc3f86c0839cb3c5eed7077 Mon Sep 17 00:00:00 2001
From: notaz
Date: Tue, 23 Dec 2025 03:01:11 +0200
Subject: [PATCH] gpulib: thread sync reduction

Track the draw areas of recently queued commands in a small ring and,
on scanout, only wait for the worker thread when the displayed rectangle
intersects one of the pending draw areas (gpu_async_sync_scanout).
A new waitmode_target lets the main thread wait for a specific queue
position instead of a full drain, and FAKECMD_BREAK splits the worker's
command batches when the draw area changes.
---
 plugins/gpu_neon/psx_gpu/psx_gpu_parse.c |   4 +
 plugins/gpulib/gpu.c                     |   6 +-
 plugins/gpulib/gpu_async.c               | 224 +++++++++++++++++++----
 plugins/gpulib/gpu_async.h               |   2 +
 4 files changed, 200 insertions(+), 36 deletions(-)

diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
index 7506c1c9..c127c155 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
@@ -1701,6 +1701,10 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *ex_reg
       psx_gpu->saved_viewport_end_x = viewport_end_x;
       psx_gpu->saved_viewport_end_y = viewport_end_y;
 
+      // needed for multithreaded mode where the main thread will start
+      // scanout if it sees no intersect with the latest draw area
+      flush_render_block_buffer(psx_gpu);
+
       select_enhancement_buf(psx_gpu);
 #if 0
       if (!psx_gpu->enhancement_current_buf_ptr)
diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c
index a1c2a7ac..827c0d2e 100644
--- a/plugins/gpulib/gpu.c
+++ b/plugins/gpulib/gpu.c
@@ -1001,7 +1001,11 @@ void GPUupdateLace(void)
     gpu.frameskip.frame_ready = 0;
   }
 
-  sync_renderer(&gpu);
+  if (gpu_async_enabled(&gpu))
+    gpu_async_sync_scanout(&gpu);
+  else
+    renderer_flush_queues();
+
   updated = vout_update();
   if (gpu.state.enhancement_active && !gpu.state.enhancement_was_active)
     renderer_update_caches(0, 0, 1024, 512, 1);
diff --git a/plugins/gpulib/gpu_async.c b/plugins/gpulib/gpu_async.c
index f594ab1b..99a9e099 100644
--- a/plugins/gpulib/gpu_async.c
+++ b/plugins/gpulib/gpu_async.c
@@ -21,14 +21,20 @@
 //#define agpu_log gpu_log
 #define agpu_log(...)
 
-#define AGPU_BUF_LEN (128*1024/4u) // must be power of 2
-#define AGPU_BUF_MASK (AGPU_BUF_LEN - 1)
+// these constants must be powers of 2
+#define AGPU_BUF_LEN    (128*1024/4u)
+#define AGPU_BUF_MASK   (AGPU_BUF_LEN - 1)
+#define AGPU_AREAS_CNT  8u
+#define AGPU_AREAS_MASK (AGPU_AREAS_CNT - 1)
+
 #ifndef min
 #define min(a, b) ((b) < (a) ? (b) : (a))
 #endif
 
-// must be in 0xc0...0xdf range that can't appear in thread's real cmd stream
+// must be in 0xc0...0xdf range that can't appear in thread's real cmd stream;
+// must be at least 3 words due to cmd_lengths[]
 #define FAKECMD_SCREEN_CHANGE 0xdfu
+#define FAKECMD_BREAK         0xdeu
 
 #if defined(__aarch64__) || defined(HAVE_ARMV6)
 #define BARRIER() __asm__ __volatile__ ("dmb ishst" ::: "memory")
@@ -39,13 +45,21 @@
 enum waitmode
 {
   waitmode_none = 0,
   waitmode_progress,
+  waitmode_target,
   waitmode_full,
 };
 
+struct pos_drawarea
+{
+  uint32_t pos;
+  uint16_t x0, y0;
+  uint16_t x1, y1;
+};
+
 struct psx_gpu_async
 {
   uint32_t pos_added;
   uint32_t pos_used;
+  uint32_t pos_target;
   enum waitmode wait_mode;
   uint8_t exit;
   uint8_t idle;
@@ -55,6 +69,8 @@
   scond_t *cond_add;
   uint32_t ex_regs[8]; // used by vram copy at least
   uint32_t cmd_buffer[AGPU_BUF_LEN];
+  uint32_t pos_area;
+  struct pos_drawarea draw_areas[AGPU_AREAS_CNT];
 };
 
 union cmd_screen_change
@@ -122,7 +138,6 @@ static void do_add_with_wait(struct psx_gpu_async *agpu, const uint32_t *list, i
     assert(agpu->wait_mode == waitmode_none);
     agpu->wait_mode = waitmode_progress;
     scond_wait(agpu->cond_add, agpu->lock);
-    agpu->wait_mode = waitmode_none;
   }
   slock_unlock(agpu->lock);
 }
@@ -143,13 +158,35 @@ static void run_thread(struct psx_gpu_async *agpu)
   slock_unlock(agpu->lock);
 }
 
+static void add_draw_area(struct psx_gpu_async *agpu, uint32_t pos, int force,
+  uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1)
+{
+  uint32_t pos_area = agpu->pos_area;
+  if (pos - agpu->draw_areas[pos_area].pos > 1u || force)
+    pos_area = agpu->pos_area = (pos_area + 1) & AGPU_AREAS_MASK;
+  agpu->draw_areas[pos_area].pos = pos;
+  agpu->draw_areas[pos_area].x0 = x0;
+  agpu->draw_areas[pos_area].y0 = y0;
+  agpu->draw_areas[pos_area].x1 = x1;
+  agpu->draw_areas[pos_area].y1 = y1;
+}
+
+static void add_draw_area_e(struct psx_gpu_async *agpu, uint32_t pos, int force,
+  const uint32_t *ex_regs)
+{
+  add_draw_area(agpu, pos, force,
+    ex_regs[3] & 0x3ff, (ex_regs[3] >> 10) & 0x1ff,
+    (ex_regs[4] & 0x3ff) + 1, ((ex_regs[4] >> 10) & 0x1ff) + 1);
+}
+
 int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len,
   int *cpu_cycles_sum_out, int *cpu_cycles_last, int *last_cmd)
 {
   uint32_t cyc_sum = 0, cyc = *cpu_cycles_last;
   struct psx_gpu_async *agpu = gpu->async;
-  int dst_added = 0, dst_can_add = 1;
+  int pos_handled = 0, dst_can_add = 1;
   int rendered_anything = 0;
+  int insert_break = 0;
   int cmd = -1, pos, len;
 
   assert(agpu);
@@ -157,8 +194,9 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len
   {
     const uint32_t *list = list_data + pos;
     const int16_t *slist = (void *)list;
+    const struct pos_drawarea *darea;
     int rendered = 1, skip = 0;
-    int num_vertexes, w, h;
+    int num_vertexes, x, y, w, h;
 
     cmd = LE32TOH(list[0]) >> 24;
     len = 1 + cmd_lengths[cmd];
@@ -169,8 +207,17 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len
     switch (cmd)
     {
       case 0x02:
-        w = LE16TOH(slist[4]) & 0x3FF;
-        h = LE16TOH(slist[5]) & 0x1FF;
+        x = (LE16TOH(slist[2]) & 0x3ff) & ~0xf;
+        y = LE16TOH(slist[3]) & 0x1ff;
+        w = ((LE16TOH(slist[4]) & 0x3ff) + 0xf) & ~0xf;
+        h = LE16TOH(slist[5]) & 0x1ff;
+        darea = &agpu->draw_areas[agpu->pos_area];
+        if (x < darea->x0 || x + w > darea->x1 || y < darea->y0 || y + h > darea->y1) {
+          // let the main thread know about changes outside of drawing area
+          agpu_log(gpu, "agpu: fill %d,%d vs area %d,%d\n", x, y, darea->x0, darea->y0);
+          add_draw_area(agpu, agpu->pos_added, 1, x, y, x + w, y + h);
+          add_draw_area_e(agpu, agpu->pos_added + 1, 1, gpu->ex_regs);
+        }
         gput_sum(cyc_sum, cyc, gput_fill(w, h));
         break;
       case 0x1f: // irq?
@@ -235,17 +282,36 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len
       case 0x78 ... 0x7b:
       case 0x7C ... 0x7f:
         gput_sum(cyc_sum, cyc, gput_sprite(16, 16));
         break;
       case 0x80 ... 0x9f: // vid -> vid
+        x = LE16TOH(slist[4]) & 0x3ff;
+        y = LE16TOH(slist[5]) & 0x1ff;
         w = ((LE16TOH(slist[6]) - 1) & 0x3ff) + 1;
         h = ((LE16TOH(slist[7]) - 1) & 0x1ff) + 1;
+        darea = &agpu->draw_areas[agpu->pos_area];
+        if (x < darea->x0 || x + w > darea->x1 || y < darea->y0 || y + h > darea->y1) {
+          add_draw_area(agpu, agpu->pos_added, 1, x, y, x + w, y + h);
+          add_draw_area_e(agpu, agpu->pos_added + 1, 1, gpu->ex_regs);
+        }
         gput_sum(cyc_sum, cyc, gput_copy(w, h));
         break;
       case 0xa0 ... 0xbf: // sys -> vid
       case 0xc0 ... 0xdf: // vid -> sys
         goto breakloop;
-      case 0xe0 ... 0xe7:
+      case 0xe0 ... 0xe2:
+      case 0xe5 ... 0xe7:
         gpu->ex_regs[cmd & 7] = LE32TOH(list[0]);
         rendered = 0;
         break;
+      case 0xe3:
+      case 0xe4:
+        rendered = 0;
+        if (gpu->ex_regs[cmd & 7] == LE32TOH(list[0])) {
+          skip = 1;
+          break;
+        }
+        gpu->ex_regs[cmd & 7] = LE32TOH(list[0]);
+        add_draw_area_e(agpu, agpu->pos_added, 1, gpu->ex_regs);
+        insert_break = 1;
+        break;
       default:
         rendered = 0;
         skip = 1;
@@ -255,19 +321,24 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len
     if (dst_can_add) {
       if (!skip) {
         int added = dst_can_add = do_add(agpu, list, len);
-        dst_added += added;
+        pos_handled += added;
       }
       else
-        dst_added += len;
+        pos_handled += len;
     }
   }
 breakloop:
-  if (dst_added && (rendered_anything || dst_added < pos))
+  if (pos_handled && (rendered_anything || pos_handled < pos))
     run_thread(agpu);
-  if (dst_added < pos) {
-    int left = pos - dst_added;
-    agpu_log(gpu, "agpu: wait %d left %d\n", agpu->pos_added - agpu->pos_used, left);
-    do_add_with_wait(agpu, list_data + dst_added, left);
+  if (pos_handled < pos) {
+    // note: this is poorly implemented (wrong pos_added for draw_areas)
+    int left = pos - pos_handled;
+    agpu_log(gpu, "agpu: full %d left %d\n", agpu->pos_added - agpu->pos_used, left);
+    do_add_with_wait(agpu, list_data + pos_handled, left);
+  }
+  if (insert_break) {
+    uint32_t cmd[3] = { HTOLE32(FAKECMD_BREAK << 24), };
+    do_add(agpu, cmd, sizeof(cmd) / sizeof(cmd[0]));
   }
 
   *cpu_cycles_sum_out += cyc_sum;
@@ -287,11 +358,21 @@ static STRHEAD_RET_TYPE gpu_async_thread(void *unused)
   while (!agpu->exit) {
     int len = agpu->pos_added - agpu->pos_used;
-    int pos, done, cycles_dummy = 0, cmd = -1;
+    int pos = agpu->pos_used & AGPU_BUF_MASK;
+    int done, cycles_dummy = 0, cmd = -1;
     assert(len >= 0);
     if (len == 0 && !dirty) {
-      if (agpu->wait_mode == waitmode_full)
-        scond_signal(agpu->cond_add);
+      switch (agpu->wait_mode) {
+      case waitmode_full:
+      case waitmode_target:
+        agpu->wait_mode = waitmode_none;
+        scond_signal(agpu->cond_add);
+        break;
+      case waitmode_none:
+        break;
+      default:
+        assert(0);
+      }
       agpu->idle = 1;
       scond_wait(agpu->cond_use, agpu->lock);
       continue;
     }
@@ -305,29 +386,45 @@ static STRHEAD_RET_TYPE gpu_async_thread(void *unused)
       continue;
     }
 
-    pos = agpu->pos_used & AGPU_BUF_MASK;
     len = min(len, AGPU_BUF_LEN - pos);
     done = renderer_do_cmd_list(agpu->cmd_buffer + pos, len,
         agpu->ex_regs, &cycles_dummy, &cycles_dummy, &cmd);
     if (done != len) {
-      if (0x80 <= cmd && cmd < 0xa0)
-        done += do_vram_copy(gpup->vram, agpu->ex_regs,
-          agpu->cmd_buffer + pos + done, &cycles_dummy);
-      else if (cmd == FAKECMD_SCREEN_CHANGE)
-        done += do_notify_screen_change(gpup,
-          (const void *)(agpu->cmd_buffer + pos + done));
-      else if (0xa0 <= cmd && cmd < 0xec)
-        assert(0); // todo?
-      else
-        assert(0); // should not happen
+      switch (cmd) {
+      case 0x80 ... 0x9f:
+        done += do_vram_copy(gpup->vram, agpu->ex_regs,
+          agpu->cmd_buffer + pos + done, &cycles_dummy);
+        break;
+      case FAKECMD_SCREEN_CHANGE:
+        done += do_notify_screen_change(gpup,
+          (const void *)(agpu->cmd_buffer + pos + done));
+        break;
+      case FAKECMD_BREAK:
+        done++;
+        break;
+      default:
+        assert(0);
+        done++;
+        break;
+      }
     }
     dirty = 1;
     assert(done > 0);
     slock_lock(agpu->lock);
     agpu->pos_used += done;
-    if (agpu->wait_mode == waitmode_progress)
-      scond_signal(agpu->cond_add);
+    switch (agpu->wait_mode) {
+    case waitmode_target:
+      if ((int32_t)(agpu->pos_used - agpu->pos_target) < 0)
+        break;
+      // fallthrough
+    case waitmode_progress:
+      agpu->wait_mode = waitmode_none;
+      scond_signal(agpu->cond_add);
+      break;
+    default:
+      break;
+    }
   }
   slock_unlock(agpu->lock);
   STRHEAD_RETURN();
 }
@@ -369,21 +466,78 @@ void gpu_async_sync(struct psx_gpu *gpu)
   if (!agpu || (agpu->idle && agpu->pos_added == agpu->pos_used))
     return;
 
-  agpu_log(gpu, "agpu: stall %d\n", agpu->pos_added - agpu->pos_used);
+  agpu_log(gpu, "agpu: sync %d\n", agpu->pos_added - agpu->pos_used);
   slock_lock(agpu->lock);
-  if (agpu->idle && agpu->pos_added != agpu->pos_used)
+  if (agpu->idle && agpu->pos_added != agpu->pos_used) {
+    agpu_log(gpu, "agpu: idle %d\n", agpu->pos_added - agpu->pos_used);
     run_thread_nolock(agpu);
+  }
   if (!agpu->idle) {
     assert(agpu->wait_mode == waitmode_none);
     agpu->wait_mode = waitmode_full;
     scond_wait(agpu->cond_add, agpu->lock);
-    agpu->wait_mode = waitmode_none;
   }
   slock_unlock(agpu->lock);
   assert(agpu->pos_added == agpu->pos_used);
   assert(agpu->idle);
 }
 
+void gpu_async_sync_scanout(struct psx_gpu *gpu)
+{
+  struct psx_gpu_async *agpu = gpu->async;
+  int so_x0 = gpu->screen.src_x, so_y0 = gpu->screen.src_y;
+  int so_x1 = so_x0 + gpu->screen.hres, so_y1 = so_y0 + gpu->screen.vres;
+  uint32_t pos;
+  int c, i;
+
+  if (!agpu || (agpu->idle && agpu->pos_added == agpu->pos_used))
+    return;
+  pos = *(volatile uint32_t *)&agpu->pos_used;
+  i = agpu->pos_area;
+  if (agpu->idle)
+    /* unlikely but possible - do a full sync */;
+  else if (agpu->draw_areas[(i+1) & AGPU_AREAS_MASK].pos > pos) {
+    agpu_log(gpu, "agpu: oldest draw area %d > %d\n",
+        agpu->draw_areas[(i+1) & AGPU_AREAS_MASK].pos, pos);
+  }
+  else {
+    for (c = 0, i = agpu->pos_area; c < AGPU_AREAS_CNT;
+         c++, i = (i - 1) & AGPU_AREAS_MASK)
+    {
+      int area_x0 = agpu->draw_areas[i].x0, area_y0 = agpu->draw_areas[i].y0;
+      int area_x1 = agpu->draw_areas[i].x1, area_y1 = agpu->draw_areas[i].y1;
+      if (so_x1 <= area_x0 || area_x1 <= so_x0)
+        /* no x intersect */;
+      else if (so_y1 <= area_y0 || area_y1 <= so_y0)
+        /* no y intersect */;
+      else {
+        agpu_log(gpu, "agpu: scanout #%d %d,%d %dx%d hit %d,%d %dx%d\n",
+          c, so_x0, so_y0, so_x1 - so_x0, so_y1 - so_y0,
+          area_x0, area_y0, area_x1 - area_x0, area_y1 - area_y0);
+        break;
+      }
+      pos = *(volatile uint32_t *)&agpu->pos_used;
+      if (pos >= agpu->draw_areas[i].pos)
+        return;
+    }
+    if (c > 0) {
+      i = (i + 1) & AGPU_AREAS_MASK;
+      agpu_log(gpu, "agpu: wait %d/%d\n", agpu->draw_areas[i].pos - agpu->pos_used,
+        agpu->pos_added - agpu->pos_used);
+      slock_lock(agpu->lock);
+      if (!agpu->idle) {
+        assert(agpu->wait_mode == waitmode_none);
+        agpu->pos_target = agpu->draw_areas[i].pos + 1;
+        agpu->wait_mode = waitmode_target;
+        scond_wait(agpu->cond_add, agpu->lock);
+      }
+      slock_unlock(agpu->lock);
+      return;
+    }
+  }
+  gpu_async_sync(gpu);
+}
+
 void gpu_async_sync_ecmds(struct psx_gpu *gpu)
 {
   struct psx_gpu_async *agpu = gpu->async;
diff --git a/plugins/gpulib/gpu_async.h b/plugins/gpulib/gpu_async.h
index 533a23e8..2b2e0c18 100644
--- a/plugins/gpulib/gpu_async.h
+++ b/plugins/gpulib/gpu_async.h
@@ -15,6 +15,7 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list, int list_len,
 void gpu_async_start(struct psx_gpu *gpu);
 void gpu_async_stop(struct psx_gpu *gpu);
 void gpu_async_sync(struct psx_gpu *gpu);
+void gpu_async_sync_scanout(struct psx_gpu *gpu);
 void gpu_async_sync_ecmds(struct psx_gpu *gpu);
 void gpu_async_notify_screen_change(struct psx_gpu *gpu);
 
@@ -25,6 +26,7 @@ void gpu_async_notify_screen_change(struct psx_gpu *gpu);
 #define gpu_async_start(gpu)
 #define gpu_async_stop(gpu)
 #define gpu_async_sync(gpu) do {} while (0)
+#define gpu_async_sync_scanout(gpu) do {} while (0)
 #define gpu_async_sync_ecmds(gpu)
 #define gpu_async_notify_screen_change(gpu)
 
-- 
2.47.3