From 22da0dbda5f040845bc3f86c0839cb3c5eed7077 Mon Sep 17 00:00:00 2001
From: notaz
Date: Tue, 23 Dec 2025 03:01:11 +0200
Subject: [PATCH] gpulib: thread sync reduction

Track the draw areas of recently queued commands in a small ring and,
on scanout, only wait for the worker thread when the displayed rectangle
intersects one of the pending draw areas (gpu_async_sync_scanout).
A new waitmode_target lets the main thread wait for a specific queue
position instead of a full drain, and FAKECMD_BREAK splits the worker's
command batches when the draw area changes.
---
 plugins/gpu_neon/psx_gpu/psx_gpu_parse.c |   4 +
 plugins/gpulib/gpu.c                     |   6 +-
 plugins/gpulib/gpu_async.c               | 224 +++++++++++++++++++----
 plugins/gpulib/gpu_async.h               |   2 +
 4 files changed, 200 insertions(+), 36 deletions(-)

diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
index 7506c1c9..c127c155 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
@@ -1701,6 +1701,10 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *ex_reg
       psx_gpu->saved_viewport_end_x = viewport_end_x;
       psx_gpu->saved_viewport_end_y = viewport_end_y;
 
+      // needed for multithreaded mode where the main thread will start
+      // scanout if it sees no intersect with the latest draw area
+      flush_render_block_buffer(psx_gpu);
+
       select_enhancement_buf(psx_gpu);
 #if 0
       if (!psx_gpu->enhancement_current_buf_ptr)
diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c
index a1c2a7ac..827c0d2e 100644
--- a/plugins/gpulib/gpu.c
+++ b/plugins/gpulib/gpu.c
@@ -1001,7 +1001,11 @@ void GPUupdateLace(void)
     gpu.frameskip.frame_ready = 0;
   }
 
-  sync_renderer(&gpu);
+  if (gpu_async_enabled(&gpu))
+    gpu_async_sync_scanout(&gpu);
+  else
+    renderer_flush_queues();
+
   updated = vout_update();
   if (gpu.state.enhancement_active && !gpu.state.enhancement_was_active)
     renderer_update_caches(0, 0, 1024, 512, 1);
diff --git a/plugins/gpulib/gpu_async.c b/plugins/gpulib/gpu_async.c
index f594ab1b..99a9e099 100644
--- a/plugins/gpulib/gpu_async.c
+++ b/plugins/gpulib/gpu_async.c
@@ -21,14 +21,20 @@
 //#define agpu_log gpu_log
 #define agpu_log(...)
 
-#define AGPU_BUF_LEN (128*1024/4u) // must be power of 2
-#define AGPU_BUF_MASK (AGPU_BUF_LEN - 1)
+// these constants must be powers of 2
+#define AGPU_BUF_LEN    (128*1024/4u)
+#define AGPU_BUF_MASK   (AGPU_BUF_LEN - 1)
+#define AGPU_AREAS_CNT  8u
+#define AGPU_AREAS_MASK (AGPU_AREAS_CNT - 1)
+
 #ifndef min
 #define min(a, b) ((b) < (a) ? (b) : (a))
 #endif
 
-// must be in 0xc0...0xdf range that can't appear in thread's real cmd stream
+// must be in 0xc0...0xdf range that can't appear in thread's real cmd stream;
+// must be at least 3 words due to cmd_lengths[]
 #define FAKECMD_SCREEN_CHANGE 0xdfu
+#define FAKECMD_BREAK         0xdeu
 
 #if defined(__aarch64__) || defined(HAVE_ARMV6)
 #define BARRIER() __asm__ __volatile__ ("dmb ishst" ::: "memory")
@@ -39,13 +45,21 @@
 enum waitmode
 {
   waitmode_none = 0,
   waitmode_progress,
+  waitmode_target,
   waitmode_full,
 };
 
+struct pos_drawarea
+{
+  uint32_t pos;
+  uint16_t x0, y0;
+  uint16_t x1, y1;
+};
+
 struct psx_gpu_async
 {
   uint32_t pos_added;
   uint32_t pos_used;
+  uint32_t pos_target;
   enum waitmode wait_mode;
   uint8_t exit;
   uint8_t idle;
@@ -55,6 +69,8 @@
   scond_t *cond_add;
   uint32_t ex_regs[8]; // used by vram copy at least
   uint32_t cmd_buffer[AGPU_BUF_LEN];
+  uint32_t pos_area;
+  struct pos_drawarea draw_areas[AGPU_AREAS_CNT];
 };
 
 union cmd_screen_change
@@ -122,7 +138,6 @@ static void do_add_with_wait(struct psx_gpu_async *agpu, const uint32_t *list, i
     assert(agpu->wait_mode == waitmode_none);
     agpu->wait_mode = waitmode_progress;
     scond_wait(agpu->cond_add, agpu->lock);
-    agpu->wait_mode = waitmode_none;
   }
   slock_unlock(agpu->lock);
 }
@@ -143,13 +158,35 @@ static void run_thread(struct psx_gpu_async *agpu)
   slock_unlock(agpu->lock);
 }
 
+static void add_draw_area(struct psx_gpu_async *agpu, uint32_t pos, int force,
+  uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1)
+{
+  uint32_t pos_area = agpu->pos_area;
+  if (pos - agpu->draw_areas[pos_area].pos > 1u || force)
+    pos_area = agpu->pos_area = (pos_area + 1) & AGPU_AREAS_MASK;
+  agpu->draw_areas[pos_area].pos = pos;
+  agpu->draw_areas[pos_area].x0 = x0;
+  agpu->draw_areas[pos_area].y0 = y0;
+  agpu->draw_areas[pos_area].x1 = x1;
+  agpu->draw_areas[pos_area].y1 = y1;
+}
+
+static void add_draw_area_e(struct psx_gpu_async *agpu, uint32_t pos, int force,
+  const uint32_t *ex_regs)
+{
+  add_draw_area(agpu, pos, force,
+    ex_regs[3] & 0x3ff, (ex_regs[3] >> 10) & 0x1ff,
+    (ex_regs[4] & 0x3ff) + 1, ((ex_regs[4] >> 10) & 0x1ff) + 1);
+}
+
 int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len,
   int *cpu_cycles_sum_out, int *cpu_cycles_last, int *last_cmd)
 {
   uint32_t cyc_sum = 0, cyc = *cpu_cycles_last;
   struct psx_gpu_async *agpu = gpu->async;
-  int dst_added = 0, dst_can_add = 1;
+  int pos_handled = 0, dst_can_add = 1;
   int rendered_anything = 0;
+  int insert_break = 0;
   int cmd = -1, pos, len;
 
   assert(agpu);
@@ -157,8 +194,9 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len
   {
     const uint32_t *list = list_data + pos;
     const int16_t *slist = (void *)list;
+    const struct pos_drawarea *darea;
     int rendered = 1, skip = 0;
-    int num_vertexes, w, h;
+    int num_vertexes, x, y, w, h;
 
     cmd = LE32TOH(list[0]) >> 24;
     len = 1 + cmd_lengths[cmd];
@@ -169,8 +207,17 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len
     switch (cmd)
     {
       case 0x02:
-        w = LE16TOH(slist[4]) & 0x3FF;
-        h = LE16TOH(slist[5]) & 0x1FF;
+        x = (LE16TOH(slist[2]) & 0x3ff) & ~0xf;
+        y = LE16TOH(slist[3]) & 0x1ff;
+        w = ((LE16TOH(slist[4]) & 0x3ff) + 0xf) & ~0xf;
+        h = LE16TOH(slist[5]) & 0x1ff;
+        darea = &agpu->draw_areas[agpu->pos_area];
+        if (x < darea->x0 || x + w > darea->x1 || y < darea->y0 || y + h > darea->y1) {
+          // let the main thread know about changes outside of drawing area
+          agpu_log(gpu, "agpu: fill %d,%d vs area %d,%d\n", x, y, darea->x0, darea->y0);
+          add_draw_area(agpu, agpu->pos_added, 1, x, y, x + w, y + h);
+          add_draw_area_e(agpu, agpu->pos_added + 1, 1, gpu->ex_regs);
+        }
         gput_sum(cyc_sum, cyc, gput_fill(w, h));
         break;
       case 0x1f: // irq?
@@ -235,17 +282,36 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len
       case 0x78 ... 0x7b:
       case 0x7C ... 0x7f:
         gput_sum(cyc_sum, cyc, gput_sprite(16, 16));
         break;
       case 0x80 ... 0x9f: // vid -> vid
+        x = LE16TOH(slist[4]) & 0x3ff;
+        y = LE16TOH(slist[5]) & 0x1ff;
         w = ((LE16TOH(slist[6]) - 1) & 0x3ff) + 1;
         h = ((LE16TOH(slist[7]) - 1) & 0x1ff) + 1;
+        darea = &agpu->draw_areas[agpu->pos_area];
+        if (x < darea->x0 || x + w > darea->x1 || y < darea->y0 || y + h > darea->y1) {
+          add_draw_area(agpu, agpu->pos_added, 1, x, y, x + w, y + h);
+          add_draw_area_e(agpu, agpu->pos_added + 1, 1, gpu->ex_regs);
+        }
         gput_sum(cyc_sum, cyc, gput_copy(w, h));
         break;
       case 0xa0 ... 0xbf: // sys -> vid
       case 0xc0 ... 0xdf: // vid -> sys
         goto breakloop;
-      case 0xe0 ... 0xe7:
+      case 0xe0 ... 0xe2:
+      case 0xe5 ... 0xe7:
         gpu->ex_regs[cmd & 7] = LE32TOH(list[0]);
         rendered = 0;
         break;
+      case 0xe3:
+      case 0xe4:
+        rendered = 0;
+        if (gpu->ex_regs[cmd & 7] == LE32TOH(list[0])) {
+          skip = 1;
+          break;
+        }
+        gpu->ex_regs[cmd & 7] = LE32TOH(list[0]);
+        add_draw_area_e(agpu, agpu->pos_added, 1, gpu->ex_regs);
+        insert_break = 1;
+        break;
       default:
         rendered = 0;
         skip = 1;
@@ -255,19 +321,24 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len
     if (dst_can_add) {
       if (!skip) {
         int added = dst_can_add = do_add(agpu, list, len);
-        dst_added += added;
+        pos_handled += added;
       }
       else
-        dst_added += len;
+        pos_handled += len;
     }
   }
 breakloop:
-  if (dst_added && (rendered_anything || dst_added < pos))
+  if (pos_handled && (rendered_anything || pos_handled < pos))
     run_thread(agpu);
-  if (dst_added < pos) {
-    int left = pos - dst_added;
-    agpu_log(gpu, "agpu: wait %d left %d\n", agpu->pos_added - agpu->pos_used, left);
-    do_add_with_wait(agpu, list_data + dst_added, left);
+  if (pos_handled < pos) {
+    // note: this is poorly implemented (wrong pos_added for draw_areas)
+    int left = pos - pos_handled;
+    agpu_log(gpu, "agpu: full %d left %d\n", agpu->pos_added - agpu->pos_used, left);
+    do_add_with_wait(agpu, list_data + pos_handled, left);
+  }
+  if (insert_break) {
+    uint32_t cmd[3] = { HTOLE32(FAKECMD_BREAK << 24), };
+    do_add(agpu, cmd, sizeof(cmd) / sizeof(cmd[0]));
   }
 
   *cpu_cycles_sum_out += cyc_sum;
@@ -287,11 +358,21 @@ static STRHEAD_RET_TYPE gpu_async_thread(void *unused)
   while (!agpu->exit) {
     int len = agpu->pos_added - agpu->pos_used;
-    int pos, done, cycles_dummy = 0, cmd = -1;
+    int pos = agpu->pos_used & AGPU_BUF_MASK;
+    int done, cycles_dummy = 0, cmd = -1;
     assert(len >= 0);
     if (len == 0 && !dirty) {
-      if (agpu->wait_mode == waitmode_full)
-        scond_signal(agpu->cond_add);
+      switch (agpu->wait_mode) {
+      case waitmode_full:
+      case waitmode_target:
+        agpu->wait_mode = waitmode_none;
+        scond_signal(agpu->cond_add);
+        break;
+      case waitmode_none:
+        break;
+      default:
+        assert(0);
+      }
       agpu->idle = 1;
       scond_wait(agpu->cond_use, agpu->lock);
       continue;
     }
@@ -305,29 +386,45 @@ static STRHEAD_RET_TYPE gpu_async_thread(void *unused)
       continue;
     }
 
-    pos = agpu->pos_used & AGPU_BUF_MASK;
     len = min(len, AGPU_BUF_LEN - pos);
     done = renderer_do_cmd_list(agpu->cmd_buffer + pos, len,
         agpu->ex_regs, &cycles_dummy, &cycles_dummy, &cmd);
     if (done != len) {
-      if (0x80 <= cmd && cmd < 0xa0)
-        done += do_vram_copy(gpup->vram, agpu->ex_regs,
-          agpu->cmd_buffer + pos + done, &cycles_dummy);
-      else if (cmd == FAKECMD_SCREEN_CHANGE)
-        done += do_notify_screen_change(gpup,
-          (const void *)(agpu->cmd_buffer + pos + done));
-      else if (0xa0 <= cmd && cmd < 0xec)
-        assert(0); // todo?
-      else
-        assert(0); // should not happen
+      switch (cmd) {
+      case 0x80 ... 0x9f:
+        done += do_vram_copy(gpup->vram, agpu->ex_regs,
+          agpu->cmd_buffer + pos + done, &cycles_dummy);
+        break;
+      case FAKECMD_SCREEN_CHANGE:
+        done += do_notify_screen_change(gpup,
+          (const void *)(agpu->cmd_buffer + pos + done));
+        break;
+      case FAKECMD_BREAK:
+        done++;
+        break;
+      default:
+        assert(0);
+        done++;
+        break;
+      }
     }
     dirty = 1;
     assert(done > 0);
     slock_lock(agpu->lock);
     agpu->pos_used += done;
-    if (agpu->wait_mode == waitmode_progress)
-      scond_signal(agpu->cond_add);
+    switch (agpu->wait_mode) {
+    case waitmode_target:
+      if ((int32_t)(agpu->pos_used - agpu->pos_target) < 0)
+        break;
+      // fallthrough
+    case waitmode_progress:
+      agpu->wait_mode = waitmode_none;
+      scond_signal(agpu->cond_add);
+      break;
+    default:
+      break;
+    }
   }
   slock_unlock(agpu->lock);
   STRHEAD_RETURN();
 }
@@ -369,21 +466,78 @@ void gpu_async_sync(struct psx_gpu *gpu)
   if (!agpu || (agpu->idle && agpu->pos_added == agpu->pos_used))
     return;
 
-  agpu_log(gpu, "agpu: stall %d\n", agpu->pos_added - agpu->pos_used);
+  agpu_log(gpu, "agpu: sync %d\n", agpu->pos_added - agpu->pos_used);
   slock_lock(agpu->lock);
-  if (agpu->idle && agpu->pos_added != agpu->pos_used)
+  if (agpu->idle && agpu->pos_added != agpu->pos_used) {
+    agpu_log(gpu, "agpu: idle %d\n", agpu->pos_added - agpu->pos_used);
     run_thread_nolock(agpu);
+  }
   if (!agpu->idle) {
     assert(agpu->wait_mode == waitmode_none);
     agpu->wait_mode = waitmode_full;
     scond_wait(agpu->cond_add, agpu->lock);
-    agpu->wait_mode = waitmode_none;
   }
   slock_unlock(agpu->lock);
   assert(agpu->pos_added == agpu->pos_used);
   assert(agpu->idle);
 }
 
+void gpu_async_sync_scanout(struct psx_gpu *gpu)
+{
+  struct psx_gpu_async *agpu = gpu->async;
+  int so_x0 = gpu->screen.src_x, so_y0 = gpu->screen.src_y;
+  int so_x1 = so_x0 + gpu->screen.hres, so_y1 = so_y0 + gpu->screen.vres;
+  uint32_t pos;
+  int c, i;
+
+  if (!agpu || (agpu->idle && agpu->pos_added == agpu->pos_used))
+    return;
+  pos = *(volatile uint32_t *)&agpu->pos_used;
+  i = agpu->pos_area;
+  if (agpu->idle)
+    /* unlikely but possible - do a full sync */;
+  else if (agpu->draw_areas[(i+1) & AGPU_AREAS_MASK].pos > pos) {
+    agpu_log(gpu, "agpu: oldest draw area %d > %d\n",
+        agpu->draw_areas[(i+1) & AGPU_AREAS_MASK].pos, pos);
+  }
+  else {
+    for (c = 0, i = agpu->pos_area; c < AGPU_AREAS_CNT;
+         c++, i = (i - 1) & AGPU_AREAS_MASK)
+    {
+      int area_x0 = agpu->draw_areas[i].x0, area_y0 = agpu->draw_areas[i].y0;
+      int area_x1 = agpu->draw_areas[i].x1, area_y1 = agpu->draw_areas[i].y1;
+      if (so_x1 <= area_x0 || area_x1 <= so_x0)
+        /* no x intersect */;
+      else if (so_y1 <= area_y0 || area_y1 <= so_y0)
+        /* no y intersect */;
+      else {
+        agpu_log(gpu, "agpu: scanout #%d %d,%d %dx%d hit %d,%d %dx%d\n",
+          c, so_x0, so_y0, so_x1 - so_x0, so_y1 - so_y0,
+          area_x0, area_y0, area_x1 - area_x0, area_y1 - area_y0);
+        break;
+      }
+      pos = *(volatile uint32_t *)&agpu->pos_used;
+      if (pos >= agpu->draw_areas[i].pos)
+        return;
+    }
+    if (c > 0) {
+      i = (i + 1) & AGPU_AREAS_MASK;
+      agpu_log(gpu, "agpu: wait %d/%d\n", agpu->draw_areas[i].pos - agpu->pos_used,
+        agpu->pos_added - agpu->pos_used);
+      slock_lock(agpu->lock);
+      if (!agpu->idle) {
+        assert(agpu->wait_mode == waitmode_none);
+        agpu->pos_target = agpu->draw_areas[i].pos + 1;
+        agpu->wait_mode = waitmode_target;
+        scond_wait(agpu->cond_add, agpu->lock);
+      }
+      slock_unlock(agpu->lock);
+      return;
+    }
+  }
+  gpu_async_sync(gpu);
+}
+
 void gpu_async_sync_ecmds(struct psx_gpu *gpu)
 {
   struct psx_gpu_async *agpu = gpu->async;
diff --git a/plugins/gpulib/gpu_async.h b/plugins/gpulib/gpu_async.h
index 533a23e8..2b2e0c18 100644
--- a/plugins/gpulib/gpu_async.h
+++ b/plugins/gpulib/gpu_async.h
@@ -15,6 +15,7 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list, int list_len,
 void gpu_async_start(struct psx_gpu *gpu);
 void gpu_async_stop(struct psx_gpu *gpu);
 void gpu_async_sync(struct psx_gpu *gpu);
+void gpu_async_sync_scanout(struct psx_gpu *gpu);
 void gpu_async_sync_ecmds(struct psx_gpu *gpu);
 void gpu_async_notify_screen_change(struct psx_gpu *gpu);
 
@@ -25,6 +26,7 @@ void gpu_async_notify_screen_change(struct psx_gpu *gpu);
 #define gpu_async_start(gpu)
 #define gpu_async_stop(gpu)
 #define gpu_async_sync(gpu) do {} while (0)
+#define gpu_async_sync_scanout(gpu) do {} while (0)
 #define gpu_async_sync_ecmds(gpu)
 #define gpu_async_notify_screen_change(gpu)
 
-- 
2.47.3