From b668854a85a034cbe736d1b0edef381ac42a45fa Mon Sep 17 00:00:00 2001
From: notaz
Date: Tue, 27 Jan 2026 03:24:01 +0200
Subject: [PATCH] gpulib: try delaying a frame on frameskip, more tuning

unsure if the delay thing is a good idea really...
for frameskip only so should not affect most people
---
 plugins/gpulib/gpu.c       |  48 +++++---
 plugins/gpulib/gpu_async.c | 220 +++++++++++++++++++++++++++----------
 plugins/gpulib/gpu_async.h |  10 +-
 3 files changed, 201 insertions(+), 77 deletions(-)

diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c
index 3285c6c6..dca0e806 100644
--- a/plugins/gpulib/gpu.c
+++ b/plugins/gpulib/gpu.c
@@ -199,7 +199,7 @@ static noinline void frameskip_prepare_noskip(struct psx_gpu *gpu)
     int dummy = 0;
     if (gpu_async_enabled(gpu))
       (void)gpu_async_do_cmd_list(gpu, gpu->frameskip.pending_fill, 3,
-          &dummy, &dummy, &dummy);
+          &dummy, &dummy, &dummy, &dummy);
     else
       renderer_do_cmd_list(gpu->frameskip.pending_fill, 3, gpu->ex_regs,
           &dummy, &dummy, &dummy);
@@ -207,8 +207,18 @@ static noinline void frameskip_prepare_noskip(struct psx_gpu *gpu)
   }
 }
 
-static noinline void decide_frameskip(struct psx_gpu *gpu)
+#define FRAMESKIP_MAX_FLIP_DELAY 5
+
+static noinline void decide_frameskip(struct psx_gpu *gpu, uint32_t flip_delay)
 {
+  if (flip_delay > FRAMESKIP_MAX_FLIP_DELAY) {
+    // don't skip if no updates for a while
+    gpu->frameskip.active = 0;
+    gpu->frameskip.cnt = 0;
+    gpu->frameskip.frame_ready = 1;
+    return;
+  }
+
   *gpu->frameskip.dirty = 1;
 
   if (gpu->frameskip.active)
@@ -397,8 +407,9 @@ void GPUwriteStatus(uint32_t data)
       gpu.screen.src_y = src_y;
       check_draw_to_display(&gpu);
       if (gpu.frameskip.set) {
-        if (gpu.frameskip.last_flip_frame != *gpu.state.frame_count)
-          decide_frameskip(&gpu);
+        uint32_t flip_delay = *gpu.state.frame_count - gpu.frameskip.last_flip_frame;
+        if (flip_delay)
+          decide_frameskip(&gpu, flip_delay);
         if (!gpu.frameskip.active || !gpu.frameskip.allow)
           frameskip_prepare_noskip(&gpu);
       }
@@ -688,7 +699,7 @@ static noinline int do_cmd_list_skip(struct psx_gpu *gpu, uint32_t *data, int li
         {
           // clearing something large, don't skip
           if (gpu_async_enabled(gpu))
-            (void)gpu_async_do_cmd_list(gpu, list, 3, &dummy, &dummy, &dummy);
+            (void)gpu_async_do_cmd_list(gpu, list, 3, &dummy, &dummy, &dummy, &dummy);
           else
             renderer_do_cmd_list(list, 3, gpu->ex_regs, &dummy, &dummy, &dummy);
         }
@@ -860,8 +871,7 @@ static noinline int do_cmd_buffer(struct psx_gpu *gpu, uint32_t *data, int count
                  cycles_sum, cycles_last, &cmd);
       else if (gpu_async_enabled(gpu)) {
         pos += gpu_async_do_cmd_list(gpu, data + pos, count - pos,
-                 cycles_sum, cycles_last, &cmd);
-        vram_dirty = 1;
+                 cycles_sum, cycles_last, &cmd, &vram_dirty);
       }
       else {
         pos += renderer_do_cmd_list(data + pos, count - pos, gpu->ex_regs,
@@ -1087,13 +1097,19 @@ long GPUfreeze(uint32_t type, GPUFreeze_t *freeze)
 
 void GPUupdateLace(void)
 {
-  int updated = 0;
+  int delay_vout_update = 0;
+  int updated = 1;
 
-  if (gpu.frameskip.set && *gpu.state.frame_count - gpu.frameskip.last_flip_frame >= 10) {
-    gpu.frameskip.frame_ready = 1;
-    if (gpu.frameskip.active) {
-      gpu.frameskip.active = 0;
-      frameskip_on_no_skip(&gpu);
+  if (gpu.frameskip.set) {
+    uint32_t flip_delay = *gpu.state.frame_count - gpu.frameskip.last_flip_frame;
+    if (gpu_async_enabled(&gpu))
+      gpu_async_try_delayed_flip(&gpu, 0);
+    if (flip_delay > FRAMESKIP_MAX_FLIP_DELAY) {
+      gpu.frameskip.frame_ready = 1;
+      if (gpu.frameskip.active) {
+        gpu.frameskip.active = 0;
+        frameskip_prepare_noskip(&gpu);
+      }
     }
   }
 
@@ -1123,11 +1139,12 @@ void GPUupdateLace(void)
 #endif
 
   if (gpu_async_enabled(&gpu))
-    gpu_async_sync_scanout(&gpu);
+    delay_vout_update = gpu_async_sync_scanout(&gpu);
   else
     renderer_flush_queues();
-  updated = vout_update(&gpu, gpu.screen.src_x, gpu.screen.src_y);
+  if (!delay_vout_update)
+    updated = vout_update(&gpu, gpu.screen.src_x, gpu.screen.src_y);
 
   if (gpu.state.enhancement_active && !gpu.state.enhancement_was_active) {
     gpu_async_sync(&gpu);
     renderer_update_caches(0, 0, 1024, 512, 1);
@@ -1183,6 +1200,7 @@ void GPUrearmedCallbacks(const struct rearmed_cbs *cbs)
   gpu.frameskip.dirty = (void *)&cbs->fskip_dirty;
   gpu.frameskip.active = 0;
   gpu.frameskip.frame_ready = 1;
+  gpu.frameskip.last_flip_frame = *cbs->gpu_frame_count - FRAMESKIP_MAX_FLIP_DELAY - 1;
   gpu.state.hcnt = (uint32_t *)cbs->gpu_hcnt;
   gpu.state.frame_count = (uint32_t *)cbs->gpu_frame_count;
   gpu.state.allow_interlace = cbs->gpu_neon.allow_interlace;
diff --git a/plugins/gpulib/gpu_async.c b/plugins/gpulib/gpu_async.c
index a6c6f8b4..bc30dee7 100644
--- a/plugins/gpulib/gpu_async.c
+++ b/plugins/gpulib/gpu_async.c
@@ -77,6 +77,9 @@ struct psx_gpu_async
   uint32_t cmd_buffer[AGPU_BUF_LEN];
   uint32_t pos_area;
   struct pos_drawarea draw_areas[AGPU_AREAS_CNT];
+  struct {
+    int src_x, src_y;
+  } delayed_flip;
 };
 
 // cmd_* must be at least 3 words long
@@ -224,17 +227,39 @@ static void add_draw_area_e(struct psx_gpu_async *agpu, uint32_t pos, int force,
 }
 
 int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list_data, int list_len,
-    int *cpu_cycles_sum_out, int *cpu_cycles_last, int *last_cmd)
+    int *cpu_cycles_sum_out, int *cpu_cycles_last, int *last_cmd, int *vram_dirty)
 {
-  uint32_t cyc_sum = 0, cyc = *cpu_cycles_last;
+  int cyc_sum = 0, cyc = *cpu_cycles_last;
   struct psx_gpu_async *agpu = gpu->async;
   int pos_handled = 0, dst_can_add = 1;
+  int cmd = -1, pos = 0, len;
   int rendered_anything = 0;
   int insert_break = 0;
-  int cmd = -1, pos, len;
+  uint32_t pos_added;
 
   assert(agpu);
-  for (pos = 0; pos < list_len; pos += len)
+  pos_added = agpu->pos_added;
+  if (RDPOS(agpu->idle) && pos_added == RDPOS(agpu->pos_used)) {
+    // do the cheap stuff directly to avoid thread wakeup/sync
+    for (; pos < list_len; pos++) {
+      uint32_t ecmd = LE32TOH(list_data[pos]);
+      cmd = ecmd >> 24;
+      if (cmd < 2)
+        continue;
+      if ((cmd & 0xf8) != 0xe0)
+        break;
+      if (gpu->ex_regs[cmd & 7] == ecmd)
+        continue;
+      gpu->ex_regs[cmd & 7] = ecmd;
+      if (cmd == 0xe3 || cmd == 0xe4)
+        add_draw_area_e(agpu, pos_added, 0, gpu->ex_regs);
+    }
+    pos_handled = pos;
+    if (pos)
+      renderer_do_cmd_list((uint32_t *)list_data, pos, agpu->ex_regs, &cyc_sum, &cyc, &cmd);
+  }
+
+  for (; pos < list_len; pos += len)
   {
     const uint32_t *list = list_data + pos;
     const int16_t *slist = (void *)list;
@@ -260,8 +285,8 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list_data, int li
           // let sync_scanout() know about changes outside of drawing area
           agpu_log(gpu, "agpu: fill %d,%d %dx%d vs area %d,%d %dx%d\n", x, y, w, h,
               darea->x0, darea->y0, darea->x1 - darea->x0, darea->y1 - darea->y0);
-          add_draw_area(agpu, agpu->pos_added, 1, x, y, x + w, y + h);
-          add_draw_area_e(agpu, agpu->pos_added + 1, 1, gpu->ex_regs);
+          add_draw_area(agpu, pos_added, 1, x, y, x + w, y + h);
+          add_draw_area_e(agpu, pos_added + 1, 1, gpu->ex_regs);
         }
         gput_sum(cyc_sum, cyc, gput_fill(w, h));
         break;
@@ -335,8 +360,8 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list_data, int li
         if ((w > 2 || h > 1) &&
             (x < darea->x0 || x + w > darea->x1 ||
              y < darea->y0 || y + h > darea->y1)) {
-          add_draw_area(agpu, agpu->pos_added, 1, x, y, x + w, y + h);
-          add_draw_area_e(agpu, agpu->pos_added + 1, 1, gpu->ex_regs);
+          add_draw_area(agpu, pos_added, 1, x, y, x + w, y + h);
+          add_draw_area_e(agpu, pos_added + 1, 1, gpu->ex_regs);
         }
         gput_sum(cyc_sum, cyc, gput_copy(w, h));
         break;
@@ -356,7 +381,7 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list_data, int li
           break;
         }
         gpu->ex_regs[cmd & 7] = LE32TOH(list[0]);
-        add_draw_area_e(agpu, agpu->pos_added, 0, gpu->ex_regs);
+        add_draw_area_e(agpu, pos_added, 0, gpu->ex_regs);
         insert_break = 1;
         break;
       default:
@@ -367,7 +392,7 @@ int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list_data, int li
     rendered_anything |= rendered;
     if (dst_can_add) {
       if (!skip) {
-        int added = dst_can_add = do_add(agpu, list, len);
+        int added = dst_can_add = do_add_pos(agpu, list, len, &pos_added);
         pos_handled += added;
       }
       else
@@ -375,6 +400,15 @@
     }
   }
 breakloop:
+  *cpu_cycles_sum_out += cyc_sum;
+  *cpu_cycles_last = cyc;
+  *last_cmd = cmd;
+  *vram_dirty |= rendered_anything;
+
+  if (unlikely(rendered_anything && agpu->delayed_flip.src_x != -1))
+    gpu_async_try_delayed_flip(gpu, 1);
+  BARRIER();
+  WRPOS(agpu->pos_added, pos_added);
   if (pos_handled && (rendered_anything || pos_handled < pos))
     run_thread(agpu);
   if (pos_handled < pos) {
@@ -384,13 +418,9 @@ breakloop:
     do_add_with_wait(agpu, list_data + pos_handled, left);
   }
   if (insert_break) {
-    struct cmd_break cmd = {{ HTOLE32(FAKECMD_BREAK << 24), }};
-    do_add(agpu, cmd.u32s, sizeof(cmd.u32s) / sizeof(cmd.u32s[0]));
+    struct cmd_break cmdb = {{ HTOLE32(FAKECMD_BREAK << 24), }};
+    do_add(agpu, cmdb.u32s, sizeof(cmdb.u32s) / sizeof(cmdb.u32s[0]));
   }
-
-  *cpu_cycles_sum_out += cyc_sum;
-  *cpu_cycles_last = cyc;
-  *last_cmd = cmd;
 
   return pos;
 }
@@ -617,7 +647,7 @@ static int do_dma_write(struct psx_gpu *gpu,
   return done;
 }
 
-void gpu_async_sync(struct psx_gpu *gpu)
+static void gpu_async_sync_nocheck(struct psx_gpu *gpu)
 {
   struct psx_gpu_async *agpu = gpu->async;
 
@@ -639,67 +669,139 @@
   assert(agpu->idle);
 }
 
-void gpu_async_sync_scanout(struct psx_gpu *gpu)
+void gpu_async_sync(struct psx_gpu *gpu)
+{
+  struct psx_gpu_async *agpu = gpu->async;
+
+  if (!agpu)
+    return;
+  if (!RDPOS(agpu->idle) || agpu->pos_added != RDPOS(agpu->pos_used))
+    gpu_async_sync_nocheck(gpu);
+
+  if (unlikely(agpu->delayed_flip.src_x != -1)) {
+    int src_x = agpu->delayed_flip.src_x;
+    agpu_log(gpu, "agpu: delayed_flip 2\n");
+    agpu->delayed_flip.src_x = -1;
+    vout_update(gpu, src_x, agpu->delayed_flip.src_y);
+  }
+}
+
+static int calc_scanout_wait(struct psx_gpu *gpu, int so_x0, int so_y0,
+    uint32_t *pos_to)
 {
   struct psx_gpu_async *agpu = gpu->async;
-  int so_x0 = gpu->screen.src_x, so_y0 = gpu->screen.src_y;
   int so_x1 = so_x0 + gpu->screen.hres, so_y1 = so_y0 + gpu->screen.vres;
   uint32_t pos;
   int c, i;
 
-  if (!agpu)
-    return;
   pos = RDPOS(agpu->pos_used);
   if (RDPOS(agpu->idle) && agpu->pos_added == pos)
-    return;
+    return 0;
 
   i = agpu->pos_area;
   if (agpu->idle)
-    /* unlikely but possible - do a full sync */;
-  else if (so_x1 > 1024 || so_y1 > 512) {
+    // unlikely but possible - do a full sync
+    return -1;
+  if (so_x1 > 1024 || so_y1 > 512) {
    agpu_log(gpu, "agpu: wrap %d,%d %dx%d\n", so_x0, so_y0, so_x1 - so_x0, so_y1 - so_y0);
+    return -1;
   }
   else if (agpu->draw_areas[(i+1) & AGPU_AREAS_MASK].pos > pos) {
     agpu_log(gpu, "agpu: oldest draw area %d > %d\n",
         agpu->draw_areas[(i+1) & AGPU_AREAS_MASK].pos, pos);
+    return -1;
   }
-  else {
-    for (c = 0, i = agpu->pos_area; c < AGPU_AREAS_CNT;
-         c++, i = (i - 1) & AGPU_AREAS_MASK)
-    {
-      int area_x0 = agpu->draw_areas[i].x0, area_y0 = agpu->draw_areas[i].y0;
-      int area_x1 = agpu->draw_areas[i].x1, area_y1 = agpu->draw_areas[i].y1;
-      if (so_x1 <= area_x0 || area_x1 <= so_x0)
-        /* no x intersect */;
-      else if (so_y1 <= area_y0 || area_y1 <= so_y0)
-        /* no y intersect */;
-      else {
-        agpu_log(gpu, "agpu: scanout #%d %d,%d %dx%d hit %d,%d %dx%d\n",
-            c, so_x0, so_y0, so_x1 - so_x0, so_y1 - so_y0,
-            area_x0, area_y0, area_x1 - area_x0, area_y1 - area_y0);
-        break;
-      }
-      pos = RDPOS(agpu->pos_used);
-      if (pos >= agpu->draw_areas[i].pos)
-        return;
+
+  for (c = 0, i = agpu->pos_area; c < AGPU_AREAS_CNT;
+       c++, i = (i - 1) & AGPU_AREAS_MASK)
+  {
+    int area_x0 = agpu->draw_areas[i].x0, area_y0 = agpu->draw_areas[i].y0;
+    int area_x1 = agpu->draw_areas[i].x1, area_y1 = agpu->draw_areas[i].y1;
+    if (so_x1 <= area_x0 || area_x1 <= so_x0)
+      /* no x intersect */;
+    else if (so_y1 <= area_y0 || area_y1 <= so_y0)
+      /* no y intersect */;
+    else {
+      agpu_log(gpu, "agpu: scanout #%d %d,%d %dx%d hit %d,%d %dx%d\n",
+          c, so_x0, so_y0, so_x1 - so_x0, so_y1 - so_y0,
+          area_x0, area_y0, area_x1 - area_x0, area_y1 - area_y0);
+      break;
     }
-    if (c > 0) {
-      i = (i + 1) & AGPU_AREAS_MASK;
-      agpu_log(gpu, "agpu: wait %d/%d @ %u/%u\n",
-          agpu->draw_areas[i].pos - RDPOS(agpu->pos_used), agpu->pos_added -
-          RDPOS(agpu->pos_used), RDPOS(agpu->pos_used), agpu->pos_added);
-      slock_lock(agpu->lock);
-      if (!agpu->idle) {
-        assert(agpu->wait_mode == waitmode_none);
-        agpu->pos_target = agpu->draw_areas[i].pos + 1;
-        agpu->wait_mode = waitmode_target;
-        scond_wait(agpu->cond_add, agpu->lock);
-      }
-      slock_unlock(agpu->lock);
-      return;
+    pos = RDPOS(agpu->pos_used);
+    if (pos >= agpu->draw_areas[i].pos)
+      return 0;
+  }
+  if (c == 0)
+    // newest drawing area hits the scanout - full sync
+    return -1;
+
+  i = (i + 1) & AGPU_AREAS_MASK;
+  *pos_to = agpu->draw_areas[i].pos + 1;
+  return 1;
+}
+
+static void do_scanout_wait(struct psx_gpu *gpu, int check_ret, uint32_t target)
+{
+  struct psx_gpu_async *agpu = gpu->async;
+  if (check_ret == 1) {
+    agpu_log(gpu, "agpu: wait %d/%d @ %u/%u\n",
+        target - RDPOS(agpu->pos_used), agpu->pos_added -
+        RDPOS(agpu->pos_used), RDPOS(agpu->pos_used), agpu->pos_added);
+    slock_lock(agpu->lock);
+    if (!agpu->idle && (int32_t)(agpu->pos_used - target) < 0) {
+      assert(agpu->wait_mode == waitmode_none);
+      agpu->pos_target = target;
+      agpu->wait_mode = waitmode_target;
+      scond_wait(agpu->cond_add, agpu->lock);
     }
+    slock_unlock(agpu->lock);
+  }
+  else
+    gpu_async_sync_nocheck(gpu);
+}
+
+int gpu_async_sync_scanout(struct psx_gpu *gpu)
+{
+  struct psx_gpu_async *agpu = gpu->async;
+  uint32_t target = 0;
+  int ret;
+
+  if (!agpu)
+    return 0;
+  ret = calc_scanout_wait(gpu, gpu->screen.src_x, gpu->screen.src_y, &target);
+  if (ret == 0)
+    return 0;
+  if (gpu->frameskip.set) {
+    // delay. Could do it without fskip also, but that would cause frame/input lag
+    if (agpu->delayed_flip.src_x != -1)
+      agpu_log(gpu, "agpu: missed delayed_flip?\n");
+    agpu->delayed_flip.src_x = gpu->screen.src_x;
+    agpu->delayed_flip.src_y = gpu->screen.src_y;
+    return 1;
+  }
+  do_scanout_wait(gpu, ret, target);
+  return 0;
+}
+
+void gpu_async_try_delayed_flip(struct psx_gpu *gpu, int force)
+{
+  struct psx_gpu_async *agpu = gpu->async;
+  uint32_t target = 0;
+  int check_ret = 0;
+
+  if (!agpu || agpu->delayed_flip.src_x == -1)
+    return;
+  check_ret = calc_scanout_wait(gpu, gpu->screen.src_x, gpu->screen.src_y, &target);
+  if (force) {
+    do_scanout_wait(gpu, check_ret, target);
+    check_ret = 0;
+  }
+  if (check_ret == 0) {
+    int src_x = agpu->delayed_flip.src_x;
+    agpu_log(gpu, "agpu: delayed_flip %d\n", force);
+    agpu->delayed_flip.src_x = -1;
+    vout_update(gpu, src_x, agpu->delayed_flip.src_y);
   }
-  gpu_async_sync(gpu);
 }
 
 void gpu_async_sync_ecmds(struct psx_gpu *gpu)
@@ -761,6 +863,8 @@ void gpu_async_start(struct psx_gpu *gpu)
 
   agpu = calloc(1, sizeof(*agpu));
   if (agpu) {
+    agpu->delayed_flip.src_x = -1;
+    agpu->delayed_flip.src_y = -1;
     agpu->lock = slock_new();
     agpu->cond_add = scond_new();
     agpu->cond_use = scond_new();
diff --git a/plugins/gpulib/gpu_async.h b/plugins/gpulib/gpu_async.h
index 7edf0e10..65eafab8 100644
--- a/plugins/gpulib/gpu_async.h
+++ b/plugins/gpulib/gpu_async.h
@@ -13,26 +13,28 @@ struct psx_gpu_async;
 #define gpu_async_enabled(gpu) ((gpu)->async)
 
 int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list, int list_len,
-    int *cycles_sum_out, int *cycles_last, int *last_cmd);
+    int *cycles_sum_out, int *cycles_last, int *last_cmd, int *vram_dirty);
 int gpu_async_try_dma(struct psx_gpu *gpu, const uint32_t *data, int words);
 void gpu_async_start(struct psx_gpu *gpu);
 void gpu_async_stop(struct psx_gpu *gpu);
 void gpu_async_sync(struct psx_gpu *gpu);
-void gpu_async_sync_scanout(struct psx_gpu *gpu);
+int gpu_async_sync_scanout(struct psx_gpu *gpu);
 void gpu_async_sync_ecmds(struct psx_gpu *gpu);
+void gpu_async_try_delayed_flip(struct psx_gpu *gpu, int force);
 void gpu_async_notify_screen_change(struct psx_gpu *gpu);
 void gpu_async_set_interlace(struct psx_gpu *gpu, int enable, int is_odd);
 
 #else
 
 #define gpu_async_enabled(gpu) 0
-#define gpu_async_do_cmd_list(gpu, list, list_len, c0, c1, cmd) (list_len)
+#define gpu_async_do_cmd_list(gpu, list, list_len, c0, c1, cmd, vrd) (list_len)
 #define gpu_async_try_dma(gpu, data, words) 0
 #define gpu_async_start(gpu)
 #define gpu_async_stop(gpu)
 #define gpu_async_sync(gpu) do {} while (0)
-#define gpu_async_sync_scanout(gpu) do {} while (0)
+#define gpu_async_sync_scanout(gpu) 0
 #define gpu_async_sync_ecmds(gpu)
+#define gpu_async_try_delayed_flip(gpu, force)
 #define gpu_async_notify_screen_change(gpu)
 #define gpu_async_set_interlace(gpu, enable, is_odd)
 
-- 
2.47.3
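
Note (not part of the patch): a rough standalone sketch of the flip-delay rule introduced above. A display-start write that arrives after more than FRAMESKIP_MAX_FLIP_DELAY frames without a flip now disables skipping instead of running the normal decide_frameskip() heuristic. The struct and main() harness below are invented stand-ins for illustration; only FRAMESKIP_MAX_FLIP_DELAY and the flip_delay comparison mirror the patch.

#include <stdio.h>
#include <stdint.h>

#define FRAMESKIP_MAX_FLIP_DELAY 5

/* simplified stand-in for the frameskip-related fields of struct psx_gpu */
struct fskip_state {
  uint32_t frame_count;      /* emulated frame counter (*gpu.state.frame_count) */
  uint32_t last_flip_frame;  /* frame of the last display start change */
  int active;                /* currently skipping rendering */
  int frame_ready;
};

/* mirrors the check added at the top of decide_frameskip() */
static void decide_frameskip_sketch(struct fskip_state *s, uint32_t flip_delay)
{
  if (flip_delay > FRAMESKIP_MAX_FLIP_DELAY) {
    s->active = 0;           /* don't skip if there were no flips for a while */
    s->frame_ready = 1;
    return;
  }
  /* ... the usual skip-every-N-frames heuristic would run here ... */
}

/* stand-in for the display start write handling in GPUwriteStatus() */
static void on_display_start_write(struct fskip_state *s)
{
  uint32_t flip_delay = s->frame_count - s->last_flip_frame;
  if (flip_delay)
    decide_frameskip_sketch(s, flip_delay);
  s->last_flip_frame = s->frame_count;
}

int main(void)
{
  struct fskip_state s = { .frame_count = 0, .last_flip_frame = 0, .active = 1 };

  /* simulate a game that stops flipping for 8 frames, e.g. a loading screen */
  s.frame_count = 8;
  on_display_start_write(&s);
  printf("active=%d frame_ready=%d\n", s.active, s.frame_ready); /* active=0 frame_ready=1 */
  return 0;
}

The delayed-flip half of the patch is separate from this: when the async renderer has not yet caught up with drawing that overlaps the scanout area, gpu_async_sync_scanout() records the flip coordinates in delayed_flip and returns 1 instead of blocking, and the vout_update() is performed later by gpu_async_try_delayed_flip(), either from GPUupdateLace() or once rendering reaches the area.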