gpu_async: some dma support

author notaz <notasas@gmail.com>

Sat, 3 Jan 2026 23:18:45 +0000 (01:18 +0200)

committer notaz <notasas@gmail.com>

Sun, 4 Jan 2026 02:43:16 +0000 (04:43 +0200)
author notaz <notasas@gmail.com>
Sat, 3 Jan 2026 23:18:45 +0000 (01:18 +0200)
committer notaz <notasas@gmail.com>
Sun, 4 Jan 2026 02:43:16 +0000 (04:43 +0200)
diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c

index c9b764d..04ffbf6 100644 (file)
--- a/plugins/gpulib/gpu.c
+++ b/plugins/gpulib/gpu.c
@@ -11,7 +11,6 @@
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
-#include <stdlib.h> /* for calloc */
  
  #include "gpu.h"
  #include "gpu_timing.h"
@@ -31,7 +30,7 @@ struct psx_gpu gpu;
  
  static noinline int do_cmd_buffer(struct psx_gpu *gpu, uint32_t *data, int count,
      int *cycles_sum, int *cycles_last);
-static noinline void finish_vram_transfer(struct psx_gpu *gpu, int is_read);
+static noinline void finish_vram_transfer(struct psx_gpu *gpu, int is_read, int is_async);
  
  static void sync_renderer(struct psx_gpu *gpu)
  {
@@ -51,7 +50,7 @@ static noinline void do_cmd_reset(struct psx_gpu *gpu)
    sync_renderer(gpu);
  
    if (unlikely(gpu->dma.h > 0))
-    finish_vram_transfer(gpu, gpu->dma_start.is_read);
+    finish_vram_transfer(gpu, gpu->dma_start.is_read, 0);
    gpu->dma.h = 0;
  }
  
@@ -428,10 +427,8 @@ const unsigned char cmd_lengths[256] =
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };
  
-#define VRAM_MEM_XY(vram_, x, y) &vram_[(y) * 1024 + (x)]
-
  // this isn't very useful so should be rare
-static void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6)
+void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6)
  {
    int i;
    if (r6 == 1) {
@@ -447,18 +444,6 @@ static void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6)
    }
  }
  
-static inline void do_vram_line(uint16_t *vram_, int x, int y,
-    uint16_t *mem, int l, int is_read, uint32_t r6)
-{
-  uint16_t *vram = VRAM_MEM_XY(vram_, x, y);
-  if (unlikely(is_read))
-    memcpy(mem, vram, l * 2);
-  else if (unlikely(r6))
-    cpy_mask(vram, mem, l, r6);
-  else
-    memcpy(vram, mem, l * 2);
-}
-
  static int do_vram_io(struct psx_gpu *gpu, uint32_t *data, int count, int is_read)
  {
    int count_initial = count;
@@ -468,11 +453,20 @@ static int do_vram_io(struct psx_gpu *gpu, uint32_t *data, int count, int is_rea
    int x = gpu->dma.x, y = gpu->dma.y;
    int w = gpu->dma.w, h = gpu->dma.h;
    int o = gpu->dma.offset;
-  int l;
-  count *= 2; // operate in 16bpp pixels
-
-  //sync_renderer(gpu); // done in start_vram_transfer()
+  int l, async_queued = 0;
+
+  if (gpu_async_enabled(gpu) && !is_read && o == 0 &&
+      count <= AGPU_DMA_MAX && w * h == count * 2)
+    async_queued = gpu_async_try_dma(gpu, data, count);
+  if (async_queued) {
+    gpu->dma.h = 0;
+    finish_vram_transfer(gpu, 0, 1);
+    return count;
+  }
+  if (o == 0)
+    sync_renderer(gpu);
  
+  count *= 2; // operate in 16bpp pixels
    if (gpu->dma.offset) {
      l = w - gpu->dma.offset;
      if (count < l)
@@ -505,7 +499,7 @@ static int do_vram_io(struct psx_gpu *gpu, uint32_t *data, int count, int is_rea
      }
    }
    else
-    finish_vram_transfer(gpu, is_read);
+    finish_vram_transfer(gpu, is_read, 0);
    gpu->dma.y = y;
    gpu->dma.h = h;
    gpu->dma.offset = o;
@@ -527,7 +521,8 @@ static noinline void start_vram_transfer(struct psx_gpu *gpu, uint32_t pos_word,
    gpu->dma.is_read = is_read;
    gpu->dma_start = gpu->dma;
  
-  sync_renderer(gpu);
+  // postponed until the actual transfer
+  //sync_renderer(gpu);
  
    if (is_read) {
      const uint16_t *mem = VRAM_MEM_XY(gpu->vram, gpu->dma.x, gpu->dma.y);
@@ -537,13 +532,17 @@ static noinline void start_vram_transfer(struct psx_gpu *gpu, uint32_t pos_word,
      gpu->state.last_vram_read_frame = *gpu->state.frame_count;
    }
  
-  log_io(gpu, "start_vram_transfer %c (%d, %d) %dx%d\n", is_read ? 'r' : 'w',
-    gpu->dma.x, gpu->dma.y, gpu->dma.w, gpu->dma.h);
+  if (gpu->dma.x + gpu->dma.w > 1024)
+    log_anomaly(gpu, "vram tr xwrap: %c (%d, %d) %dx%d\n", is_read ? 'r' : 'w',
+      gpu->dma.x, gpu->dma.y, gpu->dma.w, gpu->dma.h);
+  else
+    log_io(gpu, "start_vram_transfer %c (%d, %d) %dx%d\n", is_read ? 'r' : 'w',
+      gpu->dma.x, gpu->dma.y, gpu->dma.w, gpu->dma.h);
    if (gpu->gpu_state_change)
      gpu->gpu_state_change(PGS_VRAM_TRANSFER_START, 0);
  }
  
-static void finish_vram_transfer(struct psx_gpu *gpu, int is_read)
+static void finish_vram_transfer(struct psx_gpu *gpu, int is_read, int is_async)
  {
    if (is_read)
      gpu->status &= ~PSX_GPU_STATUS_IMG;
@@ -562,8 +561,9 @@ static void finish_vram_transfer(struct psx_gpu *gpu, int is_read)
        gpu->dma_start.x, gpu->dma_start.y, gpu->dma_start.w, gpu->dma_start.h,
        gpu->screen.src_x, gpu->screen.src_y, gpu->screen.hres, gpu->screen.vres, !not_dirty);
      gpu->state.fb_dirty |= !not_dirty;
-    renderer_update_caches(gpu->dma_start.x, gpu->dma_start.y,
-                           gpu->dma_start.w, gpu->dma_start.h, 0);
+    if (!is_async)
+      renderer_update_caches(gpu->dma_start.x, gpu->dma_start.y,
+                             gpu->dma_start.w, gpu->dma_start.h, 0);
    }
    if (gpu->gpu_state_change)
      gpu->gpu_state_change(PGS_VRAM_TRANSFER_END, 0);
diff --git a/plugins/gpulib/gpu.h b/plugins/gpulib/gpu.h

index 7a2bcab..d4b66e5 100644 (file)
--- a/plugins/gpulib/gpu.h
+++ b/plugins/gpulib/gpu.h
@@ -12,14 +12,19 @@
  #define __GPULIB_GPU_H__
  
  #include <stdint.h>
+#include <string.h>
+#include "../../include/compiler_features.h"
  
  //#define RAW_FB_DISPLAY
  
  #define gpu_log(gpu, fmt, ...) \
    printf("%d:%03d: " fmt, *(gpu)->state.frame_count, *(gpu)->state.hcnt, ##__VA_ARGS__)
  
-//#define log_anomaly gpu_log
+#ifdef LOG_UNHANDLED
+#define log_anomaly gpu_log
+#else
  #define log_anomaly(...)
+#endif
  
  #ifdef __cplusplus
  extern "C" {
@@ -150,12 +155,28 @@ void vout_blank(void);
  void vout_set_config(const struct rearmed_cbs *config);
  
  // helpers
+#define VRAM_MEM_XY(vram_, x, y) &vram_[(y) * 1024 + (x)]
+
  int  do_vram_copy(uint16_t *vram, const uint32_t *ex_regs,
         const uint32_t *params, int *cpu_cycles);
  
  int  prim_try_simplify_quad_t (void *simplified, const void *prim);
  int  prim_try_simplify_quad_gt(void *simplified, const void *prim);
  
+void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6);
+
+// Moves one run of l 16bpp pixels between mem and vram position (x, y):
+// vram -> mem when is_read is set, otherwise mem -> vram, going through
+// cpy_mask() when r6 is non-zero.
+static inline void do_vram_line(uint16_t *vram_, int x, int y,
+    uint16_t *mem, int l, int is_read, uint32_t r6)
+{
+  uint16_t *line = VRAM_MEM_XY(vram_, x, y);
+
+  if (unlikely(is_read)) {
+    // readback
+    memcpy(mem, line, l * 2);
+    return;
+  }
+  if (unlikely(r6)) {
+    // write with mask handling
+    cpy_mask(line, mem, l, r6);
+    return;
+  }
+  // plain write
+  memcpy(line, mem, l * 2);
+}
+
  /* listing these here for correct linkage if rasterizer uses c++ */
  struct GPUFreeze;
  
diff --git a/plugins/gpulib/gpu_async.c b/plugins/gpulib/gpu_async.c

index 385d78d..b7a085b 100644 (file)
--- a/plugins/gpulib/gpu_async.c
+++ b/plugins/gpulib/gpu_async.c
@@ -35,7 +35,8 @@
  // must be at least 3 words due to cmd_lengths[]
  #define FAKECMD_SCREEN_CHANGE 0xdfu
  #define FAKECMD_SET_INTERLACE 0xdeu
-#define FAKECMD_BREAK         0xddu
+#define FAKECMD_DMA_WRITE     0xddu
+#define FAKECMD_BREAK         0xdcu
  
  #if defined(__aarch64__) || defined(HAVE_ARMV7)
  #define BARRIER() __asm__ __volatile__ ("dmb ishst" ::: "memory")
@@ -99,6 +100,15 @@ union cmd_set_interlace
    };
  };
  
+// Header of a queued vram write (FAKECMD_DMA_WRITE).
+// gpu_async_try_dma() fills it from gpu->dma and adds the pixel lines
+// right after it; do_dma_write() consumes both.
+union cmd_dma_write
+{
+  // raw view, this is what gets copied into the command buffer
+  uint32_t u32s[3];
+  struct {
+    // fake command id in the top byte
+    uint32_t cmd;
+    // destination rectangle in vram, in pixels
+    short x, y, w, h;
+  };
+};
+
  struct cmd_break
  {
    uint32_t u32s[3];
@@ -108,6 +118,8 @@ static int noinline do_notify_screen_change(struct psx_gpu *gpu,
      const union cmd_screen_change *cmd);
  static int do_set_interlace(struct psx_gpu *gpu,
      const union cmd_set_interlace *cmd);
+static int do_dma_write(struct psx_gpu *gpu,
+    const union cmd_dma_write *cmd, uint32_t pos);
  
  static void run_thread_nolock(struct psx_gpu_async *agpu)
  {
@@ -124,52 +136,62 @@ static void run_thread(struct psx_gpu_async *agpu)
    slock_unlock(agpu->lock);
  }
  
-static int calc_space_for_add(struct psx_gpu_async *agpu)
+static int calc_space_for_add(struct psx_gpu_async *agpu, uint32_t pos_added)
  {
-  int space = AGPU_BUF_LEN - (agpu->pos_added - RDPOS(agpu->pos_used));
+  int space = AGPU_BUF_LEN - (pos_added - RDPOS(agpu->pos_used));
    assert(space >= 0);
    assert(space <= AGPU_BUF_LEN);
    return space;
  }
  
  // adds everything or nothing, else we may get incomplete cmd
-static int do_add(struct psx_gpu_async *agpu, const uint32_t *list, int len)
+static int do_add_pos(struct psx_gpu_async *agpu, const void *list, int list_words,
+    uint32_t *pos_added_)
  {
    int pos, space, left, retval = 0;
-  uint32_t pos_added = agpu->pos_added;
+  uint32_t pos_added = *pos_added_;
  
-  assert(len < AGPU_BUF_LEN);
-  space = calc_space_for_add(agpu);
-  if (space < len)
+  assert(list_words < AGPU_BUF_LEN);
+  space = calc_space_for_add(agpu, pos_added);
+  if (space < list_words)
      return 0;
  
    pos = pos_added & AGPU_BUF_MASK;
    left = AGPU_BUF_LEN - pos;
-  if (left < len) {
+  if (left < list_words) {
      memset(&agpu->cmd_buffer[pos], 0, left * 4);
      pos_added += left;
      pos = 0;
-    space = calc_space_for_add(agpu);
+    space = calc_space_for_add(agpu, pos_added);
    }
-  if (space >= len) {
-    memcpy(&agpu->cmd_buffer[pos], list, len * 4);
-    pos_added += len;
-    retval = len;
+  if (space >= list_words) {
+    memcpy(&agpu->cmd_buffer[pos], list, list_words * 4);
+    pos_added += list_words;
+    retval = list_words;
    }
+  *pos_added_ = pos_added;
+  return retval;
+}
+
+static int do_add(struct psx_gpu_async *agpu, const void *list, int list_words)
+{
+  uint32_t pos_added = agpu->pos_added;
+  int ret = do_add_pos(agpu, list, list_words, &pos_added);
    BARRIER();
    WRPOS(agpu->pos_added, pos_added);
-  return retval;
+  return ret;
  }
  
-static void do_add_with_wait(struct psx_gpu_async *agpu, const uint32_t *list, int len)
+static void do_add_with_wait(struct psx_gpu_async *agpu,
+    const void *list, int list_words)
  {
    for (;;)
    {
-    if (do_add(agpu, list, len))
+    if (do_add(agpu, list, list_words))
        break;
      slock_lock(agpu->lock);
      run_thread_nolock(agpu);
-    while (len > AGPU_BUF_LEN - (agpu->pos_added - RDPOS(agpu->pos_used))) {
+    while (list_words > AGPU_BUF_LEN - (agpu->pos_added - RDPOS(agpu->pos_used))) {
        assert(!agpu->idle);
        assert(agpu->wait_mode == waitmode_none);
        agpu->wait_mode = waitmode_progress;
@@ -200,7 +222,7 @@ static void add_draw_area_e(struct psx_gpu_async *agpu, uint32_t pos, int force,
        (ex_regs[4] & 0x3ff) + 1, ((ex_regs[4] >> 10) & 0x1ff) + 1);
  }
  
-int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len,
+int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list_data, int list_len,
   int *cpu_cycles_sum_out, int *cpu_cycles_last, int *last_cmd)
  {
    uint32_t cyc_sum = 0, cyc = *cpu_cycles_last;
@@ -368,6 +390,50 @@ breakloop:
    return pos;
  }
  
+// Tries to queue the pending vram write (gpu->dma rectangle, pixels at data)
+// into the async command buffer instead of doing it synchronously.
+// words is the caller's transfer size in 32bit words; it is not used here,
+// the amount copied is derived from gpu->dma.w/h.
+// Returns 1 if the whole transfer was queued, 0 if nothing was queued and
+// the caller has to do the transfer itself.
+int gpu_async_try_dma(struct psx_gpu *gpu, const uint32_t *data, int words)
+{
+  struct psx_gpu_async *agpu = gpu->async;
+  int used, need, w = gpu->dma.w, h = gpu->dma.h;
+  union cmd_dma_write cmd;
+  uint32_t pos_added;
+  int bad = 0;
+
+  // must be checked before agpu is dereferenced
+  if (!agpu)
+    return 0;
+  pos_added = agpu->pos_added;
+  // avoid double copying
+  used = pos_added - RDPOS(agpu->pos_used);
+  if (agpu->idle && used == 0)
+    return 0;
+  // only proceed if there is space to avoid messy sync;
+  // one extra line is reserved for the padding do_add_pos() may insert
+  // at the buffer end. Compared as ints so that a bogus "used" can't turn
+  // the left side into a huge unsigned value.
+  need = (int)(sizeof(cmd) / 4) + ((w + 1) / 2) * (h + 1);
+  if ((int)(AGPU_BUF_LEN - used) < need) {
+    agpu_log(gpu, "agpu: dma: used %d\n", used);
+    return 0;
+  }
+
+  cmd.cmd = HTOLE32(FAKECMD_DMA_WRITE << 24);
+  cmd.x = gpu->dma.x; cmd.y = gpu->dma.y;
+  cmd.w = gpu->dma.w; cmd.h = gpu->dma.h;
+  bad |= !do_add_pos(agpu, cmd.u32s, sizeof(cmd) / 4, &pos_added);
+  if (w & 1) {
+    // align lines to psx dma word units
+    // NOTE(review): each line copies w + 1 pixels, so the last one reads
+    // one pixel past w * h - confirm the source buffer tolerates that
+    const uint16_t *sdata = (const uint16_t *)data;
+    for (; h > 0; sdata += w, h--)
+      bad |= !do_add_pos(agpu, sdata, w / 2 + 1, &pos_added);
+  }
+  else {
+    for (; h > 0; data += w / 2, h--)
+      bad |= !do_add_pos(agpu, data, w / 2, &pos_added);
+  }
+  if (bad) {
+    // pos_added was only advanced locally, so nothing is visible to
+    // the thread yet and the caller can still do a normal transfer
+    agpu_log(gpu, "agpu: dma: add failed, used %d\n", used);
+    return 0;
+  }
+
+  slock_lock(agpu->lock);
+  agpu->pos_added = pos_added;
+  run_thread_nolock(agpu);
+  slock_unlock(agpu->lock);
+
+  return 1;
+}
+
  static STRHEAD_RET_TYPE gpu_async_thread(void *unused)
  {
    struct psx_gpu *gpup = &gpu;
@@ -422,6 +488,9 @@ static STRHEAD_RET_TYPE gpu_async_thread(void *unused)
          case FAKECMD_SET_INTERLACE:
            done += do_set_interlace(gpup, list);
            break;
+        case FAKECMD_DMA_WRITE:
+          done += do_dma_write(gpup, list, pos + done);
+          break;
          case FAKECMD_BREAK:
            done += sizeof(struct cmd_break) / 4;
            break;
@@ -504,6 +573,34 @@ static int do_set_interlace(struct psx_gpu *gpu,
    return sizeof(*cmd) / 4;
  }
  
+// Handler for FAKECMD_DMA_WRITE, called from gpu_async_thread():
+// copies the pixel lines stored after the command in cmd_buffer into vram.
+// pos is the word position of the command in cmd_buffer
+// (presumably already reduced to buffer range, see the assert below).
+// Returns the number of buffer words consumed, skipped padding included.
+static int do_dma_write(struct psx_gpu *gpu,
+    const union cmd_dma_write *cmd, uint32_t pos)
+{
+  int x = cmd->x, y = cmd->y, w = cmd->w, h = cmd->h;
+  struct psx_gpu_async *agpu = gpu->async;
+  // low 2 bits of ex_regs[6], passed on to do_vram_line() to select
+  // the cpy_mask() path
+  uint32_t r6 = agpu->ex_regs[6] & 3;
+  uint16_t *vram = gpu->vram;
+  // words per stored line; odd widths take a whole extra word,
+  // same as what gpu_async_try_dma() adds
+  int stride = (w + 1) / 2;
+  int done = 0;
+
+  // step over the command header to the first line of pixels
+  pos += sizeof(*cmd) / 4u;
+  done += sizeof(*cmd) / 4u;
+  assert(pos <= AGPU_BUF_LEN);
+  for (; h > 0; h--, y++) {
+    // do_add_pos() never splits a line across the buffer end, it pads
+    // the tail and restarts at 0 - follow it and count the padding
+    if (stride > AGPU_BUF_LEN - pos) {
+      done += AGPU_BUF_LEN - pos;
+      pos = 0;
+    }
+
+    // wrap vertically at 512 lines
+    // NOTE(review): x + w is not wrapped at 1024 here - confirm that
+    // such transfers are handled as intended
+    y &= 511;
+    do_vram_line(vram, x, y, (uint16_t *)&agpu->cmd_buffer[pos], w, 0, r6);
+    pos += stride;
+    done += stride;
+  }
+  // y and h were modified by the loop, so use the values from the command
+  renderer_update_caches(x, cmd->y, w, cmd->h, 0);
+  return done;
+}
+
  void gpu_async_sync(struct psx_gpu *gpu)
  {
    struct psx_gpu_async *agpu = gpu->async;
@@ -629,6 +726,8 @@ void gpu_async_start(struct psx_gpu *gpu)
    if (gpu->async)
      return;
  
+  assert(AGPU_DMA_MAX <= AGPU_BUF_LEN / 2);
+
    agpu = calloc(1, sizeof(*agpu));
    if (agpu) {
      agpu->lock = slock_new();
diff --git a/plugins/gpulib/gpu_async.h b/plugins/gpulib/gpu_async.h

index cf429b7..7edf0e1 100644 (file)
--- a/plugins/gpulib/gpu_async.h
+++ b/plugins/gpulib/gpu_async.h
@@ -6,12 +6,15 @@
  struct psx_gpu;
  struct psx_gpu_async;
  
+#define AGPU_DMA_MAX 4096 // words
+
  #ifdef USE_ASYNC_GPU
  
  #define gpu_async_enabled(gpu) ((gpu)->async)
  
-int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list, int list_len,
- int *cycles_sum_out, int *cycles_last, int *last_cmd);
+int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list, int list_len,
+      int *cycles_sum_out, int *cycles_last, int *last_cmd);
+int gpu_async_try_dma(struct psx_gpu *gpu, const uint32_t *data, int words);
  void gpu_async_start(struct psx_gpu *gpu);
  void gpu_async_stop(struct psx_gpu *gpu);
  void gpu_async_sync(struct psx_gpu *gpu);
@@ -24,6 +27,7 @@ void gpu_async_set_interlace(struct psx_gpu *gpu, int enable, int is_odd);
  
  #define gpu_async_enabled(gpu) 0
  #define gpu_async_do_cmd_list(gpu, list, list_len, c0, c1, cmd) (list_len)
+#define gpu_async_try_dma(gpu, data, words) 0
  #define gpu_async_start(gpu)
  #define gpu_async_stop(gpu)
  #define gpu_async_sync(gpu) do {} while (0)
author	notaz <notasas@gmail.com>
	Sat, 3 Jan 2026 23:18:45 +0000 (01:18 +0200)
committer	notaz <notasas@gmail.com>
	Sun, 4 Jan 2026 02:43:16 +0000 (04:43 +0200)
plugins/gpulib/gpu.c		patch \| blob \| blame \| history
plugins/gpulib/gpu.h		patch \| blob \| blame \| history
plugins/gpulib/gpu_async.c		patch \| blob \| blame \| history
plugins/gpulib/gpu_async.h		patch \| blob \| blame \| history