#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <stdlib.h> /* for calloc */
#include "gpu.h"
#include "gpu_timing.h"
static noinline int do_cmd_buffer(struct psx_gpu *gpu, uint32_t *data, int count,
int *cycles_sum, int *cycles_last);
-static noinline void finish_vram_transfer(struct psx_gpu *gpu, int is_read);
+static noinline void finish_vram_transfer(struct psx_gpu *gpu, int is_read, int is_async);
static void sync_renderer(struct psx_gpu *gpu)
{
sync_renderer(gpu);
if (unlikely(gpu->dma.h > 0))
- finish_vram_transfer(gpu, gpu->dma_start.is_read);
+ finish_vram_transfer(gpu, gpu->dma_start.is_read, 0);
gpu->dma.h = 0;
}
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
-#define VRAM_MEM_XY(vram_, x, y) &vram_[(y) * 1024 + (x)]
-
// masked transfers aren't very useful, so this path should be rare
-static void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6)
+void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6)
{
int i;
if (r6 == 1) {
}
}
-static inline void do_vram_line(uint16_t *vram_, int x, int y,
- uint16_t *mem, int l, int is_read, uint32_t r6)
-{
- uint16_t *vram = VRAM_MEM_XY(vram_, x, y);
- if (unlikely(is_read))
- memcpy(mem, vram, l * 2);
- else if (unlikely(r6))
- cpy_mask(vram, mem, l, r6);
- else
- memcpy(vram, mem, l * 2);
-}
-
static int do_vram_io(struct psx_gpu *gpu, uint32_t *data, int count, int is_read)
{
int count_initial = count;
int x = gpu->dma.x, y = gpu->dma.y;
int w = gpu->dma.w, h = gpu->dma.h;
int o = gpu->dma.offset;
- int l;
- count *= 2; // operate in 16bpp pixels
-
- //sync_renderer(gpu); // done in start_vram_transfer()
+ int l, async_queued = 0;
+
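+ // hand complete, exactly-sized writes off to the async worker thread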
+ if (gpu_async_enabled(gpu) && !is_read && o == 0 &&
+ count <= AGPU_DMA_MAX && w * h == count * 2)
+ async_queued = gpu_async_try_dma(gpu, data, count);
+ if (async_queued) {
+ gpu->dma.h = 0;
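+ // is_async = 1: renderer caches will be updated by the worker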
+ finish_vram_transfer(gpu, 0, 1);
+ return count;
+ }
+ if (o == 0)
+ sync_renderer(gpu);
+ count *= 2; // operate in 16bpp pixels
if (gpu->dma.offset) {
l = w - gpu->dma.offset;
if (count < l)
}
}
else
- finish_vram_transfer(gpu, is_read);
+ finish_vram_transfer(gpu, is_read, 0);
gpu->dma.y = y;
gpu->dma.h = h;
gpu->dma.offset = o;
gpu->dma.is_read = is_read;
gpu->dma_start = gpu->dma;
- sync_renderer(gpu);
+ // postponed until the actual transfer
+ //sync_renderer(gpu);
if (is_read) {
const uint16_t *mem = VRAM_MEM_XY(gpu->vram, gpu->dma.x, gpu->dma.y);
gpu->state.last_vram_read_frame = *gpu->state.frame_count;
}
- log_io(gpu, "start_vram_transfer %c (%d, %d) %dx%d\n", is_read ? 'r' : 'w',
- gpu->dma.x, gpu->dma.y, gpu->dma.w, gpu->dma.h);
+ if (gpu->dma.x + gpu->dma.w > 1024)
+ log_anomaly(gpu, "vram tr xwrap: %c (%d, %d) %dx%d\n", is_read ? 'r' : 'w',
+ gpu->dma.x, gpu->dma.y, gpu->dma.w, gpu->dma.h);
+ else
+ log_io(gpu, "start_vram_transfer %c (%d, %d) %dx%d\n", is_read ? 'r' : 'w',
+ gpu->dma.x, gpu->dma.y, gpu->dma.w, gpu->dma.h);
if (gpu->gpu_state_change)
gpu->gpu_state_change(PGS_VRAM_TRANSFER_START, 0);
}
-static void finish_vram_transfer(struct psx_gpu *gpu, int is_read)
+static void finish_vram_transfer(struct psx_gpu *gpu, int is_read, int is_async)
{
if (is_read)
gpu->status &= ~PSX_GPU_STATUS_IMG;
gpu->dma_start.x, gpu->dma_start.y, gpu->dma_start.w, gpu->dma_start.h,
gpu->screen.src_x, gpu->screen.src_y, gpu->screen.hres, gpu->screen.vres, !not_dirty);
gpu->state.fb_dirty |= !not_dirty;
- renderer_update_caches(gpu->dma_start.x, gpu->dma_start.y,
- gpu->dma_start.w, gpu->dma_start.h, 0);
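+ // for async transfers the worker updates the caches itself,
+ // see do_dma_write()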
+ if (!is_async)
+ renderer_update_caches(gpu->dma_start.x, gpu->dma_start.y,
+ gpu->dma_start.w, gpu->dma_start.h, 0);
}
if (gpu->gpu_state_change)
gpu->gpu_state_change(PGS_VRAM_TRANSFER_END, 0);
#define __GPULIB_GPU_H__
#include <stdint.h>
+#include <string.h>
+#include "../../include/compiler_features.h"
//#define RAW_FB_DISPLAY
#define gpu_log(gpu, fmt, ...) \
printf("%d:%03d: " fmt, *(gpu)->state.frame_count, *(gpu)->state.hcnt, ##__VA_ARGS__)
-//#define log_anomaly gpu_log
+#ifdef LOG_UNHANDLED
+#define log_anomaly gpu_log
+#else
#define log_anomaly(...)
+#endif
#ifdef __cplusplus
extern "C" {
void vout_set_config(const struct rearmed_cbs *config);
// helpers
+#define VRAM_MEM_XY(vram_, x, y) (&(vram_)[(y) * 1024 + (x)])
+
int do_vram_copy(uint16_t *vram, const uint32_t *ex_regs,
const uint32_t *params, int *cpu_cycles);
int prim_try_simplify_quad_t (void *simplified, const void *prim);
int prim_try_simplify_quad_gt(void *simplified, const void *prim);
+void cpy_mask(uint16_t *dst, const uint16_t *src, int l, uint32_t r6);
+
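+// copy one line of a vram transfer; r6 holds the GP0(0xE6) mask
+// settings and selects the slower masked copy when nonzero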
+static inline void do_vram_line(uint16_t *vram_, int x, int y,
+ uint16_t *mem, int l, int is_read, uint32_t r6)
+{
+ uint16_t *vram = VRAM_MEM_XY(vram_, x, y);
+ if (unlikely(is_read))
+ memcpy(mem, vram, l * 2);
+ else if (unlikely(r6))
+ cpy_mask(vram, mem, l, r6);
+ else
+ memcpy(vram, mem, l * 2);
+}
+
/* listing these here for correct linkage if rasterizer uses c++ */
struct GPUFreeze;
// must be at least 3 words due to cmd_lengths[]
#define FAKECMD_SCREEN_CHANGE 0xdfu
#define FAKECMD_SET_INTERLACE 0xdeu
-#define FAKECMD_BREAK 0xddu
+#define FAKECMD_DMA_WRITE 0xddu
+#define FAKECMD_BREAK 0xdcu
#if defined(__aarch64__) || defined(HAVE_ARMV7)
#define BARRIER() __asm__ __volatile__ ("dmb ishst" ::: "memory")
};
};
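+// internal 3-word command: the fake cmd word plus the target vram
+// rect; the pixel data follows in the ring buffer, one word-aligned
+// line at a time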
+union cmd_dma_write
+{
+ uint32_t u32s[3];
+ struct {
+ uint32_t cmd;
+ short x, y, w, h;
+ };
+};
+
struct cmd_break
{
uint32_t u32s[3];
const union cmd_screen_change *cmd);
static int do_set_interlace(struct psx_gpu *gpu,
const union cmd_set_interlace *cmd);
+static int do_dma_write(struct psx_gpu *gpu,
+ const union cmd_dma_write *cmd, uint32_t pos);
static void run_thread_nolock(struct psx_gpu_async *agpu)
{
slock_unlock(agpu->lock);
}
-static int calc_space_for_add(struct psx_gpu_async *agpu)
+static int calc_space_for_add(struct psx_gpu_async *agpu, uint32_t pos_added)
{
- int space = AGPU_BUF_LEN - (agpu->pos_added - RDPOS(agpu->pos_used));
+ int space = AGPU_BUF_LEN - (pos_added - RDPOS(agpu->pos_used));
assert(space >= 0);
assert(space <= AGPU_BUF_LEN);
return space;
}
// adds everything or nothing, else the consumer could see an incomplete cmd
-static int do_add(struct psx_gpu_async *agpu, const uint32_t *list, int len)
+static int do_add_pos(struct psx_gpu_async *agpu, const void *list, int list_words,
+ uint32_t *pos_added_)
{
int pos, space, left, retval = 0;
- uint32_t pos_added = agpu->pos_added;
+ uint32_t pos_added = *pos_added_;
- assert(len < AGPU_BUF_LEN);
- space = calc_space_for_add(agpu);
- if (space < len)
+ assert(list_words < AGPU_BUF_LEN);
+ space = calc_space_for_add(agpu, pos_added);
+ if (space < list_words)
return 0;
pos = pos_added & AGPU_BUF_MASK;
left = AGPU_BUF_LEN - pos;
- if (left < len) {
+ if (left < list_words) {
memset(&agpu->cmd_buffer[pos], 0, left * 4);
pos_added += left;
pos = 0;
- space = calc_space_for_add(agpu);
+ space = calc_space_for_add(agpu, pos_added);
}
- if (space >= len) {
- memcpy(&agpu->cmd_buffer[pos], list, len * 4);
- pos_added += len;
- retval = len;
+ if (space >= list_words) {
+ memcpy(&agpu->cmd_buffer[pos], list, list_words * 4);
+ pos_added += list_words;
+ retval = list_words;
}
+ *pos_added_ = pos_added;
+ return retval;
+}
+
+static int do_add(struct psx_gpu_async *agpu, const void *list, int list_words)
+{
+ uint32_t pos_added = agpu->pos_added;
+ int ret = do_add_pos(agpu, list, list_words, &pos_added);
BARRIER();
WRPOS(agpu->pos_added, pos_added);
- return retval;
+ return ret;
}
-static void do_add_with_wait(struct psx_gpu_async *agpu, const uint32_t *list, int len)
+static void do_add_with_wait(struct psx_gpu_async *agpu,
+ const void *list, int list_words)
{
for (;;)
{
- if (do_add(agpu, list, len))
+ if (do_add(agpu, list, list_words))
break;
slock_lock(agpu->lock);
run_thread_nolock(agpu);
- while (len > AGPU_BUF_LEN - (agpu->pos_added - RDPOS(agpu->pos_used))) {
+ while (list_words > AGPU_BUF_LEN - (agpu->pos_added - RDPOS(agpu->pos_used))) {
assert(!agpu->idle);
assert(agpu->wait_mode == waitmode_none);
agpu->wait_mode = waitmode_progress;
(ex_regs[4] & 0x3ff) + 1, ((ex_regs[4] >> 10) & 0x1ff) + 1);
}
-int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len,
+int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list_data, int list_len,
int *cpu_cycles_sum_out, int *cpu_cycles_last, int *last_cmd)
{
uint32_t cyc_sum = 0, cyc = *cpu_cycles_last;
return pos;
}
+int gpu_async_try_dma(struct psx_gpu *gpu, const uint32_t *data, int words)
+{
+ struct psx_gpu_async *agpu = gpu->async;
+ int used, w = gpu->dma.w, h = gpu->dma.h;
+ uint32_t pos_added;
+ union cmd_dma_write cmd;
+ int bad = 0;
+
+ if (!agpu)
+ return 0;
+ // if the worker is idle with nothing queued, writing vram directly
+ // is cheaper than copying the data twice
+ pos_added = agpu->pos_added;
+ used = pos_added - RDPOS(agpu->pos_used);
+ if (agpu->idle && used == 0)
+ return 0;
+ // only proceed if there is space to avoid messy sync
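+ // (the h + 1 term leaves one line of slack for the zero padding
+ // do_add_pos inserts when wrapping around the buffer end)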
+ if (AGPU_BUF_LEN - used < sizeof(cmd) / 4 + ((w + 1) / 2) * (h + 1)) {
+ agpu_log(gpu, "agpu: dma: used %d\n", used);
+ return 0;
+ }
+
+ cmd.cmd = HTOLE32(FAKECMD_DMA_WRITE << 24);
+ cmd.x = gpu->dma.x; cmd.y = gpu->dma.y;
+ cmd.w = gpu->dma.w; cmd.h = gpu->dma.h;
+ bad |= !do_add_pos(agpu, cmd.u32s, sizeof(cmd) / 4, &pos_added);
+ if (w & 1) {
+ // align lines to psx dma word units
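+ // (each line copies w + 1 halfwords, so the final line reads one
+ // halfword of padding past the end of data)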
+ const uint16_t *sdata = (const uint16_t *)data;
+ for (; h > 0; sdata += w, h--)
+ bad |= !do_add_pos(agpu, sdata, w / 2 + 1, &pos_added);
+ }
+ else {
+ for (; h > 0; data += w / 2, h--)
+ bad |= !do_add_pos(agpu, data, w / 2, &pos_added);
+ }
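+ // space was checked up front, so none of the adds may fail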
+ assert(!bad); (void)bad;
+
+ slock_lock(agpu->lock);
+ agpu->pos_added = pos_added;
+ run_thread_nolock(agpu);
+ slock_unlock(agpu->lock);
+
+ return 1;
+}
+
static STRHEAD_RET_TYPE gpu_async_thread(void *unused)
{
struct psx_gpu *gpup = &gpu;
case FAKECMD_SET_INTERLACE:
done += do_set_interlace(gpup, list);
break;
+ case FAKECMD_DMA_WRITE:
+ done += do_dma_write(gpup, list, pos + done);
+ break;
case FAKECMD_BREAK:
done += sizeof(struct cmd_break) / 4;
break;
return sizeof(*cmd) / 4;
}
+static int do_dma_write(struct psx_gpu *gpu,
+ const union cmd_dma_write *cmd, uint32_t pos)
+{
+ int x = cmd->x, y = cmd->y, w = cmd->w, h = cmd->h;
+ struct psx_gpu_async *agpu = gpu->async;
+ uint32_t r6 = agpu->ex_regs[6] & 3;
+ uint16_t *vram = gpu->vram;
+ int stride = (w + 1) / 2;
+ int done = 0;
+
+ pos += sizeof(*cmd) / 4u;
+ done += sizeof(*cmd) / 4u;
+ assert(pos <= AGPU_BUF_LEN);
+ for (; h > 0; h--, y++) {
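+ // the producer zero-pads and wraps lines that would split across
+ // the buffer end; skip over that padding here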
+ if (stride > AGPU_BUF_LEN - pos) {
+ done += AGPU_BUF_LEN - pos;
+ pos = 0;
+ }
+
+ y &= 511;
+ do_vram_line(vram, x, y, (uint16_t *)&agpu->cmd_buffer[pos], w, 0, r6);
+ pos += stride;
+ done += stride;
+ }
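+ // x and w are still valid; y and h were consumed by the loop above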
+ renderer_update_caches(x, cmd->y, w, cmd->h, 0);
+ return done;
+}
+
void gpu_async_sync(struct psx_gpu *gpu)
{
struct psx_gpu_async *agpu = gpu->async;
if (gpu->async)
return;
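+ // sanity: even a max-size dma must fit the ring with room to spare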
+ assert(AGPU_DMA_MAX <= AGPU_BUF_LEN / 2);
+
agpu = calloc(1, sizeof(*agpu));
if (agpu) {
agpu->lock = slock_new();
struct psx_gpu;
struct psx_gpu_async;
+#define AGPU_DMA_MAX 4096 // words; must stay <= AGPU_BUF_LEN / 2
+
#ifdef USE_ASYNC_GPU
#define gpu_async_enabled(gpu) ((gpu)->async)
-int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list, int list_len,
- int *cycles_sum_out, int *cycles_last, int *last_cmd);
+int gpu_async_do_cmd_list(struct psx_gpu *gpu, const uint32_t *list, int list_len,
+ int *cycles_sum_out, int *cycles_last, int *last_cmd);
+int gpu_async_try_dma(struct psx_gpu *gpu, const uint32_t *data, int words);
void gpu_async_start(struct psx_gpu *gpu);
void gpu_async_stop(struct psx_gpu *gpu);
void gpu_async_sync(struct psx_gpu *gpu);
#define gpu_async_enabled(gpu) 0
#define gpu_async_do_cmd_list(gpu, list, list_len, c0, c1, cmd) (list_len)
+#define gpu_async_try_dma(gpu, data, words) 0
#define gpu_async_start(gpu)
#define gpu_async_stop(gpu)
#define gpu_async_sync(gpu) do {} while (0)