//#define agpu_log gpu_log
#define agpu_log(...)
-#define AGPU_BUF_LEN (128*1024/4u) // must be power of 2
-#define AGPU_BUF_MASK (AGPU_BUF_LEN - 1)
+// these constants must be power of 2
+#define AGPU_BUF_LEN (128*1024/4u)
+#define AGPU_BUF_MASK (AGPU_BUF_LEN - 1)
+#define AGPU_AREAS_CNT 8u
+#define AGPU_AREAS_MASK (AGPU_AREAS_CNT - 1)
+
#ifndef min
#define min(a, b) ((b) < (a) ? (b) : (a))
#endif
-// must be in 0xc0...0xdf range that can't appear in thread's real cmd stream
+// must be in 0xc0...0xdf range that can't appear in thread's real cmd stream;
+// must be at least 3 words due to cmd_lengths[]
#define FAKECMD_SCREEN_CHANGE 0xdfu
+#define FAKECMD_BREAK 0xdeu
#if defined(__aarch64__) || defined(HAVE_ARMV6)
#define BARRIER() __asm__ __volatile__ ("dmb ishst" ::: "memory")
enum waitmode {
waitmode_none = 0,
waitmode_progress,
+ waitmode_target, // wake the waiter once pos_used reaches pos_target
waitmode_full,
};
+// a drawing area paired with the cmd_buffer position where it took effect
+struct pos_drawarea {
+ uint32_t pos; // pos_added value at the time the area was recorded
+ uint16_t x0, y0; // top-left corner, inclusive
+ uint16_t x1, y1; // bottom-right corner, exclusive (see add_draw_area_e)
+};
+
struct psx_gpu_async
{
uint32_t pos_added;
uint32_t pos_used;
+ uint32_t pos_target; // pos_used value a waitmode_target waiter is waiting for
enum waitmode wait_mode;
uint8_t exit;
uint8_t idle;
scond_t *cond_add;
uint32_t ex_regs[8]; // used by vram copy at least
uint32_t cmd_buffer[AGPU_BUF_LEN];
+ uint32_t pos_area; // index of the most recent entry in draw_areas[]
+ struct pos_drawarea draw_areas[AGPU_AREAS_CNT]; // ring of recent draw areas
};
union cmd_screen_change
assert(agpu->wait_mode == waitmode_none);
agpu->wait_mode = waitmode_progress;
scond_wait(agpu->cond_add, agpu->lock);
- agpu->wait_mode = waitmode_none;
}
slock_unlock(agpu->lock);
}
slock_unlock(agpu->lock);
}
+// Record the drawing area (x0,y0)-(x1,y1) (x1/y1 exclusive) as being in
+// effect from command-buffer position 'pos' on.  An entry within 1 position
+// of the previous one overwrites the current ring slot; 'force' (or a gap
+// larger than 1) opens a new slot, silently dropping the oldest entry.
+static void add_draw_area(struct psx_gpu_async *agpu, uint32_t pos, int force,
+ uint16_t x0, uint16_t y0, uint16_t x1, uint16_t y1)
+{
+ uint32_t pos_area = agpu->pos_area;
+ if (pos - agpu->draw_areas[pos_area].pos > 1u || force)
+ pos_area = agpu->pos_area = (pos_area + 1) & AGPU_AREAS_MASK;
+ agpu->draw_areas[pos_area].pos = pos;
+ agpu->draw_areas[pos_area].x0 = x0;
+ agpu->draw_areas[pos_area].y0 = y0;
+ agpu->draw_areas[pos_area].x1 = x1;
+ agpu->draw_areas[pos_area].y1 = y1;
+}
+
+// Record the drawing area currently held in ex_regs: ex_regs[3] (GP0 E3,
+// top-left) and ex_regs[4] (GP0 E4, bottom-right, inclusive); the +1
+// converts to the exclusive bounds add_draw_area() stores.
+static void add_draw_area_e(struct psx_gpu_async *agpu, uint32_t pos, int force,
+ const uint32_t *ex_regs)
+{
+ add_draw_area(agpu, pos, force,
+ ex_regs[3] & 0x3ff, (ex_regs[3] >> 10) & 0x1ff,
+ (ex_regs[4] & 0x3ff) + 1, ((ex_regs[4] >> 10) & 0x1ff) + 1);
+}
+
int gpu_async_do_cmd_list(struct psx_gpu *gpu, uint32_t *list_data, int list_len,
int *cpu_cycles_sum_out, int *cpu_cycles_last, int *last_cmd)
{
uint32_t cyc_sum = 0, cyc = *cpu_cycles_last;
struct psx_gpu_async *agpu = gpu->async;
- int dst_added = 0, dst_can_add = 1;
+ int pos_handled = 0, dst_can_add = 1;
int rendered_anything = 0;
+ int insert_break = 0;
int cmd = -1, pos, len;
assert(agpu);
{
const uint32_t *list = list_data + pos;
const int16_t *slist = (void *)list;
+ const struct pos_drawarea *darea;
int rendered = 1, skip = 0;
- int num_vertexes, w, h;
+ int num_vertexes, x, y, w, h;
cmd = LE32TOH(list[0]) >> 24;
len = 1 + cmd_lengths[cmd];
switch (cmd) {
case 0x02:
- w = LE16TOH(slist[4]) & 0x3FF;
- h = LE16TOH(slist[5]) & 0x1FF;
+ x = (LE16TOH(slist[2]) & 0x3ff) & ~0xf;
+ y = LE16TOH(slist[3]) & 0x1ff;
+ w = ((LE16TOH(slist[4]) & 0x3ff) + 0xf) & ~0xf;
+ h = LE16TOH(slist[5]) & 0x1ff;
+ darea = &agpu->draw_areas[agpu->pos_area];
+ if (x < darea->x0 || x + w > darea->x1 || y < darea->y0 || y + h > darea->y1) {
+ // let the main thread know about changes outside of drawing area
+ agpu_log(gpu, "agpu: fill %d,%d vs area %d,%d\n", x, y, darea->x0, darea->y0);
+ add_draw_area(agpu, agpu->pos_added, 1, x, y, x + w, y + h); // (x0, y0, x1, y1)
+ add_draw_area_e(agpu, agpu->pos_added + 1, 1, gpu->ex_regs);
+ }
gput_sum(cyc_sum, cyc, gput_fill(w, h));
break;
case 0x1f: // irq?
case 0x78 ... 0x7b:
case 0x7C ... 0x7f: gput_sum(cyc_sum, cyc, gput_sprite(16, 16)); break;
case 0x80 ... 0x9f: // vid -> vid
+ x = LE16TOH(slist[4]) & 0x3ff;
+ y = LE16TOH(slist[5]) & 0x1ff;
w = ((LE16TOH(slist[6]) - 1) & 0x3ff) + 1;
h = ((LE16TOH(slist[7]) - 1) & 0x1ff) + 1;
+ darea = &agpu->draw_areas[agpu->pos_area];
+ if (x < darea->x0 || x + w > darea->x1 || y < darea->y0 || y + h > darea->y1) {
+ add_draw_area(agpu, agpu->pos_added, 1, x, y, x + w, y + h); // (x0, y0, x1, y1)
+ add_draw_area_e(agpu, agpu->pos_added + 1, 1, gpu->ex_regs);
+ }
gput_sum(cyc_sum, cyc, gput_copy(w, h));
break;
case 0xa0 ... 0xbf: // sys -> vid
case 0xc0 ... 0xdf: // vid -> sys
goto breakloop;
- case 0xe0 ... 0xe7:
+ case 0xe0 ... 0xe2:
+ case 0xe5 ... 0xe7:
gpu->ex_regs[cmd & 7] = LE32TOH(list[0]);
rendered = 0;
break;
+ case 0xe3:
+ case 0xe4:
+ rendered = 0;
+ if (gpu->ex_regs[cmd & 7] == LE32TOH(list[0])) {
+ skip = 1;
+ break;
+ }
+ gpu->ex_regs[cmd & 7] = LE32TOH(list[0]);
+ add_draw_area_e(agpu, agpu->pos_added, 1, gpu->ex_regs);
+ insert_break = 1;
+ break;
default:
rendered = 0;
skip = 1;
if (dst_can_add) {
if (!skip) {
int added = dst_can_add = do_add(agpu, list, len);
- dst_added += added;
+ pos_handled += added;
}
else
- dst_added += len;
+ pos_handled += len;
}
}
breakloop:
- if (dst_added && (rendered_anything || dst_added < pos))
+ if (pos_handled && (rendered_anything || pos_handled < pos))
run_thread(agpu);
- if (dst_added < pos) {
- int left = pos - dst_added;
- agpu_log(gpu, "agpu: wait %d left %d\n", agpu->pos_added - agpu->pos_used, left);
- do_add_with_wait(agpu, list_data + dst_added, left);
+ if (pos_handled < pos) {
+ // note: this is poorly implemented (wrong pos_added for draw_areas)
+ int left = pos - pos_handled;
+ agpu_log(gpu, "agpu: full %d left %d\n", agpu->pos_added - agpu->pos_used, left);
+ do_add_with_wait(agpu, list_data + pos_handled, left);
+ }
+ if (insert_break) {
+ uint32_t cmd[3] = { HTOLE32(FAKECMD_BREAK << 24), };
+ do_add(agpu, cmd, sizeof(cmd) / sizeof(cmd[0]));
}
*cpu_cycles_sum_out += cyc_sum;
while (!agpu->exit)
{
int len = agpu->pos_added - agpu->pos_used;
- int pos, done, cycles_dummy = 0, cmd = -1;
+ int pos = agpu->pos_used & AGPU_BUF_MASK;
+ int done, cycles_dummy = 0, cmd = -1;
assert(len >= 0);
if (len == 0 && !dirty) {
- if (agpu->wait_mode == waitmode_full)
- scond_signal(agpu->cond_add);
+ switch (agpu->wait_mode) {
+ case waitmode_full:
+ case waitmode_target:
+ agpu->wait_mode = waitmode_none;
+ scond_signal(agpu->cond_add);
+ break;
+ case waitmode_none:
+ break;
+ default:
+ assert(0);
+ }
agpu->idle = 1;
scond_wait(agpu->cond_use, agpu->lock);
continue;
continue;
}
- pos = agpu->pos_used & AGPU_BUF_MASK;
len = min(len, AGPU_BUF_LEN - pos);
done = renderer_do_cmd_list(agpu->cmd_buffer + pos, len, agpu->ex_regs,
&cycles_dummy, &cycles_dummy, &cmd);
if (done != len) {
- if (0x80 <= cmd && cmd < 0xa0)
- done += do_vram_copy(gpup->vram, agpu->ex_regs,
- agpu->cmd_buffer + pos + done, &cycles_dummy);
- else if (cmd == FAKECMD_SCREEN_CHANGE)
- done += do_notify_screen_change(gpup,
- (const void *)(agpu->cmd_buffer + pos + done));
- else if (0xa0 <= cmd && cmd < 0xec)
- assert(0); // todo?
- else
- assert(0); // should not happen
+ switch (cmd) {
+ case 0x80 ... 0x9f:
+ done += do_vram_copy(gpup->vram, agpu->ex_regs,
+ agpu->cmd_buffer + pos + done, &cycles_dummy);
+ break;
+ case FAKECMD_SCREEN_CHANGE:
+ done += do_notify_screen_change(gpup,
+ (const void *)(agpu->cmd_buffer + pos + done));
+ break;
+ case FAKECMD_BREAK:
+ done++;
+ break;
+ default:
+ assert(0);
+ done++;
+ break;
+ }
}
dirty = 1;
assert(done > 0);
slock_lock(agpu->lock);
agpu->pos_used += done;
- if (agpu->wait_mode == waitmode_progress)
- scond_signal(agpu->cond_add);
+ switch (agpu->wait_mode) {
+ case waitmode_target:
+ if ((int32_t)(agpu->pos_used - agpu->pos_target) < 0)
+ break;
+ // fallthrough
+ case waitmode_progress:
+ agpu->wait_mode = waitmode_none;
+ scond_signal(agpu->cond_add);
+ break;
+ default:
+ break;
+ }
}
slock_unlock(agpu->lock);
STRHEAD_RETURN();
if (!agpu || (agpu->idle && agpu->pos_added == agpu->pos_used))
return;
- agpu_log(gpu, "agpu: stall %d\n", agpu->pos_added - agpu->pos_used);
+ agpu_log(gpu, "agpu: sync %d\n", agpu->pos_added - agpu->pos_used);
slock_lock(agpu->lock);
- if (agpu->idle && agpu->pos_added != agpu->pos_used)
+ if (agpu->idle && agpu->pos_added != agpu->pos_used) {
+ agpu_log(gpu, "agpu: idle %d\n", agpu->pos_added - agpu->pos_used);
run_thread_nolock(agpu);
+ }
if (!agpu->idle) {
assert(agpu->wait_mode == waitmode_none);
agpu->wait_mode = waitmode_full;
scond_wait(agpu->cond_add, agpu->lock);
- agpu->wait_mode = waitmode_none;
}
slock_unlock(agpu->lock);
assert(agpu->pos_added == agpu->pos_used);
assert(agpu->idle);
}
+// Called before scanout: if a recently recorded draw area intersects the
+// scanout rectangle, wait until the worker has executed all commands drawn
+// while that area was active; otherwise return without blocking.
+void gpu_async_sync_scanout(struct psx_gpu *gpu)
+{
+ struct psx_gpu_async *agpu = gpu->async;
+ int so_x0 = gpu->screen.src_x, so_y0 = gpu->screen.src_y;
+ int so_x1 = so_x0 + gpu->screen.hres, so_y1 = so_y0 + gpu->screen.vres;
+ uint32_t pos;
+ int c, i;
+
+ if (!agpu || (agpu->idle && agpu->pos_added == agpu->pos_used))
+ return;
+ // unlocked racy read of worker progress - only used as a stale lower bound
+ pos = *(volatile uint32_t *)&agpu->pos_used;
+ i = agpu->pos_area;
+ if (agpu->idle)
+ /* unlikely but possible - do a full sync */;
+ else if (agpu->draw_areas[(i+1) & AGPU_AREAS_MASK].pos > pos) {
+ // even the oldest ring entry is ahead of the worker, so nothing can be
+ // ruled out - fall through to a full sync
+ // NOTE(review): plain '>' does not handle uint32_t position wraparound
+ // the way the (int32_t)(a - b) comparisons elsewhere do - confirm intended
+ agpu_log(gpu, "agpu: oldest draw area %d > %d\n",
+ agpu->draw_areas[(i+1) & AGPU_AREAS_MASK].pos, pos);
+ }
+ else {
+ // walk draw areas newest to oldest, stopping at the first one that
+ // overlaps the scanout rectangle
+ for (c = 0, i = agpu->pos_area; c < AGPU_AREAS_CNT;
+ c++, i = (i - 1) & AGPU_AREAS_MASK)
+ {
+ int area_x0 = agpu->draw_areas[i].x0, area_y0 = agpu->draw_areas[i].y0;
+ int area_x1 = agpu->draw_areas[i].x1, area_y1 = agpu->draw_areas[i].y1;
+ if (so_x1 <= area_x0 || area_x1 <= so_x0)
+ /* no x intersect */;
+ else if (so_y1 <= area_y0 || area_y1 <= so_y0)
+ /* no y intersect */;
+ else {
+ agpu_log(gpu, "agpu: scanout #%d %d,%d %dx%d hit %d,%d %dx%d\n",
+ c, so_x0, so_y0, so_x1 - so_x0, so_y1 - so_y0,
+ area_x0, area_y0, area_x1 - area_x0, area_y1 - area_y0);
+ break;
+ }
+ // this area doesn't touch the scanout; if the worker already passed
+ // it, everything older is done too, so no wait is needed
+ // NOTE(review): '>=' is also a non-wraparound-safe comparison
+ pos = *(volatile uint32_t *)&agpu->pos_used;
+ if (pos >= agpu->draw_areas[i].pos)
+ return;
+ }
+ if (c > 0) {
+ // area i intersects; area i+1 (one newer) marks where the draw area
+ // switched away, so commands drawn under the intersecting area are
+ // complete once the worker reaches draw_areas[i+1].pos + 1
+ i = (i + 1) & AGPU_AREAS_MASK;
+ agpu_log(gpu, "agpu: wait %d/%d\n", agpu->draw_areas[i].pos - agpu->pos_used,
+ agpu->pos_added - agpu->pos_used);
+ slock_lock(agpu->lock);
+ if (!agpu->idle) {
+ assert(agpu->wait_mode == waitmode_none);
+ agpu->pos_target = agpu->draw_areas[i].pos + 1;
+ agpu->wait_mode = waitmode_target;
+ scond_wait(agpu->cond_add, agpu->lock);
+ }
+ slock_unlock(agpu->lock);
+ return;
+ }
+ }
+ // reached when the newest area itself intersects (c == 0) or the worker
+ // state is ambiguous - wait for full completion
+ gpu_async_sync(gpu);
+}
+
void gpu_async_sync_ecmds(struct psx_gpu *gpu)
{
struct psx_gpu_async *agpu = gpu->async;