--- /dev/null
+/**************************************************************************
+* Copyright (C) 2020 The RetroArch Team *
+* *
+* This program is free software; you can redistribute it and/or modify *
+* it under the terms of the GNU General Public License as published by *
+* the Free Software Foundation; either version 2 of the License, or *
+* (at your option) any later version. *
+* *
+* This program is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+* GNU General Public License for more details. *
+* *
+* You should have received a copy of the GNU General Public License *
+* along with this program; if not, write to the *
+* Free Software Foundation, Inc., *
+* 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. *
+***************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include "../gpulib/gpu.h"
+#include "../../frontend/plugin_lib.h"
+#include "gpulib_thread_if.h"
+
+/* One queued batch of GPU command words. */
+typedef struct {
+ /* Heap copy of the command words; allocated when the batch is queued and
+ * freed when the slot is reused or the thread is stopped. */
+ uint32_t *cmd_list;
+ /* Number of 32-bit words in cmd_list. */
+ int count;
+ /* last_cmd value supplied by the caller when the batch was queued. */
+ int last_cmd;
+} video_thread_cmd;
+
+/* Capacity, in entries, of each command queue. */
+#define QUEUE_SIZE 0x2000
+
+/* Fixed-capacity ring buffer of command batches. */
+typedef struct {
+ /* Index of the next entry the render thread will take. */
+ size_t start;
+ /* Index of the next slot a producer will fill. */
+ size_t end;
+ /* Entries queued and not yet reported finished by the render thread. */
+ size_t used;
+ video_thread_cmd queue[QUEUE_SIZE];
+} video_thread_queue;
+
+/* Everything shared between the emulation side and the render thread. */
+typedef struct {
+ pthread_t thread;
+ /* Taken by the render thread and by producers around queue bookkeeping. */
+ pthread_mutex_t queue_lock;
+ /* Signalled when work is placed in the active queue, or on shutdown. */
+ pthread_cond_t cond_msg_avail;
+ /* Signalled by the render thread after it finishes a span of entries. */
+ pthread_cond_t cond_msg_done;
+ /* Signalled by the render thread when a queue's used count reaches zero. */
+ pthread_cond_t cond_queue_empty;
+ /* Active queue: the one the render thread consumes. */
+ video_thread_queue *queue;
+ /* Background queue: receives commands while hold_cmds is set. */
+ video_thread_queue *bg_queue;
+ /* True while the render thread is expected to keep running. */
+ bool running;
+} video_thread_state;
+
+/* Single render-thread instance and the two queues it alternates between. */
+static video_thread_state thread;
+static video_thread_queue queues[2];
+/* Threading mode copied from the frontend config (THREAD_RENDERING_*). */
+static int thread_rendering;
+/* When true, newly queued commands go to the background queue. */
+static bool hold_cmds;
+/* Set when queued work still has to be presented; cleared once
+ * renderer_notify_update_lace marks the framebuffer dirty for it. */
+static bool needs_display;
+
+/* Defined elsewhere; 1 + cmd_lengths[cmd] is used as the word length of
+ * command byte `cmd`. */
+extern const unsigned char cmd_lengths[];
+
+/* Render thread entry point.
+ *
+ * arg is the video_thread_state to service. The loop sleeps until the
+ * active queue has entries (or running is cleared), claims a contiguous
+ * span of entries under the lock, executes them with real_do_cmd_list()
+ * outside the lock, then reports the span as finished. Returns 0 once
+ * running has been cleared. */
+static void *video_thread_main(void *arg) {
+ video_thread_state *thread = (video_thread_state *)arg;
+ video_thread_cmd *cmd;
+ int i;
+ /* Running word count used only by the _3DS yield logic below. */
+ static int processed = 0;
+
+ while(1) {
+ int result, last_cmd, start, end;
+ video_thread_queue *queue;
+ pthread_mutex_lock(&thread->queue_lock);
+
+ /* Sleep until there is work or we are asked to stop. */
+ while (!thread->queue->used && thread->running) {
+ pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
+ }
+
+ if (!thread->running) {
+ pthread_mutex_unlock(&thread->queue_lock);
+ break;
+ }
+
+ /* Claim [start, end). If the pending entries wrap around (or the ring is
+ * full), only the part up to the end of the array is taken now; the rest
+ * is picked up on the next iteration. */
+ queue = thread->queue;
+ start = queue->start;
+ end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
+ queue->start = end % QUEUE_SIZE;
+ pthread_mutex_unlock(&thread->queue_lock);
+
+ /* Execute the claimed entries without holding the lock. */
+ for (i = start; i < end; i++) {
+ cmd = &queue->queue[i];
+ result = real_do_cmd_list(cmd->cmd_list, cmd->count, &last_cmd);
+
+ if (result != cmd->count) {
+ fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
+ }
+
+#ifdef _3DS
+ /* Periodically yield so as not to starve other threads */
+ processed += cmd->count;
+ if (processed >= 512) {
+ svcSleepThread(1);
+ processed %= 512;
+ }
+#endif
+ }
+
+ /* Only now are the entries counted as free, so producers cannot reuse a
+ * slot that is still being executed. */
+ pthread_mutex_lock(&thread->queue_lock);
+ queue->used -= (end - start);
+
+ if (!queue->used)
+ pthread_cond_signal(&thread->cond_queue_empty);
+
+ pthread_cond_signal(&thread->cond_msg_done);
+ pthread_mutex_unlock(&thread->queue_lock);
+ }
+
+ return 0;
+}
+
+/* Promote the background queue to active, but only when it holds commands
+ * and the active queue has been completely drained. A successful swap marks
+ * the frame as needing display and wakes the render thread. */
+static void cmd_queue_swap() {
+  if (thread.bg_queue->used) {
+    pthread_mutex_lock(&thread.queue_lock);
+
+    if (thread.queue->used == 0) {
+      video_thread_queue *drained = thread.queue;
+
+      thread.queue = thread.bg_queue;
+      thread.bg_queue = drained;
+      needs_display = true;
+      pthread_cond_signal(&thread.cond_msg_avail);
+    }
+
+    pthread_mutex_unlock(&thread.queue_lock);
+  }
+}
+
+/* Block until the render thread has finished everything in the active
+ * queue. Returns immediately when the thread is not running. */
+void renderer_wait() {
+  /* The unlocked peek at `used` is a shortcut only: the render thread never
+   * increases it, and the value is re-read under the lock before waiting. */
+  if (thread.running && thread.queue->used) {
+    pthread_mutex_lock(&thread.queue_lock);
+
+    while (thread.queue->used != 0)
+      pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
+
+    pthread_mutex_unlock(&thread.queue_lock);
+  }
+}
+
+/* Drain the active and the background queue so that VRAM reflects every
+ * command submitted so far. No-op when the render thread is not running. */
+void renderer_sync(void) {
+  if (!thread.running)
+    return;
+
+  /* Unlocked peek, as in renderer_wait(): the render thread only ever
+   * decreases `used`, and the waits below re-check under the lock. */
+  if (thread.queue->used || thread.bg_queue->used) {
+    /* Both queues must be flushed. gpulib may be about to perform a DMA
+     * write that a still-queued command has to precede. Xenogears sprites,
+     * for instance, paint a black rectangle over the DMA target first; if
+     * that rectangle lands after the DMA it wipes the transfer and the
+     * sprites flicker. Being fully up to date here may cost a frame. */
+    renderer_wait();       /* finish the active queue            */
+    cmd_queue_swap();      /* promote anything held back         */
+    hold_cmds = false;     /* new commands go to the active queue */
+    renderer_wait();       /* finish what was just promoted      */
+  }
+}
+
+/* Drain outstanding work, terminate and join the render thread, destroy the
+ * synchronisation objects and release every queued command buffer. */
+static void video_thread_stop() {
+  size_t q, i;
+
+  renderer_sync();
+
+  if (thread.running) {
+    /* Clear the flag and signal while holding the lock. The render thread
+     * tests `running` under this lock right before it sleeps on
+     * cond_msg_avail; changing the flag outside the lock could slip in
+     * between that test and the wait, the signal would be lost and the
+     * join below would never return. */
+    pthread_mutex_lock(&thread.queue_lock);
+    thread.running = false;
+    pthread_cond_signal(&thread.cond_msg_avail);
+    pthread_mutex_unlock(&thread.queue_lock);
+
+    pthread_join(thread.thread, NULL);
+  }
+
+  pthread_mutex_destroy(&thread.queue_lock);
+  pthread_cond_destroy(&thread.cond_msg_avail);
+  pthread_cond_destroy(&thread.cond_msg_done);
+  pthread_cond_destroy(&thread.cond_queue_empty);
+
+  /* Walk the backing array rather than thread.queue / thread.bg_queue:
+   * those pointers only ever refer to these two queues, and they may still
+   * be NULL if the thread never got far enough to be set up. */
+  for (q = 0; q < sizeof(queues) / sizeof(queues[0]); q++) {
+    for (i = 0; i < QUEUE_SIZE; i++) {
+      video_thread_cmd *cmd = &queues[q].queue[i];
+      free(cmd->cmd_list);
+      cmd->cmd_list = NULL;
+    }
+  }
+}
+
+/* Create the synchronisation objects and launch the render thread.
+ *
+ * On success thread.running is true. On failure a message is printed,
+ * whatever was initialised is destroyed again, and thread.running stays
+ * false so callers keep rendering synchronously. */
+static void video_thread_start() {
+  fprintf(stdout, "Starting render thread\n");
+
+  /* The queue pointers and the running flag must be in place before the
+   * thread exists: its first action is to read thread.queue->used and
+   * thread.running, and it exits at once if running is false. */
+  thread.queue = &queues[0];
+  thread.bg_queue = &queues[1];
+
+  if (pthread_mutex_init(&thread.queue_lock, NULL))
+    goto err;
+  if (pthread_cond_init(&thread.cond_msg_avail, NULL))
+    goto err_lock;
+  if (pthread_cond_init(&thread.cond_msg_done, NULL))
+    goto err_avail;
+  if (pthread_cond_init(&thread.cond_queue_empty, NULL))
+    goto err_done;
+
+  thread.running = true;
+  if (pthread_create(&thread.thread, NULL, video_thread_main, &thread))
+    goto err_empty;
+
+  return;
+
+  /* Unwind in reverse order so only initialised objects are destroyed. */
+ err_empty:
+  thread.running = false;
+  pthread_cond_destroy(&thread.cond_queue_empty);
+ err_done:
+  pthread_cond_destroy(&thread.cond_msg_done);
+ err_avail:
+  pthread_cond_destroy(&thread.cond_msg_avail);
+ err_lock:
+  pthread_mutex_destroy(&thread.queue_lock);
+ err:
+  fprintf(stderr,"Failed to start rendering thread\n");
+}
+
+/* Copy `count` command words from `list` and append them as one entry to
+ * the queue currently accepting commands: the background queue while
+ * hold_cmds is set, otherwise the active queue.
+ *
+ * list     - source command words; not retained after the call
+ * count    - number of 32-bit words to queue
+ * last_cmd - value recorded alongside the entry
+ *
+ * If the copy cannot be allocated, the render thread is stopped (which
+ * first drains everything already queued) and this batch is executed
+ * synchronously instead. */
+static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
+  video_thread_cmd *cmd;
+  uint32_t *cmd_list;
+  video_thread_queue *queue;
+  bool lock;
+
+  cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));
+
+  /* calloc may legitimately return NULL for a zero-sized request, so only
+   * a NULL for a non-empty batch counts as running out of memory. */
+  if (!cmd_list && count > 0) {
+    int unused_last_cmd;
+
+    /* Out of memory, disable the thread and run sync from now on */
+    fprintf(stderr,"Failed to allocate render thread command list, stopping thread\n");
+    video_thread_stop();
+
+    /* The queues are drained and the thread is gone, so running the batch
+     * here keeps the original command order. */
+    real_do_cmd_list(list, count, &unused_last_cmd);
+    return;
+  }
+
+  if (count > 0)
+    memcpy(cmd_list, list, count * sizeof(uint32_t));
+
+  if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
+    /* If the bg queue is full, do a full sync to empty both queues
+     * and clear space. This should be very rare, I've only seen it in
+     * Tekken 3 post-battle-replay. */
+    renderer_sync();
+  }
+
+  /* The background queue is written without the lock; the active queue is
+   * shared with the render thread and needs it. */
+  if (hold_cmds) {
+    queue = thread.bg_queue;
+    lock = false;
+  } else {
+    queue = thread.queue;
+    lock = true;
+  }
+
+  if (lock) {
+    pthread_mutex_lock(&thread.queue_lock);
+
+    /* Wait for the render thread to free up a slot. */
+    while (queue->used >= QUEUE_SIZE) {
+      pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
+    }
+  }
+
+  /* Slots keep their previous buffer until they are reused. */
+  cmd = &queue->queue[queue->end];
+  free(cmd->cmd_list);
+  cmd->cmd_list = cmd_list;
+  cmd->count = count;
+  cmd->last_cmd = last_cmd;
+  queue->end = (queue->end + 1) % QUEUE_SIZE;
+  queue->used++;
+
+  if (lock) {
+    pthread_cond_signal(&thread.cond_msg_avail);
+    pthread_mutex_unlock(&thread.queue_lock);
+  }
+}
+
+/* Determine how much of a command buffer can be handed to the render
+ * thread, updating gpu.ex_regs along the way.
+ *
+ * data     - command words
+ * count    - number of words available in data
+ * last_cmd - receives the command byte of the last command examined, or -1
+ *            if the buffer ended in the middle of a command
+ *
+ * Returns the number of leading words that form complete commands and can
+ * be processed asynchronously. Scanning stops at the first incomplete
+ * command or at the first command in 0xa0..0xdf (image i/o). */
+static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
+{
+  int cmd = 0, pos = 0, len, v, tpage;
+
+  while (pos < count) {
+    uint32_t *list = data + pos;
+    cmd = list[0] >> 24;
+    len = 1 + cmd_lengths[cmd];
+
+    switch (cmd) {
+      case 0x02:
+        break;
+      case 0x24 ... 0x27:
+      case 0x2c ... 0x2f:
+      case 0x34 ... 0x37:
+      case 0x3c ... 0x3f:
+        /* Mirror the low 9 bits of the primitive's word 4 (or 5 when bit 4
+         * of the command is set) into ex_regs[1], but only when the whole
+         * command is present. A truncated command is not consumed and gets
+         * re-scanned later, so reading the word now would go past the end
+         * of the data and leave a bogus value in ex_regs[1]. */
+        tpage = 4 + ((cmd >> 4) & 1);
+        if (pos + len <= count && pos + tpage < count) {
+          gpu.ex_regs[1] &= ~0x1ff;
+          gpu.ex_regs[1] |= list[tpage] & 0x1ff;
+        }
+        break;
+      case 0x48 ... 0x4F:
+        /* Variable length: extends one word at a time until a terminator
+         * word (0x5xxx5xxx pattern) or the end of the data. */
+        for (v = 3; pos + v < count; v++)
+        {
+          if ((list[v] & 0xf000f000) == 0x50005000)
+            break;
+        }
+        len += v - 3;
+        break;
+      case 0x58 ... 0x5F:
+        /* Variable length: extends two words at a time until a terminator
+         * word or the end of the data. */
+        for (v = 4; pos + v < count; v += 2)
+        {
+          if ((list[v] & 0xf000f000) == 0x50005000)
+            break;
+        }
+        len += v - 4;
+        break;
+      default:
+        /* 0xe0..0xe7: remember the raw word in the matching ex_regs slot. */
+        if ((cmd & 0xf8) == 0xe0)
+          gpu.ex_regs[cmd & 7] = list[0];
+        break;
+    }
+
+    if (pos + len > count) {
+      cmd = -1;
+      break; /* incomplete cmd */
+    }
+    if (0xa0 <= cmd && cmd <= 0xdf)
+      break; /* image i/o */
+
+    pos += len;
+  }
+
+  *last_cmd = cmd;
+  return pos;
+}
+
+/* Process a GPU command list.
+ *
+ * With the render thread running, the asynchronously processable prefix of
+ * the list is queued for it. Otherwise the list is executed immediately and
+ * gpu.ex_regs is refreshed from gpu.scratch_ex_regs.
+ *
+ * Returns the number of words consumed; *last_cmd is filled in by whichever
+ * path ran. */
+int do_cmd_list(uint32_t *list, int count, int *last_cmd) {
+  int consumed;
+
+  if (!thread.running) {
+    consumed = real_do_cmd_list(list, count, last_cmd);
+    memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
+    return consumed;
+  }
+
+  consumed = scan_cmd_list(list, count, last_cmd);
+  video_thread_queue_cmd(list, consumed, *last_cmd);
+  return consumed;
+}
+
+/* Start the render thread when threaded rendering is configured, then
+ * initialise the underlying renderer and return its result. */
+int renderer_init(void) {
+  if (thread_rendering != 0)
+    video_thread_start();
+
+  return real_renderer_init();
+}
+
+/* Shut down rendering.
+ *
+ * The render thread is drained and joined before the underlying renderer
+ * is torn down, so no queued command can execute against a renderer that
+ * has already been finished. */
+void renderer_finish(void) {
+  if (thread_rendering && thread.running) {
+    video_thread_stop();
+  }
+
+  real_renderer_finish();
+}
+
+/* Re-apply the environment commands in `ecmds`. With the render thread
+ * running, the six words starting at ecmds[1] go through do_cmd_list() so
+ * they are ordered with other queued work; otherwise the underlying
+ * renderer handles them directly. */
+void renderer_sync_ecmds(uint32_t * ecmds) {
+  int unused_last_cmd;
+
+  if (!thread.running) {
+    real_renderer_sync_ecmds(ecmds);
+    return;
+  }
+
+  do_cmd_list(&ecmds[1], 6, &unused_last_cmd);
+}
+
+/* Finish all queued rendering, then let the underlying renderer refresh
+ * its caches for the rectangle (x, y, w, h). */
+void renderer_update_caches(int x, int y, int w, int h) {
+  renderer_sync();
+
+  real_renderer_update_caches(x, y, w, h);
+}
+
+/* Forward to the underlying renderer without syncing the render thread. */
+void renderer_flush_queues(void) {
+  /* This is reached both from DMA and from updateLace. Only the DMA case
+   * needs a sync, and the DMA path already calls renderer_sync() itself,
+   * so nothing is synced here. */
+  real_renderer_flush_queues();
+}
+
+/*
+ * Normally all GPU commands are processed before rendering the
+ * frame. For games that naturally run < 50/60fps, this is unnecessary
+ * -- it forces the game to render as if it was 60fps and leaves the
+ * GPU idle half the time on a 30fps game, for example.
+ *
+ * Allowing the renderer to wait until a frame is done before
+ * rendering it would give it double, triple, or quadruple the amount
+ * of time to finish before we have to wait for it.
+ *
+ * We can use a heuristic to figure out when to force a render.
+ *
+ * - If a frame isn't done when we're asked to render, wait for it and
+ * put future GPU commands in a separate buffer (for the next frame)
+ *
+ * - If the frame is done, and had no future GPU commands, render it.
+ *
+ * - If we do have future GPU commands, it meant the frame took too
+ * long to render and there's another frame waiting. Stop until the
+ * first frame finishes, render it, and start processing the next
+ * one.
+ *
+ * This may possibly add a frame or two of latency that shouldn't be
+ * different than the real device. It may skip rendering a frame
+ * entirely if a VRAM transfer happens while a frame is waiting, or in
+ * games that natively run at 60fps if frames are coming in too
+ * quickly to process. Depending on how the game treats "60fps," this
+ * may not be noticeable.
+ *
+ * updated: when nonzero, the call only tries to promote the background
+ * queue (cmd_queue_swap) and returns; when zero, the heuristic above is
+ * applied and hold_cmds, needs_display and gpu.state.fb_dirty are
+ * updated accordingly. Does nothing if the render thread is not running.
+ */
+void renderer_notify_update_lace(int updated) {
+ if (!thread.running) return;
+
+ /* In sync mode every call simply drains all queued work. */
+ if (thread_rendering == THREAD_RENDERING_SYNC) {
+ renderer_sync();
+ return;
+ }
+
+ if (updated) {
+ cmd_queue_swap();
+ return;
+ }
+
+ pthread_mutex_lock(&thread.queue_lock);
+ if (thread.bg_queue->used) {
+ /* We have commands for a future frame to run. Force a wait until
+ * the current frame is finished, and start processing the next
+ * frame after it's drawn (see the `updated` clause above). */
+ /* renderer_wait() takes queue_lock itself, so release it around the call. */
+ pthread_mutex_unlock(&thread.queue_lock);
+ renderer_wait();
+ pthread_mutex_lock(&thread.queue_lock);
+
+ /* We are no longer holding commands back, so the next frame may
+ * get mixed into the following frame. This is usually fine, but can
+ * result in frameskip-like effects for 60fps games. */
+ hold_cmds = false;
+ needs_display = true;
+ gpu.state.fb_dirty = true;
+ } else if (thread.queue->used) {
+ /* We are still drawing during a vblank. Cut off the current frame
+ * by sending new commands to the background queue and skip
+ * drawing our partly rendered frame to the display. */
+ hold_cmds = true;
+ needs_display = true;
+ gpu.state.fb_dirty = false;
+ } else if (needs_display && !thread.queue->used) {
+ /* We have processed all commands in the queue, render the
+ * buffer. We know we have something to render, because
+ * needs_display is true. */
+ hold_cmds = false;
+ needs_display = false;
+ gpu.state.fb_dirty = true;
+ } else {
+ /* Everything went normally, so do the normal thing. */
+ }
+
+ pthread_mutex_unlock(&thread.queue_lock);
+}
+
+/* Pass the interlace settings straight through to the underlying renderer;
+ * no synchronisation with the render thread is performed here. */
+void renderer_set_interlace(int enable, int is_odd) {
+  real_renderer_set_interlace(enable, is_odd);
+}
+
+/* Apply a new frontend configuration.
+ *
+ * Pending rendering is drained first. The threading mode is then taken from
+ * cbs->thread_rendering and the render thread is started or stopped to
+ * match, before the configuration is forwarded to the underlying renderer. */
+void renderer_set_config(const struct rearmed_cbs *cbs) {
+  renderer_sync();
+
+  thread_rendering = cbs->thread_rendering;
+
+  if (thread.running) {
+    if (thread_rendering == THREAD_RENDERING_OFF)
+      video_thread_stop();
+  } else if (thread_rendering != THREAD_RENDERING_OFF) {
+    video_thread_start();
+  }
+
+  real_renderer_set_config(cbs);
+}
+
+/* Finish all queued rendering before telling the underlying renderer that
+ * the display resolution changed. */
+void renderer_notify_res_change(void) {
+  renderer_sync();
+
+  real_renderer_notify_res_change();
+}