Merge pull request #462 from justinweiss/threaded-rendering
[pcsx_rearmed.git] / plugins / gpulib / gpulib_thread_if.c
/***************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <pthread.h>
#ifdef _3DS
#include <3ds.h> /* svcSleepThread */
#endif
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpulib_thread_if.h"

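/* Threaded rendering interface for gpulib: GP0 command lists are
 * copied into a ring-buffer queue and executed on a worker thread, so
 * emulation can keep running while the renderer draws. */
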
typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	bool running;
} video_thread_state;

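/* The worker drains thread.queue; thread.bg_queue buffers commands
 * for a future frame while hold_cmds is set. */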
static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static bool hold_cmds;
static bool needs_display;

extern const unsigned char cmd_lengths[];

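/* Worker thread entry point: wait for queued commands, then execute
 * them with the real renderer's do_cmd_list. */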
static void *video_thread_main(void *arg) {
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;
	static int processed = 0;

	while (1) {
		int result, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

		queue = thread->queue;
		start = queue->start;

		/* Claim a contiguous chunk of the ring buffer; a wrapped tail
		 * is picked up on the next loop iteration. */
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count, &last_cmd);

			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif
		}

		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return NULL;
}

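/* Promote the background queue to the active queue once the active
 * one has drained, and wake the worker. */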
static void cmd_queue_swap(void) {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		needs_display = true;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will overwrite the DMA'd data,
	 * resulting in flickering sprites. We need to be totally
	 * up-to-date. This may drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = false;
	renderer_wait();
}

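/* Stop the worker thread, tear down the synchronization primitives,
 * and free every queued command list in both queues. */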
static void video_thread_stop(void) {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = false;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

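/* Start the worker thread. The queue pointers and the running flag
 * are set before pthread_create so the worker never observes a
 * half-initialized state. */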
static void video_thread_start(void) {
	fprintf(stdout, "Starting render thread\n");

	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = true;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	fprintf(stderr, "Failed to start rendering thread\n");
	thread.running = false;
	video_thread_stop();
}

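/* Copy a command list and append it to a queue: the background queue
 * while commands are being held for a future frame, otherwise the
 * active queue (blocking while it is full). */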
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	bool lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory, disable the thread and run sync from now on.
		 * Run this list synchronously rather than writing through the
		 * NULL pointer below. */
		fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		real_do_cmd_list(list, count, &last_cmd);
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it
		 * in Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = false;
	} else {
		queue = thread.queue;
		lock = true;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
{
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		cmd = list[0] >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
		case 0x02:
			break;
		case 0x24 ... 0x27:
		case 0x2c ... 0x2f:
		case 0x34 ... 0x37:
		case 0x3c ... 0x3f:
			/* Textured polygons update the texture page register. */
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
			break;
		case 0x48 ... 0x4F:
			/* Poly-lines run until a 0x5xxx5xxx terminator word. */
			for (v = 3; pos + v < count; v++)
			{
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 3;
			break;
		case 0x58 ... 0x5F:
			for (v = 4; pos + v < count; v += 2)
			{
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 4;
			break;
		default:
			if ((cmd & 0xf8) == 0xe0)
				gpu.ex_regs[cmd & 7] = list[0];
			break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0xa0 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*last_cmd = cmd;
	return pos;
}

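/* gpulib entry point for GP0 command lists. With the worker running,
 * queue the async-safe prefix of the list; otherwise render
 * synchronously. */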
int do_cmd_list(uint32_t *list, int count, int *last_cmd) {
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

void renderer_sync_ecmds(uint32_t *ecmds) {
	if (thread.running) {
		int dummy;
		do_cmd_list(&ecmds[1], 6, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

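/* Cache updates read back from VRAM, so both queues must be fully
 * flushed first. */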
void renderer_update_caches(int x, int y, int w, int h) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps, this is
 * unnecessary -- it forces the game to render as if it were 60fps and
 * leaves the GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame)
 *
 * - If the frame is done, and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which shouldn't differ
 * noticeably from the real device. It may skip rendering a frame
 * entirely if a VRAM transfer happens while a frame is waiting, or in
 * games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		hold_cmds = false;
		needs_display = true;
		gpu.state.fb_dirty = true;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = true;
		needs_display = true;
		gpu.state.fb_dirty = false;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is true. */
		hold_cmds = false;
		needs_display = false;
		gpu.state.fb_dirty = true;
	} else {
		/* Idle: no queued commands and nothing new to display. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}