/**************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpu.h"
#include "gpu_timing.h"
#include "gpulib_thread_if.h"

extern void SysPrintf(const char *fmt, ...);

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	BOOL running;
} video_thread_state;

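/* Two queues are used: the worker thread drains `queue`, while
 * `bg_queue` collects commands meant for a later frame whenever
 * hold_cmds is set. cmd_queue_swap() promotes the background queue
 * once the active one has drained. */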
static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static BOOL hold_cmds;
static BOOL needs_display;
static BOOL flushed;

extern const unsigned char cmd_lengths[];

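/* Worker thread: drains batches of display-list commands from the
 * active queue and feeds them to the real renderer. */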
static void *video_thread_main(void *arg) {
	video_thread_cmd *cmd;
	int i;

#ifdef _3DS
	static int processed = 0;
#endif /* _3DS */

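	/* FPSCR is per-thread state on ARM, so set flush-to-zero and
	 * default-NaN ("RunFast" mode) for this worker as well; otherwise
	 * denormal handling can slow the thread down. */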
#if defined(__arm__) && defined(__ARM_FP)
	// RunFast mode
	uint32_t fpscr = ~0;
	__asm__ volatile("vmrs %0, fpscr" : "=r"(fpscr));
	fpscr &= ~0x00009f9f;
	fpscr |= 0x03000000; // DN | FZ
	__asm__ volatile("vmsr fpscr, %0" :: "r"(fpscr));
#endif

	while(1) {
		int result, cycles_dummy = 0, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread.queue_lock);

		while (!thread.queue->used && thread.running) {
			pthread_cond_wait(&thread.cond_msg_avail, &thread.queue_lock);
		}

		if (!thread.running) {
			pthread_mutex_unlock(&thread.queue_lock);
			break;
		}

		queue = thread.queue;
		start = queue->start;
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread.queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count,
					&cycles_dummy, &cycles_dummy, &last_cmd);
			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif /* _3DS */
		}

		pthread_mutex_lock(&thread.queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread.cond_queue_empty);

		pthread_cond_signal(&thread.cond_msg_done);
		pthread_mutex_unlock(&thread.queue_lock);
	}

	return 0;
}

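/* If the worker thread has drained the active queue, swap it with the
 * background queue so that held-back commands start executing. */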
static void cmd_queue_swap() {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait() {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	if (thread.bg_queue->used) {
		/* When we flush the background queue, the vblank handler can't
		 * know that we had a frame pending, and we delay rendering too
		 * long. Force it. */
		flushed = TRUE;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will clear the DMA, resulting in
	 * flickering sprites. We need to be totally up-to-date. This may
	 * drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = FALSE;
	renderer_wait();
}

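/* Stop the worker thread (if it is running), destroy the
 * synchronization objects, and free any command lists still held by
 * either queue. */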
static void video_thread_stop() {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = FALSE;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

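/* Create the synchronization objects and spawn the worker thread; on
 * failure everything is torn down again and rendering stays
 * synchronous. */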
static void video_thread_start() {
	SysPrintf("Starting render thread\n");

	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = TRUE;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

error:
	SysPrintf("Failed to start rendering thread\n");
	thread.running = FALSE;
	video_thread_stop();
}

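/* Copy `count` words of the display list and append them to the active
 * queue (waking the worker), or to the background queue for a later
 * frame while hold_cmds is set. */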
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	BOOL lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory, disable the thread and run sync from now on */
		SysPrintf("Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = FALSE;
	} else {
		queue = thread.queue;
		lock = TRUE;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
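/* Scanning stops at VRAM image transfers (0x80-0xdf) and at any
 * truncated command, so anything past that point is not queued here. */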
static int scan_cmd_list(uint32_t *data, int count,
	int *cycles_sum_out, int *cycles_last, int *last_cmd)
{
	int cpu_cycles_sum = 0, cpu_cycles = *cycles_last;
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		short *slist = (void *)list;
		cmd = LE32TOH(list[0]) >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
		case 0x02:
			gput_sum(cpu_cycles_sum, cpu_cycles,
				gput_fill(LE16TOH(slist[4]) & 0x3ff,
					LE16TOH(slist[5]) & 0x1ff));
			break;
		case 0x20 ... 0x23:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base());
			break;
		case 0x24 ... 0x27:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_t());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
			break;
		case 0x28 ... 0x2b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base());
			break;
		case 0x2c ... 0x2f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
			break;
		case 0x30 ... 0x33:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
			break;
		case 0x34 ... 0x37:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
			break;
		case 0x38 ... 0x3b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
			break;
		case 0x3c ... 0x3f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
			break;
		case 0x40 ... 0x47:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
			break;
		case 0x48 ... 0x4F:
			for (v = 3; pos + v < count; v++)
			{
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 3;
			break;
		case 0x50 ... 0x57:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
			break;
		case 0x58 ... 0x5F:
			for (v = 4; pos + v < count; v += 2)
			{
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 4;
			break;
		case 0x60 ... 0x63:
			gput_sum(cpu_cycles_sum, cpu_cycles,
				gput_sprite(LE16TOH(slist[4]) & 0x3ff,
					LE16TOH(slist[5]) & 0x1ff));
			break;
		case 0x64 ... 0x67:
			gput_sum(cpu_cycles_sum, cpu_cycles,
				gput_sprite(LE16TOH(slist[6]) & 0x3ff,
					LE16TOH(slist[7]) & 0x1ff));
			break;
		case 0x68 ... 0x6b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
			break;
		case 0x70 ... 0x77:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(8, 8));
			break;
		case 0x78 ... 0x7f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(16, 16));
			break;
		default:
			if ((cmd & 0xf8) == 0xe0)
				gpu.ex_regs[cmd & 7] = list[0];
			break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0x80 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*cycles_sum_out += cpu_cycles_sum;
	*cycles_last = cpu_cycles;
	*last_cmd = cmd;
	return pos;
}

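/* gpulib entry point: while the worker thread is running, scan the
 * async-safe part of the list and queue a copy of it; otherwise fall
 * through to the real renderer synchronously. */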
int do_cmd_list(uint32_t *list, int count,
	int *cycles_sum, int *cycles_last, int *last_cmd)
{
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

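/* Run the cached state commands through the regular queue path so they
 * are applied in order with any commands already queued. */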
void renderer_sync_ecmds(uint32_t *ecmds) {
	if (thread.running) {
		int dummy = 0;
		do_cmd_list(&ecmds[1], 6, &dummy, &dummy, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

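/* VRAM must be up to date before the renderer refreshes its caches, so
 * wait for both queues to drain first. */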
void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run < 50/60fps, this is unnecessary
 * -- it forces the game to render as if it were 60fps and leaves the
 * GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render:
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame).
 *
 * - If the frame is done and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which shouldn't differ much
 * from the real device. It may skip rendering a frame entirely if a
 * VRAM transfer happens while a frame is waiting, or in games that
 * natively run at 60fps if frames are coming in too quickly to
 * process. Depending on how the game treats "60fps," this may not be
 * noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used || flushed) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		flushed = FALSE;
		hold_cmds = FALSE;
		needs_display = TRUE;
		gpu.state.fb_dirty = TRUE;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = TRUE;
		needs_display = TRUE;
		gpu.state.fb_dirty = FALSE;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is TRUE. */
		hold_cmds = FALSE;
		needs_display = FALSE;
		gpu.state.fb_dirty = TRUE;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}