Merge pull request #462 from justinweiss/threaded-rendering
[pcsx_rearmed.git] / plugins / gpulib / gpulib_thread_if.c
/***************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <pthread.h>
#ifdef _3DS
#include <3ds.h> /* svcSleepThread */
#endif
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpulib_thread_if.h"

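/* Threaded rendering interface for gpulib: GP0 command lists are
 * copied into a ring-buffer queue and executed on a worker thread, so
 * emulation can keep running while the renderer draws. */
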
typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	bool running;
} video_thread_state;

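/* The worker drains thread.queue; thread.bg_queue buffers commands
 * for a future frame while hold_cmds is set. */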
static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static bool hold_cmds;
static bool needs_display;

extern const unsigned char cmd_lengths[];

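/* Worker thread entry point: wait for queued commands, then execute
 * them with the real renderer's do_cmd_list. */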
static void *video_thread_main(void *arg) {
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;
	static int processed = 0;

	while (1) {
		int result, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

		queue = thread->queue;
		start = queue->start;

		/* Claim a contiguous chunk of the ring buffer; a wrapped tail
		 * is picked up on the next loop iteration. */
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count, &last_cmd);

			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif
		}

		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return NULL;
}

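/* Promote the background queue to the active queue once the active
 * one has drained, and wake the worker. */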
static void cmd_queue_swap(void) {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		needs_display = true;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will overwrite the DMA'd data,
	 * resulting in flickering sprites. We need to be totally
	 * up-to-date. This may drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = false;
	renderer_wait();
}

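/* Stop the worker thread, tear down the synchronization primitives,
 * and free every queued command list in both queues. */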
static void video_thread_stop(void) {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = false;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

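/* Start the worker thread. The queue pointers and the running flag
 * are set before pthread_create so the worker never observes a
 * half-initialized state. */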
static void video_thread_start(void) {
	fprintf(stdout, "Starting render thread\n");

	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = true;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	fprintf(stderr, "Failed to start rendering thread\n");
	thread.running = false;
	video_thread_stop();
}

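/* Copy a command list and append it to a queue: the background queue
 * while commands are being held for a future frame, otherwise the
 * active queue (blocking while it is full). */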
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	bool lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory, disable the thread and run sync from now on.
		 * Run this list synchronously rather than writing through the
		 * NULL pointer below. */
		fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		real_do_cmd_list(list, count, &last_cmd);
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it
		 * in Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = false;
	} else {
		queue = thread.queue;
		lock = true;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
{
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		cmd = list[0] >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
		case 0x02:
			break;
		case 0x24 ... 0x27:
		case 0x2c ... 0x2f:
		case 0x34 ... 0x37:
		case 0x3c ... 0x3f:
			/* Textured polygons update the texture page register. */
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
			break;
		case 0x48 ... 0x4F:
			/* Poly-lines run until a 0x5xxx5xxx terminator word. */
			for (v = 3; pos + v < count; v++)
			{
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 3;
			break;
		case 0x58 ... 0x5F:
			for (v = 4; pos + v < count; v += 2)
			{
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 4;
			break;
		default:
			if ((cmd & 0xf8) == 0xe0)
				gpu.ex_regs[cmd & 7] = list[0];
			break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0xa0 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*last_cmd = cmd;
	return pos;
}

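/* gpulib entry point for GP0 command lists. With the worker running,
 * queue the async-safe prefix of the list; otherwise render
 * synchronously. */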
int do_cmd_list(uint32_t *list, int count, int *last_cmd) {
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

void renderer_sync_ecmds(uint32_t *ecmds) {
	if (thread.running) {
		int dummy;
		do_cmd_list(&ecmds[1], 6, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

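/* Cache updates read back from VRAM, so both queues must be fully
 * flushed first. */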
void renderer_update_caches(int x, int y, int w, int h) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps, this is
 * unnecessary -- it forces the game to render as if it were 60fps and
 * leaves the GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame)
 *
 * - If the frame is done, and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which shouldn't differ
 * noticeably from the real device. It may skip rendering a frame
 * entirely if a VRAM transfer happens while a frame is waiting, or in
 * games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		hold_cmds = false;
		needs_display = true;
		gpu.state.fb_dirty = true;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = true;
		needs_display = true;
		gpu.state.fb_dirty = false;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is true. */
		hold_cmds = false;
		needs_display = false;
		gpu.state.fb_dirty = true;
	} else {
		/* Idle: no queued commands and nothing new to display. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}