unbreak the build of gpu thread thing
pcsx_rearmed.git: plugins/gpulib/gpulib_thread_if.c
/***************************************************************************
 * Copyright (C) 2020 The RetroArch Team
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.
 ***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpulib_thread_if.h"

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

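/* Ring buffer of pending command lists: start/end index into queue[],
 * used counts the occupied slots. */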
typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

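/* Render thread state. The worker drains `queue`; `bg_queue` buffers
 * commands for the next frame while hold_cmds is set, and the two are
 * swapped by cmd_queue_swap(). */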
typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	BOOL running;
} video_thread_state;

static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
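/* hold_cmds diverts new commands to bg_queue until the current frame is
 * drawn; needs_display marks that a finished frame is waiting to be shown;
 * flushed notes that bg_queue was force-flushed by renderer_sync(). */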
static BOOL hold_cmds;
static BOOL needs_display;
static BOOL flushed;

extern const unsigned char cmd_lengths[];

static void *video_thread_main(void *arg) {
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;

#ifdef _3DS
	static int processed = 0;
#endif /* _3DS */

	while(1) {
		int result, cpu_cycles = 0, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

		queue = thread->queue;
		start = queue->start;
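		/* Claim only the contiguous run up to the wrap point; a wrapped
		 * tail is picked up on the next pass of the outer loop. */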
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count,
					&cpu_cycles, &last_cmd);

			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif /* _3DS */
		}

		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return 0;
}

static void cmd_queue_swap() {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;

		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait() {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	if (thread.bg_queue->used) {
		/* When we flush the background queue, the vblank handler can't
		 * know that we had a frame pending, and we delay rendering too
		 * long. Force it. */
		flushed = TRUE;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will clear the DMA, resulting in
	 * flickering sprites. We need to be totally up-to-date. This may
	 * drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = FALSE;
	renderer_wait();
}
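/* Illustrative only -- not compiled. A minimal sketch of the call order the
 * comment above relies on: any CPU-side VRAM access (e.g. a DMA transfer
 * handled elsewhere in gpulib) should call renderer_sync() first so every
 * queued draw lands before the transfer. The function name below is
 * hypothetical and not part of this file. */
#if 0
static void example_vram_dma_transfer(void)
{
	renderer_sync();   /* drain both queues; VRAM is now up to date */
	/* ... perform the VRAM read/write here ... */
}
#endif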

static void video_thread_stop() {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = FALSE;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

static void video_thread_start() {
	fprintf(stdout, "Starting render thread\n");

	/* The worker dereferences thread->queue and checks thread->running as
	 * soon as it starts, so set these up before pthread_create. */
	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = TRUE;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	fprintf(stderr, "Failed to start rendering thread\n");
	thread.running = FALSE;
	video_thread_stop();
}

static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	BOOL lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory, disable the thread and run sync from now on */
		fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = FALSE;
	} else {
		queue = thread.queue;
		lock = TRUE;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
{
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		cmd = list[0] >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
		case 0x02:
			break;
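		/* textured prims: update the texpage bits mirrored in ex_regs[1] (E1) */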
		case 0x24 ... 0x27:
		case 0x2c ... 0x2f:
		case 0x34 ... 0x37:
		case 0x3c ... 0x3f:
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
			break;
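		/* poly-lines are variable length, terminated by a word where
		 * (word & 0xf000f000) == 0x50005000 */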
		case 0x48 ... 0x4F:
			for (v = 3; pos + v < count; v++)
			{
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 3;
			break;
		case 0x58 ... 0x5F:
			for (v = 4; pos + v < count; v += 2)
			{
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 4;
			break;
		default:
			if ((cmd & 0xf8) == 0xe0)
				gpu.ex_regs[cmd & 7] = list[0];
			break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0xa0 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*last_cmd = cmd;
	return pos;
}

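/* gpulib entry point: when the render thread is running, queue the part of
 * the list that can run asynchronously; otherwise hand the whole list to
 * the real renderer synchronously. */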
int do_cmd_list(uint32_t *list, int count, int *cycles, int *last_cmd) {
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, cycles, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

void renderer_sync_ecmds(uint32_t * ecmds) {
	if (thread.running) {
		int dummy = 0;
		do_cmd_list(&ecmds[1], 6, &dummy, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run < 50/60fps, this is unnecessary
 * -- it forces the game to render as if it was 60fps and leaves the
 * GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame)
 *
 * - If the frame is done, and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may possibly add a frame or two of latency that shouldn't be
 * different from the real device. It may skip rendering a frame
 * entirely if a VRAM transfer happens while a frame is waiting, or in
 * games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used || flushed) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		flushed = FALSE;
		hold_cmds = FALSE;
		needs_display = TRUE;
		gpu.state.fb_dirty = TRUE;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = TRUE;
		needs_display = TRUE;
		gpu.state.fb_dirty = FALSE;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is TRUE. */
		hold_cmds = FALSE;
		needs_display = FALSE;
		gpu.state.fb_dirty = TRUE;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}