Allow threaded rendering for peops and enable on unix
plugins/gpulib/gpulib_thread_if.c
/***************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpulib_thread_if.h"

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

typedef struct {
        uint32_t *cmd_list;
        int count;
        int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

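/* Fixed-size ring buffer of pending command lists. start is the next
 * slot the render thread will consume, end is the next slot the emu
 * thread will fill, and used is the occupancy count; all three are
 * guarded by queue_lock. The consumer drains start..end in one
 * contiguous chunk per pass, so a wrapped tail is picked up on the
 * following pass. */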
typedef struct {
        size_t start;
        size_t end;
        size_t used;
        video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
        pthread_t thread;
        pthread_mutex_t queue_lock;
        pthread_cond_t cond_msg_avail;
        pthread_cond_t cond_msg_done;
        pthread_cond_t cond_queue_empty;
        video_thread_queue *queue;
        video_thread_queue *bg_queue;
        BOOL running;
} video_thread_state;

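/* queue is consumed by the render thread; bg_queue holds commands for
 * a future frame while hold_cmds is set. thread_rendering caches the
 * THREAD_RENDERING_* mode from the frontend config, and needs_display
 * marks that a finished frame is waiting to be shown. */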
static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static BOOL hold_cmds;
static BOOL needs_display;

extern const unsigned char cmd_lengths[];

static void *video_thread_main(void *arg) {
        video_thread_state *thread = (video_thread_state *)arg;
        video_thread_cmd *cmd;
        int i;

#ifdef _3DS
        static int processed = 0;
#endif /* _3DS */

        while(1) {
                int result, last_cmd, start, end;
                video_thread_queue *queue;
                pthread_mutex_lock(&thread->queue_lock);

                while (!thread->queue->used && thread->running) {
                        pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
                }

                if (!thread->running) {
                        pthread_mutex_unlock(&thread->queue_lock);
                        break;
                }

                queue = thread->queue;
                start = queue->start;
                end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
                queue->start = end % QUEUE_SIZE;
                pthread_mutex_unlock(&thread->queue_lock);

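                /* Run the snapshotted chunk outside the lock; end was
                 * clamped to QUEUE_SIZE above, so the chunk never wraps
                 * and a wrapped remainder is handled on the next loop
                 * iteration. */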
                for (i = start; i < end; i++) {
                        cmd = &queue->queue[i];
                        result = real_do_cmd_list(cmd->cmd_list, cmd->count, &last_cmd);

                        if (result != cmd->count) {
                                fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
                        }

#ifdef _3DS
                        /* Periodically yield so as not to starve other threads */
                        processed += cmd->count;
                        if (processed >= 512) {
                                svcSleepThread(1);
                                processed %= 512;
                        }
#endif /* _3DS */
                }

                pthread_mutex_lock(&thread->queue_lock);
                queue->used -= (end - start);

                if (!queue->used)
                        pthread_cond_signal(&thread->cond_queue_empty);

                pthread_cond_signal(&thread->cond_msg_done);
                pthread_mutex_unlock(&thread->queue_lock);
        }

        return 0;
}

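/* Promote the background queue to the active queue, but only once the
 * active queue has fully drained; the render thread is then woken to
 * start on the new frame's commands. */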
static void cmd_queue_swap(void) {
        video_thread_queue *tmp;
        if (!thread.bg_queue->used) return;

        pthread_mutex_lock(&thread.queue_lock);
        if (!thread.queue->used) {
                tmp = thread.queue;
                thread.queue = thread.bg_queue;
                thread.bg_queue = tmp;
                needs_display = TRUE;
                pthread_cond_signal(&thread.cond_msg_avail);
        }
        pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait(void) {
        if (!thread.running) return;

        /* Not completely safe, but should be fine since the render thread
         * only decreases used, and we check again inside the lock. */
        if (!thread.queue->used) {
                return;
        }

        pthread_mutex_lock(&thread.queue_lock);

        while (thread.queue->used) {
                pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
        }

        pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
        if (!thread.running) return;

        /* Not completely safe, but should be fine since the render thread
         * only decreases used, and we check again inside the lock. */
        if (!thread.queue->used && !thread.bg_queue->used) {
                return;
        }

        /* Flush both queues. This is necessary because gpulib could be
         * trying to process a DMA write that a command in the queue should
         * run beforehand. For example, Xenogears sprites write a black
         * rectangle over the to-be-DMA'd spot in VRAM -- if this write
         * happens after the DMA, it wipes out the freshly transferred
         * data, resulting in flickering sprites. We need to be totally
         * up-to-date. This may drop a frame. */
        renderer_wait();
        cmd_queue_swap();
        hold_cmds = FALSE;
        renderer_wait();
}

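/* Drains everything, joins the render thread, and frees any command
 * lists still sitting in either queue. Also used as the bail-out path
 * when startup fails; the join is skipped when the thread never ran. */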
static void video_thread_stop(void) {
        int i;
        renderer_sync();

        if (thread.running) {
                thread.running = FALSE;
                pthread_cond_signal(&thread.cond_msg_avail);
                pthread_join(thread.thread, NULL);
        }

        pthread_mutex_destroy(&thread.queue_lock);
        pthread_cond_destroy(&thread.cond_msg_avail);
        pthread_cond_destroy(&thread.cond_msg_done);
        pthread_cond_destroy(&thread.cond_queue_empty);

        for (i = 0; i < QUEUE_SIZE; i++) {
                video_thread_cmd *cmd = &thread.queue->queue[i];
                free(cmd->cmd_list);
                cmd->cmd_list = NULL;
        }

        for (i = 0; i < QUEUE_SIZE; i++) {
                video_thread_cmd *cmd = &thread.bg_queue->queue[i];
                free(cmd->cmd_list);
                cmd->cmd_list = NULL;
        }
}

static void video_thread_start(void) {
        fprintf(stdout, "Starting render thread\n");

        /* Initialize everything the render thread reads before it is
         * created, so it can never observe NULL queue pointers or a
         * stale running flag. */
        thread.queue = &queues[0];
        thread.bg_queue = &queues[1];
        thread.running = TRUE;

        if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
                        pthread_cond_init(&thread.cond_msg_done, NULL) ||
                        pthread_cond_init(&thread.cond_queue_empty, NULL) ||
                        pthread_mutex_init(&thread.queue_lock, NULL) ||
                        pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
                goto error;
        }

        return;

 error:
        fprintf(stderr, "Failed to start rendering thread\n");
        /* pthread_create never ran or failed, so there is no thread to
         * join; clear running before the teardown. */
        thread.running = FALSE;
        video_thread_stop();
}

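/* Copies the given command list and appends it to the active queue
 * (blocking while that queue is full), or to the background queue
 * when hold_cmds is set. */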
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
        video_thread_cmd *cmd;
        uint32_t *cmd_list;
        video_thread_queue *queue;
        BOOL lock;

        cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

        if (!cmd_list) {
                /* Out of memory: run this list synchronously, disable the
                 * thread, and render sync from now on */
                fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
                video_thread_stop();
                real_do_cmd_list(list, count, &last_cmd);
                return;
        }

        memcpy(cmd_list, list, count * sizeof(uint32_t));

        if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
                /* If the bg queue is full, do a full sync to empty both queues
                 * and clear space. This should be very rare, I've only seen it in
                 * Tekken 3 post-battle-replay. */
                renderer_sync();
        }

        if (hold_cmds) {
                queue = thread.bg_queue;
                lock = FALSE;
        } else {
                queue = thread.queue;
                lock = TRUE;
        }

        if (lock) {
                pthread_mutex_lock(&thread.queue_lock);

                while (queue->used >= QUEUE_SIZE) {
                        pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
                }
        }

        /* Free any stale list left in this slot by an earlier wrap of
         * the ring buffer before claiming it. */
        cmd = &queue->queue[queue->end];
        free(cmd->cmd_list);
        cmd->cmd_list = cmd_list;
        cmd->count = count;
        cmd->last_cmd = last_cmd;
        queue->end = (queue->end + 1) % QUEUE_SIZE;
        queue->used++;

        if (lock) {
                pthread_cond_signal(&thread.cond_msg_avail);
                pthread_mutex_unlock(&thread.queue_lock);
        }
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
{
        int cmd = 0, pos = 0, len, v;

        while (pos < count) {
                uint32_t *list = data + pos;
                cmd = list[0] >> 24;
                len = 1 + cmd_lengths[cmd];

                switch (cmd) {
                        case 0x02:
                                /* fill rectangle: fixed length, safe to queue */
                                break;
                        case 0x24 ... 0x27:
                        case 0x2c ... 0x2f:
                        case 0x34 ... 0x37:
                        case 0x3c ... 0x3f:
                                /* textured primitive: mirror the texpage bits
                                 * into ex_regs while scanning */
                                gpu.ex_regs[1] &= ~0x1ff;
                                gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
                                break;
                        case 0x48 ... 0x4F:
                                /* monochrome polyline: variable length, ends
                                 * at the 0x5xxx5xxx terminator */
                                for (v = 3; pos + v < count; v++)
                                {
                                        if ((list[v] & 0xf000f000) == 0x50005000)
                                                break;
                                }
                                len += v - 3;
                                break;
                        case 0x58 ... 0x5F:
                                /* shaded polyline: color/vertex word pairs,
                                 * also terminated by 0x5xxx5xxx */
                                for (v = 4; pos + v < count; v += 2)
                                {
                                        if ((list[v] & 0xf000f000) == 0x50005000)
                                                break;
                                }
                                len += v - 4;
                                break;
                        default:
                                if ((cmd & 0xf8) == 0xe0)
                                        gpu.ex_regs[cmd & 7] = list[0];
                                break;
                }

                if (pos + len > count) {
                        cmd = -1;
                        break; /* incomplete cmd */
                }
                if (0xa0 <= cmd && cmd <= 0xdf)
                        break; /* image i/o must run synchronously */

                pos += len;
        }

        *last_cmd = cmd;
        return pos;
}

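/* Threaded entry point: queue only the async-safe prefix returned by
 * scan_cmd_list and report how far we got, leaving the remainder
 * (e.g. image i/o) for gpulib to handle; when the thread is off, fall
 * straight through to the real renderer. */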
int do_cmd_list(uint32_t *list, int count, int *last_cmd) {
        int pos = 0;

        if (thread.running) {
                pos = scan_cmd_list(list, count, last_cmd);
                video_thread_queue_cmd(list, pos, *last_cmd);
        } else {
                pos = real_do_cmd_list(list, count, last_cmd);
                memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
        }
        return pos;
}

int renderer_init(void) {
        if (thread_rendering) {
                video_thread_start();
        }
        return real_renderer_init();
}

void renderer_finish(void) {
        real_renderer_finish();

        if (thread_rendering && thread.running) {
                video_thread_stop();
        }
}

void renderer_sync_ecmds(uint32_t *ecmds) {
        if (thread.running) {
                int dummy;
                do_cmd_list(&ecmds[1], 6, &dummy);
        } else {
                real_renderer_sync_ecmds(ecmds);
        }
}

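/* The real renderer inspects the updated VRAM region, so every queued
 * draw must have landed in VRAM before the caches are refreshed. */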
void renderer_update_caches(int x, int y, int w, int h) {
        renderer_sync();
        real_renderer_update_caches(x, y, w, h);
}

void renderer_flush_queues(void) {
        /* Called during DMA and updateLace. We want to sync if it's DMA,
         * but not if it's updateLace. Instead of syncing here, there's a
         * renderer_sync call during DMA. */
        real_renderer_flush_queues();
}


/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps, this is
 * unnecessary -- it forces the game to render as if it were running
 * at 60fps and leaves the GPU idle half the time on a 30fps game,
 * for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it gives it double, triple, or quadruple the amount of
 * time to finish before we have to wait for it.
 *
 * We use a heuristic to figure out when to force a render:
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame)
 *
 * - If the frame is done and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, the frame took too long to
 *   render and there's another frame waiting. Stop until the first
 *   frame finishes, render it, and start processing the next one.
 *
 * This may add a frame or two of latency, which should be no
 * different from the real hardware. It may skip rendering a frame
 * entirely if a VRAM transfer happens while a frame is waiting, or in
 * games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
        if (!thread.running) return;

        if (thread_rendering == THREAD_RENDERING_SYNC) {
                renderer_sync();
                return;
        }

        if (updated) {
                cmd_queue_swap();
                return;
        }

        pthread_mutex_lock(&thread.queue_lock);
        if (thread.bg_queue->used) {
                /* We have commands for a future frame to run. Force a wait until
                 * the current frame is finished, and start processing the next
                 * frame after it's drawn (see the `updated` clause above). */
                pthread_mutex_unlock(&thread.queue_lock);
                renderer_wait();
                pthread_mutex_lock(&thread.queue_lock);

                /* We are no longer holding commands back, so the next frame may
                 * get mixed into the following frame. This is usually fine, but can
                 * result in frameskip-like effects for 60fps games. */
                hold_cmds = FALSE;
                needs_display = TRUE;
                gpu.state.fb_dirty = TRUE;
        } else if (thread.queue->used) {
                /* We are still drawing during a vblank. Cut off the current frame
                 * by sending new commands to the background queue and skip
                 * drawing our partly rendered frame to the display. */
                hold_cmds = TRUE;
                needs_display = TRUE;
                gpu.state.fb_dirty = FALSE;
        } else if (needs_display && !thread.queue->used) {
                /* We have processed all commands in the queue, render the
                 * buffer. We know we have something to render, because
                 * needs_display is TRUE. */
                hold_cmds = FALSE;
                needs_display = FALSE;
                gpu.state.fb_dirty = TRUE;
        } else {
                /* Everything went normally, so do the normal thing. */
        }

        pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
        real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
        renderer_sync();
        thread_rendering = cbs->thread_rendering;
        if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
                video_thread_start();
        } else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
                video_thread_stop();
        }
        real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
        renderer_sync();
        real_renderer_notify_res_change();
}