Add a threaded renderer
[pcsx_rearmed.git] plugins/gpulib/gpulib_thread_if.c
/**************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpulib_thread_if.h"

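/* One queued batch of GPU commands: a heap-allocated copy of the display
 * list slice, its word count, and the last opcode seen while scanning. */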
typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

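/* Fixed-size ring buffer of command batches. start/end are the consumer
 * and producer indices; used tracks how many slots are occupied. */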
typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

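/* Worker thread state. Two queues are kept: `queue` is what the render
 * thread is draining now, `bg_queue` holds commands for a future frame
 * while the current one finishes (see renderer_notify_update_lace). */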
typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	bool running;
} video_thread_state;

static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static bool hold_cmds;
static bool needs_display;

extern const unsigned char cmd_lengths[];

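/* Render thread entry point: block until commands are queued, claim a
 * contiguous run of slots, replay it through the real renderer outside
 * the lock, then signal the waiters (cond_msg_done/cond_queue_empty). */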
static void *video_thread_main(void *arg) {
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;
	static int processed = 0;

	while(1) {
		int result, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

		/* Claim one contiguous run of slots; if the data wraps around
		 * the end of the ring, the rest is picked up on the next pass. */
		queue = thread->queue;
		start = queue->start;
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count, &last_cmd);

			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif
		}

		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return NULL;
}

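/* Promote the background queue to the live queue once the render thread
 * has drained the current one, and wake the thread up. No-op if the bg
 * queue is empty or the live queue is still busy. */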
static void cmd_queue_swap(void) {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		needs_display = true;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will overwrite the DMA'd data,
	 * resulting in flickering sprites. We need to be totally
	 * up-to-date. This may drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = false;
	renderer_wait();
}

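/* Tear the render thread down: drain both queues, join the thread,
 * destroy the sync primitives, and free every saved command list. */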
static void video_thread_stop(void) {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = false;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

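/* Spin up the worker: point the queues at their backing storage, init
 * the sync primitives, and launch video_thread_main. On any failure we
 * fall back to synchronous rendering. */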
static void video_thread_start(void) {
	fprintf(stdout, "Starting render thread\n");

	/* Set the queue pointers and running flag before pthread_create so
	 * the worker never observes half-initialized state, and so the
	 * error path (which walks both queues) never sees a NULL queue. */
	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = true;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	fprintf(stderr, "Failed to start rendering thread\n");
	thread.running = false;
	video_thread_stop();
}

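/* Copy a command list slice into a queue. Commands normally go to the
 * live queue (waking the worker); while hold_cmds is set they go to the
 * background queue, unlocked, since only this thread touches it. */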
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	bool lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory: disable the thread and run sync from now on.
		 * Process this list directly so its commands are not lost. */
		fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		real_do_cmd_list(list, count, &last_cmd);
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = false;
	} else {
		queue = thread.queue;
		lock = true;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	/* Reuse the slot: free whatever list a previous lap of the ring
	 * left behind before storing the new one. */
	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
{
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		cmd = list[0] >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
			case 0x02:
				/* VRAM fill: fixed length, safe to queue as-is */
				break;
			case 0x24 ... 0x27:
			case 0x2c ... 0x2f:
			case 0x34 ... 0x37:
			case 0x3c ... 0x3f:
				/* Textured polygons: latch the texpage attribute word
				 * (word 4, or word 5 for the shaded variants) into the
				 * e1 mirror */
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
				break;
			case 0x48 ... 0x4F:
				/* Poly-lines: variable length, scan for the terminator */
				for (v = 3; pos + v < count; v++)
				{
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 3;
				break;
			case 0x58 ... 0x5F:
				/* Shaded poly-lines: two words per vertex */
				for (v = 4; pos + v < count; v += 2)
				{
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 4;
				break;
			default:
				/* e0-e7 draw environment commands are latched into the
				 * ex_regs mirror */
				if ((cmd & 0xf8) == 0xe0)
					gpu.ex_regs[cmd & 7] = list[0];
				break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0xa0 <= cmd && cmd <= 0xdf)
			break; /* image i/o: must be handled synchronously */

		pos += len;
	}

	*last_cmd = cmd;
	return pos;
}

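/* gpulib's entry point for command list processing. When the render
 * thread is up, scan off the async-safe prefix and queue it; otherwise
 * fall through to the real renderer synchronously. */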
int do_cmd_list(uint32_t *list, int count, int *last_cmd) {
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

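/* Replay the queued e-command state (ecmds[1..6] hold the e1-e6 setup
 * commands). When threaded, they go through the normal queue so they
 * apply in order with everything else. */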
void renderer_sync_ecmds(uint32_t *ecmds) {
	if (thread.running) {
		int dummy;
		do_cmd_list(&ecmds[1], 6, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

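/* The renderer reads VRAM when rebuilding its caches, so everything
 * queued must hit VRAM first. */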
void renderer_update_caches(int x, int y, int w, int h) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps this is
 * unnecessary -- it forces the game to render as if it were 60fps and
 * leaves the GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame).
 *
 * - If the frame is done and there are no future GPU commands, render
 *   it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which shouldn't be
 * noticeably different from the real device. It may skip rendering a
 * frame entirely if a VRAM transfer happens while a frame is waiting,
 * or in games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		hold_cmds = false;
		needs_display = true;
		gpu.state.fb_dirty = true;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = true;
		needs_display = true;
		gpu.state.fb_dirty = false;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is true. */
		hold_cmds = false;
		needs_display = false;
		gpu.state.fb_dirty = true;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

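/* Runtime config change: start or stop the render thread to match the
 * requested THREAD_RENDERING_* mode (from frontend/plugin_lib.h). The
 * sync first makes the switch safe mid-frame. */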
void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}