unbreak the build of gpu thread thing
pcsx_rearmed: plugins/gpulib/gpulib_thread_if.c
/***************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpulib_thread_if.h"

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	BOOL running;
} video_thread_state;

static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;   /* THREAD_RENDERING_OFF/ON/SYNC, from the frontend */
static BOOL hold_cmds;         /* route new command lists to bg_queue (next frame) */
static BOOL needs_display;     /* a finished frame is waiting to be displayed */
static BOOL flushed;           /* bg_queue was force-flushed by renderer_sync() */

extern const unsigned char cmd_lengths[];

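/* Render thread main loop: sleep until the emu thread queues work, then
 * drain the active queue by handing each stored command list to the real
 * renderer via real_do_cmd_list(). */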
static void *video_thread_main(void *arg) {
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;

#ifdef _3DS
	static int processed = 0;
#endif /* _3DS */

	while(1) {
		int result, cpu_cycles = 0, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

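		/* Claim a contiguous run of the ring buffer. If the writer has
		 * wrapped around, only take entries up to the end of the array
		 * now; the wrapped-around part is picked up on the next pass. */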
		queue = thread->queue;
		start = queue->start;
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count,
					&cpu_cycles, &last_cmd);
			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif /* _3DS */
		}

		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return 0;
}

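/* Promote the background queue to the active queue once the render thread
 * has drained the current one, so a held-back frame can start rendering. */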
static void cmd_queue_swap() {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait() {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	if (thread.bg_queue->used) {
		/* When we flush the background queue, the vblank handler can't
		 * know that we had a frame pending, and we delay rendering too
		 * long. Force it. */
		flushed = TRUE;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will clear the DMA, resulting in
	 * flickering sprites. We need to be totally up-to-date. This may
	 * drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = FALSE;
	renderer_wait();
}

static void video_thread_stop() {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = FALSE;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

static void video_thread_start() {
	fprintf(stdout, "Starting render thread\n");

	/* Set up the queues and the running flag before creating the worker,
	 * so it never observes a half-initialized state. */
	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = TRUE;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	fprintf(stderr, "Failed to start rendering thread\n");
	thread.running = FALSE;
	video_thread_stop();
}

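/* Copy a scanned command list and append it to a queue: the active queue
 * (waking the worker) in normal operation, or the background queue for the
 * next frame while hold_cmds is set. */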
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	BOOL lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory; disable the thread and run sync from now on.
		 * Bail out here so we don't memcpy() into a NULL buffer -- this
		 * command list is dropped, which beats crashing. */
		fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

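	/* Held commands go to the background queue, which only this (emu)
	 * thread touches, so it needs no locking; otherwise use the active
	 * queue under the queue lock. */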
	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = FALSE;
	} else {
		queue = thread.queue;
		lock = TRUE;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
{
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		cmd = list[0] >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
			case 0x02:
				/* VRAM fill rectangle */
				break;
			case 0x24 ... 0x27:
			case 0x2c ... 0x2f:
			case 0x34 ... 0x37:
			case 0x3c ... 0x3f:
				/* textured prims: mirror the texpage bits into e1 */
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
				break;
			case 0x48 ... 0x4F:
				/* monochrome poly-line: variable length, scan for the
				 * 0x5xxx5xxx terminator */
				for (v = 3; pos + v < count; v++)
				{
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 3;
				break;
			case 0x58 ... 0x5F:
				/* shaded poly-line: two words per vertex */
				for (v = 4; pos + v < count; v += 2)
				{
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 4;
				break;
			default:
				if ((cmd & 0xf8) == 0xe0)
					gpu.ex_regs[cmd & 7] = list[0];
				break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0xa0 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*last_cmd = cmd;
	return pos;
}

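/* gpulib entry point: when the render thread is running, queue only the
 * prefix of the list that scan_cmd_list deems safe to run asynchronously
 * and return how far we got; gpulib handles the remainder (image i/o,
 * incomplete commands) itself. */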
int do_cmd_list(uint32_t *list, int count, int *cycles, int *last_cmd) {
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, cycles, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

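/* ecmds[1..6] hold the e1-e6 state command words gpulib wants replayed;
 * when threaded, push them through the normal queue so they apply in
 * order with the other queued work. */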
void renderer_sync_ecmds(uint32_t * ecmds) {
	if (thread.running) {
		int dummy = 0;
		do_cmd_list(&ecmds[1], 6, &dummy, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

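/* VRAM was modified behind the renderer's back (e.g. by a DMA transfer),
 * so make sure the worker is fully caught up before the real renderer
 * refreshes its caches from VRAM. */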
void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run < 50/60fps, this is unnecessary
 * -- it forces the game to render as if it were running at 60fps and
 * leaves the GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame).
 *
 * - If the frame is done and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which should be no different
 * from the real device. It may skip rendering a frame entirely if a
 * VRAM transfer happens while a frame is waiting, or in games that
 * natively run at 60fps if frames are coming in too quickly to
 * process. Depending on how the game treats "60fps," this may not be
 * noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used || flushed) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		flushed = FALSE;
		hold_cmds = FALSE;
		needs_display = TRUE;
		gpu.state.fb_dirty = TRUE;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = TRUE;
		needs_display = TRUE;
		gpu.state.fb_dirty = FALSE;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is TRUE. */
		hold_cmds = FALSE;
		needs_display = FALSE;
		gpu.state.fb_dirty = TRUE;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

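/* Display mode changed; finish any queued work first so the real renderer
 * doesn't reconfigure while commands for the old mode are still pending. */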
void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}