/**************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpu.h"
#include "gpu_timing.h"
#include "gpulib_thread_if.h"

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

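/* Capacity of each command ring; 0x2000 = 8192 entries. A power-of-two
 * size keeps the (x % QUEUE_SIZE) wrap-around cheap. */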
#define QUEUE_SIZE 0x2000

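/* Single-producer, single-consumer ring: the emulator thread advances
 * end, the render thread advances start, and used counts occupied
 * slots. The active queue is only touched under queue_lock; the
 * background queue is only ever touched by the emulator thread. */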
typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	BOOL running;
} video_thread_state;

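/* hold_cmds diverts incoming commands to the background queue (they
 * belong to a future frame); needs_display records that a finished
 * frame still has to be presented; flushed notes that renderer_sync()
 * force-flushed the background queue, so the next vblank must not
 * delay rendering. */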
static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static BOOL hold_cmds;
static BOOL needs_display;
static BOOL flushed;

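/* Fixed word count for each GPU command byte, defined by gpulib;
 * variable-length commands (poly-lines, image i/o) are special-cased
 * in scan_cmd_list() below. */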
extern const unsigned char cmd_lengths[];

static void *video_thread_main(void *arg) {
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;

#ifdef _3DS
	static int processed = 0;
#endif /* _3DS */

	while(1) {
		int result, cycles_dummy = 0, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

		queue = thread->queue;
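		/* Snapshot the drain range while holding the lock. If the
		 * ring has wrapped, only drain up to the end of the array
		 * now; the remainder is picked up on the next iteration. */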
		start = queue->start;
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count,
					&cycles_dummy, &cycles_dummy, &last_cmd);
			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n",
						cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif /* _3DS */
		}

		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return NULL;
}

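/* Promote the background queue to the active queue, but only once the
 * render thread has fully drained the active one; otherwise commands
 * from two different frames could interleave. */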
static void cmd_queue_swap(void) {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	if (thread.bg_queue->used) {
		/* When we flush the background queue, the vblank handler can't
		 * know that we had a frame pending, and we delay rendering too
		 * long. Force it. */
		flushed = TRUE;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will clear the DMA, resulting in
	 * flickering sprites. We need to be totally up-to-date. This may
	 * drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = FALSE;
	renderer_wait();
}

static void video_thread_stop(void) {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = FALSE;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

static void video_thread_start(void) {
	fprintf(stdout, "Starting render thread\n");

	/* The queue pointers and running flag must be valid before the
	 * worker starts, or video_thread_main() could dereference a NULL
	 * queue or exit immediately. */
	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = TRUE;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	fprintf(stderr, "Failed to start rendering thread\n");
	thread.running = FALSE;
	video_thread_stop();
}
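/* Queue one command list for the render thread. The list is deep-copied
 * because the caller's buffer (typically a window into emulated RAM) is
 * only valid for the duration of this call. */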
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	BOOL lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory; disable the thread and run synchronously from
		 * now on. Drop this list rather than copy through NULL. */
		fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = FALSE;
	} else {
		queue = thread.queue;
		lock = TRUE;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count,
	int *cycles_sum_out, int *cycles_last, int *last_cmd)
{
	int cpu_cycles_sum = 0, cpu_cycles = *cycles_last;
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		short *slist = (void *)list;
		cmd = LE32TOH(list[0]) >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
			case 0x02:
				gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_fill(LE16TOH(slist[4]) & 0x3ff,
						LE16TOH(slist[5]) & 0x1ff));
				break;
			case 0x20 ... 0x23:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base());
				break;
			case 0x24 ... 0x27:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_t());
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
				break;
			case 0x28 ... 0x2b:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base());
				break;
			case 0x2c ... 0x2f:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
				break;
			case 0x30 ... 0x33:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
				break;
			case 0x34 ... 0x37:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
				break;
			case 0x38 ... 0x3b:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
				break;
			case 0x3c ... 0x3f:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
				break;
			case 0x40 ... 0x47:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				break;
			case 0x48 ... 0x4F:
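				/* mono poly-line: one extra word per vertex,
				 * terminated by a 0x5xxx5xxx marker word
				 * (normally 0x55555555) */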
				for (v = 3; pos + v < count; v++)
				{
					gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 3;
				break;
			case 0x50 ... 0x57:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				break;
			case 0x58 ... 0x5F:
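				/* shaded poly-line: two words (color + vertex)
				 * per extra segment, same terminator */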
				for (v = 4; pos + v < count; v += 2)
				{
					gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 4;
				break;
			case 0x60 ... 0x63:
				gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_sprite(LE16TOH(slist[4]) & 0x3ff,
						LE16TOH(slist[5]) & 0x1ff));
				break;
			case 0x64 ... 0x67:
				gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_sprite(LE16TOH(slist[6]) & 0x3ff,
						LE16TOH(slist[7]) & 0x1ff));
				break;
			case 0x68 ... 0x6b:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
				break;
			case 0x70 ... 0x77:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(8, 8));
				break;
			case 0x78 ... 0x7f:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(16, 16));
				break;
			default:
				if ((cmd & 0xf8) == 0xe0)
					gpu.ex_regs[cmd & 7] = list[0];
				break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0x80 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*cycles_sum_out += cpu_cycles_sum;
	*cycles_last = cpu_cycles;
	*last_cmd = cmd;
	return pos;
}

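/* gpulib entry point: with the render thread running, only scan the
 * list here (updating ex_regs and timing synchronously) and defer the
 * actual rasterization to the thread; otherwise render in place. */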
int do_cmd_list(uint32_t *list, int count,
	int *cycles_sum, int *cycles_last, int *last_cmd)
{
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

void renderer_sync_ecmds(uint32_t *ecmds) {
	if (thread.running) {
		int dummy = 0;
		do_cmd_list(&ecmds[1], 6, &dummy, &dummy, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps this is
 * unnecessary -- it forces the game to render as if it were 60fps and
 * leaves the GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next
 *   frame).
 *
 * - If the frame is done and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which shouldn't differ from
 * the real device. It may skip rendering a frame entirely if a VRAM
 * transfer happens while a frame is waiting, or in games that natively
 * run at 60fps if frames are coming in too quickly to process.
 * Depending on how the game treats "60fps," this may not be
 * noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used || flushed) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		flushed = FALSE;
		hold_cmds = FALSE;
		needs_display = TRUE;
		gpu.state.fb_dirty = TRUE;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = TRUE;
		needs_display = TRUE;
		gpu.state.fb_dirty = FALSE;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is TRUE. */
		hold_cmds = FALSE;
		needs_display = FALSE;
		gpu.state.fb_dirty = TRUE;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}