/* plugins/gpulib/gpulib_thread_if.c */
/**************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpu.h"
#include "gpu_timing.h"
#include "gpulib_thread_if.h"

extern void SysPrintf(const char *fmt, ...);

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

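/* One queued unit of work: a heap-allocated copy of a GP0 command list, its
 * length in 32-bit words, and the last command byte reported by the scanner. */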
typedef struct {
        uint32_t *cmd_list;
        int count;
        int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

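/* Fixed-size ring buffer of commands. start and end index into queue[], and
 * used counts occupied slots: the emu thread produces at end, the worker
 * thread consumes from start. */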
typedef struct {
        size_t start;
        size_t end;
        size_t used;
        video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

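/* Worker thread state: the active queue handed to the render thread, plus a
 * background queue that buffers commands for the next frame while the current
 * one is still being drawn. */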
typedef struct {
        pthread_t thread;
        pthread_mutex_t queue_lock;
        pthread_cond_t cond_msg_avail;
        pthread_cond_t cond_msg_done;
        pthread_cond_t cond_queue_empty;
        video_thread_queue *queue;
        video_thread_queue *bg_queue;
        BOOL running;
} video_thread_state;

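/* thread_rendering caches the mode requested through rearmed_cbs
 * (THREAD_RENDERING_OFF disables the thread, THREAD_RENDERING_SYNC forces a
 * full sync every vblank). hold_cmds redirects new command lists to the
 * background queue, needs_display marks a finished frame waiting to be
 * presented, and flushed records that renderer_sync() force-flushed the
 * background queue mid-frame. */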
static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static BOOL hold_cmds;
static BOOL needs_display;
static BOOL flushed;

extern const unsigned char cmd_lengths[];

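/* Worker thread entry point: wait for work on the active queue, grab a
 * contiguous run of entries, execute each saved command list with
 * real_do_cmd_list(), and signal waiters as the queue drains. */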
static void *video_thread_main(void *arg) {
        video_thread_cmd *cmd;
        int i;

#ifdef _3DS
        static int processed = 0;
#endif /* _3DS */

#if defined(__arm__) && defined(__ARM_FP)
        // RunFast mode
        uint32_t fpscr = ~0;
        __asm__ volatile("vmrs %0, fpscr" : "=r"(fpscr));
        fpscr &= ~0x00009f9f;
        fpscr |=  0x03000000; // DN | FZ
        __asm__ volatile("vmsr fpscr, %0" :: "r"(fpscr));
#endif

        while(1) {
                int result, cycles_dummy = 0, last_cmd, start, end;
                video_thread_queue *queue;
                pthread_mutex_lock(&thread.queue_lock);

                while (!thread.queue->used && thread.running) {
                        pthread_cond_wait(&thread.cond_msg_avail, &thread.queue_lock);
                }

                if (!thread.running) {
                        pthread_mutex_unlock(&thread.queue_lock);
                        break;
                }

                queue = thread.queue;
                start = queue->start;
                end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
                queue->start = end % QUEUE_SIZE;
                pthread_mutex_unlock(&thread.queue_lock);

                for (i = start; i < end; i++) {
                        cmd = &queue->queue[i];
                        result = real_do_cmd_list(cmd->cmd_list, cmd->count,
                                        &cycles_dummy, &cycles_dummy, &last_cmd);
                        if (result != cmd->count) {
                                fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
                        }

#ifdef _3DS
                        /* Periodically yield so as not to starve other threads */
                        processed += cmd->count;
                        if (processed >= 512) {
                                svcSleepThread(1);
                                processed %= 512;
                        }
#endif /* _3DS */
                }

                pthread_mutex_lock(&thread.queue_lock);
                queue->used -= (end - start);

                if (!queue->used)
                        pthread_cond_signal(&thread.cond_queue_empty);

                pthread_cond_signal(&thread.cond_msg_done);
                pthread_mutex_unlock(&thread.queue_lock);
        }

        return 0;
}

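/* Promote the background queue to the active queue (only once the active
 * queue has drained) and wake the worker so it starts on the next frame. */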
static void cmd_queue_swap() {
        video_thread_queue *tmp;
        if (!thread.bg_queue->used) return;

        pthread_mutex_lock(&thread.queue_lock);
        if (!thread.queue->used) {
                tmp = thread.queue;
                thread.queue = thread.bg_queue;
                thread.bg_queue = tmp;
                pthread_cond_signal(&thread.cond_msg_avail);
        }
        pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait() {
        if (!thread.running) return;

        /* Not completely safe, but should be fine since the render thread
         * only decreases used, and we check again inside the lock. */
        if (!thread.queue->used) {
                return;
        }

        pthread_mutex_lock(&thread.queue_lock);

        while (thread.queue->used) {
                pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
        }

        pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
        if (!thread.running) return;

        /* Not completely safe, but should be fine since the render thread
         * only decreases used, and we check again inside the lock. */
        if (!thread.queue->used && !thread.bg_queue->used) {
                return;
        }

        if (thread.bg_queue->used) {
                /* When we flush the background queue, the vblank handler can't
                 * know that we had a frame pending, and we delay rendering too
                 * long. Force it. */
                flushed = TRUE;
        }

        /* Flush both queues. This is necessary because gpulib could be
         * trying to process a DMA write that a command in the queue should
         * run beforehand. For example, Xenogears sprites write a black
         * rectangle over the to-be-DMA'd spot in VRAM -- if this write
         * happens after the DMA, it will overwrite the DMA'd data, resulting
         * in flickering sprites. We need to be totally up-to-date. This may
         * drop a frame. */
        renderer_wait();
        cmd_queue_swap();
        hold_cmds = FALSE;
        renderer_wait();
}

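/* Sync, shut down the worker thread, and free every queued command list in
 * both queues. Also the fallback path when startup or a command-list
 * allocation fails. */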
static void video_thread_stop() {
        int i;
        renderer_sync();

        if (thread.running) {
                thread.running = FALSE;
                pthread_cond_signal(&thread.cond_msg_avail);
                pthread_join(thread.thread, NULL);
        }

        pthread_mutex_destroy(&thread.queue_lock);
        pthread_cond_destroy(&thread.cond_msg_avail);
        pthread_cond_destroy(&thread.cond_msg_done);
        pthread_cond_destroy(&thread.cond_queue_empty);

        for (i = 0; i < QUEUE_SIZE; i++) {
                video_thread_cmd *cmd = &thread.queue->queue[i];
                free(cmd->cmd_list);
                cmd->cmd_list = NULL;
        }

        for (i = 0; i < QUEUE_SIZE; i++) {
                video_thread_cmd *cmd = &thread.bg_queue->queue[i];
                free(cmd->cmd_list);
                cmd->cmd_list = NULL;
        }
}

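/* Initialize both queues and the synchronization primitives, then launch the
 * worker thread. On any failure, fall back to synchronous rendering. */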
static void video_thread_start() {
        SysPrintf("Starting render thread\n");

        thread.queue = &queues[0];
        thread.bg_queue = &queues[1];
        thread.running = TRUE;

        if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
                        pthread_cond_init(&thread.cond_msg_done, NULL) ||
                        pthread_cond_init(&thread.cond_queue_empty, NULL) ||
                        pthread_mutex_init(&thread.queue_lock, NULL) ||
                        pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
                goto error;
        }

        return;

 error:
        SysPrintf("Failed to start rendering thread\n");
        thread.running = FALSE;
        video_thread_stop();
}

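/* Copy a command list onto the heap and append it to either the active queue
 * (waking the worker) or, while hold_cmds is set, the background queue for
 * the next frame. Blocks if the active queue is full. */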
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
        video_thread_cmd *cmd;
        uint32_t *cmd_list;
        video_thread_queue *queue;
        BOOL lock;

        cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

        if (!cmd_list) {
                /* Out of memory, disable the thread and run sync from now on */
                SysPrintf("Failed to allocate render thread command list, stopping thread\n");
                video_thread_stop();
                return;
        }

        memcpy(cmd_list, list, count * sizeof(uint32_t));

        if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
                /* If the bg queue is full, do a full sync to empty both queues
                 * and clear space. This should be very rare, I've only seen it in
                 * Tekken 3 post-battle-replay. */
                renderer_sync();
        }

        if (hold_cmds) {
                queue = thread.bg_queue;
                lock = FALSE;
        } else {
                queue = thread.queue;
                lock = TRUE;
        }

        if (lock) {
                pthread_mutex_lock(&thread.queue_lock);

                while (queue->used >= QUEUE_SIZE) {
                        pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
                }
        }

        cmd = &queue->queue[queue->end];
        free(cmd->cmd_list);
        cmd->cmd_list = cmd_list;
        cmd->count = count;
        cmd->last_cmd = last_cmd;
        queue->end = (queue->end + 1) % QUEUE_SIZE;
        queue->used++;

        if (lock) {
                pthread_cond_signal(&thread.cond_msg_avail);
                pthread_mutex_unlock(&thread.queue_lock);
        }
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count,
        int *cycles_sum_out, int *cycles_last, int *last_cmd)
{
        int cpu_cycles_sum = 0, cpu_cycles = *cycles_last;
        int cmd = 0, pos = 0, len, v;

        while (pos < count) {
                uint32_t *list = data + pos;
                short *slist = (void *)list;
                cmd = LE32TOH(list[0]) >> 24;
                len = 1 + cmd_lengths[cmd];

                switch (cmd) {
                        case 0x02:
                                gput_sum(cpu_cycles_sum, cpu_cycles,
                                        gput_fill(LE16TOH(slist[4]) & 0x3ff,
                                                LE16TOH(slist[5]) & 0x1ff));
                                break;
                        case 0x20 ... 0x23:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base());
                                break;
                        case 0x24 ... 0x27:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_t());
                                gpu.ex_regs[1] &= ~0x1ff;
                                gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
                                break;
                        case 0x28 ... 0x2b:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base());
                                break;
                        case 0x2c ... 0x2f:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
                                gpu.ex_regs[1] &= ~0x1ff;
                                gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
                                break;
                        case 0x30 ... 0x33:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
                                break;
                        case 0x34 ... 0x37:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
                                gpu.ex_regs[1] &= ~0x1ff;
                                gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
                                break;
                        case 0x38 ... 0x3b:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
                                break;
                        case 0x3c ... 0x3f:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
                                gpu.ex_regs[1] &= ~0x1ff;
                                gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
                                break;
                        case 0x40 ... 0x47:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
                                break;
                        case 0x48 ... 0x4F:
                                for (v = 3; pos + v < count; v++)
                                {
                                        gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
                                        if ((list[v] & 0xf000f000) == 0x50005000)
                                                break;
                                }
                                len += v - 3;
                                break;
                        case 0x50 ... 0x57:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
                                break;
                        case 0x58 ... 0x5F:
                                for (v = 4; pos + v < count; v += 2)
                                {
                                        gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
                                        if ((list[v] & 0xf000f000) == 0x50005000)
                                                break;
                                }
                                len += v - 4;
                                break;
                        case 0x60 ... 0x63:
                                gput_sum(cpu_cycles_sum, cpu_cycles,
                                        gput_sprite(LE16TOH(slist[4]) & 0x3ff,
                                                LE16TOH(slist[5]) & 0x1ff));
                                break;
                        case 0x64 ... 0x67:
                                gput_sum(cpu_cycles_sum, cpu_cycles,
                                        gput_sprite(LE16TOH(slist[6]) & 0x3ff,
                                                LE16TOH(slist[7]) & 0x1ff));
                                break;
                        case 0x68 ... 0x6b:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
                                break;
                        case 0x70 ... 0x77:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(8, 8));
                                break;
                        case 0x78 ... 0x7f:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(16, 16));
                                break;
                        default:
                                if ((cmd & 0xf8) == 0xe0)
                                        gpu.ex_regs[cmd & 7] = list[0];
                                break;
                }

                if (pos + len > count) {
                        cmd = -1;
                        break; /* incomplete cmd */
                }
                if (0x80 <= cmd && cmd <= 0xdf)
                        break; /* image i/o */

                pos += len;
        }

        *cycles_sum_out += cpu_cycles_sum;
        *cycles_last = cpu_cycles;
        *last_cmd = cmd;
        return pos;
}

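/* gpulib entry point. With the worker running, scan the incoming list for the
 * async-safe prefix, queue a copy of it, and report that much as processed;
 * otherwise fall through to the synchronous renderer. */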
int do_cmd_list(uint32_t *list, int count,
 int *cycles_sum, int *cycles_last, int *last_cmd)
{
        int pos = 0;

        if (thread.running) {
                pos = scan_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
                video_thread_queue_cmd(list, pos, *last_cmd);
        } else {
                pos = real_do_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
                memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
        }
        return pos;
}

int renderer_init(void) {
        if (thread_rendering) {
                video_thread_start();
        }
        return real_renderer_init();
}

void renderer_finish(void) {
        real_renderer_finish();

        if (thread_rendering && thread.running) {
                video_thread_stop();
        }
}

void renderer_sync_ecmds(uint32_t * ecmds) {
        if (thread.running) {
                int dummy = 0;
                do_cmd_list(&ecmds[1], 6, &dummy, &dummy, &dummy);
        } else {
                real_renderer_sync_ecmds(ecmds);
        }
}

void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
        renderer_sync();
        real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
        /* Called during DMA and updateLace. We want to sync if it's DMA,
         * but not if it's updateLace. Instead of syncing here, there's a
         * renderer_sync call during DMA. */
        real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps, this is
 * unnecessary -- it forces the game to render as if it were running
 * at 60fps and leaves the GPU idle half the time on a 30fps game,
 * for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame).
 *
 * - If the frame is done, and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which should be no
 * different from the real hardware. It may skip rendering a frame
 * entirely if a VRAM transfer happens while a frame is waiting, or in
 * games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
        if (!thread.running) return;

        if (thread_rendering == THREAD_RENDERING_SYNC) {
                renderer_sync();
                return;
        }

        if (updated) {
                cmd_queue_swap();
                return;
        }

        pthread_mutex_lock(&thread.queue_lock);
        if (thread.bg_queue->used || flushed) {
                /* We have commands for a future frame to run. Force a wait until
                 * the current frame is finished, and start processing the next
                 * frame after it's drawn (see the `updated` clause above). */
                pthread_mutex_unlock(&thread.queue_lock);
                renderer_wait();
                pthread_mutex_lock(&thread.queue_lock);

                /* We are no longer holding commands back, so the next frame may
                 * get mixed into the following frame. This is usually fine, but can
                 * result in frameskip-like effects for 60fps games. */
                flushed = FALSE;
                hold_cmds = FALSE;
                needs_display = TRUE;
                gpu.state.fb_dirty = TRUE;
        } else if (thread.queue->used) {
                /* We are still drawing during a vblank. Cut off the current frame
                 * by sending new commands to the background queue and skip
                 * drawing our partly rendered frame to the display. */
                hold_cmds = TRUE;
                needs_display = TRUE;
                gpu.state.fb_dirty = FALSE;
        } else if (needs_display && !thread.queue->used) {
                /* We have processed all commands in the queue, render the
                 * buffer. We know we have something to render, because
                 * needs_display is TRUE. */
                hold_cmds = FALSE;
                needs_display = FALSE;
                gpu.state.fb_dirty = TRUE;
        } else {
                /* Everything went normally, so do the normal thing. */
        }

        pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
        real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
        renderer_sync();
        thread_rendering = cbs->thread_rendering;
        if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
                video_thread_start();
        } else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
                video_thread_stop();
        }
        real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
        renderer_sync();
        real_renderer_notify_res_change();
}