frontend: update libpicofe, fix missed callbacks
[pcsx_rearmed.git] / plugins / gpulib / gpulib_thread_if.c
/**************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpu.h"
#include "gpu_timing.h"
#include "gpulib_thread_if.h"

extern void SysPrintf(const char *fmt, ...);

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

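/* One queued unit of work: a heap-allocated copy of a GPU command list,
 * its length in 32-bit words, and the last command opcode seen while
 * scanning it. */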
typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

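/* Fixed-size ring buffer of pending command lists; start and end index
 * into the ring, used tracks how many entries are occupied. */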
typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

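/* Worker thread state: the live queue being drained by the thread, a
 * background queue that collects commands held back for the next frame,
 * and the lock plus condition variables that coordinate the two sides. */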
typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	BOOL running;
} video_thread_state;

static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static BOOL hold_cmds;
static BOOL needs_display;
static BOOL flushed;

extern const unsigned char cmd_lengths[];

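/* Worker thread: sleeps until commands are queued, then drains the live
 * queue in contiguous batches (wrapping at QUEUE_SIZE) and feeds each
 * command list to the real renderer via real_do_cmd_list(). Signals
 * cond_queue_empty when the queue drains and cond_msg_done after each
 * batch so producers waiting for space can continue. */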
static void *video_thread_main(void *arg) {
	video_thread_cmd *cmd;
	int i;

#ifdef _3DS
	static int processed = 0;
#endif /* _3DS */

#if defined(__arm__) && defined(__ARM_FP)
	// RunFast mode
	uint32_t fpscr = ~0;
	__asm__ volatile("vmrs %0, fpscr" : "=r"(fpscr));
	fpscr &= ~0x00009f9f;
	fpscr |= 0x03000000; // DN | FZ
	__asm__ volatile("vmsr fpscr, %0" :: "r"(fpscr));
#endif

	while (1) {
		int result, cycles_dummy = 0, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread.queue_lock);

		while (!thread.queue->used && thread.running) {
			pthread_cond_wait(&thread.cond_msg_avail, &thread.queue_lock);
		}

		if (!thread.running) {
			pthread_mutex_unlock(&thread.queue_lock);
			break;
		}

		queue = thread.queue;
		start = queue->start;
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread.queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count,
					&cycles_dummy, &cycles_dummy, &last_cmd);
			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif /* _3DS */
		}

		pthread_mutex_lock(&thread.queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread.cond_queue_empty);

		pthread_cond_signal(&thread.cond_msg_done);
		pthread_mutex_unlock(&thread.queue_lock);
	}

	return 0;
}

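/* Swaps the live and background queues, but only when the background
 * queue has work and the live queue has been fully drained; wakes the
 * worker so it picks up the newly promoted commands. */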
static void cmd_queue_swap() {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait() {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	if (thread.bg_queue->used) {
		/* When we flush the background queue, the vblank handler can't
		 * know that we had a frame pending, and we delay rendering too
		 * long. Force it. */
		flushed = TRUE;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will clear the DMA, resulting in
	 * flickering sprites. We need to be totally up-to-date. This may
	 * drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = FALSE;
	renderer_wait();
}

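/* Fully syncs both queues, asks the worker to exit, joins it, and
 * releases the synchronization primitives and any cached command list
 * allocations in both queues. */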
static void video_thread_stop() {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = FALSE;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

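/* Sets up the two queues and synchronization primitives and spawns the
 * worker thread; on any failure it logs and tears everything back down. */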
static void video_thread_start() {
	SysPrintf("Starting render thread\n");

	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = TRUE;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	SysPrintf("Failed to start rendering thread\n");
	thread.running = FALSE;
	video_thread_stop();
}

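/* Copies a command list to the heap and appends it to either the live
 * queue (taking the lock and blocking while the queue is full) or, when
 * hold_cmds is set, the background queue reserved for the next frame. */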
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	BOOL lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory, disable the thread and run sync from now on.
		 * Return here rather than fall through to the memcpy below with
		 * a NULL pointer; this one list is dropped. */
		SysPrintf("Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = FALSE;
	} else {
		queue = thread.queue;
		lock = TRUE;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}


/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count,
		int *cycles_sum_out, int *cycles_last, int *last_cmd)
{
	int cpu_cycles_sum = 0, cpu_cycles = *cycles_last;
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		short *slist = (void *)list;
		cmd = LE32TOH(list[0]) >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
		case 0x02:
			gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_fill(LE16TOH(slist[4]) & 0x3ff,
						LE16TOH(slist[5]) & 0x1ff));
			break;
		case 0x20 ... 0x23:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base());
			break;
		case 0x24 ... 0x27:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_t());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
			break;
		case 0x28 ... 0x2b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base());
			break;
		case 0x2c ... 0x2f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
			break;
		case 0x30 ... 0x33:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
			break;
		case 0x34 ... 0x37:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
			break;
		case 0x38 ... 0x3b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
			break;
		case 0x3c ... 0x3f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
			break;
		case 0x40 ... 0x47:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
			break;
		case 0x48 ... 0x4F:
			for (v = 3; pos + v < count; v++)
			{
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 3;
			break;
		case 0x50 ... 0x57:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
			break;
		case 0x58 ... 0x5F:
			for (v = 4; pos + v < count; v += 2)
			{
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 4;
			break;
		case 0x60 ... 0x63:
			gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_sprite(LE16TOH(slist[4]) & 0x3ff,
						LE16TOH(slist[5]) & 0x1ff));
			break;
		case 0x64 ... 0x67:
			gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_sprite(LE16TOH(slist[6]) & 0x3ff,
						LE16TOH(slist[7]) & 0x1ff));
			break;
		case 0x68 ... 0x6b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
			break;
		case 0x70 ... 0x77:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(8, 8));
			break;
		case 0x78 ... 0x7f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(16, 16));
			break;
		default:
			if ((cmd & 0xf8) == 0xe0)
				gpu.ex_regs[cmd & 7] = list[0];
			break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0x80 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*cycles_sum_out += cpu_cycles_sum;
	*cycles_last = cpu_cycles;
	*last_cmd = cmd;
	return pos;
}

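/* Replacement for the real renderer's do_cmd_list entry point. While the
 * worker is running, only the prefix that scan_cmd_list() considers safe
 * to run asynchronously is queued; otherwise the list goes straight to
 * real_do_cmd_list() and ex_regs are copied back from the real renderer. */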
int do_cmd_list(uint32_t *list, int count,
		int *cycles_sum, int *cycles_last, int *last_cmd)
{
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

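/* Standard gpulib renderer hooks: bring the worker thread up and down
 * alongside the real renderer, but only when thread_rendering is enabled. */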
int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

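/* The state (e*) command words are pushed through the normal queue so
 * they stay ordered with the drawing commands already queued; without
 * the thread they are handed straight to the real renderer. */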
void renderer_sync_ecmds(uint32_t *ecmds) {
	if (thread.running) {
		int dummy = 0;
		do_cmd_list(&ecmds[1], 6, &dummy, &dummy, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

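/* VRAM was changed outside the renderer (e.g. by a transfer), so bring
 * the worker fully up-to-date before the real renderer refreshes
 * whatever it caches. */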
void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps this is
 * unnecessary -- it forces the game to render as if it were 60fps and
 * leaves the GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame).
 *
 * - If the frame is done and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which shouldn't be much
 * different from the real hardware. It may skip rendering a frame
 * entirely if a VRAM transfer happens while a frame is waiting, or in
 * games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used || flushed) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		flushed = FALSE;
		hold_cmds = FALSE;
		needs_display = TRUE;
		gpu.state.fb_dirty = TRUE;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = TRUE;
		needs_display = TRUE;
		gpu.state.fb_dirty = FALSE;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is TRUE. */
		hold_cmds = FALSE;
		needs_display = FALSE;
		gpu.state.fb_dirty = TRUE;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

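/* Applies the frontend's thread_rendering option, starting or stopping
 * the worker thread when the setting changes at runtime. */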
void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}