update gpulib_thread_if
plugins/gpulib/gpulib_thread_if.c
/***************************************************************************
 *   Copyright (C) 2020 The RetroArch Team                                 *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 2 of the License, or    *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,      *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
 *   GNU General Public License for more details.                         *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the                        *
 *   Free Software Foundation, Inc.,                                      *
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.          *
 ***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpu.h"
#include "gpu_timing.h"
#include "gpulib_thread_if.h"

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

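/* A queued slice of a GPU command list. The words are copied into a
 * heap buffer (see video_thread_queue_cmd) so the caller's DMA buffer
 * can be reused immediately. */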
typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

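/* Each queue is a fixed-size ring buffer: the emulation thread
 * produces at `end`, the render thread consumes from `start`, and
 * `used` counts the occupied slots. For example, with QUEUE_SIZE
 * 0x2000, start == 0x1ffe and end == 2 means slots 0x1ffe, 0x1fff,
 * 0 and 1 are occupied (used == 4). */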
typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	BOOL running;
} video_thread_state;

static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static BOOL hold_cmds;
static BOOL needs_display;
static BOOL flushed;

extern const unsigned char cmd_lengths[];

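/* Render thread entry point: sleeps on cond_msg_avail while the live
 * queue is empty, then drains it and feeds each queued command list
 * to the real renderer. */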
static void *video_thread_main(void *arg) {
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;

#ifdef _3DS
	static int processed = 0;
#endif /* _3DS */

	while (1) {
		int result, cycles_dummy = 0, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

		queue = thread->queue;
		start = queue->start;
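		/* Consume one contiguous batch. If the occupied region wraps
		 * past the end of the array, stop at QUEUE_SIZE; the wrapped
		 * remainder is picked up on the next loop iteration. */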
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count,
					&cycles_dummy, &cycles_dummy, &last_cmd);

			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif /* _3DS */
		}

		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return 0;
}

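/* Promote the background queue to the live queue, but only once the
 * render thread has fully drained the live one, and wake the render
 * thread if the swap happens. Called from renderer_sync and from the
 * vblank path in renderer_notify_update_lace. */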
static void cmd_queue_swap() {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait() {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	if (thread.bg_queue->used) {
		/* When we flush the background queue, the vblank handler can't
		 * know that we had a frame pending, and we delay rendering too
		 * long. Force it. */
		flushed = TRUE;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will clear the DMA, resulting in
	 * flickering sprites. We need to be totally up-to-date. This may
	 * drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = FALSE;
	renderer_wait();
}

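/* Drains both queues, stops the render thread, and frees every queued
 * command buffer. Also used as the out-of-memory fallback: once the
 * thread is stopped, do_cmd_list takes the synchronous path. */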
static void video_thread_stop() {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = FALSE;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

static void video_thread_start() {
	fprintf(stdout, "Starting render thread\n");

	/* The render thread dereferences the queue pointers as soon as it
	 * runs, so they (and `running`) must be set before pthread_create. */
	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = TRUE;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	fprintf(stderr, "Failed to start rendering thread\n");
	/* The thread never started, so there is nothing to join */
	thread.running = FALSE;
	video_thread_stop();
}

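/* Copies `count` words of the command list and appends them to the
 * live queue, or to the background queue while hold_cmds is set. The
 * background queue needs no locking, since the render thread never
 * touches it until it is swapped in. */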
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	BOOL lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory: disable the thread and run sync from now on,
		 * but execute this list synchronously so it isn't dropped. */
		int dummy = 0;
		fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		real_do_cmd_list(list, count, &dummy, &dummy, &last_cmd);
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = FALSE;
	} else {
		queue = thread.queue;
		lock = TRUE;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

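	/* Slots are recycled: a consumed slot keeps its old buffer until
	 * it is overwritten here, so free the previous allocation first
	 * (free(NULL) is a harmless no-op). */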
	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
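/* The scan also stops at VRAM transfer commands (0x80..0xdf), which
 * gpulib handles itself, and at any command that would extend past
 * the end of the buffer. Cycle costs are accumulated with the same
 * gput_* macros the synchronous path uses, so reported timing
 * matches. */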
static int scan_cmd_list(uint32_t *data, int count,
		int *cycles_sum_out, int *cycles_last, int *last_cmd)
{
	int cpu_cycles_sum = 0, cpu_cycles = *cycles_last;
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		short *slist = (void *)list;
		cmd = LE32TOH(list[0]) >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
		case 0x02:
			gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_fill(LE16TOH(slist[4]) & 0x3ff,
						LE16TOH(slist[5]) & 0x1ff));
			break;
		case 0x20 ... 0x23:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base());
			break;
		case 0x24 ... 0x27:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_t());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
			break;
		case 0x28 ... 0x2b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base());
			break;
		case 0x2c ... 0x2f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
			break;
		case 0x30 ... 0x33:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
			break;
		case 0x34 ... 0x37:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
			break;
		case 0x38 ... 0x3b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
			break;
		case 0x3c ... 0x3f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
			break;
		case 0x40 ... 0x47:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
			break;
		case 0x48 ... 0x4f:
			for (v = 3; pos + v < count; v++)
			{
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 3;
			break;
		case 0x50 ... 0x57:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
			break;
		case 0x58 ... 0x5f:
			for (v = 4; pos + v < count; v += 2)
			{
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 4;
			break;
		case 0x60 ... 0x63:
			gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_sprite(LE16TOH(slist[4]) & 0x3ff,
						LE16TOH(slist[5]) & 0x1ff));
			break;
		case 0x64 ... 0x67:
			gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_sprite(LE16TOH(slist[6]) & 0x3ff,
						LE16TOH(slist[7]) & 0x1ff));
			break;
		case 0x68 ... 0x6b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
			break;
		case 0x70 ... 0x77:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(8, 8));
			break;
		case 0x78 ... 0x7f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(16, 16));
			break;
		default:
			if ((cmd & 0xf8) == 0xe0)
				gpu.ex_regs[cmd & 7] = list[0];
			break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0x80 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*cycles_sum_out += cpu_cycles_sum;
	*cycles_last = cpu_cycles;
	*last_cmd = cmd;
	return pos;
}

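/* gpulib's per-list entry point. With the render thread running, only
 * the prefix accepted by scan_cmd_list is queued; the returned length
 * tells gpulib how much was consumed, and it handles the remainder
 * (e.g. image i/o) itself. */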
int do_cmd_list(uint32_t *list, int count,
		int *cycles_sum, int *cycles_last, int *last_cmd)
{
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

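/* Replays the e1..e6 state command words through the queue so the
 * render thread's draw state is brought up to date (gpulib uses this
 * when restoring state, e.g. after a savestate load). */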
void renderer_sync_ecmds(uint32_t *ecmds) {
	if (thread.running) {
		int dummy = 0;
		do_cmd_list(&ecmds[1], 6, &dummy, &dummy, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run < 50/60fps, this is unnecessary
 * -- it forces the game to render as if it were 60fps and leaves the
 * GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame)
 *
 * - If the frame is done, and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may possibly add a frame or two of latency that shouldn't
 * differ from the real device. It may skip rendering a frame
 * entirely if a VRAM transfer happens while a frame is waiting, or in
 * games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used || flushed) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		flushed = FALSE;
		hold_cmds = FALSE;
		needs_display = TRUE;
		gpu.state.fb_dirty = TRUE;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = TRUE;
		needs_display = TRUE;
		gpu.state.fb_dirty = FALSE;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is TRUE. */
		hold_cmds = FALSE;
		needs_display = FALSE;
		gpu.state.fb_dirty = TRUE;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

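/* Threaded rendering can be toggled at runtime by the frontend, so
 * sync and start or stop the thread to match the new setting before
 * passing the config through. */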
void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}