/**************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpu.h"
#include "gpu_timing.h"
#include "gpulib_thread_if.h"

extern void SysPrintf(const char *fmt, ...);

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	BOOL running;
} video_thread_state;

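/* Two queues are used: the worker thread drains `queue`, while
 * `bg_queue` collects commands meant for a later frame whenever
 * hold_cmds is set. cmd_queue_swap() promotes the background queue
 * once the active one has drained. */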
static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static BOOL hold_cmds;
static BOOL needs_display;
static BOOL flushed;

extern const unsigned char cmd_lengths[];

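/* Worker thread: drains batches of display-list commands from the
 * active queue and feeds them to the real renderer. */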
static void *video_thread_main(void *arg) {
	video_thread_cmd *cmd;
	int i;

#ifdef _3DS
	static int processed = 0;
#endif /* _3DS */

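	/* FPSCR is per-thread state on ARM, so set flush-to-zero and
	 * default-NaN ("RunFast" mode) for this worker as well; otherwise
	 * denormal handling can slow the thread down. */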
#if defined(__arm__) && defined(__ARM_FP)
	// RunFast mode
	uint32_t fpscr = ~0;
	__asm__ volatile("vmrs %0, fpscr" : "=r"(fpscr));
	fpscr &= ~0x00009f9f;
	fpscr |= 0x03000000; // DN | FZ
	__asm__ volatile("vmsr fpscr, %0" :: "r"(fpscr));
#endif

	while(1) {
		int result, cycles_dummy = 0, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread.queue_lock);

		while (!thread.queue->used && thread.running) {
			pthread_cond_wait(&thread.cond_msg_avail, &thread.queue_lock);
		}

		if (!thread.running) {
			pthread_mutex_unlock(&thread.queue_lock);
			break;
		}

		queue = thread.queue;
		start = queue->start;
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread.queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count,
					&cycles_dummy, &cycles_dummy, &last_cmd);
			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif /* _3DS */
		}

		pthread_mutex_lock(&thread.queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread.cond_queue_empty);

		pthread_cond_signal(&thread.cond_msg_done);
		pthread_mutex_unlock(&thread.queue_lock);
	}

	return 0;
}

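/* If the worker thread has drained the active queue, swap it with the
 * background queue so that held-back commands start executing. */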
static void cmd_queue_swap() {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait() {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	if (thread.bg_queue->used) {
		/* When we flush the background queue, the vblank handler can't
		 * know that we had a frame pending, and we delay rendering too
		 * long. Force it. */
		flushed = TRUE;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will clear the DMA, resulting in
	 * flickering sprites. We need to be totally up-to-date. This may
	 * drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = FALSE;
	renderer_wait();
}

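/* Stop the worker thread (if it is running), destroy the
 * synchronization objects, and free any command lists still held by
 * either queue. */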
static void video_thread_stop() {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = FALSE;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

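/* Create the synchronization objects and spawn the worker thread; on
 * failure everything is torn down again and rendering stays
 * synchronous. */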
static void video_thread_start() {
	SysPrintf("Starting render thread\n");

	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = TRUE;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

error:
	SysPrintf("Failed to start rendering thread\n");
	thread.running = FALSE;
	video_thread_stop();
}

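/* Copy `count` words of the display list and append them to the active
 * queue (waking the worker), or to the background queue for a later
 * frame while hold_cmds is set. */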
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	BOOL lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory, disable the thread and run sync from now on */
		SysPrintf("Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = FALSE;
	} else {
		queue = thread.queue;
		lock = TRUE;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
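/* Scanning stops at VRAM image transfers (0x80-0xdf) and at any
 * truncated command, so anything past that point is not queued here. */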
static int scan_cmd_list(uint32_t *data, int count,
	int *cycles_sum_out, int *cycles_last, int *last_cmd)
{
	int cpu_cycles_sum = 0, cpu_cycles = *cycles_last;
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		short *slist = (void *)list;
		cmd = LE32TOH(list[0]) >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
		case 0x02:
			gput_sum(cpu_cycles_sum, cpu_cycles,
				gput_fill(LE16TOH(slist[4]) & 0x3ff,
					LE16TOH(slist[5]) & 0x1ff));
			break;
		case 0x20 ... 0x23:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base());
			break;
		case 0x24 ... 0x27:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_t());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
			break;
		case 0x28 ... 0x2b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base());
			break;
		case 0x2c ... 0x2f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
			break;
		case 0x30 ... 0x33:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
			break;
		case 0x34 ... 0x37:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
			break;
		case 0x38 ... 0x3b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
			break;
		case 0x3c ... 0x3f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
			gpu.ex_regs[1] &= ~0x1ff;
			gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
			break;
		case 0x40 ... 0x47:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
			break;
		case 0x48 ... 0x4F:
			for (v = 3; pos + v < count; v++)
			{
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 3;
			break;
		case 0x50 ... 0x57:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
			break;
		case 0x58 ... 0x5F:
			for (v = 4; pos + v < count; v += 2)
			{
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				if ((list[v] & 0xf000f000) == 0x50005000)
					break;
			}
			len += v - 4;
			break;
		case 0x60 ... 0x63:
			gput_sum(cpu_cycles_sum, cpu_cycles,
				gput_sprite(LE16TOH(slist[4]) & 0x3ff,
					LE16TOH(slist[5]) & 0x1ff));
			break;
		case 0x64 ... 0x67:
			gput_sum(cpu_cycles_sum, cpu_cycles,
				gput_sprite(LE16TOH(slist[6]) & 0x3ff,
					LE16TOH(slist[7]) & 0x1ff));
			break;
		case 0x68 ... 0x6b:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
			break;
		case 0x70 ... 0x77:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(8, 8));
			break;
		case 0x78 ... 0x7f:
			gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(16, 16));
			break;
		default:
			if ((cmd & 0xf8) == 0xe0)
				gpu.ex_regs[cmd & 7] = list[0];
			break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0x80 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*cycles_sum_out += cpu_cycles_sum;
	*cycles_last = cpu_cycles;
	*last_cmd = cmd;
	return pos;
}

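/* gpulib entry point: while the worker thread is running, scan the
 * async-safe part of the list and queue a copy of it; otherwise fall
 * through to the real renderer synchronously. */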
int do_cmd_list(uint32_t *list, int count,
	int *cycles_sum, int *cycles_last, int *last_cmd)
{
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

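/* Run the cached state commands through the regular queue path so they
 * are applied in order with any commands already queued. */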
void renderer_sync_ecmds(uint32_t *ecmds) {
	if (thread.running) {
		int dummy = 0;
		do_cmd_list(&ecmds[1], 6, &dummy, &dummy, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

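/* VRAM must be up to date before the renderer refreshes its caches, so
 * wait for both queues to drain first. */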
void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run < 50/60fps, this is unnecessary
 * -- it forces the game to render as if it were 60fps and leaves the
 * GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render:
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame).
 *
 * - If the frame is done and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which shouldn't differ much
 * from the real device. It may skip rendering a frame entirely if a
 * VRAM transfer happens while a frame is waiting, or in games that
 * natively run at 60fps if frames are coming in too quickly to
 * process. Depending on how the game treats "60fps," this may not be
 * noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used || flushed) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		flushed = FALSE;
		hold_cmds = FALSE;
		needs_display = TRUE;
		gpu.state.fb_dirty = TRUE;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = TRUE;
		needs_display = TRUE;
		gpu.state.fb_dirty = FALSE;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is TRUE. */
		hold_cmds = FALSE;
		needs_display = FALSE;
		gpu.state.fb_dirty = TRUE;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}