| 1 | /************************************************************************** |
| 2 | * Copyright (C) 2020 The RetroArch Team * |
| 3 | * * |
| 4 | * This program is free software; you can redistribute it and/or modify * |
| 5 | * it under the terms of the GNU General Public License as published by * |
| 6 | * the Free Software Foundation; either version 2 of the License, or * |
| 7 | * (at your option) any later version. * |
| 8 | * * |
| 9 | * This program is distributed in the hope that it will be useful, * |
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of * |
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * |
| 12 | * GNU General Public License for more details. * |
| 13 | * * |
| 14 | * You should have received a copy of the GNU General Public License * |
| 15 | * along with this program; if not, write to the * |
| 16 | * Free Software Foundation, Inc., * |
| 17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. * |
| 18 | ***************************************************************************/ |
| 19 | |
| 20 | #include <stdlib.h> |
| 21 | #include <stdio.h> |
| 22 | #include <string.h> |
| 23 | #include <pthread.h> |
| 24 | #include "../gpulib/gpu.h" |
| 25 | #include "../../frontend/plugin_lib.h" |
| 26 | #include "gpulib_thread_if.h" |
| 27 | |
/* Minimal boolean vocabulary used by the flags in this file. */
#define FALSE 0
#define TRUE  1
/* Storage type for those flags. */
#define BOOL  unsigned short
| 31 | |
/* One queued batch of GPU command words. */
typedef struct video_thread_cmd {
  /* Heap copy of the command words; released when the slot is reused
   * or when the thread is stopped. */
  uint32_t *cmd_list;
  /* Number of words in cmd_list. */
  int count;
  /* last_cmd value supplied by the producer when the batch was queued. */
  int last_cmd;
} video_thread_cmd;
| 37 | |
| 38 | #define QUEUE_SIZE 0x2000 |
| 39 | |
| 40 | typedef struct { |
| 41 | size_t start; |
| 42 | size_t end; |
| 43 | size_t used; |
| 44 | video_thread_cmd queue[QUEUE_SIZE]; |
| 45 | } video_thread_queue; |
| 46 | |
| 47 | typedef struct { |
| 48 | pthread_t thread; |
| 49 | pthread_mutex_t queue_lock; |
| 50 | pthread_cond_t cond_msg_avail; |
| 51 | pthread_cond_t cond_msg_done; |
| 52 | pthread_cond_t cond_queue_empty; |
| 53 | video_thread_queue *queue; |
| 54 | video_thread_queue *bg_queue; |
| 55 | BOOL running; |
| 56 | } video_thread_state; |
| 57 | |
| 58 | static video_thread_state thread; |
| 59 | static video_thread_queue queues[2]; |
| 60 | static int thread_rendering; |
| 61 | static BOOL hold_cmds; |
| 62 | static BOOL needs_display; |
| 63 | static BOOL flushed; |
| 64 | |
| 65 | extern const unsigned char cmd_lengths[]; |
| 66 | |
| 67 | static void *video_thread_main(void *arg) { |
| 68 | video_thread_state *thread = (video_thread_state *)arg; |
| 69 | video_thread_cmd *cmd; |
| 70 | int i; |
| 71 | |
| 72 | #ifdef _3DS |
| 73 | static int processed = 0; |
| 74 | #endif /* _3DS */ |
| 75 | |
| 76 | while(1) { |
| 77 | int result, last_cmd, start, end; |
| 78 | video_thread_queue *queue; |
| 79 | pthread_mutex_lock(&thread->queue_lock); |
| 80 | |
| 81 | while (!thread->queue->used && thread->running) { |
| 82 | pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock); |
| 83 | } |
| 84 | |
| 85 | if (!thread->running) { |
| 86 | pthread_mutex_unlock(&thread->queue_lock); |
| 87 | break; |
| 88 | } |
| 89 | |
| 90 | queue = thread->queue; |
| 91 | start = queue->start; |
| 92 | end = queue->end > queue->start ? queue->end : QUEUE_SIZE; |
| 93 | queue->start = end % QUEUE_SIZE; |
| 94 | pthread_mutex_unlock(&thread->queue_lock); |
| 95 | |
| 96 | for (i = start; i < end; i++) { |
| 97 | cmd = &queue->queue[i]; |
| 98 | result = real_do_cmd_list(cmd->cmd_list, cmd->count, &last_cmd); |
| 99 | |
| 100 | if (result != cmd->count) { |
| 101 | fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result); |
| 102 | } |
| 103 | |
| 104 | #ifdef _3DS |
| 105 | /* Periodically yield so as not to starve other threads */ |
| 106 | processed += cmd->count; |
| 107 | if (processed >= 512) { |
| 108 | svcSleepThread(1); |
| 109 | processed %= 512; |
| 110 | } |
| 111 | #endif /* _3DS */ |
| 112 | } |
| 113 | |
| 114 | pthread_mutex_lock(&thread->queue_lock); |
| 115 | queue->used -= (end - start); |
| 116 | |
| 117 | if (!queue->used) |
| 118 | pthread_cond_signal(&thread->cond_queue_empty); |
| 119 | |
| 120 | pthread_cond_signal(&thread->cond_msg_done); |
| 121 | pthread_mutex_unlock(&thread->queue_lock); |
| 122 | } |
| 123 | |
| 124 | return 0; |
| 125 | } |
| 126 | |
| 127 | static void cmd_queue_swap() { |
| 128 | video_thread_queue *tmp; |
| 129 | if (!thread.bg_queue->used) return; |
| 130 | |
| 131 | pthread_mutex_lock(&thread.queue_lock); |
| 132 | if (!thread.queue->used) { |
| 133 | tmp = thread.queue; |
| 134 | thread.queue = thread.bg_queue; |
| 135 | thread.bg_queue = tmp; |
| 136 | pthread_cond_signal(&thread.cond_msg_avail); |
| 137 | } |
| 138 | pthread_mutex_unlock(&thread.queue_lock); |
| 139 | } |
| 140 | |
| 141 | /* Waits for the main queue to completely finish. */ |
| 142 | void renderer_wait() { |
| 143 | if (!thread.running) return; |
| 144 | |
| 145 | /* Not completely safe, but should be fine since the render thread |
| 146 | * only decreases used, and we check again inside the lock. */ |
| 147 | if (!thread.queue->used) { |
| 148 | return; |
| 149 | } |
| 150 | |
| 151 | pthread_mutex_lock(&thread.queue_lock); |
| 152 | |
| 153 | while (thread.queue->used) { |
| 154 | pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock); |
| 155 | } |
| 156 | |
| 157 | pthread_mutex_unlock(&thread.queue_lock); |
| 158 | } |
| 159 | |
| 160 | /* Waits for all GPU commands in both queues to finish, bringing VRAM |
| 161 | * completely up-to-date. */ |
| 162 | void renderer_sync(void) { |
| 163 | if (!thread.running) return; |
| 164 | |
| 165 | /* Not completely safe, but should be fine since the render thread |
| 166 | * only decreases used, and we check again inside the lock. */ |
| 167 | if (!thread.queue->used && !thread.bg_queue->used) { |
| 168 | return; |
| 169 | } |
| 170 | |
| 171 | if (thread.bg_queue->used) { |
| 172 | /* When we flush the background queue, the vblank handler can't |
| 173 | * know that we had a frame pending, and we delay rendering too |
| 174 | * long. Force it. */ |
| 175 | flushed = TRUE; |
| 176 | } |
| 177 | |
| 178 | /* Flush both queues. This is necessary because gpulib could be |
| 179 | * trying to process a DMA write that a command in the queue should |
| 180 | * run beforehand. For example, Xenogears sprites write a black |
| 181 | * rectangle over the to-be-DMA'd spot in VRAM -- if this write |
| 182 | * happens after the DMA, it will clear the DMA, resulting in |
| 183 | * flickering sprites. We need to be totally up-to-date. This may |
| 184 | * drop a frame. */ |
| 185 | renderer_wait(); |
| 186 | cmd_queue_swap(); |
| 187 | hold_cmds = FALSE; |
| 188 | renderer_wait(); |
| 189 | } |
| 190 | |
| 191 | static void video_thread_stop() { |
| 192 | int i; |
| 193 | renderer_sync(); |
| 194 | |
| 195 | if (thread.running) { |
| 196 | thread.running = FALSE; |
| 197 | pthread_cond_signal(&thread.cond_msg_avail); |
| 198 | pthread_join(thread.thread, NULL); |
| 199 | } |
| 200 | |
| 201 | pthread_mutex_destroy(&thread.queue_lock); |
| 202 | pthread_cond_destroy(&thread.cond_msg_avail); |
| 203 | pthread_cond_destroy(&thread.cond_msg_done); |
| 204 | pthread_cond_destroy(&thread.cond_queue_empty); |
| 205 | |
| 206 | for (i = 0; i < QUEUE_SIZE; i++) { |
| 207 | video_thread_cmd *cmd = &thread.queue->queue[i]; |
| 208 | free(cmd->cmd_list); |
| 209 | cmd->cmd_list = NULL; |
| 210 | } |
| 211 | |
| 212 | for (i = 0; i < QUEUE_SIZE; i++) { |
| 213 | video_thread_cmd *cmd = &thread.bg_queue->queue[i]; |
| 214 | free(cmd->cmd_list); |
| 215 | cmd->cmd_list = NULL; |
| 216 | } |
| 217 | } |
| 218 | |
| 219 | static void video_thread_start() { |
| 220 | fprintf(stdout, "Starting render thread\n"); |
| 221 | |
| 222 | if (pthread_cond_init(&thread.cond_msg_avail, NULL) || |
| 223 | pthread_cond_init(&thread.cond_msg_done, NULL) || |
| 224 | pthread_cond_init(&thread.cond_queue_empty, NULL) || |
| 225 | pthread_mutex_init(&thread.queue_lock, NULL) || |
| 226 | pthread_create(&thread.thread, NULL, video_thread_main, &thread)) { |
| 227 | goto error; |
| 228 | } |
| 229 | |
| 230 | thread.queue = &queues[0]; |
| 231 | thread.bg_queue = &queues[1]; |
| 232 | |
| 233 | thread.running = TRUE; |
| 234 | return; |
| 235 | |
| 236 | error: |
| 237 | fprintf(stderr,"Failed to start rendering thread\n"); |
| 238 | video_thread_stop(); |
| 239 | } |
| 240 | |
| 241 | static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) { |
| 242 | video_thread_cmd *cmd; |
| 243 | uint32_t *cmd_list; |
| 244 | video_thread_queue *queue; |
| 245 | BOOL lock; |
| 246 | |
| 247 | cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t)); |
| 248 | |
| 249 | if (!cmd_list) { |
| 250 | /* Out of memory, disable the thread and run sync from now on */ |
| 251 | fprintf(stderr,"Failed to allocate render thread command list, stopping thread\n"); |
| 252 | video_thread_stop(); |
| 253 | } |
| 254 | |
| 255 | memcpy(cmd_list, list, count * sizeof(uint32_t)); |
| 256 | |
| 257 | if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) { |
| 258 | /* If the bg queue is full, do a full sync to empty both queues |
| 259 | * and clear space. This should be very rare, I've only seen it in |
| 260 | * Tekken 3 post-battle-replay. */ |
| 261 | renderer_sync(); |
| 262 | } |
| 263 | |
| 264 | if (hold_cmds) { |
| 265 | queue = thread.bg_queue; |
| 266 | lock = FALSE; |
| 267 | } else { |
| 268 | queue = thread.queue; |
| 269 | lock = TRUE; |
| 270 | } |
| 271 | |
| 272 | if (lock) { |
| 273 | pthread_mutex_lock(&thread.queue_lock); |
| 274 | |
| 275 | while (queue->used >= QUEUE_SIZE) { |
| 276 | pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock); |
| 277 | } |
| 278 | } |
| 279 | |
| 280 | cmd = &queue->queue[queue->end]; |
| 281 | free(cmd->cmd_list); |
| 282 | cmd->cmd_list = cmd_list; |
| 283 | cmd->count = count; |
| 284 | cmd->last_cmd = last_cmd; |
| 285 | queue->end = (queue->end + 1) % QUEUE_SIZE; |
| 286 | queue->used++; |
| 287 | |
| 288 | if (lock) { |
| 289 | pthread_cond_signal(&thread.cond_msg_avail); |
| 290 | pthread_mutex_unlock(&thread.queue_lock); |
| 291 | } |
| 292 | } |
| 293 | |
| 294 | /* Slice off just the part of the list that can be handled async, and |
| 295 | * update ex_regs. */ |
| 296 | static int scan_cmd_list(uint32_t *data, int count, int *last_cmd) |
| 297 | { |
| 298 | int cmd = 0, pos = 0, len, v; |
| 299 | |
| 300 | while (pos < count) { |
| 301 | uint32_t *list = data + pos; |
| 302 | cmd = list[0] >> 24; |
| 303 | len = 1 + cmd_lengths[cmd]; |
| 304 | |
| 305 | switch (cmd) { |
| 306 | case 0x02: |
| 307 | break; |
| 308 | case 0x24 ... 0x27: |
| 309 | case 0x2c ... 0x2f: |
| 310 | case 0x34 ... 0x37: |
| 311 | case 0x3c ... 0x3f: |
| 312 | gpu.ex_regs[1] &= ~0x1ff; |
| 313 | gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff; |
| 314 | break; |
| 315 | case 0x48 ... 0x4F: |
| 316 | for (v = 3; pos + v < count; v++) |
| 317 | { |
| 318 | if ((list[v] & 0xf000f000) == 0x50005000) |
| 319 | break; |
| 320 | } |
| 321 | len += v - 3; |
| 322 | break; |
| 323 | case 0x58 ... 0x5F: |
| 324 | for (v = 4; pos + v < count; v += 2) |
| 325 | { |
| 326 | if ((list[v] & 0xf000f000) == 0x50005000) |
| 327 | break; |
| 328 | } |
| 329 | len += v - 4; |
| 330 | break; |
| 331 | default: |
| 332 | if ((cmd & 0xf8) == 0xe0) |
| 333 | gpu.ex_regs[cmd & 7] = list[0]; |
| 334 | break; |
| 335 | } |
| 336 | |
| 337 | if (pos + len > count) { |
| 338 | cmd = -1; |
| 339 | break; /* incomplete cmd */ |
| 340 | } |
| 341 | if (0xa0 <= cmd && cmd <= 0xdf) |
| 342 | break; /* image i/o */ |
| 343 | |
| 344 | pos += len; |
| 345 | } |
| 346 | |
| 347 | *last_cmd = cmd; |
| 348 | return pos; |
| 349 | } |
| 350 | |
| 351 | int do_cmd_list(uint32_t *list, int count, int *last_cmd) { |
| 352 | int pos = 0; |
| 353 | |
| 354 | if (thread.running) { |
| 355 | pos = scan_cmd_list(list, count, last_cmd); |
| 356 | video_thread_queue_cmd(list, pos, *last_cmd); |
| 357 | } else { |
| 358 | pos = real_do_cmd_list(list, count, last_cmd); |
| 359 | memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs)); |
| 360 | } |
| 361 | return pos; |
| 362 | } |
| 363 | |
| 364 | int renderer_init(void) { |
| 365 | if (thread_rendering) { |
| 366 | video_thread_start(); |
| 367 | } |
| 368 | return real_renderer_init(); |
| 369 | } |
| 370 | |
| 371 | void renderer_finish(void) { |
| 372 | real_renderer_finish(); |
| 373 | |
| 374 | if (thread_rendering && thread.running) { |
| 375 | video_thread_stop(); |
| 376 | } |
| 377 | } |
| 378 | |
| 379 | void renderer_sync_ecmds(uint32_t * ecmds) { |
| 380 | if (thread.running) { |
| 381 | int dummy; |
| 382 | do_cmd_list(&ecmds[1], 6, &dummy); |
| 383 | } else { |
| 384 | real_renderer_sync_ecmds(ecmds); |
| 385 | } |
| 386 | } |
| 387 | |
/*
 * Drains all queued commands, then forwards the cache update for the
 * given rectangle to the real renderer.
 */
void renderer_update_caches(int x, int y, int w, int h)
{
  renderer_sync();

  real_renderer_update_caches(x, y, w, h);
}
| 392 | |
/*
 * Forwards to the real renderer without syncing the command queues.
 *
 * This is called both during DMA and from updateLace. A sync is wanted
 * for DMA but not for updateLace, so rather than syncing here the DMA
 * path issues its own renderer_sync() call.
 */
void renderer_flush_queues(void)
{
  real_renderer_flush_queues();
}
| 399 | |
| 400 | /* |
| 401 | * Normally all GPU commands are processed before rendering the |
| 402 | * frame. For games that naturally run < 50/60fps, this is unnecessary |
| 403 | * -- it forces the game to render as if it was 60fps and leaves the |
| 404 | * GPU idle half the time on a 30fps game, for example. |
| 405 | * |
| 406 | * Allowing the renderer to wait until a frame is done before |
| 407 | * rendering it would give it double, triple, or quadruple the amount |
| 408 | * of time to finish before we have to wait for it. |
| 409 | * |
| 410 | * We can use a heuristic to figure out when to force a render. |
| 411 | * |
| 412 | * - If a frame isn't done when we're asked to render, wait for it and |
| 413 | * put future GPU commands in a separate buffer (for the next frame) |
| 414 | * |
| 415 | * - If the frame is done, and had no future GPU commands, render it. |
| 416 | * |
| 417 | * - If we do have future GPU commands, it meant the frame took too |
| 418 | * long to render and there's another frame waiting. Stop until the |
| 419 | * first frame finishes, render it, and start processing the next |
| 420 | * one. |
| 421 | * |
| 422 | * This may possibly add a frame or two of latency that shouldn't be |
| 423 | * different than the real device. It may skip rendering a frame |
| 424 | * entirely if a VRAM transfer happens while a frame is waiting, or in |
| 425 | * games that natively run at 60fps if frames are coming in too |
| 426 | * quickly to process. Depending on how the game treats "60fps," this |
| 427 | * may not be noticeable. |
| 428 | */ |
| 429 | void renderer_notify_update_lace(int updated) { |
| 430 | if (!thread.running) return; |
| 431 | |
| 432 | if (thread_rendering == THREAD_RENDERING_SYNC) { |
| 433 | renderer_sync(); |
| 434 | return; |
| 435 | } |
| 436 | |
| 437 | if (updated) { |
| 438 | cmd_queue_swap(); |
| 439 | return; |
| 440 | } |
| 441 | |
| 442 | pthread_mutex_lock(&thread.queue_lock); |
| 443 | if (thread.bg_queue->used || flushed) { |
| 444 | /* We have commands for a future frame to run. Force a wait until |
| 445 | * the current frame is finished, and start processing the next |
| 446 | * frame after it's drawn (see the `updated` clause above). */ |
| 447 | pthread_mutex_unlock(&thread.queue_lock); |
| 448 | renderer_wait(); |
| 449 | pthread_mutex_lock(&thread.queue_lock); |
| 450 | |
| 451 | /* We are no longer holding commands back, so the next frame may |
| 452 | * get mixed into the following frame. This is usually fine, but can |
| 453 | * result in frameskip-like effects for 60fps games. */ |
| 454 | flushed = FALSE; |
| 455 | hold_cmds = FALSE; |
| 456 | needs_display = TRUE; |
| 457 | gpu.state.fb_dirty = TRUE; |
| 458 | } else if (thread.queue->used) { |
| 459 | /* We are still drawing during a vblank. Cut off the current frame |
| 460 | * by sending new commands to the background queue and skip |
| 461 | * drawing our partly rendered frame to the display. */ |
| 462 | hold_cmds = TRUE; |
| 463 | needs_display = TRUE; |
| 464 | gpu.state.fb_dirty = FALSE; |
| 465 | } else if (needs_display && !thread.queue->used) { |
| 466 | /* We have processed all commands in the queue, render the |
| 467 | * buffer. We know we have something to render, because |
| 468 | * needs_display is TRUE. */ |
| 469 | hold_cmds = FALSE; |
| 470 | needs_display = FALSE; |
| 471 | gpu.state.fb_dirty = TRUE; |
| 472 | } else { |
| 473 | /* Everything went normally, so do the normal thing. */ |
| 474 | } |
| 475 | |
| 476 | pthread_mutex_unlock(&thread.queue_lock); |
| 477 | } |
| 478 | |
/* Passes the interlace settings straight through to the real renderer. */
void renderer_set_interlace(int enable, int is_odd)
{
  real_renderer_set_interlace(enable, is_odd);
}
| 482 | |
| 483 | void renderer_set_config(const struct rearmed_cbs *cbs) { |
| 484 | renderer_sync(); |
| 485 | thread_rendering = cbs->thread_rendering; |
| 486 | if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) { |
| 487 | video_thread_start(); |
| 488 | } else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) { |
| 489 | video_thread_stop(); |
| 490 | } |
| 491 | real_renderer_set_config(cbs); |
| 492 | } |
| 493 | |
/*
 * Drains all queued commands, then tells the real renderer that the
 * resolution changed.
 */
void renderer_notify_res_change(void)
{
  renderer_sync();

  real_renderer_notify_res_change();
}