Allow threaded rendering for peops and enable on unix
plugins/gpulib/gpulib_thread_if.c
/***************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpulib_thread_if.h"

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

typedef struct {
        uint32_t *cmd_list;
        int count;
        int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

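/* Fixed-size ring buffer of pending command lists. start is the next
 * slot the render thread will consume, end is the next slot the emu
 * thread will fill, and used is the occupancy count; all three are
 * guarded by queue_lock. The consumer drains start..end in one
 * contiguous chunk per pass, so a wrapped tail is picked up on the
 * following pass. */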
typedef struct {
        size_t start;
        size_t end;
        size_t used;
        video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
        pthread_t thread;
        pthread_mutex_t queue_lock;
        pthread_cond_t cond_msg_avail;
        pthread_cond_t cond_msg_done;
        pthread_cond_t cond_queue_empty;
        video_thread_queue *queue;
        video_thread_queue *bg_queue;
        BOOL running;
} video_thread_state;

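/* queue is consumed by the render thread; bg_queue holds commands for
 * a future frame while hold_cmds is set. thread_rendering caches the
 * THREAD_RENDERING_* mode from the frontend config, and needs_display
 * marks that a finished frame is waiting to be shown. */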
static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static BOOL hold_cmds;
static BOOL needs_display;

extern const unsigned char cmd_lengths[];

static void *video_thread_main(void *arg) {
        video_thread_state *thread = (video_thread_state *)arg;
        video_thread_cmd *cmd;
        int i;

#ifdef _3DS
        static int processed = 0;
#endif /* _3DS */

        while(1) {
                int result, last_cmd, start, end;
                video_thread_queue *queue;
                pthread_mutex_lock(&thread->queue_lock);

                while (!thread->queue->used && thread->running) {
                        pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
                }

                if (!thread->running) {
                        pthread_mutex_unlock(&thread->queue_lock);
                        break;
                }

                queue = thread->queue;
                start = queue->start;
                end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
                queue->start = end % QUEUE_SIZE;
                pthread_mutex_unlock(&thread->queue_lock);

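                /* Run the snapshotted chunk outside the lock; end was
                 * clamped to QUEUE_SIZE above, so the chunk never wraps
                 * and a wrapped remainder is handled on the next loop
                 * iteration. */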
                for (i = start; i < end; i++) {
                        cmd = &queue->queue[i];
                        result = real_do_cmd_list(cmd->cmd_list, cmd->count, &last_cmd);

                        if (result != cmd->count) {
                                fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
                        }

#ifdef _3DS
                        /* Periodically yield so as not to starve other threads */
                        processed += cmd->count;
                        if (processed >= 512) {
                                svcSleepThread(1);
                                processed %= 512;
                        }
#endif /* _3DS */
                }

                pthread_mutex_lock(&thread->queue_lock);
                queue->used -= (end - start);

                if (!queue->used)
                        pthread_cond_signal(&thread->cond_queue_empty);

                pthread_cond_signal(&thread->cond_msg_done);
                pthread_mutex_unlock(&thread->queue_lock);
        }

        return 0;
}

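/* Promote the background queue to the active queue, but only once the
 * active queue has fully drained; the render thread is then woken to
 * start on the new frame's commands. */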
static void cmd_queue_swap(void) {
        video_thread_queue *tmp;
        if (!thread.bg_queue->used) return;

        pthread_mutex_lock(&thread.queue_lock);
        if (!thread.queue->used) {
                tmp = thread.queue;
                thread.queue = thread.bg_queue;
                thread.bg_queue = tmp;
                needs_display = TRUE;
                pthread_cond_signal(&thread.cond_msg_avail);
        }
        pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait(void) {
        if (!thread.running) return;

        /* Not completely safe, but should be fine since the render thread
         * only decreases used, and we check again inside the lock. */
        if (!thread.queue->used) {
                return;
        }

        pthread_mutex_lock(&thread.queue_lock);

        while (thread.queue->used) {
                pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
        }

        pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
        if (!thread.running) return;

        /* Not completely safe, but should be fine since the render thread
         * only decreases used, and we check again inside the lock. */
        if (!thread.queue->used && !thread.bg_queue->used) {
                return;
        }

        /* Flush both queues. This is necessary because gpulib could be
         * trying to process a DMA write that a command in the queue should
         * run beforehand. For example, Xenogears sprites write a black
         * rectangle over the to-be-DMA'd spot in VRAM -- if this write
         * happens after the DMA, it wipes out the freshly transferred
         * data, resulting in flickering sprites. We need to be totally
         * up-to-date. This may drop a frame. */
        renderer_wait();
        cmd_queue_swap();
        hold_cmds = FALSE;
        renderer_wait();
}

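/* Drains everything, joins the render thread, and frees any command
 * lists still sitting in either queue. Also used as the bail-out path
 * when startup fails; the join is skipped when the thread never ran. */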
static void video_thread_stop(void) {
        int i;
        renderer_sync();

        if (thread.running) {
                thread.running = FALSE;
                pthread_cond_signal(&thread.cond_msg_avail);
                pthread_join(thread.thread, NULL);
        }

        pthread_mutex_destroy(&thread.queue_lock);
        pthread_cond_destroy(&thread.cond_msg_avail);
        pthread_cond_destroy(&thread.cond_msg_done);
        pthread_cond_destroy(&thread.cond_queue_empty);

        for (i = 0; i < QUEUE_SIZE; i++) {
                video_thread_cmd *cmd = &thread.queue->queue[i];
                free(cmd->cmd_list);
                cmd->cmd_list = NULL;
        }

        for (i = 0; i < QUEUE_SIZE; i++) {
                video_thread_cmd *cmd = &thread.bg_queue->queue[i];
                free(cmd->cmd_list);
                cmd->cmd_list = NULL;
        }
}

static void video_thread_start(void) {
        fprintf(stdout, "Starting render thread\n");

        /* Initialize everything the render thread reads before it is
         * created, so it can never observe NULL queue pointers or a
         * stale running flag. */
        thread.queue = &queues[0];
        thread.bg_queue = &queues[1];
        thread.running = TRUE;

        if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
                        pthread_cond_init(&thread.cond_msg_done, NULL) ||
                        pthread_cond_init(&thread.cond_queue_empty, NULL) ||
                        pthread_mutex_init(&thread.queue_lock, NULL) ||
                        pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
                goto error;
        }

        return;

 error:
        fprintf(stderr, "Failed to start rendering thread\n");
        /* pthread_create never ran or failed, so there is no thread to
         * join; clear running before the teardown. */
        thread.running = FALSE;
        video_thread_stop();
}

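/* Copies the given command list and appends it to the active queue
 * (blocking while that queue is full), or to the background queue
 * when hold_cmds is set. */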
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
        video_thread_cmd *cmd;
        uint32_t *cmd_list;
        video_thread_queue *queue;
        BOOL lock;

        cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

        if (!cmd_list) {
                /* Out of memory: run this list synchronously, disable the
                 * thread, and render sync from now on */
                fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
                video_thread_stop();
                real_do_cmd_list(list, count, &last_cmd);
                return;
        }

        memcpy(cmd_list, list, count * sizeof(uint32_t));

        if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
                /* If the bg queue is full, do a full sync to empty both queues
                 * and clear space. This should be very rare, I've only seen it in
                 * Tekken 3 post-battle-replay. */
                renderer_sync();
        }

        if (hold_cmds) {
                queue = thread.bg_queue;
                lock = FALSE;
        } else {
                queue = thread.queue;
                lock = TRUE;
        }

        if (lock) {
                pthread_mutex_lock(&thread.queue_lock);

                while (queue->used >= QUEUE_SIZE) {
                        pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
                }
        }

        /* Free any stale list left in this slot by an earlier wrap of
         * the ring buffer before claiming it. */
        cmd = &queue->queue[queue->end];
        free(cmd->cmd_list);
        cmd->cmd_list = cmd_list;
        cmd->count = count;
        cmd->last_cmd = last_cmd;
        queue->end = (queue->end + 1) % QUEUE_SIZE;
        queue->used++;

        if (lock) {
                pthread_cond_signal(&thread.cond_msg_avail);
                pthread_mutex_unlock(&thread.queue_lock);
        }
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
{
        int cmd = 0, pos = 0, len, v;

        while (pos < count) {
                uint32_t *list = data + pos;
                cmd = list[0] >> 24;
                len = 1 + cmd_lengths[cmd];

                switch (cmd) {
                        case 0x02:
                                /* fill rectangle: fixed length, safe to queue */
                                break;
                        case 0x24 ... 0x27:
                        case 0x2c ... 0x2f:
                        case 0x34 ... 0x37:
                        case 0x3c ... 0x3f:
                                /* textured primitive: mirror the texpage bits
                                 * into ex_regs while scanning */
                                gpu.ex_regs[1] &= ~0x1ff;
                                gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
                                break;
                        case 0x48 ... 0x4F:
                                /* monochrome polyline: variable length, ends
                                 * at the 0x5xxx5xxx terminator */
                                for (v = 3; pos + v < count; v++)
                                {
                                        if ((list[v] & 0xf000f000) == 0x50005000)
                                                break;
                                }
                                len += v - 3;
                                break;
                        case 0x58 ... 0x5F:
                                /* shaded polyline: color/vertex word pairs,
                                 * also terminated by 0x5xxx5xxx */
                                for (v = 4; pos + v < count; v += 2)
                                {
                                        if ((list[v] & 0xf000f000) == 0x50005000)
                                                break;
                                }
                                len += v - 4;
                                break;
                        default:
                                if ((cmd & 0xf8) == 0xe0)
                                        gpu.ex_regs[cmd & 7] = list[0];
                                break;
                }

                if (pos + len > count) {
                        cmd = -1;
                        break; /* incomplete cmd */
                }
                if (0xa0 <= cmd && cmd <= 0xdf)
                        break; /* image i/o must run synchronously */

                pos += len;
        }

        *last_cmd = cmd;
        return pos;
}

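/* Threaded entry point: queue only the async-safe prefix returned by
 * scan_cmd_list and report how far we got, leaving the remainder
 * (e.g. image i/o) for gpulib to handle; when the thread is off, fall
 * straight through to the real renderer. */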
int do_cmd_list(uint32_t *list, int count, int *last_cmd) {
        int pos = 0;

        if (thread.running) {
                pos = scan_cmd_list(list, count, last_cmd);
                video_thread_queue_cmd(list, pos, *last_cmd);
        } else {
                pos = real_do_cmd_list(list, count, last_cmd);
                memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
        }
        return pos;
}

int renderer_init(void) {
        if (thread_rendering) {
                video_thread_start();
        }
        return real_renderer_init();
}

void renderer_finish(void) {
        real_renderer_finish();

        if (thread_rendering && thread.running) {
                video_thread_stop();
        }
}

void renderer_sync_ecmds(uint32_t *ecmds) {
        if (thread.running) {
                int dummy;
                do_cmd_list(&ecmds[1], 6, &dummy);
        } else {
                real_renderer_sync_ecmds(ecmds);
        }
}

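/* The real renderer inspects the updated VRAM region, so every queued
 * draw must have landed in VRAM before the caches are refreshed. */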
void renderer_update_caches(int x, int y, int w, int h) {
        renderer_sync();
        real_renderer_update_caches(x, y, w, h);
}

void renderer_flush_queues(void) {
        /* Called during DMA and updateLace. We want to sync if it's DMA,
         * but not if it's updateLace. Instead of syncing here, there's a
         * renderer_sync call during DMA. */
        real_renderer_flush_queues();
}


/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps, this is
 * unnecessary -- it forces the game to render as if it were running
 * at 60fps and leaves the GPU idle half the time on a 30fps game,
 * for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it gives it double, triple, or quadruple the amount of
 * time to finish before we have to wait for it.
 *
 * We use a heuristic to figure out when to force a render:
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame)
 *
 * - If the frame is done and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, the frame took too long to
 *   render and there's another frame waiting. Stop until the first
 *   frame finishes, render it, and start processing the next one.
 *
 * This may add a frame or two of latency, which should be no
 * different from the real hardware. It may skip rendering a frame
 * entirely if a VRAM transfer happens while a frame is waiting, or in
 * games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
        if (!thread.running) return;

        if (thread_rendering == THREAD_RENDERING_SYNC) {
                renderer_sync();
                return;
        }

        if (updated) {
                cmd_queue_swap();
                return;
        }

        pthread_mutex_lock(&thread.queue_lock);
        if (thread.bg_queue->used) {
                /* We have commands for a future frame to run. Force a wait until
                 * the current frame is finished, and start processing the next
                 * frame after it's drawn (see the `updated` clause above). */
                pthread_mutex_unlock(&thread.queue_lock);
                renderer_wait();
                pthread_mutex_lock(&thread.queue_lock);

                /* We are no longer holding commands back, so the next frame may
                 * get mixed into the following frame. This is usually fine, but can
                 * result in frameskip-like effects for 60fps games. */
                hold_cmds = FALSE;
                needs_display = TRUE;
                gpu.state.fb_dirty = TRUE;
        } else if (thread.queue->used) {
                /* We are still drawing during a vblank. Cut off the current frame
                 * by sending new commands to the background queue and skip
                 * drawing our partly rendered frame to the display. */
                hold_cmds = TRUE;
                needs_display = TRUE;
                gpu.state.fb_dirty = FALSE;
        } else if (needs_display && !thread.queue->used) {
                /* We have processed all commands in the queue, render the
                 * buffer. We know we have something to render, because
                 * needs_display is TRUE. */
                hold_cmds = FALSE;
                needs_display = FALSE;
                gpu.state.fb_dirty = TRUE;
        } else {
                /* Everything went normally, so do the normal thing. */
        }

        pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
        real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
        renderer_sync();
        thread_rendering = cbs->thread_rendering;
        if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
                video_thread_start();
        } else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
                video_thread_stop();
        }
        real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
        renderer_sync();
        real_renderer_notify_res_change();
}