unbreak the build of gpu thread thing
pcsx_rearmed: plugins/gpulib/gpulib_thread_if.c
/***************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpulib_thread_if.h"

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	BOOL running;
} video_thread_state;

static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;   /* THREAD_RENDERING_OFF/ON/SYNC, from the frontend */
static BOOL hold_cmds;         /* route new command lists to bg_queue (next frame) */
static BOOL needs_display;     /* a finished frame is waiting to be displayed */
static BOOL flushed;           /* bg_queue was force-flushed by renderer_sync() */

extern const unsigned char cmd_lengths[];

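/* Render thread main loop: sleep until the emu thread queues work, then
 * drain the active queue by handing each stored command list to the real
 * renderer via real_do_cmd_list(). */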
static void *video_thread_main(void *arg) {
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;

#ifdef _3DS
	static int processed = 0;
#endif /* _3DS */

	while(1) {
		int result, cpu_cycles = 0, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

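		/* Claim a contiguous run of the ring buffer. If the writer has
		 * wrapped around, only take entries up to the end of the array
		 * now; the wrapped-around part is picked up on the next pass. */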
		queue = thread->queue;
		start = queue->start;
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count,
					&cpu_cycles, &last_cmd);
			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif /* _3DS */
		}

		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return 0;
}

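/* Promote the background queue to the active queue once the render thread
 * has drained the current one, so a held-back frame can start rendering. */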
static void cmd_queue_swap() {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait() {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	if (thread.bg_queue->used) {
		/* When we flush the background queue, the vblank handler can't
		 * know that we had a frame pending, and we delay rendering too
		 * long. Force it. */
		flushed = TRUE;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will clear the DMA, resulting in
	 * flickering sprites. We need to be totally up-to-date. This may
	 * drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = FALSE;
	renderer_wait();
}

static void video_thread_stop() {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = FALSE;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

static void video_thread_start() {
	fprintf(stdout, "Starting render thread\n");

	/* Set up the queues and the running flag before creating the worker,
	 * so it never observes a half-initialized state. */
	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = TRUE;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	fprintf(stderr, "Failed to start rendering thread\n");
	thread.running = FALSE;
	video_thread_stop();
}

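/* Copy a scanned command list and append it to a queue: the active queue
 * (waking the worker) in normal operation, or the background queue for the
 * next frame while hold_cmds is set. */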
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	BOOL lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory; disable the thread and run sync from now on.
		 * Bail out here so we don't memcpy() into a NULL buffer -- this
		 * command list is dropped, which beats crashing. */
		fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

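	/* Held commands go to the background queue, which only this (emu)
	 * thread touches, so it needs no locking; otherwise use the active
	 * queue under the queue lock. */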
	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = FALSE;
	} else {
		queue = thread.queue;
		lock = TRUE;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
{
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		cmd = list[0] >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
			case 0x02:
				/* VRAM fill rectangle */
				break;
			case 0x24 ... 0x27:
			case 0x2c ... 0x2f:
			case 0x34 ... 0x37:
			case 0x3c ... 0x3f:
				/* textured prims: mirror the texpage bits into e1 */
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
				break;
			case 0x48 ... 0x4F:
				/* monochrome poly-line: variable length, scan for the
				 * 0x5xxx5xxx terminator */
				for (v = 3; pos + v < count; v++)
				{
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 3;
				break;
			case 0x58 ... 0x5F:
				/* shaded poly-line: two words per vertex */
				for (v = 4; pos + v < count; v += 2)
				{
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 4;
				break;
			default:
				if ((cmd & 0xf8) == 0xe0)
					gpu.ex_regs[cmd & 7] = list[0];
				break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0xa0 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*last_cmd = cmd;
	return pos;
}

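/* gpulib entry point: when the render thread is running, queue only the
 * prefix of the list that scan_cmd_list deems safe to run asynchronously
 * and return how far we got; gpulib handles the remainder (image i/o,
 * incomplete commands) itself. */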
int do_cmd_list(uint32_t *list, int count, int *cycles, int *last_cmd) {
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, cycles, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

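/* ecmds[1..6] hold the e1-e6 state command words gpulib wants replayed;
 * when threaded, push them through the normal queue so they apply in
 * order with the other queued work. */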
void renderer_sync_ecmds(uint32_t * ecmds) {
	if (thread.running) {
		int dummy = 0;
		do_cmd_list(&ecmds[1], 6, &dummy, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

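/* VRAM was modified behind the renderer's back (e.g. by a DMA transfer),
 * so make sure the worker is fully caught up before the real renderer
 * refreshes its caches from VRAM. */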
void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run < 50/60fps, this is unnecessary
 * -- it forces the game to render as if it were running at 60fps and
 * leaves the GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame).
 *
 * - If the frame is done and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which should be no different
 * from the real device. It may skip rendering a frame entirely if a
 * VRAM transfer happens while a frame is waiting, or in games that
 * natively run at 60fps if frames are coming in too quickly to
 * process. Depending on how the game treats "60fps," this may not be
 * noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used || flushed) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		flushed = FALSE;
		hold_cmds = FALSE;
		needs_display = TRUE;
		gpu.state.fb_dirty = TRUE;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = TRUE;
		needs_display = TRUE;
		gpu.state.fb_dirty = FALSE;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is TRUE. */
		hold_cmds = FALSE;
		needs_display = FALSE;
		gpu.state.fb_dirty = TRUE;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

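/* Display mode changed; finish any queued work first so the real renderer
 * doesn't reconfigure while commands for the old mode are still pending. */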
void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}