/* plugins/gpulib/gpulib_thread_if.c */
/**************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpu.h"
#include "gpu_timing.h"
#include "gpulib_thread_if.h"

extern void SysPrintf(const char *fmt, ...);

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

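/* One queued unit of work: a heap-allocated copy of a GP0 command list, its
 * length in 32-bit words, and the last command byte reported by the scanner. */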
typedef struct {
        uint32_t *cmd_list;
        int count;
        int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

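/* Fixed-size ring buffer of commands. start and end index into queue[], and
 * used counts occupied slots: the emu thread produces at end, the worker
 * thread consumes from start. */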
typedef struct {
        size_t start;
        size_t end;
        size_t used;
        video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

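/* Worker thread state: the active queue handed to the render thread, plus a
 * background queue that buffers commands for the next frame while the current
 * one is still being drawn. */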
typedef struct {
        pthread_t thread;
        pthread_mutex_t queue_lock;
        pthread_cond_t cond_msg_avail;
        pthread_cond_t cond_msg_done;
        pthread_cond_t cond_queue_empty;
        video_thread_queue *queue;
        video_thread_queue *bg_queue;
        BOOL running;
} video_thread_state;

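/* thread_rendering caches the mode requested through rearmed_cbs
 * (THREAD_RENDERING_OFF disables the thread, THREAD_RENDERING_SYNC forces a
 * full sync every vblank). hold_cmds redirects new command lists to the
 * background queue, needs_display marks a finished frame waiting to be
 * presented, and flushed records that renderer_sync() force-flushed the
 * background queue mid-frame. */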
static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static BOOL hold_cmds;
static BOOL needs_display;
static BOOL flushed;

extern const unsigned char cmd_lengths[];

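/* Worker thread entry point: wait for work on the active queue, grab a
 * contiguous run of entries, execute each saved command list with
 * real_do_cmd_list(), and signal waiters as the queue drains. */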
static void *video_thread_main(void *arg) {
        video_thread_cmd *cmd;
        int i;

#ifdef _3DS
        static int processed = 0;
#endif /* _3DS */

#if defined(__arm__) && defined(__ARM_FP)
        // RunFast mode
        uint32_t fpscr = ~0;
        __asm__ volatile("vmrs %0, fpscr" : "=r"(fpscr));
        fpscr &= ~0x00009f9f;
        fpscr |=  0x03000000; // DN | FZ
        __asm__ volatile("vmsr fpscr, %0" :: "r"(fpscr));
#endif

        while(1) {
                int result, cycles_dummy = 0, last_cmd, start, end;
                video_thread_queue *queue;
                pthread_mutex_lock(&thread.queue_lock);

                while (!thread.queue->used && thread.running) {
                        pthread_cond_wait(&thread.cond_msg_avail, &thread.queue_lock);
                }

                if (!thread.running) {
                        pthread_mutex_unlock(&thread.queue_lock);
                        break;
                }

                queue = thread.queue;
                start = queue->start;
                end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
                queue->start = end % QUEUE_SIZE;
                pthread_mutex_unlock(&thread.queue_lock);

                for (i = start; i < end; i++) {
                        cmd = &queue->queue[i];
                        result = real_do_cmd_list(cmd->cmd_list, cmd->count,
                                        &cycles_dummy, &cycles_dummy, &last_cmd);
                        if (result != cmd->count) {
                                fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
                        }

#ifdef _3DS
                        /* Periodically yield so as not to starve other threads */
                        processed += cmd->count;
                        if (processed >= 512) {
                                svcSleepThread(1);
                                processed %= 512;
                        }
#endif /* _3DS */
                }

                pthread_mutex_lock(&thread.queue_lock);
                queue->used -= (end - start);

                if (!queue->used)
                        pthread_cond_signal(&thread.cond_queue_empty);

                pthread_cond_signal(&thread.cond_msg_done);
                pthread_mutex_unlock(&thread.queue_lock);
        }

        return 0;
}

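/* Promote the background queue to the active queue (only once the active
 * queue has drained) and wake the worker so it starts on the next frame. */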
static void cmd_queue_swap() {
        video_thread_queue *tmp;
        if (!thread.bg_queue->used) return;

        pthread_mutex_lock(&thread.queue_lock);
        if (!thread.queue->used) {
                tmp = thread.queue;
                thread.queue = thread.bg_queue;
                thread.bg_queue = tmp;
                pthread_cond_signal(&thread.cond_msg_avail);
        }
        pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait() {
        if (!thread.running) return;

        /* Not completely safe, but should be fine since the render thread
         * only decreases used, and we check again inside the lock. */
        if (!thread.queue->used) {
                return;
        }

        pthread_mutex_lock(&thread.queue_lock);

        while (thread.queue->used) {
                pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
        }

        pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
        if (!thread.running) return;

        /* Not completely safe, but should be fine since the render thread
         * only decreases used, and we check again inside the lock. */
        if (!thread.queue->used && !thread.bg_queue->used) {
                return;
        }

        if (thread.bg_queue->used) {
                /* When we flush the background queue, the vblank handler can't
                 * know that we had a frame pending, and we delay rendering too
                 * long. Force it. */
                flushed = TRUE;
        }

        /* Flush both queues. This is necessary because gpulib could be
         * trying to process a DMA write that a command in the queue should
         * run beforehand. For example, Xenogears sprites write a black
         * rectangle over the to-be-DMA'd spot in VRAM -- if this write
         * happens after the DMA, it will overwrite the DMA'd data, resulting
         * in flickering sprites. We need to be totally up-to-date. This may
         * drop a frame. */
        renderer_wait();
        cmd_queue_swap();
        hold_cmds = FALSE;
        renderer_wait();
}

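/* Sync, shut down the worker thread, and free every queued command list in
 * both queues. Also the fallback path when startup or a command-list
 * allocation fails. */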
static void video_thread_stop() {
        int i;
        renderer_sync();

        if (thread.running) {
                thread.running = FALSE;
                pthread_cond_signal(&thread.cond_msg_avail);
                pthread_join(thread.thread, NULL);
        }

        pthread_mutex_destroy(&thread.queue_lock);
        pthread_cond_destroy(&thread.cond_msg_avail);
        pthread_cond_destroy(&thread.cond_msg_done);
        pthread_cond_destroy(&thread.cond_queue_empty);

        for (i = 0; i < QUEUE_SIZE; i++) {
                video_thread_cmd *cmd = &thread.queue->queue[i];
                free(cmd->cmd_list);
                cmd->cmd_list = NULL;
        }

        for (i = 0; i < QUEUE_SIZE; i++) {
                video_thread_cmd *cmd = &thread.bg_queue->queue[i];
                free(cmd->cmd_list);
                cmd->cmd_list = NULL;
        }
}

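/* Initialize both queues and the synchronization primitives, then launch the
 * worker thread. On any failure, fall back to synchronous rendering. */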
static void video_thread_start() {
        SysPrintf("Starting render thread\n");

        thread.queue = &queues[0];
        thread.bg_queue = &queues[1];
        thread.running = TRUE;

        if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
                        pthread_cond_init(&thread.cond_msg_done, NULL) ||
                        pthread_cond_init(&thread.cond_queue_empty, NULL) ||
                        pthread_mutex_init(&thread.queue_lock, NULL) ||
                        pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
                goto error;
        }

        return;

 error:
        SysPrintf("Failed to start rendering thread\n");
        thread.running = FALSE;
        video_thread_stop();
}

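/* Copy a command list onto the heap and append it to either the active queue
 * (waking the worker) or, while hold_cmds is set, the background queue for
 * the next frame. Blocks if the active queue is full. */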
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
        video_thread_cmd *cmd;
        uint32_t *cmd_list;
        video_thread_queue *queue;
        BOOL lock;

        cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

        if (!cmd_list) {
                /* Out of memory, disable the thread and run sync from now on */
                SysPrintf("Failed to allocate render thread command list, stopping thread\n");
                video_thread_stop();
                return;
        }

        memcpy(cmd_list, list, count * sizeof(uint32_t));

        if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
                /* If the bg queue is full, do a full sync to empty both queues
                 * and clear space. This should be very rare, I've only seen it in
                 * Tekken 3 post-battle-replay. */
                renderer_sync();
        }

        if (hold_cmds) {
                queue = thread.bg_queue;
                lock = FALSE;
        } else {
                queue = thread.queue;
                lock = TRUE;
        }

        if (lock) {
                pthread_mutex_lock(&thread.queue_lock);

                while (queue->used >= QUEUE_SIZE) {
                        pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
                }
        }

        cmd = &queue->queue[queue->end];
        free(cmd->cmd_list);
        cmd->cmd_list = cmd_list;
        cmd->count = count;
        cmd->last_cmd = last_cmd;
        queue->end = (queue->end + 1) % QUEUE_SIZE;
        queue->used++;

        if (lock) {
                pthread_cond_signal(&thread.cond_msg_avail);
                pthread_mutex_unlock(&thread.queue_lock);
        }
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count,
        int *cycles_sum_out, int *cycles_last, int *last_cmd)
{
        int cpu_cycles_sum = 0, cpu_cycles = *cycles_last;
        int cmd = 0, pos = 0, len, v;

        while (pos < count) {
                uint32_t *list = data + pos;
                short *slist = (void *)list;
                cmd = LE32TOH(list[0]) >> 24;
                len = 1 + cmd_lengths[cmd];

                switch (cmd) {
                        case 0x02:
                                gput_sum(cpu_cycles_sum, cpu_cycles,
                                        gput_fill(LE16TOH(slist[4]) & 0x3ff,
                                                LE16TOH(slist[5]) & 0x1ff));
                                break;
                        case 0x20 ... 0x23:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base());
                                break;
                        case 0x24 ... 0x27:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_t());
                                gpu.ex_regs[1] &= ~0x1ff;
                                gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
                                break;
                        case 0x28 ... 0x2b:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base());
                                break;
                        case 0x2c ... 0x2f:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
                                gpu.ex_regs[1] &= ~0x1ff;
                                gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
                                break;
                        case 0x30 ... 0x33:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
                                break;
                        case 0x34 ... 0x37:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
                                gpu.ex_regs[1] &= ~0x1ff;
                                gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
                                break;
                        case 0x38 ... 0x3b:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
                                break;
                        case 0x3c ... 0x3f:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
                                gpu.ex_regs[1] &= ~0x1ff;
                                gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
                                break;
                        case 0x40 ... 0x47:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
                                break;
                        case 0x48 ... 0x4F:
                                for (v = 3; pos + v < count; v++)
                                {
                                        gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
                                        if ((list[v] & 0xf000f000) == 0x50005000)
                                                break;
                                }
                                len += v - 3;
                                break;
                        case 0x50 ... 0x57:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
                                break;
                        case 0x58 ... 0x5F:
                                for (v = 4; pos + v < count; v += 2)
                                {
                                        gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
                                        if ((list[v] & 0xf000f000) == 0x50005000)
                                                break;
                                }
                                len += v - 4;
                                break;
                        case 0x60 ... 0x63:
                                gput_sum(cpu_cycles_sum, cpu_cycles,
                                        gput_sprite(LE16TOH(slist[4]) & 0x3ff,
                                                LE16TOH(slist[5]) & 0x1ff));
                                break;
                        case 0x64 ... 0x67:
                                gput_sum(cpu_cycles_sum, cpu_cycles,
                                        gput_sprite(LE16TOH(slist[6]) & 0x3ff,
                                                LE16TOH(slist[7]) & 0x1ff));
                                break;
                        case 0x68 ... 0x6b:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
                                break;
                        case 0x70 ... 0x77:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(8, 8));
                                break;
                        case 0x78 ... 0x7f:
                                gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(16, 16));
                                break;
                        default:
                                if ((cmd & 0xf8) == 0xe0)
                                        gpu.ex_regs[cmd & 7] = list[0];
                                break;
                }

                if (pos + len > count) {
                        cmd = -1;
                        break; /* incomplete cmd */
                }
                if (0x80 <= cmd && cmd <= 0xdf)
                        break; /* image i/o */

                pos += len;
        }

        *cycles_sum_out += cpu_cycles_sum;
        *cycles_last = cpu_cycles;
        *last_cmd = cmd;
        return pos;
}

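/* gpulib entry point. With the worker running, scan the incoming list for the
 * async-safe prefix, queue a copy of it, and report that much as processed;
 * otherwise fall through to the synchronous renderer. */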
int do_cmd_list(uint32_t *list, int count,
 int *cycles_sum, int *cycles_last, int *last_cmd)
{
        int pos = 0;

        if (thread.running) {
                pos = scan_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
                video_thread_queue_cmd(list, pos, *last_cmd);
        } else {
                pos = real_do_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
                memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
        }
        return pos;
}

int renderer_init(void) {
        if (thread_rendering) {
                video_thread_start();
        }
        return real_renderer_init();
}

void renderer_finish(void) {
        real_renderer_finish();

        if (thread_rendering && thread.running) {
                video_thread_stop();
        }
}

void renderer_sync_ecmds(uint32_t * ecmds) {
        if (thread.running) {
                int dummy = 0;
                do_cmd_list(&ecmds[1], 6, &dummy, &dummy, &dummy);
        } else {
                real_renderer_sync_ecmds(ecmds);
        }
}

void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
        renderer_sync();
        real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
        /* Called during DMA and updateLace. We want to sync if it's DMA,
         * but not if it's updateLace. Instead of syncing here, there's a
         * renderer_sync call during DMA. */
        real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps, this is
 * unnecessary -- it forces the game to render as if it were running
 * at 60fps and leaves the GPU idle half the time on a 30fps game,
 * for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame).
 *
 * - If the frame is done, and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which should be no
 * different from the real hardware. It may skip rendering a frame
 * entirely if a VRAM transfer happens while a frame is waiting, or in
 * games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
        if (!thread.running) return;

        if (thread_rendering == THREAD_RENDERING_SYNC) {
                renderer_sync();
                return;
        }

        if (updated) {
                cmd_queue_swap();
                return;
        }

        pthread_mutex_lock(&thread.queue_lock);
        if (thread.bg_queue->used || flushed) {
                /* We have commands for a future frame to run. Force a wait until
                 * the current frame is finished, and start processing the next
                 * frame after it's drawn (see the `updated` clause above). */
                pthread_mutex_unlock(&thread.queue_lock);
                renderer_wait();
                pthread_mutex_lock(&thread.queue_lock);

                /* We are no longer holding commands back, so the next frame may
                 * get mixed into the following frame. This is usually fine, but can
                 * result in frameskip-like effects for 60fps games. */
                flushed = FALSE;
                hold_cmds = FALSE;
                needs_display = TRUE;
                gpu.state.fb_dirty = TRUE;
        } else if (thread.queue->used) {
                /* We are still drawing during a vblank. Cut off the current frame
                 * by sending new commands to the background queue and skip
                 * drawing our partly rendered frame to the display. */
                hold_cmds = TRUE;
                needs_display = TRUE;
                gpu.state.fb_dirty = FALSE;
        } else if (needs_display && !thread.queue->used) {
                /* We have processed all commands in the queue, render the
                 * buffer. We know we have something to render, because
                 * needs_display is TRUE. */
                hold_cmds = FALSE;
                needs_display = FALSE;
                gpu.state.fb_dirty = TRUE;
        } else {
                /* Everything went normally, so do the normal thing. */
        }

        pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
        real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
        renderer_sync();
        thread_rendering = cbs->thread_rendering;
        if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
                video_thread_start();
        } else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
                video_thread_stop();
        }
        real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
        renderer_sync();
        real_renderer_notify_res_change();
}