/**************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpu.h"
#include "gpu_timing.h"
#include "gpulib_thread_if.h"

#define FALSE 0
#define TRUE 1
#define BOOL unsigned short

typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

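/* Capacity of each command ring; 0x2000 = 8192 entries. A power-of-two
 * size keeps the (x % QUEUE_SIZE) wrap-around cheap. */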
#define QUEUE_SIZE 0x2000

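/* Single-producer, single-consumer ring: the emulator thread advances
 * end, the render thread advances start, and used counts occupied
 * slots. The active queue is only touched under queue_lock; the
 * background queue is only ever touched by the emulator thread. */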
typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	BOOL running;
} video_thread_state;

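/* hold_cmds diverts incoming commands to the background queue (they
 * belong to a future frame); needs_display records that a finished
 * frame still has to be presented; flushed notes that renderer_sync()
 * force-flushed the background queue, so the next vblank must not
 * delay rendering. */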
static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static BOOL hold_cmds;
static BOOL needs_display;
static BOOL flushed;

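/* Fixed word count for each GPU command byte, defined by gpulib;
 * variable-length commands (poly-lines, image i/o) are special-cased
 * in scan_cmd_list() below. */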
extern const unsigned char cmd_lengths[];

static void *video_thread_main(void *arg) {
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;

#ifdef _3DS
	static int processed = 0;
#endif /* _3DS */

	while(1) {
		int result, cycles_dummy = 0, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

		queue = thread->queue;
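		/* Snapshot the drain range while holding the lock. If the
		 * ring has wrapped, only drain up to the end of the array
		 * now; the remainder is picked up on the next iteration. */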
		start = queue->start;
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count,
					&cycles_dummy, &cycles_dummy, &last_cmd);
			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n",
						cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif /* _3DS */
		}

		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return NULL;
}

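/* Promote the background queue to the active queue, but only once the
 * render thread has fully drained the active one; otherwise commands
 * from two different frames could interleave. */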
static void cmd_queue_swap(void) {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	if (thread.bg_queue->used) {
		/* When we flush the background queue, the vblank handler can't
		 * know that we had a frame pending, and we delay rendering too
		 * long. Force it. */
		flushed = TRUE;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will clear the DMA, resulting in
	 * flickering sprites. We need to be totally up-to-date. This may
	 * drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = FALSE;
	renderer_wait();
}

static void video_thread_stop(void) {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = FALSE;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

static void video_thread_start(void) {
	fprintf(stdout, "Starting render thread\n");

	/* The queue pointers and running flag must be valid before the
	 * worker starts, or video_thread_main() could dereference a NULL
	 * queue or exit immediately. */
	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = TRUE;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	fprintf(stderr, "Failed to start rendering thread\n");
	thread.running = FALSE;
	video_thread_stop();
}
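/* Queue one command list for the render thread. The list is deep-copied
 * because the caller's buffer (typically a window into emulated RAM) is
 * only valid for the duration of this call. */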
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	BOOL lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory; disable the thread and run synchronously from
		 * now on. Drop this list rather than copy through NULL. */
		fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = FALSE;
	} else {
		queue = thread.queue;
		lock = TRUE;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count,
	int *cycles_sum_out, int *cycles_last, int *last_cmd)
{
	int cpu_cycles_sum = 0, cpu_cycles = *cycles_last;
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		short *slist = (void *)list;
		cmd = LE32TOH(list[0]) >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
			case 0x02:
				gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_fill(LE16TOH(slist[4]) & 0x3ff,
						LE16TOH(slist[5]) & 0x1ff));
				break;
			case 0x20 ... 0x23:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base());
				break;
			case 0x24 ... 0x27:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_t());
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
				break;
			case 0x28 ... 0x2b:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base());
				break;
			case 0x2c ... 0x2f:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= LE32TOH(list[4]) & 0x1ff;
				break;
			case 0x30 ... 0x33:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
				break;
			case 0x34 ... 0x37:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
				break;
			case 0x38 ... 0x3b:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
				break;
			case 0x3c ... 0x3f:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= LE32TOH(list[5]) & 0x1ff;
				break;
			case 0x40 ... 0x47:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				break;
			case 0x48 ... 0x4F:
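				/* mono poly-line: one extra word per vertex,
				 * terminated by a 0x5xxx5xxx marker word
				 * (normally 0x55555555) */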
				for (v = 3; pos + v < count; v++)
				{
					gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 3;
				break;
			case 0x50 ... 0x57:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
				break;
			case 0x58 ... 0x5F:
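				/* shaded poly-line: two words (color + vertex)
				 * per extra segment, same terminator */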
				for (v = 4; pos + v < count; v += 2)
				{
					gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 4;
				break;
			case 0x60 ... 0x63:
				gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_sprite(LE16TOH(slist[4]) & 0x3ff,
						LE16TOH(slist[5]) & 0x1ff));
				break;
			case 0x64 ... 0x67:
				gput_sum(cpu_cycles_sum, cpu_cycles,
					gput_sprite(LE16TOH(slist[6]) & 0x3ff,
						LE16TOH(slist[7]) & 0x1ff));
				break;
			case 0x68 ... 0x6b:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
				break;
			case 0x70 ... 0x77:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(8, 8));
				break;
			case 0x78 ... 0x7f:
				gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(16, 16));
				break;
			default:
				if ((cmd & 0xf8) == 0xe0)
					gpu.ex_regs[cmd & 7] = list[0];
				break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0x80 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*cycles_sum_out += cpu_cycles_sum;
	*cycles_last = cpu_cycles;
	*last_cmd = cmd;
	return pos;
}

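/* gpulib entry point: with the render thread running, only scan the
 * list here (updating ex_regs and timing synchronously) and defer the
 * actual rasterization to the thread; otherwise render in place. */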
int do_cmd_list(uint32_t *list, int count,
	int *cycles_sum, int *cycles_last, int *last_cmd)
{
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, cycles_sum, cycles_last, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

void renderer_sync_ecmds(uint32_t *ecmds) {
	if (thread.running) {
		int dummy = 0;
		do_cmd_list(&ecmds[1], 6, &dummy, &dummy, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

void renderer_update_caches(int x, int y, int w, int h, int state_changed) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h, state_changed);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps this is
 * unnecessary -- it forces the game to render as if it were 60fps and
 * leaves the GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next
 *   frame).
 *
 * - If the frame is done and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which shouldn't differ from
 * the real device. It may skip rendering a frame entirely if a VRAM
 * transfer happens while a frame is waiting, or in games that natively
 * run at 60fps if frames are coming in too quickly to process.
 * Depending on how the game treats "60fps," this may not be
 * noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used || flushed) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		flushed = FALSE;
		hold_cmds = FALSE;
		needs_display = TRUE;
		gpu.state.fb_dirty = TRUE;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = TRUE;
		needs_display = TRUE;
		gpu.state.fb_dirty = FALSE;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is TRUE. */
		hold_cmds = FALSE;
		needs_display = FALSE;
		gpu.state.fb_dirty = TRUE;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}