Add a threaded renderer
[pcsx_rearmed.git] plugins/gpulib/gpulib_thread_if.c
/**************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpulib_thread_if.h"

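/* One queued batch of GPU commands: a heap-allocated copy of the display
 * list slice, its word count, and the last opcode seen while scanning. */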
typedef struct {
	uint32_t *cmd_list;
	int count;
	int last_cmd;
} video_thread_cmd;

#define QUEUE_SIZE 0x2000

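/* Fixed-size ring buffer of command batches. start/end are the consumer
 * and producer indices; used tracks how many slots are occupied. */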
typedef struct {
	size_t start;
	size_t end;
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

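/* Worker thread state. Two queues are kept: `queue` is what the render
 * thread is draining now, `bg_queue` holds commands for a future frame
 * while the current one finishes (see renderer_notify_update_lace). */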
typedef struct {
	pthread_t thread;
	pthread_mutex_t queue_lock;
	pthread_cond_t cond_msg_avail;
	pthread_cond_t cond_msg_done;
	pthread_cond_t cond_queue_empty;
	video_thread_queue *queue;
	video_thread_queue *bg_queue;
	bool running;
} video_thread_state;

static video_thread_state thread;
static video_thread_queue queues[2];
static int thread_rendering;
static bool hold_cmds;
static bool needs_display;

extern const unsigned char cmd_lengths[];

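/* Render thread entry point: block until commands are queued, claim a
 * contiguous run of slots, replay it through the real renderer outside
 * the lock, then signal the waiters (cond_msg_done/cond_queue_empty). */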
static void *video_thread_main(void *arg) {
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;
	static int processed = 0;

	while(1) {
		int result, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

		/* Claim one contiguous run of slots; if the data wraps around
		 * the end of the ring, the rest is picked up on the next pass. */
		queue = thread->queue;
		start = queue->start;
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count, &last_cmd);

			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif
		}

		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return NULL;
}

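/* Promote the background queue to the live queue once the render thread
 * has drained the current one, and wake the thread up. No-op if the bg
 * queue is empty or the live queue is still busy. */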
static void cmd_queue_swap(void) {
	video_thread_queue *tmp;
	if (!thread.bg_queue->used) return;

	pthread_mutex_lock(&thread.queue_lock);
	if (!thread.queue->used) {
		tmp = thread.queue;
		thread.queue = thread.bg_queue;
		thread.bg_queue = tmp;
		needs_display = true;
		pthread_cond_signal(&thread.cond_msg_avail);
	}
	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for the main queue to completely finish. */
void renderer_wait(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used) {
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);

	while (thread.queue->used) {
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Waits for all GPU commands in both queues to finish, bringing VRAM
 * completely up-to-date. */
void renderer_sync(void) {
	if (!thread.running) return;

	/* Not completely safe, but should be fine since the render thread
	 * only decreases used, and we check again inside the lock. */
	if (!thread.queue->used && !thread.bg_queue->used) {
		return;
	}

	/* Flush both queues. This is necessary because gpulib could be
	 * trying to process a DMA write that a command in the queue should
	 * run beforehand. For example, Xenogears sprites write a black
	 * rectangle over the to-be-DMA'd spot in VRAM -- if this write
	 * happens after the DMA, it will overwrite the DMA'd data,
	 * resulting in flickering sprites. We need to be totally
	 * up-to-date. This may drop a frame. */
	renderer_wait();
	cmd_queue_swap();
	hold_cmds = false;
	renderer_wait();
}

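/* Tear the render thread down: drain both queues, join the thread,
 * destroy the sync primitives, and free every saved command list. */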
static void video_thread_stop(void) {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = false;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

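/* Spin up the worker: point the queues at their backing storage, init
 * the sync primitives, and launch video_thread_main. On any failure we
 * fall back to synchronous rendering. */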
static void video_thread_start(void) {
	fprintf(stdout, "Starting render thread\n");

	/* Set the queue pointers and running flag before pthread_create so
	 * the worker never observes half-initialized state, and so the
	 * error path (which walks both queues) never sees a NULL queue. */
	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];
	thread.running = true;

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	return;

 error:
	fprintf(stderr, "Failed to start rendering thread\n");
	thread.running = false;
	video_thread_stop();
}

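/* Copy a command list slice into a queue. Commands normally go to the
 * live queue (waking the worker); while hold_cmds is set they go to the
 * background queue, unlocked, since only this thread touches it. */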
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	bool lock;

	cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));

	if (!cmd_list) {
		/* Out of memory: disable the thread and run sync from now on.
		 * Process this list directly so its commands are not lost. */
		fprintf(stderr, "Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();
		real_do_cmd_list(list, count, &last_cmd);
		return;
	}

	memcpy(cmd_list, list, count * sizeof(uint32_t));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. */
		renderer_sync();
	}

	if (hold_cmds) {
		queue = thread.bg_queue;
		lock = false;
	} else {
		queue = thread.queue;
		lock = true;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	/* Reuse the slot: free whatever list a previous lap of the ring
	 * left behind before storing the new one. */
	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list);
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Slice off just the part of the list that can be handled async, and
 * update ex_regs. */
static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
{
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		cmd = list[0] >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
			case 0x02:
				/* VRAM fill: fixed length, safe to queue as-is */
				break;
			case 0x24 ... 0x27:
			case 0x2c ... 0x2f:
			case 0x34 ... 0x37:
			case 0x3c ... 0x3f:
				/* Textured polygons: latch the texpage attribute word
				 * (word 4, or word 5 for the shaded variants) into the
				 * e1 mirror */
				gpu.ex_regs[1] &= ~0x1ff;
				gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
				break;
			case 0x48 ... 0x4F:
				/* Poly-lines: variable length, scan for the terminator */
				for (v = 3; pos + v < count; v++)
				{
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 3;
				break;
			case 0x58 ... 0x5F:
				/* Shaded poly-lines: two words per vertex */
				for (v = 4; pos + v < count; v += 2)
				{
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 4;
				break;
			default:
				/* e0-e7 draw environment commands are latched into the
				 * ex_regs mirror */
				if ((cmd & 0xf8) == 0xe0)
					gpu.ex_regs[cmd & 7] = list[0];
				break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0xa0 <= cmd && cmd <= 0xdf)
			break; /* image i/o: must be handled synchronously */

		pos += len;
	}

	*last_cmd = cmd;
	return pos;
}

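/* gpulib's entry point for command list processing. When the render
 * thread is up, scan off the async-safe prefix and queue it; otherwise
 * fall through to the real renderer synchronously. */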
int do_cmd_list(uint32_t *list, int count, int *last_cmd) {
	int pos = 0;

	if (thread.running) {
		pos = scan_cmd_list(list, count, last_cmd);
		video_thread_queue_cmd(list, pos, *last_cmd);
	} else {
		pos = real_do_cmd_list(list, count, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
	}
	return pos;
}

int renderer_init(void) {
	if (thread_rendering) {
		video_thread_start();
	}
	return real_renderer_init();
}

void renderer_finish(void) {
	real_renderer_finish();

	if (thread_rendering && thread.running) {
		video_thread_stop();
	}
}

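/* Replay the queued e-command state (ecmds[1..6] hold the e1-e6 setup
 * commands). When threaded, they go through the normal queue so they
 * apply in order with everything else. */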
void renderer_sync_ecmds(uint32_t *ecmds) {
	if (thread.running) {
		int dummy;
		do_cmd_list(&ecmds[1], 6, &dummy);
	} else {
		real_renderer_sync_ecmds(ecmds);
	}
}

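/* The renderer reads VRAM when rebuilding its caches, so everything
 * queued must hit VRAM first. */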
void renderer_update_caches(int x, int y, int w, int h) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h);
}

void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run below 50/60fps this is
 * unnecessary -- it forces the game to render as if it were 60fps and
 * leaves the GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame).
 *
 * - If the frame is done and there are no future GPU commands, render
 *   it.
 *
 * - If we do have future GPU commands, it means the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may add a frame or two of latency, which shouldn't be
 * noticeably different from the real device. It may skip rendering a
 * frame entirely if a VRAM transfer happens while a frame is waiting,
 * or in games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		hold_cmds = false;
		needs_display = true;
		gpu.state.fb_dirty = true;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = true;
		needs_display = true;
		gpu.state.fb_dirty = false;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is true. */
		hold_cmds = false;
		needs_display = false;
		gpu.state.fb_dirty = true;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

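/* Runtime config change: start or stop the render thread to match the
 * requested THREAD_RENDERING_* mode (from frontend/plugin_lib.h). The
 * sync first makes the switch safe mid-frame. */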
void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;
	if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
		video_thread_start();
	} else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
		video_thread_stop();
	}
	real_renderer_set_config(cbs);
}

void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}