+ const uint32_t sx = LE32TOH(params[0]) & 0x3FF;
+ const uint32_t sy = (LE32TOH(params[0]) >> 16) & 0x1FF;
+ const uint32_t dx = LE32TOH(params[1]) & 0x3FF;
+ const uint32_t dy = (LE32TOH(params[1]) >> 16) & 0x1FF;
+ uint32_t w = ((LE32TOH(params[2]) - 1) & 0x3FF) + 1;
+ uint32_t h = (((LE32TOH(params[2]) >> 16) - 1) & 0x1FF) + 1;
+ uint16_t msb = gpu.ex_regs[6] << 15;
+ uint16_t lbuf[128];
+ uint32_t x, y;
+
+ *cpu_cycles += gput_copy(w, h);
+ if (sx == dx && sy == dy && msb == 0)
+ return;
+
+ renderer_flush_queues();
+
+ if (unlikely((sx < dx && dx < sx + w) || sx + w > 1024 || dx + w > 1024 || msb))
+ {
+ for (y = 0; y < h; y++)
+ {
+ const uint16_t *src = VRAM_MEM_XY(0, (sy + y) & 0x1ff);
+ uint16_t *dst = VRAM_MEM_XY(0, (dy + y) & 0x1ff);
+ for (x = 0; x < w; x += ARRAY_SIZE(lbuf))
+ {
+ uint32_t x1, w1 = w - x;
+ if (w1 > ARRAY_SIZE(lbuf))
+ w1 = ARRAY_SIZE(lbuf);
+ for (x1 = 0; x1 < w1; x1++)
+ lbuf[x1] = src[(sx + x + x1) & 0x3ff];
+ for (x1 = 0; x1 < w1; x1++)
+ dst[(dx + x + x1) & 0x3ff] = lbuf[x1] | msb;
+ }
+ }
+ }
+ else
+ {
+ uint32_t sy1 = sy, dy1 = dy;
+ for (y = 0; y < h; y++, sy1++, dy1++)
+ memcpy(VRAM_MEM_XY(dx, dy1 & 0x1ff), VRAM_MEM_XY(sx, sy1 & 0x1ff), w * 2);
+ }
+
+ renderer_update_caches(dx, dy, w, h, 0);
+}
+
+/* Scan a GP0 command list while a frame is being skipped: consume commands
+ * without rendering, but keep shadowed GPU state (gpu.ex_regs, pending fill)
+ * consistent so rendering can resume correctly afterwards.
+ * Returns the number of 32-bit words consumed from data[]; *last_cmd receives
+ * the last command byte seen, or -1 if the final command was truncated. */
+static noinline int do_cmd_list_skip(uint32_t *data, int count, int *last_cmd)
+{
+ int cmd = 0, pos = 0, len, dummy = 0, v;
+ int skip = 1;
+
+ gpu.frameskip.pending_fill[0] = 0;
+
+ while (pos < count && skip) {
+ uint32_t *list = data + pos;
+ cmd = LE32TOH(list[0]) >> 24;
+ /* base length of this command; variable-length polylines extend it below */
+ len = 1 + cmd_lengths[cmd];
+
+ switch (cmd) {
+ case 0x02:
+ /* fill rectangle: if it covers more than the visible screen it likely
+ * matters beyond this frame, so execute it now; otherwise defer it as
+ * a pending fill to be applied when skipping ends */
+ if ((LE32TOH(list[2]) & 0x3ff) > gpu.screen.w || ((LE32TOH(list[2]) >> 16) & 0x1ff) > gpu.screen.h)
+ // clearing something large, don't skip
+ do_cmd_list(list, 3, &dummy, &dummy, &dummy);
+ else
+ memcpy(gpu.frameskip.pending_fill, list, 3 * 4);
+ break;
+ case 0x24 ... 0x27:
+ case 0x2c ... 0x2f:
+ case 0x34 ... 0x37:
+ case 0x3c ... 0x3f:
+ /* textured polygons: latch the low 9 bits of the texpage word into the
+ * shadowed e1 register; the word sits at index 4 or 5 depending on
+ * bit 4 of the command (presumably the gouraud-shading bit — confirm) */
+ gpu.ex_regs[1] &= ~0x1ff;
+ gpu.ex_regs[1] |= LE32TOH(list[4 + ((cmd >> 4) & 1)]) & 0x1ff;
+ break;
+ case 0x48 ... 0x4F:
+ /* flat-shaded polyline: scan for the 0x5xxx5xxx terminator word and
+ * extend len by the extra vertices found */
+ for (v = 3; pos + v < count; v++)
+ {
+ if ((list[v] & HTOLE32(0xf000f000)) == HTOLE32(0x50005000))
+ break;
+ }
+ len += v - 3;
+ break;
+ case 0x58 ... 0x5F:
+ /* gouraud-shaded polyline: two words per vertex, terminator checked on
+ * every second word */
+ for (v = 4; pos + v < count; v += 2)
+ {
+ if ((list[v] & HTOLE32(0xf000f000)) == HTOLE32(0x50005000))
+ break;
+ }
+ len += v - 4;
+ break;
+ default:
+ /* e3 (drawing area top-left) drives the heuristic for whether skipping
+ * may continue; all e0-e7 writes are shadowed in ex_regs */
+ if (cmd == 0xe3)
+ skip = decide_frameskip_allow(LE32TOH(list[0]));
+ if ((cmd & 0xf8) == 0xe0)
+ gpu.ex_regs[cmd & 7] = LE32TOH(list[0]);
+ break;
+ }
+
+ /* command extends past the buffered data: report it as incomplete */
+ if (pos + len > count) {
+ cmd = -1;
+ break; // incomplete cmd
+ }
+ /* VRAM image transfer commands are not skippable; hand back to caller */
+ if (0x80 <= cmd && cmd <= 0xdf)
+ break; // image i/o
+
+ pos += len;
+ }
+
+ /* push the shadowed e-register state to the renderer before returning */
+ renderer_sync_ecmds(gpu.ex_regs);
+ *last_cmd = cmd;
+ return pos;
+}
+
+static noinline int do_cmd_buffer(uint32_t *data, int count,
+ int *cycles_sum, int *cycles_last)
+{
+ int cmd, pos;
+ uint32_t old_e3 = gpu.ex_regs[3];