From c55a44a88c217900cd4f56f164f14cb680f7597a Mon Sep 17 00:00:00 2001
From: kub <derkub@gmail.com>
Date: Sat, 14 Mar 2020 19:52:27 +0100
Subject: [PATCH] vdp fifo speed optimization

---
 pico/misc.c      | 129 ++++++++++++++++++++++++++++++
 pico/pico.c      |   1 +
 pico/pico_cmn.c  |   2 +
 pico/pico_int.h  |   3 +
 pico/videoport.c | 200 ++++++++++++++++++-----------------------------
 5 files changed, 210 insertions(+), 125 deletions(-)

diff --git a/pico/misc.c b/pico/misc.c
index 4837fd3e..74d4d8a8 100644
--- a/pico/misc.c
+++ b/pico/misc.c
@@ -48,6 +48,135 @@ const unsigned char hcounts_32[] = {
 0x82,0x83,0x83,0x84,0x85,0x85,0x86,0x87,0x87,0x88,0x89,0x8a,0x8a,0x8b,0x8c,0x8c,
 };
 
+// VDP transfer slots for blanked and active display in 32col and 40col mode.
+// 1 slot is 488/171 = 2.8538 68k cycles in h32, and 488/210 = 2.3238 in h40
+// In blanked display, all slots but 5(h32) / 6(h40) are usable for transfers,
+// in active display only 16(h32) / 18(h40) slots can be used.
+
+// XXX inactive tables are computed as slot# = cycles*maxslot#/488; this should go through the hv tables instead
+// VDP transfer slots in inactive (blanked) display 32col mode.
+// refresh slots: 250, 26, 58, 90, 122 -> 32, 64, 96, 128, 160
+const unsigned char vdpcyc2sl_32_bl[] = { // 68k cycles/2 to slot #
+//  0   2   4   6   8  10  12  14  16  18  20  22  24  26  28  30
+    0,  0,  1,  2,  2,  3,  4,  4,  5,  6,  6,  7,  8,  8,  9, 10,
+   10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 19, 20, 21,
+   21, 22, 23, 23, 24, 25, 25, 26, 27, 27, 28, 29, 29, 30, 31, 31,
+   32, 33, 34, 34, 35, 36, 36, 37, 38, 38, 39, 40, 40, 41, 42, 42,
+   43, 44, 44, 45, 46, 46, 47, 48, 48, 49, 50, 51, 51, 52, 53, 53,
+   54, 55, 55, 56, 57, 57, 58, 59, 59, 60, 61, 61, 62, 63, 63, 64,
+   65, 65, 66, 67, 68, 68, 69, 70, 70, 71, 72, 72, 73, 74, 74, 75,
+   76, 76, 77, 78, 78, 79, 80, 80, 81, 82, 83, 83, 84, 85, 85, 86,
+   87, 87, 88, 89, 89, 90, 91, 91, 92, 93, 93, 94, 95, 95, 96, 97,
+   97, 98, 99,100,100,101,102,102,103,104,104,105,106,106,107,108,
+  108,109,110,110,111,112,112,113,114,114,115,116,117,117,118,119,
+  119,120,121,121,122,123,123,124,125,125,126,127,127,128,129,129,
+  130,131,131,132,133,134,134,135,136,136,137,138,138,139,140,140,
+  141,142,142,143,144,144,145,146,146,147,148,148,149,150,151,151,
+  152,153,153,154,155,155,156,157,157,158,159,159,160,161,161,162,
+  163,163,164,165,166,166,167,168,168,169,170,170,171,172,172,173,
+};
+// VDP transfer slots in inactive (blanked) display 40col mode.
+// refresh slots: 250, 26, 58, 90, 122, 154 -> 40, 72, 104, 136, 168, 200
+const unsigned char vdpcyc2sl_40_bl[] = { // 68k cycles/2 to slot #
+//  0   2   4   6   8  10  12  14  16  18  20  22  24  26  28  30
+    0,  0,  1,  2,  3,  4,  5,  5,  6,  7,  8,  9, 10, 10, 11, 12,
+   13, 14, 15, 15, 16, 17, 18, 19, 20, 20, 21, 22, 23, 24, 25, 25,
+   26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 35, 35, 36, 37, 38, 39,
+   40, 40, 41, 42, 43, 44, 45, 45, 46, 47, 48, 49, 50, 51, 51, 52,
+   53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 66,
+   66, 67, 68, 69, 70, 71, 71, 72, 73, 74, 75, 76, 76, 77, 78, 79,
+   80, 81, 81, 82, 83, 84, 85, 86, 86, 87, 88, 89, 90, 91, 91, 92,
+   93, 94, 95, 96, 96, 97, 98, 99,100,101,102,102,103,104,105,106,
+  107,107,108,109,110,111,112,112,113,114,115,116,117,117,118,119,
+  120,121,122,122,123,124,125,126,127,127,128,129,130,131,132,132,
+  133,134,135,136,137,137,138,139,140,141,142,142,143,144,145,146,
+  147,147,148,149,150,151,152,153,153,154,155,156,157,158,158,159,
+  160,161,162,163,163,164,165,166,167,168,168,169,170,171,172,173,
+  173,174,175,176,177,178,178,179,180,181,182,183,183,184,185,186,
+  187,188,188,189,190,191,192,193,193,194,195,196,197,198,198,199,
+  200,201,202,203,204,204,205,206,207,208,209,209,210,211,212,213,
+};
+// VDP transfer slots in active display 32col mode. Transfer slots (Hint=0):
+// 11,25,40,48,56,72,80,88,104,112,120,136,144,152,167,168
+const unsigned char vdpcyc2sl_32[] = { // 68k cycles/2 to slot #
+//  0   2   4   6   8  10  12  14  16  18  20  22  24  26  28  30
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,
+    5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+    6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+    8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10,
+   10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
+   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+   11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13,
+   13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14,
+   14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15,
+   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+};
+// VDP transfer slots in active display 40col mode. Transfer slots (Hint=0):
+// 21,47,55,63,79,87,95,111,119,127,143,151,159,175,183,191,206,207
+const unsigned char vdpcyc2sl_40[] = { // 68k cycles/2 to slot #
+//  0   2   4   6   8  10  12  14  16  18  20  22  24  26  28  30
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, //   0
+    0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1, //  32
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, //  64
+    1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2, //  96
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4, // 128
+    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5, // 160
+    5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7, // 192
+    7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, // 224
+    7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, // 256
+    9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, // 288
+   10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, // 320
+   12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, // 352
+   13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, // 384
+   14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, // 416
+   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, // 448
+   18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, // 480
+};
+
+// XXX inactive tables are computed as cyc = slot#*488/maxslot#; this should go through the hv tables instead
+const unsigned short vdpsl2cyc_32_bl[] = { // slot # to 68k cycles/2
+     0,  2,  3,  5,  6,  8,  9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
+    24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45, 46,
+    48, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 64, 65, 67, 68, 70,
+    71, 73, 74, 75, 77, 78, 80, 81, 83, 84, 86, 87, 89, 90, 92, 93,
+    95, 96, 98, 99,100,102,103,105,106,108,109,111,112,114,115,117,
+   118,120,121,122,124,125,127,128,130,131,133,134,136,137,139,140,
+   142,143,145,146,147,149,150,152,153,155,156,158,159,161,162,164,
+   165,167,168,170,171,172,174,175,177,178,180,181,183,184,186,187,
+   189,190,192,193,195,196,197,199,200,202,203,205,206,208,209,211,
+   212,214,215,217,218,220,221,222,224,225,227,228,230,231,233,234,
+   236,237,239,240,242,243,244,246,
+};
+const unsigned short vdpsl2cyc_40_bl[] = { // slot # to 68k cycles/2
+     0,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 14, 15, 16, 17, 18,
+    20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 38,
+    39, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 56, 57,
+    58, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 75, 76,
+    77, 78, 79, 81, 82, 83, 84, 85, 87, 88, 89, 90, 91, 93, 94, 95,
+    96, 97, 99,100,101,102,103,105,106,107,108,109,111,112,113,114,
+   115,117,118,119,120,121,122,124,125,126,127,128,130,131,132,133,
+   134,136,137,138,139,140,142,143,144,145,146,148,149,150,151,152,
+   154,155,156,157,158,160,161,162,163,164,166,167,168,169,170,172,
+   173,174,175,176,178,179,180,181,182,183,185,186,187,188,189,191,
+   192,193,194,195,197,198,199,200,201,203,204,205,206,207,209,210,
+   211,212,213,215,216,217,218,219,221,222,223,224,225,227,228,229,
+   230,231,233,234,235,236,237,239,240,241,242,243,244,246,
+};
+const unsigned short vdpsl2cyc_32[] = { // slot # to 68k cycles/2
+    0, 16, 36, 56, 67, 79,102,113,125,148,159,171,194,205,217,239,
+  240,260
+};
+const unsigned short vdpsl2cyc_40[] = { // slot # to 68k cycles/2
+    0, 24, 55, 64, 73, 92,101,110,129,138,147,166,175,184,203,212,
+  221,239,240,268
+};
+
 #ifndef _ASM_MISC_C
 PICO_INTERNAL_ASM void memcpy16bswap(unsigned short *dest, void *src, int count)
 {
diff --git a/pico/pico.c b/pico/pico.c
index 9db2fc64..87e22e59 100644
--- a/pico/pico.c
+++ b/pico/pico.c
@@ -79,6 +79,7 @@ void PicoPower(void)
   Pico.video.reg[0] = Pico.video.reg[1] = 0x04;
   Pico.video.reg[0xc] = 0x81;
   Pico.video.reg[0xf] = 0x02;
+  PicoVideoFIFOMode(0, 1);
 
   if (PicoIn.AHW & PAHW_MCD)
     PicoPowerMCD();
diff --git a/pico/pico_cmn.c b/pico/pico_cmn.c
index 50a632ca..017c404b 100644
--- a/pico/pico_cmn.c
+++ b/pico/pico_cmn.c
@@ -179,6 +179,7 @@ static int PicoFrameHints(void)
   }
 
   pv->status |= SR_VB | PVS_VB2; // go into vblank
+  PicoVideoFIFOMode(pv->reg[1]&0x40, pv->reg[12]&1);
 
   // the following SekRun is there for several reasons:
   // there must be a delay after vblank bit is set and irq is asserted (Mazin Saga)
@@ -270,6 +271,7 @@ static int PicoFrameHints(void)
 
   pv->status &= ~(SR_VB | PVS_VB2);
   pv->status |= ((pv->reg[1] >> 3) ^ SR_VB) & SR_VB; // forced blanking
+  PicoVideoFIFOMode(pv->reg[1]&0x40, pv->reg[12]&1);
 
   // last scanline
   Pico.m.scanline = y++;
diff --git a/pico/pico_int.h b/pico/pico_int.h
index 65b56f1d..c0f2c343 100644
--- a/pico/pico_int.h
+++ b/pico/pico_int.h
@@ -299,6 +299,8 @@ extern SH2 sh2s[2];
 #define PVS_CPUWR     (1 << 18) // CPU write blocked by FIFO full
 #define PVS_CPURD     (1 << 19) // CPU read blocked by FIFO not empty
 #define PVS_DMAFILL   (1 << 20) // DMA fill is waiting for fill data
+#define PVS_DMABG     (1 << 21) // background DMA operation is running
+#define PVS_FIFORUN   (1 << 22) // FIFO is processing
 
 struct PicoVideo
 {
@@ -858,6 +860,7 @@ unsigned char PicoVideoRead8HV_L(void);
 extern int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned int *mask);
 void PicoVideoFIFOSync(int cycles);
 int PicoVideoFIFOHint(void);
+void PicoVideoFIFOMode(int active, int h40);
 int PicoVideoFIFOWrite(int count, int byte_p, unsigned sr_mask, unsigned sr_flags);
 void PicoVideoSave(void);
 void PicoVideoLoad(void);
diff --git a/pico/videoport.c b/pico/videoport.c
index cbcea796..3ed7f5b4 100644
--- a/pico/videoport.c
+++ b/pico/videoport.c
@@ -12,8 +12,11 @@
 #define NEED_DMA_SOURCE
 #include "memory.h"
 
-extern const unsigned char  hcounts_32[];
-extern const unsigned char  hcounts_40[];
+extern const unsigned char  hcounts_32[], hcounts_40[];
+extern const unsigned char  vdpcyc2sl_32_bl[], vdpcyc2sl_40_bl[];
+extern const unsigned char  vdpcyc2sl_32[], vdpcyc2sl_40[];
+extern const unsigned short vdpsl2cyc_32_bl[], vdpsl2cyc_40_bl[];
+extern const unsigned short vdpsl2cyc_32[], vdpsl2cyc_40[];
 
 static int blankline;           // display disabled for this line
 static unsigned sat;            // VRAM addr of sprite attribute table
@@ -53,48 +56,6 @@ int (*PicoDmaHook)(unsigned int source, int len, unsigned short **base, unsigned
  * FIFORead executes a 68k read. 68k is blocked until the next transfer slot.
  */
 
-// FIFO transfer slots per line: [active][h40]
-static const short vdpslots[2][2] = {{ 166, 204 },{ 16, 18 }};
-// mapping between slot# and 68k cycles in a blanked scanline [H32, H40]
-static const int vdpcyc2sl_bl[] = { (166<<16)/488, (204<<16)/488 };
-static const int vdpsl2cyc_bl[] = { (488<<16)/166, (488<<16)/204 };
-
-// VDP transfer slots in active display 32col mode. 1 slot is 488/171 = 2.8538
-// 68k cycles. Only 16 of the 171 slots in a scanline can be used by CPU/DMA:
-// (HINT=slot 0): 11,25,40,48,56,72,80,88,104,112,120,136,144,152,167,168
-static const unsigned char vdpcyc2sl_32[] = { // 68k cycles/4 to slot #
-//  4  8 12 16 20 24 28 32 36 40 44 48 52 56 60
- 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
- 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5,
- 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9,10,
-10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,
-11,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,
-14,14,14,14,14,14,14,15,16,16,16,16,16,16,16,16,
-};
-static const unsigned char vdpsl2cyc_32[] = { // slot # to 68k cycles/4
-  0,  8, 18, 28, 33, 39, 51, 56, 62, 74, 79, 85, 97,102,108,119,120,130
-};
-
-// VDP transfer slots in active display 40col mode. 1 slot is 488/210 = 2.3238
-// 68k cycles. Only 18 of the 210 slots in a scanline can be used by CPU/DMA:
-// (HINT=0): 21,47,55,63,79,87,95,111,119,127,143,151,159,175,183,191,206,207,
-static const unsigned char vdpcyc2sl_40[] = { // 68k cycles/4 to slot #
-//  4  8 12 16 20 24 28 32 36 40 44 48 52 56 60
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5,
- 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 8, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,
-10,10,10,11,11,11,11,12,12,12,12,12,13,13,13,13,
-13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16,
-16,16,16,16,16,16,16,17,18,18,18,18,18,18,18,18,
-};
-static const unsigned char vdpsl2cyc_40[] = { // slot # to 68k cycles/4
-  0, 12, 27, 32, 36, 46, 50, 55, 64, 69, 73, 83, 87, 92,101,106,111,119,120,134
-};
-
 // NB code assumes fifo_* arrays have size 2^n
 // last transferred FIFO data, ...x = index  XXX currently only CPU
 static short fifo_data[4], fifo_dx; // XXX must go into save?
@@ -106,34 +67,10 @@ enum { FQ_BYTE = 1, FQ_BGDMA = 2, FQ_FGDMA = 4 }; // queue flags, NB: BYTE = 1!
 static unsigned int fifo_total;    // total# of pending FIFO entries (w/o BGDMA)
 
 static unsigned short fifo_slot;   // last executed slot in current scanline
+static unsigned short fifo_maxslot;// #slots in scanline
 
-// map cycles to FIFO slot
-static __inline int GetFIFOSlot(struct PicoVideo *pv, int cycles)
-{
-  int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40);
-  int h40 = pv->reg[12] & 1;
-
-  if (active)	return (h40 ? vdpcyc2sl_40 : vdpcyc2sl_32)[cycles/4];
-  else		return (cycles * vdpcyc2sl_bl[h40] + cycles) >> 16;
-}
-
-static __inline int GetMaxFIFOSlot(struct PicoVideo *pv)
-{
-  int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40);
-  int h40 = pv->reg[12] & 1;
-
-  return vdpslots[active][h40];
-}
-
-// map FIFO slot to cycles
-static __inline int GetFIFOCycles(struct PicoVideo *pv, int slot)
-{
-  int active = !(pv->status & SR_VB) && (pv->reg[1] & 0x40);
-  int h40 = pv->reg[12] & 1;
-
-  if (active)	return (h40 ? vdpsl2cyc_40 : vdpsl2cyc_32)[slot]*4;
-  else		return ((slot * vdpsl2cyc_bl[h40] + slot) >> 16);
-}
+static const unsigned char *fifo_cyc2sl;
+static const unsigned short *fifo_sl2cyc;
 
 // do the FIFO math
 static __inline int AdvanceFIFOEntry(struct PicoVideo *pv, int slots)
@@ -149,20 +86,16 @@ static __inline int AdvanceFIFOEntry(struct PicoVideo *pv, int slots)
 
   // if entry has been processed...
   if (pv->fifo_cnt == 0) {
-    if (fifo_ql) {
-      // terminate DMA if applicable
-      if ((pv->status & SR_DMA) && (fifo_queue[fifo_qx] & FQ_BGDMA)) {
-        pv->status &= ~SR_DMA;
-        pv->command &= ~0x80;
-      }
-      // remove entry from FIFO
+    // remove entry from FIFO
+    if (fifo_ql)
       fifo_qx ++, fifo_qx &= 7, fifo_ql --;
-    }
     // start processing for next entry if there is one
     if (fifo_ql)
       pv->fifo_cnt = (fifo_queue[fifo_qx] >> 3) << (fifo_queue[fifo_qx] & FQ_BYTE);
-    else
+    else { // FIFO empty
+      pv->status &= ~PVS_FIFORUN;
       fifo_total = 0;
+    }
   }
   return l;
 }
@@ -170,16 +103,20 @@ static __inline int AdvanceFIFOEntry(struct PicoVideo *pv, int slots)
 static __inline void SetFIFOState(struct PicoVideo *pv)
 {
   // release CPU and terminate DMA if FIFO isn't blocking the 68k anymore
-  if (fifo_total == 0)
-    pv->status &= ~PVS_CPURD;
   if (fifo_total <= 4) {
-    int x = (fifo_qx + fifo_ql - 1) & 7;
-    if ((pv->status & SR_DMA) && !(pv->status & PVS_DMAFILL) &&
-                (!fifo_ql || !(fifo_queue[x] & FQ_BGDMA))) {
+    pv->status &= ~PVS_CPUWR;
+    if (!(pv->status & (PVS_DMABG|PVS_DMAFILL))) {
       pv->status &= ~SR_DMA;
       pv->command &= ~0x80;
     }
-    pv->status &= ~PVS_CPUWR;
+  }
+  if (fifo_total == 0) {
+    pv->status &= ~PVS_CPURD;
+    // terminate DMA if applicable
+    if (!(pv->status & (PVS_FIFORUN|PVS_DMAFILL))) {
+      pv->status &= ~(SR_DMA|PVS_DMABG);
+      pv->command &= ~0x80;
+    }
   }
 }
 
@@ -190,7 +127,7 @@ void PicoVideoFIFOSync(int cycles)
   int slots, done;
 
   // calculate #slots since last executed slot
-  slots = GetFIFOSlot(pv, cycles) - fifo_slot;
+  slots = fifo_cyc2sl[cycles>>1] - fifo_slot;
 
   // advance FIFO queue by #done slots
   done = slots;
@@ -208,31 +145,28 @@ void PicoVideoFIFOSync(int cycles)
 int PicoVideoFIFODrain(int level, int cycles, int bgdma)
 {
   struct PicoVideo *pv = &Pico.video;
-  int maxsl = GetMaxFIFOSlot(pv); // max xfer slots in this scanline
+  unsigned ocyc = cycles;
   int burn = 0;
 
   // process FIFO entries until low level is reached
-  while (fifo_total > level && fifo_slot < maxsl &&
+  while (fifo_total > level && fifo_slot < fifo_maxslot &&
                  (!(fifo_queue[fifo_qx] & FQ_BGDMA) || bgdma)) {
     int b = fifo_queue[fifo_qx] & FQ_BYTE;
     int cnt = ((fifo_total-level) << b) - (pv->fifo_cnt & b);
-    int last = fifo_slot;
-    int slot = (pv->fifo_cnt < cnt ? pv->fifo_cnt : cnt) + last; // target slot
-    unsigned ocyc = cycles;
+    int slot = (pv->fifo_cnt<cnt ? pv->fifo_cnt:cnt) + fifo_slot; // target slot
 
-    if (slot > maxsl) {
+    if (slot > fifo_maxslot) {
       // target in later scanline, advance to eol
-      slot = maxsl;
+      slot = fifo_maxslot;
       cycles = 488;
     } else {
       // advance FIFO to target slot and CPU to cycles at that slot
-      cycles = GetFIFOCycles(pv, slot);
+      cycles = fifo_sl2cyc[slot]<<1;
     }
+    AdvanceFIFOEntry(pv, slot - fifo_slot);
     fifo_slot = slot;
-    burn += cycles - ocyc;
-
-    AdvanceFIFOEntry(pv, slot - last);
   }
+  burn = cycles - ocyc;
 
   SetFIFOState(pv);
 
@@ -246,17 +180,19 @@ int PicoVideoFIFORead(void)
   int lc = SekCyclesDone()-Pico.t.m68c_line_start;
   int burn = 0;
 
-  PicoVideoFIFOSync(lc);
+  if (pv->fifo_cnt) {
+    PicoVideoFIFOSync(lc);
+    // advance FIFO and CPU until FIFO is empty
+    burn = PicoVideoFIFODrain(0, lc, 1);
+    lc += burn;
+  }
 
-  // advance FIFO and CPU until FIFO is empty
-  burn = PicoVideoFIFODrain(0, lc, 1);
-  lc += burn;
   if (fifo_total > 0)
     pv->status |= PVS_CPURD; // target slot is in later scanline
   else {
     // use next VDP access slot for reading, block 68k until then
-    fifo_slot = GetFIFOSlot(pv, lc) + 1;
-    burn += GetFIFOCycles(pv, fifo_slot) - lc;
+    fifo_slot = fifo_cyc2sl[lc>>1] + 1;
+    burn += (fifo_sl2cyc[fifo_slot]<<1) - lc;
   }
 
   return burn;
@@ -267,35 +203,41 @@ int PicoVideoFIFOWrite(int count, int flags, unsigned sr_mask,unsigned sr_flags)
 {
   struct PicoVideo *pv = &Pico.video;
   int lc = SekCyclesDone()-Pico.t.m68c_line_start;
-  int burn = 0, x;
+  int burn = 0, x, head = 0;
 
-  PicoVideoFIFOSync(lc);
+  if (pv->fifo_cnt)
+    PicoVideoFIFOSync(lc);
   pv->status = (pv->status & ~sr_mask) | sr_flags;
 
   if (count && fifo_ql < 8) {
     // update FIFO state if it was empty
     if (fifo_ql == 0) {
-      fifo_slot = GetFIFOSlot(pv, lc+9); // FIFO latency ~3 vdp slots
+      fifo_slot = fifo_cyc2sl[(lc+8)>>1]; // FIFO latency ~3 vdp slots
       pv->fifo_cnt = count << (flags & FQ_BYTE);
+      pv->status |= PVS_FIFORUN;
     }
 
-    // create xfer queue entry
+    // determine queue position for entry
     x = (fifo_qx + fifo_ql - 1) & 7;
     if (fifo_ql && (fifo_queue[x] & FQ_BGDMA)) {
       // CPU FIFO writes have priority over a background DMA Fill/Copy
       fifo_queue[(x+1) & 7] = fifo_queue[x];
-      if (fifo_ql == 1) {
+      if (x == fifo_qx) { // overtaking to queue head?
         // XXX if interrupting a DMA fill, fill data changes
         int f = fifo_queue[x] & 7;
         fifo_queue[(x+1) & 7] = (pv->fifo_cnt >> (f & FQ_BYTE) << 3) | f;
         pv->fifo_cnt = count << (flags & FQ_BYTE);
+        head = 1;
       }
       x = (x-1) & 7;
     }
-    if (fifo_ql && (fifo_queue[x] & 7) == flags) {
+
+    // create xfer queue entry
+    if (fifo_ql && !head && (fifo_queue[x] & 7) == flags) {
       // amalgamate entries if of same type
       fifo_queue[x] += (count << 3);
-      if (fifo_ql == 1) pv->fifo_cnt += count << (flags & FQ_BYTE);
+      if (x == fifo_qx) // modifying fifo head, adjust count
+        pv->fifo_cnt += count << (flags & FQ_BYTE);
     } else {
       fifo_ql ++;
       x = (x+1) & 7;
@@ -331,20 +273,25 @@ int PicoVideoFIFOHint(void)
 }
 
 // switch FIFO mode between active/inactive display
-static void PicoVideoFIFOMode(int active)
+void PicoVideoFIFOMode(int active, int h40)
 {
+  static const unsigned char *vdpcyc2sl[2][2] =
+        { {vdpcyc2sl_32_bl, vdpcyc2sl_40_bl} , {vdpcyc2sl_32, vdpcyc2sl_40} };
+  static const unsigned short *vdpsl2cyc[2][2] =
+        { {vdpsl2cyc_32_bl, vdpsl2cyc_40_bl} , {vdpsl2cyc_32, vdpsl2cyc_40} };
+
   struct PicoVideo *pv = &Pico.video;
-  int h40 = pv->reg[12] & 1;
   int lc = SekCyclesDone() - Pico.t.m68c_line_start;
+  active = active && !(pv->status & PVS_VB2);
 
-  PicoVideoFIFOSync(lc);
+  if (fifo_maxslot)
+    PicoVideoFIFOSync(lc);
 
-  if (fifo_ql) {
-    // recalculate FIFO slot for new mode
-    if (!(pv->status & SR_VB) && active)
-          fifo_slot = (pv->reg[12]&1 ? vdpcyc2sl_40 : vdpcyc2sl_32)[lc/4];
-    else  fifo_slot = ((lc * vdpcyc2sl_bl[h40] + lc) >> 16);
-  }
+  fifo_cyc2sl = vdpcyc2sl[active][h40];
+  fifo_sl2cyc = vdpsl2cyc[active][h40];
+  // recalculate FIFO slot for new mode
+  fifo_slot = fifo_cyc2sl[lc>>1]-1;
+  fifo_maxslot = fifo_cyc2sl[488>>1];
 }
 
 
@@ -459,7 +406,7 @@ static void DmaSlow(int len, unsigned int source)
     SekCyclesDone(), SekPc);
 
   SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_FGDMA | (Pico.video.type == 1),
-                              0, SR_DMA| PVS_CPUWR));
+                              PVS_DMABG, SR_DMA | PVS_CPUWR));
 
   if ((source & 0xe00000) == 0xe00000) { // Ram
     base = (u16 *)PicoMem.ram;
@@ -583,13 +530,13 @@ static void DmaCopy(int len)
   int source;
   elprintf(EL_VDPDMA, "DmaCopy len %i [%u]", len, SekCyclesDone());
 
+  // XXX implement VRAM 128k? Is this even working? xfer/count still FQ_BYTE?
   SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_BGDMA | FQ_BYTE,
-                              PVS_CPUWR, SR_DMA));
+                              PVS_CPUWR, SR_DMA | PVS_DMABG));
 
   source =Pico.video.reg[0x15];
   source|=Pico.video.reg[0x16]<<8;
 
-  // XXX implement VRAM 128k? Is this even working? count still in bytes?
   for (; len; len--)
   {
     vr[(u16)a] = vr[(u16)(source++)];
@@ -616,7 +563,7 @@ static NOINLINE void DmaFill(int data)
   elprintf(EL_VDPDMA, "DmaFill len %i inc %i [%u]", len, inc, SekCyclesDone());
 
   SekCyclesBurnRun(PicoVideoFIFOWrite(len, FQ_BGDMA | (Pico.video.type == 1),
-                              PVS_CPUWR | PVS_DMAFILL, SR_DMA));
+                              PVS_CPUWR | PVS_DMAFILL, SR_DMA | PVS_DMABG));
 
   switch (Pico.video.type)
   {
@@ -823,11 +770,13 @@ PICO_INTERNAL_ASM void PicoVideoWrite(unsigned int a,unsigned short d)
         if (num == 0 && !(pvid->reg[0]&2) && (d&2))
           pvid->hv_latch = PicoVideoRead(0x08);
         if (num == 1 && ((pvid->reg[1]^d)&0x40)) {
-          PicoVideoFIFOMode(d & 0x40);
+          PicoVideoFIFOMode(d & 0x40, pvid->reg[12]&1);
           // handle line blanking before line rendering
           if (SekCyclesDone() - Pico.t.m68c_line_start <= 488-390)
             blankline = d&0x40 ? -1 : Pico.m.scanline;
         }
+        if (num == 12 && ((pvid->reg[12]^d)&0x01))
+          PicoVideoFIFOMode(pvid->reg[1]&0x40, d & 1);
         DrawSync(SekCyclesDone() - Pico.t.m68c_line_start <= 488-390);
         pvid->reg[num]=(unsigned char)d;
         switch (num)
@@ -1058,6 +1007,7 @@ void PicoVideoLoad(void)
 
   // convert former dma_xfers (why was this in PicoMisc anyway?)
   if (Pico.m.dma_xfers) {
+    pv->status = SR_DMA|PVS_FIFORUN;
     pv->fifo_cnt = Pico.m.dma_xfers * (pv->type == 1 ? 2 : 1);
     fifo_total = Pico.m.dma_xfers;
     Pico.m.dma_xfers = 0;
-- 
2.39.5