core vdp, arm rendering speed optimisation
author kub <derkub@gmail.com>
Thu, 18 Jul 2024 19:36:43 +0000 (21:36 +0200)
committer kub <derkub@gmail.com>
Thu, 18 Jul 2024 19:36:43 +0000 (21:36 +0200)
pico/draw.c
pico/draw_arm.S

index 0f9b137..387f3a4 100644 (file)
@@ -304,34 +304,34 @@ TileFlipMakerAS(TileFlipSH_AS_and, pix_sh_as_and)
 // --------------------------------------------\r
 \r
 #ifndef _ASM_DRAW_C\r
-#define DrawTile(mask) { \\r
-  if (code!=oldcode) { \\r
-    oldcode = code; \\r
- \\r
-    pack = 0; \\r
-    if (code != blank) { \\r
-      /* Get tile address/2: */\\r
-      u32 addr = ((code&0x7ff)<<4) + ty; \\r
-      if (code & 0x1000) addr ^= 0xe; /* Y-flip */ \\r
- \\r
-      pal = ((code>>9)&0x30) | sh; /* shadow */ \\r
- \\r
-      pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \\r
-      if (!pack) \\r
-        blank = code; \\r
-    } \\r
-  } \\r
- \\r
-  if (code & 0x8000) { /* (un-forced) high priority tile */ \\r
-    if (sh | (pack&mask)) { \\r
-      code |= (dx<<16) | (ty<<25); \\r
-      if (code & 0x1000) code ^= 0xe<<25; \\r
-      *hc++ = code, *hc++ = pack&mask; /* cache it */ \\r
-    } \\r
-  } else if (pack&mask) { \\r
-    if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \\r
-    else               TileNorm(pd + dx, pack&mask, pal); \\r
-  } \\r
+#define DrawTile(mask) {                                               \\r
+  if (code!=oldcode) {                                                 \\r
+    oldcode = code;                                                    \\r
                                                                      \\r
+    pack = 0;                                                          \\r
+    if (code != blank) {                                               \\r
+      /* Get tile address/2: */                                                \\r
+      u32 addr = ((code&0x7ff)<<4) + ty;                               \\r
+      if (code & 0x1000) addr ^= 0xe; /* Y-flip */                     \\r
                                                                      \\r
+      pal = ((code>>9)&0x30) | sh; /* shadow */                                \\r
                                                                      \\r
+      pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr));                   \\r
+      if (!pack)                                                       \\r
+        blank = code;                                                  \\r
+    }                                                                  \\r
+  }                                                                    \\r
                                                                      \\r
+  if (code & 0x8000) { /* (un-forced) high priority tile */            \\r
+    if (sh | (pack&mask)) {                                            \\r
+      code |= (dx<<16) | (ty<<25);                                     \\r
+      if (code & 0x1000) code ^= 0xe<<25;                              \\r
+      *hc++ = code, *hc++ = pack&mask; /* cache it */                  \\r
+    }                                                                  \\r
+  } else if (pack&mask) {                                              \\r
+    if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal);              \\r
+    else               TileNorm(pd + dx, pack&mask, pal);              \\r
+  }                                                                    \\r
 }\r
 \r
 static void DrawStrip(struct TileStrip *ts, int lflags, int cellskip)\r
@@ -478,34 +478,34 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip)
 }\r
 #endif\r
 \r
-#define DrawTileInterlace(mask) { \\r
-  if (code!=oldcode) { \\r
-    oldcode = code; \\r
- \\r
-    pack = 0; \\r
-    if (code != blank) { \\r
-      /* Get tile address/2: */ \\r
-      u32 addr = ((code&0x3ff)<<5) + ty; \\r
-      if (code & 0x1000) addr ^= 0x1e; /* Y-flip */ \\r
- \\r
-      pal = ((code>>9)&0x30) | sh; /* shadow */ \\r
- \\r
-      pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \\r
-      if (!pack) \\r
-        blank = code; \\r
-    } \\r
-  } \\r
- \\r
-  if (code & 0x8000) { /* high priority tile */ \\r
-    if (sh | (pack&mask)) { \\r
-      code = (code&0xfc00) | ((code&0x3ff)<<1) | (dx<<16) | (ty<<25); \\r
-      if (code & 0x1000) code ^= 0x1e<<25; \\r
-      *hc++ = code, *hc++ = pack&mask; /* cache it */ \\r
-    } \\r
-  } else if (pack&mask) { \\r
-    if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \\r
-    else               TileNorm(pd + dx, pack&mask, pal); \\r
-  } \\r
+#define DrawTileInterlace(mask) {                                      \\r
+  if (code!=oldcode) {                                                 \\r
+    oldcode = code;                                                    \\r
                                                                      \\r
+    pack = 0;                                                          \\r
+    if (code != blank) {                                               \\r
+      /* Get tile address/2: */                                                \\r
+      u32 addr = ((code&0x3ff)<<5) + ty;                               \\r
+      if (code & 0x1000) addr ^= 0x1e; /* Y-flip */                    \\r
                                                                      \\r
+      pal = ((code>>9)&0x30) | sh; /* shadow */                                \\r
                                                                      \\r
+      pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr));                   \\r
+      if (!pack)                                                       \\r
+        blank = code;                                                  \\r
+    }                                                                  \\r
+  }                                                                    \\r
                                                                      \\r
+  if (code & 0x8000) { /* high priority tile */                                \\r
+    if (sh | (pack&mask)) {                                            \\r
+      code = (code&0xfc00) | ((code&0x3ff)<<1) | (dx<<16) | (ty<<25);  \\r
+      if (code & 0x1000) code ^= 0x1e<<25;                             \\r
+      *hc++ = code, *hc++ = pack&mask; /* cache it */                  \\r
+    }                                                                  \\r
+  } else if (pack&mask) {                                              \\r
+    if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal);              \\r
+    else               TileNorm(pd + dx, pack&mask, pal);              \\r
+  }                                                                    \\r
 }\r
 \r
 #ifndef _ASM_DRAW_C\r
index 7a2e6f1..40d5c44 100644 (file)
@@ -433,7 +433,7 @@ DrawLayer:
     movs    r3, r9, lsl #1  @ (force[31]|sh[30]) << 1\r
     mov     r3, #0\r
     orrmi   r10,r10, #1<<23 @ r10=cells[31:24]|sh[23]|hi_not_empty[22]\r
-@    orrcc   r10,r10, #1<<20 @   |had_output[21]|!force[20]|hscroll[19:17]|ty[15:0]\r
+@    orrcc   r10,r10, #1<<20 @   |had_output[21]|!force[20]|hscroll[18:16]|ty[15:0]\r
     movmi   r3, #0x80       @ default to shadowed pal on sh mode\r
 \r
     and     r4, r7, #7\r
@@ -452,7 +452,7 @@ DrawLayer:
     mvn     r9, #0               @ r9=prevcode=-1\r
     add     r1, r11, r7          @ r1=pdest\r
 \r
-    @ r10=cells[31:24]|sh[23]|hi_not_empty[22]|had_output[21]|!force[20]|hscroll[19:17]|ty[15:0]\r
+    @ r10=cells[31:24]|sh[23]|hi_not_empty[22]|had_output[21]|!force[20]|hscroll[18:16]|ty[15:0]\r
     @ r1=pd+dx r2=pack r3=pal r5=xmask r6=hc r8=tilex r9=prevcode r11=HighCol r12=nametab lr=vram\r
     @ r4 & r7 are scratch in this loop\r
 \r
@@ -467,21 +467,19 @@ DrawLayer:
 \r
     add     r8, r8, #1\r
 \r
-    movs    r2, r9, lsl #20 @ if (code&0x1000)\r
-    mov     r2, r2, lsl #1\r
+    tst     r9, #0x1000     @ if (code&0x1000)\r
+    mov     r2, r9, lsl #21\r
     add     r2, r2, r10, lsl #17\r
-    mov     r2, r2, lsr #17\r
-    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;\r
+    eorne   r2, r2, #0xe<<17 @ if (code&0x1000) addr^=0xe;\r
 \r
-    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels\r
+    ldr     r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels\r
 \r
     mvn     r7, #0\r
     mov     r4, r4, lsr #16-2  @ (dx&7)*4\r
     tst     r9, #0x0800\r
     moveq   r7, r7, lsl r4     @ mask = ~0 [shift] (dx&7)*4\r
     movne   r7, r7, lsr r4\r
-    mvn     r7, r7, ror #16\r
-    and     r2, r2, r7         @ pack&mask\r
+    bic     r2, r2, r7, ror #16 @ pack&~mask\r
 \r
     orr     r9, r9, #0x80000000 @ invalidate oldcode since pack is masked\r
     b       .DrawStrip_samecode\r
@@ -504,13 +502,12 @@ DrawLayer:
 \r
     mov     r9, r7          @ remember code\r
 \r
-    movs    r2, r9, lsl #20 @ if (code&0x1000)\r
-    mov     r2, r2, lsl #1\r
+    tst     r9, #0x1000     @ if (code&0x1000)\r
+    mov     r2, r9, lsl #21\r
     add     r2, r2, r10, lsl #17\r
-    mov     r2, r2, lsr #17\r
-    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;\r
+    eorne   r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe;\r
 \r
-    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels\r
+    ldr     r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels\r
 .DrawStrip_samecode:\r
     tst     r9, #0x8000\r
 @    tstne   r10, #1<<20     @ !force[20]\r
@@ -577,21 +574,19 @@ DrawLayer:
 \r
     add     r1, r1, #8\r
 \r
-    movs    r2, r9, lsl #20 @ if (code&0x1000)\r
-    mov     r2, r2, lsl #1\r
+    tst     r9, #0x1000     @ if (code&0x1000)\r
+    mov     r2, r9, lsl #21\r
     add     r2, r2, r10, lsl #17\r
-    mov     r2, r2, lsr #17\r
-    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;\r
+    eorne   r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe;\r
 \r
-    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels\r
+    ldr     r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels\r
 \r
     mvn     r7, #0\r
     mov     r4, r4, lsr #16-2  @ (dx&7)*4\r
     tst     r9, #0x0800\r
     moveq   r7, r7, lsl r4     @ mask = ~0 [shift] (dx&7)*4\r
     movne   r7, r7, lsr r4\r
-    mov     r7, r7, ror #16\r
-    and     r2, r2, r7         @ pack&mask\r
+    and     r2, r2, r7, ror #16 @ pack&mask\r
 \r
     bic     r10,r10, #7<<16\r
     b       .DrawStrip_samecode @ one last time, with last tile now masked\r
@@ -743,13 +738,12 @@ DrawLayer:
 \r
     mov     r9, r7          @ remember code\r
 \r
-    movs    r2, r9, lsl #20 @ if (code&0x1000)\r
-    mov     r2, r2, lsl #1\r
+    tst     r9, #0x1000     @ if (code&0x1000)\r
+    mov     r2, r9, lsl #21\r
     add     r2, r2, r10, lsl #17\r
-    mov     r2, r2, lsr #17\r
-    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;\r
+    eorne   r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe;\r
 \r
-    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels\r
+    ldr     r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels\r
 \r
 .DrawStrip_vs_samecode:\r
     tst     r9, #0x8000\r
@@ -1007,8 +1001,7 @@ DrawTilesFromCache:
     tst     r6, #0x0800     @ flipped?\r
     moveq   r12,r12, lsl r4\r
     movne   r12,r12, lsr r4\r
-    mov     r12,r12, ror #16\r
-    and     r2, r2, r12\r
+    and     r2, r2, r12, ror #16\r
     mov     r12,#0xf\r
     tst     r8, #1\r
     bne     .dtfc_shadow\r
@@ -1152,9 +1145,7 @@ DrawSpriteSHi:
     cmp     r0, #328\r
     bge     DrawSpriteSHi\r
 \r
-    mov     r8, r8, lsl #17\r
-    mov     r8, r8, lsr #17    @ tile&=0x7fff; // Clip tile address\r
-\r
+    bic     r8, r8, #0xf8000   @ tile&=~0xf8000; // Clip tile address: clears bits 15-19 only, same as tile&0x7fff while tile < 1<<20 (assumed - confirm)\r
     ldr     r2, [lr, r8, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels\r
     add     r1, r11, r0        @ r1=pdest\r
     tst     r2, r2\r
@@ -1346,9 +1337,7 @@ DrawSprite:
     cmp     r0, #328\r
     bge     DrawSprite\r
 \r
-    mov     r8, r8, lsl #17\r
-    mov     r8, r8, lsr #17    @ tile&=0x7fff; // Clip tile address\r
-\r
+    bic     r8, r8, #0xf8000   @ tile&=~0xf8000; // Clip tile address: clears bits 15-19 only, same as tile&0x7fff while tile < 1<<20 (assumed - confirm)\r
     ldr     r2, [lr, r8, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels\r
     add     r1, r11, r0        @ r1=pdest\r
     tst     r2, r2\r
@@ -1492,16 +1481,16 @@ DrawWindow:
 \r
     mov     r9, r7          @ remember code\r
 \r
-    movs    r2, r9, lsl #20 @ if (code&0x1000)\r
-    mov     r2, r2, lsl #1\r
-    add     r2, r10, r2, lsr #17 @ r2=addr=(code&0x7ff)<<4; addr+=ty\r
-    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;\r
+    tst     r9, #0x1000     @ if (code&0x1000)\r
+    mov     r2, r9, lsl #21\r
+    add     r2, r2, r10, lsl #17\r
+    eorne   r2, r2, #0xe<<17 @ if (code&0x1000) addr^=0xe;\r
+\r
+    ldr     r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels\r
 \r
     and     r3, r9, #0x6000\r
     mov     r3, r3, lsr #9  @ r3=pal=((code&0x6000)>>9);\r
 \r
-    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels\r
-\r
 .dw_samecode:\r
     tst     r6, #0x100\r
     bne     .dw_shadow\r