From: kub <derkub@gmail.com>
Date: Thu, 18 Jul 2024 19:36:43 +0000 (+0200)
Subject: core vdp, arm rendering speed optimisation
X-Git-Tag: v2.00~1
X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=11a1966bf349ab74035df4673293bdeab41b1a96;p=picodrive.git

core vdp, arm rendering speed optimisation
---

diff --git a/pico/draw.c b/pico/draw.c
index 0f9b137a..387f3a42 100644
--- a/pico/draw.c
+++ b/pico/draw.c
@@ -304,34 +304,34 @@ TileFlipMakerAS(TileFlipSH_AS_and, pix_sh_as_and)
 // --------------------------------------------
 
 #ifndef _ASM_DRAW_C
-#define DrawTile(mask) { \
-  if (code!=oldcode) { \
-    oldcode = code; \
- \
-    pack = 0; \
-    if (code != blank) { \
-      /* Get tile address/2: */\
-      u32 addr = ((code&0x7ff)<<4) + ty; \
-      if (code & 0x1000) addr ^= 0xe; /* Y-flip */ \
- \
-      pal = ((code>>9)&0x30) | sh; /* shadow */ \
- \
-      pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \
-      if (!pack) \
-        blank = code; \
-    } \
-  } \
- \
-  if (code & 0x8000) { /* (un-forced) high priority tile */ \
-    if (sh | (pack&mask)) { \
-      code |= (dx<<16) | (ty<<25); \
-      if (code & 0x1000) code ^= 0xe<<25; \
-      *hc++ = code, *hc++ = pack&mask; /* cache it */ \
-    } \
-  } else if (pack&mask) { \
-    if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \
-    else               TileNorm(pd + dx, pack&mask, pal); \
-  } \
+#define DrawTile(mask) {						\
+  if (code!=oldcode) {							\
+    oldcode = code;							\
+									\
+    pack = 0;								\
+    if (code != blank) {						\
+      /* Get tile address/2: */						\
+      u32 addr = ((code&0x7ff)<<4) + ty;				\
+      if (code & 0x1000) addr ^= 0xe; /* Y-flip */			\
+									\
+      pal = ((code>>9)&0x30) | sh; /* shadow */				\
+									\
+      pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr));			\
+      if (!pack)							\
+        blank = code;							\
+    }									\
+  }									\
+									\
+  if (code & 0x8000) { /* (un-forced) high priority tile */		\
+    if (sh | (pack&mask)) {						\
+      code |= (dx<<16) | (ty<<25);					\
+      if (code & 0x1000) code ^= 0xe<<25;				\
+      *hc++ = code, *hc++ = pack&mask; /* cache it */			\
+    }									\
+  } else if (pack&mask) {						\
+    if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal);		\
+    else               TileNorm(pd + dx, pack&mask, pal);		\
+  }									\
 }
 
 static void DrawStrip(struct TileStrip *ts, int lflags, int cellskip)
@@ -478,34 +478,34 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip)
 }
 #endif
 
-#define DrawTileInterlace(mask) { \
-  if (code!=oldcode) { \
-    oldcode = code; \
- \
-    pack = 0; \
-    if (code != blank) { \
-      /* Get tile address/2: */ \
-      u32 addr = ((code&0x3ff)<<5) + ty; \
-      if (code & 0x1000) addr ^= 0x1e; /* Y-flip */ \
- \
-      pal = ((code>>9)&0x30) | sh; /* shadow */ \
- \
-      pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \
-      if (!pack) \
-        blank = code; \
-    } \
-  } \
- \
-  if (code & 0x8000) { /* high priority tile */ \
-    if (sh | (pack&mask)) { \
-      code = (code&0xfc00) | ((code&0x3ff)<<1) | (dx<<16) | (ty<<25); \
-      if (code & 0x1000) code ^= 0x1e<<25; \
-      *hc++ = code, *hc++ = pack&mask; /* cache it */ \
-    } \
-  } else if (pack&mask) { \
-    if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \
-    else               TileNorm(pd + dx, pack&mask, pal); \
-  } \
+#define DrawTileInterlace(mask) {					\
+  if (code!=oldcode) {							\
+    oldcode = code;							\
+									\
+    pack = 0;								\
+    if (code != blank) {						\
+      /* Get tile address/2: */						\
+      u32 addr = ((code&0x3ff)<<5) + ty;				\
+      if (code & 0x1000) addr ^= 0x1e; /* Y-flip */			\
+									\
+      pal = ((code>>9)&0x30) | sh; /* shadow */				\
+									\
+      pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr));			\
+      if (!pack)							\
+        blank = code;							\
+    }									\
+  }									\
+									\
+  if (code & 0x8000) { /* high priority tile */				\
+    if (sh | (pack&mask)) {						\
+      code = (code&0xfc00) | ((code&0x3ff)<<1) | (dx<<16) | (ty<<25);	\
+      if (code & 0x1000) code ^= 0x1e<<25;				\
+      *hc++ = code, *hc++ = pack&mask; /* cache it */			\
+    }									\
+  } else if (pack&mask) {						\
+    if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal);		\
+    else               TileNorm(pd + dx, pack&mask, pal);		\
+  }									\
 }
 
 #ifndef _ASM_DRAW_C
diff --git a/pico/draw_arm.S b/pico/draw_arm.S
index 7a2e6f1d..40d5c443 100644
--- a/pico/draw_arm.S
+++ b/pico/draw_arm.S
@@ -433,7 +433,7 @@ DrawLayer:
     movs    r3, r9, lsl #1  @ (force[31]|sh[30]) << 1
     mov     r3, #0
     orrmi   r10,r10, #1<<23 @ r10=cells[31:24]|sh[23]|hi_not_empty[22]
-@    orrcc   r10,r10, #1<<20 @   |had_output[21]|!force[20]|hscroll[19:17]|ty[15:0]
+@    orrcc   r10,r10, #1<<20 @   |had_output[21]|!force[20]|hscroll[18:16]|ty[15:0]
     movmi   r3, #0x80       @ default to shadowed pal on sh mode
 
     and     r4, r7, #7
@@ -452,7 +452,7 @@ DrawLayer:
     mvn     r9, #0               @ r9=prevcode=-1
     add     r1, r11, r7          @ r1=pdest
 
-    @ r10=cells[31:24]|sh[23]|hi_not_empty[22]|had_output[21]|!force[20]|hscroll[19:17]|ty[15:0]
+    @ r10=cells[31:24]|sh[23]|hi_not_empty[22]|had_output[21]|!force[20]|hscroll[18:16]|ty[15:0]
     @ r1=pd+dx r2=pack r3=pal r5=xmask r6=hc r8=tilex r9=prevcode r11=HighCol r12=nametab lr=vram
     @ r4 & r7 are scratch in this loop
 
@@ -467,21 +467,19 @@ DrawLayer:
 
     add     r8, r8, #1
 
-    movs    r2, r9, lsl #20 @ if (code&0x1000)
-    mov     r2, r2, lsl #1
+    tst     r9, #0x1000     @ if (code&0x1000)
+    mov     r2, r9, lsl #21
     add     r2, r2, r10, lsl #17
-    mov     r2, r2, lsr #17
-    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;
+    eorne   r2, r2, #0xe<<17 @ if (code&0x1000) addr^=0xe;
 
-    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
+    ldr     r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
 
     mvn     r7, #0
     mov     r4, r4, lsr #16-2  @ (dx&7)*4
     tst     r9, #0x0800
     moveq   r7, r7, lsl r4     @ mask = ~0 [shift] (dx&7)*4
     movne   r7, r7, lsr r4
-    mvn     r7, r7, ror #16
-    and     r2, r2, r7         @ pack&mask
+    bic     r2, r2, r7, ror #16 @ pack&~mask
 
     orr     r9, r9, #0x80000000 @ invalidate oldcode since pack is masked
     b       .DrawStrip_samecode
@@ -504,13 +502,12 @@ DrawLayer:
 
     mov     r9, r7          @ remember code
 
-    movs    r2, r9, lsl #20 @ if (code&0x1000)
-    mov     r2, r2, lsl #1
+    tst     r9, #0x1000     @ if (code&0x1000)
+    mov     r2, r9, lsl #21
     add     r2, r2, r10, lsl #17
-    mov     r2, r2, lsr #17
-    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;
+    eorne   r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe;
 
-    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
+    ldr     r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
 .DrawStrip_samecode:
     tst     r9, #0x8000
 @    tstne   r10, #1<<20     @ !force[20]
@@ -577,21 +574,19 @@ DrawLayer:
 
     add     r1, r1, #8
 
-    movs    r2, r9, lsl #20 @ if (code&0x1000)
-    mov     r2, r2, lsl #1
+    tst     r9, #0x1000     @ if (code&0x1000)
+    mov     r2, r9, lsl #21
     add     r2, r2, r10, lsl #17
-    mov     r2, r2, lsr #17
-    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;
+    eorne   r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe;
 
-    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
+    ldr     r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
 
     mvn     r7, #0
     mov     r4, r4, lsr #16-2  @ (dx&7)*4
     tst     r9, #0x0800
     moveq   r7, r7, lsl r4     @ mask = ~0 [shift] (dx&7)*4
     movne   r7, r7, lsr r4
-    mov     r7, r7, ror #16
-    and     r2, r2, r7         @ pack&mask
+    and     r2, r2, r7, ror #16 @ pack&mask
 
     bic     r10,r10, #7<<16
     b       .DrawStrip_samecode @ one last time, with last tile now masked
@@ -743,13 +738,12 @@ DrawLayer:
 
     mov     r9, r7          @ remember code
 
-    movs    r2, r9, lsl #20 @ if (code&0x1000)
-    mov     r2, r2, lsl #1
+    tst     r9, #0x1000     @ if (code&0x1000)
+    mov     r2, r9, lsl #21
     add     r2, r2, r10, lsl #17
-    mov     r2, r2, lsr #17
-    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;
+    eorne   r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe;
 
-    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
+    ldr     r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
 
 .DrawStrip_vs_samecode:
     tst     r9, #0x8000
@@ -1007,8 +1001,7 @@ DrawTilesFromCache:
     tst     r6, #0x0800     @ flipped?
     moveq   r12,r12, lsl r4
     movne   r12,r12, lsr r4
-    mov     r12,r12, ror #16
-    and     r2, r2, r12
+    and     r2, r2, r12, ror #16
     mov     r12,#0xf
     tst     r8, #1
     bne     .dtfc_shadow
@@ -1152,9 +1145,7 @@ DrawSpriteSHi:
     cmp     r0, #328
     bge     DrawSpriteSHi
 
-    mov     r8, r8, lsl #17
-    mov     r8, r8, lsr #17    @ tile&=0x7fff; // Clip tile address
-
+    bic     r8, r8, #0xf8000   @ tile&=0x7fff; // Clip tile address (clears bits 15-19 only; NOTE(review): assumes bits 20+ are already 0 - confirm)
     ldr     r2, [lr, r8, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
     add     r1, r11, r0        @ r1=pdest
     tst     r2, r2
@@ -1346,9 +1337,7 @@ DrawSprite:
     cmp     r0, #328
     bge     DrawSprite
 
-    mov     r8, r8, lsl #17
-    mov     r8, r8, lsr #17    @ tile&=0x7fff; // Clip tile address
-
+    bic     r8, r8, #0xf8000   @ tile&=0x7fff; // Clip tile address (clears bits 15-19 only; NOTE(review): assumes bits 20+ are already 0 - confirm)
     ldr     r2, [lr, r8, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
     add     r1, r11, r0        @ r1=pdest
     tst     r2, r2
@@ -1492,16 +1481,16 @@ DrawWindow:
 
     mov     r9, r7          @ remember code
 
-    movs    r2, r9, lsl #20 @ if (code&0x1000)
-    mov     r2, r2, lsl #1
-    add     r2, r10, r2, lsr #17 @ r2=addr=(code&0x7ff)<<4; addr+=ty
-    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;
+    tst     r9, #0x1000     @ if (code&0x1000)
+    mov     r2, r9, lsl #21
+    add     r2, r2, r10, lsl #17
+    eorne   r2, r2, #0xe<<17 @ if (code&0x1000) addr^=0xe;
+
+    ldr     r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
 
     and     r3, r9, #0x6000
     mov     r3, r3, lsr #9  @ r3=pal=((code&0x6000)>>9);
 
-    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
-
 .dw_samecode:
     tst     r6, #0x100
     bne     .dw_shadow