From c3fcdf3f8da7eb3a1b320d18a46de1b28e258ea3 Mon Sep 17 00:00:00 2001
From: kub <derkub@gmail.com>
Date: Sun, 19 Dec 2021 14:40:16 +0100
Subject: [PATCH] 32x, more ARM asm drawing optimisations for dc mode

---
 pico/32x/draw_arm.S  | 51 ++++++++++++++++++++++----------------------
 platform/linux/emu.c |  6 +++---
 2 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/pico/32x/draw_arm.S b/pico/32x/draw_arm.S
index 4ac3e558..dce37192 100644
--- a/pico/32x/draw_arm.S
+++ b/pico/32x/draw_arm.S
@@ -84,8 +84,8 @@
     mov     r3, r3, lsl #26  @ mdbg << 26
     mla     r11,r4,r5,r11    @ r11 = pmd = PicoDraw2FB + offs*328: md data
     tst     r10,#P32XV_PRI
-    moveq   r10,#0
-    movne   r10,#0x8000      @ r10 = inv_bit
+    movne   r10,#0           @ PRI flag set: XOR mask 0, pixel bit 15 kept
+    moveq   r10,#0x8000      @ r10 = !inv_bit: XOR mask flipping bit 15
     call_scan_prep \call_scan lr
 
     mov     r4, #0           @ line
@@ -120,49 +120,48 @@
     beq     5f @ check_fill
 
 3: @ no_fill:
+    ldrb    r12,[r11], #1    @ MD pixel 0
+
     eor     r7, r7, r10
-    and     r12,r7, #0x03e0  @ convert BGR555 -> RGB565
+    and     lr, r7, #0x03e0  @ convert BGR555 -> RGB565
     mov     r7, r7, ror #5
     orr     r7, r7, r7, ror #10+11
-    orr     r7, r7, r12,lsl #1+16
+    orr     r7, r7, lr, lsl #1+16
     eor     r8, r8, r10
-    and     r12,r8, #0x03e0
+    and     lr, r8, #0x03e0
     mov     r8, r8, ror #5
     orr     r8, r8, r8, ror #10+11
-    orr     r8, r8, r12,lsl #1+16
+    orr     r8, r8, lr, lsl #1+16
 
-    ldrb    r12,[r11], #1    @ MD pixel 0
     ldrb    lr, [r11], #1    @ MD pixel 1
-    lsr     r7, #16
-    lsr     r8, #16
 
 .if \do_md
     cmp     r3, r12, lsl #26
-    movne   r12,r12, lsl #1  @ load MD color if not bg
+    tstne   r7, #0x20<<16    @ not bg: eq if 32X prio (bit clear after eor)
+    movne   r12,r12, lsl #1  @ load MD color if no 32X prio and not bg
     ldrneh  r12,[r9, r12]
-    orreq   r7, r7, #0x20    @ accumulate MD bg info into prio bit
+    moveq   r12,r7, lsr #16  @ else (MD bg or 32X prio) use 32X color from upper half
+
     cmp     r3, lr,  lsl #26
-    movne   lr, lr,  lsl #1
+    tstne   r8, #0x20<<16    @ not bg: eq if 32X prio (bit clear after eor)
+    movne   lr, lr,  lsl #1  @ load MD color if no 32X prio and not bg
     ldrneh  lr, [r9, lr]
-    orreq   r8, r8, #0x20
+    moveq   lr, r8, lsr #16  @ else (MD bg or 32X prio) use 32X color from upper half
 
-    tst     r7, #0x20        @ replace 32X with MD color if no prio and not bg
-    moveq   r7, r12
-    tst     r8, #0x20
-    moveq   r8, lr
-    orr     r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth
-    str     r7, [r0], #4     @ (no write combining on ARM9)
+    orr     r12,r12, lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth
+    str     r12,[r0], #4     @ (no write combining on ARM9)
 .else
     cmp     r3, r12, lsl #26 @ replace MD bg info into prio bit
-    orreq   r7, r7, #0x20
+    tstne   r7, #0x20<<16    @ not bg: eq if 32X prio (bit clear after eor)
+    moveq   r7, r7, lsr #16  @ MD bg or 32X prio: 32X color to low half
+    streqh  r7, [r0, #0]     @ ... and store it at pixel 0 of this pair
+
     cmp     r3, lr,  lsl #26
-    orreq   r8, r8, #0x20
+    tstne   r8, #0x20<<16    @ not bg: eq if 32X prio (bit clear after eor)
+    moveq   r8, r8, lsr #16  @ MD bg or 32X prio: 32X color to low half
+    streqh  r8, [r0, #2]     @ ... and store it at pixel 1 of this pair
 
     add     r0, r0, #4       @ store 32x pixels if 32X prio or MD bg
-    tst     r7, #0x20
-    strneh  r7, [r0, #-4]
-    tst     r8, #0x20
-    strneh  r8, [r0, #-2]
 .endif
     b       2b @ loop_inner
 
@@ -205,7 +204,7 @@
     lsr     r7, #16
 
     tst     r7, #0x20        @ check for prio transfer
-    beq     9f @ bg_loop
+    bne     9f @ bg_loop: prio bit set, presumably no 32X prio here - confirm
 
     add     r11,r11,r8       @ consume md pixels (not used)
     orr     r12,r7, r7, lsl #16
diff --git a/platform/linux/emu.c b/platform/linux/emu.c
index 053a2063..0d90ae52 100644
--- a/platform/linux/emu.c
+++ b/platform/linux/emu.c
@@ -207,12 +207,12 @@ static void apply_renderer(void)
 {
 	PicoIn.opt &= ~(POPT_ALT_RENDERER|POPT_EN_SOFTSCALE|POPT_DIS_32C_BORDER);
 	if (is_16bit_mode()) {
-		if (currentConfig.scaling == EOPT_SCALE_SW) {
+		if (currentConfig.scaling == EOPT_SCALE_SW)
 			PicoIn.opt |= POPT_EN_SOFTSCALE;
-			PicoIn.filter = currentConfig.filter;
-		} else if (currentConfig.scaling == EOPT_SCALE_HW)
+		else if (currentConfig.scaling == EOPT_SCALE_HW)
 			// hw scaling, render without any padding
 			PicoIn.opt |= POPT_DIS_32C_BORDER;
+		PicoIn.filter = currentConfig.filter; // set in every 16 bit mode, whatever the scaling
 	} else
 		PicoIn.opt |= POPT_DIS_32C_BORDER;
 
-- 
2.47.3