From 2a29ca852be122c9058c319a02560b4b31c037b3 Mon Sep 17 00:00:00 2001 From: kub Date: Sat, 18 Dec 2021 19:19:37 +0100 Subject: [PATCH] 32x, ARM asm drawing fixes and optimizations for dc,pp modes --- pico/32x/draw_arm.S | 336 +++++++++++++++++++++++--------------------- 1 file changed, 179 insertions(+), 157 deletions(-) diff --git a/pico/32x/draw_arm.S b/pico/32x/draw_arm.S index 5c19329f..4ac3e558 100644 --- a/pico/32x/draw_arm.S +++ b/pico/32x/draw_arm.S @@ -76,7 +76,6 @@ PIC_LDR(lr, r9, Pico) PIC_LDR(r10,r9, Pico32x) ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] - ldr r12, [lr, #OFS_Pico_est+OFS_EST_DrawLineDestIncr] ldrh r10,[r10, #0x40] @ Pico32x.vdp_regs[0] add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd @@ -85,8 +84,8 @@ mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data tst r10,#P32XV_PRI - movne r10,#0 - moveq r10,#0x8000 @ r10 = inv_bit + moveq r10,#0 + movne r10,#0x8000 @ r10 = inv_bit call_scan_prep \call_scan lr mov r4, #0 @ line @@ -107,125 +106,143 @@ mov r12,r4, lsl #1 ldrh r12,[r1, r12] add r11,r11,#8 - mov r6, #320 + mov r6, #320/2 add r5, r1, r12, lsl #1 @ p32x = dram + dram[l] - ldrh r7, [r5], #2 2: @ loop_inner: - mov r8, r7 - subs lr, r6, #1 +@ r4,r6 - counters; r5 - 32x data; r9 - md pal; r10 - inv_prio; r11 - md data +@ r7,r8,r12,lr - temp + ldrh r7, [r5], #2 + ldrh r8, [r5], #2 + subs r6, r6, #1 blt 0b @ loop_outer - beq 7f @ single_pix - ldrh r7, [r5], #2 @ 32x pixel - cmp r7, r8 @ do RLE only if we have at least 2 px -@ ldreqh r7, [r5] -@ cmpeq r7, r8 - subeq lr, lr, #1 - beq 3f @ loop_innermost - -7: @ single_pix: - mov r6, lr - - eor r12,r8, r10 - tst r12, #0x8000 @ !((t ^ inv) & 0x8000) - addeq r11,r11,#1 - beq 8f @ single_pix_32x - - ldrb r12,[r11], #1 @ MD pixel - cmp r3, r12,lsl #26 @ MD has bg pixel? 
+ cmp r7, r8 + beq 5f @ check_fill + +3: @ no_fill: + eor r7, r7, r10 + and r12,r7, #0x03e0 @ convert BGR555 -> RGB565 + mov r7, r7, ror #5 + orr r7, r7, r7, ror #10+11 + orr r7, r7, r12,lsl #1+16 + eor r8, r8, r10 + and r12,r8, #0x03e0 + mov r8, r8, ror #5 + orr r8, r8, r8, ror #10+11 + orr r8, r8, r12,lsl #1+16 + + ldrb r12,[r11], #1 @ MD pixel 0 + ldrb lr, [r11], #1 @ MD pixel 1 + lsr r7, #16 + lsr r8, #16 + .if \do_md - movne r12,r12,lsl #1 + cmp r3, r12, lsl #26 + movne r12,r12, lsl #1 @ load MD color if not bg ldrneh r12,[r9, r12] - strneh r12,[r0], #2 @ *dst++ = palmd[*pmd] + orreq r7, r7, #0x20 @ accumulate MD bg info into prio bit + cmp r3, lr, lsl #26 + movne lr, lr, lsl #1 + ldrneh lr, [r9, lr] + orreq r8, r8, #0x20 + + tst r7, #0x20 @ replace 32X with MD color if no prio and not bg + moveq r7, r12 + tst r8, #0x20 + moveq r8, lr + orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth + str r7, [r0], #4 @ (no write combining on ARM9) .else - addne r0, r0, #2 -.endif - bne 2b @ loop_inner + cmp r3, r12, lsl #26 @ replace MD bg info into prio bit + orreq r7, r7, #0x20 + cmp r3, lr, lsl #26 + orreq r8, r8, #0x20 -8: @ single_pix_32x: - and r12,r8, #0x03e0 - mov r8, r8, lsl #11 - orr r8, r8, r8, lsr #(10+11) - orr r8, r8, r12,lsl #1 - bic r8, r8, #0x0020 @ kill prio bit - strh r8, [r0], #2 + add r0, r0, #4 @ store 32x pixels if 32X prio or MD bg + tst r7, #0x20 + strneh r7, [r0, #-4] + tst r8, #0x20 + strneh r8, [r0, #-2] +.endif b 2b @ loop_inner -3: @ loop_innermost: - ldrh r7, [r5], #2 @ 32x pixel - subs lr, lr, #1 - cmpge r7, r8 - beq 3b @ loop_innermost +5: @ check_fill: + @ count pixels, align if needed + ldrh r12,[r5, #0] @ only do this for at least 4 pixels + ldrh lr ,[r5, #2] + cmp r12,r7 + cmpeq lr ,r7 + bne 3b @ no_fill + add r5, r5, #4 @ adjust for the check above + + sub lr, r5, #4+4 @ starting r5 (32x render data start) + add r6, r6, #1 @ restore from dec +6: @ count_loop: + sub r12,r5, lr @ loop checks 2 pixels + ldrh r8, 
[r5], #2 + cmp r12,r6, lsl #2 + ldrh r12,[r5], #2 + bge 7f @ count_done + cmp r8, r7 + cmpeq r12,r7 + beq 6b - add lr, lr, #1 - sub lr, r6, lr - sub r6, r6, lr +7: @ count_done: + sub r5, r5, #4 @ undo readahead - eor r12,r8, r10 - tst r12, #0x8000 @ !((t ^ inv) & 0x8000) - bne 5f @ draw_md + sub r8, r5, lr @ pixel count + mov r8, r8, lsr #1 - and r12,r8, #0x03e0 - mov r8, r8, lsl #11 - orr r8, r8, r8, lsr #(10+11) - orr r8, r8, r12,lsl #1 - bic r8, r8, #0x0020 @ kill prio bit - - add r11,r11,lr - tst r0, #2 @ dst unaligned? - strneh r8, [r0], #2 - subne lr, lr, #1 - cmp lr, #0 - beq 2b @ loop_inner - mov r8, r8, lsl #16 - orr r12,r8, r8, lsr #16 - mov r8 ,r12 -4: @ draw_32x: - subs lr, lr, #4 @ store 4 pixels - stmgeia r0!, {r8, r12} - bgt 4b @ draw_32x + cmp r8, r6, lsl #1 @ limit count to line length + movgt r8, r6, lsl #1 + sub r6, r6, r8, lsr #1 @ consume pixels + + eor r7, r7, r10 + and r12,r7, #0x03e0 @ convert BGR555 -> RGB565 + mov r7, r7, ror #5 + orr r7, r7, r7, ror #10+11 + orr r7, r7, r12,lsl #1+16 + lsr r7, #16 + + tst r7, #0x20 @ check for prio transfer + beq 9f @ bg_loop + + add r11,r11,r8 @ consume md pixels (not used) + orr r12,r7, r7, lsl #16 + mov r7 ,r12 +8: @ 32x_loop: + subs r8, r8, #4 @ store 4 pixels + stmgeia r0!, {r7, r12} + bgt 8b @ 32x_loop beq 2b @ loop_inner - adds lr, lr, #2 @ store 1-3 leftover pixels - strge r8, [r0], #4 - strneh r8, [r0], #2 + adds r8, r8, #2 + strge r7, [r0], #4 @ store 2 leftover pixels b 2b @ loop_inner -5: @ draw_md: - subs lr, lr, #1 - ldrgeb r12,[r11], #1 @ MD pixel - blt 2b @ loop_inner - cmp r3, r12,lsl #26 @ MD has bg pixel? 
-.if \do_md - mov r12,r12,lsl #1 - ldrneh r12,[r9, r12] - strneh r12,[r0], #2 @ *dst++ = palmd[*pmd] -.else - addne r0, r0, #2 -.endif - bne 5b @ draw_md - - and r12,r8, #0x03e0 - mov r8, r8, lsl #11 - orr r8, r8, r8, lsr #(10+11) - orr r8, r8, r12,lsl #1 - bic r8, r8, #0x0020 @ kill prio bit - strh r8, [r0], #2 @ *dst++ = bgr2rgb(*p32x++) - -6: @ draw_md_32x: - subs lr, lr, #1 - ldrgeb r12,[r11], #1 @ MD pixel - blt 2b @ loop_inner - cmp r3, r12,lsl #26 @ MD has bg pixel? +9: @ bg_loop: + ldrb r12,[r11],#1 @ MD pixel 0,1 + ldrb lr, [r11],#1 .if \do_md + cmp r3, r12,lsl #26 @ MD pixel 0 has bg? mov r12,r12,lsl #1 - ldrneh r12,[r9, r12] @ *dst++ = palmd[*pmd] - moveq r12,r8 @ *dst++ = bgr2rgb(*p32x++) - strh r12,[r0], #2 + ldrneh r12,[r9, r12] @ t = palmd[*pmd] + moveq r12,r7 + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? + mov lr, lr, lsl #1 + ldrneh lr, [r9, lr] + moveq lr, r7 + orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth + str r12,[r0], #4 @ (no write combining on ARM9) .else - streqh r8, [r0] @ *dst++ = bgr2rgb(*p32x++) - add r0, r0, #2 + add r0, r0, #4 + cmp r3, r12,lsl #26 @ MD pixel 0 has bg? + streqh r7, [r0, #-4] + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? + streqh r7, [r0, #-2] .endif - b 6b @ draw_md_32x + subs r8, r8, #2 + bgt 9b @ bg_loop + b 2b @ loop_inner .endm @@ -281,91 +298,95 @@ tst r5, #1 ldreqb r8, [r5], #2 ldrb r7, [r5, #-1] - ldrneb r8, [r5, #2]! @ r7,r8 - pixel 0,1 index + ldrneb r8, [r5, #2]! @ r7,r8 - 32X pixel 0,1 subs r6, r6, #1 blt 0b @ loop_outer cmp r7, r8 - beq 5f @ check_fill @ +8 + beq 5f @ check_fill 3: @ no_fill: - mov r12,r7, lsl #1 - mov lr, r8, lsl #1 - ldrh r7, [r10,r12] - ldrh r8, [r10,lr] - add r11,r11,#2 - - eor r12,r7, #0x20 - tst r12,#0x20 - ldrneb r12,[r11,#-2] @ MD pixel 0 - eor lr, r8, #0x20 - cmpne r3, r12, lsl #26 @ MD has bg pixel? 
+ ldrb r12,[r11], #1 @ MD pixel 0 + ldrb lr, [r11], #1 @ MD pixel 1 + + mov r7, r7, lsl #1 + mov r8, r8, lsl #1 + ldrh r7, [r10,r7] @ 32X color 0 + ldrh r8, [r10,r8] @ 32X color 1 + .if \do_md - mov r12,r12,lsl #1 - ldrneh r7, [r9, r12] @ t = palmd[pmd[0]] - tst lr, #0x20 - ldrneb lr, [r11,#-1] @ MD pixel 1 - cmpne r3, lr, lsl #26 @ MD has bg pixel? - mov lr, lr, lsl #1 - ldrneh r8, [r9, lr] @ t = palmd[pmd[1]] + cmp r3, r12, lsl #26 + movne r12,r12, lsl #1 @ load MD color if not bg + ldrneh r12,[r9, r12] + orreq r7, r7, #0x20 @ accumulate MD bg info into prio bit + cmp r3, lr, lsl #26 + movne lr, lr, lsl #1 + ldrneh lr, [r9, lr] + orreq r8, r8, #0x20 + + tst r7, #0x20 @ replace 32X with MD color if no prio and not bg + moveq r7, r12 + tst r8, #0x20 + moveq r8, lr orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth str r7, [r0], #4 @ (no write combining on ARM9) .else - streqh r7, [r0] - tst lr, #0x20 - ldrneb lr, [r11,#-1] @ MD pixel 1 - add r0, r0, #4 - cmpne r3, lr, lsl #26 @ MD has bg pixel? 
- streqh r8, [r0, #-2] + cmp r3, r12, lsl #26 @ replace MD bg info into prio bit + orreq r7, r7, #0x20 + cmp r3, lr, lsl #26 + orreq r8, r8, #0x20 + + add r0, r0, #4 @ store 32x pixels if 32X prio or MD bg + tst r7, #0x20 + strneh r7, [r0, #-4] + tst r8, #0x20 + strneh r8, [r0, #-2] .endif b 2b @ loop_inner 5: @ check_fill: @ count pixels, align if needed bic r12,r5, #1 - ldrh lr ,[r12, #2] @ only do this for at least 4 pixels - ldrh r12,[r12] - orr r12,lr,r12, lsl #16 + ldrh r12,[r12, #0] @ only do this for at least 4 pixels orr lr, r7, r7, lsl #8 - orr lr, lr, lr, lsl #16 cmp r12,lr bne 3b @ no_fill + add r5, r5, #2 @ adjust for the check above - tst r5, #1 - sub lr, r5, #2 @ starting r5 (32x render data start) - addeq r5, r5, #4 - addne r5, r5, #3 @ add for the check above + sub lr, r5, #4 @ starting r5 (32x render data start) + bic r5, r5, #1 add r6, r6, #1 @ restore from dec orr r7, r7, r7, lsl #8 -6: - sub r12,r5, lr +6: @ count_loop: + sub r12,r5, lr @ loop checks 4 pixels ldrh r8, [r5], #2 cmp r12,r6, lsl #1 ldrh r12,[r5], #2 bge 7f @ count_done cmp r8, r7 - subne r5, r5, #2 @ undo readahead cmpeq r12,r7 beq 6b + cmp r8, r7 + addeq r5, r5, #2 @ adjust if 2 pixels were ok 7: @ count_done: - sub r5, r5, #2 @ undo readahead + sub r5, r5, #4 @ undo readahead - @ fix alignment and check type - sub r8, r5, lr - tst r8, #1 + tst lr, #1 @ fix alignment and calculate count subne r5, r5, #1 - subne r8, r8, #1 + sub r8, r5, lr - and r7, r7, #0xff - cmp r8, r6, lsl #1 + and r7, r7, #0xff @ 32x pixel color mov r7, r7, lsl #1 - movgt r8, r6, lsl #1 @ r8=count ldrh r7, [r10,r7] - sub r6, r6, r8, lsr #1 @ adjust counter - tst r7, #0x20 - beq 9f @ bg_mode - add r11,r11,r8 + cmp r8, r6, lsl #1 @ limit count to line length + movgt r8, r6, lsl #1 + sub r6, r6, r8, lsr #1 @ consume pixels + + tst r7, #0x20 @ check for prio transfer + beq 9f @ bg_loop + + add r11,r11,r8 @ consume md pixels (not used) orr r12,r7, r7, lsl #16 mov r7 ,r12 8: @ 32x_loop: @@ -377,11 +398,11 @@ strge 
r7, [r0], #4 @ store 2 leftover pixels b 2b @ loop_inner -9: @ bg_mode: +9: @ bg_loop: ldrb r12,[r11],#1 @ MD pixel 0,1 ldrb lr, [r11],#1 - cmp r3, r12,lsl #26 @ MD pixel 0 has bg? .if \do_md + cmp r3, r12,lsl #26 @ MD pixel 0 has bg? mov r12,r12,lsl #1 ldrneh r12,[r9, r12] @ t = palmd[*pmd] moveq r12,r7 @@ -392,13 +413,14 @@ orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth str r12,[r0], #4 @ (no write combining on ARM9) .else - streqh r7, [r0] - cmp r3, lr, lsl #26 @ MD pixel 1 has bg? - streqh r7, [r0, #2] add r0, r0, #4 + cmp r3, r12,lsl #26 @ MD pixel 0 has bg? + streqh r7, [r0, #-4] + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? + streqh r7, [r0, #-2] .endif subs r8, r8, #2 - bgt 9b @ bg_mode + bgt 9b @ bg_loop b 2b @ loop_inner .endm -- 2.39.5