From: kub Date: Thu, 25 Apr 2019 17:02:29 +0000 (+0200) Subject: speed improvement and fixes for 32x ARM asm draw X-Git-Tag: v2.00~862 X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=08626dab12e62ba1caf018a739c44073029606b4;p=picodrive.git speed improvement and fixes for 32x ARM asm draw --- diff --git a/pico/32x/draw.c b/pico/32x/draw.c index 4bdbc89a..372f27ef 100644 --- a/pico/32x/draw.c +++ b/pico/32x/draw.c @@ -311,11 +311,6 @@ void PicoDraw32xLayerMdOnly(int offs, int lines) void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode) { -#ifdef _ASM_32X_DRAW - extern void *Pico32xNativePal; - Pico32xNativePal = Pico32xMem->pal_native; -#endif - if (which == PDF_RGB555) { // need CLUT pixels in PicoDraw2FB for layer transparency PicoDrawSetInternalBuf(Pico.est.Draw2FB, 328); diff --git a/pico/32x/draw_arm.S b/pico/32x/draw_arm.S index c59fa8f5..e0cdcbe5 100644 --- a/pico/32x/draw_arm.S +++ b/pico/32x/draw_arm.S @@ -13,12 +13,6 @@ .equiv P32XV_PRI, (1<< 7) -.bss -.align 2 -.global Pico32xNativePal -Pico32xNativePal: - .word 0 - .text .align 2 @@ -82,8 +76,8 @@ Pico32xNativePal: mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data tst r10,#P32XV_PRI - moveq r10,#0 - movne r10,#0x8000 @ r10 = inv_bit + movne r10,#0 + moveq r10,#0x8000 @ r10 = inv_bit call_scan_prep \call_scan lr mov r4, #0 @ line @@ -92,7 +86,6 @@ Pico32xNativePal: 0: @ loop_outer: call_scan_end \call_scan add r4, r4, #1 - sub r11,r11,#1 @ adjust for prev read cmp r4, r2, lsr #16 call_scan_fin_ge \call_scan ldmgefd sp!, {r4-r11,pc} @@ -106,31 +99,86 @@ Pico32xNativePal: add r5, r1, r12, lsl #1 @ p32x = dram + dram[l] 2: @ loop_inner: - ldrb r7, [r11], #1 @ MD pixel - subs r6, r6, #1 + ldrh r8, [r5], #2 + subs lr, r6, #1 blt 0b @ loop_outer - ldrh r8, [r5], #2 @ 32x pixel - cmp r3, r7, lsl #26 @ MD has bg pixel? - beq 3f @ draw32x + +3: @ loop_innermost: + ldrh r7, [r5], #2 @ 32x pixel + subs lr, lr, #1 + cmpge r7, r8 + beq 3b @ loop_innermost + + sub r5, r5, #2 + add lr, lr, #1 + sub lr, r6, lr + sub r6, r6, lr + eor r12,r8, r10 - ands r12,r12,#0x8000 @ !((t ^ inv) & 0x8000) + tst r12, #0x8000 @ !((t ^ inv) & 0x8000) + bne 5f @ draw_md + + and r7 ,r8, #0x03e0 + mov r8, r8, lsl #11 + orr r8, r8, r8, lsr #(10+11) + orr r8, r8, r7 ,lsl #1 + bic r8, r8, #0x0020 @ kill prio bit + + add r11,r11,lr + tst r0, #2 @ dst unaligned? + strneh r8, [r0], #2 + subne lr, lr, #1 + cmp lr, #0 + beq 2b @ loop_inner + mov r8, r8, lsl #16 + orr r12,r8, r8, lsr #16 + mov r8 ,r12 +4: @ draw_32x: + subs lr, lr, #4 @ store 4 pixels + stmgeia r0!, {r8, r12} + bgt 4b @ draw_32x + beq 2b @ loop_inner + adds lr, lr, #2 @ store 1-3 leftover pixels + strge r8, [r0], #4 + strneh r8, [r0], #2 + b 2b @ loop_inner + +5: @ draw_md: + subs lr, lr, #1 + ldrgeb r7, [r11], #1 @ MD pixel + blt 2b @ loop_inner + cmp r3, r7, lsl #26 @ MD has bg pixel? .if \do_md mov r7, r7, lsl #1 - ldreqh r12,[r9, r7] - streqh r12,[r0], #2 @ *dst++ = palmd[*pmd] + ldrneh r7 ,[r9, r7] + strneh r7 ,[r0], #2 @ *dst++ = palmd[*pmd] .else - addeq r0, r0, #2 + addne r0, r0, #2 .endif - beq 2b @ loop_inner + bne 5b @ draw_md -3: @ draw32x: - and r12,r8, #0x03e0 + and r7 ,r8, #0x03e0 mov r8, r8, lsl #11 orr r8, r8, r8, lsr #(10+11) - orr r8, r8, r12,lsl #1 + orr r8, r8, r7 ,lsl #1 bic r8, r8, #0x0020 @ kill prio bit strh r8, [r0], #2 @ *dst++ = bgr2rgb(*p32x++) - b 2b @ loop_inner + +6: @ draw_md_32x: + subs lr, lr, #1 + ldrgeb r7, [r11], #1 @ MD pixel + blt 2b @ loop_inner + cmp r3, r7, lsl #26 @ MD has bg pixel? +.if \do_md + mov r7, r7, lsl #1 + ldrneh r7 ,[r9, r7] @ *dst++ = palmd[*pmd] + moveq r7 ,r8 @ *dst++ = bgr2rgb(*p32x++) + strh r7 ,[r0], #2 +.else + streqh r8, [r0] @ *dst++ = bgr2rgb(*p32x++) + add r0, r0, #2 +.endif + b 6b @ draw_md_32x .endm @@ -144,9 +192,11 @@ Pico32xNativePal: stmfd sp!, {r4-r11,lr} ldr lr,=Pico - ldr r10,=Pico32xNativePal + ldr r10,=Pico32xMem + ldr r9,=OFS_PMEM32x_pal_native + ldr r10, [r10] ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] - ldr r10,[r10] + add r10,r10,r9 add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd and r4, r2, #0xff @@ -184,7 +234,7 @@ Pico32xNativePal: ldrneb r8, [r5, #2]! @ r7,r8 - pixel 0,1 index subs r6, r6, #1 blt 0b @ loop_outer - cmp r7, r8 @ is this really improving things? + cmp r7, r8 beq 5f @ check_fill @ +8 3: @ no_fill: @@ -204,11 +254,11 @@ Pico32xNativePal: ldrneh r7, [r9, r12] @ t = palmd[pmd[0]] tst lr, #0x20 ldrneb lr, [r11,#-1] @ MD pixel 1 - strh r7, [r0], #2 cmpne r3, lr, lsl #26 @ MD has bg pixel? mov lr, lr, lsl #1 ldrneh r8, [r9, lr] @ t = palmd[pmd[1]] - strh r8, [r0], #2 + orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth + str r7, [r0], #4 @ (no write combining on ARM9) .else streqh r7, [r0] tst lr, #0x20 @@ -219,18 +269,21 @@ Pico32xNativePal: .endif b 2b @ loop_inner -5: @ check_fill +5: @ check_fill: @ count pixels, align if needed bic r12,r5, #1 + ldrh lr ,[r12, #2] @ only do this for at least 4 pixels ldrh r12,[r12] + orr r12,lr,r12, lsl #16 orr lr, r7, r7, lsl #8 + orr lr, lr, lr, lsl #16 cmp r12,lr bne 3b @ no_fill tst r5, #1 sub lr, r5, #2 @ starting r5 (32x render data start) - addeq r5, r5, #2 - addne r5, r5, #1 @ add for the check above + addeq r5, r5, #4 + addne r5, r5, #3 @ add for the check above add r6, r6, #1 @ restore from dec orr r7, r7, r7, lsl #8 6: @@ -240,11 +293,12 @@ Pico32xNativePal: ldrh r12,[r5], #2 bge 7f @ count_done cmp r8, r7 + subne r5, r5, #2 @ undo readahead cmpeq r12,r7 beq 6b -7: @ count_done - sub r5, r5, #4 @ undo readahead +7: @ count_done: + sub r5, r5, #2 @ undo readahead @ fix alignment and check type sub r8, r5, lr @@ -262,11 +316,15 @@ Pico32xNativePal: beq 9f @ bg_mode add r11,r11,r8 -8: - subs r8, r8, #2 - strgeh r7, [r0], #2 - strgeh r7, [r0], #2 - bgt 8b + orr r12,r7, r7, lsl #16 + mov r7 ,r12 +8: @ 32x_loop: + subs r8, r8, #4 @ store 4 pixels + stmgeia r0!, {r7, r12} + bgt 8b @ 32x_loop + beq 2b @ loop_inner + adds r8, r8, #2 + strge r7, [r0], #4 @ store 2 leftover pixels b 2b @ loop_inner 9: @ bg_mode: @@ -281,8 +339,8 @@ Pico32xNativePal: mov lr, lr, lsl #1 ldrneh lr, [r9, lr] moveq lr, r7 - strh r12,[r0], #2 - strh lr, [r0], #2 + orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth + str r12,[r0], #4 @ (no write combining on ARM9) .else streqh r7, [r0] cmp r3, lr, lsl #26 @ MD pixel 1 has bg? @@ -303,9 +361,11 @@ Pico32xNativePal: stmfd sp!, {r4-r11,lr} ldr lr,=Pico - ldr r10,=Pico32xNativePal + ldr r10,=Pico32xMem + ldr r9,=OFS_PMEM32x_pal_native + ldr r10, [r10] ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] - ldr r10,[r10] + add r10,r10,r9 add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd and r4, r2, #0xff @@ -320,7 +380,6 @@ Pico32xNativePal: 0: @ loop_outer: call_scan_end \call_scan add r4, r4, #1 - sub r11,r11,#1 @ adjust for prev read cmp r4, r2, lsr #16 call_scan_fin_ge \call_scan ldmgefd sp!, {r4-r11,pc} @@ -341,13 +400,13 @@ Pico32xNativePal: eor lr, lr, #0x20 3: @ loop_innermost: - ldrb r7, [r11], #1 @ MD pixel subs r6, r6, #1 + ldrgeb r7, [r11], #1 @ MD pixel blt 0b @ loop_outer - cmp r3, r7, lsl #26 @ MD has bg pixel? - mov r7, r7, lsl #1 - tstne lr, #0x20 + tst lr, #0x20 + cmpne r3, r7, lsl #26 @ MD has bg pixel? .if \do_md + mov r7, r7, lsl #1 ldrneh r12,[r9, r7] @ t = palmd[*pmd] streqh lr, [r0], #2 strneh r12,[r0], #2 @ *dst++ = t @@ -365,15 +424,18 @@ make_do_loop_dc do_loop_dc, 0, 0 make_do_loop_dc do_loop_dc_md, 0, 1 make_do_loop_dc do_loop_dc_scan, 1, 0 make_do_loop_dc do_loop_dc_scan_md, 1, 1 +.pool make_do_loop_pp do_loop_pp, 0, 0 make_do_loop_pp do_loop_pp_md, 0, 1 make_do_loop_pp do_loop_pp_scan, 1, 0 make_do_loop_pp do_loop_pp_scan_md, 1, 1 +.pool make_do_loop_rl do_loop_rl, 0, 0 make_do_loop_rl do_loop_rl_md, 0, 1 make_do_loop_rl do_loop_rl_scan, 1, 0 make_do_loop_rl do_loop_rl_scan_md, 1, 1 +.pool @ vim:filetype=armasm diff --git a/pico/draw.c b/pico/draw.c index e345a28d..4834d6bf 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -1364,8 +1364,8 @@ static void FinalizeLine8bit(int sh, int line, struct PicoEState *est) { // a hack for mid-frame palette changes if (!(est->rendstatus & PDRAW_SONIC_MODE) || line - dirty_line > 4) { - // store a maximum of 3 additional palettes in SonicPal - if (est->SonicPalCount < 3) + // store a maximum of 2 additional palettes in SonicPal + if (est->SonicPalCount < 2) est->SonicPalCount ++; dirty_line = line; est->rendstatus |= PDRAW_SONIC_MODE; diff --git a/platform/gp2x/emu.c b/platform/gp2x/emu.c index 450ac080..4ad90b83 100644 --- a/platform/gp2x/emu.c +++ b/platform/gp2x/emu.c @@ -328,7 +328,7 @@ static int make_local_pal_md(int fast_mode) localPal[0xe0] = 0x00000000; // reserved pixels for OSD localPal[0xf0] = 0x00ffffff; - if (Pico.m.dirtyPal == 2) + if (Pico.m.dirtyPal == 2) Pico.m.dirtyPal = 0; return pallen; } diff --git a/tools/mkoffsets.sh b/tools/mkoffsets.sh index 6d68a1bc..461fbfa7 100755 --- a/tools/mkoffsets.sh +++ b/tools/mkoffsets.sh @@ -84,6 +84,7 @@ get_define OFS_EST_ PicoEState HighPal ; echo "$line" >>$fn get_define OFS_PMEM_ PicoMem vram ; echo "$line" >>$fn get_define OFS_PMEM_ PicoMem vsram ; echo "$line" >>$fn +get_define OFS_PMEM32x_ Pico32xMem pal_native ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ is_slave ; echo "$line" >>$fn get_define OFS_SH2_ SH2_ p_bios ; echo "$line" >>$fn