From e44277d0f209e2369ac521a39f03f6397cd8e110 Mon Sep 17 00:00:00 2001 From: kub Date: Mon, 17 Mar 2025 01:00:21 +0100 Subject: [PATCH] 32x, fix using md h32 mode (4 px offset) --- pico/32x/draw.c | 47 +++++++++++++++++---- pico/32x/draw_arm.S | 101 +++++++++++++++++++++++++++++++++++++------- pico/draw.c | 6 +-- 3 files changed, 127 insertions(+), 27 deletions(-) diff --git a/pico/32x/draw.c b/pico/32x/draw.c index 6b32fe57..4426f3a5 100644 --- a/pico/32x/draw.c +++ b/pico/32x/draw.c @@ -15,6 +15,8 @@ // slot (signalling the display of background color) is processed in this case // is however unclear and might lead to glitches due to race conditions by the // different video clocks for H32 and H40. +// NB: there is an offset of 4 pixels between MD and 32X layers in H32 mode. +#define H32_OFFSET 4 // BGR555 to native conversion #if defined(USE_BGR555) @@ -118,14 +120,19 @@ static void convert_pal555(int invert_prio) } \ } +#define MD_LAYER_CODE_H32 \ + *dst = dst[H32_OFFSET] + // this is almost never used (Wiz and menu bg gen only) void FinalizeLine32xRGB555(int sh, int line, struct PicoEState *est) { - unsigned short *pd = est->DrawLineDest; + unsigned short *dst = est->DrawLineDest; unsigned short *pal = Pico32xMem->pal_native; unsigned char *pmd = est->HighCol + 8; + unsigned short *palmd = est->HighPal; unsigned short *dram, *p32x; unsigned char mdbg; + int h32 = !(Pico.video.reg[12] & 0x1); FinalizeLine555(sh, line, est); @@ -138,10 +145,14 @@ void FinalizeLine32xRGB555(int sh, int line, struct PicoEState *est) dram = (void *)Pico32xMem->dram[Pico32x.vdp_regs[0x0a/2] & P32XV_FS]; p32x = dram + dram[line]; mdbg = Pico.video.reg[7] & 0x3f; + if (h32) pmd += H32_OFFSET; if ((Pico32x.vdp_regs[0] & P32XV_Mx) == 2) { // Direct Color Mode int inv_bit = (Pico32x.vdp_regs[0] & P32XV_PRI) ? 0x8000 : 0; - do_line_dc(pd, p32x, pmd, inv_bit,); + if (h32) { + do_line_dc(dst, p32x, pmd, inv_bit, MD_LAYER_CODE_H32); + } else + do_line_dc(dst, p32x, pmd, inv_bit,); return; } @@ -152,10 +163,16 @@ void FinalizeLine32xRGB555(int sh, int line, struct PicoEState *est) unsigned char *p32xb = (void *)p32x; if (Pico32x.vdp_regs[2 / 2] & P32XV_SFT) p32xb++; - do_line_pp(pd, p32xb, pmd,); + if (h32) { + do_line_pp(dst, p32xb, pmd, MD_LAYER_CODE_H32); + } else + do_line_pp(dst, p32xb, pmd,); } else { // Run Length Mode - do_line_rl(pd, p32x, pmd,); + if (h32) { + do_line_rl(dst, p32x, pmd, MD_LAYER_CODE_H32); + } else + do_line_rl(dst, p32x, pmd,); } } @@ -182,6 +199,7 @@ static void do_loop_dc##name(unsigned short *dst, \ unsigned short *p32x; \ int lines = (lines_sft_offs >> 16) & 0xff; \ int l; \ + if (lines_sft_offs & (2<<8)) pmd += H32_OFFSET; \ (void)palmd; \ for (l = 0; l < lines; l++, pmd += 8) { \ pre_code; \ @@ -203,6 +221,7 @@ static void do_loop_pp##name(unsigned short *dst, \ unsigned char *p32x; \ int lines = (lines_sft_offs >> 16) & 0xff; \ int l; \ + if (lines_sft_offs & (2<<8)) pmd += H32_OFFSET; \ (void)palmd; \ for (l = 0; l < lines; l++, pmd += 8) { \ pre_code; \ @@ -225,6 +244,7 @@ static void do_loop_rl##name(unsigned short *dst, \ unsigned short *p32x; \ int lines = (lines_sft_offs >> 16) & 0xff; \ int l; \ + if (lines_sft_offs & (2<<8)) pmd += H32_OFFSET; \ (void)palmd; \ for (l = 0; l < lines; l++, pmd += 8) { \ pre_code; \ @@ -248,15 +268,17 @@ extern void do_loop_rl##name(unsigned short *dst, \ make_do_loop(,,,) make_do_loop(_md, , , MD_LAYER_CODE) +make_do_loop(_h32, , , MD_LAYER_CODE_H32) make_do_loop(_scan, PICOSCAN_PRE, PICOSCAN_POST, ) +make_do_loop(_scan_h32, PICOSCAN_PRE, PICOSCAN_POST, MD_LAYER_CODE_H32) make_do_loop(_scan_md, PICOSCAN_PRE, PICOSCAN_POST, MD_LAYER_CODE) typedef void (*do_loop_func)(unsigned short *dst, unsigned short *dram, unsigned lines, int mdbg); -enum { DO_LOOP, DO_LOOP_MD, DO_LOOP_SCAN, DO_LOOP_MD_SCAN }; +enum { DO_LOOP, DO_LOOP_H32, DO_LOOP_MD, DO_LOOP_SCAN, DO_LOOP_H32_SCAN, DO_LOOP_MD_SCAN }; -static const do_loop_func do_loop_dc_f[] = { do_loop_dc, do_loop_dc_md, do_loop_dc_scan, do_loop_dc_scan_md }; -static const do_loop_func do_loop_pp_f[] = { do_loop_pp, do_loop_pp_md, do_loop_pp_scan, do_loop_pp_scan_md }; -static const do_loop_func do_loop_rl_f[] = { do_loop_rl, do_loop_rl_md, do_loop_rl_scan, do_loop_rl_scan_md }; +static const do_loop_func do_loop_dc_f[] = { do_loop_dc, do_loop_dc_h32, do_loop_dc_md, do_loop_dc_scan, do_loop_dc_scan_h32, do_loop_dc_scan_md }; +static const do_loop_func do_loop_pp_f[] = { do_loop_pp, do_loop_pp_h32, do_loop_pp_md, do_loop_pp_scan, do_loop_pp_scan_h32, do_loop_pp_scan_md }; +static const do_loop_func do_loop_rl_f[] = { do_loop_rl, do_loop_rl_h32, do_loop_rl_md, do_loop_rl_scan, do_loop_rl_scan_h32, do_loop_rl_scan_md }; void PicoDraw32xLayer(int offs, int lines, int md_bg) { @@ -297,13 +319,20 @@ void PicoDraw32xLayer(int offs, int lines, int md_bg) } do_it: + // In 8bit modes MD+32X layers are merged together in 32X rendering, while in + // 16bit mode the MD layer is directly created in the target buffer and the + // 32X layer is overlaid onto that. if (Pico32xDrawMode == PDM32X_BOTH) which_func = have_scan ? DO_LOOP_MD_SCAN : DO_LOOP_MD; + else if (!(Pico.video.reg[12] & 1)) // H32, mind 4 px offset + which_func = have_scan ? DO_LOOP_H32_SCAN : DO_LOOP_H32; else which_func = have_scan ? DO_LOOP_SCAN : DO_LOOP; lines_sft_offs = (Pico32x.sync_line << 24) | (lines << 16) | offs; if (Pico32x.vdp_regs[2 / 2] & P32XV_SFT) lines_sft_offs |= 1 << 8; + if (!(Pico.video.reg[12] & 1)) // offset flag for H32 + lines_sft_offs |= 2 << 8; do_loop[which_func](Pico.est.DrawLineDest, dram, lines_sft_offs, md_bg); } @@ -357,7 +386,7 @@ void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode) // we'll draw via FinalizeLine32xRGB555 (rare) Pico32xDrawMode = PDM32X_OFF; else - // in RGB555 mode the 32x layer is drawn over the MD layer, in the other + // in RGB555 mode the 32x layer is overlaid on the MD layer, in the other // modes 32x and MD layer are merged together by the 32x renderer Pico32xDrawMode = (which == PDF_RGB555) ? PDM32X_32X_ONLY : PDM32X_BOTH; } diff --git a/pico/32x/draw_arm.S b/pico/32x/draw_arm.S index f78f68c8..8d8fd16d 100644 --- a/pico/32x/draw_arm.S +++ b/pico/32x/draw_arm.S @@ -68,7 +68,7 @@ @ direct color @ unsigned short *dst, unsigned short *dram, int lines_sft_offs, int mdbg -.macro make_do_loop_dc name call_scan do_md +.macro make_do_loop_dc name call_scan do_md do_h32 .global \name \name: stmfd sp!, {r4-r11,lr} @@ -83,6 +83,8 @@ mov r5, #328 mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data + tst r2, #(2<<8) + addne r11,r11,#4 @ H32 offset tst r10,#P32XV_PRI movne r10,#0 moveq r10,#0x8000 @ r10 = !inv_bit @@ -152,6 +154,20 @@ orr r12,r12, lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth str r12,[r0], #4 @ (no write combining on ARM9) +.else +.if \do_h32 + cmp r3, r12, lsl #26 @ replace MD bg info into prio bit + tstne r7, #0x20<<16 + ldrneh r7, [r0, #8] + moveq r7, r7, lsr #16 + + cmp r3, lr, lsl #26 + tstne r8, #0x20<<16 + ldrneh r8, [r0, #10] + moveq r8, r8, lsr #16 + + orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth + str r7, [r0], #4 @ (no write combining on ARM9) .else cmp r3, r12, lsl #26 @ replace MD bg info into prio bit tstne r7, #0x20<<16 @@ -164,6 +180,7 @@ streqh r8, [r0, #2] add r0, r0, #4 @ store 32x pixels if 32X prio or MD bg +.endif .endif b 2b @ loop_inner @@ -234,12 +251,24 @@ moveq lr, r7 orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth str r12,[r0], #4 @ (no write combining on ARM9) +.else +.if \do_h32 + add r0, r0, #4 + cmp r3, r12,lsl #26 @ MD pixel 0 has bg? + ldrneh lr, [r0, #4] + streqh r7, [r0, #-4] + strneh lr, [r0, #-4] + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? + ldrneh lr, [r0, #6] + streqh r7, [r0, #-2] + strneh lr, [r0, #-2] .else add r0, r0, #4 cmp r3, r12,lsl #26 @ MD pixel 0 has bg? streqh r7, [r0, #-4] cmp r3, lr, lsl #26 @ MD pixel 1 has bg? streqh r7, [r0, #-2] +.endif .endif subs r8, r8, #2 bgt 9b @ bg_loop @@ -251,7 +280,7 @@ @ note: this may read a few bytes over the end of PicoDraw2FB and dram, @ so those should have a bit more alloc'ed than really needed. @ unsigned short *dst, unsigned short *dram, int lines_sft_offs, int mdbg -.macro make_do_loop_pp name call_scan do_md +.macro make_do_loop_pp name call_scan do_md do_h32 .global \name \name: stmfd sp!, {r4-r11,lr} @@ -268,6 +297,8 @@ mov r5, #328 mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data + tst r2, #(2<<8) + addne r11,r11,#4 @ H32 offset call_scan_prep \call_scan lr mov r4, #0 @ line @@ -333,6 +364,20 @@ orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth str r7, [r0], #4 @ (no write combining on ARM9) .else +.if \do_h32 + cmp r3, r12, lsl #26 @ replace MD bg info into prio bit + orreq r7, r7, #0x20 + cmp r3, lr, lsl #26 + orreq r8, r8, #0x20 + + tst r7, #0x20 + ldreqh r7, [r0, #8] + tst r8, #0x20 + ldreqh r8, [r0, #10] + + orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth + str r7, [r0], #4 @ (no write combining on ARM9) +.else cmp r3, r12, lsl #26 @ replace MD bg info into prio bit orreq r7, r7, #0x20 cmp r3, lr, lsl #26 @@ -343,6 +388,7 @@ strneh r7, [r0, #-4] tst r8, #0x20 strneh r8, [r0, #-2] +.endif .endif b 2b @ loop_inner @@ -415,12 +461,24 @@ moveq lr, r7 orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth str r12,[r0], #4 @ (no write combining on ARM9) +.else +.if \do_h32 + add r0, r0, #4 + cmp r3, r12,lsl #26 @ MD pixel 0 has bg? + ldrneh lr, [r0, #4] + streqh r7, [r0, #-4] + strneh lr, [r0, #-4] + cmp r3, lr, lsl #26 @ MD pixel 1 has bg? + ldrneh lr, [r0, #6] + streqh r7, [r0, #-2] + strneh lr, [r0, #-2] .else add r0, r0, #4 cmp r3, r12,lsl #26 @ MD pixel 0 has bg? streqh r7, [r0, #-4] cmp r3, lr, lsl #26 @ MD pixel 1 has bg? streqh r7, [r0, #-2] +.endif .endif subs r8, r8, #2 bgt 9b @ bg_loop @@ -430,7 +488,7 @@ @ run length @ unsigned short *dst, unsigned short *dram, int lines_sft_offs, int mdbg -.macro make_do_loop_rl name call_scan do_md +.macro make_do_loop_rl name call_scan do_md do_h32 .global \name \name: stmfd sp!, {r4-r11,lr} @@ -447,6 +505,8 @@ mov r5, #328 mov r3, r3, lsl #26 @ mdbg << 26 mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data + tst r2, #(2<<8) + addne r11,r11,#4 @ H32 offset call_scan_prep \call_scan lr mov r4, #0 @ line @@ -490,9 +550,14 @@ ldrneh r12,[r9, r7] @ t = palmd[*pmd] streqh lr, [r0], #2 strneh r12,[r0], #2 @ *dst++ = t +.else +.if \do_h32 + ldrneh lr, [r0, #8] + strh lr, [r0], #2 .else streqh lr, [r0] add r0, r0, #2 +.endif .endif subs r8, r8, #0x100 bge 3b @ loop_innermost @@ -500,22 +565,28 @@ .endm -make_do_loop_dc do_loop_dc, 0, 0 -make_do_loop_dc do_loop_dc_md, 0, 1 -make_do_loop_dc do_loop_dc_scan, 1, 0 -make_do_loop_dc do_loop_dc_scan_md, 1, 1 +make_do_loop_dc do_loop_dc, 0, 0, 0 +make_do_loop_dc do_loop_dc_h32, 0, 0, 1 +make_do_loop_dc do_loop_dc_md, 0, 1, 0 +make_do_loop_dc do_loop_dc_scan, 1, 0, 0 +make_do_loop_dc do_loop_dc_scan_h32,1, 0, 1 +make_do_loop_dc do_loop_dc_scan_md, 1, 1, 0 .pool -make_do_loop_pp do_loop_pp, 0, 0 -make_do_loop_pp do_loop_pp_md, 0, 1 -make_do_loop_pp do_loop_pp_scan, 1, 0 -make_do_loop_pp do_loop_pp_scan_md, 1, 1 +make_do_loop_pp do_loop_pp, 0, 0, 0 +make_do_loop_pp do_loop_pp_h32, 0, 0, 1 +make_do_loop_pp do_loop_pp_md, 0, 1, 0 +make_do_loop_pp do_loop_pp_scan, 1, 0, 0 +make_do_loop_pp do_loop_pp_scan_h32,1, 0, 1 +make_do_loop_pp do_loop_pp_scan_md, 1, 1, 0 .pool -make_do_loop_rl do_loop_rl, 0, 0 -make_do_loop_rl do_loop_rl_md, 0, 1 -make_do_loop_rl do_loop_rl_scan, 1, 0 -make_do_loop_rl do_loop_rl_scan_md, 1, 1 +make_do_loop_rl do_loop_rl, 0, 0, 0 +make_do_loop_rl do_loop_rl_h32, 0, 0, 1 +make_do_loop_rl do_loop_rl_md, 0, 1, 0 +make_do_loop_rl do_loop_rl_scan, 1, 0, 0 +make_do_loop_rl do_loop_rl_scan_h32,1, 0, 1 +make_do_loop_rl do_loop_rl_scan_md, 1, 1, 0 .pool @ vim:filetype=armasm diff --git a/pico/draw.c b/pico/draw.c index e58d50fd..2d47998c 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -1749,17 +1749,17 @@ PICO_INTERNAL void PicoFrameStart(void) if (PicoIn.AHW & PAHW_32X) // H32 upscaling, before mixing in 32X layer est->rendstatus = (*est->PicoOpt & POPT_ALT_RENDERER) ? PDRAW_BORDER_32 : PDRAW_32X_SCALE|PDRAW_SOFTSCALE; - else if (!(PicoIn.opt & POPT_DIS_32C_BORDER)) + else if (!(*est->PicoOpt & POPT_DIS_32C_BORDER)) est->rendstatus |= PDRAW_BORDER_32; - if ((PicoIn.opt & POPT_EN_SOFTSCALE) && !(*est->PicoOpt & POPT_ALT_RENDERER)) + if ((*est->PicoOpt & POPT_EN_SOFTSCALE) && !(*est->PicoOpt & POPT_ALT_RENDERER)) est->rendstatus |= PDRAW_SOFTSCALE; if ((est->Pico->video.reg[12] & 6) == 6) est->rendstatus |= PDRAW_INTERLACE; // interlace mode if (!(est->Pico->video.reg[12] & 1)) { est->rendstatus |= PDRAW_32_COLS; - if (!(est->rendstatus & PDRAW_SOFTSCALE)) { + if (!(est->rendstatus & PDRAW_SOFTSCALE) && !(PicoIn.AHW & PAHW_32X)) { columns = 256; coffs = 32; } -- 2.39.5