From d1e50db771c93b32910ca406bb63a7abd98f1b72 Mon Sep 17 00:00:00 2001
From: notaz
Date: Tue, 5 Nov 2024 02:42:18 +0200
Subject: [PATCH] gpu_unai: some new asm

Add ARM asm sprite drivers for 4bpp and 8bpp textures
(sprite_driver_4bpp_asm, sprite_driver_8bpp_asm) and hook them into the
gpuSpriteDrivers table through the Sprite4bppMaybeAsm and
Sprite8bppMaybeAsm wrappers. A wrapper takes the asm path only when
applying u0_mask/v0_mask to the last texel coordinate is a no-op;
otherwise it calls the generic gpuSpriteDriverFn instantiation.

draw_spr16_full is renamed to sprite_4bpp_x16_asm. It is now entered
from sprite_driver_4bpp_asm when (u0 & 7) == 0 and the width is 16,
which replaces the gpuDrawS16 special case in gpulib_if.cpp.

The wrappers are installed only for the plain 0x20 and 0x40 table
entries. All other entries, including the (1<<8) half of the table,
keep their own gpuSpriteDriverFn instantiation: the asm does not
implement whatever bit 8 selects, and the removed gpuDrawS16 path was
likewise restricted to (Masking | PixelMSB) == 0.

---
 plugins/gpu_unai/gpu_arm.S               | 170 +++++++++++++++++++++--
 plugins/gpu_unai/gpu_arm.h               |   8 +-
 plugins/gpu_unai/gpu_inner.h             |  83 ++++++++---
 plugins/gpu_unai/gpu_raster_sprite.h     |  48 +------
 plugins/gpu_unai/gpulib_if.cpp           |  11 --
 plugins/gpu_unai/old/gpu_arm.h           |   9 --
 plugins/gpu_unai/old/gpu_arm.s           |  55 --------
 plugins/gpu_unai/old/gpu_raster_sprite.h |   4 +-
 8 files changed, 236 insertions(+), 152 deletions(-)
 delete mode 100644 plugins/gpu_unai/old/gpu_arm.h
 delete mode 100644 plugins/gpu_unai/old/gpu_arm.s

diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S
index ec87f211..93269932 100644
--- a/plugins/gpu_unai/gpu_arm.S
+++ b/plugins/gpu_unai/gpu_arm.S
@@ -1,5 +1,5 @@
 /*
- * (C) Gražvydas "notaz" Ignotas, 2011
+ * (C) Gražvydas "notaz" Ignotas, 2011,2024
  *
  * This work is licensed under the terms of GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
@@ -10,9 +10,15 @@
 .text
 .align 2
 
+.macro pld_ reg offs=#0
+#ifdef HAVE_ARMV6
+    pld     [\reg, \offs]
+#endif
+.endm
+
 @ in: r0=dst, r2=pal, r12=0x1e
 @ trashes r6-r8,lr,flags
-.macro do_4_pixels rs ibase obase
+.macro do_4x_4bpp rs ibase obase
 .if \ibase - 1 < 0
     and     r6, r12, \rs, lsl #1
 .else
@@ -35,22 +41,170 @@
     strneh  lr, [r0, #\obase+6]
 .endm
 
-.global draw_spr16_full @ (u16 *d, void *s, u16 *pal, int lines)
-draw_spr16_full:
+@ in: r0=dst, r2=pal, r12=0x1fe
+@ trashes \rs,r6-r8,flags
+.macro do_4x_8bpp rs
+    and     r6, r12, \rs, lsl #1
+    and     r7, r12, \rs, lsr #7
+    and     r8, r12, \rs, lsr #15
+    and     \rs,r12, \rs, lsr #23
+    ldrh    r6, [r2, r6]
+    ldrh    r7, [r2, r7]
+    ldrh    r8, [r2, r8]
+    ldrh    \rs,[r2, \rs]
+    tst     r6, r6
+    strneh  r6, [r0, #0]
+    tst     r7, r7
+    strneh  r7, [r0, #2]
+    tst     r8, r8
+    strneh  r8, [r0, #4]
+    tst     \rs,\rs
+    strneh  \rs,[r0, #6]
+.endm
+
+.global sprite_4bpp_x16_asm @ (u16 *d, void *s, u16 *pal, int lines)
+sprite_4bpp_x16_asm_:               @ entry with r3=spriteDriverArg *
+    ldr     r2, [r3]                @ pal
+    ldr     r3, [r3, #0x1c]         @ lines
+sprite_4bpp_x16_asm:
+    .cfi_startproc
     stmfd   sp!, {r4-r8,lr}
+    .cfi_def_cfa_offset 4*6
+    .cfi_rel_offset lr, 4*5
     mov     r12, #0x1e   @ empty pixel
 
 0:
     ldmia   r1, {r4,r5}
-    do_4_pixels r4, 0,  0
-    do_4_pixels r4, 16, 8
-    do_4_pixels r5, 0,  16
-    do_4_pixels r5, 16, 24
+    pld_    r1, #2048
+    do_4x_4bpp r4, 0,  0
+    do_4x_4bpp r4, 16, 8
+    do_4x_4bpp r5, 0,  16
+    do_4x_4bpp r5, 16, 24
     subs    r3, r3, #1
     add     r0, r0, #2048
     add     r1, r1, #2048
     bgt     0b
 
     ldmfd   sp!, {r4-r8,pc}
+    .cfi_endproc
+
+
+@ sprite driver skeleton, in 3 parts placed around the per-bpp pixel loops
+.macro sprite_driver_part1 is8bpp
+    stmfd   sp!, {r4-r11,lr}
+    .cfi_def_cfa_offset 4*9
+    .cfi_rel_offset lr, 4*8
+    mov     r12, #0x01e
+.if \is8bpp
+    orr     r12, r12, #0x1f0   @ mask=0x01fe
+.endif
+    ldr     r4, [r3, #4]       @ u0
+    ldr     r5, [r3, #0x1c]    @ h
+    and     r4, r4, #((8 >> \is8bpp) - 1)
+    sub     r5, r5, #1
+    orr     r5, r4, r5, lsl #8 @ ((h-1) << 8) | u0_fraction
+    mov     r9, r2             @ saved_w
+    mov     r10, r0            @ saved_dst
+    mov     r11, r1            @ saved_src
+    ldr     r2, [r3]           @ pal
+11: @ line_loop:
+    pld_    r11, #2048
+    mov     r0, r10
+    mov     r1, r11
+    mov     r3, r9
+    ands    r6, r5, #(7 >> \is8bpp)
+    bne     15f @ fractional_u
+12:
+    subs    r3, r3, #(8 >> \is8bpp) @ w
+    bmi     14f @ fractional_w
+.endm
+.macro sprite_driver_part2 is8bpp
+    cmn     r3, #(8 >> \is8bpp)
+    bne     14f @ fractional_w
+13: @ eol:
+    add     r10, r10, #2048
+    add     r11, r11, #2048
+    subs    r5, r5, #0x100
+    bpl     11b @ line_loop
+    ldmfd   sp!, {r4-r11,pc}
+14: @ fractional_w:
+    ldr     r4, [r1], #4
+    add     r8, r3, #(8 >> \is8bpp)
+    mov     r3, #0
+    mov     r4, r4, lsl #1
+    b       16f @ fractional_loop
+15: @ fractional_u:
+    bic     r1, r1, #3
+    rsb     r8, r6, #(8 >> \is8bpp)
+    ldr     r4, [r1], #4
+    cmp     r8, r3
+    movgt   r8, r3
+    mov     r7, r6, lsl #(2 + \is8bpp)
+    sub     r3, r3, r8
+    sub     r7, r7, #1
+    mov     r4, r4, lsr r7
+16: @ fractional_loop:
+.endm
+.macro sprite_driver_part3
+    tst     r3, r3
+    beq     13b @ eol
+    b       12b @ return from fractional_u
+.endm
+
+.global sprite_driver_4bpp_asm @ (u16 *d, const void *s, int width, const spriteDriverArg *arg)
+sprite_driver_4bpp_asm:
+    .cfi_startproc
+    ldr     r12, [r3, #4]      @ u0
+    mov     r12, r12, lsl #29  @ (u0 & 7) in the top 3 bits
+    orr     r12, r12, r2       @ w
+    cmp     r12, #16           @ (u0 & 7) == 0 and w == 16?
+    beq     sprite_4bpp_x16_asm_ @ use specialized aligned x16 version
+    sprite_driver_part1 0
+0:
+    ldr     r4, [r1], #4
+    pld_    r1, #28
+    do_4x_4bpp r4, 0,  0
+    do_4x_4bpp r4, 16, 8
+    add     r0, r0, #16
+    subs    r3, r3, #8
+    bpl     0b
+    sprite_driver_part2 0
+0:
+    and     r7, r12, r4
+    mov     r4, r4, lsr #4
+    ldrh    r7, [r2, r7]
+    add     r0, r0, #2
+    tst     r7, r7
+    strneh  r7, [r0, #-2]
+    subs    r8, r8, #1
+    bgt     0b
+    sprite_driver_part3
+    .cfi_endproc
+
+
+.global sprite_driver_8bpp_asm @ (u16 *d, const void *s, int width, const spriteDriverArg *arg)
+sprite_driver_8bpp_asm:
+    .cfi_startproc
+    sprite_driver_part1 1
+0:
+    ldr     r4, [r1], #4
+    pld_    r1, #28
+    do_4x_8bpp r4
+    add     r0, r0, #8
+    subs    r3, r3, #4
+    bpl     0b
+    sprite_driver_part2 1
+0:
+    and     r7, r12, r4
+    mov     r4, r4, lsr #8
+    ldrh    r7, [r2, r7]
+    add     r0, r0, #2
+    tst     r7, r7
+    strneh  r7, [r0, #-2]
+    subs    r8, r8, #1
+    bgt     0b
+    sprite_driver_part3
+    .cfi_endproc
+
 
 @ vim:filetype=armasm
diff --git a/plugins/gpu_unai/gpu_arm.h b/plugins/gpu_unai/gpu_arm.h
index 0f8ed6b5..2329c46c 100644
--- a/plugins/gpu_unai/gpu_arm.h
+++ b/plugins/gpu_unai/gpu_arm.h
@@ -5,7 +5,13 @@
 extern "C" {
 #endif
 
-void draw_spr16_full(void *d, void *s, void *pal, int lines);
+struct spriteDriverArg;
+
+void sprite_driver_4bpp_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct spriteDriverArg *arg);
+void sprite_driver_8bpp_asm(void *pPixel, const u8 *pTxt_base,
+	u32 count, const struct spriteDriverArg *arg);
+void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
 
 #ifdef __cplusplus
 }
diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h
index 11834838..a80c3a3a 100644
--- a/plugins/gpu_unai/gpu_inner.h
+++ b/plugins/gpu_unai/gpu_inner.h
@@ -359,10 +359,12 @@ const PT gpuTileSpanDrivers[32] = {
 
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Sprites innerloops generator
+
+// warning: gpu_arm.S asm uses this, update it if you change this
 typedef struct spriteDriverArg {
-	const le16_t *CBA;
-	u32 u0, v0, u0_mask, v0_mask;
-	s32 y0, y1, li;
+	const le16_t *CBA;             // 00
+	u32 u0, v0, u0_mask, v0_mask;  // 04 08 0c 10
+	s32 y0, y1, lines, li;         // 14 18 1c 20
 } spriteDriverArg;
 
 typedef void (*PS)(le16_t *pPixel, u32 count, const u8 *pTxt,
@@ -447,6 +449,42 @@ endsprite:
 	}
 }
 
+#ifdef __arm__
+#include "gpu_arm.h"
+
+// 4bpp: asm only if u0_mask/v0_mask leave the last u/v texel unchanged
+static void Sprite4bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+	const spriteDriverArg *arg)
+{
+#if 1
+	s32 lines = arg->lines;
+	u32 u1m = arg->u0 + count - 1, v1m = arg->v0 + lines - 1;
+	if (u1m == (u1m & arg->u0_mask) && v1m == (v1m & arg->v0_mask)) {
+		pTxt_base += arg->u0 / 2 + arg->v0 * 2048;
+		sprite_driver_4bpp_asm(pPixel, pTxt_base, count, arg);
+	}
+	else
+#endif
+		gpuSpriteDriverFn<0x20>(pPixel, count, pTxt_base, arg);
+}
+
+// 8bpp: asm only if u0_mask/v0_mask leave the last u/v texel unchanged
+static void Sprite8bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+	const spriteDriverArg *arg)
+{
+#if 1
+	s32 lines = arg->lines;
+	u32 u1m = arg->u0 + count - 1, v1m = arg->v0 + lines - 1;
+	if (u1m == (u1m & arg->u0_mask) && v1m == (v1m & arg->v0_mask)) {
+		pTxt_base += arg->u0 + arg->v0 * 2048;
+		sprite_driver_8bpp_asm(pPixel, pTxt_base, count, arg);
+	}
+	else
+#endif
+		gpuSpriteDriverFn<0x40>(pPixel, count, pTxt_base, arg);
+}
+#endif // __arm__
+
 static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base,
 	const spriteDriverArg *arg)
 {
@@ -463,23 +501,30 @@ static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base,
 // Template instantiation helper macros
 #define TI(cf) gpuSpriteDriverFn<(cf)>
 #define TN     SpriteNULL
+#ifdef __arm__
+#define TA4(cf) ((cf) == 0x20 ? (PS)Sprite4bppMaybeAsm : (PS)TI(cf))
+#define TA8(cf) ((cf) == 0x40 ? (PS)Sprite8bppMaybeAsm : (PS)TI(cf))
+#else
+#define TA4(cf) TI(cf)
+#define TA8(cf) TI(cf)
+#endif
 #define TIBLOCK(ub) \
-	TN, TN, TN, TN, TN, TN, TN, TN, \
-	TN, TN, TN, TN, TN, TN, TN, TN, \
-	TN, TN, TN, TN, TN, TN, TN, TN, \
-	TN, TN, TN, TN, TN, TN, TN, TN, \
-	TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
-	TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
-	TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \
-	TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \
-	TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
-	TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \
-	TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \
-	TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \
-	TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
-	TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \
-	TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \
-	TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f)
+	TN, TN, TN, TN, TN, TN, TN, TN, \
+	TN, TN, TN, TN, TN, TN, TN, TN, \
+	TN, TN, TN, TN, TN, TN, TN, TN, \
+	TN, TN, TN, TN, TN, TN, TN, TN, \
+	TA4((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+	TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
+	TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \
+	TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \
+	TA8((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+	TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \
+	TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \
+	TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \
+	TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+	TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \
+	TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \
+	TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f)
 
 const PS gpuSpriteDrivers[256] = {
 	TIBLOCK(0<<8), TIBLOCK(1<<8)
diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h
index e49e7a8d..2564e7f0 100644
--- a/plugins/gpu_unai/gpu_raster_sprite.h
+++ b/plugins/gpu_unai/gpu_raster_sprite.h
@@ -83,57 +83,11 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out)
 	arg.v0_mask = gpu_unai.TextureWindow[3];
 	arg.y0 = y0;
 	arg.y1 = y1;
+	arg.lines = y1 - y0;
 	arg.li = li;
 	gpuSpriteDriver(Pixel, x1, pTxt_base, &arg);
 }
 
-#ifdef __arm__
-#include "gpu_arm.h"
-
-/* Notaz 4bit sprites optimization */
-void gpuDrawS16(PtrUnion packet, s32 *w_out, s32 *h_out)
-{
-	s32 x0, y0;
-	s32 u0, v0;
-	s32 xmin, xmax;
-	s32 ymin, ymax;
-	u32 h = 16;
-
-	//NOTE: Must 11-bit sign-extend the whole sum here, not just packet X/Y,
-	// or sprites in 1st level of SkullMonkeys disappear when walking right.
-	// This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon:
-	x0 = GPU_EXPANDSIGN(le16_to_s16(packet.U2[2]) + gpu_unai.DrawingOffset[0]);
-	y0 = GPU_EXPANDSIGN(le16_to_s16(packet.U2[3]) + gpu_unai.DrawingOffset[1]);
-
-	xmin = gpu_unai.DrawingArea[0];	xmax = gpu_unai.DrawingArea[2];
-	ymin = gpu_unai.DrawingArea[1];	ymax = gpu_unai.DrawingArea[3];
-	u0 = packet.U1[8];
-	v0 = packet.U1[9];
-
-	if (x0 > xmax - 16 || x0 < xmin ||
-	    ((u0 | v0) & 15) || !(gpu_unai.TextureWindow[2] & gpu_unai.TextureWindow[3] & 8)) {
-		// send corner cases to general handler
-		packet.U4[3] = u32_to_le32(0x00100010);
-		gpuDrawS(packet, gpuSpriteSpanFn<0x20>, w_out, h_out);
-		return;
-	}
-
-	if (y0 >= ymax || y0 <= ymin - 16)
-		return;
-	if (y0 < ymin) {
-		h -= ymin - y0;
-		v0 += ymin - y0;
-		y0 = ymin;
-	}
-	else if (ymax - y0 < 16)
-		h = ymax - y0;
-	*w_out = 16;
-	*h_out = h;
-
-	draw_spr16_full(&gpu_unai.vram[FRAME_OFFSET(x0, y0)], &gpu_unai.TBA[FRAME_OFFSET(u0/4, v0)], gpu_unai.CBA, h);
-}
-#endif // __arm__
-
 void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver, s32 *w_out, s32 *h_out)
 {
 	s32 x0, x1, y0, y1;
diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp
index aface806..eb47c2a6 100644
--- a/plugins/gpu_unai/gpulib_if.cpp
+++ b/plugins/gpu_unai/gpulib_if.cpp
@@ -766,17 +766,6 @@ int do_cmd_list(u32 *list_, int list_len,
 
       case 0x7C:
       case 0x7D:
-#ifdef __arm__
-        if ((gpu_unai.GPU_GP1 & 0x180) == 0 && (gpu_unai.Masking | gpu_unai.PixelMSB) == 0)
-        {
-          s32 w = 0, h = 0;
-          gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
-          gpuDrawS16(packet, &w, &h);
-          gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
-          break;
-        }
-        // fallthrough
-#endif
       case 0x7E:
       case 0x7F: {          // Textured rectangle (16x16)
         gpu_unai.PacketBuffer.U4[3] = u32_to_le32(0x00100010);
diff --git a/plugins/gpu_unai/old/gpu_arm.h b/plugins/gpu_unai/old/gpu_arm.h
deleted file mode 100644
index a0b22487..00000000
--- a/plugins/gpu_unai/old/gpu_arm.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void draw_spr16_full(u16 *d, void *s, u16 *pal, int lines);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/plugins/gpu_unai/old/gpu_arm.s b/plugins/gpu_unai/old/gpu_arm.s
deleted file mode 100644
index 8fa44a7a..00000000
--- a/plugins/gpu_unai/old/gpu_arm.s
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * (C) Gražvydas "notaz" Ignotas, 2011
- *
- * This work is licensed under the terms of GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- */
-
-
-.text
-.align 2
-
-@ in: r0=dst, r2=pal, r12=0x1e
-@ trashes r6-r8,lr,flags
-.macro do_4_pixels rs ibase obase
-.if \ibase - 1 < 0
-    and     r6, r12, \rs, lsl #1
-.else
-    and     r6, r12, \rs, lsr #\ibase-1
-.endif
-    and     r7, r12, \rs, lsr #\ibase+3
-    and     r8, r12, \rs, lsr #\ibase+7
-    and     lr, r12, \rs, lsr #\ibase+11
-    ldrh    r6, [r2, r6]
-    ldrh    r7, [r2, r7]
-    ldrh    r8, [r2, r8]
-    ldrh    lr, [r2, lr]
-    tst     r6, r6
-    strneh  r6, [r0, #\obase+0]
-    tst     r7, r7
-    strneh  r7, [r0, #\obase+2]
-    tst     r8, r8
-    strneh  r8, [r0, #\obase+4]
-    tst     lr, lr
-    strneh  lr, [r0, #\obase+6]
-.endm
-
-.global draw_spr16_full @ (u16 *d, void *s, u16 *pal, int lines)
-draw_spr16_full:
-    stmfd   sp!, {r4-r8,lr}
-    mov     r12, #0x1e   @ empty pixel
-
-0:
-    ldmia   r1, {r4,r5}
-    do_4_pixels r4, 0,  0
-    do_4_pixels r4, 16, 8
-    do_4_pixels r5, 0,  16
-    do_4_pixels r5, 16, 24
-    subs    r3, r3, #1
-    add     r0, r0, #2048
-    add     r1, r1, #2048
-    bgt     0b
-
-    ldmfd   sp!, {r4-r8,pc}
-
-@ vim:filetype=armasm
diff --git a/plugins/gpu_unai/old/gpu_raster_sprite.h b/plugins/gpu_unai/old/gpu_raster_sprite.h
index a700db32..4e19428e 100644
--- a/plugins/gpu_unai/old/gpu_raster_sprite.h
+++ b/plugins/gpu_unai/old/gpu_raster_sprite.h
@@ -85,7 +85,7 @@ void gpuDrawS(const PS gpuSpriteSpanDriver)
 }
 
 #ifdef __arm__
-#include "gpu_arm.h"
+#include "../gpu_arm.h"
 
 void gpuDrawS16(void)
 {
@@ -121,7 +121,7 @@ void gpuDrawS16(void)
 	else if (ymax - y0 < 16)
 		h = ymax - y0;
 
-	draw_spr16_full(&GPU_FrameBuffer[FRAME_OFFSET(x0, y0)], &TBA[FRAME_OFFSET(u0/4, v0)], CBA, h);
+	sprite_4bpp_x16_asm(&GPU_FrameBuffer[FRAME_OFFSET(x0, y0)], &TBA[FRAME_OFFSET(u0/4, v0)], CBA, h);
 }
 #endif // __arm__
 
-- 
2.39.5