/*
- * (C) Gražvydas "notaz" Ignotas, 2011
+ * (C) Gražvydas "notaz" Ignotas, 2011,2024
*
* This work is licensed under the terms of GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
.text
.align 2
+@ pld_: optional prefetch hint. Emits "pld [reg, offs]" only when HAVE_ARMV6
+@ is defined at preprocessing time, otherwise expands to nothing.
+@ offs defaults to #0.
+.macro pld_ reg offs=#0
+#ifdef HAVE_ARMV6
+ pld [\reg, \offs]
+#endif
+.endm
+
@ in: r0=dst, r2=pal, r12=0x1e
@ trashes r6-r8,lr,flags
-.macro do_4_pixels rs ibase obase
+.macro do_4x_4bpp rs ibase obase
.if \ibase - 1 < 0
and r6, r12, \rs, lsl #1
.else
strneh lr, [r0, #\obase+6]
.endm
-.global draw_spr16_full @ (u16 *d, void *s, u16 *pal, int lines)
-draw_spr16_full:
+@ do_4x_8bpp: draw the 4 pixels packed in \rs, one byte each, lowest byte first
+@ in: r0=dst, r2=pal, r12=0x1fe (mask selecting byte index * 2)
+@ trashes \rs,r6-r8,flags; r0 is not advanced
+@ a palette entry equal to 0 is not stored, leaving that dst pixel untouched
+.macro do_4x_8bpp rs
+ and r6, r12, \rs, lsl #1 @ byte 0 * 2
+ and r7, r12, \rs, lsr #7 @ byte 1 * 2
+ and r8, r12, \rs, lsr #15 @ byte 2 * 2
+ and \rs,r12, \rs, lsr #23 @ byte 3 * 2
+ ldrh r6, [r2, r6] @ palette lookups (halfword entries)
+ ldrh r7, [r2, r7]
+ ldrh r8, [r2, r8]
+ ldrh \rs,[r2, \rs]
+ tst r6, r6
+ strneh r6, [r0, #0] @ store only when the looked-up value is nonzero
+ tst r7, r7
+ strneh r7, [r0, #2]
+ tst r8, r8
+ strneh r8, [r0, #4]
+ tst \rs,\rs
+ strneh \rs,[r0, #6]
+.endm
+
+@ sprite_4bpp_x16_asm: per line, loads two source words (8 bytes) and expands
+@ do_4x_4bpp four times, then advances both dst and src by 2048 bytes.
+@ Loops until the line counter in r3 drops to 0.
+.global sprite_4bpp_x16_asm @ (u16 *d, void *s, u16 *pal, int lines)
+sprite_4bpp_x16_asm_: @ alternate entry used by sprite_driver_4bpp_asm: r3 = spriteDriverArg pointer
+ ldr r2, [r3] @ pal
+ ldr r3, [r3, #0x1c] @ lines
+sprite_4bpp_x16_asm: @ the alternate entry falls through to here with r2=pal, r3=lines
+ .cfi_startproc
stmfd sp!, {r4-r8,lr}
+ .cfi_def_cfa_offset 4*6 @ 6 registers pushed
+ .cfi_rel_offset lr, 4*5 @ lr is the last (highest) slot of the push
mov r12, #0x1e @ empty pixel
0:
ldmia r1, {r4,r5}
- do_4_pixels r4, 0, 0
- do_4_pixels r4, 16, 8
- do_4_pixels r5, 0, 16
- do_4_pixels r5, 16, 24
+ pld_ r1, #2048 @ prefetch hint for the source of the next line
+ do_4x_4bpp r4, 0, 0
+ do_4x_4bpp r4, 16, 8
+ do_4x_4bpp r5, 0, 16
+ do_4x_4bpp r5, 16, 24
subs r3, r3, #1
add r0, r0, #2048
add r1, r1, #2048
bgt 0b
ldmfd sp!, {r4-r8,pc}
+ .cfi_endproc
+
+
+@ sprite_driver_part1/2/3: shared skeleton of the generic sprite drivers.
+@ Usage: part1, a full-word loop at local label 0, part2, a single-pixel loop
+@ at local label 0 (pixel count in r8, pixel data in r4), part3.
+@ is8bpp: 0 = 4bpp (8 pixels per source word), 1 = 8bpp (4 pixels per word)
+@ in: r0=dst, r1=src, r2=width, r3=spriteDriverArg pointer
+@ part1 pushes r4-r11,lr and sets up: r2=pal, r12=index mask (0x1e or 0x1fe),
+@ r5=((h-1) << 8) | u0_fraction, r9=width, r10=line dst, r11=line src.
+@ Each line restarts with r0=r10, r1=r11, r3=r9.
+@ Labels 14 and 15 branched to here are defined by sprite_driver_part2.
+.macro sprite_driver_part1 is8bpp
+ stmfd sp!, {r4-r11,lr}
+ .cfi_def_cfa_offset 4*9
+ .cfi_rel_offset lr, 4*8
+ mov r12, #0x01e
+.if \is8bpp
+ orr r12, r12, #0x1f0 @ mask=0x01fe
+.endif
+ ldr r4, [r3, #4] @ u0
+ ldr r5, [r3, #0x1c] @ h
+ and r4, r4, #((8 >> \is8bpp) - 1) @ pixel position of u0 within a source word
+ sub r5, r5, #1
+ orr r5, r4, r5, lsl #8 @ ((h-1) << 8) | u0_fraction
+ mov r9, r2 @ saved_w
+ mov r10, r0 @ saved_dst
+ mov r11, r1 @ saved_src
+ ldr r2, [r3] @ pal
+11: @ line_loop:
+ pld_ r11, #2048
+ mov r0, r10
+ mov r1, r11
+ mov r3, r9
+ ands r6, r5, #(7 >> \is8bpp) @ r6 = u0_fraction
+ bne 15f @ fractional_u
+12: @ full-word path, re-entered from part3 after fractional_u
+ subs r3, r3, #(8 >> \is8bpp) @ w
+ bmi 14f @ fractional_w
+.endm
+@ sprite_driver_part2: placed right after the full-word loop of a driver.
+@ Handles end of line (13), a partial word at the end of a line (14) and a
+@ partial word at the start of a line (15), then falls into the single-pixel
+@ loop that the driver places after this macro (16).
+@ On reaching 16: r8 = pixels to draw one by one, r4 = pixel data positioned so
+@ that "and with r12" yields the first index * 2, r3 = pixels left afterwards.
+.macro sprite_driver_part2 is8bpp
+ cmn r3, #(8 >> \is8bpp) @ r3 == -(pixels per word): width consumed exactly
+ bne 14f @ fractional_w
+13: @ eol:
+ add r10, r10, #2048
+ add r11, r11, #2048
+ subs r5, r5, #0x100 @ line counter lives in bits 8 and up of r5
+ bpl 11b @ line_loop
+ ldmfd sp!, {r4-r11,pc}
+14: @ fractional_w:
+ ldr r4, [r1], #4
+ add r8, r3, #(8 >> \is8bpp) @ leftover pixel count
+ mov r3, #0 @ nothing remains after these pixels
+ mov r4, r4, lsl #1 @ pre-multiply the index by 2 for the r12 mask
+ b 16f @ fractional_loop
+15: @ fractional_u:
+ bic r1, r1, #3 @ word-align the source pointer
+ rsb r8, r6, #(8 >> \is8bpp) @ pixels from u0_fraction to the end of the word
+ ldr r4, [r1], #4
+ cmp r8, r3
+ movgt r8, r3 @ but no more than the width
+ mov r7, r6, lsl #(2 + \is8bpp) @ u0_fraction * bits per pixel
+ sub r3, r3, r8
+ sub r7, r7, #1 @ shift one bit less so the index ends up * 2
+ mov r4, r4, lsr r7
+16: @ fractional_loop:
+.endm
+@ sprite_driver_part3: placed right after the single-pixel loop of a driver.
+@ r3 == 0 means the line is finished, otherwise continue with full words.
+.macro sprite_driver_part3
+ tst r3, r3
+ beq 13b @ eol
+ b 12b @ return from fractional_u
+.endm
+
+@ sprite_driver_4bpp_asm: generic 4bpp sprite driver built from
+@ sprite_driver_part1/2/3; 8 pixels per source word.
+@ in: r0=dst, r1=src, r2=width, r3=spriteDriverArg pointer
+@ (fields read here and in the macros: pal at 0, u0 at 4, line count at 0x1c)
+.global sprite_driver_4bpp_asm @ (u16 *d, const void *s, int width, const spriteDriverArg *arg)
+sprite_driver_4bpp_asm:
+ .cfi_startproc
+ ldr r12, [r3, #4] @ u0
+ mov r12, r12, lsl #29 @ keep only the low 3 bits of u0
+ orr r12, r12, r2 @ w
+ cmp r12, #16 @ equal only if (u0 & 7) == 0 and w == 16
+ beq sprite_4bpp_x16_asm_ @ use specialized aligned x16 version
+ sprite_driver_part1 0
+0: @ full-word loop: 8 pixels per iteration
+ ldr r4, [r1], #4
+ pld_ r1, #28
+ do_4x_4bpp r4, 0, 0
+ do_4x_4bpp r4, 16, 8
+ add r0, r0, #16
+ subs r3, r3, #8
+ bpl 0b
+ sprite_driver_part2 0
+0: @ single-pixel loop: r8 iterations
+ and r7, r12, r4 @ r7 = pixel index * 2
+ mov r4, r4, lsr #4 @ next pixel
+ ldrh r7, [r2, r7]
+ add r0, r0, #2
+ tst r7, r7
+ strneh r7, [r0, #-2] @ store only when the looked-up value is nonzero
+ subs r8, r8, #1
+ bgt 0b
+ sprite_driver_part3
+ .cfi_endproc
+
+
+@ sprite_driver_8bpp_asm: generic 8bpp sprite driver built from
+@ sprite_driver_part1/2/3; 4 pixels per source word.
+@ in: r0=dst, r1=src, r2=width, r3=spriteDriverArg pointer
+.global sprite_driver_8bpp_asm @ (u16 *d, const void *s, int width, const spriteDriverArg *arg)
+sprite_driver_8bpp_asm:
+ .cfi_startproc
+ sprite_driver_part1 1
+0: @ full-word loop: 4 pixels per iteration
+ ldr r4, [r1], #4
+ pld_ r1, #28
+ do_4x_8bpp r4
+ add r0, r0, #8
+ subs r3, r3, #4
+ bpl 0b
+ sprite_driver_part2 1
+0: @ single-pixel loop: r8 iterations
+ and r7, r12, r4 @ r7 = pixel index * 2
+ mov r4, r4, lsr #8 @ next pixel
+ ldrh r7, [r2, r7]
+ add r0, r0, #2
+ tst r7, r7
+ strneh r7, [r0, #-2] @ store only when the looked-up value is nonzero
+ subs r8, r8, #1
+ bgt 0b
+ sprite_driver_part3
+ .cfi_endproc
+
@ vim:filetype=armasm
///////////////////////////////////////////////////////////////////////////////
// GPU Sprites innerloops generator
+
+// warning: gpu_arm.S asm uses this, update it if you change this
typedef struct spriteDriverArg {
- const le16_t *CBA;
- u32 u0, v0, u0_mask, v0_mask;
- s32 y0, y1, li;
+ const le16_t *CBA; // 00 (byte offsets, assuming a 4-byte pointer; asm loads this as pal)
+ u32 u0, v0, u0_mask, v0_mask; // 04 08 0c 10
+ s32 y0, y1, lines, li; // 14 18 1c 20 (asm reads lines at 0x1c)
} spriteDriverArg;
typedef void (*PS)(le16_t *pPixel, u32 count, const u8 *pTxt,
}
}
+#ifdef __arm__
+#include "gpu_arm.h"
+
+// 4bpp sprite entry: when the last texel column and row are unchanged by
+// u0_mask/v0_mask, hand the rectangle to sprite_driver_4bpp_asm with the source
+// pointer advanced to (u0, v0); otherwise use gpuSpriteDriverFn<0x20>.
+static void Sprite4bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+  const spriteDriverArg *arg)
+{
+#if 1  // set to 0 to always take the generic driver
+  const u32 lastU = arg->u0 + count - 1;
+  const u32 lastV = arg->v0 + arg->lines - 1;
+  const bool fitsU = (lastU & arg->u0_mask) == lastU;
+  const bool fitsV = (lastV & arg->v0_mask) == lastV;
+  if (fitsU && fitsV) {
+    // two 4bpp texels per byte, 2048 bytes per texture row
+    const u8 *pTxt = pTxt_base + (arg->u0 / 2 + arg->v0 * 2048);
+    sprite_driver_4bpp_asm(pPixel, pTxt, count, arg);
+    return;
+  }
+#endif
+  gpuSpriteDriverFn<0x20>(pPixel, count, pTxt_base, arg);
+}
+
+// 8bpp sprite entry: when the last texel column and row are unchanged by
+// u0_mask/v0_mask, hand the rectangle to sprite_driver_8bpp_asm with the source
+// pointer advanced to (u0, v0); otherwise use gpuSpriteDriverFn<0x40>.
+static void Sprite8bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+  const spriteDriverArg *arg)
+{
+#if 1  // set to 0 to always take the generic driver
+  const u32 lastU = arg->u0 + count - 1;
+  const u32 lastV = arg->v0 + arg->lines - 1;
+  const bool fitsU = (lastU & arg->u0_mask) == lastU;
+  const bool fitsV = (lastV & arg->v0_mask) == lastV;
+  if (fitsU && fitsV) {
+    // one 8bpp texel per byte, 2048 bytes per texture row
+    const u8 *pTxt = pTxt_base + (arg->u0 + arg->v0 * 2048);
+    sprite_driver_8bpp_asm(pPixel, pTxt, count, arg);
+    return;
+  }
+#endif
+  gpuSpriteDriverFn<0x40>(pPixel, count, pTxt_base, arg);
+}
+#endif // __arm__
+
static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base,
const spriteDriverArg *arg)
{
// Template instantiation helper macros
#define TI(cf) gpuSpriteDriverFn<(cf)>
#define TN SpriteNULL
+// TA4/TA8: table entries for the slots that can use the ARM asm drivers
+// (used by TIBLOCK at (ub)|0x20 and (ub)|0x40). On __arm__ the cf argument is
+// ignored and the MaybeAsm wrapper is used; elsewhere they are plain TI(cf).
+#ifdef __arm__
+#define TA4(cf) Sprite4bppMaybeAsm
+#define TA8(cf) Sprite8bppMaybeAsm
+#else
+#define TA4(cf) TI(cf)
+#define TA8(cf) TI(cf)
+#endif
#define TIBLOCK(ub) \
- TN, TN, TN, TN, TN, TN, TN, TN, \
- TN, TN, TN, TN, TN, TN, TN, TN, \
- TN, TN, TN, TN, TN, TN, TN, TN, \
- TN, TN, TN, TN, TN, TN, TN, TN, \
- TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
- TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
- TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \
- TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \
- TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
- TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \
- TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \
- TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \
- TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
- TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \
- TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \
- TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f)
+ TN, TN, TN, TN, TN, TN, TN, TN, \
+ TN, TN, TN, TN, TN, TN, TN, TN, \
+ TN, TN, TN, TN, TN, TN, TN, TN, \
+ TN, TN, TN, TN, TN, TN, TN, TN, \
+ TA4((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+ TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
+ TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \
+ TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \
+ TA8((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+ TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \
+ TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \
+ TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \
+ TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+ TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \
+ TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \
+ TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f)
const PS gpuSpriteDrivers[256] = {
TIBLOCK(0<<8), TIBLOCK(1<<8)