From: notaz Date: Wed, 27 Nov 2024 00:14:03 +0000 (+0200) Subject: gpu_unai: asm part 3 X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=eb54d5413cd037964c492ad6b237f70fdc3abe97;p=pcsx_rearmed.git gpu_unai: asm part 3 --- diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S index f0684992..b56951f9 100644 --- a/plugins/gpu_unai/gpu_arm.S +++ b/plugins/gpu_unai/gpu_arm.S @@ -7,6 +7,7 @@ #include "arm_features.h" +.syntax unified .text .align 2 @@ -32,13 +33,13 @@ ldrh r8, [r2, r8] ldrh lr, [r2, lr] tst r6, r6 - strneh r6, [r0, #\obase+0] + strhne r6, [r0, #\obase+0] tst r7, r7 - strneh r7, [r0, #\obase+2] + strhne r7, [r0, #\obase+2] tst r8, r8 - strneh r8, [r0, #\obase+4] + strhne r8, [r0, #\obase+4] tst lr, lr - strneh lr, [r0, #\obase+6] + strhne lr, [r0, #\obase+6] .endm @ in: r0=dst, r2=pal, r12=0x1fe @@ -53,13 +54,13 @@ ldrh r8, [r2, r8] ldrh \rs,[r2, \rs] tst r6, r6 - strneh r6, [r0, #0] + strhne r6, [r0, #0] tst r7, r7 - strneh r7, [r0, #2] + strhne r7, [r0, #2] tst r8, r8 - strneh r8, [r0, #4] + strhne r8, [r0, #4] tst \rs,\rs - strneh \rs,[r0, #6] + strhne \rs,[r0, #6] .endm .global sprite_4bpp_x16_asm @ (u16 *d, void *s, u16 *pal, int lines) @@ -175,7 +176,7 @@ sprite_driver_4bpp_asm: ldrh r7, [r2, r7] add r0, r0, #2 tst r7, r7 - strneh r7, [r0, #-2] + strhne r7, [r0, #-2] subs r8, r8, #1 bgt 0b sprite_driver_part3 @@ -200,7 +201,7 @@ sprite_driver_8bpp_asm: ldrh r7, [r2, r7] add r0, r0, #2 tst r7, r7 - strneh r7, [r0, #-2] + strhne r7, [r0, #-2] subs r8, r8, #1 bgt 0b sprite_driver_part3 @@ -254,7 +255,7 @@ poly_4bpp_asm: add r0, r0, #2 mov r7, r4, lsr #13 tst r12,r12 - strneh r12,[r0, #-2] + strhne r12,[r0, #-2] subs r2, r2, #1 bgt 0b @@ -285,7 +286,7 @@ poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked and lr, r7, r9 tst r12,r12 add lr, r1, lr, lsl #1 - strneh r12,[r0, #-2] + strhne r12,[r0, #-2] mov r12,r4, lsr #13 subs r2, r2, #1 bgt 0b @@ -304,7 +305,7 @@ poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked smulbb \t0, \t0, \mbr @ -> 0000 0000 0000 orrr rrxx xxxx xxxx xxxx smulbt \t1, \t1, \mg @ -> 0000 000o gggg gxxx xxxx xxxx xxx0 0000 smulbt \t2, \t2, \mbr @ -> 00ob bbbb xxxx xxxx xxxx xx00 0000 0000 - and \rp, \rp, #0x8000 + ands \rp, \rp, #0x8000 @ retain msb + semi-transparency test usat \t0, #5, \t0, asr #14 usat \t1, #5, \t1, asr #19 usat \t2, #5, \t2, asr #24 @@ -313,13 +314,25 @@ poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked orr \rp, \rp, \t2, lsl #10 .endm -.global poly_4bpp_l_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count) -poly_4bpp_l_asm: +@ http://www.slack.net/~ant/info/rgb_mixing.html +@ p0 = (p0 + p1) / 2; p1 |= 0x8000 +@ msb of input p0 is assumed to be set +.macro semitrans0 p0 p1 t + eor \t, \p0, \p1 + and \t, \t, #0x0420 + sub \p0, \p0, \t + orr \p1, \p1, #0x8000 + uhadd16 \p0, \p0, \p1 +.endm + +.macro poly_4bpp_asm_m name semitrans +.global \name @ (void *d, const struct gpu_unai_inner_t *inn, int count) +\name: .cfi_startproc stmfd sp!, {r4-r11,lr} .cfi_def_cfa_offset 4*9 .cfi_rel_offset lr, 4*8 - poly_4bpp_init poly_4bpp_lv_asm 1 + poly_4bpp_init v_\name 1 0: mov r12,r4, lsr #13 subs r2, r2, #1 @@ -337,12 +350,20 @@ poly_4bpp_l_asm: tst r12,r12 beq 0b modulate r12, r10, r11, r7, r8, lr +.if \semitrans < 0 + @ no semi-transparency +.elseif \semitrans == 0 + ldrhne r7, [r0, #-2] + strheq r12,[r0, #-2] + beq 0b + semitrans0 r12, r7, lr +.endif strh r12,[r0, #-2] b 0b 1: ldmfd sp!, {r4-r11,pc} -poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked +v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked sub sp, sp, #4*2 .cfi_def_cfa_offset 4*(9+2) .cfi_rel_offset lr, 4*(8+2) @@ -372,6 +393,14 @@ poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked tst r12,r12 beq 0b modulate r12, r10, r11, r5, r6, lr +.if \semitrans < 0 + @ no semi-transparency +.elseif \semitrans == 0 + ldrhne r7, [r0, #-2] + strheq r12,[r0, #-2] + beq 0b + semitrans0 r12, r7, lr +.endif strh r12,[r0, #-2] ldmia sp, {r5,r6} b 0b @@ -379,6 +408,10 @@ poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked add sp, sp, #4*2 ldmfd sp!, {r4-r11,pc} .cfi_endproc +.endm + +poly_4bpp_asm_m poly_4bpp_l_asm, -1 +poly_4bpp_asm_m poly_4bpp_l_st0_asm, 0 #endif // HAVE_ARMV6 diff --git a/plugins/gpu_unai/gpu_arm.h b/plugins/gpu_unai/gpu_arm.h index ccdc781f..027aa53c 100644 --- a/plugins/gpu_unai/gpu_arm.h +++ b/plugins/gpu_unai/gpu_arm.h @@ -16,6 +16,7 @@ void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines); void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count); void poly_4bpp_l_asm(void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_4bpp_l_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count); #ifdef __cplusplus } diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h index 5cef54a7..87324b90 100644 --- a/plugins/gpu_unai/gpu_inner.h +++ b/plugins/gpu_unai/gpu_inner.h @@ -56,6 +56,7 @@ #include "gpu_inner_light.h" #include "arm_features.h" +#include "compiler_features.h" #ifdef __arm__ #include "gpu_inner_blend_arm.h" #include "gpu_inner_light_arm.h" @@ -372,7 +373,7 @@ typedef void (*PS)(le16_t *pPixel, u32 count, const u8 *pTxt, const spriteDriverArg *arg); template -static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base, +static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base, const spriteDriverArg *arg) { // Blend func can save an operation if it knows uSrc MSB is unset. @@ -557,7 +558,7 @@ const PS gpuSpriteDrivers[256] = { // relevant blend/light headers. // (see README_senquack.txt) template -static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) +static noinline void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) { // Blend func can save an operation if it knows uSrc MSB is unset. // Untextured prims can always skip this (src color MSB is always 0). @@ -754,11 +755,13 @@ endpolytext: #ifdef __arm__ template -static void PolySpanAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) +static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) { switch (CF) { - case 0x20: poly_4bpp_asm (pDst, &gpu_unai.inn, count); break; - case 0x21: poly_4bpp_l_asm(pDst, &gpu_unai.inn, count); break; + case 0x20: poly_4bpp_asm (pDst, &gpu_unai.inn, count); break; + case 0x21: poly_4bpp_l_asm (pDst, &gpu_unai.inn, count); break; + case 0x23: poly_4bpp_l_st0_asm(pDst, &gpu_unai.inn, count); break; + default: gpuPolySpanFn(gpu_unai, pDst, count); } } #endif @@ -778,12 +781,12 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count); #define TI(cf) gpuPolySpanFn<(cf)> #define TN PolyNULL #ifdef __arm__ -#define TA(cf) PolySpanAsm<(cf)> +#define TA(cf) PolySpanMaybeAsm<(cf)> #else #define TA(cf) TI(cf) #endif #ifdef HAVE_ARMV6 -#define TA6(cf) PolySpanAsm<(cf)> +#define TA6(cf) PolySpanMaybeAsm<(cf)> #else #define TA6(cf) TI(cf) #endif @@ -792,7 +795,7 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count); TN, TN, TI((ub)|0x0a), TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \ TN, TN, TI((ub)|0x12), TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \ TN, TN, TI((ub)|0x1a), TI((ub)|0x1b), TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \ - TA((ub)|0x20), TA6((ub)|0x21),TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ + TA((ub)|0x20), TA6((ub)|0x21),TI((ub)|0x22), TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \ TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \ TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \ diff --git a/plugins/gpu_unai/gpu_inner_blend_arm.h b/plugins/gpu_unai/gpu_inner_blend_arm.h index 6413527c..f887374c 100644 --- a/plugins/gpu_unai/gpu_inner_blend_arm.h +++ b/plugins/gpu_unai/gpu_inner_blend_arm.h @@ -41,10 +41,14 @@ GPU_INLINE uint_fast16_t gpuBlendingARM(uint_fast16_t uSrc, uint_fast16_t uDst) asm ("eor %[mix], %[uSrc], %[uDst]\n\t" // uSrc ^ uDst "and %[mix], %[mix], %[mask]\n\t" // ... & 0x0421 "sub %[mix], %[uDst], %[mix]\n\t" // uDst - ... + #ifdef HAVE_ARMV6 + "uhadd16 %[mix], %[uSrc], %[mix]\n\t" + #else "add %[mix], %[uSrc], %[mix]\n\t" // uSrc + ... "mov %[mix], %[mix], lsr #0x1\n\t" // ... >> 1 + #endif : [mix] "=&r" (mix) - : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0421)); + : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0420)); // 421 } if (BLENDMODE == 1 || BLENDMODE == 3) {