From: notaz Date: Tue, 26 Nov 2024 00:26:59 +0000 (+0200) Subject: gpu_unai: asm part 2 X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=93b00bc7ff29bedbf4ed6d109f37fef57376994a;p=pcsx_rearmed.git gpu_unai: asm part 2 --- diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S index 3b68acea..f0684992 100644 --- a/plugins/gpu_unai/gpu_arm.S +++ b/plugins/gpu_unai/gpu_arm.S @@ -207,27 +207,40 @@ sprite_driver_8bpp_asm: .cfi_endproc -.global poly_4bpp_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count) -poly_4bpp_asm: - .cfi_startproc +.macro poly_4bpp_init v_target need_rgb add r12, r1, #4 - stmfd sp!, {r4-r7,lr} - .cfi_def_cfa_offset 4*5 - .cfi_rel_offset lr, 4*4 ldmia r12, {r3, r4, r7, r12, lr} @ clut, u, v, u_msk, v_msk ldr r5, [r1, #0x18] @ u_inc +.if \need_rgb + ldr r10,[r1, #0x24] @ rbg +.endif mov r6, r12 ldr r12,[r1, #0x1c] @ v_inc +.if \need_rgb + mov r10,r10,lsl #7 @ 0bbb bbbb 0ggg gggg 0rrr rrrr r000 0000 + bic r10,r10,#1<<23 + bic r10,r10,#1<<15 + mov r11,r10,lsl #8 +.endif and r4, r4, r6 and lr, lr, r7 @ v_msk & v and lr, lr, #0xff<<10 tst r12,r12 - bne poly_4bpp_asm_v + bne \v_target ldr r1, [r1] @ src mov r7, r4, lsr #13 add r1, r1, lr, lsl #1 add r12,r1, r7, lsl #2 pld_ r12,#2048 +.endm + +.global poly_4bpp_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count) +poly_4bpp_asm: + .cfi_startproc + stmfd sp!, {r4-r7,lr} + .cfi_def_cfa_offset 4*5 + .cfi_rel_offset lr, 4*4 + poly_4bpp_init poly_4bpp_v_asm 0 0: ldr lr, [r1, r7, lsl #2] lsr r12,r4, #8 @@ -247,8 +260,10 @@ poly_4bpp_asm: ldmfd sp!, {r4-r7,pc} -poly_4bpp_asm_v: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked +poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked stmfd sp!, {r8-r9} + .cfi_def_cfa_offset 4*7 + .cfi_rel_offset lr, 4*6 ldr r9, [r1, #0x14] @ v_msk ldr r1, [r1] @ src mov r8, r12 @ v_inc @@ -279,4 +294,92 @@ poly_4bpp_asm_v: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked ldmfd sp!, {r4-r7,pc} 
.cfi_endproc + +#ifdef HAVE_ARMV6 + +.macro modulate rp mbr mg t0 t1 t2 + and \t0, \rp, #0x001f + and \t1, \rp, #0x03e0 + and \t2, \rp, #0x7c00 + smulbb \t0, \t0, \mbr @ -> 0000 0000 0000 orrr rrxx xxxx xxxx xxxx + smulbt \t1, \t1, \mg @ -> 0000 000o gggg gxxx xxxx xxxx xxx0 0000 + smulbt \t2, \t2, \mbr @ -> 00ob bbbb xxxx xxxx xxxx xx00 0000 0000 + and \rp, \rp, #0x8000 + usat \t0, #5, \t0, asr #14 + usat \t1, #5, \t1, asr #19 + usat \t2, #5, \t2, asr #24 + orr \rp, \rp, \t0 + orr \rp, \rp, \t1, lsl #5 + orr \rp, \rp, \t2, lsl #10 +.endm + +.global poly_4bpp_l_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count) +poly_4bpp_l_asm: + .cfi_startproc + stmfd sp!, {r4-r11,lr} + .cfi_def_cfa_offset 4*9 + .cfi_rel_offset lr, 4*8 + poly_4bpp_init poly_4bpp_lv_asm 1 +0: + mov r12,r4, lsr #13 + subs r2, r2, #1 + bmi 1f + ldr lr, [r1, r12, lsl #2] + lsr r12,r4, #8 + and r12,r12,#0x1c + sub r12,r12,#1 + mov r12,lr, ror r12 + add r4, r4, r5 + and r12,r12,#0x1e + and r4, r4, r6 + ldrh r12,[r3, r12] + add r0, r0, #2 + tst r12,r12 + beq 0b + modulate r12, r10, r11, r7, r8, lr + strh r12,[r0, #-2] + b 0b +1: + ldmfd sp!, {r4-r11,pc} + +poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked + sub sp, sp, #4*2 + .cfi_def_cfa_offset 4*(9+2) + .cfi_rel_offset lr, 4*(8+2) + ldr r9, [r1, #0x14] @ v_msk + ldr r1, [r1] @ src + mov r8, r12 @ v_inc + mov r12,r4, lsr #13 + and r9, r9, #0xff<<10 @ v_msk_final + stmia sp, {r5,r6} +0: + and lr, r7, r9 + mov r12,r4, lsr #13 + add lr, r1, lr, lsl #1 + subs r2, r2, #1 + bmi 1f + ldr lr, [lr, r12, lsl #2] + lsr r12,r4, #8 + and r12,r12,#0x1c + sub r12,r12,#1 + mov r12,lr, ror r12 + add r4, r4, r5 + and r12,r12,#0x1e + and r4, r4, r6 + ldrh r12,[r3, r12] + add r0, r0, #2 + add r7, r7, r8 + tst r12,r12 + beq 0b + modulate r12, r10, r11, r5, r6, lr + strh r12,[r0, #-2] + ldmia sp, {r5,r6} + b 0b +1: + add sp, sp, #4*2 + ldmfd sp!, {r4-r11,pc} + .cfi_endproc + +#endif // HAVE_ARMV6 + @ vim:filetype=armasm diff --git 
a/plugins/gpu_unai/gpu_arm.h b/plugins/gpu_unai/gpu_arm.h index 287846e4..ccdc781f 100644 --- a/plugins/gpu_unai/gpu_arm.h +++ b/plugins/gpu_unai/gpu_arm.h @@ -15,6 +15,7 @@ void sprite_driver_8bpp_asm(void *pPixel, const u8 *pTxt_base, void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines); void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count); +void poly_4bpp_l_asm(void *d, const struct gpu_unai_inner_t *inn, int count); #ifdef __cplusplus } diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h index 4f2b1156..5cef54a7 100644 --- a/plugins/gpu_unai/gpu_inner.h +++ b/plugins/gpu_unai/gpu_inner.h @@ -55,6 +55,7 @@ #include "gpu_inner_quantization.h" #include "gpu_inner_light.h" +#include "arm_features.h" #ifdef __arm__ #include "gpu_inner_blend_arm.h" #include "gpu_inner_light_arm.h" @@ -752,9 +753,13 @@ endpolytext: } #ifdef __arm__ -static void PolySpan4bppAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) +template<int CF> +static void PolySpanAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) { - poly_4bpp_asm(pDst, &gpu_unai.inn, count); + switch (CF) { + case 0x20: poly_4bpp_asm (pDst, &gpu_unai.inn, count); break; + case 0x21: poly_4bpp_l_asm(pDst, &gpu_unai.inn, count); break; + } } #endif @@ -773,16 +778,21 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count); #define TI(cf) gpuPolySpanFn<(cf)> #define TN PolyNULL #ifdef __arm__ -#define TA4(cf) PolySpan4bppAsm +#define TA(cf) PolySpanAsm<(cf)> #else -#define TA4(cf) TI(cf) +#define TA(cf) TI(cf) +#endif +#ifdef HAVE_ARMV6 +#define TA6(cf) PolySpanAsm<(cf)> +#else +#define TA6(cf) TI(cf) #endif #define TIBLOCK(ub) \ TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \ TN, TN, TI((ub)|0x0a), TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \ TN, TN, TI((ub)|0x12), TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \ TN, TN, TI((ub)|0x1a), TI((ub)|0x1b), 
TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \ - TA4((ub)|0x20),TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ + TA((ub)|0x20), TA6((ub)|0x21),TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \ TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \ TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \ @@ -819,7 +829,7 @@ const PP gpuPolySpanDrivers[2048] = { #undef TI #undef TN #undef TIBLOCK -#undef TA4 -#undef TA8 +#undef TA +#undef TA6 #endif /* __GPU_UNAI_GPU_INNER_H__ */ diff --git a/plugins/gpu_unai/gpu_inner_light_arm.h b/plugins/gpu_unai/gpu_inner_light_arm.h index 7bd58908..7edb8fb0 100644 --- a/plugins/gpu_unai/gpu_inner_light_arm.h +++ b/plugins/gpu_unai/gpu_inner_light_arm.h @@ -1,6 +1,8 @@ #ifndef _OP_LIGHT_ARM_H_ #define _OP_LIGHT_ARM_H_ +#include "arm_features.h" + //////////////////////////////////////////////////////////////////////////////// // Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet // @@ -40,6 +42,27 @@ GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol) // u16 output: mbbbbbgggggrrrrr // Where 'X' are fixed-pt bits. 
//////////////////////////////////////////////////////////////////////////////// +#ifdef HAVE_ARMV6 +// clang uses smulbb but not gcc, so we need this +GPU_INLINE int_fast16_t smulbb(int_fast16_t a, int_fast16_t b) +{ + int_fast16_t r; + asm("smulbb %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); + return r; +} + +GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5) +{ + // on v6 we have single-cycle mul and sat which is better than the lut + int_fast16_t r = smulbb(uSrc & 0x001f, r5); + int_fast16_t g = smulbb(uSrc & 0x03e0, g5); + int_fast16_t b = smulbb(uSrc & 0x7c00, b5); + asm volatile("usat %0, #5, %0, asr #4" : "=r"(r) : "0"(r)); + asm volatile("usat %0, #5, %0, asr #9" : "=r"(g) : "0"(g)); + asm volatile("usat %0, #5, %0, asr #14" : "=r"(b) : "0"(b)); + return (uSrc & 0x8000) | (b << 10) | (g << 5) | r; +} +#else GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5) { uint_fast16_t out = 0x03E0; @@ -65,6 +88,7 @@ GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 : "cc"); return out; } +#endif //////////////////////////////////////////////////////////////////////////////// // Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color: diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h index fff9126b..722041a8 100644 --- a/plugins/gpu_unai/gpu_unai.h +++ b/plugins/gpu_unai/gpu_unai.h @@ -208,6 +208,10 @@ struct gpu_unai_inner_t { u32 u_msk, v_msk; // 10 s32 u_inc, v_inc; // 18 + // Color for flat-shaded, texture-blended prims + u8 r5, g5, b5, pad5; // 20 5-bit light for undithered prims + u8 r8, g8, b8, pad8; // 24 8-bit light for dithered prims + // Color for Gouraud-shaded prims // Fixed-pt 8.8 rgb triplet // Packed fixed-pt 8.3:8.3:8.2 rgb triplet @@ -216,10 +220,6 @@ struct gpu_unai_inner_t { gcol_t gCol; gcol_t gInc; // Increment along scanline for gCol - // Color for flat-shaded, texture-blended prims - u8 r5, g5, b5; // 5-bit light for 
undithered prims - u8 r8, g8, b8; // 8-bit light for dithered prims - // Color for flat-shaded, untextured prims u16 PixelData; // bgr555 color for untextured flat-shaded polys };