From 0ff4daa300c98f77a417a1f63546ca0e75ed969e Mon Sep 17 00:00:00 2001 From: notaz Date: Thu, 8 Jan 2026 01:39:43 +0200 Subject: [PATCH] gpu_unai: various asm tuning for armv6 --- include/arm_features.h | 12 ++- plugins/gpu_unai/gpu_arm.S | 13 ++- plugins/gpu_unai/gpu_inner.h | 55 +++++----- plugins/gpu_unai/gpu_inner_light.h | 26 +++-- plugins/gpu_unai/gpu_inner_light_arm.h | 140 ++++++++++--------------- plugins/gpu_unai/gpu_raster_sprite.h | 3 + plugins/gpu_unai/gpu_unai.h | 43 +++++++- plugins/gpu_unai/gpulib_if.cpp | 1 + 8 files changed, 164 insertions(+), 129 deletions(-) diff --git a/include/arm_features.h b/include/arm_features.h index 9f51ab81..bd76096a 100644 --- a/include/arm_features.h +++ b/include/arm_features.h @@ -14,6 +14,7 @@ #define HAVE_ARMV8 #define HAVE_ARMV7 #define HAVE_ARMV6 +#define HAVE_ARMV5E #define HAVE_ARMV5 #elif (defined(__ARM_ARCH) && __ARM_ARCH >= 7) \ @@ -23,6 +24,7 @@ #define HAVE_ARMV7 #define HAVE_ARMV6 +#define HAVE_ARMV5E #define HAVE_ARMV5 #elif (defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ @@ -32,11 +34,17 @@ || defined(__ARM_ARCH_6M__) #define HAVE_ARMV6 +#define HAVE_ARMV5E #define HAVE_ARMV5 #define HAVE_PRE_ARMV7 -#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5E__) \ - || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) +#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) + +#define HAVE_ARMV5E +#define HAVE_ARMV5 +#define HAVE_PRE_ARMV7 + +#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) #define HAVE_ARMV5 #define HAVE_PRE_ARMV7 diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S index a516f08f..4d302432 100644 --- a/plugins/gpu_unai/gpu_arm.S +++ b/plugins/gpu_unai/gpu_arm.S @@ -19,6 +19,8 @@ #ifdef HAVE_ARMV6 +@ mbr: 0bbb bbbb 0ggg gggg 0rrr rrrr r000 0000 +@ mg: 0ggg gggg ... .macro modulate rp mbr mg t0 t1 t2 and \t0, \rp, #0x001f and \t1, \rp, #0x03e0 @@ -649,8 +651,9 @@ FUNCTION(\name): @ (void *d, const gpu_unai_inner_t *inn, int count) and r4, r4, r6 and lr, lr, r7 @ v_msk & v and lr, lr, #0xff<<10 + pld_ r3 @ clut tst r12,r12 - bne v_\name + bne 10f @ vinc_\name ldr r1, [r1] @ src mov r7, r4, lsr #(13 - (\bpp / 8 * 3)) add r1, r1, lr, lsl #1 @@ -705,7 +708,7 @@ FUNCTION(\name): @ (void *d, const gpu_unai_inner_t *inn, int count) 1: ldmfd sp!, {r4-r11,pc} -v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked +10: @ vinc_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked .if \light || \semit >= 0 sub sp, sp, #4*2 stmia sp, {r5,r6} @@ -723,9 +726,9 @@ v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked .endif 0: .if \light || \semit >= 0 - and lr, r7, r9 - mov r12,r4, lsr #(13 - (\bpp / 8 * 3)) - add lr, r1, lr, lsl #1 + and lr, r7, r9 @ l_v & l_v_msk + mov r12,r4, lsr #(13 - (\bpp / 8 * 3)) @ l_u + add lr, r1, lr, lsl #1 @ (u16 *)TBA + l_v subs r2, r2, #1 bmi 1f .endif diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h index 3281d0fa..3ac39b66 100644 --- a/plugins/gpu_unai/gpu_inner.h +++ b/plugins/gpu_unai/gpu_inner.h @@ -62,11 +62,16 @@ #include "gpu_inner_blend_arm.h" #include "gpu_inner_light_arm.h" #define gpuBlending gpuBlendingARM -#define gpuLightingTXT gpuLightingTXTARM -#else +#endif +#ifndef gpuBlending #define gpuBlending gpuBlendingGeneric +#endif +#ifndef gpuLightingTXT // gpuLightingTXTARM #define gpuLightingTXT gpuLightingTXTGeneric #endif +#ifndef gpuLightingTXTGouraud // gpuLightingTXTGouraudARM +#define gpuLightingTXTGouraud gpuLightingTXTGouraudGeneric +#endif // Non-dithering lighting and blending functions preserve uSrc // MSB. This saves a few operations and useless load/stores. @@ -425,12 +430,12 @@ static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt uint_fast16_t uSrc, uDst, srcMSB; bool should_blend; u32 u0_mask = inn.u_msk >> 10; + u32 bgr0888; - u8 r5, g5, b5; if (CF_LIGHT) { - r5 = inn.r5; - g5 = inn.g5; - b5 = inn.b5; + bgr0888 = (gpu_unai.inn.b8 << 16) | + (gpu_unai.inn.g8 << 8) | + gpu_unai.inn.r8; } const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = inn.CBA; @@ -474,7 +479,7 @@ static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000; if (CF_LIGHT) - uSrc = gpuLightingTXT(uSrc, r5, g5, b5); + uSrc = gpuLightingTXT(uSrc, bgr0888); should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB; @@ -683,7 +688,7 @@ endpolynotextnogou: endpolynotextgou: pDst++; - l_gCol.raw += l_gInc.raw; + l_gCol += l_gInc; } while (--count); } @@ -707,25 +712,24 @@ endpolynotextgou: const le16_t* TBA_ = gpu_unai.inn.TBA; const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.inn.CBA; - u8 r5, g5, b5; - u8 r8, g8, b8; + u32 bgr0888; gcol_t l_gInc, l_gCol; + int pcounter = count - 1; // "repeat while positive" counter if (CF_LIGHT) { if (CF_GOURAUD) { l_gInc = gpu_unai.inn.gInc; l_gCol = gpu_unai.inn.gCol; + + l_gInc.set_counter(-1); + l_gCol.set_counter(pcounter); } else { - if (CF_DITHER) { - r8 = gpu_unai.inn.r8; - g8 = gpu_unai.inn.g8; - b8 = gpu_unai.inn.b8; - } else { - r5 = gpu_unai.inn.r5; - g5 = gpu_unai.inn.g5; - b5 = gpu_unai.inn.b5; - } + // keep this packed, otherwise gcc runs out of regs + bgr0888 = (gpu_unai.inn.b8 << 16) | + (gpu_unai.inn.g8 << 8) | + gpu_unai.inn.r8; + // XXX pre-pack } } @@ -769,7 +773,7 @@ endpolynotextgou: if ( CF_GOURAUD) uSrc24 = gpuLightingTXT24Gouraud(uSrc, l_gCol); if (!CF_GOURAUD) - uSrc24 = gpuLightingTXT24(uSrc, r8, g8, b8); + uSrc24 = gpuLightingTXT24(uSrc, bgr0888); if (CF_BLEND && srcMSB) uSrc24 = gpuBlending24(uSrc24, uDst); @@ -781,7 +785,7 @@ endpolynotextgou: if ( CF_GOURAUD) uSrc = gpuLightingTXTGouraud(uSrc, l_gCol); if (!CF_GOURAUD) - uSrc = gpuLightingTXT(uSrc, r5, g5, b5); + uSrc = gpuLightingTXT(uSrc, bgr0888); } should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB; @@ -796,10 +800,13 @@ endpolytext: pDst++; l_u = (l_u + l_u_inc) & l_u_msk; l_v += l_v_inc; - if (CF_LIGHT && CF_GOURAUD) - l_gCol.raw += l_gInc.raw; + if (CF_LIGHT && CF_GOURAUD) { + l_gCol += l_gInc; + l_gCol.get_counter(pcounter); + } + pcounter--; } - while (--count); + while (pcounter >= 0); } } diff --git a/plugins/gpu_unai/gpu_inner_light.h b/plugins/gpu_unai/gpu_inner_light.h index 44fecdc3..f4ec2134 100644 --- a/plugins/gpu_unai/gpu_inner_light.h +++ b/plugins/gpu_unai/gpu_inner_light.h @@ -85,11 +85,12 @@ static void SetupLightLUT() //////////////////////////////////////////////////////////////////////////////// GPU_INLINE gcol_t gpuPackGouraudCol(u32 r, u32 g, u32 b) { - return (gcol_t){ + return (gcol_t){{ (u16)(r >> 2), (u16)(g >> 2), (u16)(b >> 2), - }; + 0 + }}; } //////////////////////////////////////////////////////////////////////////////// @@ -167,8 +168,13 @@ GPU_INLINE u32 gpuLightingRGB24(gcol_t gCol) // u16 output: 0bbbbbgggggrrrrr // Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care //////////////////////////////////////////////////////////////////////////////// -GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5) +GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u32 bgr0888) { + // gcc can move this out of the loop if it wants to + uint_fast32_t b5 = (bgr0888 >> 19); + uint_fast32_t g5 = (bgr0888 >> 11) & 0x1f; + uint_fast32_t r5 = (bgr0888 >> 3) & 0x1f; + return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | b5] << 10) | (gpu_unai.LightLUT[ (uSrc&0x03E0) | g5] << 5) | (gpu_unai.LightLUT[((uSrc&0x001F)<<5) | r5] ) | @@ -189,7 +195,7 @@ GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u8 r5, u8 g5, // u16 output: 0bbbbbgggggrrrrr // Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care //////////////////////////////////////////////////////////////////////////////// -GPU_INLINE uint_fast16_t gpuLightingTXTGouraud(uint_fast16_t uSrc, gcol_t gCol) +GPU_INLINE uint_fast16_t gpuLightingTXTGouraudGeneric(uint_fast16_t uSrc, gcol_t gCol) { return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | (gCol.c.b >> 11)] << 10) | (gpu_unai.LightLUT[ (uSrc&0x03E0) | (gCol.c.g >> 11)] << 5) | @@ -213,22 +219,22 @@ GPU_INLINE uint_fast16_t gpuLightingTXTGouraud(uint_fast16_t uSrc, gcol_t gCol) // ^ bit 31 // Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care //////////////////////////////////////////////////////////////////////////////// -GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u8 r8, u8 g8, u8 b8) +GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u32 bgr0888) { uint_fast16_t r1 = uSrc&0x001F; uint_fast16_t g1 = uSrc&0x03E0; uint_fast16_t b1 = uSrc&0x7C00; - uint_fast16_t r2 = r8; - uint_fast16_t g2 = g8; - uint_fast16_t b2 = b8; + uint_fast16_t r2 = bgr0888 & 0x0000ff; + uint_fast32_t g2 = bgr0888 & 0x00ff00; + uint_fast16_t b2 = bgr0888 >> 16; u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000; - u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000; + u32 g3 = g1 * g2; if (g3 & 0xFE000000) g3 = ~0xFE000000; u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000; return ((r3>> 3) ) | - ((g3>> 8)<<10) | + ((g3>>16)<<10) | ((b3>>13)<<20); } diff --git a/plugins/gpu_unai/gpu_inner_light_arm.h b/plugins/gpu_unai/gpu_inner_light_arm.h index 7edb8fb0..3445e793 100644 --- a/plugins/gpu_unai/gpu_inner_light_arm.h +++ b/plugins/gpu_unai/gpu_inner_light_arm.h @@ -14,6 +14,7 @@ // ^ bit 16 // Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero //////////////////////////////////////////////////////////////////////////////// +// note: outdated, unused GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol) { uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output @@ -30,11 +31,14 @@ GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol) return out; } +//#ifdef HAVE_ARMV5E // todo? +#ifdef HAVE_ARMV6 + //////////////////////////////////////////////////////////////////////////////// -// Apply fast (low-precision) 5-bit lighting to bgr555 texture color: +// Apply 8-bit lighting to bgr555 texture color: // // INPUT: -// 'r5','g5','b5' are unsigned 5-bit color values, value of 15 +// 'r8','g8','b8' are unsigned 8-bit color values, value of 127 // is midpoint that doesn't modify that component of texture // 'uSrc' input: mbbbbbgggggrrrrr // ^ bit 16 @@ -42,95 +46,59 @@ GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol) // u16 output: mbbbbbgggggrrrrr // Where 'X' are fixed-pt bits. //////////////////////////////////////////////////////////////////////////////// -#ifdef HAVE_ARMV6 -// clang uses smulbb but not gcc, so we need this -GPU_INLINE int_fast16_t smulbb(int_fast16_t a, int_fast16_t b) +// on v6 we have single-cycle mul and sat which is better than the LightLUT +GPU_INLINE u32 gpuLightingTXTARM(u32 uSrc, u32 bgr0888) { - int_fast16_t r; - asm("smulbb %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); - return r; + int_fast32_t r, g, b, s_d = uSrc; + // has to be in a block, otherwise gcc schedules the insns poorly + asm("and %[r], %[s_d], #0x001f\n" + "and %[b], %[bgr], #0xff\n" + "smulbb %[r], %[r], %[b]\n" + "uxtb %[b], %[bgr], ror #8\n" + "and %[g], %[s_d], #0x03e0\n" + "smulbb %[g], %[g], %[b]\n" + "and %[b], %[s_d], #0x7c00\n" + "and %[s_d],%[s_d], #0x8000\n" + "smulbt %[b], %[b], %[bgr]\n" + "usat %[r], #5, %[r], asr #7\n" + "usat %[g], #5, %[g], asr #12\n" + "usat %[b], #5, %[b], asr #17\n" + "orr %[s_d],%[s_d], %[r]\n" + "orr %[s_d],%[s_d], %[g], lsl #5\n" + "orr %[s_d],%[s_d], %[b], lsl #10\n" + : [s_d]"+r"(s_d), [r]"=&r"(r), [g]"=&r"(g), [b]"=&r"(b) + : [bgr]"r"(bgr0888)); + return s_d; } +#define gpuLightingTXT gpuLightingTXTARM -GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5) +GPU_INLINE u32 gpuLightingTXTGouraudARM(u32 uSrc, gcol_t gCol) { - // on v6 we have single-cycle mul and sat which is better than the lut - int_fast16_t r = smulbb(uSrc & 0x001f, r5); - int_fast16_t g = smulbb(uSrc & 0x03e0, g5); - int_fast16_t b = smulbb(uSrc & 0x7c00, b5); - asm volatile("usat %0, #5, %0, asr #4" : "=r"(r) : "0"(r)); - asm volatile("usat %0, #5, %0, asr #9" : "=r"(g) : "0"(g)); - asm volatile("usat %0, #5, %0, asr #14" : "=r"(b) : "0"(b)); - return (uSrc & 0x8000) | (b << 10) | (g << 5) | r; + u32 r, g, s_d = uSrc; + asm("str %[b], [sp, #-4]!\n" // conserve regs for gcc + "uxtb16 %[b], %[b], ror #8\n" // b = g_rg >> 8 & 0xff00ff + "and %[r], %[s_d], #0x001f\n" + "and %[g], %[s_d], #0x03e0\n" + "smulbb %[r], %[r], %[b]\n" + "smulbt %[g], %[g], %[b]\n" + "uxtb %[b], %[g_b], ror #8\n" + "tst %[s_d], #0x8000\n" + "and %[s_d], %[s_d], #0x7c00\n" + "smulbb %[b], %[b], %[s_d]\n" + "usat %[s_d],#5, %[r], asr #7\n" + "usat %[g], #5, %[g], asr #12\n" + "usat %[b], #5, %[b], asr #17\n" + "orrne %[s_d], %[s_d], #0x8000\n" + "orr %[s_d], %[s_d], %[g], lsl #5\n" + "orr %[s_d], %[s_d], %[b], lsl #10\n" + "ldr %[b], [sp], #4\n" + : [s_d]"+r"(s_d), [r]"=&r"(r), [g]"=&r"(g) + : [b]"r"(gCol.raw32[0]), [g_b]"r"(gCol.raw32[1]) + : "cc"); + return s_d; } -#else -GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5) -{ - uint_fast16_t out = 0x03E0; - u32 db, dg; +#define gpuLightingTXTGouraud gpuLightingTXTGouraudARM - // Using `g` for src, `G` for dest - asm ("and %[dg], %[out], %[src] \n\t" // dg holds 0x000000ggggg00000 - "orr %[dg], %[dg], %[g5] \n\t" // dg holds 0x000000gggggGGGGG - "and %[db], %[out], %[src], lsr #0x05 \n\t" // db holds 0x000000bbbbb00000 - "ldrb %[dg], [%[lut], %[dg]] \n\t" // dg holds result 0x00000000000ggggg - "and %[out], %[out], %[src], lsl #0x05 \n\t" // out holds 0x000000rrrrr00000 - "orr %[out], %[out], %[r5] \n\t" // out holds 0x000000rrrrrRRRRR - "orr %[db], %[db], %[b5] \n\t" // db holds 0x000000bbbbbBBBBB - "ldrb %[out], [%[lut], %[out]] \n\t" // out holds result 0x00000000000rrrrr - "ldrb %[db], [%[lut], %[db]] \n\t" // db holds result 0x00000000000bbbbb - "tst %[src], #0x8000\n\t" // check whether msb was set on uSrc - "orr %[out], %[out], %[dg], lsl #0x05 \n\t" // out holds 0x000000gggggrrrrr - "orrne %[out], %[out], #0x8000\n\t" // add msb to out if set on uSrc - "orr %[out], %[out], %[db], lsl #0x0A \n\t" // out holds 0xmbbbbbgggggrrrrr - : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg) - : [r5] "r" (r5), [g5] "r" (g5), [b5] "r" (b5), - [lut] "r" (gpu_unai.LightLUT), [src] "r" (uSrc), "0" (out) - : "cc"); - return out; -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color: -// -// INPUT: -// 'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet, value of -// 15.0 is midpoint that does not modify color of texture -// gCol input : rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX -// ^ bit 31 -// 'uSrc' input: mbbbbbgggggrrrrr -// ^ bit 16 -// RETURNS: -// u16 output: mbbbbbgggggrrrrr -// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care -//////////////////////////////////////////////////////////////////////////////// -GPU_INLINE uint_fast16_t gpuLightingTXTGouraudARM(uint_fast16_t uSrc, u32 gCol) -{ - uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output - u32 db,dg,gtmp; - - // Using `g` for src, `G` for dest - asm ("and %[dg], %[out], %[src] \n\t" // dg holds 0x000000ggggg00000 - "and %[gtmp],%[out], %[gCol], lsr #0x0B \n\t" // gtmp holds 0x000000GGGGG00000 - "and %[db], %[out], %[src], lsr #0x05 \n\t" // db holds 0x000000bbbbb00000 - "orr %[dg], %[dg], %[gtmp], lsr #0x05 \n\t" // dg holds 0x000000gggggGGGGG - "and %[gtmp],%[out], %[gCol] \n\t" // gtmp holds 0x000000BBBBB00000 - "ldrb %[dg], [%[lut], %[dg]] \n\t" // dg holds result 0x00000000000ggggg - "and %[out], %[out], %[src], lsl #0x05 \n\t" // out holds 0x000000rrrrr00000 - "orr %[out], %[out], %[gCol], lsr #0x1B \n\t" // out holds 0x000000rrrrrRRRRR - "orr %[db], %[db], %[gtmp], lsr #0x05 \n\t" // db holds 0x000000bbbbbBBBBB - "ldrb %[out], [%[lut], %[out]] \n\t" // out holds result 0x00000000000rrrrr - "ldrb %[db], [%[lut], %[db]] \n\t" // db holds result 0x00000000000bbbbb - "tst %[src], #0x8000\n\t" // check whether msb was set on uSrc - "orr %[out], %[out], %[dg], lsl #0x05 \n\t" // out holds 0x000000gggggrrrrr - "orrne %[out], %[out], #0x8000\n\t" // add msb to out if set on uSrc - "orr %[out], %[out], %[db], lsl #0x0A \n\t" // out holds 0xmbbbbbgggggrrrrr - : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg), - [gtmp] "=&r" (gtmp) \ - : [gCol] "r" (gCol), [lut] "r" (gpu_unai.LightLUT), "0" (out), [src] "r" (uSrc) - : "cc"); - - return out; -} +#endif // HAVE_ARMV6 #endif //_OP_LIGHT_ARM_H_ diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h index 5c7b67ce..26c7332b 100644 --- a/plugins/gpu_unai/gpu_raster_sprite.h +++ b/plugins/gpu_unai/gpu_raster_sprite.h @@ -63,6 +63,9 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out) le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)]; + gpu_unai.inn.r8 = packet.U1[0]; + gpu_unai.inn.g8 = packet.U1[1]; + gpu_unai.inn.b8 = packet.U1[2]; gpu_unai.inn.r5 = packet.U1[0] >> 3; gpu_unai.inn.g5 = packet.U1[1] >> 3; gpu_unai.inn.b5 = packet.U1[2] >> 3; diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h index 91cdb8af..2e30a283 100644 --- a/plugins/gpu_unai/gpu_unai.h +++ b/plugins/gpu_unai/gpu_unai.h @@ -56,12 +56,51 @@ #define s64 int64_t #define u64 uint64_t -typedef union { +union gcol_t { struct { u16 r, g, b; +#ifdef HAVE_ARMV6 + u16 counter; +#else + u16 unused; +#endif } c; +#if defined(HAVE_ARMV6) || (defined(__SIZEOF_SIZE_T__) && __SIZEOF_SIZE_T__ == 4) + u32 raw32[2]; +#else u64 raw; -} gcol_t; +#endif + + inline gcol_t & operator+=(const gcol_t &rhs) + { +#ifdef HAVE_ARMV6 + // prevent bit spills the other versions have, + // allowing to use the unused part as a counter + asm("uadd16 %[d], %[d], %[s]" : [d]"+r"(raw32[0]) : [s]"r"(rhs.raw32[0])); + asm("uadd16 %[d], %[d], %[s]" : [d]"+r"(raw32[1]) : [s]"r"(rhs.raw32[1])); +#elif defined(__SIZEOF_SIZE_T__) && __SIZEOF_SIZE_T__ == 4 + // avoid having to do carry that's not needed here + raw32[0] += rhs.raw32[0]; + raw32[1] += rhs.raw32[1]; +#else + raw += rhs.raw; +#endif + return *this; + } + + inline void set_counter(int counter) + { +#ifdef HAVE_ARMV6 + c.counter = counter; +#endif + } + inline void get_counter(int &counter) + { +#ifdef HAVE_ARMV6 + counter = raw32[1]; +#endif + } +}; #ifndef NDEBUG diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp index 71eccb1a..71c92728 100644 --- a/plugins/gpu_unai/gpulib_if.cpp +++ b/plugins/gpu_unai/gpulib_if.cpp @@ -24,6 +24,7 @@ #include #include #include +#include "arm_features.h" #include "../gpulib/gpu.h" #include "old/if.h" -- 2.47.3