From 0ff4daa300c98f77a417a1f63546ca0e75ed969e Mon Sep 17 00:00:00 2001
From: notaz <notasas@gmail.com>
Date: Thu, 8 Jan 2026 01:39:43 +0200
Subject: [PATCH] gpu_unai: various asm tuning for armv6

---
 include/arm_features.h                 |  12 ++-
 plugins/gpu_unai/gpu_arm.S             |  13 ++-
 plugins/gpu_unai/gpu_inner.h           |  55 +++++-----
 plugins/gpu_unai/gpu_inner_light.h     |  26 +++--
 plugins/gpu_unai/gpu_inner_light_arm.h | 140 ++++++++++---------------
 plugins/gpu_unai/gpu_raster_sprite.h   |   3 +
 plugins/gpu_unai/gpu_unai.h            |  43 +++++++-
 plugins/gpu_unai/gpulib_if.cpp         |   1 +
 8 files changed, 164 insertions(+), 129 deletions(-)

diff --git a/include/arm_features.h b/include/arm_features.h
index 9f51ab81..bd76096a 100644
--- a/include/arm_features.h
+++ b/include/arm_features.h
@@ -14,6 +14,7 @@
 #define HAVE_ARMV8
 #define HAVE_ARMV7
 #define HAVE_ARMV6
+#define HAVE_ARMV5E
 #define HAVE_ARMV5
 
 #elif (defined(__ARM_ARCH) && __ARM_ARCH >= 7) \
@@ -23,6 +24,7 @@
 
 #define HAVE_ARMV7
 #define HAVE_ARMV6
+#define HAVE_ARMV5E
 #define HAVE_ARMV5
 
 #elif (defined(__ARM_ARCH) && __ARM_ARCH >= 6) \
@@ -32,11 +34,17 @@
     || defined(__ARM_ARCH_6M__)
 
 #define HAVE_ARMV6
+#define HAVE_ARMV5E
 #define HAVE_ARMV5
 #define HAVE_PRE_ARMV7
 
-#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5E__) \
-   || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+
+#define HAVE_ARMV5E
+#define HAVE_ARMV5
+#define HAVE_PRE_ARMV7
+
+#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__)
 
 #define HAVE_ARMV5
 #define HAVE_PRE_ARMV7
diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S
index a516f08f..4d302432 100644
--- a/plugins/gpu_unai/gpu_arm.S
+++ b/plugins/gpu_unai/gpu_arm.S
@@ -19,6 +19,8 @@
 
 #ifdef HAVE_ARMV6
 
+@ mbr: 0bbb bbbb 0ggg gggg 0rrr rrrr r000 0000
+@ mg:  0ggg gggg ...
 .macro modulate rp mbr mg t0 t1 t2
     and     \t0, \rp, #0x001f
     and     \t1, \rp, #0x03e0
@@ -649,8 +651,9 @@ FUNCTION(\name): @ (void *d, const gpu_unai_inner_t *inn, int count)
     and     r4, r4, r6
     and     lr, lr, r7         @ v_msk & v
     and     lr, lr, #0xff<<10
+    pld_    r3                 @ clut
     tst     r12,r12
-    bne     v_\name
+    bne     10f @ vinc_\name
     ldr     r1, [r1]           @ src
     mov     r7, r4, lsr #(13 - (\bpp / 8 * 3))
     add     r1, r1, lr, lsl #1
@@ -705,7 +708,7 @@ FUNCTION(\name): @ (void *d, const gpu_unai_inner_t *inn, int count)
 1:
     ldmfd   sp!, {r4-r11,pc}
 
-v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+10: @ vinc_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
 .if \light || \semit >= 0
     sub     sp, sp, #4*2
     stmia   sp, {r5,r6}
@@ -723,9 +726,9 @@ v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
 .endif
 0:
 .if \light || \semit >= 0
-    and     lr, r7, r9
-    mov     r12,r4, lsr #(13 - (\bpp / 8 * 3))
-    add     lr, r1, lr, lsl #1
+    and     lr, r7, r9         @ l_v & l_v_msk
+    mov     r12,r4, lsr #(13 - (\bpp / 8 * 3))  @ l_u
+    add     lr, r1, lr, lsl #1 @ (u16 *)TBA + l_v
     subs    r2, r2, #1
     bmi     1f
 .endif
diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h
index 3281d0fa..3ac39b66 100644
--- a/plugins/gpu_unai/gpu_inner.h
+++ b/plugins/gpu_unai/gpu_inner.h
@@ -62,11 +62,16 @@
 #include "gpu_inner_blend_arm.h"
 #include "gpu_inner_light_arm.h"
 #define gpuBlending gpuBlendingARM
-#define gpuLightingTXT gpuLightingTXTARM
-#else
+#endif
+#ifndef gpuBlending
 #define gpuBlending gpuBlendingGeneric
+#endif
+#ifndef gpuLightingTXT // gpuLightingTXTARM
 #define gpuLightingTXT gpuLightingTXTGeneric
 #endif
+#ifndef gpuLightingTXTGouraud // gpuLightingTXTGouraudARM
+#define gpuLightingTXTGouraud gpuLightingTXTGouraudGeneric
+#endif
 
 // Non-dithering lighting and blending functions preserve uSrc
 // MSB. This saves a few operations and useless load/stores.
@@ -425,12 +430,12 @@ static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt
 	uint_fast16_t uSrc, uDst, srcMSB;
 	bool should_blend;
 	u32 u0_mask = inn.u_msk >> 10;
+	u32 bgr0888;
 
-	u8 r5, g5, b5;
 	if (CF_LIGHT) {
-		r5 = inn.r5;
-		g5 = inn.g5;
-		b5 = inn.b5;
+		bgr0888 = (gpu_unai.inn.b8 << 16) |
+			  (gpu_unai.inn.g8 << 8) |
+			   gpu_unai.inn.r8;
 	}
 
 	const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = inn.CBA;
@@ -474,7 +479,7 @@ static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt
 		if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
 		
 		if (CF_LIGHT)
-			uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+			uSrc = gpuLightingTXT(uSrc, bgr0888);
 
 		should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
 
@@ -683,7 +688,7 @@ endpolynotextnogou:
 
 endpolynotextgou:
 				pDst++;
-				l_gCol.raw += l_gInc.raw;
+				l_gCol += l_gInc;
 			}
 			while (--count);
 		}
@@ -707,25 +712,24 @@ endpolynotextgou:
 		const le16_t* TBA_ = gpu_unai.inn.TBA;
 		const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.inn.CBA;
 
-		u8 r5, g5, b5;
-		u8 r8, g8, b8;
+		u32 bgr0888;
 
 		gcol_t l_gInc, l_gCol;
+		int pcounter = count - 1; // "repeat while non-negative" counter
 
 		if (CF_LIGHT) {
 			if (CF_GOURAUD) {
 				l_gInc = gpu_unai.inn.gInc;
 				l_gCol = gpu_unai.inn.gCol;
+
+				l_gInc.set_counter(-1);
+				l_gCol.set_counter(pcounter);
 			} else {
-				if (CF_DITHER) {
-					r8 = gpu_unai.inn.r8;
-					g8 = gpu_unai.inn.g8;
-					b8 = gpu_unai.inn.b8;
-				} else {
-					r5 = gpu_unai.inn.r5;
-					g5 = gpu_unai.inn.g5;
-					b5 = gpu_unai.inn.b5;
-				}
+				// keep this packed, otherwise gcc runs out of regs
+				bgr0888 = (gpu_unai.inn.b8 << 16) |
+					  (gpu_unai.inn.g8 << 8) |
+					   gpu_unai.inn.r8;
+				// XXX pre-pack
 			}
 		}
 
@@ -769,7 +773,7 @@ endpolynotextgou:
 				if ( CF_GOURAUD)
 					uSrc24 = gpuLightingTXT24Gouraud(uSrc, l_gCol);
 				if (!CF_GOURAUD)
-					uSrc24 = gpuLightingTXT24(uSrc, r8, g8, b8);
+					uSrc24 = gpuLightingTXT24(uSrc, bgr0888);
 
 				if (CF_BLEND && srcMSB)
 					uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
@@ -781,7 +785,7 @@ endpolynotextgou:
 					if ( CF_GOURAUD)
 						uSrc = gpuLightingTXTGouraud(uSrc, l_gCol);
 					if (!CF_GOURAUD)
-						uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+						uSrc = gpuLightingTXT(uSrc, bgr0888);
 				}
 
 				should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
@@ -796,10 +800,13 @@ endpolytext:
 			pDst++;
 			l_u = (l_u + l_u_inc) & l_u_msk;
 			l_v += l_v_inc;
-			if (CF_LIGHT && CF_GOURAUD)
-				l_gCol.raw += l_gInc.raw;
+			if (CF_LIGHT && CF_GOURAUD) {
+				l_gCol += l_gInc;
+				l_gCol.get_counter(pcounter);
+			}
+			pcounter--;
 		}
-		while (--count);
+		while (pcounter >= 0);
 	}
 }
 
diff --git a/plugins/gpu_unai/gpu_inner_light.h b/plugins/gpu_unai/gpu_inner_light.h
index 44fecdc3..f4ec2134 100644
--- a/plugins/gpu_unai/gpu_inner_light.h
+++ b/plugins/gpu_unai/gpu_inner_light.h
@@ -85,11 +85,12 @@ static void SetupLightLUT()
 ////////////////////////////////////////////////////////////////////////////////
 GPU_INLINE gcol_t gpuPackGouraudCol(u32 r, u32 g, u32 b)
 {
-	return (gcol_t){
+	return (gcol_t){{
 		(u16)(r >> 2),
 		(u16)(g >> 2),
 		(u16)(b >> 2),
-	};
+		0
+	}};
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -167,8 +168,13 @@ GPU_INLINE u32 gpuLightingRGB24(gcol_t gCol)
 //          u16 output:  0bbbbbgggggrrrrr
 // Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
 ////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u32 bgr0888)
 {
+	// gcc can move this out of the loop if it wants to
+	uint_fast32_t b5 = (bgr0888 >> 19);
+	uint_fast32_t g5 = (bgr0888 >> 11) & 0x1f;
+	uint_fast32_t r5 = (bgr0888 >>  3) & 0x1f;
+
 	return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | b5] << 10) |
 	       (gpu_unai.LightLUT[ (uSrc&0x03E0)     | g5] <<  5) |
 	       (gpu_unai.LightLUT[((uSrc&0x001F)<<5) | r5]      ) |
@@ -189,7 +195,7 @@ GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u8 r5, u8 g5,
 //          u16 output:  0bbbbbgggggrrrrr
 // Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
 ////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE uint_fast16_t gpuLightingTXTGouraud(uint_fast16_t uSrc, gcol_t gCol)
+GPU_INLINE uint_fast16_t gpuLightingTXTGouraudGeneric(uint_fast16_t uSrc, gcol_t gCol)
 {
 	return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | (gCol.c.b >> 11)] << 10) |
 	       (gpu_unai.LightLUT[ (uSrc&0x03E0)     | (gCol.c.g >> 11)] << 5) |
@@ -213,22 +219,22 @@ GPU_INLINE uint_fast16_t gpuLightingTXTGouraud(uint_fast16_t uSrc, gcol_t gCol)
 //                     ^ bit 31
 // Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
 ////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u8 r8, u8 g8, u8 b8)
+GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u32 bgr0888)
 {
 	uint_fast16_t r1 = uSrc&0x001F;
 	uint_fast16_t g1 = uSrc&0x03E0;
 	uint_fast16_t b1 = uSrc&0x7C00;
 
-	uint_fast16_t r2 = r8;
-	uint_fast16_t g2 = g8;
-	uint_fast16_t b2 = b8;
+	uint_fast16_t r2 = bgr0888 & 0x0000ff;
+	uint_fast32_t g2 = bgr0888 & 0x00ff00;
+	uint_fast16_t b2 = bgr0888 >> 16;
 
 	u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
-	u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;
+	u32 g3 = g1 * g2; if (g3 & 0xFE000000) g3 = ~0xFE000000;
 	u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;
 
 	return ((r3>> 3)    ) |
-	       ((g3>> 8)<<10) |
+	       ((g3>>16)<<10) |
 	       ((b3>>13)<<20);
 }
 
diff --git a/plugins/gpu_unai/gpu_inner_light_arm.h b/plugins/gpu_unai/gpu_inner_light_arm.h
index 7edb8fb0..3445e793 100644
--- a/plugins/gpu_unai/gpu_inner_light_arm.h
+++ b/plugins/gpu_unai/gpu_inner_light_arm.h
@@ -14,6 +14,7 @@
 //                 ^ bit 16
 // Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero
 ////////////////////////////////////////////////////////////////////////////////
+// note: outdated, unused
 GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol)
 {
 	uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output
@@ -30,11 +31,14 @@ GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol)
 	return out;
 }
 
+//#ifdef HAVE_ARMV5E // todo?
+#ifdef HAVE_ARMV6
+
 ////////////////////////////////////////////////////////////////////////////////
-// Apply fast (low-precision) 5-bit lighting to bgr555 texture color:
+// Apply 8-bit lighting to bgr555 texture color:
 //
 // INPUT:
-//	  'r5','g5','b5' are unsigned 5-bit color values, value of 15
+//	  'r8','g8','b8' are unsigned 8-bit color values, value of 128
 //	    is midpoint that doesn't modify that component of texture
 //	  'uSrc' input:	 mbbbbbgggggrrrrr
 //			 ^ bit 16
@@ -42,95 +46,59 @@ GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol)
 //	    u16 output:	 mbbbbbgggggrrrrr
 // Where 'X' are fixed-pt bits.
 ////////////////////////////////////////////////////////////////////////////////
-#ifdef HAVE_ARMV6
-// clang uses smulbb but not gcc, so we need this
-GPU_INLINE int_fast16_t smulbb(int_fast16_t a, int_fast16_t b)
+// on v6 we have single-cycle mul and sat which is better than the LightLUT
+GPU_INLINE u32 gpuLightingTXTARM(u32 uSrc, u32 bgr0888)
 {
-	int_fast16_t r;
-	asm("smulbb %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
-	return r;
+	int_fast32_t r, g, b, s_d = uSrc;
+	// has to be in a block, otherwise gcc schedules the insns poorly
+	asm("and    %[r],  %[s_d], #0x001f\n"
+	    "and    %[b],  %[bgr], #0xff\n"
+	    "smulbb %[r],  %[r],   %[b]\n"
+	    "uxtb   %[b],  %[bgr], ror #8\n"
+	    "and    %[g],  %[s_d], #0x03e0\n"
+	    "smulbb %[g],  %[g],   %[b]\n"
+	    "and    %[b],  %[s_d], #0x7c00\n"
+	    "and    %[s_d],%[s_d], #0x8000\n"
+	    "smulbt %[b],  %[b],   %[bgr]\n"
+	    "usat   %[r],  #5, %[r], asr #7\n"
+	    "usat   %[g],  #5, %[g], asr #12\n"
+	    "usat   %[b],  #5, %[b], asr #17\n"
+	    "orr    %[s_d],%[s_d], %[r]\n"
+	    "orr    %[s_d],%[s_d], %[g], lsl #5\n"
+	    "orr    %[s_d],%[s_d], %[b], lsl #10\n"
+	  : [s_d]"+r"(s_d), [r]"=&r"(r), [g]"=&r"(g), [b]"=&r"(b)
+	  : [bgr]"r"(bgr0888));
+	return s_d;
 }
+#define gpuLightingTXT gpuLightingTXTARM
 
-GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+GPU_INLINE u32 gpuLightingTXTGouraudARM(u32 uSrc, gcol_t gCol)
 {
-	// on v6 we have single-cycle mul and sat which is better than the lut
-	int_fast16_t r = smulbb(uSrc & 0x001f, r5);
-	int_fast16_t g = smulbb(uSrc & 0x03e0, g5);
-	int_fast16_t b = smulbb(uSrc & 0x7c00, b5);
-	asm volatile("usat %0, #5, %0, asr #4"  : "=r"(r) : "0"(r));
-	asm volatile("usat %0, #5, %0, asr #9"  : "=r"(g) : "0"(g));
-	asm volatile("usat %0, #5, %0, asr #14" : "=r"(b) : "0"(b));
-	return (uSrc & 0x8000) | (b << 10) | (g << 5) | r;
+	u32 r, g, s_d = uSrc;
+	asm("str    %[b],   [sp, #-4]!\n"        // save input %[b]: reused as scratch, reloaded by the final ldr
+	    "uxtb16 %[b],   %[b],    ror #8\n"   // b = g_rg >> 8 & 0xff00ff
+	    "and    %[r],   %[s_d],  #0x001f\n"
+	    "and    %[g],   %[s_d],  #0x03e0\n"
+	    "smulbb %[r],   %[r],    %[b]\n"
+	    "smulbt %[g],   %[g],    %[b]\n"
+	    "uxtb   %[b],   %[g_b],  ror #8\n"
+	    "tst    %[s_d],          #0x8000\n"
+	    "and    %[s_d], %[s_d],  #0x7c00\n"
+	    "smulbb %[b],   %[b],    %[s_d]\n"
+	    "usat   %[s_d],#5, %[r], asr #7\n"
+	    "usat   %[g],  #5, %[g], asr #12\n"
+	    "usat   %[b],  #5, %[b], asr #17\n"
+	    "orrne  %[s_d], %[s_d],  #0x8000\n"
+	    "orr    %[s_d], %[s_d],  %[g], lsl #5\n"
+	    "orr    %[s_d], %[s_d],  %[b], lsl #10\n"
+	    "ldr    %[b],   [sp], #4\n"
+	  : [s_d]"+r"(s_d), [r]"=&r"(r), [g]"=&r"(g)
+	  : [b]"r"(gCol.raw32[0]), [g_b]"r"(gCol.raw32[1])
+	  : "cc");
+	return s_d;
 }
-#else
-GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
-{
-	uint_fast16_t out = 0x03E0;
-	u32 db, dg;
+#define gpuLightingTXTGouraud gpuLightingTXTGouraudARM
 
-	// Using `g` for src, `G` for dest
-	asm ("and    %[dg],  %[out],    %[src]  \n\t"             // dg holds 0x000000ggggg00000
-	     "orr    %[dg],  %[dg],     %[g5]   \n\t"             // dg holds 0x000000gggggGGGGG
-	     "and    %[db],  %[out],    %[src], lsr #0x05 \n\t"   // db holds 0x000000bbbbb00000
-	     "ldrb   %[dg],  [%[lut],   %[dg]]  \n\t"             // dg holds result 0x00000000000ggggg
-	     "and    %[out], %[out],    %[src], lsl #0x05 \n\t"   // out holds 0x000000rrrrr00000
-	     "orr    %[out], %[out],    %[r5]   \n\t"             // out holds 0x000000rrrrrRRRRR
-	     "orr    %[db],  %[db],     %[b5]   \n\t"             // db holds 0x000000bbbbbBBBBB
-	     "ldrb   %[out], [%[lut],   %[out]] \n\t"             // out holds result 0x00000000000rrrrr
-	     "ldrb   %[db],  [%[lut],   %[db]]  \n\t"             // db holds result 0x00000000000bbbbb
-	     "tst    %[src], #0x8000\n\t"                         // check whether msb was set on uSrc
-	     "orr    %[out], %[out],    %[dg],  lsl #0x05   \n\t" // out holds 0x000000gggggrrrrr
-	     "orrne  %[out], %[out],    #0x8000\n\t"              // add msb to out if set on uSrc
-	     "orr    %[out], %[out],    %[db],  lsl #0x0A   \n\t" // out holds 0xmbbbbbgggggrrrrr
-	     : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg)
-	     : [r5] "r" (r5), [g5] "r" (g5),  [b5] "r" (b5),
-	       [lut] "r" (gpu_unai.LightLUT), [src] "r" (uSrc), "0" (out)
-	     : "cc");
-	return out;
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
-//
-// INPUT:
-//  'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet, value of
-//     15.0 is midpoint that does not modify color of texture
-//	   gCol input :	 rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX
-//			 ^ bit 31
-//	  'uSrc' input:	 mbbbbbgggggrrrrr
-//			 ^ bit 16
-// RETURNS:
-//	    u16 output:	 mbbbbbgggggrrrrr
-// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
-////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE uint_fast16_t gpuLightingTXTGouraudARM(uint_fast16_t uSrc, u32 gCol)
-{
-	uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output
-	u32 db,dg,gtmp;
-
-	// Using `g` for src, `G` for dest
-	asm ("and    %[dg],  %[out],  %[src]   \n\t"           // dg holds 0x000000ggggg00000
-	     "and    %[gtmp],%[out],  %[gCol], lsr #0x0B \n\t" // gtmp holds 0x000000GGGGG00000
-	     "and    %[db],  %[out],  %[src],  lsr #0x05 \n\t" // db holds 0x000000bbbbb00000
-	     "orr    %[dg],  %[dg],   %[gtmp], lsr #0x05 \n\t" // dg holds 0x000000gggggGGGGG
-	     "and    %[gtmp],%[out],  %[gCol]  \n\t"           // gtmp holds 0x000000BBBBB00000
-	     "ldrb   %[dg],  [%[lut], %[dg]]   \n\t"           // dg holds result 0x00000000000ggggg
-	     "and    %[out], %[out],  %[src],  lsl #0x05 \n\t" // out holds 0x000000rrrrr00000
-	     "orr    %[out], %[out],  %[gCol], lsr #0x1B \n\t" // out holds 0x000000rrrrrRRRRR
-	     "orr    %[db],  %[db],   %[gtmp], lsr #0x05 \n\t" // db holds 0x000000bbbbbBBBBB
-	     "ldrb   %[out], [%[lut], %[out]]  \n\t"           // out holds result 0x00000000000rrrrr
-	     "ldrb   %[db],  [%[lut], %[db]]   \n\t"           // db holds result 0x00000000000bbbbb
-	     "tst    %[src], #0x8000\n\t"                      // check whether msb was set on uSrc
-	     "orr    %[out], %[out],  %[dg],   lsl #0x05 \n\t" // out holds 0x000000gggggrrrrr
-	     "orrne  %[out], %[out],  #0x8000\n\t"             // add msb to out if set on uSrc
-	     "orr    %[out], %[out],  %[db],   lsl #0x0A \n\t" // out holds 0xmbbbbbgggggrrrrr
-	     : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg),
-	       [gtmp] "=&r" (gtmp) \
-	     : [gCol] "r" (gCol), [lut] "r" (gpu_unai.LightLUT), "0" (out), [src] "r" (uSrc)
-	     : "cc");
-
-	return out;
-}
+#endif // HAVE_ARMV6
 
 #endif  //_OP_LIGHT_ARM_H_
diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h
index 5c7b67ce..26c7332b 100644
--- a/plugins/gpu_unai/gpu_raster_sprite.h
+++ b/plugins/gpu_unai/gpu_raster_sprite.h
@@ -63,6 +63,9 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out)
 
 	le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
 
+	gpu_unai.inn.r8 = packet.U1[0];
+	gpu_unai.inn.g8 = packet.U1[1];
+	gpu_unai.inn.b8 = packet.U1[2];
 	gpu_unai.inn.r5 = packet.U1[0] >> 3;
 	gpu_unai.inn.g5 = packet.U1[1] >> 3;
 	gpu_unai.inn.b5 = packet.U1[2] >> 3;
diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h
index 91cdb8af..2e30a283 100644
--- a/plugins/gpu_unai/gpu_unai.h
+++ b/plugins/gpu_unai/gpu_unai.h
@@ -56,12 +56,51 @@
 #define s64 int64_t
 #define u64 uint64_t
 
-typedef union {
+union gcol_t {
 	struct {
 		u16 r, g, b;
+#ifdef HAVE_ARMV6
+		u16 counter;
+#else
+		u16 unused;
+#endif
 	} c;
+#if defined(HAVE_ARMV6) || (defined(__SIZEOF_SIZE_T__) && __SIZEOF_SIZE_T__ == 4)
+	u32 raw32[2];
+#else
 	u64 raw;
-} gcol_t;
+#endif
+
+	inline gcol_t & operator+=(const gcol_t &rhs)
+	{
+#ifdef HAVE_ARMV6
+		// per-halfword add: no carry spills between fields (unlike the
+		// other versions), so the otherwise unused part can hold a counter
+		asm("uadd16 %[d], %[d], %[s]" : [d]"+r"(raw32[0]) : [s]"r"(rhs.raw32[0]));
+		asm("uadd16 %[d], %[d], %[s]" : [d]"+r"(raw32[1]) : [s]"r"(rhs.raw32[1]));
+#elif defined(__SIZEOF_SIZE_T__) && __SIZEOF_SIZE_T__ == 4
+		// avoid having to do carry that's not needed here
+		raw32[0] += rhs.raw32[0];
+		raw32[1] += rhs.raw32[1];
+#else
+		raw += rhs.raw;
+#endif
+		return *this;
+	}
+
+	inline void set_counter(int counter)
+	{
+#ifdef HAVE_ARMV6
+		c.counter = counter;
+#endif
+	}
+	inline void get_counter(int &counter) // v6 only: reload the loop counter packed above c.b
+	{
+#ifdef HAVE_ARMV6
+		counter = raw32[1] + 1; // +1 cancels the caller's pcounter--, so c.b == 0 can't borrow into the sign
+#endif
+	}
+};
 
 #ifndef NDEBUG
 
diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp
index 71eccb1a..71c92728 100644
--- a/plugins/gpu_unai/gpulib_if.cpp
+++ b/plugins/gpu_unai/gpulib_if.cpp
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include "arm_features.h"
 #include "../gpulib/gpu.h"
 #include "old/if.h"
 
-- 
2.47.3