From: notaz <notasas@gmail.com>
Date: Fri, 9 Jan 2026 20:46:54 +0000 (+0200)
Subject: gpu_unai: cleanup
X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0cdf7aa4936cf27a674a60c1bdea0ba91ba12004;p=pcsx_rearmed.git

gpu_unai: cleanup

- BLITMASK removed for good since it's unused and only doubles
  compilation time and generates tons of useless code
- gpuBlending24 removed since real hw has no such thing
- real dithering table is used, now folded in gpuLighting funcs
  to avoid some pack/unpack steps
- pass y to PolySpan to avoid useless recalc from pDst
---

diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h
index 3ac39b66..e561d95e 100644
--- a/plugins/gpu_unai/gpu_inner.h
+++ b/plugins/gpu_unai/gpu_inner.h
@@ -35,21 +35,6 @@
 #define  CF_GOURAUD   ((CF>> 7)&1) // Gouraud shading
 #define  CF_MASKSET   ((CF>> 8)&1) // Mask bit set
 #define  CF_DITHER    ((CF>> 9)&1) // Dithering
-#define  CF_BLITMASK  ((CF>>10)&1) // blit_mask check (skip rendering pixels
-                                   //  that wouldn't end up displayed on
-                                   //  low-res screen using simple downscaler)
-
-//#ifdef __arm__
-//#ifndef ENABLE_GPU_ARMV7
-/* ARMv5 */
-//#include "gpu_inner_blend_arm5.h"
-//#else
-/* ARMv7 optimized */
-//#include "gpu_inner_blend_arm7.h"
-//#endif
-//#else
-//#include "gpu_inner_blend.h"
-//#endif
 
 #include "gpu_inner_blend.h"
 #include "gpu_inner_quantization.h"
@@ -61,9 +46,8 @@
 #include "gpu_arm.h"
 #include "gpu_inner_blend_arm.h"
 #include "gpu_inner_light_arm.h"
-#define gpuBlending gpuBlendingARM
 #endif
-#ifndef gpuBlending
+#ifndef gpuBlending // gpuBlendingARM
 #define gpuBlending gpuBlendingGeneric
 #endif
 #ifndef gpuLightingTXT // gpuLightingTXTARM
@@ -73,10 +57,6 @@
 #define gpuLightingTXTGouraud gpuLightingTXTGouraudGeneric
 #endif
 
-// Non-dithering lighting and blending functions preserve uSrc
-// MSB. This saves a few operations and useless load/stores.
-#define MSB_PRESERVED (!CF_DITHER)
-
 // If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16
 // This is only for debugging/verification of low-precision colors in C.
 // Low-precision Gouraud is intended for use by SIMD-optimized inner drivers
@@ -129,10 +109,6 @@ static inline u16 gpuGouraudColor15bpp(u32 r, u32 g, u32 b)
 template<int CF>
 static le16_t* gpuPixelSpanFn(le16_t* pDst, uintptr_t data, ptrdiff_t incr, size_t len)
 {
-	// Blend func can save an operation if it knows uSrc MSB is
-	//  unset. For untextured prims, this is always true.
-	const bool skip_uSrc_mask = true;
-
 	u16 col;
 	struct GouraudColor * gcPtr;
 	u32 r, g, b;
@@ -168,7 +144,7 @@ static le16_t* gpuPixelSpanFn(le16_t* pDst, uintptr_t data, ptrdiff_t incr, size
 				uint_fast16_t uSrc = col;
 
 				if (CF_BLEND)
-					uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+					uSrc = gpuBlending<CF_BLENDMODE, true>(uSrc, uDst);
 
 				if (CF_MASKSET) { *pDst = u16_to_le16(uSrc | 0x8000); }
 				else            { *pDst = u16_to_le16(uSrc);          }
@@ -194,12 +170,8 @@ static le16_t* gpuPixelSpanFn(le16_t* pDst, uintptr_t data, ptrdiff_t incr, size
 
 				uint_fast16_t uSrc = col;
 
-				// Blend func can save an operation if it knows uSrc MSB is
-				//  unset. For untextured prims, this is always true.
-				const bool skip_uSrc_mask = true;
-
 				if (CF_BLEND)
-					uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+					uSrc = gpuBlending<CF_BLENDMODE, true>(uSrc, uDst);
 
 				if (CF_MASKSET) { *pDst = u16_to_le16(uSrc | 0x8000); }
 				else            { *pDst = u16_to_le16(uSrc);          }
@@ -306,10 +278,6 @@ static inline void gpuTileSpanFn(le16_t *pDst, u16 data, u32 count)
 		} while (--count);
 	} else
 	{
-		// Blend func can save an operation if it knows uSrc MSB is
-		//  unset. For untextured prims, this is always true.
-		const bool skip_uSrc_mask = true;
-
 		uint_fast16_t uSrc, uDst;
 		do
 		{
@@ -319,16 +287,10 @@ static inline void gpuTileSpanFn(le16_t *pDst, u16 data, u32 count)
 			uSrc = data;
 
 			if (CF_BLEND)
-				uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+				uSrc = gpuBlending<CF_BLENDMODE, true>(uSrc, uDst);
 
 			if (CF_MASKSET) { *pDst = u16_to_le16(uSrc | 0x8000); }
 			else            { *pDst = u16_to_le16(uSrc);          }
-
-			//senquack - Did not apply "Silent Hill" mask-bit fix to here.
-			// It is hard to tell from scarce documentation available and
-			//  lack of comments in code, but I believe the tile-span
-			//  functions here should not bother to preserve any source MSB,
-			//  as they are not drawing from a texture.
 endtile:
 			pDst++;
 		}
@@ -422,21 +384,13 @@ template<int CF>
 static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
 	const gpu_unai_inner_t &inn)
 {
-	// Blend func can save an operation if it knows uSrc MSB is unset.
-	//  Untextured prims can always skip (source color always comes with MSB=0).
-	//  For textured prims, the generic lighting funcs always return it unset. (bonus!)
-	const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
-
-	uint_fast16_t uSrc, uDst, srcMSB;
+	uint_fast16_t uSrc, uDst;
 	bool should_blend;
 	u32 u0_mask = inn.u_msk >> 10;
 	u32 bgr0888;
 
-	if (CF_LIGHT) {
-		bgr0888 = (gpu_unai.inn.b8 << 16) |
-			  (gpu_unai.inn.g8 << 8) |
-			   gpu_unai.inn.r8;
-	}
+	if (CF_LIGHT)
+		bgr0888 = gpu_unai.inn.bgr0888;
 
 	const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = inn.CBA;
 	const u32 v0_mask = inn.v_msk >> 10;
@@ -474,21 +428,15 @@ static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt
 
 		if (!uSrc) goto endsprite;
 
-		//senquack - save source MSB, as blending or lighting macros will not
-		//           (Silent Hill gray rectangles mask bit bug)
-		if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
-		
 		if (CF_LIGHT)
 			uSrc = gpuLightingTXT(uSrc, bgr0888);
 
-		should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
-
+		should_blend = uSrc & 0x8000;
 		if (CF_BLEND && should_blend)
-			uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+			uSrc = gpuBlending<CF_BLENDMODE, false>(uSrc, uDst) | 0x8000;
 
-		if (CF_MASKSET)                                    { *pDst = u16_to_le16(uSrc | 0x8000); }
-		else if (!MSB_PRESERVED && (CF_BLEND || CF_LIGHT)) { *pDst = u16_to_le16(uSrc | srcMSB); }
-		else                                               { *pDst = u16_to_le16(uSrc);          }
+		if (CF_MASKSET) { *pDst = u16_to_le16(uSrc | 0x8000); }
+		else            { *pDst = u16_to_le16(uSrc);          }
 
 endsprite:
 		u0 += (CF_TEXTMODE==3) ? 2 : 1;
@@ -591,6 +539,10 @@ const PS gpuSpriteDrivers[256] = {
 #undef TA
 #undef TA6
 
+// picks 1 of 4 s16 entries from bits 1-2 of the dst address; this tries to avoid pointer shifting
+#define DITHER_LKUP(lut, dst) \
+	(*(s16 *)((char *)(lut) + ((uintptr_t)(dst) & 6)))
+
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Polygon innerloops generator
 
@@ -613,15 +565,13 @@ const PS gpuSpriteDrivers[256] = {
 //             relevant blend/light headers.
 // (see README_senquack.txt)
 template<int CF>
-static noinline void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+static noinline void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count, s32 y)
 {
-	// Blend func can save an operation if it knows uSrc MSB is unset.
-	//  Untextured prims can always skip this (src color MSB is always 0).
-	//  For textured prims, the generic lighting funcs always return it unset. (bonus!)
-	const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
 	bool should_blend;
+	s16 DitherLut16[4];
 
-	u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.inn.blit_mask;
+	if (CF_DITHER)
+		memcpy(DitherLut16, &gpu_unai.DitherLut16[y & 3][0], sizeof(DitherLut16));
 
 	if (!CF_TEXTMODE)
 	{
@@ -632,18 +582,13 @@ static noinline void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32
 			do {
 				uint_fast16_t uSrc, uDst;
 
-				// NOTE: Don't enable CF_BLITMASK  pixel skipping (speed hack)
-				//  on untextured polys. It seems to do more harm than good: see
-				//  gravestone text at end of Medieval intro sequence. -senquack
-				//if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) { goto endpolynotextnogou; } }
-
 				if (CF_BLEND || CF_MASKCHECK) uDst = le16_to_u16(*pDst);
 				if (CF_MASKCHECK) { if (uDst&0x8000) { goto endpolynotextnogou; } }
 
 				uSrc = pix15;
 
 				if (CF_BLEND)
-					uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+					uSrc = gpuBlending<CF_BLENDMODE, true>(uSrc, uDst);
 
 				if (CF_MASKSET) { *pDst = u16_to_le16(uSrc | 0x8000); }
 				else            { *pDst = u16_to_le16(uSrc);          }
@@ -661,27 +606,19 @@ endpolynotextnogou:
 			do {
 				uint_fast16_t uDst, uSrc;
 
-				// See note in above loop regarding CF_BLITMASK
-				//if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolynotextgou; }
-
 				if (CF_BLEND || CF_MASKCHECK) uDst = le16_to_u16(*pDst);
 				if (CF_MASKCHECK) { if (uDst&0x8000) goto endpolynotextgou; }
 
 				if (CF_DITHER) {
 					// GOURAUD, DITHER
-
-					u32 uSrc24 = gpuLightingRGB24(l_gCol);
-					if (CF_BLEND)
-						uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
-					uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst);
+					int_fast16_t dv = DITHER_LKUP(DitherLut16, pDst);
+					uSrc = gpuLightingRGBDither(l_gCol, dv);
 				} else {
 					// GOURAUD, NO DITHER
-
 					uSrc = gpuLightingRGB(l_gCol);
-
-					if (CF_BLEND)
-						uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
 				}
+				if (CF_BLEND)
+					uSrc = gpuBlending<CF_BLENDMODE, true>(uSrc, uDst);
 
 				if (CF_MASKSET) { *pDst = u16_to_le16(uSrc | 0x8000); }
 				else            { *pDst = u16_to_le16(uSrc);          }
@@ -697,7 +634,7 @@ endpolynotextgou:
 	{
 		// TEXTURED
 
-		uint_fast16_t uDst, uSrc, srcMSB;
+		uint_fast16_t uDst, uSrc;
 
 		//senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into
 		// one 32-bit unsigned int, but this proved to lose too much accuracy
@@ -725,17 +662,13 @@ endpolynotextgou:
 				l_gInc.set_counter(-1);
 				l_gCol.set_counter(pcounter);
 			} else {
-				// keep this packed, otherwise gcc runs out of regs
-				bgr0888 = (gpu_unai.inn.b8 << 16) |
-					  (gpu_unai.inn.g8 << 8) |
-					   gpu_unai.inn.r8;
-				// XXX pre-pack
+				// keep this packed, otherwise gcc spills too much
+				bgr0888 = gpu_unai.inn.bgr0888;
 			}
 		}
 
 		do
 		{
-			if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolytext; }
 			if (CF_MASKCHECK || CF_BLEND) { uDst = le16_to_u16(*pDst); }
 			if (CF_MASKCHECK) if (uDst&0x8000) { goto endpolytext; }
 
@@ -759,9 +692,6 @@ endpolynotextgou:
 				if (!uSrc) goto endpolytext;
 			}
 
-			// Save source MSB, as blending or lighting will not (Silent Hill)
-			if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
-
 			// When textured, only dither when LIGHT (texture blend) is enabled
 			// LIGHT &&  BLEND => dither
 			// LIGHT && !BLEND => dither
@@ -769,33 +699,28 @@ endpolynotextgou:
 			//!LIGHT && !BLEND => no dither
 
 			if (CF_DITHER && CF_LIGHT) {
-				u32 uSrc24;
-				if ( CF_GOURAUD)
-					uSrc24 = gpuLightingTXT24Gouraud(uSrc, l_gCol);
-				if (!CF_GOURAUD)
-					uSrc24 = gpuLightingTXT24(uSrc, bgr0888);
-
-				if (CF_BLEND && srcMSB)
-					uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
-
-				uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst);
-			} else
+				int_fast16_t dv = DITHER_LKUP(DitherLut16, pDst);
+				if (CF_GOURAUD)
+					uSrc = gpuLightingTXTGouraudDither(uSrc, l_gCol, dv);
+				else
+					uSrc = gpuLightingTXTDither(uSrc, bgr0888, dv);
+			}
+			else
 			{
 				if (CF_LIGHT) {
-					if ( CF_GOURAUD)
+					if (CF_GOURAUD)
 						uSrc = gpuLightingTXTGouraud(uSrc, l_gCol);
-					if (!CF_GOURAUD)
+					else
 						uSrc = gpuLightingTXT(uSrc, bgr0888);
 				}
 
-				should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
-				if (CF_BLEND && should_blend)
-					uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
 			}
+			should_blend = uSrc & 0x8000;
+			if (CF_BLEND && should_blend)
+				uSrc = gpuBlending<CF_BLENDMODE, false>(uSrc, uDst) | 0x8000;
 
-			if (CF_MASKSET)                                    { *pDst = u16_to_le16(uSrc | 0x8000); }
-			else if (!MSB_PRESERVED && (CF_BLEND || CF_LIGHT)) { *pDst = u16_to_le16(uSrc | srcMSB); }
-			else                                               { *pDst = u16_to_le16(uSrc);          }
+			if (CF_MASKSET) { *pDst = u16_to_le16(uSrc | 0x8000); }
+			else            { *pDst = u16_to_le16(uSrc);          }
 endpolytext:
 			pDst++;
 			l_u = (l_u + l_u_inc) & l_u_msk;
@@ -812,7 +737,7 @@ endpolytext:
 
 #ifdef __arm__
 template<int CF>
-static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count, s32 y)
 {
 	switch (CF) {
 	case 0x02: poly_untex_st0_asm  (pDst, &gpu_unai.inn, count); break;
@@ -829,12 +754,12 @@ static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count
 	case 0x41: poly_8bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break;
 	case 0x43: poly_8bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break;
 #endif
-	default:   gpuPolySpanFn<CF>(gpu_unai, pDst, count);
+	default:   gpuPolySpanFn<CF>(gpu_unai, pDst, count, y);
 	}
 }
 #endif
 
-static void PolyNULL(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+static void PolyNULL(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count, s32 y)
 {
 	#ifdef ENABLE_GPU_LOG_SUPPORT
 		fprintf(stdout,"PolyNULL()\n");
@@ -843,7 +768,7 @@ static void PolyNULL(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 
 ///////////////////////////////////////////////////////////////////////////////
 //  Polygon innerloops driver
-typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
+typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count, s32 y);
 
 // Template instantiation helper macros
 #define TI(cf) gpuPolySpanFn<(cf)>
@@ -892,9 +817,8 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
 	TN,            TN,            TN,            TI((ub)|0xf3), TN,            TN,            TN,            TI((ub)|0xf7), \
 	TN,            TN,            TN,            TI((ub)|0xfb), TN,            TN,            TN,            TI((ub)|0xff)
 
-const PP gpuPolySpanDrivers[2048] = {
-	TIBLOCK(0<<8), TIBLOCK(1<<8), TIBLOCK(2<<8), TIBLOCK(3<<8),
-	TIBLOCK(4<<8), TIBLOCK(5<<8), TIBLOCK(6<<8), TIBLOCK(7<<8)
+const PP gpuPolySpanDrivers[1024] = {
+	TIBLOCK(0<<8), TIBLOCK(1<<8), TIBLOCK(2<<8), TIBLOCK(3<<8)
 };
 
 #undef TI
diff --git a/plugins/gpu_unai/gpu_inner_blend.h b/plugins/gpu_unai/gpu_inner_blend.h
index febc7ede..c0af0721 100644
--- a/plugins/gpu_unai/gpu_inner_blend.h
+++ b/plugins/gpu_unai/gpu_inner_blend.h
@@ -120,69 +120,4 @@ GPU_INLINE u32 gpuGetRGB24(uint_fast16_t uSrc)
 	     | ((uSrc & 0x001F)<< 4);
 }
 
-
-////////////////////////////////////////////////////////////////////////////////
-// Blend padded u32 5.4:5.4:5.4 bgr fixed-pt color triplet in 'uSrc24'
-//  (foreground color) with bgr555 color in 'uDst' (background color),
-//  returning the resulting u32 5.4:5.4:5.4 color.
-//
-// INPUT:
-//     'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
-//                     ^ bit 31
-//       'uDst' input: -bbbbbgggggrrrrr
-//                     ^ bit 16
-// RETURNS:
-//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
-//                     ^ bit 31
-// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
-////////////////////////////////////////////////////////////////////////////////
-template <int BLENDMODE>
-GPU_INLINE u32 gpuBlending24(u32 uSrc24, uint_fast16_t uDst)
-{
-	// These use techniques adapted from Blargg's techniques mentioned in
-	//  in gpuBlending() comments above. Not as much bitwise trickery is
-	//  necessary because of presence of 0 padding in uSrc24 format.
-
-	u32 uDst24 = gpuGetRGB24(uDst);
-	u32 mix;
-
-	// 0.5 x Back + 0.5 x Forward
-	if (BLENDMODE==0) {
-		const u32 uMsk = 0x1FE7F9FE;
-		// Only need to mask LSBs of uSrc24, uDst24's LSBs are 0 already
-		mix = (uDst24 + (uSrc24 & uMsk)) >> 1;
-	}
-
-	// 1.0 x Back + 1.0 x Forward
-	if (BLENDMODE==1) {
-		u32 sum     = uSrc24 + uDst24;
-		u32 carries = sum & 0x20080200;
-		u32 modulo  = sum - carries;
-		u32 clamp   = carries - (carries >> 9);
-		mix = modulo | clamp;
-	}
-
-	// 1.0 x Back - 1.0 x Forward
-	if (BLENDMODE==2) {
-		// Insert ones in 0-padded borrow slot of color to be subtracted from
-		uDst24 |= 0x20080200;
-		u32 diff    = uDst24 - uSrc24;
-		u32 borrows = diff & 0x20080200;
-		u32 clamp   = borrows - (borrows >> 9);
-		mix = diff & clamp;
-	}
-
-	// 1.0 x Back + 0.25 x Forward
-	if (BLENDMODE==3) {
-		uSrc24 = (uSrc24 & 0x1FC7F1FC) >> 2;
-		u32 sum     = uSrc24 + uDst24;
-		u32 carries = sum & 0x20080200;
-		u32 modulo  = sum - carries;
-		u32 clamp   = carries - (carries >> 9);
-		mix = modulo | clamp;
-	}
-
-	return mix;
-}
-
 #endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_unai/gpu_inner_blend_arm.h b/plugins/gpu_unai/gpu_inner_blend_arm.h
index f887374c..f53a5ee9 100644
--- a/plugins/gpu_unai/gpu_inner_blend_arm.h
+++ b/plugins/gpu_unai/gpu_inner_blend_arm.h
@@ -95,13 +95,8 @@ GPU_INLINE uint_fast16_t gpuBlendingARM(uint_fast16_t uSrc, uint_fast16_t uDst)
 		     : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x8420));
 	}
 
-	// There's not a case where we can get into this function,
-	// SKIP_USRC_MSB_MASK is false, and the msb of uSrc is unset.
-	if (!SKIP_USRC_MSB_MASK) {
-		asm ("orr %[mix], %[mix], #0x8000" : [mix] "+r" (mix));
-	}
-  
 	return mix;
 }
+#define gpuBlending gpuBlendingARM
 
 #endif  //_OP_BLEND_ARM_H_
diff --git a/plugins/gpu_unai/gpu_inner_blend_arm5.h b/plugins/gpu_unai/gpu_inner_blend_arm5.h
deleted file mode 100644
index 0e9b74f1..00000000
--- a/plugins/gpu_unai/gpu_inner_blend_arm5.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/***************************************************************************
-*   Copyright (C) 2010 PCSX4ALL Team                                      *
-*   Copyright (C) 2010 Unai                                               *
-*                                                                         *
-*   This program is free software; you can redistribute it and/or modify  *
-*   it under the terms of the GNU General Public License as published by  *
-*   the Free Software Foundation; either version 2 of the License, or     *
-*   (at your option) any later version.                                   *
-*                                                                         *
-*   This program is distributed in the hope that it will be useful,       *
-*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
-*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
-*   GNU General Public License for more details.                          *
-*                                                                         *
-*   You should have received a copy of the GNU General Public License     *
-*   along with this program; if not, write to the                         *
-*   Free Software Foundation, Inc.,                                       *
-*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
-***************************************************************************/
-
-#ifndef _OP_BLEND_H_
-#define _OP_BLEND_H_
-
-//  GPU Blending operations functions
-
-#define gpuBlending00(uSrc,uDst) \
-{ \
-	asm ("and  %[src], %[src], %[msk]  " : [src] "=r" (uSrc) : "0" (uSrc), [msk] "r" (uMsk)                  ); \
-	asm ("and  %[dst], %[dst], %[msk]  " : [dst] "=r" (uDst) : "0" (uDst), [msk] "r" (uMsk)                  ); \
-	asm ("add  %[src], %[dst], %[src]  " : [src] "=r" (uSrc) :             [dst] "r" (uDst), "0" (uSrc)      ); \
-	asm ("mov  %[src], %[src], lsr #1  " : [src] "=r" (uSrc) : "0" (uSrc)                                    ); \
-}
-
-//	1.0 x Back + 1.0 x Forward
-#define gpuBlending01(uSrc,uDst) \
-{ \
-	u16 st,dt,out; \
-	asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
-	asm ("and    %[st],  %[src],   #0x7C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
-	asm ("add    %[out], %[dt],    %[st]    " : [out] "=r" (out)  :             [dt]  "r" (dt),   [st]  "r" (st)    ); \
-	asm ("cmp    %[out], #0x7C00            " :                   :             [out] "r" (out) : "cc"              ); \
-	asm ("movhi  %[out], #0x7C00            " : [out] "=r" (out)  : "0" (out)                                       ); \
-	asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
-	asm ("and    %[st],  %[src],   #0x03E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
-	asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
-	asm ("cmp    %[dt],  #0x03E0            " :                   :             [dt]  "r" (dt) : "cc"               ); \
-	asm ("movhi  %[dt],  #0x03E0            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
-	asm ("orr    %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
-	asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
-	asm ("and    %[st],  %[src],   #0x001F  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
-	asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
-	asm ("cmp    %[dt],  #0x001F            " :                   :             [dt]  "r" (dt) : "cc"               ); \
-	asm ("movhi  %[dt],  #0x001F            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
-	asm ("orr    %[uSrc], %[out],   %[dt]   " : [uSrc] "=r" (uSrc)  : [out] "r" (out),  [dt]  "r" (dt)              ); \
-}
-
-//	1.0 x Back - 1.0 x Forward	*/
-#define gpuBlending02(uSrc,uDst) \
-{ \
-	u16 st,dt,out; \
-	asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
-	asm ("and    %[st],  %[src],   #0x7C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
-	asm ("subs   %[out], %[dt],    %[st]    " : [out] "=r" (out)  : [dt]  "r" (dt),   [st]  "r" (st) : "cc"         ); \
-	asm ("movmi  %[out], #0x0000            " : [out] "=r" (out)  : "0" (out)                                       ); \
-	asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
-	asm ("and    %[st],  %[src],   #0x03E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
-	asm ("subs   %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st) : "cc"               ); \
-	asm ("orrpl  %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
-	asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
-	asm ("and    %[st],  %[src],   #0x001F  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
-	asm ("subs   %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st) : "cc"               ); \
-	asm ("orrpl  %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
-	asm ("mov %[uSrc], %[out]" : [uSrc] "=r" (uSrc) : [out] "r" (out) ); \
-}
-
-//	1.0 x Back + 0.25 x Forward	*/
-#define gpuBlending03(uSrc,uDst) \
-{ \
-		u16 st,dt,out; \
-		asm ("mov    %[src], %[src],   lsr #2   " : [src] "=r" (uSrc) : "0" (uSrc)                                      ); \
-		asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
-		asm ("and    %[st],  %[src],   #0x1C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
-		asm ("add    %[out], %[dt],    %[st]    " : [out] "=r" (out)  :             [dt]  "r" (dt),   [st]  "r" (st)    ); \
-		asm ("cmp    %[out], #0x7C00            " :                   :             [out] "r" (out) : "cc"              ); \
-		asm ("movhi  %[out], #0x7C00            " : [out] "=r" (out)  : "0" (out)                                       ); \
-		asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
-		asm ("and    %[st],  %[src],   #0x00E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
-		asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
-		asm ("cmp    %[dt],  #0x03E0            " :                   :             [dt]  "r" (dt) : "cc"               ); \
-		asm ("movhi  %[dt],  #0x03E0            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
-		asm ("orr    %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
-		asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
-		asm ("and    %[st],  %[src],   #0x0007  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
-		asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
-		asm ("cmp    %[dt],  #0x001F            " :                   :             [dt]  "r" (dt) : "cc"               ); \
-		asm ("movhi  %[dt],  #0x001F            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
-		asm ("orr    %[uSrc], %[out],   %[dt]   " : [uSrc] "=r" (uSrc)  : [out] "r" (out),  [dt]  "r" (dt)              ); \
-}
-
-#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_unai/gpu_inner_blend_arm7.h b/plugins/gpu_unai/gpu_inner_blend_arm7.h
deleted file mode 100644
index 083e62d8..00000000
--- a/plugins/gpu_unai/gpu_inner_blend_arm7.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/***************************************************************************
-*   Copyright (C) 2010 PCSX4ALL Team                                      *
-*   Copyright (C) 2010 Unai                                               *
-*                                                                         *
-*   This program is free software; you can redistribute it and/or modify  *
-*   it under the terms of the GNU General Public License as published by  *
-*   the Free Software Foundation; either version 2 of the License, or     *
-*   (at your option) any later version.                                   *
-*                                                                         *
-*   This program is distributed in the hope that it will be useful,       *
-*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
-*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
-*   GNU General Public License for more details.                          *
-*                                                                         *
-*   You should have received a copy of the GNU General Public License     *
-*   along with this program; if not, write to the                         *
-*   Free Software Foundation, Inc.,                                       *
-*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
-***************************************************************************/
-
-#ifndef _OP_BLEND_H_
-#define _OP_BLEND_H_
-
-//  GPU Blending operations functions
-
-#define gpuBlending00(uSrc,uDst) \
-{ \
-	asm ("and  %[src], %[src], %[msk]\n" \
-	     "and  %[dst], %[dst], %[msk]\n" \
-	     "add  %[src], %[dst], %[src]\n" \
-	     "mov  %[src], %[src], lsr #1\n" \
-	 : [src] "=&r" (uSrc), [dst] "=&r" (uDst) : "0" (uSrc), "1" (uDst), [msk] "r" (uMsk)); \
-}
-
-//	1.0 x Back + 1.0 x Forward
-#define gpuBlending01(uSrc,uDst) \
-{ \
-	u32 st,dt,out; \
-	asm ("and    %[dt],  %[dst],   #0x7C00\n" \
-	     "and    %[st],  %[src],   #0x7C00\n" \
-	     "add    %[out], %[dt],    %[st]  \n" \
-	     "cmp    %[out], #0x7C00          \n" \
-	     "movhi  %[out], #0x7C00          \n" \
-	     "and    %[dt],  %[dst],   #0x03E0\n" \
-	     "and    %[st],  %[src],   #0x03E0\n" \
-	     "add    %[dt],  %[dt],    %[st]  \n" \
-	     "cmp    %[dt],  #0x03E0          \n" \
-	     "movhi  %[dt],  #0x03E0          \n" \
-	     "orr    %[out], %[out],   %[dt]  \n" \
-	     "and    %[dt],  %[dst],   #0x001F\n" \
-	     "and    %[st],  %[src],   #0x001F\n" \
-	     "add    %[dt],  %[dt],    %[st]  \n" \
-	     "cmp    %[dt],  #0x001F          \n" \
-	     "movhi  %[dt],  #0x001F          \n" \
-	     "orr    %[src], %[out],  %[dt]  \n" \
-	 : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
-	 : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
-}
-
-//	1.0 x Back - 1.0 x Forward	*/
-#define gpuBlending02(uSrc,uDst) \
-{ \
-	u32 st,dt,out; \
-	asm ("and    %[dt],  %[dst],   #0x7C00\n" \
-	     "and    %[st],  %[src],   #0x7C00\n" \
-	     "subs   %[out], %[dt],    %[st]  \n" \
-	     "movmi  %[out], #0x0000          \n" \
-	     "and    %[dt],  %[dst],   #0x03E0\n" \
-	     "and    %[st],  %[src],   #0x03E0\n" \
-	     "subs   %[dt],  %[dt],    %[st]  \n" \
-	     "orrpl  %[out], %[out],   %[dt]  \n" \
-	     "and    %[dt],  %[dst],   #0x001F\n" \
-	     "and    %[st],  %[src],   #0x001F\n" \
-	     "subs   %[dt],  %[dt],    %[st]  \n" \
-	     "orrpl  %[out], %[out],   %[dt]  \n" \
-	     "mov    %[src], %[out]           \n" \
-	 : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
-	 : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
-}
-
-//	1.0 x Back + 0.25 x Forward	*/
-#define gpuBlending03(uSrc,uDst) \
-{ \
-	u32 st,dt,out; \
-	asm ("mov    %[src], %[src],   lsr #2 \n" \
-	     "and    %[dt],  %[dst],   #0x7C00\n" \
-	     "and    %[st],  %[src],   #0x1C00\n" \
-	     "add    %[out], %[dt],    %[st]  \n" \
-	     "cmp    %[out], #0x7C00          \n" \
-	     "movhi  %[out], #0x7C00          \n" \
-	     "and    %[dt],  %[dst],   #0x03E0\n" \
-	     "and    %[st],  %[src],   #0x00E0\n" \
-	     "add    %[dt],  %[dt],    %[st]  \n" \
-	     "cmp    %[dt],  #0x03E0          \n" \
-	     "movhi  %[dt],  #0x03E0          \n" \
-	     "orr    %[out], %[out],   %[dt]  \n" \
-	     "and    %[dt],  %[dst],   #0x001F\n" \
-	     "and    %[st],  %[src],   #0x0007\n" \
-	     "add    %[dt],  %[dt],    %[st]  \n" \
-	     "cmp    %[dt],  #0x001F          \n" \
-	     "movhi  %[dt],  #0x001F          \n" \
-	     "orr    %[src], %[out],   %[dt]  \n" \
-	 : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
-	 : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
-}
-
-#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_unai/gpu_inner_light.h b/plugins/gpu_unai/gpu_inner_light.h
index f4ec2134..643c6e06 100644
--- a/plugins/gpu_unai/gpu_inner_light.h
+++ b/plugins/gpu_unai/gpu_inner_light.h
@@ -70,6 +70,12 @@ static void SetupLightLUT()
 	}
 }
 
+// gcc5+ and clang13+ understand this on ARM
+GPU_INLINE s32 clamp_c(s32 x) {
+    if (x < 0) return 0;
+    if (x > 31) return 31;
+    return x;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // Create packed Gouraud fixed-pt 8.8 rgb triplet
@@ -111,11 +117,12 @@ GPU_INLINE gcol_t gpuPackGouraudCol(u32 r, u32 g, u32 b)
 ////////////////////////////////////////////////////////////////////////////////
 GPU_INLINE gcol_t gpuPackGouraudColInc(s32 dr, s32 dg, s32 db)
 {
-	return (gcol_t){
+	return (gcol_t){{
 		(u16)((dr >> 2) + (dr < 0)),
 		(u16)((dg >> 2) + (dg < 0)),
 		(u16)((db >> 2) + (db < 0)),
-	};
+		0
+	}};
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -136,41 +143,29 @@ GPU_INLINE uint_fast16_t gpuLightingRGB(gcol_t gCol)
 		((gCol.c.b >> 1) & 0x7c00);
 }
 
-////////////////////////////////////////////////////////////////////////////////
-// Convert packed Gouraud u32 fixed-pt 8.8 rgb triplet in 'gCol'
-//  to padded u32 5.4 bgr fixed-pt triplet, suitable for use
-//  with HQ 24-bit lighting/quantization.
-//
-// INPUT:
-//       'gCol' input:  ccccccccXXXXXXXX for c in [r, g, b]
-//                      ^ bit 16
-// RETURNS:
-//         u32 output:  000bbbbbXXXX0gggggXXXX0rrrrrXXXX
-//                      ^ bit 31
-//  Where 'X' are fixed-pt bits, '0' zero-padding, and '-' is don't care
-////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u32 gpuLightingRGB24(gcol_t gCol)
+GPU_INLINE uint_fast16_t gpuLightingRGBDither(gcol_t gCol, int_fast16_t dt)
 {
-	return (gCol.c.r >> 7)
-		| ((gCol.c.g >> 7) << 10)
-		| ((gCol.c.b >> 7) << 20);
+	dt <<= 4;
+	return  clamp_c(((s32)gCol.c.r + dt) >> 11) |
+	       (clamp_c(((s32)gCol.c.g + dt) >> 11) << 5) |
+	       (clamp_c(((s32)gCol.c.b + dt) >> 11) << 10);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Apply fast (low-precision) 5-bit lighting to bgr555 texture color:
 //
 // INPUT:
-//        'r5','g5','b5' are unsigned 5-bit color values, value of 15
+//        'r8','g8','b8' are unsigned 8-bit color values, value of 127
 //          is midpoint that doesn't modify that component of texture
-//        'uSrc' input:  -bbbbbgggggrrrrr
+//        'uSrc' input:  mbbbbbgggggrrrrr
 //                       ^ bit 16
 // RETURNS:
-//          u16 output:  0bbbbbgggggrrrrr
-// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+//          u16 output:  mbbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, 'm' is the MSB to preserve
 ////////////////////////////////////////////////////////////////////////////////
 GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u32 bgr0888)
 {
-	// gcc can move this out of the loop if it wants to
+	// the compiler can move this out of the loop if it wants to
 	uint_fast32_t b5 = (bgr0888 >> 19);
 	uint_fast32_t g5 = (bgr0888 >> 11) & 0x1f;
 	uint_fast32_t r5 = (bgr0888 >>  3) & 0x1f;
@@ -189,11 +184,11 @@ GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u32 bgr0888)
 //  'gCol' is a Gouraud fixed-pt 8.8 rgb triplet
 //        'gCol' input:  ccccccccXXXXXXXX for c in [r, g, b]
 //                       ^ bit 16
-//        'uSrc' input:  -bbbbbgggggrrrrr
+//        'uSrc' input:  mbbbbbgggggrrrrr
 //                       ^ bit 16
 // RETURNS:
-//          u16 output:  0bbbbbgggggrrrrr
-// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+//          u16 output:  mbbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, 'm' is the MSB to preserve
 ////////////////////////////////////////////////////////////////////////////////
 GPU_INLINE uint_fast16_t gpuLightingTXTGouraudGeneric(uint_fast16_t uSrc, gcol_t gCol)
 {
@@ -205,72 +200,41 @@ GPU_INLINE uint_fast16_t gpuLightingTXTGouraudGeneric(uint_fast16_t uSrc, gcol_t
 
 ////////////////////////////////////////////////////////////////////////////////
 // Apply high-precision 8-bit lighting to bgr555 texture color,
-//  returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet
-//  suitable for use with HQ 24-bit lighting/quantization.
 //
 // INPUT:
-//        'r8','g8','b8' are unsigned 8-bit color component values, value of
+//        'r','g','b' are unsigned 8-bit color component values, value of
 //          127 is midpoint that doesn't modify that component of texture
 //
-//         uSrc input: -bbbbbgggggrrrrr
+//         uSrc input: mbbbbbgggggrrrrr
 //                     ^ bit 16
 // RETURNS:
-//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
-//                     ^ bit 31
-// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+//        u16 output:  mbbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, 'm' is the MSB to preserve
 ////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u32 bgr0888)
+GPU_INLINE uint_fast16_t gpuLightingTXTDitherRGB(uint_fast16_t uSrc,
+	uint_fast8_t r, uint_fast8_t g, uint_fast8_t b, int_fast16_t dv)
 {
-	uint_fast16_t r1 = uSrc&0x001F;
-	uint_fast16_t g1 = uSrc&0x03E0;
-	uint_fast16_t b1 = uSrc&0x7C00;
-
-	uint_fast16_t r2 = bgr0888 & 0x0000ff;
-	uint_fast32_t g2 = bgr0888 & 0x00ff00;
-	uint_fast16_t b2 = bgr0888 >> 16;
-
-	u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
-	u32 g3 = g1 * g2; if (g3 & 0xFE000000) g3 = ~0xFE000000;
-	u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;
-
-	return ((r3>> 3)    ) |
-	       ((g3>>16)<<10) |
-	       ((b3>>13)<<20);
+	uint_fast16_t rs = uSrc & 0x001F;
+	uint_fast16_t gs = uSrc & 0x03E0;
+	uint_fast16_t bs = uSrc & 0x7C00;
+	s32 r3 = rs * r +  dv;
+	s32 g3 = gs * g + (dv << 5);
+	s32 b3 = bs * b + (dv << 10);
+	return  clamp_c(r3 >> 7) |
+	       (clamp_c(g3 >> 12) << 5) |
+	       (clamp_c(b3 >> 17) << 10) |
+	       (uSrc & 0x8000);
 }
 
-
-////////////////////////////////////////////////////////////////////////////////
-// Apply high-precision 8-bit lighting to bgr555 texture color in 'uSrc',
-//  returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet
-//  suitable for use with HQ 24-bit lighting/quantization.
-//
-// INPUT:
-//       'uSrc' input: -bbbbbgggggrrrrr
-//                     ^ bit 16
-//       'gCol' input: ccccccccXXXXXXXX for c in [r, g, b]
-//                     ^ bit 16
-// RETURNS:
-//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
-//                     ^ bit 31
-// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
-////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u32 gpuLightingTXT24Gouraud(uint_fast16_t uSrc, gcol_t gCol)
+GPU_INLINE uint_fast16_t gpuLightingTXTDither(uint_fast16_t uSrc, u32 bgr0888, int_fast16_t dv)
 {
-	uint_fast16_t r1 = uSrc&0x001F;
-	uint_fast16_t g1 = uSrc&0x03E0;
-	uint_fast16_t b1 = uSrc&0x7C00;
-
-	uint_fast16_t r2 = gCol.c.r >> 8;
-	uint_fast16_t g2 = gCol.c.g >> 8;
-	uint_fast16_t b2 = gCol.c.b >> 8;
-
-	u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
-	u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;
-	u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;
+	return gpuLightingTXTDitherRGB(uSrc, bgr0888 & 0xff,
+			(bgr0888 >> 8) & 0xff, bgr0888 >> 16, dv);
+}
 
-	return ((r3>> 3)    ) |
-	       ((g3>> 8)<<10) |
-	       ((b3>>13)<<20);
+GPU_INLINE uint_fast16_t gpuLightingTXTGouraudDither(uint_fast16_t uSrc, gcol_t gCol, int_fast8_t dv)
+{
+	return gpuLightingTXTDitherRGB(uSrc, gCol.c.r >> 8, gCol.c.g >> 8, gCol.c.b >> 8, dv);
 }
 
 #endif  //_OP_LIGHT_H_
diff --git a/plugins/gpu_unai/gpu_inner_quantization.h b/plugins/gpu_unai/gpu_inner_quantization.h
index 8a4e9354..5abcd2d3 100644
--- a/plugins/gpu_unai/gpu_inner_quantization.h
+++ b/plugins/gpu_unai/gpu_inner_quantization.h
@@ -22,87 +22,17 @@
 
 static void SetupDitheringConstants()
 {
-	// Initialize Dithering Constants
-	// The screen is divided into 8x8 chunks and sub-unitary noise is applied
-	// using the following matrix. This ensures that data lost in color
-	// quantization will be added back to the image 'by chance' in predictable
-	// patterns that are naturally 'smoothed' by your sight when viewed from a
-	// certain distance.
-	//
-	// http://caca.zoy.org/study/index.html
-	//
-	// Shading colors are encoded in 4.5, and then are quantitized to 5.0,
-	// DitherMatrix constants reflect that.
-
-	static const u8 DitherMatrix[] = {
-		 0, 32,  8, 40,  2, 34, 10, 42,
-		48, 16, 56, 24, 50, 18, 58, 26,
-		12, 44,  4, 36, 14, 46,  6, 38,
-		60, 28, 52, 20, 62, 30, 54, 22,
-		 3, 35, 11, 43,  1, 33,  9, 41,
-		51, 19, 59, 27, 49, 17, 57, 25,
-		15, 47,  7, 39, 13, 45,  5, 37,
-		63, 31, 55, 23, 61, 29, 53, 21
+	static const s8 DitherMatrix[4][4] = {
+		{ -4,  0, -3,  1 },
+		{  2, -2,  3, -1 },
+		{ -3,  1, -4,  0 },
+		{  3, -1,  2, -2 }
 	};
 
 	int i, j;
-	for (i = 0; i < 8; i++)
-	{
-		for (j = 0; j < 8; j++)
-		{
-			u16 offset = (i << 3) | j;
-
-			u32 component = ((DitherMatrix[offset] + 1) << 4) / 65; //[5.5] -> [5]
-
-			// XXX - senquack - hack Dec 2016
-			//  Until JohnnyF gets the time to work further on dithering,
-			//   force lower bit of component to 0. This fixes grid pattern
-			//   affecting quality of dithered image, as well as loss of
-			//   detail in dark areas. With lower bit unset like this, existing
-			//   27-bit accuracy of dithering math is unneeded, could be 24-bit.
-			//   Is 8x8 matrix overkill as a result, can we use 4x4?
-			component &= ~1;
-
-			gpu_unai.DitherMatrix[offset] = (component)
-			                              | (component << 10)
-			                              | (component << 20);
-		}
-	}
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Convert padded u32 5.4:5.4:5.4 bgr fixed-pt triplet to final bgr555 color,
-//  applying dithering if specified by template parameter.
-//
-// INPUT:
-//     'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
-//                     ^ bit 31
-//       'pDst' is a pointer to destination framebuffer pixel, used
-//         to determine which DitherMatrix[] entry to apply.
-// RETURNS:
-//         u16 output: 0bbbbbgggggrrrrr
-//                     ^ bit 16
-// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
-////////////////////////////////////////////////////////////////////////////////
-template <int DITHER>
-GPU_INLINE u16 gpuColorQuantization24(u32 uSrc24, const le16_t *pDst)
-{
-	if (DITHER)
-	{
-		uintptr_t fbpos = pDst - gpu_unai.vram;
-		u16 offset = ((fbpos & (0x7 << 10)) >> 7) | (fbpos & 0x7);
-
-		//clean overflow flags and add
-		uSrc24 = (uSrc24 & 0x1FF7FDFF) + gpu_unai.DitherMatrix[offset];
-
-		if (uSrc24 & (1<< 9)) uSrc24 |= (0x1FF    );
-		if (uSrc24 & (1<<19)) uSrc24 |= (0x1FF<<10);
-		if (uSrc24 & (1<<29)) uSrc24 |= (0x1FF<<20);
-	}
-
-	return ((uSrc24>> 4) & (0x1F    ))
-	     | ((uSrc24>> 9) & (0x1F<<5 ))
-	     | ((uSrc24>>14) & (0x1F<<10));
+	for (i = 0; i < 4; i++)
+		for (j = 0; j < 4; j++)
+			gpu_unai.DitherLut16[i][j] = (u16)DitherMatrix[i][j] << 4;
 }
 
 #endif //_OP_DITHER_H_
diff --git a/plugins/gpu_unai/gpu_raster_polygon.h b/plugins/gpu_unai/gpu_raster_polygon.h
index 9b259bb1..988e721f 100644
--- a/plugins/gpu_unai/gpu_raster_polygon.h
+++ b/plugins/gpu_unai/gpu_raster_polygon.h
@@ -371,7 +371,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 				if ((xmin - xa) > 0) xa = xmin;
 				if (xb > xmax) xb = xmax;
 				if ((xb - xa) > 0)
-					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
+					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa), ya);
 			}
 		}
 	} while (++cur_pass < total_passes);
@@ -387,10 +387,6 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 	gpu_unai.inn.r8 = packet.U1[0];
 	gpu_unai.inn.g8 = packet.U1[1];
 	gpu_unai.inn.b8 = packet.U1[2];
-	// r5/g5/b5 used if just texture-blending is applied (15-bit light)
-	gpu_unai.inn.r5 = packet.U1[0] >> 3;
-	gpu_unai.inn.g5 = packet.U1[1] >> 3;
-	gpu_unai.inn.b5 = packet.U1[2] >> 3;
 
 	PolyVertex vbuf[4];
 	polyInitVertexBuffer(vbuf, packet, ptype, is_quad);
@@ -706,7 +702,7 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 
 				if (xb > xmax) xb = xmax;
 				if ((xb - xa) > 0)
-					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
+					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa), ya);
 			}
 		}
 	} while (++cur_pass < total_passes);
@@ -1055,7 +1051,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 
 				if (xb > xmax) xb = xmax;
 				if ((xb - xa) > 0)
-					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
+					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa), ya);
 			}
 		}
 	} while (++cur_pass < total_passes);
@@ -1462,7 +1458,7 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 
 				if (xb > xmax) xb = xmax;
 				if ((xb - xa) > 0)
-					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
+					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa), ya);
 			}
 		}
 	} while (++cur_pass < total_passes);
diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h
index 2e30a283..ec0e7151 100644
--- a/plugins/gpu_unai/gpu_unai.h
+++ b/plugins/gpu_unai/gpu_unai.h
@@ -255,12 +255,20 @@ struct gpu_unai_inner_t {
 	};
 
 	// Color for flat-shaded, texture-blended prims
-	u8  r5, g5, b5, pad5;     // 20 5-bit light for undithered prims
-	u8  r8, g8, b8, pad8;     // 24 8-bit light for dithered prims
+	u8  r5, g5, b5, pad5;     // 20 5-bit light for sprite asm
+	union {
+	  u32 bgr0888;            // 24 8-bit light for dithered prims
+	  struct {
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	    u8 pad8, b8, g8, r8;
+#else
+	    u8 r8, g8, b8, pad8;
+#endif
+	  };
+	};
 
 	// Color for Gouraud-shaded prims
 	// Fixed-pt 8.8 rgb triplet
-	// Packed fixed-pt 8.3:8.3:8.2 rgb triplet
 	//  layout:  ccccccccXXXXXXXX for c in [r, g, b]
 	//           ^ bit 16
 	gcol_t gCol;       // 28
@@ -269,10 +277,7 @@ struct gpu_unai_inner_t {
 	// Color for flat-shaded, untextured prims
 	u16 PixelData;     // 38 bgr555 color for untextured flat-shaded polys
 
-	u8 blit_mask;           // Determines what pixels to skip when rendering.
-	                        //  Only useful on low-resolution devices using
-	                        //  a simple pixel-dropping downscaler for PS1
-	                        //  high-res modes. See 'pixel_skip' option.
+	u8 unused2;
 
 	u8 ilace_mask;          // Determines what lines to skip when rendering.
 	                        //  Normally 0 when PS1 240 vertical res is in
@@ -356,6 +361,8 @@ struct gpu_unai_t {
 	// End of inner Loop parameters
 	////////////////////////////////////////////////////////////////////////////
 
+	s16 DitherLut16[4][4];  // shifted up by 4 and s16 to simplify lookup asm
+
 	bool prog_ilace_flag;   // Tracks successive frames for 'prog_ilace' option
 
 	u8 BLEND_MODE;
@@ -367,7 +374,6 @@ struct gpu_unai_t {
 	gpu_unai_config_t config;
 
 	u8  LightLUT[32*32];    // 5-bit lighting LUT (gpu_inner_light.h)
-	u32 DitherMatrix[64];   // Matrix of dither coefficients
 };
 
 static __attribute__((aligned(32))) gpu_unai_t gpu_unai;
diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp
index 71c92728..2a8ae75e 100644
--- a/plugins/gpu_unai/gpulib_if.cpp
+++ b/plugins/gpu_unai/gpulib_if.cpp
@@ -274,7 +274,6 @@ int renderer_do_cmd_list(u32 *list_, int list_len, uint32_t *ex_regs,
       case 0x22:
       case 0x23: {          // Monochrome 3-pt poly
         PP driver = gpuPolySpanDrivers[
-          //(gpu_unai.blit_mask?1024:0) |
           Blending_Mode |
           gpu_unai.Masking | Blending | gpu_unai.PixelMSB
         ];
@@ -290,7 +289,6 @@ int renderer_do_cmd_list(u32 *list_, int list_len, uint32_t *ex_regs,
         gpuSetTexture(le32_to_u32(gpu_unai.PacketBuffer.U4[4]) >> 16);
 
         u32 driver_idx =
-          //(gpu_unai.blit_mask?1024:0) |
           Dithering |
           Blending_Mode | gpu_unai.TEXT_MODE |
           gpu_unai.Masking | Blending | gpu_unai.PixelMSB;
@@ -312,7 +310,6 @@ int renderer_do_cmd_list(u32 *list_, int list_len, uint32_t *ex_regs,
       case 0x2A:
       case 0x2B: {          // Monochrome 4-pt poly
         PP driver = gpuPolySpanDrivers[
-          //(gpu_unai.blit_mask?1024:0) |
           Blending_Mode |
           gpu_unai.Masking | Blending | gpu_unai.PixelMSB
         ];
@@ -340,7 +337,6 @@ int renderer_do_cmd_list(u32 *list_, int list_len, uint32_t *ex_regs,
         gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
 
         u32 driver_idx =
-          //(gpu_unai.blit_mask?1024:0) |
           Dithering |
           Blending_Mode | gpu_unai.TEXT_MODE |
           gpu_unai.Masking | Blending | gpu_unai.PixelMSB;
@@ -372,7 +368,6 @@ int renderer_do_cmd_list(u32 *list_, int list_len, uint32_t *ex_regs,
         if ((xor_ & HTOLE32(0xf8f8f8)) == 0)
           gouraud = 0;
         PP driver = gpuPolySpanDrivers[
-          //(gpu_unai.blit_mask?1024:0) |
           Dithering |
           Blending_Mode |
           gpu_unai.Masking | Blending | gouraud | gpu_unai.PixelMSB
@@ -403,7 +398,6 @@ int renderer_do_cmd_list(u32 *list_, int list_len, uint32_t *ex_regs,
           }
         }
         PP driver = gpuPolySpanDrivers[
-          //(gpu_unai.blit_mask?1024:0) |
           Dithering |
           Blending_Mode | gpu_unai.TEXT_MODE |
           gpu_unai.Masking | Blending | gouraud | lighting | gpu_unai.PixelMSB
@@ -427,7 +421,6 @@ int renderer_do_cmd_list(u32 *list_, int list_len, uint32_t *ex_regs,
         if ((xor_ & HTOLE32(0xf8f8f8)) == 0)
           gouraud = 0;
         PP driver = gpuPolySpanDrivers[
-          //(gpu_unai.blit_mask?1024:0) |
           Dithering |
           Blending_Mode |
           gpu_unai.Masking | Blending | gouraud | gpu_unai.PixelMSB
@@ -470,7 +463,6 @@ int renderer_do_cmd_list(u32 *list_, int list_len, uint32_t *ex_regs,
           }
         }
         PP driver = gpuPolySpanDrivers[
-          //(gpu_unai.blit_mask?1024:0) |
           Dithering |
           Blending_Mode | gpu_unai.TEXT_MODE |
           gpu_unai.Masking | Blending | gouraud | lighting | gpu_unai.PixelMSB