-// 1.0 x Back + 0.25 x Forward */
-#ifdef __arm__
-#define gpuBlending03(uSrc,uDst) \
-{ \
- u32 st,dt,out; \
- asm ("mov %[src], %[src], lsr #2 \n" \
- "and %[dt], %[dst], #0x7C00\n" \
- "and %[st], %[src], #0x1C00\n" \
- "add %[out], %[dt], %[st] \n" \
- "cmp %[out], #0x7C00 \n" \
- "movhi %[out], #0x7C00 \n" \
- "and %[dt], %[dst], #0x03E0\n" \
- "and %[st], %[src], #0x00E0\n" \
- "add %[dt], %[dt], %[st] \n" \
- "cmp %[dt], #0x03E0 \n" \
- "movhi %[dt], #0x03E0 \n" \
- "orr %[out], %[out], %[dt] \n" \
- "and %[dt], %[dst], #0x001F\n" \
- "and %[st], %[src], #0x0007\n" \
- "add %[dt], %[dt], %[st] \n" \
- "cmp %[dt], #0x001F \n" \
- "movhi %[dt], #0x001F \n" \
- "orr %[src], %[out], %[dt] \n" \
- : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
- : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
-}
-#else
-#define gpuBlending03(uSrc,uDst) \
-{ \
- u16 rr, gg, bb; \
- uSrc >>= 2; \
- bb = (uDst & 0x7C00) + (uSrc & 0x1C00); if (bb > 0x7C00) bb = 0x7C00; \
- gg = (uDst & 0x03E0) + (uSrc & 0x00E0); if (gg > 0x03E0) gg = 0x03E0; bb |= gg; \
- rr = (uDst & 0x001F) + (uSrc & 0x0007); if (rr > 0x001F) rr = 0x001F; bb |= rr; \
- uSrc = bb; \
+
+////////////////////////////////////////////////////////////////////////////////
+// Blend padded u32 5.4:5.4:5.4 bgr fixed-pt color triplet in 'uSrc24'
+// (foreground color) with bgr555 color in 'uDst' (background color),
+// returning the resulting u32 5.4:5.4:5.4 color.
+//
+// INPUT:
+// 'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+// ^ bit 31
+// 'uDst' input: -bbbbbgggggrrrrr
+// ^ bit 15
+// RETURNS:
+// u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+// ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE>
+GPU_INLINE u32 gpuBlending24(u32 uSrc24, uint_fast16_t uDst)
+{
+ // These use methods adapted from Blargg's techniques mentioned in the
+ // gpuBlending() comments above. Not as much bitwise trickery is
+ // necessary because of the presence of 0 padding in the uSrc24 format.
+
+ u32 uDst24 = gpuGetRGB24(uDst);
+ u32 mix;
+
+ // 0.5 x Back + 0.5 x Forward
+ if (BLENDMODE==0) {
+ const u32 uMsk = 0x1FE7F9FE;
+ // Only need to mask LSBs of uSrc24, uDst24's LSBs are 0 already
+ mix = (uDst24 + (uSrc24 & uMsk)) >> 1;
+ }
+
+ // 1.0 x Back + 1.0 x Forward
+ if (BLENDMODE==1) {
+ u32 sum = uSrc24 + uDst24;
+ u32 carries = sum & 0x20080200;
+ u32 modulo = sum - carries;
+ u32 clamp = carries - (carries >> 9);
+ mix = modulo | clamp;
+ }
+
+ // 1.0 x Back - 1.0 x Forward
+ if (BLENDMODE==2) {
+ // Insert ones in 0-padded borrow slot of color to be subtracted from
+ uDst24 |= 0x20080200;
+ u32 diff = uDst24 - uSrc24;
+ u32 borrows = diff & 0x20080200;
+ u32 clamp = borrows - (borrows >> 9);
+ mix = diff & clamp;
+ }
+
+ // 1.0 x Back + 0.25 x Forward
+ if (BLENDMODE==3) {
+ uSrc24 = (uSrc24 & 0x1FC7F1FC) >> 2;
+ u32 sum = uSrc24 + uDst24;
+ u32 carries = sum & 0x20080200;
+ u32 modulo = sum - carries;
+ u32 clamp = carries - (carries >> 9);
+ mix = modulo | clamp;
+ }
+
+ return mix;