//#endif
#include "gpu_inner_blend.h"
-
-#ifdef __arm__
-#include "gpu_inner_blend_arm.h"
-#define gpuBlending gpuBlendingARM
-#else
-#define gpuBlending gpuBlendingGeneric
-#endif
-
#include "gpu_inner_quantization.h"
#include "gpu_inner_light.h"
#ifdef __arm__
+#include "gpu_inner_blend_arm.h"
#include "gpu_inner_light_arm.h"
+#define gpuBlending gpuBlendingARM
#define gpuLightingRGB gpuLightingRGBARM
#define gpuLightingTXT gpuLightingTXTARM
#define gpuLightingTXTGouraud gpuLightingTXTGouraudARM
+// On ARM, the lighting and blending functions preserve the MSB of uSrc
+// when dithering is off. This saves a few operations and needless load/stores.
+#define MSB_PRESERVED (!CF_DITHER)
#else
+#define gpuBlending gpuBlendingGeneric
#define gpuLightingRGB gpuLightingRGBGeneric
#define gpuLightingTXT gpuLightingTXTGeneric
#define gpuLightingTXTGouraud gpuLightingTXTGouraudGeneric
+#define MSB_PRESERVED 0
#endif
else { *(u16*)pDst = col; }
}
} else {
- u16 uDst = *(u16*)pDst;
+ uint_fast16_t uDst = *(u16*)pDst;
if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; }
- u16 uSrc = col;
+ uint_fast16_t uSrc = col;
if (CF_BLEND)
uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
else { *(u16*)pDst = col; }
}
} else {
- u16 uDst = *(u16*)pDst;
+ uint_fast16_t uDst = *(u16*)pDst;
if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; }
col = gpuGouraudColor15bpp(r, g, b);
- u16 uSrc = col;
+ uint_fast16_t uSrc = col;
// Blend func can save an operation if it knows uSrc MSB is
// unset. For untextured prims, this is always true.
// unset. For untextured prims, this is always true.
const bool skip_uSrc_mask = true;
- u16 uSrc, uDst;
+ uint_fast16_t uSrc, uDst;
do
{
if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
{
// Blend func can save an operation if it knows uSrc MSB is unset.
// Untextured prims can always skip (source color always comes with MSB=0).
- // For textured prims, lighting funcs always return it unset. (bonus!)
- const bool skip_uSrc_mask = (!CF_TEXTMODE) || CF_LIGHT;
+ // For textured prims, the generic lighting funcs always return it unset. (bonus!)
+ const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
- u16 uSrc, uDst, srcMSB;
+ uint_fast16_t uSrc, uDst, srcMSB;
+ bool should_blend;
u32 u0_mask = gpu_unai.TextureWindow[2];
u8 r5, g5, b5;
if (CF_LIGHT)
uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
- if (CF_BLEND && srcMSB)
+ should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
+
+ if (CF_BLEND && should_blend)
uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
- if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
- else if (CF_BLEND || CF_LIGHT) { *pDst = uSrc | srcMSB; }
- else { *pDst = uSrc; }
+ if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+ else if (!MSB_PRESERVED && (CF_BLEND || CF_LIGHT)) { *pDst = uSrc | srcMSB; }
+ else { *pDst = uSrc; }
endsprite:
u0 += (CF_TEXTMODE==3) ? 2 : 1;
{
// Blend func can save an operation if it knows uSrc MSB is unset.
// Untextured prims can always skip this (src color MSB is always 0).
- // For textured prims, lighting funcs always return it unset. (bonus!)
- const bool skip_uSrc_mask = (!CF_TEXTMODE) || CF_LIGHT;
+ // For textured prims, the generic lighting funcs always return it unset. (bonus!)
+ const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
+ bool should_blend;
u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.blit_mask;
// UNTEXTURED, NO GOURAUD
const u16 pix15 = gpu_unai.PixelData;
do {
- u16 uSrc, uDst;
+ uint_fast16_t uSrc, uDst;
// NOTE: Don't enable CF_BLITMASK pixel skipping (speed hack)
// on untextured polys. It seems to do more harm than good: see
u32 l_gInc = gpu_unai.gInc;
do {
- u16 uDst, uSrc;
+ uint_fast16_t uDst, uSrc;
// See note in above loop regarding CF_BLITMASK
//if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolynotextgou; }
{
// TEXTURED
- u16 uDst, uSrc, srcMSB;
+ uint_fast16_t uDst, uSrc, srcMSB;
//senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into
// one 32-bit unsigned int, but this proved to lose too much accuracy
uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
}
- if (CF_BLEND && srcMSB)
+ should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
+ if (CF_BLEND && should_blend)
uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
}
- if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
- else if (CF_BLEND || CF_LIGHT) { *pDst = uSrc | srcMSB; }
- else { *pDst = uSrc; }
+ if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+ else if (!MSB_PRESERVED && (CF_BLEND || CF_LIGHT)) { *pDst = uSrc | srcMSB; }
+ else { *pDst = uSrc; }
endpolytext:
pDst++;
l_u = (l_u + l_u_inc) & l_u_msk;
// Where '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
template <int BLENDMODE, bool SKIP_USRC_MSB_MASK>
-GPU_INLINE u16 gpuBlendingGeneric(u16 uSrc, u16 uDst)
+GPU_INLINE uint_fast16_t gpuBlendingGeneric(uint_fast16_t uSrc, uint_fast16_t uDst)
{
// These use Blargg's bitwise modulo-clamping:
// http://blargg.8bitalley.com/info/rgb_mixing.html
// http://blargg.8bitalley.com/info/rgb_clamped_add.html
// http://blargg.8bitalley.com/info/rgb_clamped_sub.html
- u16 mix;
+ uint_fast16_t mix;
// 0.5 x Back + 0.5 x Forward
if (BLENDMODE==0) {
// ^ bit 31
// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u32 gpuGetRGB24(u16 uSrc)
+GPU_INLINE u32 gpuGetRGB24(uint_fast16_t uSrc)
{
return ((uSrc & 0x7C00)<<14)
| ((uSrc & 0x03E0)<< 9)
// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
template <int BLENDMODE>
-GPU_INLINE u32 gpuBlending24(u32 uSrc24, u16 uDst)
+GPU_INLINE u32 gpuBlending24(u32 uSrc24, uint_fast16_t uDst)
{
// These use techniques adapted from Blargg's techniques mentioned in
// in gpuBlending() comments above. Not as much bitwise trickery is
// Where '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
template <int BLENDMODE, bool SKIP_USRC_MSB_MASK>
-GPU_INLINE u16 gpuBlendingARM(u16 uSrc, u16 uDst)
+GPU_INLINE uint_fast16_t gpuBlendingARM(uint_fast16_t uSrc, uint_fast16_t uDst)
{
// These use Blargg's bitwise modulo-clamping:
// http://blargg.8bitalley.com/info/rgb_mixing.html
// http://blargg.8bitalley.com/info/rgb_clamped_sub.html
- u16 mix;
+ uint_fast16_t mix;
asm ("bic %[uDst], %[uDst], #0x8000" : [uDst] "+r" (uDst));
: [diff] "=&r" (diff), [mix] "=&r" (mix)
: [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x8420));
}
+
+ // This function is never entered with SKIP_USRC_MSB_MASK false and the
+ // MSB of uSrc unset, so the MSB can be set unconditionally here.
+ if (!SKIP_USRC_MSB_MASK) {
+ asm ("orr %[mix], %[mix], #0x8000" : [mix] "+r" (mix));
+ }
return mix;
}
// ^ bit 16
// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero
////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u16 gpuLightingRGBGeneric(u32 gCol)
+GPU_INLINE uint_fast16_t gpuLightingRGBGeneric(u32 gCol)
{
return ((gCol<< 5)&0x7C00) |
((gCol>>11)&0x03E0) |
// u16 output: 0bbbbbgggggrrrrr
// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u16 gpuLightingTXTGeneric(u16 uSrc, u8 r5, u8 g5, u8 b5)
+GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
{
return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | b5] << 10) |
(gpu_unai.LightLUT[ (uSrc&0x03E0) | g5] << 5) |
// u16 output: 0bbbbbgggggrrrrr
// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u16 gpuLightingTXTGouraudGeneric(u16 uSrc, u32 gCol)
+GPU_INLINE uint_fast16_t gpuLightingTXTGouraudGeneric(uint_fast16_t uSrc, u32 gCol)
{
return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | ((gCol>> 5)&0x1F)]<<10) |
(gpu_unai.LightLUT[ (uSrc&0x03E0) | ((gCol>>16)&0x1F)]<< 5) |
// ^ bit 31
// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u32 gpuLightingTXT24(u16 uSrc, u8 r8, u8 g8, u8 b8)
+GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u8 r8, u8 g8, u8 b8)
{
- u16 r1 = uSrc&0x001F;
- u16 g1 = uSrc&0x03E0;
- u16 b1 = uSrc&0x7C00;
+ uint_fast16_t r1 = uSrc&0x001F;
+ uint_fast16_t g1 = uSrc&0x03E0;
+ uint_fast16_t b1 = uSrc&0x7C00;
- u16 r2 = r8;
- u16 g2 = g8;
- u16 b2 = b8;
+ uint_fast16_t r2 = r8;
+ uint_fast16_t g2 = g8;
+ uint_fast16_t b2 = b8;
u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;
// ^ bit 31
// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u32 gpuLightingTXT24Gouraud(u16 uSrc, u32 gCol)
+GPU_INLINE u32 gpuLightingTXT24Gouraud(uint_fast16_t uSrc, u32 gCol)
{
- u16 r1 = uSrc&0x001F;
- u16 g1 = uSrc&0x03E0;
- u16 b1 = uSrc&0x7C00;
+ uint_fast16_t r1 = uSrc&0x001F;
+ uint_fast16_t g1 = uSrc&0x03E0;
+ uint_fast16_t b1 = uSrc&0x7C00;
- u16 r2 = (gCol>>24) & 0xFF;
- u16 g2 = (gCol>>13) & 0xFF;
- u16 b2 = (gCol>> 2) & 0xFF;
+ uint_fast16_t r2 = (gCol>>24) & 0xFF;
+ uint_fast16_t g2 = (gCol>>13) & 0xFF;
+ uint_fast16_t b2 = (gCol>> 2) & 0xFF;
u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;
// ^ bit 16
// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero
////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u16 gpuLightingRGBARM(u32 gCol)
+GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol)
{
- u16 out = 0x03E0; // don't need the mask after starting to write output
+ uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output
u32 tmp;
asm ("and %[tmp], %[gCol], %[out]\n\t" // tmp holds 0x000000bbbbb00000
}
-GPU_INLINE u16 gpuLightingTXTARM(u16 uSrc, u8 r5, u8 g5, u8 b5)
+GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
{
- u16 out = 0x03E0;
+ uint_fast16_t out = 0x03E0;
u32 db, dg;
asm ("and %[dg], %[out], %[src] \n\t"
"orr %[dg], %[dg], %[g5] \n\t"
"orr %[db], %[db], %[b5] \n\t"
"ldrb %[out], [%[lut], %[out]] \n\t"
"ldrb %[db], [%[lut], %[db]] \n\t"
+ "tst %[src], #0x8000\n\t"
"orr %[out], %[out], %[dg], lsl #0x05 \n\t"
+ "orrne %[out], %[out], #0x8000\n\t"
"orr %[out], %[out], %[db], lsl #0x0A \n\t"
: [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg)
: [r5] "r" (r5), [g5] "r" (g5), [b5] "r" (b5),
return out;
}
-GPU_INLINE u16 gpuLightingTXTGouraudARM(u16 uSrc, u32 gCol)
+GPU_INLINE uint_fast16_t gpuLightingTXTGouraudARM(uint_fast16_t uSrc, u32 gCol)
{
- u16 out = 0x03E0; // don't need the mask after starting to write output
+ uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output
u32 db,dg,gtmp;
asm ("and %[dg], %[out], %[src] \n\t"
"and %[gtmp],%[out], %[gCol], lsr #0x0B \n\t"
"orr %[db], %[db], %[gtmp], lsr #0x05 \n\t"
"ldrb %[out], [%[lut], %[out]] \n\t"
"ldrb %[db], [%[lut], %[db]] \n\t"
+ "tst %[src], #0x8000\n\t"
"orr %[out], %[out], %[dg], lsl #0x05 \n\t"
+ "orrne %[out], %[out], #0x8000\n\t"
"orr %[out], %[out], %[db], lsl #0x0A \n\t"
: [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg),
[gtmp] "=&r" (gtmp) \