#include "gpu_inner_blend_arm.h"
#include "gpu_inner_light_arm.h"
#define gpuBlending gpuBlendingARM
-#define gpuLightingTXT gpuLightingTXTARM
-#else
+#endif
+#ifndef gpuBlending
#define gpuBlending gpuBlendingGeneric
+#endif
+#ifndef gpuLightingTXT // gpuLightingTXTARM
#define gpuLightingTXT gpuLightingTXTGeneric
#endif
+#ifndef gpuLightingTXTGouraud // gpuLightingTXTGouraudARM
+#define gpuLightingTXTGouraud gpuLightingTXTGouraudGeneric
+#endif
// Non-dithering lighting and blending functions preserve uSrc
// MSB. This saves a few operations and useless load/stores.
uint_fast16_t uSrc, uDst, srcMSB;
bool should_blend;
u32 u0_mask = inn.u_msk >> 10;
+ u32 bgr0888;
- u8 r5, g5, b5;
if (CF_LIGHT) {
- r5 = inn.r5;
- g5 = inn.g5;
- b5 = inn.b5;
+ bgr0888 = (gpu_unai.inn.b8 << 16) |
+ (gpu_unai.inn.g8 << 8) |
+ gpu_unai.inn.r8;
}
const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = inn.CBA;
if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
if (CF_LIGHT)
- uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+ uSrc = gpuLightingTXT(uSrc, bgr0888);
should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
endpolynotextgou:
pDst++;
- l_gCol.raw += l_gInc.raw;
+ l_gCol += l_gInc;
}
while (--count);
}
const le16_t* TBA_ = gpu_unai.inn.TBA;
const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.inn.CBA;
- u8 r5, g5, b5;
- u8 r8, g8, b8;
+ u32 bgr0888;
gcol_t l_gInc, l_gCol;
+ int pcounter = count - 1; // "repeat while positive" counter
if (CF_LIGHT) {
if (CF_GOURAUD) {
l_gInc = gpu_unai.inn.gInc;
l_gCol = gpu_unai.inn.gCol;
+
+ l_gInc.set_counter(-1);
+ l_gCol.set_counter(pcounter);
} else {
- if (CF_DITHER) {
- r8 = gpu_unai.inn.r8;
- g8 = gpu_unai.inn.g8;
- b8 = gpu_unai.inn.b8;
- } else {
- r5 = gpu_unai.inn.r5;
- g5 = gpu_unai.inn.g5;
- b5 = gpu_unai.inn.b5;
- }
+ // keep this packed, otherwise gcc runs out of regs
+ bgr0888 = (gpu_unai.inn.b8 << 16) |
+ (gpu_unai.inn.g8 << 8) |
+ gpu_unai.inn.r8;
+ // XXX pre-pack
}
}
if ( CF_GOURAUD)
uSrc24 = gpuLightingTXT24Gouraud(uSrc, l_gCol);
if (!CF_GOURAUD)
- uSrc24 = gpuLightingTXT24(uSrc, r8, g8, b8);
+ uSrc24 = gpuLightingTXT24(uSrc, bgr0888);
if (CF_BLEND && srcMSB)
uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
if ( CF_GOURAUD)
uSrc = gpuLightingTXTGouraud(uSrc, l_gCol);
if (!CF_GOURAUD)
- uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+ uSrc = gpuLightingTXT(uSrc, bgr0888);
}
should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
pDst++;
l_u = (l_u + l_u_inc) & l_u_msk;
l_v += l_v_inc;
- if (CF_LIGHT && CF_GOURAUD)
- l_gCol.raw += l_gInc.raw;
+ if (CF_LIGHT && CF_GOURAUD) {
+ l_gCol += l_gInc;
+ l_gCol.get_counter(pcounter);
+ }
+ pcounter--;
}
- while (--count);
+ while (pcounter >= 0);
}
}
////////////////////////////////////////////////////////////////////////////////
GPU_INLINE gcol_t gpuPackGouraudCol(u32 r, u32 g, u32 b)
{
- return (gcol_t){
+ return (gcol_t){{
(u16)(r >> 2),
(u16)(g >> 2),
(u16)(b >> 2),
- };
+ 0
+ }};
}
////////////////////////////////////////////////////////////////////////////////
// u16 output: 0bbbbbgggggrrrrr
// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u32 bgr0888)
{
+ // gcc can move this out of the loop if it wants to
+ uint_fast32_t b5 = (bgr0888 >> 19);
+ uint_fast32_t g5 = (bgr0888 >> 11) & 0x1f;
+ uint_fast32_t r5 = (bgr0888 >> 3) & 0x1f;
+
return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | b5] << 10) |
(gpu_unai.LightLUT[ (uSrc&0x03E0) | g5] << 5) |
(gpu_unai.LightLUT[((uSrc&0x001F)<<5) | r5] ) |
// u16 output: 0bbbbbgggggrrrrr
// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE uint_fast16_t gpuLightingTXTGouraud(uint_fast16_t uSrc, gcol_t gCol)
+GPU_INLINE uint_fast16_t gpuLightingTXTGouraudGeneric(uint_fast16_t uSrc, gcol_t gCol)
{
return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | (gCol.c.b >> 11)] << 10) |
(gpu_unai.LightLUT[ (uSrc&0x03E0) | (gCol.c.g >> 11)] << 5) |
// ^ bit 31
// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u8 r8, u8 g8, u8 b8)
+GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u32 bgr0888)
{
uint_fast16_t r1 = uSrc&0x001F;
uint_fast16_t g1 = uSrc&0x03E0;
uint_fast16_t b1 = uSrc&0x7C00;
- uint_fast16_t r2 = r8;
- uint_fast16_t g2 = g8;
- uint_fast16_t b2 = b8;
+ uint_fast16_t r2 = bgr0888 & 0x0000ff;
+ uint_fast32_t g2 = bgr0888 & 0x00ff00;
+ uint_fast16_t b2 = bgr0888 >> 16;
u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
- u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;
+ u32 g3 = g1 * g2; if (g3 & 0xFE000000) g3 = ~0xFE000000;
u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;
return ((r3>> 3) ) |
- ((g3>> 8)<<10) |
+ ((g3>>16)<<10) |
((b3>>13)<<20);
}
// ^ bit 16
// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero
////////////////////////////////////////////////////////////////////////////////
+// note: outdated, unused
GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol)
{
uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output
return out;
}
+//#ifdef HAVE_ARMV5E // todo?
+#ifdef HAVE_ARMV6
+
////////////////////////////////////////////////////////////////////////////////
-// Apply fast (low-precision) 5-bit lighting to bgr555 texture color:
+// Apply 8-bit lighting to bgr555 texture color:
//
// INPUT:
-// 'r5','g5','b5' are unsigned 5-bit color values, value of 15
+// 'r8','g8','b8' are unsigned 8-bit color values, value of 127
// is midpoint that doesn't modify that component of texture
// 'uSrc' input: mbbbbbgggggrrrrr
// ^ bit 16
// u16 output: mbbbbbgggggrrrrr
// Where 'X' are fixed-pt bits.
////////////////////////////////////////////////////////////////////////////////
-#ifdef HAVE_ARMV6
-// clang uses smulbb but not gcc, so we need this
-GPU_INLINE int_fast16_t smulbb(int_fast16_t a, int_fast16_t b)
+// on v6 we have single-cycle mul and sat which is better than the LightLUT
+GPU_INLINE u32 gpuLightingTXTARM(u32 uSrc, u32 bgr0888)
{
- int_fast16_t r;
- asm("smulbb %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
- return r;
+ int_fast32_t r, g, b, s_d = uSrc;
+ // has to be in a block, otherwise gcc schedules the insns poorly
+ asm("and %[r], %[s_d], #0x001f\n"
+ "and %[b], %[bgr], #0xff\n"
+ "smulbb %[r], %[r], %[b]\n"
+ "uxtb %[b], %[bgr], ror #8\n"
+ "and %[g], %[s_d], #0x03e0\n"
+ "smulbb %[g], %[g], %[b]\n"
+ "and %[b], %[s_d], #0x7c00\n"
+ "and %[s_d],%[s_d], #0x8000\n"
+ "smulbt %[b], %[b], %[bgr]\n"
+ "usat %[r], #5, %[r], asr #7\n"
+ "usat %[g], #5, %[g], asr #12\n"
+ "usat %[b], #5, %[b], asr #17\n"
+ "orr %[s_d],%[s_d], %[r]\n"
+ "orr %[s_d],%[s_d], %[g], lsl #5\n"
+ "orr %[s_d],%[s_d], %[b], lsl #10\n"
+ : [s_d]"+r"(s_d), [r]"=&r"(r), [g]"=&r"(g), [b]"=&r"(b)
+ : [bgr]"r"(bgr0888));
+ return s_d;
}
+#define gpuLightingTXT gpuLightingTXTARM
-GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+GPU_INLINE u32 gpuLightingTXTGouraudARM(u32 uSrc, gcol_t gCol)
{
- // on v6 we have single-cycle mul and sat which is better than the lut
- int_fast16_t r = smulbb(uSrc & 0x001f, r5);
- int_fast16_t g = smulbb(uSrc & 0x03e0, g5);
- int_fast16_t b = smulbb(uSrc & 0x7c00, b5);
- asm volatile("usat %0, #5, %0, asr #4" : "=r"(r) : "0"(r));
- asm volatile("usat %0, #5, %0, asr #9" : "=r"(g) : "0"(g));
- asm volatile("usat %0, #5, %0, asr #14" : "=r"(b) : "0"(b));
- return (uSrc & 0x8000) | (b << 10) | (g << 5) | r;
+ u32 r, g, s_d = uSrc;
+ asm("str %[b], [sp, #-4]!\n" // conserve regs for gcc
+ "uxtb16 %[b], %[b], ror #8\n" // b = g_rg >> 8 & 0xff00ff
+ "and %[r], %[s_d], #0x001f\n"
+ "and %[g], %[s_d], #0x03e0\n"
+ "smulbb %[r], %[r], %[b]\n"
+ "smulbt %[g], %[g], %[b]\n"
+ "uxtb %[b], %[g_b], ror #8\n"
+ "tst %[s_d], #0x8000\n"
+ "and %[s_d], %[s_d], #0x7c00\n"
+ "smulbb %[b], %[b], %[s_d]\n"
+ "usat %[s_d],#5, %[r], asr #7\n"
+ "usat %[g], #5, %[g], asr #12\n"
+ "usat %[b], #5, %[b], asr #17\n"
+ "orrne %[s_d], %[s_d], #0x8000\n"
+ "orr %[s_d], %[s_d], %[g], lsl #5\n"
+ "orr %[s_d], %[s_d], %[b], lsl #10\n"
+ "ldr %[b], [sp], #4\n"
+ : [s_d]"+r"(s_d), [r]"=&r"(r), [g]"=&r"(g)
+ : [b]"r"(gCol.raw32[0]), [g_b]"r"(gCol.raw32[1])
+ : "cc");
+ return s_d;
}
-#else
-GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
-{
- uint_fast16_t out = 0x03E0;
- u32 db, dg;
+#define gpuLightingTXTGouraud gpuLightingTXTGouraudARM
- // Using `g` for src, `G` for dest
- asm ("and %[dg], %[out], %[src] \n\t" // dg holds 0x000000ggggg00000
- "orr %[dg], %[dg], %[g5] \n\t" // dg holds 0x000000gggggGGGGG
- "and %[db], %[out], %[src], lsr #0x05 \n\t" // db holds 0x000000bbbbb00000
- "ldrb %[dg], [%[lut], %[dg]] \n\t" // dg holds result 0x00000000000ggggg
- "and %[out], %[out], %[src], lsl #0x05 \n\t" // out holds 0x000000rrrrr00000
- "orr %[out], %[out], %[r5] \n\t" // out holds 0x000000rrrrrRRRRR
- "orr %[db], %[db], %[b5] \n\t" // db holds 0x000000bbbbbBBBBB
- "ldrb %[out], [%[lut], %[out]] \n\t" // out holds result 0x00000000000rrrrr
- "ldrb %[db], [%[lut], %[db]] \n\t" // db holds result 0x00000000000bbbbb
- "tst %[src], #0x8000\n\t" // check whether msb was set on uSrc
- "orr %[out], %[out], %[dg], lsl #0x05 \n\t" // out holds 0x000000gggggrrrrr
- "orrne %[out], %[out], #0x8000\n\t" // add msb to out if set on uSrc
- "orr %[out], %[out], %[db], lsl #0x0A \n\t" // out holds 0xmbbbbbgggggrrrrr
- : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg)
- : [r5] "r" (r5), [g5] "r" (g5), [b5] "r" (b5),
- [lut] "r" (gpu_unai.LightLUT), [src] "r" (uSrc), "0" (out)
- : "cc");
- return out;
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
-//
-// INPUT:
-// 'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet, value of
-// 15.0 is midpoint that does not modify color of texture
-// gCol input : rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX
-// ^ bit 31
-// 'uSrc' input: mbbbbbgggggrrrrr
-// ^ bit 16
-// RETURNS:
-// u16 output: mbbbbbgggggrrrrr
-// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
-////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE uint_fast16_t gpuLightingTXTGouraudARM(uint_fast16_t uSrc, u32 gCol)
-{
- uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output
- u32 db,dg,gtmp;
-
- // Using `g` for src, `G` for dest
- asm ("and %[dg], %[out], %[src] \n\t" // dg holds 0x000000ggggg00000
- "and %[gtmp],%[out], %[gCol], lsr #0x0B \n\t" // gtmp holds 0x000000GGGGG00000
- "and %[db], %[out], %[src], lsr #0x05 \n\t" // db holds 0x000000bbbbb00000
- "orr %[dg], %[dg], %[gtmp], lsr #0x05 \n\t" // dg holds 0x000000gggggGGGGG
- "and %[gtmp],%[out], %[gCol] \n\t" // gtmp holds 0x000000BBBBB00000
- "ldrb %[dg], [%[lut], %[dg]] \n\t" // dg holds result 0x00000000000ggggg
- "and %[out], %[out], %[src], lsl #0x05 \n\t" // out holds 0x000000rrrrr00000
- "orr %[out], %[out], %[gCol], lsr #0x1B \n\t" // out holds 0x000000rrrrrRRRRR
- "orr %[db], %[db], %[gtmp], lsr #0x05 \n\t" // db holds 0x000000bbbbbBBBBB
- "ldrb %[out], [%[lut], %[out]] \n\t" // out holds result 0x00000000000rrrrr
- "ldrb %[db], [%[lut], %[db]] \n\t" // db holds result 0x00000000000bbbbb
- "tst %[src], #0x8000\n\t" // check whether msb was set on uSrc
- "orr %[out], %[out], %[dg], lsl #0x05 \n\t" // out holds 0x000000gggggrrrrr
- "orrne %[out], %[out], #0x8000\n\t" // add msb to out if set on uSrc
- "orr %[out], %[out], %[db], lsl #0x0A \n\t" // out holds 0xmbbbbbgggggrrrrr
- : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg),
- [gtmp] "=&r" (gtmp) \
- : [gCol] "r" (gCol), [lut] "r" (gpu_unai.LightLUT), "0" (out), [src] "r" (uSrc)
- : "cc");
-
- return out;
-}
+#endif // HAVE_ARMV6
#endif //_OP_LIGHT_ARM_H_