.cfi_endproc
+.global poly_4bpp_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
+poly_4bpp_asm: @ draws one span of 4bpp CLUT-textured pixels: r0=d (16-bit dest), r1=inn, r2=count; texels whose CLUT entry is 0 are not written
+ .cfi_startproc
+ add r12, r1, #4 @ r12 = address of the field at offset 4 (CBA), start of the block load below
+ stmfd sp!, {r4-r7,lr} @ save the registers used below plus the return address
+ .cfi_def_cfa_offset 4*5
+ .cfi_rel_offset lr, 4*4
+ ldmia r12, {r3, r4, r7, r12, lr} @ clut, u, v, u_msk, v_msk
+ ldr r5, [r1, #0x18] @ u_inc
+ mov r6, r12 @ r6 = u_msk, frees r12 for v_inc
+ ldr r12,[r1, #0x1c] @ v_inc
+ and r4, r4, r6 @ u &= u_msk
+ and lr, lr, r7 @ v_msk & v
+ and lr, lr, #0xff<<10 @ keep only bits 10-17, the 8 integer bits of the 22.10 v coordinate
+ tst r12,r12 @ is v_inc zero?
+ bne poly_4bpp_asm_v @ non-zero: use the variant that also steps v per pixel
+ ldr r1, [r1] @ src
+ mov r7, r4, lsr #13 @ word index in the row: u>>10 is the texel, 8 four-bit texels per 32-bit word
+ add r1, r1, lr, lsl #1 @ r1 = src + masked v * 2 bytes (start of the texture row)
+ add r12,r1, r7, lsl #2 @ address of the first texture word of the span
+ pld_ r12,#2048 @ pld_ is a macro defined outside this hunk; presumably a prefetch hint - confirm
+0:
+ ldr lr, [r1, r7, lsl #2] @ fetch the 32-bit word holding the current texel
+ lsr r12,r4, #8
+ and r12,r12,#0x1c @ r12 = (texel & 7) * 4 = bit position of the texel's nibble
+ sub r12,r12,#1 @ rotate one bit less so the nibble ends up at bits 1-4
+ mov r12,lr, ror r12 @ for nibble 0 the amount is -1, which acts as a rotate right by 31
+ add r4, r4, r5 @ u += u_inc
+ and r12,r12,#0x1e @ r12 = nibble * 2 = byte offset of the CLUT entry
+ and r4, r4, r6 @ wrap u with u_msk
+ ldrh r12,[r3, r12] @ r12 = 16-bit colour from the CLUT
+ add r0, r0, #2 @ advance dest early; the store below compensates with #-2
+ mov r7, r4, lsr #13 @ word index for the next pixel
+ tst r12,r12 @ colour == 0?
+ strneh r12,[r0, #-2] @ store the pixel only when the colour is non-zero
+ subs r2, r2, #1
+ bgt 0b @ loop while pixels remain (the body always runs at least once)
+
+ ldmfd sp!, {r4-r7,pc} @ restore registers and return
+
+poly_4bpp_asm_v: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+ stmfd sp!, {r8-r9} @ two extra registers; NOTE(review): no CFI directive accompanies this push
+ ldr r9, [r1, #0x14] @ v_msk
+ ldr r1, [r1] @ src
+ mov r8, r12 @ v_inc
+ mov r12,r4, lsr #13 @ word index within the row (u>>10 texels, 8 per word)
+ add lr, r1, lr, lsl #1 @ lr = src + masked v * 2 bytes (current texture row)
+ and r9, r9, #0xff<<10 @ v_msk_final
+0:
+ ldr lr, [lr, r12, lsl #2] @ fetch the 32-bit word holding the current texel
+ lsr r12,r4, #8
+ and r12,r12,#0x1c @ r12 = (texel & 7) * 4 = bit position of the texel's nibble
+ sub r12,r12,#1 @ rotate one bit less so the nibble ends up at bits 1-4
+ mov r12,lr, ror r12 @ for nibble 0 the amount is -1, which acts as a rotate right by 31
+ add r4, r4, r5 @ u += u_inc
+ and r12,r12,#0x1e @ r12 = nibble * 2 = byte offset of the CLUT entry
+ and r4, r4, r6 @ wrap u with u_msk
+ ldrh r12,[r3, r12] @ r12 = 16-bit colour from the CLUT
+ add r0, r0, #2 @ advance dest early; the store below compensates with #-2
+ add r7, r7, r8 @ v += v_inc (r7 itself is never masked)
+ and lr, r7, r9 @ masked v for the next pixel
+ tst r12,r12 @ colour == 0?
+ add lr, r1, lr, lsl #1 @ row address for the next pixel
+ strneh r12,[r0, #-2] @ store the pixel only when the colour is non-zero
+ mov r12,r4, lsr #13 @ word index for the next pixel
+ subs r2, r2, #1
+ bgt 0b @ loop while pixels remain (the body always runs at least once)
+
+ ldmfd sp!, {r8-r9} @ undo the extra push made on entry to this variant
+ ldmfd sp!, {r4-r7,pc} @ restore registers and return
+ .cfi_endproc
+
@ vim:filetype=armasm
extern "C" {
#endif
+struct gpu_unai_inner_t; // forward declaration; only a pointer to it is needed by the prototype below
struct spriteDriverArg;
void sprite_driver_4bpp_asm(void *pPixel, const u8 *pTxt_base,
u32 count, const struct spriteDriverArg *arg);
void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
+void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count); // span routine: d = destination pixels, inn = inner-loop parameters, count = number of pixels
+
#ifdef __cplusplus
}
#endif
gpu_unai.BLEND_MODE = ((tpage>>5) & 3) << 3;
gpu_unai.TEXT_MODE = (tmode + 1) << 5; // gpu_unai.TEXT_MODE should be values 1..3, so add one
- gpu_unai.TBA = &gpu_unai.vram[FRAME_OFFSET(tx, ty)];
+ gpu_unai.inn.TBA = &gpu_unai.vram[FRAME_OFFSET(tx, ty)];
}
///////////////////////////////////////////////////////////////////////////////
INLINE void gpuSetCLUT(u16 clut)
{
- gpu_unai.CBA = &gpu_unai.vram[(clut & 0x7FFF) << 4];
+ gpu_unai.inn.CBA = &gpu_unai.vram[(clut & 0x7FFF) << 4]; // point the inner-loop CLUT base at vram entry (clut & 0x7FFF) * 16
}
#ifdef ENABLE_GPU_NULL_SUPPORT
u8 r5, g5, b5;
if (CF_LIGHT) {
- r5 = gpu_unai.r5;
- g5 = gpu_unai.g5;
- b5 = gpu_unai.b5;
+ r5 = gpu_unai.inn.r5;
+ g5 = gpu_unai.inn.g5;
+ b5 = gpu_unai.inn.b5;
}
if (CF_TEXTMODE==3) {
#undef TI
#undef TN
#undef TIBLOCK
+#undef TA4
+#undef TA8
///////////////////////////////////////////////////////////////////////////////
// GPU Polygon innerloops generator
if (!CF_GOURAUD)
{
// UNTEXTURED, NO GOURAUD
- const u16 pix15 = gpu_unai.PixelData;
+ const u16 pix15 = gpu_unai.inn.PixelData;
do {
uint_fast16_t uSrc, uDst;
else
{
// UNTEXTURED, GOURAUD
- gcol_t l_gCol = gpu_unai.gCol;
- gcol_t l_gInc = gpu_unai.gInc;
+ gcol_t l_gCol = gpu_unai.inn.gCol;
+ gcol_t l_gInc = gpu_unai.inn.gInc;
do {
uint_fast16_t uDst, uSrc;
//senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into
// one 32-bit unsigned int, but this proved to lose too much accuracy
// (pixel drouputs noticeable in NFS3 sky), so now are separate vars.
- u32 l_u_msk = gpu_unai.u_msk; u32 l_v_msk = gpu_unai.v_msk;
- u32 l_u = gpu_unai.u & l_u_msk; u32 l_v = gpu_unai.v & l_v_msk;
- s32 l_u_inc = gpu_unai.u_inc; s32 l_v_inc = gpu_unai.v_inc;
+ u32 l_u_msk = gpu_unai.inn.u_msk; u32 l_v_msk = gpu_unai.inn.v_msk;
+ u32 l_u = gpu_unai.inn.u & l_u_msk; u32 l_v = gpu_unai.inn.v & l_v_msk;
+ s32 l_u_inc = gpu_unai.inn.u_inc; s32 l_v_inc = gpu_unai.inn.v_inc;
+ l_v <<= 1;
+ l_v_inc <<= 1;
+ l_v_msk = (l_v_msk & (0xff<<10)) << 1;
- const le16_t* TBA_ = gpu_unai.TBA;
- const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA;
+ const le16_t* TBA_ = gpu_unai.inn.TBA;
+ const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.inn.CBA;
u8 r5, g5, b5;
u8 r8, g8, b8;
if (CF_LIGHT) {
if (CF_GOURAUD) {
- l_gInc = gpu_unai.gInc;
- l_gCol = gpu_unai.gCol;
+ l_gInc = gpu_unai.inn.gInc;
+ l_gCol = gpu_unai.inn.gCol;
} else {
if (CF_DITHER) {
- r8 = gpu_unai.r8;
- g8 = gpu_unai.g8;
- b8 = gpu_unai.b8;
+ r8 = gpu_unai.inn.r8;
+ g8 = gpu_unai.inn.g8;
+ b8 = gpu_unai.inn.b8;
} else {
- r5 = gpu_unai.r5;
- g5 = gpu_unai.g5;
- b5 = gpu_unai.b5;
+ r5 = gpu_unai.inn.r5;
+ g5 = gpu_unai.inn.g5;
+ b5 = gpu_unai.inn.b5;
}
}
}
// (UNAI originally used 16.16)
if (CF_TEXTMODE==1) { // 4bpp (CLUT)
u32 tu=(l_u>>10);
- u32 tv=(l_v<<1)&(0xff<<11);
+ u32 tv=l_v&l_v_msk;
u8 rgb=((u8*)TBA_)[tv+(tu>>1)];
uSrc=le16_to_u16(CBA_[(rgb>>((tu&1)<<2))&0xf]);
if (!uSrc) goto endpolytext;
}
if (CF_TEXTMODE==2) { // 8bpp (CLUT)
- uSrc = le16_to_u16(CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])]);
+ u32 tv=l_v&l_v_msk;
+ uSrc = le16_to_u16(CBA_[((u8*)TBA_)[tv+(l_u>>10)]]);
if (!uSrc) goto endpolytext;
}
if (CF_TEXTMODE==3) { // 16bpp
- uSrc = le16_to_u16(TBA_[(l_u>>10)+((l_v)&(0xff<<10))]);
+ u32 tv=(l_v&l_v_msk)>>1;
+ uSrc = le16_to_u16(TBA_[tv+(l_u>>10)]);
if (!uSrc) goto endpolytext;
}
endpolytext:
pDst++;
l_u = (l_u + l_u_inc) & l_u_msk;
- l_v = (l_v + l_v_inc) & l_v_msk;
+ l_v += l_v_inc;
if (CF_LIGHT && CF_GOURAUD)
l_gCol.raw += l_gInc.raw;
}
}
}
+#ifdef __arm__ // the adapter below is only compiled when __arm__ is defined
+static void PolySpan4bppAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) // same signature as PolyNULL, so it can sit in the same function table
+{
+ poly_4bpp_asm(pDst, &gpu_unai.inn, count); // forward the destination, the inner-loop parameter block and the count (u32 passed as the routine's int argument)
+}
+#endif
+
static void PolyNULL(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
{
#ifdef ENABLE_GPU_LOG_SUPPORT
// Template instantiation helper macros
#define TI(cf) gpuPolySpanFn<(cf)>
#define TN PolyNULL
+#ifdef __arm__ // choose what TA4 table slots expand to
+#define TA4(cf) PolySpan4bppAsm // with __arm__: the asm-backed span function; cf is ignored
+#else
+#define TA4(cf) TI(cf) // otherwise: the regular template instantiation for cf
+#endif
#define TIBLOCK(ub) \
TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
TN, TN, TI((ub)|0x0a), TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \
TN, TN, TI((ub)|0x12), TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \
TN, TN, TI((ub)|0x1a), TI((ub)|0x1b), TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \
- TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+ TA4((ub)|0x20),TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \
TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \
#undef TI
#undef TN
#undef TIBLOCK
+#undef TA4
+#undef TA8
#endif /* __GPU_UNAI_GPU_INNER_H__ */
PolyType ptype = POLYTYPE_F)
{
// Set up bgr555 color to be used across calls in inner driver
- gpu_unai.PixelData = GPU_RGB16(le32_to_u32(packet.U4[0]));
+ gpu_unai.inn.PixelData = GPU_RGB16(le32_to_u32(packet.U4[0]));
PolyVertex vbuf[4];
polyInitVertexBuffer(vbuf, packet, ptype, is_quad);
PolyType ptype = POLYTYPE_FT)
{
// r8/g8/b8 used if texture-blending & dithering is applied (24-bit light)
- gpu_unai.r8 = packet.U1[0];
- gpu_unai.g8 = packet.U1[1];
- gpu_unai.b8 = packet.U1[2];
+ gpu_unai.inn.r8 = packet.U1[0];
+ gpu_unai.inn.g8 = packet.U1[1];
+ gpu_unai.inn.b8 = packet.U1[2];
// r5/g5/b5 used if just texture-blending is applied (15-bit light)
- gpu_unai.r5 = packet.U1[0] >> 3;
- gpu_unai.g5 = packet.U1[1] >> 3;
- gpu_unai.b5 = packet.U1[2] >> 3;
+ gpu_unai.inn.r5 = packet.U1[0] >> 3;
+ gpu_unai.inn.g5 = packet.U1[1] >> 3;
+ gpu_unai.inn.b5 = packet.U1[2] >> 3;
PolyVertex vbuf[4];
polyInitVertexBuffer(vbuf, packet, ptype, is_quad);
#endif
#endif
// Set u,v increments for inner driver
- gpu_unai.u_inc = du4;
- gpu_unai.v_inc = dv4;
+ gpu_unai.inn.u_inc = du4;
+ gpu_unai.inn.v_inc = dv4;
//senquack - TODO: why is it always going through 2 iterations when sometimes one would suffice here?
// (SAME ISSUE ELSEWHERE)
}
// Set u,v coords for inner driver
- gpu_unai.u = u4;
- gpu_unai.v = v4;
+ gpu_unai.inn.u = u4;
+ gpu_unai.inn.v = v4;
if (xb > xmax) xb = xmax;
if ((xb - xa) > 0)
#endif
#endif
// Setup packed Gouraud increment for inner driver
- gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+ gpu_unai.inn.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
for (s32 loop0 = 2; loop0; loop0--) {
if (loop0 == 2) {
}
// Setup packed Gouraud color for inner driver
- gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+ gpu_unai.inn.gCol = gpuPackGouraudCol(r4, g4, b4);
if (xb > xmax) xb = xmax;
if ((xb - xa) > 0)
#endif
#endif
// Set u,v increments and packed Gouraud increment for inner driver
- gpu_unai.u_inc = du4;
- gpu_unai.v_inc = dv4;
- gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+ gpu_unai.inn.u_inc = du4;
+ gpu_unai.inn.v_inc = dv4;
+ gpu_unai.inn.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
for (s32 loop0 = 2; loop0; loop0--) {
if (loop0 == 2) {
}
// Set packed Gouraud color and u,v coords for inner driver
- gpu_unai.u = u4;
- gpu_unai.v = v4;
- gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+ gpu_unai.inn.u = u4;
+ gpu_unai.inn.v = v4;
+ gpu_unai.inn.gCol = gpuPackGouraudCol(r4, g4, b4);
if (xb > xmax) xb = xmax;
if ((xb - xa) > 0)
*w_out = x1;
*h_out = y1 - y0;
- gpu_unai.r5 = packet.U1[0] >> 3;
- gpu_unai.g5 = packet.U1[1] >> 3;
- gpu_unai.b5 = packet.U1[2] >> 3;
+ gpu_unai.inn.r5 = packet.U1[0] >> 3;
+ gpu_unai.inn.g5 = packet.U1[1] >> 3;
+ gpu_unai.inn.b5 = packet.U1[2] >> 3;
le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
const int li=gpu_unai.ilace_mask;
//const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
//const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
unsigned int tmode = gpu_unai.TEXT_MODE >> 5;
- u8* pTxt_base = (u8*)gpu_unai.TBA;
+ u8* pTxt_base = (u8*)gpu_unai.inn.TBA;
// Texture is accessed byte-wise, so adjust idx if 16bpp
if (tmode == 3) u0 <<= 1;
spriteDriverArg arg;
- arg.CBA = gpu_unai.CBA;
+ arg.CBA = gpu_unai.inn.CBA;
arg.u0 = u0;
arg.v0 = v0;
arg.u0_mask = gpu_unai.TextureWindow[2];
// 'Unsafe' version of above that doesn't check for div-by-zero
#define GPU_FAST_DIV(rs, rt) ((signed)(rs) / (signed)(rt))
+// warning: gpu_arm.S asm uses this struct, update the asm if you change this
+struct gpu_unai_inner_t { // parameters read by the span inner loops; the two-digit numbers in the comments below are hex byte offsets
+ le16_t* TBA; // 00 Ptr to current texture in VRAM
+ le16_t* CBA; // 04 Ptr to current CLUT in VRAM
+
+ // 22.10 Fixed-pt texture coords, mask, scanline advance
+ // NOTE: U,V are no longer packed together into one u32, this proved to be
+ // too imprecise, leading to pixel dropouts. Example: NFS3's skybox.
+ u32 u, v; // 08, 0c
+ u32 u_msk, v_msk; // 10, 14
+ s32 u_inc, v_inc; // 18, 1c
+
+ // Color for Gouraud-shaded prims
+ // Fixed-pt 8.8 rgb triplet
+ // Packed fixed-pt 8.3:8.3:8.2 rgb triplet
+ // layout: ccccccccXXXXXXXX for c in [r, g, b]
+ // ^ bit 16
+ gcol_t gCol;
+ gcol_t gInc; // Increment along scanline for gCol
+
+ // Color for flat-shaded, texture-blended prims
+ u8 r5, g5, b5; // 5-bit light for undithered prims
+ u8 r8, g8, b8; // 8-bit light for dithered prims
+
+ // Color for flat-shaded, untextured prims
+ u16 PixelData; // bgr555 color for untextured flat-shaded polys
+};
+
struct gpu_unai_t {
u32 GPU_GP1;
GPUPacket PacketBuffer;
s16 DrawingOffset[2]; // [0] : Drawing offset X (signed)
// [1] : Drawing offset Y (signed)
- le16_t* TBA; // Ptr to current texture in VRAM
- le16_t* CBA; // Ptr to current CLUT in VRAM
-
////////////////////////////////////////////////////////////////////////////
// Inner Loop parameters
- // 22.10 Fixed-pt texture coords, mask, scanline advance
- // NOTE: U,V are no longer packed together into one u32, this proved to be
- // too imprecise, leading to pixel dropouts. Example: NFS3's skybox.
- u32 u, v;
- u32 u_msk, v_msk;
- s32 u_inc, v_inc;
-
- // Color for Gouraud-shaded prims
- // Fixed-pt 8.8 rgb triplet
- // Packed fixed-pt 8.3:8.3:8.2 rgb triplet
- // layout: ccccccccXXXXXXXX for c in [r, g, b]
- // ^ bit 16
- gcol_t gCol;
- gcol_t gInc; // Increment along scanline for gCol
-
- // Color for flat-shaded, texture-blended prims
- u8 r5, g5, b5; // 5-bit light for undithered prims
- u8 r8, g8, b8; // 8-bit light for dithered prims
-
- // Color for flat-shaded, untextured prims
- u16 PixelData; // bgr555 color for untextured flat-shaded polys
+ __attribute__((aligned(32)))
+ gpu_unai_inner_t inn;
// End of inner Loop parameters
////////////////////////////////////////////////////////////////////////////
u32 DitherMatrix[64]; // Matrix of dither coefficients
};
-static gpu_unai_t gpu_unai;
+static __attribute__((aligned(32))) gpu_unai_t gpu_unai;
// Global config that frontend can alter.. Values are read in GPU_init().
// TODO: if frontend menu modifies a setting, add a function that can notify
//senquack - new vars must be updated whenever texture window is changed:
// (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
const u32 fb = FIXED_BITS; // # of fractional fixed-pt bits of u4/v4
- gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
- gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+ gpu_unai.inn.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+ gpu_unai.inn.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
// Configuration options
gpu_unai.config = gpu_unai_config_ext;
// Inner loop vars must be updated whenever texture window is changed:
const u32 fb = FIXED_BITS; // # of fractional fixed-pt bits of u4/v4
- gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
- gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+ gpu_unai.inn.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+ gpu_unai.inn.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
gpuSetTexture(gpu_unai.GPU_GP1);
}