gpu_unai: more asm
author notaz <notasas@gmail.com>
Sat, 23 Nov 2024 23:32:12 +0000 (01:32 +0200)
committer notaz <notasas@gmail.com>
Sun, 24 Nov 2024 22:12:34 +0000 (00:12 +0200)
plugins/gpu_unai/gpu_arm.S
plugins/gpu_unai/gpu_arm.h
plugins/gpu_unai/gpu_command.h
plugins/gpu_unai/gpu_inner.h
plugins/gpu_unai/gpu_raster_polygon.h
plugins/gpu_unai/gpu_raster_sprite.h
plugins/gpu_unai/gpu_unai.h
plugins/gpu_unai/gpulib_if.cpp

index 9326993..3b68ace 100644 (file)
@@ -207,4 +207,76 @@ sprite_driver_8bpp_asm:
     .cfi_endproc
 
 
+.global poly_4bpp_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
+poly_4bpp_asm:
+    .cfi_startproc
+    add     r12, r1, #4        @ r12 = inn + 4, i.e. the address of the CBA field
+    stmfd   sp!, {r4-r7,lr}
+    .cfi_def_cfa_offset 4*5
+    .cfi_rel_offset lr, 4*4
+    ldmia   r12, {r3, r4, r7, r12, lr} @ clut, u, v, u_msk, v_msk (struct offsets 0x04..0x14)
+    ldr     r5, [r1, #0x18]    @ u_inc
+    mov     r6, r12            @ keep u_msk in r6, r12 is reused as scratch from here on
+    ldr     r12,[r1, #0x1c]    @ v_inc
+    and     r4, r4, r6         @ u &= u_msk
+    and     lr, lr, r7         @ v_msk & v
+    and     lr, lr, #0xff<<10  @ keep only bits 10..17 of the masked v
+    tst     r12,r12
+    bne     poly_4bpp_asm_v    @ v_inc != 0: use the variant that also steps v per pixel
+    ldr     r1, [r1]           @ src
+    mov     r7, r4, lsr #13    @ r7 = u >> 13, index of the 32-bit word (8 texels of 4 bits each)
+    add     r1, r1, lr, lsl #1 @ r1 = src + (masked v << 1), a byte address; constant for the whole span here
+    add     r12,r1, r7, lsl #2 @ address of the first word the loop will load
+    pld_    r12,#2048          @ pld_ is a macro defined outside this hunk; presumably a preload hint at r12+2048 - confirm
+0:
+    ldr     lr, [r1, r7, lsl #2] @ lr = 32-bit word holding the current texel
+    lsr     r12,r4, #8
+    and     r12,r12,#0x1c      @ r12 = ((u >> 10) & 7) * 4, bit position of the texel inside the word
+    sub     r12,r12,#1         @ rotate one bit less so the nibble ends up in bits 1..4 (index * 2)
+    mov     r12,lr, ror r12    @ when r12 is -1 (texel 0) the low byte is 255, which acts as ror #31
+    add     r4, r4, r5         @ u += u_inc
+    and     r12,r12,#0x1e      @ r12 = texel value * 2, byte offset of the 16-bit clut entry
+    and     r4, r4, r6         @ u &= u_msk
+    ldrh    r12,[r3, r12]      @ r12 = clut entry for this texel
+    add     r0, r0, #2         @ advance the destination by one 16-bit pixel
+    mov     r7, r4, lsr #13    @ word index for the next pixel
+    tst     r12,r12
+    strneh  r12,[r0, #-2]      @ write the pixel only when the clut entry is non-zero
+    subs    r2, r2, #1
+    bgt     0b                 @ repeat while the decremented count is > 0; count is not tested before the first pixel
+
+    ldmfd   sp!, {r4-r7,pc}
+
+poly_4bpp_asm_v: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+    stmfd   sp!, {r8-r9}       @ NOTE(review): this push has no matching .cfi directive - confirm whether unwinding matters here
+    ldr     r9, [r1, #0x14]    @ v_msk
+    ldr     r1, [r1]           @ src
+    mov     r8, r12            @ v_inc
+    mov     r12,r4, lsr #13    @ r12 = u >> 13, word index for the first pixel
+    add     lr, r1, lr, lsl #1 @ lr = src + (masked v << 1), byte address for the first pixel
+    and     r9, r9, #0xff<<10  @ v_msk_final
+0:
+    ldr     lr, [lr, r12, lsl #2] @ lr = 32-bit word holding the current texel
+    lsr     r12,r4, #8
+    and     r12,r12,#0x1c      @ r12 = ((u >> 10) & 7) * 4, bit position of the texel inside the word
+    sub     r12,r12,#1         @ rotate one bit less so the nibble ends up in bits 1..4 (index * 2)
+    mov     r12,lr, ror r12    @ when r12 is -1 (texel 0) the low byte is 255, which acts as ror #31
+    add     r4, r4, r5         @ u += u_inc
+    and     r12,r12,#0x1e      @ r12 = texel value * 2, byte offset of the 16-bit clut entry
+    and     r4, r4, r6         @ u &= u_msk
+    ldrh    r12,[r3, r12]      @ r12 = clut entry for this texel
+    add     r0, r0, #2         @ advance the destination by one 16-bit pixel
+    add     r7, r7, r8         @ v += v_inc (r7 itself is never masked, only the copy in lr is)
+    and     lr, r7, r9         @ lr = v & v_msk_final
+    tst     r12,r12
+    add     lr, r1, lr, lsl #1 @ lr = src + (masked v << 1), byte address for the next pixel
+    strneh  r12,[r0, #-2]      @ write the pixel only when the clut entry is non-zero
+    mov     r12,r4, lsr #13    @ word index for the next pixel
+    subs    r2, r2, #1
+    bgt     0b                 @ repeat while the decremented count is > 0; count is not tested before the first pixel
+
+    ldmfd   sp!, {r8-r9}
+    ldmfd   sp!, {r4-r7,pc}
+    .cfi_endproc
+
 @ vim:filetype=armasm
index 2329c46..287846e 100644 (file)
@@ -5,6 +5,7 @@
 extern "C" {
 #endif
 
+struct gpu_unai_inner_t;
 struct spriteDriverArg;
 
 void sprite_driver_4bpp_asm(void *pPixel, const u8 *pTxt_base,
@@ -13,6 +14,8 @@ void sprite_driver_8bpp_asm(void *pPixel, const u8 *pTxt_base,
        u32 count, const struct spriteDriverArg *arg);
 void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
 
+void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+
 #ifdef __cplusplus
 }
 #endif
index cf6b62b..adede2b 100644 (file)
@@ -45,13 +45,13 @@ void gpuSetTexture(u16 tpage)
        
        gpu_unai.BLEND_MODE  = ((tpage>>5) & 3) << 3;
        gpu_unai.TEXT_MODE   = (tmode + 1) << 5; // gpu_unai.TEXT_MODE should be values 1..3, so add one
-       gpu_unai.TBA = &gpu_unai.vram[FRAME_OFFSET(tx, ty)];
+       gpu_unai.inn.TBA = &gpu_unai.vram[FRAME_OFFSET(tx, ty)];
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 INLINE void gpuSetCLUT(u16 clut)
 {
-       gpu_unai.CBA = &gpu_unai.vram[(clut & 0x7FFF) << 4];
+       gpu_unai.inn.CBA = &gpu_unai.vram[(clut & 0x7FFF) << 4]; // bit 15 of clut is dropped; the remaining value times 16 is the vram element index stored as the CLUT base
 }
 
 #ifdef  ENABLE_GPU_NULL_SUPPORT
index a80c3a3..4f2b115 100644 (file)
@@ -385,9 +385,9 @@ static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
 
        u8 r5, g5, b5;
        if (CF_LIGHT) {
-               r5 = gpu_unai.r5;
-               g5 = gpu_unai.g5;
-               b5 = gpu_unai.b5;
+               r5 = gpu_unai.inn.r5;
+               g5 = gpu_unai.inn.g5;
+               b5 = gpu_unai.inn.b5;
        }
 
        if (CF_TEXTMODE==3) {
@@ -531,6 +531,8 @@ const PS gpuSpriteDrivers[256] = {
 #undef TI
 #undef TN
 #undef TIBLOCK
+#undef TA4
+#undef TA8
 
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Polygon innerloops generator
@@ -569,7 +571,7 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
                if (!CF_GOURAUD)
                {
                        // UNTEXTURED, NO GOURAUD
-                       const u16 pix15 = gpu_unai.PixelData;
+                       const u16 pix15 = gpu_unai.inn.PixelData;
                        do {
                                uint_fast16_t uSrc, uDst;
 
@@ -596,8 +598,8 @@ endpolynotextnogou:
                else
                {
                        // UNTEXTURED, GOURAUD
-                       gcol_t l_gCol = gpu_unai.gCol;
-                       gcol_t l_gInc = gpu_unai.gInc;
+                       gcol_t l_gCol = gpu_unai.inn.gCol;
+                       gcol_t l_gInc = gpu_unai.inn.gInc;
 
                        do {
                                uint_fast16_t uDst, uSrc;
@@ -643,12 +645,15 @@ endpolynotextgou:
                //senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into
                // one 32-bit unsigned int, but this proved to lose too much accuracy
                // (pixel drouputs noticeable in NFS3 sky), so now are separate vars.
-               u32 l_u_msk = gpu_unai.u_msk;     u32 l_v_msk = gpu_unai.v_msk;
-               u32 l_u = gpu_unai.u & l_u_msk;   u32 l_v = gpu_unai.v & l_v_msk;
-               s32 l_u_inc = gpu_unai.u_inc;     s32 l_v_inc = gpu_unai.v_inc;
+               u32 l_u_msk = gpu_unai.inn.u_msk;     u32 l_v_msk = gpu_unai.inn.v_msk;
+               u32 l_u = gpu_unai.inn.u & l_u_msk;   u32 l_v = gpu_unai.inn.v & l_v_msk;
+               s32 l_u_inc = gpu_unai.inn.u_inc;     s32 l_v_inc = gpu_unai.inn.v_inc;
+               l_v <<= 1;
+               l_v_inc <<= 1;
+               l_v_msk = (l_v_msk & (0xff<<10)) << 1;
 
-               const le16_t* TBA_ = gpu_unai.TBA;
-               const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA;
+               const le16_t* TBA_ = gpu_unai.inn.TBA;
+               const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.inn.CBA;
 
                u8 r5, g5, b5;
                u8 r8, g8, b8;
@@ -657,17 +662,17 @@ endpolynotextgou:
 
                if (CF_LIGHT) {
                        if (CF_GOURAUD) {
-                               l_gInc = gpu_unai.gInc;
-                               l_gCol = gpu_unai.gCol;
+                               l_gInc = gpu_unai.inn.gInc;
+                               l_gCol = gpu_unai.inn.gCol;
                        } else {
                                if (CF_DITHER) {
-                                       r8 = gpu_unai.r8;
-                                       g8 = gpu_unai.g8;
-                                       b8 = gpu_unai.b8;
+                                       r8 = gpu_unai.inn.r8;
+                                       g8 = gpu_unai.inn.g8;
+                                       b8 = gpu_unai.inn.b8;
                                } else {
-                                       r5 = gpu_unai.r5;
-                                       g5 = gpu_unai.g5;
-                                       b5 = gpu_unai.b5;
+                                       r5 = gpu_unai.inn.r5;
+                                       g5 = gpu_unai.inn.g5;
+                                       b5 = gpu_unai.inn.b5;
                                }
                        }
                }
@@ -682,17 +687,19 @@ endpolynotextgou:
                        //           (UNAI originally used 16.16)
                        if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
                                u32 tu=(l_u>>10);
-                               u32 tv=(l_v<<1)&(0xff<<11);
+                               u32 tv=l_v&l_v_msk;
                                u8 rgb=((u8*)TBA_)[tv+(tu>>1)];
                                uSrc=le16_to_u16(CBA_[(rgb>>((tu&1)<<2))&0xf]);
                                if (!uSrc) goto endpolytext;
                        }
                        if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
-                               uSrc = le16_to_u16(CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])]);
+                               u32 tv=l_v&l_v_msk;
+                               uSrc = le16_to_u16(CBA_[((u8*)TBA_)[tv+(l_u>>10)]]);
                                if (!uSrc) goto endpolytext;
                        }
                        if (CF_TEXTMODE==3) {  // 16bpp
-                               uSrc = le16_to_u16(TBA_[(l_u>>10)+((l_v)&(0xff<<10))]);
+                               u32 tv=(l_v&l_v_msk)>>1;
+                               uSrc = le16_to_u16(TBA_[tv+(l_u>>10)]);
                                if (!uSrc) goto endpolytext;
                        }
 
@@ -736,7 +743,7 @@ endpolynotextgou:
 endpolytext:
                        pDst++;
                        l_u = (l_u + l_u_inc) & l_u_msk;
-                       l_v = (l_v + l_v_inc) & l_v_msk;
+                       l_v += l_v_inc;
                        if (CF_LIGHT && CF_GOURAUD)
                                l_gCol.raw += l_gInc.raw;
                }
@@ -744,6 +751,13 @@ endpolytext:
        }
 }
 
+#ifdef __arm__ // the assembly routine called below is only referenced on ARM builds
+static void PolySpan4bppAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+{ // Adapter: same parameter list as the PP span-driver typedef, forwards to poly_4bpp_asm
+       poly_4bpp_asm(pDst, &gpu_unai.inn, count); // only the inner-loop sub-struct is passed; the u32 count converts to the routine's int parameter
+}
+#endif // __arm__
+
 static void PolyNULL(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
 {
        #ifdef ENABLE_GPU_LOG_SUPPORT
@@ -758,12 +772,17 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
 // Template instantiation helper macros
 #define TI(cf) gpuPolySpanFn<(cf)>
 #define TN     PolyNULL
+#ifdef __arm__ // ARM builds: the slot that uses TA4 gets the assembly-backed span driver
+#define TA4(cf) PolySpan4bppAsm // NOTE(review): cf is ignored, so every TIBLOCK(ub) expansion gets the same routine whatever flags ub carries - confirm the asm is valid for all of them
+#else // other targets: identical to a plain TI(cf) template instantiation
+#define TA4(cf) TI(cf)
+#endif // __arm__
 #define TIBLOCK(ub) \
        TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
        TN,            TN,            TI((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
        TN,            TN,            TI((ub)|0x12), TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
        TN,            TN,            TI((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
-       TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TA4((ub)|0x20),TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
        TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
        TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
        TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
@@ -800,5 +819,7 @@ const PP gpuPolySpanDrivers[2048] = {
 #undef TI
 #undef TN
 #undef TIBLOCK
+#undef TA4
+#undef TA8
 
 #endif /* __GPU_UNAI_GPU_INNER_H__ */
index ebd52eb..1457afd 100644 (file)
@@ -227,7 +227,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
        PolyType ptype = POLYTYPE_F)
 {
        // Set up bgr555 color to be used across calls in inner driver
-       gpu_unai.PixelData = GPU_RGB16(le32_to_u32(packet.U4[0]));
+       gpu_unai.inn.PixelData = GPU_RGB16(le32_to_u32(packet.U4[0]));
 
        PolyVertex vbuf[4];
        polyInitVertexBuffer(vbuf, packet, ptype, is_quad);
@@ -379,13 +379,13 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
        PolyType ptype = POLYTYPE_FT)
 {
        // r8/g8/b8 used if texture-blending & dithering is applied (24-bit light)
-       gpu_unai.r8 = packet.U1[0];
-       gpu_unai.g8 = packet.U1[1];
-       gpu_unai.b8 = packet.U1[2];
+       gpu_unai.inn.r8 = packet.U1[0];
+       gpu_unai.inn.g8 = packet.U1[1];
+       gpu_unai.inn.b8 = packet.U1[2];
        // r5/g5/b5 used if just texture-blending is applied (15-bit light)
-       gpu_unai.r5 = packet.U1[0] >> 3;
-       gpu_unai.g5 = packet.U1[1] >> 3;
-       gpu_unai.b5 = packet.U1[2] >> 3;
+       gpu_unai.inn.r5 = packet.U1[0] >> 3;
+       gpu_unai.inn.g5 = packet.U1[1] >> 3;
+       gpu_unai.inn.b5 = packet.U1[2] >> 3;
 
        PolyVertex vbuf[4];
        polyInitVertexBuffer(vbuf, packet, ptype, is_quad);
@@ -462,8 +462,8 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 #endif
 #endif
                // Set u,v increments for inner driver
-               gpu_unai.u_inc = du4;
-               gpu_unai.v_inc = dv4;
+               gpu_unai.inn.u_inc = du4;
+               gpu_unai.inn.v_inc = dv4;
 
                //senquack - TODO: why is it always going through 2 iterations when sometimes one would suffice here?
                //                       (SAME ISSUE ELSEWHERE)
@@ -695,8 +695,8 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
                                }
 
                                // Set u,v coords for inner driver
-                               gpu_unai.u = u4;
-                               gpu_unai.v = v4;
+                               gpu_unai.inn.u = u4;
+                               gpu_unai.inn.v = v4;
 
                                if (xb > xmax) xb = xmax;
                                if ((xb - xa) > 0)
@@ -792,7 +792,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
 #endif
 #endif
                // Setup packed Gouraud increment for inner driver
-               gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+               gpu_unai.inn.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
 
                for (s32 loop0 = 2; loop0; loop0--) {
                        if (loop0 == 2) {
@@ -1044,7 +1044,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
                                }
 
                                // Setup packed Gouraud color for inner driver
-                               gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+                               gpu_unai.inn.gCol = gpuPackGouraudCol(r4, g4, b4);
 
                                if (xb > xmax) xb = xmax;
                                if ((xb - xa) > 0)
@@ -1158,9 +1158,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
 #endif
 #endif
                // Set u,v increments and packed Gouraud increment for inner driver
-               gpu_unai.u_inc = du4;
-               gpu_unai.v_inc = dv4;
-               gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+               gpu_unai.inn.u_inc = du4;
+               gpu_unai.inn.v_inc = dv4;
+               gpu_unai.inn.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
 
                for (s32 loop0 = 2; loop0; loop0--) {
                        if (loop0 == 2) {
@@ -1448,9 +1448,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
                                }
 
                                // Set packed Gouraud color and u,v coords for inner driver
-                               gpu_unai.u = u4;
-                               gpu_unai.v = v4;
-                               gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+                               gpu_unai.inn.u = u4;
+                               gpu_unai.inn.v = v4;
+                               gpu_unai.inn.gCol = gpuPackGouraudCol(r4, g4, b4);
 
                                if (xb > xmax) xb = xmax;
                                if ((xb - xa) > 0)
index 2564e7f..13d783e 100644 (file)
@@ -61,22 +61,22 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out)
        *w_out = x1;
        *h_out = y1 - y0;
 
-       gpu_unai.r5 = packet.U1[0] >> 3;
-       gpu_unai.g5 = packet.U1[1] >> 3;
-       gpu_unai.b5 = packet.U1[2] >> 3;
+       gpu_unai.inn.r5 = packet.U1[0] >> 3;
+       gpu_unai.inn.g5 = packet.U1[1] >> 3;
+       gpu_unai.inn.b5 = packet.U1[2] >> 3;
 
        le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
        const int li=gpu_unai.ilace_mask;
        //const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
        //const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
        unsigned int tmode = gpu_unai.TEXT_MODE >> 5;
-       u8* pTxt_base = (u8*)gpu_unai.TBA;
+       u8* pTxt_base = (u8*)gpu_unai.inn.TBA;
 
        // Texture is accessed byte-wise, so adjust idx if 16bpp
        if (tmode == 3) u0 <<= 1;
 
        spriteDriverArg arg;
-       arg.CBA = gpu_unai.CBA;
+       arg.CBA = gpu_unai.inn.CBA;
        arg.u0 = u0;
        arg.v0 = v0;
        arg.u0_mask = gpu_unai.TextureWindow[2];
index 844a8fd..fff9126 100644 (file)
@@ -196,6 +196,34 @@ static inline s32 GPU_DIV(s32 rs, s32 rt)
 // 'Unsafe' version of above that doesn't check for div-by-zero
 #define GPU_FAST_DIV(rs, rt) ((signed)(rs) / (signed)(rt))
 
+// warning: gpu_arm.S asm uses this struct (through hard-coded byte offsets), update the asm if you change this
+struct gpu_unai_inner_t { // the hex numbers in the field comments are byte offsets; they assume 4-byte pointers
+       le16_t* TBA;              // 00 Ptr to current texture in VRAM
+       le16_t* CBA;              // 04 Ptr to current CLUT in VRAM
+
+       // 22.10 Fixed-pt texture coords, mask, scanline advance
+       // NOTE: U,V are no longer packed together into one u32, this proved to be
+       //  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
+       u32 u, v;                 // 08 (u), 0c (v)
+       u32 u_msk, v_msk;         // 10 (u_msk), 14 (v_msk)
+       s32 u_inc, v_inc;         // 18 (u_inc), 1c (v_inc)
+
+       // Color for Gouraud-shaded prims
+       // Fixed-pt 8.8 rgb triplet
+       // Packed fixed-pt 8.3:8.3:8.2 rgb triplet
+       //  layout:  ccccccccXXXXXXXX for c in [r, g, b]
+       //           ^ bit 16
+       gcol_t gCol;       // Current color; offset depends on sizeof(gcol_t), which is not visible here
+       gcol_t gInc;       // Increment along scanline for gCol
+
+       // Color for flat-shaded, texture-blended prims
+       u8  r5, g5, b5;    // 5-bit light for undithered prims
+       u8  r8, g8, b8;    // 8-bit light for dithered prims
+
+       // Color for flat-shaded, untextured prims
+       u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+};
+
 struct gpu_unai_t {
        u32 GPU_GP1;
        GPUPacket PacketBuffer;
@@ -260,33 +288,11 @@ struct gpu_unai_t {
        s16 DrawingOffset[2];  // [0] : Drawing offset X (signed)
                               // [1] : Drawing offset Y (signed)
 
-       le16_t* TBA;              // Ptr to current texture in VRAM
-       le16_t* CBA;              // Ptr to current CLUT in VRAM
-
        ////////////////////////////////////////////////////////////////////////////
        //  Inner Loop parameters
 
-       // 22.10 Fixed-pt texture coords, mask, scanline advance
-       // NOTE: U,V are no longer packed together into one u32, this proved to be
-       //  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
-       u32 u, v;
-       u32 u_msk, v_msk;
-       s32 u_inc, v_inc;
-
-       // Color for Gouraud-shaded prims
-       // Fixed-pt 8.8 rgb triplet
-       // Packed fixed-pt 8.3:8.3:8.2 rgb triplet
-       //  layout:  ccccccccXXXXXXXX for c in [r, g, b]
-       //           ^ bit 16
-       gcol_t gCol;
-       gcol_t gInc;       // Increment along scanline for gCol
-
-       // Color for flat-shaded, texture-blended prims
-       u8  r5, g5, b5;    // 5-bit light for undithered prims
-       u8  r8, g8, b8;    // 8-bit light for dithered prims
-
-       // Color for flat-shaded, untextured prims
-       u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+       __attribute__((aligned(32)))
+       gpu_unai_inner_t inn;
 
        // End of inner Loop parameters
        ////////////////////////////////////////////////////////////////////////////
@@ -319,7 +325,7 @@ struct gpu_unai_t {
        u32 DitherMatrix[64];   // Matrix of dither coefficients
 };
 
-static gpu_unai_t gpu_unai;
+static __attribute__((aligned(32))) gpu_unai_t gpu_unai;
 
 // Global config that frontend can alter.. Values are read in GPU_init().
 // TODO: if frontend menu modifies a setting, add a function that can notify
index 733b255..53a1b1d 100644 (file)
@@ -243,8 +243,8 @@ int renderer_init(void)
   //senquack - new vars must be updated whenever texture window is changed:
   //           (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
   const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
-  gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
-  gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+  gpu_unai.inn.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+  gpu_unai.inn.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
 
   // Configuration options
   gpu_unai.config = gpu_unai_config_ext;
@@ -340,8 +340,8 @@ static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word)
 
         // Inner loop vars must be updated whenever texture window is changed:
         const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
-        gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
-        gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+        gpu_unai.inn.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+        gpu_unai.inn.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
 
         gpuSetTexture(gpu_unai.GPU_GP1);
       }