gpu_unai: more asm

author notaz <notasas@gmail.com>

Sat, 23 Nov 2024 23:32:12 +0000 (01:32 +0200)

committer notaz <notasas@gmail.com>

Sun, 24 Nov 2024 22:12:34 +0000 (00:12 +0200)
author notaz <notasas@gmail.com>
Sat, 23 Nov 2024 23:32:12 +0000 (01:32 +0200)
committer notaz <notasas@gmail.com>
Sun, 24 Nov 2024 22:12:34 +0000 (00:12 +0200)
diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S

index 9326993..3b68ace 100644 (file)
--- a/plugins/gpu_unai/gpu_arm.S
+++ b/plugins/gpu_unai/gpu_arm.S
@@ -207,4 +207,76 @@ sprite_driver_8bpp_asm:
      .cfi_endproc
  
  
+.global poly_4bpp_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
+poly_4bpp_asm: @ r0=d (16-bit dst pixels), r1=inn, r2=count; draws one span from a 4bpp CLUT texture
+    .cfi_startproc
+    add     r12, r1, #4        @ r12 = &inn->CBA (struct offset 04)
+    stmfd   sp!, {r4-r7,lr}    @ save r4-r7 and the return address
+    .cfi_def_cfa_offset 4*5
+    .cfi_rel_offset lr, 4*4
+    ldmia   r12, {r3, r4, r7, r12, lr} @ clut, u, v, u_msk, v_msk
+    ldr     r5, [r1, #0x18]    @ u_inc
+    mov     r6, r12            @ r6 = u_msk
+    ldr     r12,[r1, #0x1c]    @ v_inc
+    and     r4, r4, r6         @ u &= u_msk
+    and     lr, lr, r7         @ v_msk & v
+    and     lr, lr, #0xff<<10  @ keep only the 8 integer bits of v (bits 10..17)
+    tst     r12,r12            @ v_inc == 0?
+    bne     poly_4bpp_asm_v    @ no: use the loop that also steps v per pixel
+    ldr     r1, [r1]           @ src
+    mov     r7, r4, lsr #13    @ r7 = u >> 13: index of the 32-bit word (8 texels) holding texel u >> 10
+    add     r1, r1, lr, lsl #1 @ r1 = src + (masked v << 1): byte address of the texture row
+    add     r12,r1, r7, lsl #2 @ r12 = address of the first texel word
+    pld_    r12,#2048          @ pld_ is defined outside this hunk; presumably a prefetch hint - confirm
+0:
+    ldr     lr, [r1, r7, lsl #2] @ lr = 8 packed 4-bit texels
+    lsr     r12,r4, #8
+    and     r12,r12,#0x1c      @ r12 = ((u >> 10) & 7) * 4: bit position of this texel's nibble
+    sub     r12,r12,#1         @ rotate one bit less so the nibble lands in bits 1..4 (index * 2)
+    mov     r12,lr, ror r12
+    add     r4, r4, r5         @ u += u_inc
+    and     r12,r12,#0x1e      @ r12 = CLUT index * 2 = byte offset of the 16-bit CLUT entry
+    and     r4, r4, r6         @ u &= u_msk
+    ldrh    r12,[r3, r12]      @ r12 = colour read from the CLUT
+    add     r0, r0, #2         @ advance dst by one 16-bit pixel
+    mov     r7, r4, lsr #13    @ word index for the next texel
+    tst     r12,r12
+    strneh  r12,[r0, #-2]      @ write the pixel only if the colour is non-zero
+    subs    r2, r2, #1
+    bgt     0b                 @ loop while count > 0 (the body always runs at least once)
+
+    ldmfd   sp!, {r4-r7,pc}    @ restore registers and return
+
+poly_4bpp_asm_v: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+    stmfd   sp!, {r8-r9}       @ two more registers are needed on this path
+    ldr     r9, [r1, #0x14]    @ v_msk
+    ldr     r1, [r1]           @ src
+    mov     r8, r12            @ v_inc
+    mov     r12,r4, lsr #13    @ r12 = index of the word holding the first texel
+    add     lr, r1, lr, lsl #1 @ lr = byte address of the first texture row
+    and     r9, r9, #0xff<<10  @ v_msk_final
+0:
+    ldr     lr, [lr, r12, lsl #2] @ lr = 8 packed 4-bit texels
+    lsr     r12,r4, #8
+    and     r12,r12,#0x1c      @ r12 = ((u >> 10) & 7) * 4: bit position of this texel's nibble
+    sub     r12,r12,#1         @ rotate one bit less so the nibble lands in bits 1..4 (index * 2)
+    mov     r12,lr, ror r12
+    add     r4, r4, r5         @ u += u_inc
+    and     r12,r12,#0x1e      @ r12 = CLUT index * 2 = byte offset of the 16-bit CLUT entry
+    and     r4, r4, r6         @ u &= u_msk
+    ldrh    r12,[r3, r12]      @ r12 = colour read from the CLUT
+    add     r0, r0, #2         @ advance dst by one 16-bit pixel
+    add     r7, r7, r8         @ v += v_inc (r7 is kept unmasked; the mask is applied on each use)
+    and     lr, r7, r9         @ lr = v & v_msk_final
+    tst     r12,r12
+    add     lr, r1, lr, lsl #1 @ lr = byte address of the next pixel's texture row
+    strneh  r12,[r0, #-2]      @ write the pixel only if the colour is non-zero
+    mov     r12,r4, lsr #13    @ word index for the next texel
+    subs    r2, r2, #1
+    bgt     0b                 @ loop while count > 0 (the body always runs at least once)
+
+    ldmfd   sp!, {r8-r9}       @ undo the extra push, then the common epilogue
+    ldmfd   sp!, {r4-r7,pc}
+    .cfi_endproc
+
  @ vim:filetype=armasm
diff --git a/plugins/gpu_unai/gpu_arm.h b/plugins/gpu_unai/gpu_arm.h

index 2329c46..287846e 100644 (file)
--- a/plugins/gpu_unai/gpu_arm.h
+++ b/plugins/gpu_unai/gpu_arm.h
@@ -5,6 +5,7 @@
  extern "C" {
  #endif
  
+struct gpu_unai_inner_t;
  struct spriteDriverArg;
  
  void sprite_driver_4bpp_asm(void *pPixel, const u8 *pTxt_base,
@@ -13,6 +14,8 @@ void sprite_driver_8bpp_asm(void *pPixel, const u8 *pTxt_base,
         u32 count, const struct spriteDriverArg *arg);
  void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
  
+void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/plugins/gpu_unai/gpu_command.h b/plugins/gpu_unai/gpu_command.h

index cf6b62b..adede2b 100644 (file)
--- a/plugins/gpu_unai/gpu_command.h
+++ b/plugins/gpu_unai/gpu_command.h
@@ -45,13 +45,13 @@ void gpuSetTexture(u16 tpage)
         
         gpu_unai.BLEND_MODE  = ((tpage>>5) & 3) << 3;
         gpu_unai.TEXT_MODE   = (tmode + 1) << 5; // gpu_unai.TEXT_MODE should be values 1..3, so add one
-       gpu_unai.TBA = &gpu_unai.vram[FRAME_OFFSET(tx, ty)];
+       gpu_unai.inn.TBA = &gpu_unai.vram[FRAME_OFFSET(tx, ty)];
  }
  
  ///////////////////////////////////////////////////////////////////////////////
  INLINE void gpuSetCLUT(u16 clut)
  {
-       gpu_unai.CBA = &gpu_unai.vram[(clut & 0x7FFF) << 4];
+       gpu_unai.inn.CBA = &gpu_unai.vram[(clut & 0x7FFF) << 4];
  }
  
  #ifdef  ENABLE_GPU_NULL_SUPPORT
diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h

index a80c3a3..4f2b115 100644 (file)
--- a/plugins/gpu_unai/gpu_inner.h
+++ b/plugins/gpu_unai/gpu_inner.h
@@ -385,9 +385,9 @@ static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
  
         u8 r5, g5, b5;
         if (CF_LIGHT) {
-               r5 = gpu_unai.r5;
-               g5 = gpu_unai.g5;
-               b5 = gpu_unai.b5;
+               r5 = gpu_unai.inn.r5;
+               g5 = gpu_unai.inn.g5;
+               b5 = gpu_unai.inn.b5;
         }
  
         if (CF_TEXTMODE==3) {
@@ -531,6 +531,8 @@ const PS gpuSpriteDrivers[256] = {
  #undef TI
  #undef TN
  #undef TIBLOCK
+#undef TA4
+#undef TA8
  
  ///////////////////////////////////////////////////////////////////////////////
  //  GPU Polygon innerloops generator
@@ -569,7 +571,7 @@ static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
                 if (!CF_GOURAUD)
                 {
                         // UNTEXTURED, NO GOURAUD
-                       const u16 pix15 = gpu_unai.PixelData;
+                       const u16 pix15 = gpu_unai.inn.PixelData;
                         do {
                                 uint_fast16_t uSrc, uDst;
  
@@ -596,8 +598,8 @@ endpolynotextnogou:
                 else
                 {
                         // UNTEXTURED, GOURAUD
-                       gcol_t l_gCol = gpu_unai.gCol;
-                       gcol_t l_gInc = gpu_unai.gInc;
+                       gcol_t l_gCol = gpu_unai.inn.gCol;
+                       gcol_t l_gInc = gpu_unai.inn.gInc;
  
                         do {
                                 uint_fast16_t uDst, uSrc;
@@ -643,12 +645,15 @@ endpolynotextgou:
                 //senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into
                 // one 32-bit unsigned int, but this proved to lose too much accuracy
                 // (pixel drouputs noticeable in NFS3 sky), so now are separate vars.
-               u32 l_u_msk = gpu_unai.u_msk;     u32 l_v_msk = gpu_unai.v_msk;
-               u32 l_u = gpu_unai.u & l_u_msk;   u32 l_v = gpu_unai.v & l_v_msk;
-               s32 l_u_inc = gpu_unai.u_inc;     s32 l_v_inc = gpu_unai.v_inc;
+               u32 l_u_msk = gpu_unai.inn.u_msk;     u32 l_v_msk = gpu_unai.inn.v_msk;
+               u32 l_u = gpu_unai.inn.u & l_u_msk;   u32 l_v = gpu_unai.inn.v & l_v_msk;
+               s32 l_u_inc = gpu_unai.inn.u_inc;     s32 l_v_inc = gpu_unai.inn.v_inc;
+               l_v <<= 1;
+               l_v_inc <<= 1;
+               l_v_msk = (l_v_msk & (0xff<<10)) << 1;
  
-               const le16_t* TBA_ = gpu_unai.TBA;
-               const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA;
+               const le16_t* TBA_ = gpu_unai.inn.TBA;
+               const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.inn.CBA;
  
                 u8 r5, g5, b5;
                 u8 r8, g8, b8;
@@ -657,17 +662,17 @@ endpolynotextgou:
  
                 if (CF_LIGHT) {
                         if (CF_GOURAUD) {
-                               l_gInc = gpu_unai.gInc;
-                               l_gCol = gpu_unai.gCol;
+                               l_gInc = gpu_unai.inn.gInc;
+                               l_gCol = gpu_unai.inn.gCol;
                         } else {
                                 if (CF_DITHER) {
-                                       r8 = gpu_unai.r8;
-                                       g8 = gpu_unai.g8;
-                                       b8 = gpu_unai.b8;
+                                       r8 = gpu_unai.inn.r8;
+                                       g8 = gpu_unai.inn.g8;
+                                       b8 = gpu_unai.inn.b8;
                                 } else {
-                                       r5 = gpu_unai.r5;
-                                       g5 = gpu_unai.g5;
-                                       b5 = gpu_unai.b5;
+                                       r5 = gpu_unai.inn.r5;
+                                       g5 = gpu_unai.inn.g5;
+                                       b5 = gpu_unai.inn.b5;
                                 }
                         }
                 }
@@ -682,17 +687,19 @@ endpolynotextgou:
                         //           (UNAI originally used 16.16)
                         if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
                                 u32 tu=(l_u>>10);
-                               u32 tv=(l_v<<1)&(0xff<<11);
+                               u32 tv=l_v&l_v_msk;
                                 u8 rgb=((u8*)TBA_)[tv+(tu>>1)];
                                 uSrc=le16_to_u16(CBA_[(rgb>>((tu&1)<<2))&0xf]);
                                 if (!uSrc) goto endpolytext;
                         }
                         if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
-                               uSrc = le16_to_u16(CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])]);
+                               u32 tv=l_v&l_v_msk;
+                               uSrc = le16_to_u16(CBA_[((u8*)TBA_)[tv+(l_u>>10)]]);
                                 if (!uSrc) goto endpolytext;
                         }
                         if (CF_TEXTMODE==3) {  // 16bpp
-                               uSrc = le16_to_u16(TBA_[(l_u>>10)+((l_v)&(0xff<<10))]);
+                               u32 tv=(l_v&l_v_msk)>>1;
+                               uSrc = le16_to_u16(TBA_[tv+(l_u>>10)]);
                                 if (!uSrc) goto endpolytext;
                         }
  
@@ -736,7 +743,7 @@ endpolynotextgou:
  endpolytext:
                         pDst++;
                         l_u = (l_u + l_u_inc) & l_u_msk;
-                       l_v = (l_v + l_v_inc) & l_v_msk;
+                       l_v += l_v_inc;
                         if (CF_LIGHT && CF_GOURAUD)
                                 l_gCol.raw += l_gInc.raw;
                 }
@@ -744,6 +751,13 @@ endpolytext:
         }
  }
  
+#ifdef __arm__
+static void PolySpan4bppAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count) // adapter giving poly_4bpp_asm the PP span-driver signature
+{
+       poly_4bpp_asm(pDst, &gpu_unai.inn, count); // forwards dst, the inner-loop parameter block and the pixel count
+}
+#endif
+
  static void PolyNULL(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
  {
         #ifdef ENABLE_GPU_LOG_SUPPORT
@@ -758,12 +772,17 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
  // Template instantiation helper macros
  #define TI(cf) gpuPolySpanFn<(cf)>
  #define TN     PolyNULL
+#ifdef __arm__ // the asm span routine only does CLUT lookup + non-zero test, so use it solely for the exact 0x20 variant
+#define TA4(cf) (((cf) == 0x20) ? static_cast<PP>(PolySpan4bppAsm) : static_cast<PP>(TI(cf))) // any extra (ub) flag -> C++ template
+#else
+#define TA4(cf) TI(cf)
+#endif
  #define TIBLOCK(ub) \
         TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
         TN,            TN,            TI((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
         TN,            TN,            TI((ub)|0x12), TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
         TN,            TN,            TI((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
-       TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TA4((ub)|0x20),TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
         TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
         TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
         TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
@@ -800,5 +819,7 @@ const PP gpuPolySpanDrivers[2048] = {
  #undef TI
  #undef TN
  #undef TIBLOCK
+#undef TA4
+#undef TA8
  
  #endif /* __GPU_UNAI_GPU_INNER_H__ */
diff --git a/plugins/gpu_unai/gpu_raster_polygon.h b/plugins/gpu_unai/gpu_raster_polygon.h

index ebd52eb..1457afd 100644 (file)
--- a/plugins/gpu_unai/gpu_raster_polygon.h
+++ b/plugins/gpu_unai/gpu_raster_polygon.h
@@ -227,7 +227,7 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
         PolyType ptype = POLYTYPE_F)
  {
         // Set up bgr555 color to be used across calls in inner driver
-       gpu_unai.PixelData = GPU_RGB16(le32_to_u32(packet.U4[0]));
+       gpu_unai.inn.PixelData = GPU_RGB16(le32_to_u32(packet.U4[0]));
  
         PolyVertex vbuf[4];
         polyInitVertexBuffer(vbuf, packet, ptype, is_quad);
@@ -379,13 +379,13 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
         PolyType ptype = POLYTYPE_FT)
  {
         // r8/g8/b8 used if texture-blending & dithering is applied (24-bit light)
-       gpu_unai.r8 = packet.U1[0];
-       gpu_unai.g8 = packet.U1[1];
-       gpu_unai.b8 = packet.U1[2];
+       gpu_unai.inn.r8 = packet.U1[0];
+       gpu_unai.inn.g8 = packet.U1[1];
+       gpu_unai.inn.b8 = packet.U1[2];
         // r5/g5/b5 used if just texture-blending is applied (15-bit light)
-       gpu_unai.r5 = packet.U1[0] >> 3;
-       gpu_unai.g5 = packet.U1[1] >> 3;
-       gpu_unai.b5 = packet.U1[2] >> 3;
+       gpu_unai.inn.r5 = packet.U1[0] >> 3;
+       gpu_unai.inn.g5 = packet.U1[1] >> 3;
+       gpu_unai.inn.b5 = packet.U1[2] >> 3;
  
         PolyVertex vbuf[4];
         polyInitVertexBuffer(vbuf, packet, ptype, is_quad);
@@ -462,8 +462,8 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
  #endif
  #endif
                 // Set u,v increments for inner driver
-               gpu_unai.u_inc = du4;
-               gpu_unai.v_inc = dv4;
+               gpu_unai.inn.u_inc = du4;
+               gpu_unai.inn.v_inc = dv4;
  
                 //senquack - TODO: why is it always going through 2 iterations when sometimes one would suffice here?
                 //                       (SAME ISSUE ELSEWHERE)
@@ -695,8 +695,8 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
                                 }
  
                                 // Set u,v coords for inner driver
-                               gpu_unai.u = u4;
-                               gpu_unai.v = v4;
+                               gpu_unai.inn.u = u4;
+                               gpu_unai.inn.v = v4;
  
                                 if (xb > xmax) xb = xmax;
                                 if ((xb - xa) > 0)
@@ -792,7 +792,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
  #endif
  #endif
                 // Setup packed Gouraud increment for inner driver
-               gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+               gpu_unai.inn.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
  
                 for (s32 loop0 = 2; loop0; loop0--) {
                         if (loop0 == 2) {
@@ -1044,7 +1044,7 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
                                 }
  
                                 // Setup packed Gouraud color for inner driver
-                               gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+                               gpu_unai.inn.gCol = gpuPackGouraudCol(r4, g4, b4);
  
                                 if (xb > xmax) xb = xmax;
                                 if ((xb - xa) > 0)
@@ -1158,9 +1158,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
  #endif
  #endif
                 // Set u,v increments and packed Gouraud increment for inner driver
-               gpu_unai.u_inc = du4;
-               gpu_unai.v_inc = dv4;
-               gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+               gpu_unai.inn.u_inc = du4;
+               gpu_unai.inn.v_inc = dv4;
+               gpu_unai.inn.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
  
                 for (s32 loop0 = 2; loop0; loop0--) {
                         if (loop0 == 2) {
@@ -1448,9 +1448,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
                                 }
  
                                 // Set packed Gouraud color and u,v coords for inner driver
-                               gpu_unai.u = u4;
-                               gpu_unai.v = v4;
-                               gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+                               gpu_unai.inn.u = u4;
+                               gpu_unai.inn.v = v4;
+                               gpu_unai.inn.gCol = gpuPackGouraudCol(r4, g4, b4);
  
                                 if (xb > xmax) xb = xmax;
                                 if ((xb - xa) > 0)
diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h

index 2564e7f..13d783e 100644 (file)
--- a/plugins/gpu_unai/gpu_raster_sprite.h
+++ b/plugins/gpu_unai/gpu_raster_sprite.h
@@ -61,22 +61,22 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out)
         *w_out = x1;
         *h_out = y1 - y0;
  
-       gpu_unai.r5 = packet.U1[0] >> 3;
-       gpu_unai.g5 = packet.U1[1] >> 3;
-       gpu_unai.b5 = packet.U1[2] >> 3;
+       gpu_unai.inn.r5 = packet.U1[0] >> 3;
+       gpu_unai.inn.g5 = packet.U1[1] >> 3;
+       gpu_unai.inn.b5 = packet.U1[2] >> 3;
  
         le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
         const int li=gpu_unai.ilace_mask;
         //const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
         //const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
         unsigned int tmode = gpu_unai.TEXT_MODE >> 5;
-       u8* pTxt_base = (u8*)gpu_unai.TBA;
+       u8* pTxt_base = (u8*)gpu_unai.inn.TBA;
  
         // Texture is accessed byte-wise, so adjust idx if 16bpp
         if (tmode == 3) u0 <<= 1;
  
         spriteDriverArg arg;
-       arg.CBA = gpu_unai.CBA;
+       arg.CBA = gpu_unai.inn.CBA;
         arg.u0 = u0;
         arg.v0 = v0;
         arg.u0_mask = gpu_unai.TextureWindow[2];
diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h

index 844a8fd..fff9126 100644 (file)
--- a/plugins/gpu_unai/gpu_unai.h
+++ b/plugins/gpu_unai/gpu_unai.h
@@ -196,6 +196,34 @@ static inline s32 GPU_DIV(s32 rs, s32 rt)
  // 'Unsafe' version of above that doesn't check for div-by-zero
  #define GPU_FAST_DIV(rs, rt) ((signed)(rs) / (signed)(rt))
  
+// warning: gpu_arm.S asm (poly_4bpp_asm) hard-codes the hex byte offsets noted below; update the asm if you change this
+struct gpu_unai_inner_t {
+       le16_t* TBA;              // 00 Ptr to current texture in VRAM
+       le16_t* CBA;              // 04 Ptr to current CLUT in VRAM
+
+       // 22.10 Fixed-pt texture coords, mask, scanline advance
+       // NOTE: U,V are no longer packed together into one u32, this proved to be
+       //  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
+       u32 u, v;                 // 08 u, 0c v
+       u32 u_msk, v_msk;         // 10 u_msk, 14 v_msk
+       s32 u_inc, v_inc;         // 18 u_inc, 1c v_inc
+
+       // Color for Gouraud-shaded prims
+       // Fixed-pt 8.8 rgb triplet
+       // Packed fixed-pt 8.3:8.3:8.2 rgb triplet
+       //  layout:  ccccccccXXXXXXXX for c in [r, g, b]
+       //           ^ bit 16
+       gcol_t gCol;       // Not read by poly_4bpp_asm
+       gcol_t gInc;       // Increment along scanline for gCol
+
+       // Color for flat-shaded, texture-blended prims
+       u8  r5, g5, b5;    // 5-bit light for undithered prims
+       u8  r8, g8, b8;    // 8-bit light for dithered prims
+
+       // Color for flat-shaded, untextured prims
+       u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+};
+
  struct gpu_unai_t {
         u32 GPU_GP1;
         GPUPacket PacketBuffer;
@@ -260,33 +288,11 @@ struct gpu_unai_t {
         s16 DrawingOffset[2];  // [0] : Drawing offset X (signed)
                                // [1] : Drawing offset Y (signed)
  
-       le16_t* TBA;              // Ptr to current texture in VRAM
-       le16_t* CBA;              // Ptr to current CLUT in VRAM
-
         ////////////////////////////////////////////////////////////////////////////
         //  Inner Loop parameters
  
-       // 22.10 Fixed-pt texture coords, mask, scanline advance
-       // NOTE: U,V are no longer packed together into one u32, this proved to be
-       //  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
-       u32 u, v;
-       u32 u_msk, v_msk;
-       s32 u_inc, v_inc;
-
-       // Color for Gouraud-shaded prims
-       // Fixed-pt 8.8 rgb triplet
-       // Packed fixed-pt 8.3:8.3:8.2 rgb triplet
-       //  layout:  ccccccccXXXXXXXX for c in [r, g, b]
-       //           ^ bit 16
-       gcol_t gCol;
-       gcol_t gInc;       // Increment along scanline for gCol
-
-       // Color for flat-shaded, texture-blended prims
-       u8  r5, g5, b5;    // 5-bit light for undithered prims
-       u8  r8, g8, b8;    // 8-bit light for dithered prims
-
-       // Color for flat-shaded, untextured prims
-       u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+       __attribute__((aligned(32)))
+       gpu_unai_inner_t inn;
  
         // End of inner Loop parameters
         ////////////////////////////////////////////////////////////////////////////
@@ -319,7 +325,7 @@ struct gpu_unai_t {
         u32 DitherMatrix[64];   // Matrix of dither coefficients
  };
  
-static gpu_unai_t gpu_unai;
+static __attribute__((aligned(32))) gpu_unai_t gpu_unai;
  
  // Global config that frontend can alter.. Values are read in GPU_init().
  // TODO: if frontend menu modifies a setting, add a function that can notify
diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp

index 733b255..53a1b1d 100644 (file)
--- a/plugins/gpu_unai/gpulib_if.cpp
+++ b/plugins/gpu_unai/gpulib_if.cpp
@@ -243,8 +243,8 @@ int renderer_init(void)
    //senquack - new vars must be updated whenever texture window is changed:
    //           (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
    const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
-  gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
-  gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+  gpu_unai.inn.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+  gpu_unai.inn.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
  
    // Configuration options
    gpu_unai.config = gpu_unai_config_ext;
@@ -340,8 +340,8 @@ static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word)
  
          // Inner loop vars must be updated whenever texture window is changed:
          const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
-        gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
-        gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+        gpu_unai.inn.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+        gpu_unai.inn.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
  
          gpuSetTexture(gpu_unai.GPU_GP1);
        }
author	notaz <notasas@gmail.com>
	Sat, 23 Nov 2024 23:32:12 +0000 (01:32 +0200)
committer	notaz <notasas@gmail.com>
	Sun, 24 Nov 2024 22:12:34 +0000 (00:12 +0200)
plugins/gpu_unai/gpu_arm.S		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_arm.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_command.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_inner.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_raster_polygon.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_raster_sprite.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_unai.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpulib_if.cpp		patch \| blob \| blame \| history