gpu_unai: various asm tuning for armv6

author notaz <notasas@gmail.com>

Wed, 7 Jan 2026 23:39:43 +0000 (01:39 +0200)

committer notaz <notasas@gmail.com>

Fri, 9 Jan 2026 02:35:29 +0000 (04:35 +0200)
author notaz <notasas@gmail.com>
Wed, 7 Jan 2026 23:39:43 +0000 (01:39 +0200)
committer notaz <notasas@gmail.com>
Fri, 9 Jan 2026 02:35:29 +0000 (04:35 +0200)
diff --git a/include/arm_features.h b/include/arm_features.h

index 9f51ab8..bd76096 100644 (file)
--- a/include/arm_features.h
+++ b/include/arm_features.h
@@ -14,6 +14,7 @@
  #define HAVE_ARMV8
  #define HAVE_ARMV7
  #define HAVE_ARMV6
+#define HAVE_ARMV5E
  #define HAVE_ARMV5
  
  #elif (defined(__ARM_ARCH) && __ARM_ARCH >= 7) \
@@ -23,6 +24,7 @@
  
  #define HAVE_ARMV7
  #define HAVE_ARMV6
+#define HAVE_ARMV5E
  #define HAVE_ARMV5
  
  #elif (defined(__ARM_ARCH) && __ARM_ARCH >= 6) \
@@ -32,11 +34,17 @@
      || defined(__ARM_ARCH_6M__)
  
  #define HAVE_ARMV6
+#define HAVE_ARMV5E
  #define HAVE_ARMV5
  #define HAVE_PRE_ARMV7
  
-#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5E__) \
-   || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+
+#define HAVE_ARMV5E
+#define HAVE_ARMV5
+#define HAVE_PRE_ARMV7
+
+#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__)
  
  #define HAVE_ARMV5
  #define HAVE_PRE_ARMV7
diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S

index a516f08..4d30243 100644 (file)
--- a/plugins/gpu_unai/gpu_arm.S
+++ b/plugins/gpu_unai/gpu_arm.S
@@ -19,6 +19,8 @@
  
  #ifdef HAVE_ARMV6
  
+@ mbr: 0bbb bbbb 0ggg gggg 0rrr rrrr r000 0000
+@ mg:  0ggg gggg ...
  .macro modulate rp mbr mg t0 t1 t2
      and     \t0, \rp, #0x001f
      and     \t1, \rp, #0x03e0
@@ -649,8 +651,9 @@ FUNCTION(\name): @ (void *d, const gpu_unai_inner_t *inn, int count)
      and     r4, r4, r6
      and     lr, lr, r7         @ v_msk & v
      and     lr, lr, #0xff<<10
+    pld_    r3                 @ clut
      tst     r12,r12
-    bne     v_\name
+    bne     10f @ vinc_\name
      ldr     r1, [r1]           @ src
      mov     r7, r4, lsr #(13 - (\bpp / 8 * 3))
      add     r1, r1, lr, lsl #1
@@ -705,7 +708,7 @@ FUNCTION(\name): @ (void *d, const gpu_unai_inner_t *inn, int count)
  1:
      ldmfd   sp!, {r4-r11,pc}
  
-v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+10: @ vinc_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
  .if \light || \semit >= 0
      sub     sp, sp, #4*2
      stmia   sp, {r5,r6}
@@ -723,9 +726,9 @@ v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
  .endif
  0:
  .if \light || \semit >= 0
-    and     lr, r7, r9
-    mov     r12,r4, lsr #(13 - (\bpp / 8 * 3))
-    add     lr, r1, lr, lsl #1
+    and     lr, r7, r9         @ l_v & l_v_msk
+    mov     r12,r4, lsr #(13 - (\bpp / 8 * 3))  @ l_u
+    add     lr, r1, lr, lsl #1 @ (u16 *)TBA + l_v
      subs    r2, r2, #1
      bmi     1f
  .endif
diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h

index 3281d0f..3ac39b6 100644 (file)
--- a/plugins/gpu_unai/gpu_inner.h
+++ b/plugins/gpu_unai/gpu_inner.h
@@ -62,11 +62,16 @@
  #include "gpu_inner_blend_arm.h"
  #include "gpu_inner_light_arm.h"
  #define gpuBlending gpuBlendingARM
-#define gpuLightingTXT gpuLightingTXTARM
-#else
+#endif
+#ifndef gpuBlending
  #define gpuBlending gpuBlendingGeneric
+#endif
+#ifndef gpuLightingTXT // gpuLightingTXTARM
  #define gpuLightingTXT gpuLightingTXTGeneric
  #endif
+#ifndef gpuLightingTXTGouraud // gpuLightingTXTGouraudARM
+#define gpuLightingTXTGouraud gpuLightingTXTGouraudGeneric
+#endif
  
  // Non-dithering lighting and blending functions preserve uSrc
  // MSB. This saves a few operations and useless load/stores.
@@ -425,12 +430,12 @@ static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt
         uint_fast16_t uSrc, uDst, srcMSB;
         bool should_blend;
         u32 u0_mask = inn.u_msk >> 10;
+       u32 bgr0888;
  
-       u8 r5, g5, b5;
         if (CF_LIGHT) {
-               r5 = inn.r5;
-               g5 = inn.g5;
-               b5 = inn.b5;
+               bgr0888 = (gpu_unai.inn.b8 << 16) |
+                         (gpu_unai.inn.g8 << 8) |
+                          gpu_unai.inn.r8;
         }
  
         const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = inn.CBA;
@@ -474,7 +479,7 @@ static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt
                 if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
                 
                 if (CF_LIGHT)
-                       uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+                       uSrc = gpuLightingTXT(uSrc, bgr0888);
  
                 should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
  
@@ -683,7 +688,7 @@ endpolynotextnogou:
  
  endpolynotextgou:
                                 pDst++;
-                               l_gCol.raw += l_gInc.raw;
+                               l_gCol += l_gInc;
                         }
                         while (--count);
                 }
@@ -707,25 +712,24 @@ endpolynotextgou:
                 const le16_t* TBA_ = gpu_unai.inn.TBA;
                 const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.inn.CBA;
  
-               u8 r5, g5, b5;
-               u8 r8, g8, b8;
+               u32 bgr0888;
  
                 gcol_t l_gInc, l_gCol;
+               int pcounter = count - 1; // "repeat while positive" counter
  
                 if (CF_LIGHT) {
                         if (CF_GOURAUD) {
                                 l_gInc = gpu_unai.inn.gInc;
                                 l_gCol = gpu_unai.inn.gCol;
+
+                               l_gInc.set_counter(-1);
+                               l_gCol.set_counter(pcounter);
                         } else {
-                               if (CF_DITHER) {
-                                       r8 = gpu_unai.inn.r8;
-                                       g8 = gpu_unai.inn.g8;
-                                       b8 = gpu_unai.inn.b8;
-                               } else {
-                                       r5 = gpu_unai.inn.r5;
-                                       g5 = gpu_unai.inn.g5;
-                                       b5 = gpu_unai.inn.b5;
-                               }
+                               // keep this packed, otherwise gcc runs out of regs
+                               bgr0888 = (gpu_unai.inn.b8 << 16) |
+                                         (gpu_unai.inn.g8 << 8) |
+                                          gpu_unai.inn.r8;
+                               // XXX pre-pack
                         }
                 }
  
@@ -769,7 +773,7 @@ endpolynotextgou:
                                 if ( CF_GOURAUD)
                                         uSrc24 = gpuLightingTXT24Gouraud(uSrc, l_gCol);
                                 if (!CF_GOURAUD)
-                                       uSrc24 = gpuLightingTXT24(uSrc, r8, g8, b8);
+                                       uSrc24 = gpuLightingTXT24(uSrc, bgr0888);
  
                                 if (CF_BLEND && srcMSB)
                                         uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
@@ -781,7 +785,7 @@ endpolynotextgou:
                                         if ( CF_GOURAUD)
                                                 uSrc = gpuLightingTXTGouraud(uSrc, l_gCol);
                                         if (!CF_GOURAUD)
-                                               uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+                                               uSrc = gpuLightingTXT(uSrc, bgr0888);
                                 }
  
                                 should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
@@ -796,10 +800,13 @@ endpolytext:
                         pDst++;
                         l_u = (l_u + l_u_inc) & l_u_msk;
                         l_v += l_v_inc;
-                       if (CF_LIGHT && CF_GOURAUD)
-                               l_gCol.raw += l_gInc.raw;
+                       if (CF_LIGHT && CF_GOURAUD) {
+                               l_gCol += l_gInc;
+                               // Step the plain counter first: on builds where
+                               // get_counter() is a no-op this is the only
+                               // decrement. On ARMv6 get_counter() then replaces
+                               // the value with the packed word whose sign
+                               // follows the counter lane. Decrementing *after*
+                               // get_counter() would turn a packed word of 0
+                               // (counter lane 0, b lane 0) into -1 and end the
+                               // span one pixel early.
+                               pcounter--;
+                               l_gCol.get_counter(pcounter);
+                       } else {
+                               pcounter--;
+                       }
                 }
-               while (--count);
+               while (pcounter >= 0);
         }
  }
  
diff --git a/plugins/gpu_unai/gpu_inner_light.h b/plugins/gpu_unai/gpu_inner_light.h

index 44fecdc..f4ec213 100644 (file)
--- a/plugins/gpu_unai/gpu_inner_light.h
+++ b/plugins/gpu_unai/gpu_inner_light.h
@@ -85,11 +85,12 @@ static void SetupLightLUT()
  ////////////////////////////////////////////////////////////////////////////////
  GPU_INLINE gcol_t gpuPackGouraudCol(u32 r, u32 g, u32 b)
  {
-       return (gcol_t){
+       return (gcol_t){{
                 (u16)(r >> 2),
                 (u16)(g >> 2),
                 (u16)(b >> 2),
-       };
+               0
+       }};
  }
  
  ////////////////////////////////////////////////////////////////////////////////
@@ -167,8 +168,13 @@ GPU_INLINE u32 gpuLightingRGB24(gcol_t gCol)
  //          u16 output:  0bbbbbgggggrrrrr
  // Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
  ////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u32 bgr0888)
  {
+       // gcc can move this out of the loop if it wants to
+       uint_fast32_t b5 = (bgr0888 >> 19);
+       uint_fast32_t g5 = (bgr0888 >> 11) & 0x1f;
+       uint_fast32_t r5 = (bgr0888 >>  3) & 0x1f;
+
         return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | b5] << 10) |
                (gpu_unai.LightLUT[ (uSrc&0x03E0)     | g5] <<  5) |
                (gpu_unai.LightLUT[((uSrc&0x001F)<<5) | r5]      ) |
@@ -189,7 +195,7 @@ GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u8 r5, u8 g5,
  //          u16 output:  0bbbbbgggggrrrrr
  // Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
  ////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE uint_fast16_t gpuLightingTXTGouraud(uint_fast16_t uSrc, gcol_t gCol)
+GPU_INLINE uint_fast16_t gpuLightingTXTGouraudGeneric(uint_fast16_t uSrc, gcol_t gCol)
  {
         return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | (gCol.c.b >> 11)] << 10) |
                (gpu_unai.LightLUT[ (uSrc&0x03E0)     | (gCol.c.g >> 11)] << 5) |
@@ -213,22 +219,22 @@ GPU_INLINE uint_fast16_t gpuLightingTXTGouraud(uint_fast16_t uSrc, gcol_t gCol)
  //                     ^ bit 31
  // Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
  ////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u8 r8, u8 g8, u8 b8)
+GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u32 bgr0888) // bgr0888: light color packed as 0x00bbggrr
  {
         uint_fast16_t r1 = uSrc&0x001F;
         uint_fast16_t g1 = uSrc&0x03E0;
         uint_fast16_t b1 = uSrc&0x7C00;
  
-       uint_fast16_t r2 = r8;
-       uint_fast16_t g2 = g8;
-       uint_fast16_t b2 = b8;
+       uint_fast16_t r2 = bgr0888 & 0x0000ff;
+       uint_fast32_t g2 = bgr0888 & 0x00ff00; // left at bit 8 (no shift); the g3 clamp mask and the >>16 in the return absorb the extra 8 bits
+       uint_fast16_t b2 = bgr0888 >> 16; // unmasked: assumes bits 24..31 of bgr0888 are zero - TODO confirm for all callers
  
         u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
-       u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;
+       u32 g3 = g1 * g2; if (g3 & 0xFE000000) g3 = ~0xFE000000; // same saturation point as the old mask, scaled by 2^8
         u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;
  
         return ((r3>> 3)    ) |
-              ((g3>> 8)<<10) |
+              ((g3>>16)<<10) |
                ((b3>>13)<<20);
  }
  
diff --git a/plugins/gpu_unai/gpu_inner_light_arm.h b/plugins/gpu_unai/gpu_inner_light_arm.h

index 7edb8fb..3445e79 100644 (file)
--- a/plugins/gpu_unai/gpu_inner_light_arm.h
+++ b/plugins/gpu_unai/gpu_inner_light_arm.h
@@ -14,6 +14,7 @@
  //                 ^ bit 16
  // Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero
  ////////////////////////////////////////////////////////////////////////////////
+// note: outdated, unused
  GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol)
  {
         uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output
@@ -30,11 +31,14 @@ GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol)
         return out;
  }
  
+//#ifdef HAVE_ARMV5E // todo?
+#ifdef HAVE_ARMV6
+
  ////////////////////////////////////////////////////////////////////////////////
-// Apply fast (low-precision) 5-bit lighting to bgr555 texture color:
+// Apply 8-bit lighting to bgr555 texture color:
  //
  // INPUT:
-//       'r5','g5','b5' are unsigned 5-bit color values, value of 15
+//       'bgr0888' packs unsigned 8-bit b,g,r values (0x00bbggrr); 128
  //         is midpoint that doesn't modify that component of texture
  //       'uSrc' input:  mbbbbbgggggrrrrr
  //                      ^ bit 16
@@ -42,95 +46,59 @@ GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol)
  //         u16 output:  mbbbbbgggggrrrrr
  // Where 'X' are fixed-pt bits.
  ////////////////////////////////////////////////////////////////////////////////
-#ifdef HAVE_ARMV6
-// clang uses smulbb but not gcc, so we need this
-GPU_INLINE int_fast16_t smulbb(int_fast16_t a, int_fast16_t b)
+// on v6 we have single-cycle mul and sat which is better than the LightLUT
+GPU_INLINE u32 gpuLightingTXTARM(u32 uSrc, u32 bgr0888)
  {
-       int_fast16_t r;
-       asm("smulbb %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
-       return r;
+       int_fast32_t r, g, b, s_d = uSrc;
+       // has to be in a block, otherwise gcc schedules the insns poorly
+       asm("and    %[r],  %[s_d], #0x001f\n"
+           "and    %[b],  %[bgr], #0xff\n"
+           "smulbb %[r],  %[r],   %[b]\n"
+           "uxtb   %[b],  %[bgr], ror #8\n"
+           "and    %[g],  %[s_d], #0x03e0\n"
+           "smulbb %[g],  %[g],   %[b]\n"
+           "and    %[b],  %[s_d], #0x7c00\n"
+           "and    %[s_d],%[s_d], #0x8000\n"
+           "smulbt %[b],  %[b],   %[bgr]\n"
+           "usat   %[r],  #5, %[r], asr #7\n"
+           "usat   %[g],  #5, %[g], asr #12\n"
+           "usat   %[b],  #5, %[b], asr #17\n"
+           "orr    %[s_d],%[s_d], %[r]\n"
+           "orr    %[s_d],%[s_d], %[g], lsl #5\n"
+           "orr    %[s_d],%[s_d], %[b], lsl #10\n"
+         : [s_d]"+r"(s_d), [r]"=&r"(r), [g]"=&r"(g), [b]"=&r"(b)
+         : [bgr]"r"(bgr0888));
+       return s_d;
  }
+#define gpuLightingTXT gpuLightingTXTARM
  
-GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+GPU_INLINE u32 gpuLightingTXTGouraudARM(u32 uSrc, gcol_t gCol)
  {
-       // on v6 we have single-cycle mul and sat which is better than the lut
-       int_fast16_t r = smulbb(uSrc & 0x001f, r5);
-       int_fast16_t g = smulbb(uSrc & 0x03e0, g5);
-       int_fast16_t b = smulbb(uSrc & 0x7c00, b5);
-       asm volatile("usat %0, #5, %0, asr #4"  : "=r"(r) : "0"(r));
-       asm volatile("usat %0, #5, %0, asr #9"  : "=r"(g) : "0"(g));
-       asm volatile("usat %0, #5, %0, asr #14" : "=r"(b) : "0"(b));
-       return (uSrc & 0x8000) | (b << 10) | (g << 5) | r;
+       u32 r, g, s_d = uSrc;
+       asm("str    %[b],   [sp, #-4]!\n"        // conserve regs for gcc
+           "uxtb16 %[b],   %[b],    ror #8\n"   // b = g_rg >> 8 & 0xff00ff
+           "and    %[r],   %[s_d],  #0x001f\n"
+           "and    %[g],   %[s_d],  #0x03e0\n"
+           "smulbb %[r],   %[r],    %[b]\n"
+           "smulbt %[g],   %[g],    %[b]\n"
+           "uxtb   %[b],   %[g_b],  ror #8\n"
+           "tst    %[s_d],          #0x8000\n"
+           "and    %[s_d], %[s_d],  #0x7c00\n"
+           "smulbb %[b],   %[b],    %[s_d]\n"
+           "usat   %[s_d],#5, %[r], asr #7\n"
+           "usat   %[g],  #5, %[g], asr #12\n"
+           "usat   %[b],  #5, %[b], asr #17\n"
+           "orrne  %[s_d], %[s_d],  #0x8000\n"
+           "orr    %[s_d], %[s_d],  %[g], lsl #5\n"
+           "orr    %[s_d], %[s_d],  %[b], lsl #10\n"
+           "ldr    %[b],   [sp], #4\n"
+         : [s_d]"+r"(s_d), [r]"=&r"(r), [g]"=&r"(g)
+         : [b]"r"(gCol.raw32[0]), [g_b]"r"(gCol.raw32[1])
+         : "cc");
+       return s_d;
  }
-#else
-GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
-{
-       uint_fast16_t out = 0x03E0;
-       u32 db, dg;
+#define gpuLightingTXTGouraud gpuLightingTXTGouraudARM
  
-       // Using `g` for src, `G` for dest
-       asm ("and    %[dg],  %[out],    %[src]  \n\t"             // dg holds 0x000000ggggg00000
-            "orr    %[dg],  %[dg],     %[g5]   \n\t"             // dg holds 0x000000gggggGGGGG
-            "and    %[db],  %[out],    %[src], lsr #0x05 \n\t"   // db holds 0x000000bbbbb00000
-            "ldrb   %[dg],  [%[lut],   %[dg]]  \n\t"             // dg holds result 0x00000000000ggggg
-            "and    %[out], %[out],    %[src], lsl #0x05 \n\t"   // out holds 0x000000rrrrr00000
-            "orr    %[out], %[out],    %[r5]   \n\t"             // out holds 0x000000rrrrrRRRRR
-            "orr    %[db],  %[db],     %[b5]   \n\t"             // db holds 0x000000bbbbbBBBBB
-            "ldrb   %[out], [%[lut],   %[out]] \n\t"             // out holds result 0x00000000000rrrrr
-            "ldrb   %[db],  [%[lut],   %[db]]  \n\t"             // db holds result 0x00000000000bbbbb
-            "tst    %[src], #0x8000\n\t"                         // check whether msb was set on uSrc
-            "orr    %[out], %[out],    %[dg],  lsl #0x05   \n\t" // out holds 0x000000gggggrrrrr
-            "orrne  %[out], %[out],    #0x8000\n\t"              // add msb to out if set on uSrc
-            "orr    %[out], %[out],    %[db],  lsl #0x0A   \n\t" // out holds 0xmbbbbbgggggrrrrr
-            : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg)
-            : [r5] "r" (r5), [g5] "r" (g5),  [b5] "r" (b5),
-              [lut] "r" (gpu_unai.LightLUT), [src] "r" (uSrc), "0" (out)
-            : "cc");
-       return out;
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
-//
-// INPUT:
-//  'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet, value of
-//     15.0 is midpoint that does not modify color of texture
-//        gCol input :  rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX
-//                      ^ bit 31
-//       'uSrc' input:  mbbbbbgggggrrrrr
-//                      ^ bit 16
-// RETURNS:
-//         u16 output:  mbbbbbgggggrrrrr
-// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
-////////////////////////////////////////////////////////////////////////////////
-GPU_INLINE uint_fast16_t gpuLightingTXTGouraudARM(uint_fast16_t uSrc, u32 gCol)
-{
-       uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output
-       u32 db,dg,gtmp;
-
-       // Using `g` for src, `G` for dest
-       asm ("and    %[dg],  %[out],  %[src]   \n\t"           // dg holds 0x000000ggggg00000
-            "and    %[gtmp],%[out],  %[gCol], lsr #0x0B \n\t" // gtmp holds 0x000000GGGGG00000
-            "and    %[db],  %[out],  %[src],  lsr #0x05 \n\t" // db holds 0x000000bbbbb00000
-            "orr    %[dg],  %[dg],   %[gtmp], lsr #0x05 \n\t" // dg holds 0x000000gggggGGGGG
-            "and    %[gtmp],%[out],  %[gCol]  \n\t"           // gtmp holds 0x000000BBBBB00000
-            "ldrb   %[dg],  [%[lut], %[dg]]   \n\t"           // dg holds result 0x00000000000ggggg
-            "and    %[out], %[out],  %[src],  lsl #0x05 \n\t" // out holds 0x000000rrrrr00000
-            "orr    %[out], %[out],  %[gCol], lsr #0x1B \n\t" // out holds 0x000000rrrrrRRRRR
-            "orr    %[db],  %[db],   %[gtmp], lsr #0x05 \n\t" // db holds 0x000000bbbbbBBBBB
-            "ldrb   %[out], [%[lut], %[out]]  \n\t"           // out holds result 0x00000000000rrrrr
-            "ldrb   %[db],  [%[lut], %[db]]   \n\t"           // db holds result 0x00000000000bbbbb
-            "tst    %[src], #0x8000\n\t"                      // check whether msb was set on uSrc
-            "orr    %[out], %[out],  %[dg],   lsl #0x05 \n\t" // out holds 0x000000gggggrrrrr
-            "orrne  %[out], %[out],  #0x8000\n\t"             // add msb to out if set on uSrc
-            "orr    %[out], %[out],  %[db],   lsl #0x0A \n\t" // out holds 0xmbbbbbgggggrrrrr
-            : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg),
-              [gtmp] "=&r" (gtmp) \
-            : [gCol] "r" (gCol), [lut] "r" (gpu_unai.LightLUT), "0" (out), [src] "r" (uSrc)
-            : "cc");
-
-       return out;
-}
+#endif // HAVE_ARMV6
  
  #endif  //_OP_LIGHT_ARM_H_
diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h

index 5c7b67c..26c7332 100644 (file)
--- a/plugins/gpu_unai/gpu_raster_sprite.h
+++ b/plugins/gpu_unai/gpu_raster_sprite.h
@@ -63,6 +63,9 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out)
  
         le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
  
+       gpu_unai.inn.r8 = packet.U1[0];
+       gpu_unai.inn.g8 = packet.U1[1];
+       gpu_unai.inn.b8 = packet.U1[2];
         gpu_unai.inn.r5 = packet.U1[0] >> 3;
         gpu_unai.inn.g5 = packet.U1[1] >> 3;
         gpu_unai.inn.b5 = packet.U1[2] >> 3;
diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h

index 91cdb8a..2e30a28 100644 (file)
--- a/plugins/gpu_unai/gpu_unai.h
+++ b/plugins/gpu_unai/gpu_unai.h
@@ -56,12 +56,51 @@
  #define s64 int64_t
  #define u64 uint64_t
  
-typedef union {
+union gcol_t { // gouraud color: three u16 lanes (r,g,b) plus a spare 4th u16, overlaid with raw words
         struct {
                 u16 r, g, b;
+#ifdef HAVE_ARMV6
+               u16 counter; // 4th lane doubles as a loop counter on ARMv6 builds (see set_counter/get_counter)
+#else
+               u16 unused;
+#endif
         } c;
+#if defined(HAVE_ARMV6) || (defined(__SIZEOF_SIZE_T__) && __SIZEOF_SIZE_T__ == 4)
+       u32 raw32[2]; // raw32[0] overlays c.r and c.g; raw32[1] overlays c.b and the 4th lane
+#else
         u64 raw;
-} gcol_t;
+#endif
+
+       inline gcol_t & operator+=(const gcol_t &rhs) // adds rhs to every lane in place and returns *this
+       {
+#ifdef HAVE_ARMV6
+               // prevent bit spills the other versions have,
+               // allowing to use the unused part as a counter
+               asm("uadd16 %[d], %[d], %[s]" : [d]"+r"(raw32[0]) : [s]"r"(rhs.raw32[0]));
+               asm("uadd16 %[d], %[d], %[s]" : [d]"+r"(raw32[1]) : [s]"r"(rhs.raw32[1]));
+#elif defined(__SIZEOF_SIZE_T__) && __SIZEOF_SIZE_T__ == 4
+               // avoid having to do carry that's not needed here
+               raw32[0] += rhs.raw32[0];
+               raw32[1] += rhs.raw32[1];
+#else
+               raw += rhs.raw;
+#endif
+               return *this;
+       }
+
+       inline void set_counter(int counter) // no-op unless HAVE_ARMV6 is defined
+       {
+#ifdef HAVE_ARMV6
+               c.counter = counter; // stored truncated to u16
+#endif
+       }
+       inline void get_counter(int &counter) // leaves 'counter' untouched unless HAVE_ARMV6 is defined
+       {
+#ifdef HAVE_ARMV6
+               counter = raw32[1]; // whole word, not just c.counter: it also carries c.b. NOTE(review): presumably only the sign is meant to be used (counter lane in the top half on little-endian) - confirm
+#endif
+       }
+};
  
  #ifndef NDEBUG
  
diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp

index 71eccb1..71c9272 100644 (file)
--- a/plugins/gpu_unai/gpulib_if.cpp
+++ b/plugins/gpu_unai/gpulib_if.cpp
@@ -24,6 +24,7 @@
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
+#include "arm_features.h"
  #include "../gpulib/gpu.h"
  #include "old/if.h"
author	notaz <notasas@gmail.com>
	Wed, 7 Jan 2026 23:39:43 +0000 (01:39 +0200)
committer	notaz <notasas@gmail.com>
	Fri, 9 Jan 2026 02:35:29 +0000 (04:35 +0200)
include/arm_features.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_arm.S		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_inner.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_inner_light.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_inner_light_arm.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_raster_sprite.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_unai.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpulib_if.cpp		patch \| blob \| blame \| history