.cfi_endproc
-.global poly_4bpp_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
-poly_4bpp_asm:
- .cfi_startproc
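+@ Common prologue for the 4bpp span routines: loads clut, u/v, masks and
+@ increments from gpu_unai_inner_t (r1) and branches to \v_target when
+@ v_inc != 0; with \need_rgb it also loads the 8-bit flat light (r8,g8,b8
+@ at +0x24) pre-shifted into r10/r11 for the modulate macro below.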
+.macro poly_4bpp_init v_target need_rgb
add r12, r1, #4
- stmfd sp!, {r4-r7,lr}
- .cfi_def_cfa_offset 4*5
- .cfi_rel_offset lr, 4*4
ldmia r12, {r3, r4, r7, r12, lr} @ clut, u, v, u_msk, v_msk
ldr r5, [r1, #0x18] @ u_inc
+.if \need_rgb
+ ldr r10,[r1, #0x24] @ rbg
+.endif
mov r6, r12
ldr r12,[r1, #0x1c] @ v_inc
+.if \need_rgb
+ mov r10,r10,lsl #7 @ 0bbb bbbb 0ggg gggg 0rrr rrrr r000 0000
+ bic r10,r10,#1<<23
+ bic r10,r10,#1<<15
+ mov r11,r10,lsl #8
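+    @ bits 15/23 (the g/b light lsbs) would land in the sign position of the
+    @ 16-bit halves used below; smulbb/smulbt multiply signed, so clear them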
+.endif
and r4, r4, r6
and lr, lr, r7 @ v_msk & v
and lr, lr, #0xff<<10
tst r12,r12
- bne poly_4bpp_asm_v
+ bne \v_target
ldr r1, [r1] @ src
mov r7, r4, lsr #13
add r1, r1, lr, lsl #1
add r12,r1, r7, lsl #2
pld_ r12,#2048
+.endm
+
+.global poly_4bpp_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
+poly_4bpp_asm:
+ .cfi_startproc
+ stmfd sp!, {r4-r7,lr}
+ .cfi_def_cfa_offset 4*5
+ .cfi_rel_offset lr, 4*4
+ poly_4bpp_init poly_4bpp_v_asm 0
0:
ldr lr, [r1, r7, lsl #2]
lsr r12,r4, #8
ldmfd sp!, {r4-r7,pc}
-poly_4bpp_asm_v: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
stmfd sp!, {r8-r9}
+ .cfi_def_cfa_offset 4*7
+ .cfi_rel_offset lr, 4*6
ldr r9, [r1, #0x14] @ v_msk
ldr r1, [r1] @ src
mov r8, r12 @ v_inc
ldmfd sp!, {r4-r7,pc}
.cfi_endproc
+
+#ifdef HAVE_ARMV6
+
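+@ modulate \rp (bgr555) by the 8-bit flat light prepared by poly_4bpp_init:
+@ \mbr carries r8<<7 in its low half and (roughly) b8<<7 in its high half,
+@ \mg carries (roughly) g8<<7 in its high half.  Per channel this computes
+@ out5 = sat5((in5 * light8) >> 7) and keeps the mask bit; a C sketch of the
+@ intent (the g/b light values lose their lsb, cross-component bits stay
+@ below the usat windows):
+@   r = min(31, (( p        & 0x1f) * r8) >> 7);
+@   g = min(31, (((p >>  5) & 0x1f) * g8) >> 7);
+@   b = min(31, (((p >> 10) & 0x1f) * b8) >> 7);
+@   out = (p & 0x8000) | (b << 10) | (g << 5) | r;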
+.macro modulate rp mbr mg t0 t1 t2
+ and \t0, \rp, #0x001f
+ and \t1, \rp, #0x03e0
+ and \t2, \rp, #0x7c00
+ smulbb \t0, \t0, \mbr @ -> 0000 0000 0000 orrr rrxx xxxx xxxx xxxx
+ smulbt \t1, \t1, \mg @ -> 0000 000o gggg gxxx xxxx xxxx xxx0 0000
+ smulbt \t2, \t2, \mbr @ -> 00ob bbbb xxxx xxxx xxxx xx00 0000 0000
+ and \rp, \rp, #0x8000
+ usat \t0, #5, \t0, asr #14
+ usat \t1, #5, \t1, asr #19
+ usat \t2, #5, \t2, asr #24
+ orr \rp, \rp, \t0
+ orr \rp, \rp, \t1, lsl #5
+ orr \rp, \rp, \t2, lsl #10
+.endm
+
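+@ 4bpp textured span with flat light modulation; used in place of the C
+@ gpuPolySpanFn for the 4bpp+light case (CF 0x21). Zero (transparent) texels
+@ are skipped before modulation.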
+.global poly_4bpp_l_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
+poly_4bpp_l_asm:
+ .cfi_startproc
+ stmfd sp!, {r4-r11,lr}
+ .cfi_def_cfa_offset 4*9
+ .cfi_rel_offset lr, 4*8
+ poly_4bpp_init poly_4bpp_lv_asm 1
+0:
+ mov r12,r4, lsr #13
+ subs r2, r2, #1
+ bmi 1f
+ ldr lr, [r1, r12, lsl #2]
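+    @ (u>>8)&0x1c is the texel's bit offset within the word; ror by (offset-1)
+    @ plus "and #0x1e" yields texel*2, a byte offset into the halfword clut
+    @ (offset 0 wraps to ror #31 == lsl #1, which the mask still handles)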
+ lsr r12,r4, #8
+ and r12,r12,#0x1c
+ sub r12,r12,#1
+ mov r12,lr, ror r12
+ add r4, r4, r5
+ and r12,r12,#0x1e
+ and r4, r4, r6
+ ldrh r12,[r3, r12]
+ add r0, r0, #2
+ tst r12,r12
+ beq 0b
+ modulate r12, r10, r11, r7, r8, lr
+ strh r12,[r0, #-2]
+ b 0b
+1:
+ ldmfd sp!, {r4-r11,pc}
+
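+@ v_inc != 0 variant: v (r7) is stepped and re-masked every pixel; u_inc and
+@ u_msk (r5/r6) are spilled to the stack so modulate can use them as temps,
+@ and are reloaded after each lit pixel is written.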
+poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+ sub sp, sp, #4*2
+ .cfi_def_cfa_offset 4*(9+2)
+ .cfi_rel_offset lr, 4*(8+2)
+ ldr r9, [r1, #0x14] @ v_msk
+ ldr r1, [r1] @ src
+ mov r8, r12 @ v_inc
+ mov r12,r4, lsr #13
+ and r9, r9, #0xff<<10 @ v_msk_final
+ stmia sp, {r5,r6}
+0:
+ and lr, r7, r9
+ mov r12,r4, lsr #13
+ add lr, r1, lr, lsl #1
+ subs r2, r2, #1
+ bmi 1f
+ ldr lr, [lr, r12, lsl #2]
+ lsr r12,r4, #8
+ and r12,r12,#0x1c
+ sub r12,r12,#1
+ mov r12,lr, ror r12
+ add r4, r4, r5
+ and r12,r12,#0x1e
+ and r4, r4, r6
+ ldrh r12,[r3, r12]
+ add r0, r0, #2
+ add r7, r7, r8
+ tst r12,r12
+ beq 0b
+ modulate r12, r10, r11, r5, r6, lr
+ strh r12,[r0, #-2]
+ ldmia sp, {r5,r6}
+ b 0b
+1:
+ add sp, sp, #4*2
+ ldmfd sp!, {r4-r11,pc}
+ .cfi_endproc
+
+#endif // HAVE_ARMV6
+
@ vim:filetype=armasm
void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_l_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
#ifdef __cplusplus
}
#include "gpu_inner_quantization.h"
#include "gpu_inner_light.h"
+#include "arm_features.h"
#ifdef __arm__
#include "gpu_inner_blend_arm.h"
#include "gpu_inner_light_arm.h"
}
#ifdef __arm__
-static void PolySpan4bppAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+template<int CF>
+static void PolySpanAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
{
- poly_4bpp_asm(pDst, &gpu_unai.inn, count);
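+  // the CF cases here must match the entries routed to TA/TA6 in the
+  // dispatch table below (0x20: 4bpp texture, 0x21: 4bpp texture + light)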
+ switch (CF) {
+ case 0x20: poly_4bpp_asm (pDst, &gpu_unai.inn, count); break;
+ case 0x21: poly_4bpp_l_asm(pDst, &gpu_unai.inn, count); break;
+ }
}
#endif
#define TI(cf) gpuPolySpanFn<(cf)>
#define TN PolyNULL
#ifdef __arm__
-#define TA4(cf) PolySpan4bppAsm
+#define TA(cf) PolySpanAsm<(cf)>
#else
-#define TA4(cf) TI(cf)
+#define TA(cf) TI(cf)
+#endif
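+// TA6: asm that needs ARMv6 (usat/smulxy); plain C template otherwise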
+#ifdef HAVE_ARMV6
+#define TA6(cf) PolySpanAsm<(cf)>
+#else
+#define TA6(cf) TI(cf)
#endif
#define TIBLOCK(ub) \
TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
TN, TN, TI((ub)|0x0a), TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \
TN, TN, TI((ub)|0x12), TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \
TN, TN, TI((ub)|0x1a), TI((ub)|0x1b), TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \
- TA4((ub)|0x20),TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+ TA((ub)|0x20), TA6((ub)|0x21),TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \
TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \
#undef TI
#undef TN
#undef TIBLOCK
-#undef TA4
-#undef TA8
+#undef TA
+#undef TA6
#endif /* __GPU_UNAI_GPU_INNER_H__ */
#ifndef _OP_LIGHT_ARM_H_
#define _OP_LIGHT_ARM_H_
+#include "arm_features.h"
+
////////////////////////////////////////////////////////////////////////////////
// Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet
//
// u16 output: mbbbbbgggggrrrrr
// Where 'X' are fixed-pt bits.
////////////////////////////////////////////////////////////////////////////////
+#ifdef HAVE_ARMV6
+// clang emits smulbb on its own but gcc does not, so provide it via inline asm
+GPU_INLINE int_fast16_t smulbb(int_fast16_t a, int_fast16_t b)
+{
+ int_fast16_t r;
+ asm("smulbb %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
+ return r;
+}
+
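+// per channel: out5 = min(31, (in5 * light5) >> 4); the mask bit passes through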
+GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+{
+  // on v6 we have single-cycle mul and sat, which beat the lut
+ int_fast16_t r = smulbb(uSrc & 0x001f, r5);
+ int_fast16_t g = smulbb(uSrc & 0x03e0, g5);
+ int_fast16_t b = smulbb(uSrc & 0x7c00, b5);
+ asm volatile("usat %0, #5, %0, asr #4" : "=r"(r) : "0"(r));
+ asm volatile("usat %0, #5, %0, asr #9" : "=r"(g) : "0"(g));
+ asm volatile("usat %0, #5, %0, asr #14" : "=r"(b) : "0"(b));
+ return (uSrc & 0x8000) | (b << 10) | (g << 5) | r;
+}
+#else
GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
{
uint_fast16_t out = 0x03E0;
: "cc");
return out;
}
+#endif
////////////////////////////////////////////////////////////////////////////////
// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
u32 u_msk, v_msk; // 10
s32 u_inc, v_inc; // 18
+ // Color for flat-shaded, texture-blended prims
+  u8 r5, g5, b5, pad5;  // 20: 5-bit light for undithered prims
+  u8 r8, g8, b8, pad8;  // 24: 8-bit light for dithered prims
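+  // note: the span asm loads these by fixed offset (e.g. the 8-bit light at
+  // +0x24), so keep the offset comments here in sync with it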
+
// Color for Gouraud-shaded prims
// Fixed-pt 8.8 rgb triplet
// Packed fixed-pt 8.3:8.3:8.2 rgb triplet
gcol_t gCol;
gcol_t gInc; // Increment along scanline for gCol
- // Color for flat-shaded, texture-blended prims
- u8 r5, g5, b5; // 5-bit light for undithered prims
- u8 r8, g8, b8; // 8-bit light for dithered prims
-
// Color for flat-shaded, untextured prims
u16 PixelData; // bgr555 color for untextured flat-shaded polys
};