gpu_unai: asm part 3

author notaz <notasas@gmail.com>

Wed, 27 Nov 2024 00:14:03 +0000 (02:14 +0200)

committer notaz <notasas@gmail.com>

Sun, 1 Dec 2024 16:15:06 +0000 (18:15 +0200)
author notaz <notasas@gmail.com>
Wed, 27 Nov 2024 00:14:03 +0000 (02:14 +0200)
committer notaz <notasas@gmail.com>
Sun, 1 Dec 2024 16:15:06 +0000 (18:15 +0200)
diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S

index f068499..b56951f 100644 (file)
--- a/plugins/gpu_unai/gpu_arm.S
+++ b/plugins/gpu_unai/gpu_arm.S
@@ -7,6 +7,7 @@
  
  #include "arm_features.h"
  
+.syntax unified
  .text
  .align 2
  
@@ -32,13 +33,13 @@
      ldrh    r8, [r2, r8]
      ldrh    lr, [r2, lr]
      tst     r6, r6
-    strneh  r6, [r0, #\obase+0]
+    strhne  r6, [r0, #\obase+0]
      tst     r7, r7
-    strneh  r7, [r0, #\obase+2]
+    strhne  r7, [r0, #\obase+2]
      tst     r8, r8
-    strneh  r8, [r0, #\obase+4]
+    strhne  r8, [r0, #\obase+4]
      tst     lr, lr
-    strneh  lr, [r0, #\obase+6]
+    strhne  lr, [r0, #\obase+6]
  .endm
  
  @ in: r0=dst, r2=pal, r12=0x1fe
@@ -53,13 +54,13 @@
      ldrh     r8, [r2, r8]
      ldrh     \rs,[r2, \rs]
      tst      r6, r6
-    strneh   r6, [r0, #0]
+    strhne   r6, [r0, #0]
      tst      r7, r7
-    strneh   r7, [r0, #2]
+    strhne   r7, [r0, #2]
      tst      r8, r8
-    strneh   r8, [r0, #4]
+    strhne   r8, [r0, #4]
      tst      \rs,\rs
-    strneh   \rs,[r0, #6]
+    strhne   \rs,[r0, #6]
  .endm
  
  .global sprite_4bpp_x16_asm @ (u16 *d, void *s, u16 *pal, int lines)
@@ -175,7 +176,7 @@ sprite_driver_4bpp_asm:
      ldrh    r7, [r2, r7]
      add     r0, r0, #2
      tst     r7, r7
-    strneh  r7, [r0, #-2]
+    strhne  r7, [r0, #-2]
      subs    r8, r8, #1
      bgt     0b
      sprite_driver_part3
@@ -200,7 +201,7 @@ sprite_driver_8bpp_asm:
      ldrh    r7, [r2, r7]
      add     r0, r0, #2
      tst     r7, r7
-    strneh  r7, [r0, #-2]
+    strhne  r7, [r0, #-2]
      subs    r8, r8, #1
      bgt     0b
      sprite_driver_part3
@@ -254,7 +255,7 @@ poly_4bpp_asm:
      add     r0, r0, #2
      mov     r7, r4, lsr #13
      tst     r12,r12
-    strneh  r12,[r0, #-2]
+    strhne  r12,[r0, #-2]
      subs    r2, r2, #1
      bgt     0b
  
@@ -285,7 +286,7 @@ poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
      and     lr, r7, r9
      tst     r12,r12
      add     lr, r1, lr, lsl #1
-    strneh  r12,[r0, #-2]
+    strhne  r12,[r0, #-2]
      mov     r12,r4, lsr #13
      subs    r2, r2, #1
      bgt     0b
@@ -304,7 +305,7 @@ poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
      smulbb  \t0, \t0, \mbr       @ -> 0000 0000 0000 orrr  rrxx xxxx xxxx xxxx
      smulbt  \t1, \t1, \mg        @ -> 0000 000o gggg gxxx  xxxx xxxx xxx0 0000
      smulbt  \t2, \t2, \mbr       @ -> 00ob bbbb xxxx xxxx  xxxx xx00 0000 0000
-    and     \rp, \rp, #0x8000
+    ands    \rp, \rp, #0x8000    @ retain msb + semi-transparency test
      usat    \t0, #5, \t0, asr #14
      usat    \t1, #5, \t1, asr #19
      usat    \t2, #5, \t2, asr #24
@@ -313,13 +314,25 @@ poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
      orr     \rp, \rp, \t2, lsl #10
  .endm
  
-.global poly_4bpp_l_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
-poly_4bpp_l_asm:
+@ http://www.slack.net/~ant/info/rgb_mixing.html
+@ p0 = (p0 + p1) / 2 per 5-bit field; p1 |= 0x8000 (p1 is modified); t is scratch
+@ msb of input p0 is assumed to be set
+.macro semitrans0 p0 p1 t
+    eor     \t,  \p0, \p1        @ t = bits in which the two pixels differ
+    and     \t,  \t, #0x0420     @ keep only bits 5 and 10 of that difference
+    sub     \p0, \p0, \t         @ take them out of the sum so that the halving below does not leak them into the lower field
+    orr     \p1, \p1, #0x8000    @ force msb of p1 on, matching the msb assumed set in p0
+    uhadd16 \p0, \p0, \p1        @ unsigned halving add: (p0 + p1) >> 1 in each halfword
+.endm
+
+.macro poly_4bpp_asm_m name semitrans
+.global \name @ (void *d, const struct gpu_unai_inner_t *inn, int count)
+\name:
      .cfi_startproc
      stmfd   sp!, {r4-r11,lr}
      .cfi_def_cfa_offset 4*9
      .cfi_rel_offset lr, 4*8
-    poly_4bpp_init poly_4bpp_lv_asm 1
+    poly_4bpp_init v_\name 1
  0:
      mov     r12,r4, lsr #13
      subs    r2, r2, #1
@@ -337,12 +350,20 @@ poly_4bpp_l_asm:
      tst     r12,r12
      beq     0b
      modulate r12, r10, r11, r7, r8, lr
+.if \semitrans < 0
+    @ no semi-transparency
+.elseif \semitrans == 0
+    ldrhne  r7, [r0, #-2]
+    strheq  r12,[r0, #-2]
+    beq     0b
+    semitrans0 r12, r7, lr
+.endif
      strh    r12,[r0, #-2]
      b       0b
  1:
      ldmfd   sp!, {r4-r11,pc}
  
-poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
      sub     sp, sp, #4*2
      .cfi_def_cfa_offset 4*(9+2)
      .cfi_rel_offset lr, 4*(8+2)
@@ -372,6 +393,14 @@ poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
      tst     r12,r12
      beq     0b
      modulate r12, r10, r11, r5, r6, lr
+.if \semitrans < 0
+    @ no semi-transparency
+.elseif \semitrans == 0
+    ldrhne  r7, [r0, #-2]
+    strheq  r12,[r0, #-2]
+    beq     0b
+    semitrans0 r12, r7, lr
+.endif
      strh    r12,[r0, #-2]
      ldmia   sp, {r5,r6}
      b       0b
@@ -379,6 +408,10 @@ poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
      add     sp, sp, #4*2
      ldmfd   sp!, {r4-r11,pc}
      .cfi_endproc
+.endm
+
+poly_4bpp_asm_m poly_4bpp_l_asm,    -1
+poly_4bpp_asm_m poly_4bpp_l_st0_asm, 0
  
  #endif // HAVE_ARMV6
  
diff --git a/plugins/gpu_unai/gpu_arm.h b/plugins/gpu_unai/gpu_arm.h

index ccdc781..027aa53 100644 (file)
--- a/plugins/gpu_unai/gpu_arm.h
+++ b/plugins/gpu_unai/gpu_arm.h
@@ -16,6 +16,7 @@ void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
  
  void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
  void poly_4bpp_l_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_l_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
  
  #ifdef __cplusplus
  }
diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h

index 5cef54a..87324b9 100644 (file)
--- a/plugins/gpu_unai/gpu_inner.h
+++ b/plugins/gpu_unai/gpu_inner.h
@@ -56,6 +56,7 @@
  #include "gpu_inner_light.h"
  
  #include "arm_features.h"
+#include "compiler_features.h"
  #ifdef __arm__
  #include "gpu_inner_blend_arm.h"
  #include "gpu_inner_light_arm.h"
@@ -372,7 +373,7 @@ typedef void (*PS)(le16_t *pPixel, u32 count, const u8 *pTxt,
         const spriteDriverArg *arg);
  
  template<int CF>
-static void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
         const spriteDriverArg *arg)
  {
         // Blend func can save an operation if it knows uSrc MSB is unset.
@@ -557,7 +558,7 @@ const PS gpuSpriteDrivers[256] = {
  //             relevant blend/light headers.
  // (see README_senquack.txt)
  template<int CF>
-static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+static noinline void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
  {
         // Blend func can save an operation if it knows uSrc MSB is unset.
         //  Untextured prims can always skip this (src color MSB is always 0).
@@ -754,11 +755,13 @@ endpolytext:
  
  #ifdef __arm__
  template<int CF>
-static void PolySpanAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
  {
         switch (CF) {
-       case 0x20: poly_4bpp_asm  (pDst, &gpu_unai.inn, count); break;
-       case 0x21: poly_4bpp_l_asm(pDst, &gpu_unai.inn, count); break;
+       case 0x20: poly_4bpp_asm      (pDst, &gpu_unai.inn, count); break;
+       case 0x21: poly_4bpp_l_asm    (pDst, &gpu_unai.inn, count); break;
+       case 0x23: poly_4bpp_l_st0_asm(pDst, &gpu_unai.inn, count); break;
+       default:   gpuPolySpanFn<CF>(gpu_unai, pDst, count);
         }
  }
  #endif
@@ -778,12 +781,12 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
  #define TI(cf) gpuPolySpanFn<(cf)>
  #define TN     PolyNULL
  #ifdef __arm__
-#define TA(cf) PolySpanAsm<(cf)>
+#define TA(cf) PolySpanMaybeAsm<(cf)>
  #else
  #define TA(cf) TI(cf)
  #endif
  #ifdef HAVE_ARMV6
-#define TA6(cf) PolySpanAsm<(cf)>
+#define TA6(cf) PolySpanMaybeAsm<(cf)>
  #else
  #define TA6(cf) TI(cf)
  #endif
@@ -792,7 +795,7 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
         TN,            TN,            TI((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
         TN,            TN,            TI((ub)|0x12), TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
         TN,            TN,            TI((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
-       TA((ub)|0x20), TA6((ub)|0x21),TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TA((ub)|0x20), TA6((ub)|0x21),TI((ub)|0x22), TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
         TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
         TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
         TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
diff --git a/plugins/gpu_unai/gpu_inner_blend_arm.h b/plugins/gpu_unai/gpu_inner_blend_arm.h

index 6413527..f887374 100644 (file)
--- a/plugins/gpu_unai/gpu_inner_blend_arm.h
+++ b/plugins/gpu_unai/gpu_inner_blend_arm.h
@@ -41,10 +41,14 @@ GPU_INLINE uint_fast16_t gpuBlendingARM(uint_fast16_t uSrc, uint_fast16_t uDst)
                 asm ("eor %[mix], %[uSrc], %[uDst]\n\t" // uSrc ^ uDst
                      "and %[mix], %[mix], %[mask]\n\t"  // ... & 0x0421
                      "sub %[mix], %[uDst], %[mix]\n\t"  // uDst - ...
+               #ifdef HAVE_ARMV6
+                    "uhadd16 %[mix], %[uSrc], %[mix]\n\t"
+               #else
                      "add %[mix], %[uSrc], %[mix]\n\t"  // uSrc + ...
                      "mov %[mix], %[mix], lsr #0x1\n\t" // ... >> 1
+               #endif
                      : [mix] "=&r" (mix)
-                    : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0421));
+                    : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0420)); // 421
         }
  
         if (BLENDMODE == 1 || BLENDMODE == 3) {
author	notaz <notasas@gmail.com>
	Wed, 27 Nov 2024 00:14:03 +0000 (02:14 +0200)
committer	notaz <notasas@gmail.com>
	Sun, 1 Dec 2024 16:15:06 +0000 (18:15 +0200)
plugins/gpu_unai/gpu_arm.S		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_arm.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_inner.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_inner_blend_arm.h		patch \| blob \| blame \| history