gpu_unai: asm part 5

author notaz <notasas@gmail.com>

Mon, 2 Dec 2024 00:56:09 +0000 (02:56 +0200)

committer notaz <notasas@gmail.com>

Tue, 3 Dec 2024 22:49:21 +0000 (00:49 +0200)
author notaz <notasas@gmail.com>
Mon, 2 Dec 2024 00:56:09 +0000 (02:56 +0200)
committer notaz <notasas@gmail.com>
Tue, 3 Dec 2024 22:49:21 +0000 (00:49 +0200)
diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S

index 9970c02..a516f08 100644 (file)
--- a/plugins/gpu_unai/gpu_arm.S
+++ b/plugins/gpu_unai/gpu_arm.S
@@ -40,14 +40,65 @@
  @ msb of input p0 is assumed to be set
  .macro semitrans0 p0 p1 t
      eor     \t,  \p0, \p1
-    and     \t,  \t, #0x0420
+    and     \t,  \t,  #0x0420
      sub     \p0, \p0, \t
      orr     \p1, \p1, #0x8000
      uhadd16 \p0, \p0, \p1
  .endm
  
+.macro semitrans0p p0 p1 m421 t
+    eor     \t,  \p0, \p1          @ t = bits in which p0 and p1 differ
+    and     \t,  \t,  \m421        @ keep only the bits selected by m421 (the drivers pass 0x04210421: low bit of every 5-bit field)
+    add     \p0, \p0, \p1          @ plain 32-bit sum of the packed pixels
+    uhsub16 \p0, \p0, \t           @ sub because of borrow into hi16; per-halfword (sum - t) >> 1, i.e. a per-field average for that mask
+.endm
+
+@ p0 - {p1|r,g,b}   // p1* - premasked rgb; every field is subtracted separately and saturates at 0
+.macro semitrans2p p0 p1r p1g p1b m1f t0 t1
+    and     \t0, \p0, \m1f             @ t0 = lowest field of p0 (the drivers pass m1f = 0x001f001f)
+    and     \t1, \p0, \m1f, lsl #5     @ t1 = middle field
+    and     \p0, \p0, \m1f, lsl #10    @ p0 = top field; bits outside the shifted mask are dropped
+    uqsub16 \t0, \t0, \p1r             @ per-halfword unsigned saturating subtract
+    uqsub16 \t1, \t1, \p1g
+    uqsub16 \p0, \p0, \p1b
+    orr     \t0, \t0, \t1              @ merge the three fields back into p0
+    orr     \p0, \p0, \t0
+.endm
+
+#else
+
+@ msb of input p0 is assumed to be set
+.macro semitrans0 p0 p1 t
+    eor     \t,  \p0, \p1          @ t = bits in which p0 and p1 differ
+    and     \t,  \t,  #0x0420      @ keep bits 5 and 10 only
+    orr     \p1, \p1, #0x8000      @ set bit 15 of p1 (p1 is modified)
+    sub     \p0, \p0, \t
+    add     \p0, \p0, \p1
+    orr     \p0, \p0, #0x10000     @ force bit 16 so that bit 15 is set after the shift
+    mov     \p0, \p0, lsr #1       @ halve the sum
+.endm
+
+.macro semitrans0p p0 p1 m421 t
+    eor     \t,  \p0, \p1          @ t = bits in which p0 and p1 differ
+    and     \t,  \t,  \m421        @ keep only the bits selected by m421
+    add     \p0, \p0, \p1          @ plain 32-bit sum of the packed pixels
+    sub     \p0, \p0, \t           @ full-word subtract (this is the non-HAVE_ARMV6 branch, so no uhsub16)
+    mov     \p0, \p0, lsr #1       @ halve; shifts the whole 32-bit word rather than each halfword
+.endm
+
  #endif // HAVE_ARMV6
  
+.macro semitrans13p p0 p1 m421 t0
+    add     \t0, \p0, \p1            @ t0 = raw 32-bit sum of the packed pixels
+    eor     \p0, \p0, \p1            @ bits in which the inputs differ
+    and     \p0, \p0, \m421          @ low_bits: differing bits at the m421 positions
+    sub     \p0, \t0, \p0
+    and     \p0, \p0, \m421, lsl #5  @ carries: one bit just above each field that overflowed
+    sub     \t0, \t0, \p0            @ modulo: take those carry bits back out of the sum
+    sub     \p0, \p0, \p0, lsr #5    @ clamp: each carry bit becomes five set bits covering its field
+    orr     \p0, \t0, \p0            @ overflowed fields end up all ones (saturating add)
+.endm
+
  
  @ in: r0=dst, r2=pal, r12=0x1e
  @ trashes r6-r8,lr,flags
@@ -95,6 +146,91 @@
      strhne   \rs,[r0, #6]
  .endm
  
+
+@ (void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn)  -> r0=d, r1=c, r2=cnt, r3=inn
+@ see also poly_untex_st_m; NOTE(review): rows are not filtered by ilace_mask here, unlike gpuTileDriverFn - confirm intended
+.macro tile_driver_st_m name semit
+FUNCTION(\name):
+    .cfi_startproc
+    stmfd   sp!, {r4-r9,lr}        @ 7 words: scratch registers used below plus the return address
+    .cfi_def_cfa_offset 4*7
+    .cfi_rel_offset lr, 4*6
+    ldr     r7, [r3, #0x18]        @ y0
+    ldr     r8, [r3, #0x1c]        @ y1
+.if \semit != 2
+    mov     r4, #0x8000
+    orr     r4, r4, r4, lsl #16    @ mask 8000
+    mov     r6, #0x420
+    orr     r6, r6, #1
+    orr     r6, r6, r6, lsl #16    @ mask 0421
+.endif
+.if \semit == 2
+    and     r4, r1, #0x03e0        @ bits 5-9 of the colour
+    and     r5, r1, #0x7c00        @ bits 10-14 of the colour
+    and     r1, r1, #0x001f        @ r1 keeps only bits 0-4
+    orr     r4, r4, r4, lsl #16    @ premasked g
+    orr     r5, r5, r5, lsl #16    @ premasked b
+    mov     r6, #0x00001f
+    orr     r6, #0x1f0000          @ mask
+.elseif \semit == 3
+    mov     r1, r1, lsr #2         @ quarter the colour in place
+    bic     r1, r1, #(0x0c60>>2)   @ clear the bits the shift pushed across field boundaries
+.endif
+    orr     r1, r1, r1, lsl #16    @ replicate the 16-bit value into both halfwords
+    sub     r3, r8, r7             @ h; NOTE(review): the row loop below always runs once - assumes y1 > y0
+    mov     r7, r2                 @ save w
+0:
+    ldrh    r8, [r0]               @ first pixel of the row, used only when d is not 4-byte aligned
+    pld_    r0, #2048
+    tst     r0, #2                 @ is the destination 4-byte aligned?
+    beq     1f                     @ yes: go straight to the pixel-pair loop
+    sub     r2, #1                 @ no: blend one leading pixel on its own
+.if \semit == 0
+    bic     r8, r8, r4             @ clear bit 15 of the destination pixel
+    semitrans0p  r8, r1, r6, lr
+.elseif \semit == 1 || \semit == 3
+    bic     r8, r8, r4             @ clear bit 15 of the destination pixel
+    semitrans13p r8, r1, r6, lr
+.elseif \semit == 2
+    semitrans2p  r8, r1, r4, r5, r6, r9, lr
+.endif
+    strh    r8, [r0], #2
+1:
+    ldr     r8, [r0]               @ two pixels at once
+    pld_    r0, #32
+    subs    r2, r2, #2             @ flags stay live until strpl/bpl; the blend code in between sets none
+.if \semit == 0
+    bic     r8, r8, r4             @ clear bit 15 of both destination pixels
+    semitrans0p  r8, r1, r6, lr
+.elseif \semit == 1 || \semit == 3
+    bic     r8, r8, r4             @ clear bit 15 of both destination pixels
+    semitrans13p r8, r1, r6, lr
+.elseif \semit == 2
+    semitrans2p  r8, r1, r4, r5, r6, r9, lr
+.endif
+    strpl   r8, [r0], #4           @ store the pair only if at least two pixels were left
+    bpl     1b
+2:
+    tst     r2, #1                 @ for cnt > 0, r2 is -1 here if one pixel remains, else -2
+    strhne  r8, [r0], #2           @ store the low halfword of the last blended word
+    mov     r2, r7                 @ w
+    add     r0, r0, #2048          @ down one row (2048-byte pitch)
+    sub     r0, r0, r7, lsl #1     @ and back by w pixels of 2 bytes each
+    subs    r3, r3, #1             @ one row done
+    bgt     0b
+
+    ldmfd   sp!, {r4-r9,pc}        @ restore registers and return
+    .cfi_endproc
+.endm
+
+
+tile_driver_st_m tile_driver_st0_asm, 0
+tile_driver_st_m tile_driver_st1_asm, 1
+tile_driver_st_m tile_driver_st3_asm, 3
+#ifdef HAVE_ARMV6
+tile_driver_st_m tile_driver_st2_asm, 2
+#endif
+
  @ (u16 *d, void *s, u16 *pal, int lines)
  sprite_4bpp_x16_asm_:
      ldr     r12,[r3, #0x18]        @ y0
@@ -106,7 +242,7 @@ FUNCTION(sprite_4bpp_x16_asm):
      stmfd   sp!, {r4-r8,lr}
      .cfi_def_cfa_offset 4*6
      .cfi_rel_offset lr, 4*5
-    mov     r12, #0x1e             @ empty pixel
+    mov     r12, #0x1e
  
  0:
      ldmia   r1, {r4,r5}
@@ -343,15 +479,15 @@ FUNCTION(\name):
  .endm
  
  sprite_driver_l_st sprite_driver_4bpp_l0_std_asm, 4, 0, -1
+sprite_driver_l_st sprite_driver_4bpp_l0_st0_asm, 4, 0,  0
  sprite_driver_l_st sprite_driver_8bpp_l0_std_asm, 8, 0, -1
+sprite_driver_l_st sprite_driver_8bpp_l0_st0_asm, 8, 0,  0
  
  #ifdef HAVE_ARMV6
  
-sprite_driver_l_st sprite_driver_4bpp_l0_st0_asm, 4, 0,  0
  sprite_driver_l_st sprite_driver_4bpp_l1_std_asm, 4, 1, -1
  sprite_driver_l_st sprite_driver_4bpp_l1_st0_asm, 4, 1,  0
  sprite_driver_l_st sprite_driver_4bpp_l1_st1_asm, 4, 1,  1
-sprite_driver_l_st sprite_driver_8bpp_l0_st0_asm, 8, 0,  0
  sprite_driver_l_st sprite_driver_8bpp_l1_std_asm, 8, 1, -1
  sprite_driver_l_st sprite_driver_8bpp_l1_st0_asm, 8, 1,  0
  sprite_driver_l_st sprite_driver_8bpp_l1_st1_asm, 8, 1,  1
@@ -414,6 +550,82 @@ FUNCTION(sprite_driver_16bpp_asm):
      .cfi_endproc
  
  
+@ (void *d, const gpu_unai_inner_t *inn, int count)  -> r0=d, r1=inn, r2=count
+@ see also tile_driver_st_m (same blend loop; this one handles a single span)
+.macro poly_untex_st_m name semit
+FUNCTION(\name):
+    .cfi_startproc
+    ldrh    r1, [r1, #0x38]        @ rgb
+    stmfd   sp!, {r4-r7,lr}        @ 5 words: scratch registers used below plus the return address
+    .cfi_def_cfa_offset 4*5
+    .cfi_rel_offset lr, 4*4
+.if \semit != 2
+    mov     r4, #0x8000
+    orr     r4, r4, r4, lsl #16    @ mask 8000
+    mov     r6, #0x420
+    orr     r6, r6, #1
+    orr     r6, r6, r6, lsl #16    @ mask 0421
+.endif
+.if \semit == 2
+    and     r4, r1, #0x03e0        @ bits 5-9 of the colour
+    and     r5, r1, #0x7c00        @ bits 10-14 of the colour
+    and     r1, r1, #0x001f        @ r1 keeps only bits 0-4
+    orr     r4, r4, r4, lsl #16    @ premasked g
+    orr     r5, r5, r5, lsl #16    @ premasked b
+    mov     r6, #0x00001f
+    orr     r6, #0x1f0000          @ mask
+.elseif \semit == 3
+    mov     r1, r1, lsr #2         @ quarter the colour in place
+    bic     r1, r1, #(0x0c60>>2)   @ clear the bits the shift pushed across field boundaries
+.endif
+    orr     r1, r1, r1, lsl #16    @ replicate the 16-bit value into both halfwords
+0:
+    ldrh    r3, [r0]               @ first pixel of the span, used only when d is not 4-byte aligned
+    pld_    r0, #2048
+    tst     r0, #2                 @ is the destination 4-byte aligned?
+    beq     1f                     @ yes: go straight to the pixel-pair loop
+    sub     r2, #1                 @ no: blend one leading pixel on its own
+.if \semit == 0
+    bic     r3, r3, r4             @ clear bit 15 of the destination pixel
+    semitrans0p  r3, r1, r6, lr
+.elseif \semit == 1 || \semit == 3
+    bic     r3, r3, r4             @ clear bit 15 of the destination pixel
+    semitrans13p r3, r1, r6, lr
+.elseif \semit == 2
+    semitrans2p  r3, r1, r4, r5, r6, r7, lr
+.endif
+    strh    r3, [r0], #2
+1:
+    ldr     r3, [r0]               @ two pixels at once
+    pld_    r0, #32
+    subs    r2, r2, #2             @ flags stay live until strpl/bpl; the blend code in between sets none
+.if \semit == 0
+    bic     r3, r3, r4             @ clear bit 15 of both destination pixels
+    semitrans0p  r3, r1, r6, lr
+.elseif \semit == 1 || \semit == 3
+    bic     r3, r3, r4             @ clear bit 15 of both destination pixels
+    semitrans13p r3, r1, r6, lr
+.elseif \semit == 2
+    semitrans2p  r3, r1, r4, r5, r6, r7, lr
+.endif
+    strpl   r3, [r0], #4           @ store the pair only if at least two pixels were left
+    bpl     1b
+2:
+    tst     r2, #1                 @ for count > 0, r2 is -1 here if one pixel remains, else -2
+    strhne  r3, [r0], #2           @ store the low halfword of the last blended word
+
+    ldmfd   sp!, {r4-r7,pc}        @ restore registers and return
+    .cfi_endproc
+.endm
+
+poly_untex_st_m poly_untex_st0_asm, 0
+poly_untex_st_m poly_untex_st1_asm, 1
+poly_untex_st_m poly_untex_st3_asm, 3
+#ifdef HAVE_ARMV6
+poly_untex_st_m poly_untex_st2_asm, 2
+#endif
+
+
  .macro poly_4_8bpp_asm_m name bpp light semit
  FUNCTION(\name): @ (void *d, const gpu_unai_inner_t *inn, int count)
      .cfi_startproc
@@ -568,15 +780,15 @@ v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
      .cfi_endproc
  .endm
  
-poly_4_8bpp_asm_m poly_4bpp_asm,       4, 0, -1
-poly_4_8bpp_asm_m poly_8bpp_asm,       8, 0, -1
+poly_4_8bpp_asm_m poly_4bpp_asm,        4, 0, -1
+poly_4_8bpp_asm_m poly_4bpp_l0_st0_asm, 4, 0,  0
+poly_4_8bpp_asm_m poly_8bpp_asm,        8, 0, -1
+poly_4_8bpp_asm_m poly_8bpp_l0_st0_asm, 8, 0,  0
  
  #ifdef HAVE_ARMV6
  
-poly_4_8bpp_asm_m poly_4bpp_l0_st0_asm, 4, 0,  0
  poly_4_8bpp_asm_m poly_4bpp_l1_std_asm, 4, 1, -1
  poly_4_8bpp_asm_m poly_4bpp_l1_st0_asm, 4, 1,  0
-poly_4_8bpp_asm_m poly_8bpp_l0_st0_asm, 8, 0,  0
  poly_4_8bpp_asm_m poly_8bpp_l1_std_asm, 8, 1, -1
  poly_4_8bpp_asm_m poly_8bpp_l1_st0_asm, 8, 1,  0
  
diff --git a/plugins/gpu_unai/gpu_arm.h b/plugins/gpu_unai/gpu_arm.h

index 6b8c81a..d69490f 100644 (file)
--- a/plugins/gpu_unai/gpu_arm.h
+++ b/plugins/gpu_unai/gpu_arm.h
@@ -7,6 +7,10 @@ extern "C" {
  
  struct gpu_unai_inner_t;
  
+void tile_driver_st0_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn);
+void tile_driver_st1_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn);
+void tile_driver_st3_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn);
+
  void sprite_driver_4bpp_asm(void *pPixel, const u8 *pTxt_base,
         u32 count, const struct gpu_unai_inner_t *inn);
  void sprite_driver_8bpp_asm(void *pPixel, const u8 *pTxt_base,
@@ -15,36 +19,43 @@ void sprite_driver_16bpp_asm(void *pPixel, const void *pTxt_base,
         u32 count, const struct gpu_unai_inner_t *inn);
  void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
  
-void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
-void poly_8bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
-
  void sprite_driver_4bpp_l0_std_asm(void *pPixel, const u8 *pTxt_base,
         u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_4bpp_l0_st0_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
  void sprite_driver_8bpp_l0_std_asm(void *pPixel, const u8 *pTxt_base,
         u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l0_st0_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+
+void poly_untex_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_untex_st1_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_untex_st3_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_asm       (void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_l0_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_8bpp_asm       (void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_8bpp_l0_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
  
  #ifdef HAVE_ARMV6
  
-void sprite_driver_4bpp_l0_st0_asm(void *pPixel, const u8 *pTxt_base,
-       u32 count, const struct gpu_unai_inner_t *inn);
+void tile_driver_st2_asm(void *d, u16 c, u32 cnt, const struct gpu_unai_inner_t *inn);
+
  void sprite_driver_4bpp_l1_std_asm(void *pPixel, const u8 *pTxt_base,
         u32 count, const struct gpu_unai_inner_t *inn);
  void sprite_driver_4bpp_l1_st0_asm(void *pPixel, const u8 *pTxt_base,
         u32 count, const struct gpu_unai_inner_t *inn);
  void sprite_driver_4bpp_l1_st1_asm(void *pPixel, const u8 *pTxt_base,
         u32 count, const struct gpu_unai_inner_t *inn);
-void sprite_driver_8bpp_l0_st0_asm(void *pPixel, const u8 *pTxt_base,
-       u32 count, const struct gpu_unai_inner_t *inn);
  void sprite_driver_8bpp_l1_std_asm(void *pPixel, const u8 *pTxt_base,
         u32 count, const struct gpu_unai_inner_t *inn);
  void sprite_driver_8bpp_l1_st0_asm(void *pPixel, const u8 *pTxt_base,
         u32 count, const struct gpu_unai_inner_t *inn);
  void sprite_driver_8bpp_l1_st1_asm(void *pPixel, const u8 *pTxt_base,
         u32 count, const struct gpu_unai_inner_t *inn);
-void poly_4bpp_l0_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+
+void poly_untex_st2_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
  void poly_4bpp_l1_std_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
  void poly_4bpp_l1_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
-void poly_8bpp_l0_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
  void poly_8bpp_l1_std_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
  void poly_8bpp_l1_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
  
diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h

index 14d6644..3281d0f 100644 (file)
--- a/plugins/gpu_unai/gpu_inner.h
+++ b/plugins/gpu_unai/gpu_inner.h
@@ -58,6 +58,7 @@
  #include "arm_features.h"
  #include "compiler_features.h"
  #ifdef __arm__
+#include "gpu_arm.h"
  #include "gpu_inner_blend_arm.h"
  #include "gpu_inner_light_arm.h"
  #define gpuBlending gpuBlendingARM
@@ -278,7 +279,7 @@ const PSD gpuPixelSpanDrivers[64] =
  //  GPU Tiles innerloops generator
  
  template<int CF>
-static void gpuTileSpanFn(le16_t *pDst, u32 count, u16 data)
+static inline void gpuTileSpanFn(le16_t *pDst, u16 data, u32 count)
  {
         le16_t ldata;
  
@@ -330,7 +331,42 @@ endtile:
         }
  }
  
-static void TileNULL(le16_t *pDst, u32 count, u16 data)
+// Generic (C++) tile driver: walks rows inn.y0 .. inn.y1-1 starting at pDst and
+// calls gpuTileSpanFn<CF> on every row that passes the interlace filters.
+//   pDst  - first pixel of the top row; advanced by FRAME_WIDTH per row
+//   data  - fill colour handed to the span function
+//   count - pixels per row
+//   inn   - supplies y0, y1 and ilace_mask
+template<int CF>
+static noinline void gpuTileDriverFn(le16_t *pDst, u16 data, u32 count,
+       const gpu_unai_inner_t &inn)
+{
+       // Take ilace_mask from the 'inn' argument like y0/y1, rather than from
+       // the global gpu_unai.inn, so the function honours what it was passed.
+       const int li=inn.ilace_mask;
+       const int pi=(ProgressiveInterlaceEnabled()?(inn.ilace_mask+1):0);
+       // prog_ilace_flag is not part of 'inn', so it is still read from gpu_unai.
+       const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(inn.ilace_mask+1):0):1);
+       const int y1 = inn.y1;
+       int y0 = inn.y0;
+
+       for (; y0 < y1; ++y0) {
+               // Skip rows masked out by interlacing.
+               if (!(y0&li) && (y0&pi) != pif)
+                       gpuTileSpanFn<CF>(pDst, data, count);
+               pDst += FRAME_WIDTH;
+       }
+}
+
+#ifdef __arm__
+
+template<int CF>
+static void TileAsm(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn)
+{
+       // CF is a template constant, so exactly one branch is ever taken per
+       // instantiation; modes without an asm routine use the C++ driver.
+       const gpu_unai_inner_t *const pInn = &inn;
+
+       if (CF == 0x02)
+               tile_driver_st0_asm(pDst, data, count, pInn);
+       else if (CF == 0x0a)
+               tile_driver_st1_asm(pDst, data, count, pInn);
+       else if (CF == 0x1a)
+               tile_driver_st3_asm(pDst, data, count, pInn);
+#ifdef HAVE_ARMV6
+       else if (CF == 0x12)
+               tile_driver_st2_asm(pDst, data, count, pInn);
+#endif
+       else
+               gpuTileDriverFn<CF>(pDst, data, count, inn);
+}
+
+#endif
+
+static void TileNULL(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn)
  {
         #ifdef ENABLE_GPU_LOG_SUPPORT
                 fprintf(stdout,"TileNULL()\n");
@@ -339,23 +375,35 @@ static void TileNULL(le16_t *pDst, u32 count, u16 data)
  
  ///////////////////////////////////////////////////////////////////////////////
  //  Tiles innerloops driver
-typedef void (*PT)(le16_t *pDst, u32 count, u16 data);
+typedef void (*PT)(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn);
  
  // Template instantiation helper macros
-#define TI(cf) gpuTileSpanFn<(cf)>
+#define TI(cf) gpuTileDriverFn<(cf)>
  #define TN     TileNULL
+#ifdef __arm__
+#define TA(cf) TileAsm<(cf)>
+#else
+#define TA(cf) TI(cf)
+#endif
+#ifdef HAVE_ARMV6
+#define TA6(cf) TileAsm<(cf)>
+#else
+#define TA6(cf) TI(cf)
+#endif
  #define TIBLOCK(ub) \
-       TI((ub)|0x00), TI((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \
-       TN,            TI((ub)|0x0a), TN,            TI((ub)|0x0e), \
-       TN,            TI((ub)|0x12), TN,            TI((ub)|0x16), \
-       TN,            TI((ub)|0x1a), TN,            TI((ub)|0x1e)
+       TI((ub)|0x00), TA6((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \
+       TN,            TA ((ub)|0x0a), TN,            TI((ub)|0x0e), \
+       TN,            TA6((ub)|0x12), TN,            TI((ub)|0x16), \
+       TN,            TA ((ub)|0x1a), TN,            TI((ub)|0x1e)
  
-const PT gpuTileSpanDrivers[32] = {
+const PT gpuTileDrivers[32] = {
         TIBLOCK(0<<8), TIBLOCK(1<<8)
  };
  
  #undef TI
  #undef TN
+#undef TA
+#undef TA6
  #undef TIBLOCK
  
  
@@ -446,7 +494,6 @@ endsprite:
  }
  
  #ifdef __arm__
-#include "gpu_arm.h"
  
  template<int CF>
  static void SpriteMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
@@ -467,14 +514,14 @@ static void SpriteMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
      const u8 *pTxt = pTxt_base + inn.v * 2048;
      switch (CF) {
      case 0x20: sprite_driver_4bpp_l0_std_asm(pPixel, pTxt, count, &inn); return;
+    case 0x22: sprite_driver_4bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return;
      case 0x40: sprite_driver_8bpp_l0_std_asm(pPixel, pTxt, count, &inn); return;
+    case 0x42: sprite_driver_8bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return;
  #ifdef HAVE_ARMV6
      case 0x21: sprite_driver_4bpp_l1_std_asm(pPixel, pTxt, count, &inn); return;
-    case 0x22: sprite_driver_4bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return;
      case 0x23: sprite_driver_4bpp_l1_st0_asm(pPixel, pTxt, count, &inn); return;
      case 0x2b: sprite_driver_4bpp_l1_st1_asm(pPixel, pTxt, count, &inn); return;
      case 0x41: sprite_driver_8bpp_l1_std_asm(pPixel, pTxt, count, &inn); return;
-    case 0x42: sprite_driver_8bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return;
      case 0x43: sprite_driver_8bpp_l1_st0_asm(pPixel, pTxt, count, &inn); return;
      case 0x4b: sprite_driver_8bpp_l1_st1_asm(pPixel, pTxt, count, &inn); return;
  #endif
@@ -761,14 +808,18 @@ template<int CF>
  static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
  {
         switch (CF) {
+       case 0x02: poly_untex_st0_asm  (pDst, &gpu_unai.inn, count); break;
+       case 0x0a: poly_untex_st1_asm  (pDst, &gpu_unai.inn, count); break;
+       case 0x1a: poly_untex_st3_asm  (pDst, &gpu_unai.inn, count); break;
         case 0x20: poly_4bpp_asm       (pDst, &gpu_unai.inn, count); break;
+       case 0x22: poly_4bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break;
         case 0x40: poly_8bpp_asm       (pDst, &gpu_unai.inn, count); break;
+       case 0x42: poly_8bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break;
  #ifdef HAVE_ARMV6
+       case 0x12: poly_untex_st2_asm  (pDst, &gpu_unai.inn, count); break;
         case 0x21: poly_4bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break;
-       case 0x22: poly_4bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break;
         case 0x23: poly_4bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break;
         case 0x41: poly_8bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break;
-       case 0x42: poly_8bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break;
         case 0x43: poly_8bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break;
  #endif
         default:   gpuPolySpanFn<CF>(gpu_unai, pDst, count);
@@ -801,10 +852,10 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
  #define TA6(cf) TI(cf)
  #endif
  #define TIBLOCK(ub) \
-       TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
-       TN,            TN,            TI((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
-       TN,            TN,            TI((ub)|0x12), TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
-       TN,            TN,            TI((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
+       TI((ub)|0x00), TI((ub)|0x01), TA6((ub)|0x02),TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
+       TN,            TN,            TA((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
+       TN,            TN,            TA6((ub)|0x12),TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
+       TN,            TN,            TA((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
         TA((ub)|0x20), TA6((ub)|0x21),TA6((ub)|0x22),TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
         TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
         TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h

index e314e97..5c7b67c 100644 (file)
--- a/plugins/gpu_unai/gpu_raster_sprite.h
+++ b/plugins/gpu_unai/gpu_raster_sprite.h
@@ -73,7 +73,7 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out)
         gpuSpriteDriver(Pixel, x1, (u8 *)gpu_unai.inn.TBA, gpu_unai.inn);
  }
  
-void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver, s32 *w_out, s32 *h_out)
+void gpuDrawT(PtrUnion packet, const PT gpuTileDriver, s32 *w_out, s32 *h_out)
  {
         s32 x0, x1, y0, y1;
  
@@ -103,15 +103,10 @@ void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver, s32 *w_out, s32 *h_ou
  
         const u16 Data = GPU_RGB16(le32_to_u32(packet.U4[0]));
         le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
-       const int li=gpu_unai.inn.ilace_mask;
-       const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
-       const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
-
-       for (; y0<y1; ++y0) {
-               if (!(y0&li) && (y0&pi)!=pif)
-                       gpuTileSpanDriver(Pixel,x1,Data);
-               Pixel += FRAME_WIDTH;
-       }
+
+       gpu_unai.inn.y0 = y0;
+       gpu_unai.inn.y1 = y1;
+       gpuTileDriver(Pixel, Data, x1, gpu_unai.inn);
  }
  
  #endif /* __GPU_UNAI_GPU_RASTER_SPRITE_H__ */
diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h

index fb30eec..6fe00bb 100644 (file)
--- a/plugins/gpu_unai/gpu_unai.h
+++ b/plugins/gpu_unai/gpu_unai.h
@@ -224,11 +224,11 @@ struct gpu_unai_inner_t {
         // Packed fixed-pt 8.3:8.3:8.2 rgb triplet
         //  layout:  ccccccccXXXXXXXX for c in [r, g, b]
         //           ^ bit 16
-       gcol_t gCol;
-       gcol_t gInc;       // Increment along scanline for gCol
+       gcol_t gCol;       // 0x28
+       gcol_t gInc;       // 0x30 Increment along scanline for gCol
  
         // Color for flat-shaded, untextured prims
-       u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+       u16 PixelData;     // 0x38 bgr555 color for untextured flat-shaded polys
  
         u8 blit_mask;           // Determines what pixels to skip when rendering.
                                 //  Only useful on low-resolution devices using
diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp

index 9f72611..40c7fd9 100644 (file)
--- a/plugins/gpu_unai/gpulib_if.cpp
+++ b/plugins/gpu_unai/gpulib_if.cpp
@@ -748,7 +748,7 @@ int do_cmd_list(u32 *list_, int list_len,
        case 0x61:
        case 0x62:
        case 0x63: {          // Monochrome rectangle (variable size)
-        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
          s32 w = 0, h = 0;
          gpuDrawT(packet, driver, &w, &h);
          gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
@@ -766,7 +766,7 @@ int do_cmd_list(u32 *list_, int list_len,
        case 0x6A:
        case 0x6B: {          // Monochrome rectangle (1x1 dot)
          gpu_unai.PacketBuffer.U4[2] = u32_to_le32(0x00010001);
-        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
          s32 w = 0, h = 0;
          gpuDrawT(packet, driver, &w, &h);
          gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
@@ -777,7 +777,7 @@ int do_cmd_list(u32 *list_, int list_len,
        case 0x72:
        case 0x73: {          // Monochrome rectangle (8x8)
          gpu_unai.PacketBuffer.U4[2] = u32_to_le32(0x00080008);
-        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
          s32 w = 0, h = 0;
          gpuDrawT(packet, driver, &w, &h);
          gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
@@ -796,7 +796,7 @@ int do_cmd_list(u32 *list_, int list_len,
        case 0x7A:
        case 0x7B: {          // Monochrome rectangle (16x16)
          gpu_unai.PacketBuffer.U4[2] = u32_to_le32(0x00100010);
-        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        PT driver = gpuTileDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
          s32 w = 0, h = 0;
          gpuDrawT(packet, driver, &w, &h);
          gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
author	notaz <notasas@gmail.com>
	Mon, 2 Dec 2024 00:56:09 +0000 (02:56 +0200)
committer	notaz <notasas@gmail.com>
	Tue, 3 Dec 2024 22:49:21 +0000 (00:49 +0200)
plugins/gpu_unai/gpu_arm.S		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_arm.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_inner.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_raster_sprite.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_unai.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpulib_if.cpp		patch \| blob \| blame \| history