gpu_unai: asm part 4

author notaz <notasas@gmail.com>

Sun, 1 Dec 2024 16:14:18 +0000 (18:14 +0200)

committer notaz <notasas@gmail.com>

Mon, 2 Dec 2024 23:36:01 +0000 (01:36 +0200)
author notaz <notasas@gmail.com>
Sun, 1 Dec 2024 16:14:18 +0000 (18:14 +0200)
committer notaz <notasas@gmail.com>
Mon, 2 Dec 2024 23:36:01 +0000 (01:36 +0200)
diff --git a/Makefile b/Makefile

index 4000d48..5227572 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -272,6 +272,7 @@ OBJS += plugins/gpu_unai/old/if.o
  else
  CFLAGS += -DGPU_UNAI_NO_OLD
  endif
+plugins/gpu_unai/gpulib_if.o: plugins/gpu_unai/*.h
  plugins/gpu_unai/gpulib_if.o: CFLAGS += -DREARMED -DUSE_GPULIB=1
  ifneq ($(DEBUG), 1)
  plugins/gpu_unai/gpulib_if.o \
diff --git a/plugins/gpu_unai/gpu_arm.S b/plugins/gpu_unai/gpu_arm.S

index b56951f..9970c02 100644 (file)
--- a/plugins/gpu_unai/gpu_arm.S
+++ b/plugins/gpu_unai/gpu_arm.S
@@ -17,6 +17,38 @@
  #endif
  .endm
  
+#ifdef HAVE_ARMV6
+
+.macro modulate rp mbr mg t0 t1 t2
+    and     \t0, \rp, #0x001f    @ t0 = bits 0-4 of rp (red field)
+    and     \t1, \rp, #0x03e0    @ t1 = bits 5-9 (green field, left in place)
+    and     \t2, \rp, #0x7c00    @ t2 = bits 10-14 (blue field, left in place)
+    smulbb  \t0, \t0, \mbr       @ -> 0000 0000 0000 orrr  rrxx xxxx xxxx xxxx
+    smulbt  \t1, \t1, \mg        @ -> 0000 000o gggg gxxx  xxxx xxxx xxx0 0000
+    smulbt  \t2, \t2, \mbr       @ -> 00ob bbbb xxxx xxxx  xxxx xx00 0000 0000
+    and     \rp, \rp, #0x8000    @ retain msb (plain and: N/Z/C/V are not modified)
+    usat    \t0, #5, \t0, asr #14 @ shift each product down and clamp it to 0..31
+    usat    \t1, #5, \t1, asr #19
+    usat    \t2, #5, \t2, asr #24
+    orr     \rp, \rp, \t0         @ reassemble the pixel: red at bit 0,
+    orr     \rp, \rp, \t1, lsl #5 @ green at bit 5,
+    orr     \rp, \rp, \t2, lsl #10 @ blue at bit 10
+.endm
+
+@ http://www.slack.net/~ant/info/rgb_mixing.html
+@ p0 = (p0 + p1) / 2; p1 |= 0x8000
+@ msb of input p0 is assumed to be set
+.macro semitrans0 p0 p1 t
+    eor     \t,  \p0, \p1         @ bits in which the two pixels differ
+    and     \t,  \t, #0x0420      @ keep only bit 5 and bit 10 of that difference
+    sub     \p0, \p0, \t          @ remove them from p0 before the halving add (rgb_mixing trick)
+    orr     \p1, \p1, #0x8000     @ force msb of p1 so both inputs carry it
+    uhadd16 \p0, \p0, \p1         @ per-halfword (p0 + p1) >> 1
+.endm
+
+#endif // HAVE_ARMV6
+
+
  @ in: r0=dst, r2=pal, r12=0x1e
  @ trashes r6-r8,lr,flags
  .macro do_4x_4bpp rs ibase obase
@@ -63,11 +95,13 @@
      strhne   \rs,[r0, #6]
  .endm
  
-.global sprite_4bpp_x16_asm @ (u16 *d, void *s, u16 *pal, int lines)
+@ (u16 *d, void *s, u16 *pal, int lines)
  sprite_4bpp_x16_asm_:
-    ldr     r2, [r3]               @ pal
-    ldr     r3, [r3, #0x1c]        @ lines
-sprite_4bpp_x16_asm:
+    ldr     r12,[r3, #0x18]        @ y0
+    ldr     r2, [r3, #0x04]        @ pal
+    ldr     r3, [r3, #0x1c]        @ y1
+    sub     r3, r3, r12
+FUNCTION(sprite_4bpp_x16_asm):
      .cfi_startproc
      stmfd   sp!, {r4-r8,lr}
      .cfi_def_cfa_offset 4*6
@@ -99,15 +133,17 @@ sprite_4bpp_x16_asm:
  .if \is8bpp
      orr     r12, r12, #0x1f0   @ mask=0x01fe
  .endif
-    ldr     r4, [r3, #4]       @ u0
-    ldr     r5, [r3, #0x1c]    @ h
+    ldr     r4, [r3, #0x08]    @ u
+    ldr     r5, [r3, #0x1c]    @ v1
+    ldr     r6, [r3, #0x18]    @ v0
      and     r4, r4, #((8 >> \is8bpp) - 1)
+    sub     r5, r5, r6
      sub     r5, r5, #1
      orr     r5, r4, r5, lsl #8 @ ((h-1) << 8) | u0_fraction
      mov     r9, r2             @ saved_w
      mov     r10, r0            @ saved_dst
      mov     r11, r1            @ saved_src
-    ldr     r2, [r3]           @ pal
+    ldr     r2, [r3, #0x04]    @ pal
  11: @ line_loop:
      pld_    r11, #2048
      mov     r0, r10
@@ -152,10 +188,10 @@ sprite_4bpp_x16_asm:
      b       12b @ return from fractional_u
  .endm
  
-.global sprite_driver_4bpp_asm @ (u16 *d, const void *s, int width, spriteDriverArg)
-sprite_driver_4bpp_asm:
+@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *)
+FUNCTION(sprite_driver_4bpp_asm):
      .cfi_startproc
-    ldr     r12, [r3, #4]      @ u0
+    ldr     r12, [r3, #8]      @ u
      mov     r12, r12, lsl #29
      orr     r12, r12, r2       @ w
      cmp     r12, #16
@@ -183,8 +219,8 @@ sprite_driver_4bpp_asm:
      .cfi_endproc
  
  
-.global sprite_driver_8bpp_asm @ (u16 *d, const void *s, int width, spriteDriverArg)
-sprite_driver_8bpp_asm:
+@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *)
+FUNCTION(sprite_driver_8bpp_asm):
      .cfi_startproc
      sprite_driver_part1 1
  0:
@@ -208,41 +244,215 @@ sprite_driver_8bpp_asm:
      .cfi_endproc
  
  
-.macro poly_4bpp_init v_target need_rgb
+@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *)
+.macro sprite_driver_l_st name bpp light semit
+FUNCTION(\name):
+    .cfi_startproc
+    stmfd   sp!, {r4-r11,lr}
+    .cfi_def_cfa_offset 4*9    @ nine words were just pushed (r4-r11, lr)
+    .cfi_rel_offset lr, 4*8    @ lr is stored in the last (highest) slot
+    ldr     r5, [r3, #0x18]    @ y0
+    ldr     r7, [r3, #0x1c]    @ y1
+    ldr     r8, [r3, #0x20]    @ rbg5
+    mov     r6, r2             @ saved_w
+    ldr     r2, [r3, #0x04]    @ pal
+    ldr     r10,[r3, #0x08]    @ u
+    ldr     r11,[r3, #0x10]    @ u_msk
+    sub     r5, r7, r5         @ h
+    mov     r7, r8, lsl #(8+2) @ 0bbb bb00 0ggg gg00 0rrr rr00 0000 0000
+    mov     r8, r8, lsl #(16+2)@ 0ggg gg00 ...
+    mov     r3, r11,lsr #10    @ u_msk >> 10
+    orr     r6, r3, r6, lsl #16 @ (w << 16) | u_mask
+    mov     r3, r6             @ r3 = per-line pixel counter, reloaded from r6 on every line
+    and     r10,r10,r6         @ u &= r6 (the w bits of r6 sit at bit 16 and above)

+3: @ line_loop:
+.if \bpp == 4
+    add     r9, r1, r10, lsr #1
+.elseif \bpp == 8
+    add     r9, r1, r10
+    pld_    r9, #2048
+.endif
+0:
+.if \bpp == 4
+    ldrb    r4, [r1, r10, lsr #1] @ byte holding two 4-bit texels
+.elseif \bpp == 8
+    ldrb    r4, [r1, r10]
+.endif
+    subs    r3, r3, #1<<16     @ one pixel less to do on this line
+    bmi     1f                 @ counter went negative: line finished
+.if \bpp == 4
+    tst     r10, #1
+    movne   r4, r4, lsr #3     @ odd u: high nibble, already scaled by 2
+    addeq   r4, r4, r4         @ even u: low nibble, scaled by 2
+    and     r4, r4, #0x1e      @ byte offset into the 16-entry palette
+.elseif \bpp == 8
+    add     r4, r4, r4         @ <<= 1
+.endif
+    ldrsh   r12,[r2, r4]       @ palette entry, sign-extended so bit 15 shows up in N
+    add     r10,r10,#1
+    and     r10,r10,r6
+    add     r0, r0, #2
+    tst     r12,r12            @ Z: entry is zero, N: entry has its msb set
+    beq     0b                 @ zero entry: nothing is written for this pixel
+.if \light && \semit != 1
+    modulate r12, r7, r8, r4, r9, lr
+.endif
+.if \semit == 0
+    ldrhmi  lr, [r0, #-2]      @ msb set: fetch the dst pixel for blending
+    strhpl  r12,[r0, #-2]      @ msb clear: store as is
+    bpl     0b
+    semitrans0 r12, lr, r9
+.elseif \light && \semit == 1
+    and     r4,  r12, #0x001f
+    and     r9,  r12, #0x03e0
+    and     r12, r12, #0x7c00
+    ldrhmi  r11, [r0, #-2]     @ dst pixel, loaded only when the texel msb is set
+    smulbb  r4,  r4,  r7       @ -> 0000 0000 0000 orrr  rrxx xxxx xxxx xxxx
+    smulbt  r9,  r9,  r8       @ -> 0000 000o gggg gxxx  xxxx xxxx xxx0 0000
+    smulbt  r12, r12, r7       @ -> 00ob bbbb xxxx xxxx  xxxx xx00 0000 0000
+    and     r8,  r11, #0x001f
+    and     lr,  r11, #0x03e0
+    and     r11, r11, #0x7c00
+    addmi   r4,  r4,  r8,  lsl #14 @ msb set: add the dst channels at product scale
+    addmi   r9,  r9,  lr,  lsl #14
+    addmi   r12, r12, r11, lsl #14
+    usat    r4,  #5,  r4,  asr #14 @ scale back and clamp each channel to 0..31
+    usat    r9,  #5,  r9,  asr #19
+    usat    r12, #5,  r12, asr #24
+    orrmi   r4,  r4,  #0x8000
+    orr     r4,  r4,  r9,  lsl #5
+    orr     r12, r4,  r12, lsl #10
+    mov     r8,  r7,  lsl #8       @ restore r8
+.endif
+    strh    r12,[r0, #-2]
+    b       0b
+1:
+    add     r0, r0, #2048
+    add     r1, r1, #2048
+    sub     r0, r0, r6, lsr #15    @ dst
+    sub     r10,r10,r6, lsr #16    @ u
+    mov     r3, r6                 @ (w << 16) | u_mask
+    and     r10,r6, r10
+    subs    r5, r5, #1
+    and     r10,r10,#0xff
+    bgt     3b @ line_loop

+    ldmfd   sp!, {r4-r11,pc}
+    .cfi_endproc
+.endm
+
+sprite_driver_l_st sprite_driver_4bpp_l0_std_asm, 4, 0, -1
+sprite_driver_l_st sprite_driver_8bpp_l0_std_asm, 8, 0, -1
+
+#ifdef HAVE_ARMV6
+
+sprite_driver_l_st sprite_driver_4bpp_l0_st0_asm, 4, 0,  0
+sprite_driver_l_st sprite_driver_4bpp_l1_std_asm, 4, 1, -1
+sprite_driver_l_st sprite_driver_4bpp_l1_st0_asm, 4, 1,  0
+sprite_driver_l_st sprite_driver_4bpp_l1_st1_asm, 4, 1,  1
+sprite_driver_l_st sprite_driver_8bpp_l0_st0_asm, 8, 0,  0
+sprite_driver_l_st sprite_driver_8bpp_l1_std_asm, 8, 1, -1
+sprite_driver_l_st sprite_driver_8bpp_l1_st0_asm, 8, 1,  0
+sprite_driver_l_st sprite_driver_8bpp_l1_st1_asm, 8, 1,  1
+
+#endif // HAVE_ARMV6
+
+
+@ (u16 *d, const void *s, int width, const gpu_unai_inner_t *)
+FUNCTION(sprite_driver_16bpp_asm): @ copies w x h 16-bit texels, skipping zero ones
+    .cfi_startproc
+    stmfd   sp!, {r4-r6,lr}
+    .cfi_def_cfa_offset 4*4
+    .cfi_rel_offset lr, 4*3
+    ldr     r4, [r3, #0x1c]    @ y1
+    ldr     r5, [r3, #0x18]    @ y0
+    mov     r12,      #0x00ff
+    orr     r12, r12, #0xff00  @ mask
+    mov     r6, r2             @ saved_w
+    sub     r5, r4, r5         @ h = y1 - y0
+    sub     r5, r5, #1         @ h-1
+3: @ line_loop:
+    pld_    r1, #2048
+    mov     r2, r6             @ w
+    tst     r1, #2             @ is bit 1 of src clear (4-byte boundary)?
+    beq     0f                 @ yes: go straight to the 4-pixel loop
+2: @ 1pix:
+    ldrh    lr, [r1], #2
+    add     r0, r0, #2
+    sub     r2, r2, #1
+    tst     lr, lr
+    strhne  lr, [r0, #-2]      @ zero texel: dst is left untouched
+0:
+    subs    r2, r2, #4
+    bmi     1f                 @ fewer than 4 pixels left
+0:
+    ldmia   r1!, {r3,r4}       @ four texels in two words
+    add     r0, r0, #2*4
+    pld_    r1, #24
+    tst     r3, r12            @ low halfword of r3
+    strhne  r3, [r0, #-8]
+    movs    lr, r3, lsr #16    @ high halfword of r3
+    strhne  lr, [r0, #-6]
+    tst     r4, r12            @ low halfword of r4
+    strhne  r4, [r0, #-4]
+    movs    lr, r4, lsr #16    @ high halfword of r4
+    strhne  lr, [r0, #-2]
+    subs    r2, r2, #4
+    bpl     0b
+1:
+    adds    r2, r2, #4         @ undo the last subtraction: r2 = pixels still to do
+    bne     2b @ 1pix
+    add     r0, r0, #2048      @ advance both pointers by 2048 bytes ...
+    add     r1, r1, #2048
+    sub     r0, r0, r6, lsl #1 @ dst
+    sub     r1, r1, r6, lsl #1 @ ... minus the w*2 bytes consumed on this line
+    subs    r5, r5, #1
+    bpl     3b @ line_loop

+    ldmfd   sp!, {r4-r6,pc}
+    .cfi_endproc
+
+
+.macro poly_4_8bpp_asm_m name bpp light semit
+FUNCTION(\name): @ (void *d, const gpu_unai_inner_t *inn, int count)
+    .cfi_startproc
+    stmfd   sp!, {r4-r11,lr}
+    .cfi_def_cfa_offset 4*9
+    .cfi_rel_offset lr, 4*8
      add     r12, r1, #4
      ldmia   r12, {r3, r4, r7, r12, lr} @ clut, u, v, u_msk, v_msk
      ldr     r5, [r1, #0x18]    @ u_inc
-.if \need_rgb
+.if \light
      ldr     r10,[r1, #0x24]    @ rbg
  .endif
-    mov     r6, r12
+    mov     r6, r12            @ u_msk
      ldr     r12,[r1, #0x1c]    @ v_inc
-.if \need_rgb
+.if \light
      mov     r10,r10,lsl #7     @ 0bbb bbbb 0ggg gggg 0rrr rrrr r000 0000
      bic     r10,r10,#1<<23
      bic     r10,r10,#1<<15
-    mov     r11,r10,lsl #8
+    mov     r11,r10,lsl #8     @ 0ggg gggg ...
  .endif
      and     r4, r4, r6
      and     lr, lr, r7         @ v_msk & v
      and     lr, lr, #0xff<<10
      tst     r12,r12
-    bne     \v_target
+    bne     v_\name
      ldr     r1, [r1]           @ src
-    mov     r7, r4, lsr #13
+    mov     r7, r4, lsr #(13 - (\bpp / 8 * 3))
      add     r1, r1, lr, lsl #1
-    add     r12,r1, r7, lsl #2
-    pld_    r12,#2048
-.endm
-
-.global poly_4bpp_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
-poly_4bpp_asm:
-    .cfi_startproc
-    stmfd   sp!, {r4-r7,lr}
-    .cfi_def_cfa_offset 4*5
-    .cfi_rel_offset lr, 4*4
-    poly_4bpp_init poly_4bpp_v_asm 0
+#ifdef HAVE_ARMV6
+    add     r12,r1, r7, lsl #(2 - (\bpp / 8 * 2))
+    pld_    r12,#2048          @ next line
+#endif
  0:
+.if \light || \semit >= 0
+    mov     r7, r4, lsr #(13 - (\bpp / 8 * 3))
+    subs    r2, r2, #1
+    bmi     1f
+.endif
+.if \bpp == 4
      ldr     lr, [r1, r7, lsl #2]
      lsr     r12,r4, #8
      and     r12,r12,#0x1c
@@ -250,135 +460,64 @@ poly_4bpp_asm:
      mov     r12,lr, ror r12
      add     r4, r4, r5
      and     r12,r12,#0x1e
-    and     r4, r4, r6
-    ldrh    r12,[r3, r12]
-    add     r0, r0, #2
-    mov     r7, r4, lsr #13
-    tst     r12,r12
-    strhne  r12,[r0, #-2]
-    subs    r2, r2, #1
-    bgt     0b
-
-    ldmfd   sp!, {r4-r7,pc}
-
-poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
-    stmfd   sp!, {r8-r9}
-    .cfi_def_cfa_offset 4*7
-    .cfi_rel_offset lr, 4*6
-    ldr     r9, [r1, #0x14]    @ v_msk
-    ldr     r1, [r1]           @ src
-    mov     r8, r12            @ v_inc
-    mov     r12,r4, lsr #13
-    add     lr, r1, lr, lsl #1
-    and     r9, r9, #0xff<<10  @ v_msk_final
-0:
-    ldr     lr, [lr, r12, lsl #2]
-    lsr     r12,r4, #8
-    and     r12,r12,#0x1c
-    sub     r12,r12,#1
-    mov     r12,lr, ror r12
+.else
+    ldrb    r12,[r1, r7]
      add     r4, r4, r5
-    and     r12,r12,#0x1e
+    add     r12,r12,r12
+.endif
      and     r4, r4, r6
-    ldrh    r12,[r3, r12]
+    ldrsh   r12,[r3, r12]
      add     r0, r0, #2
-    add     r7, r7, r8
-    and     lr, r7, r9
+.if !\light && \semit < 0
+    mov     r7, r4, lsr #(13 - (\bpp / 8 * 3))
      tst     r12,r12
-    add     lr, r1, lr, lsl #1
      strhne  r12,[r0, #-2]
-    mov     r12,r4, lsr #13
      subs    r2, r2, #1
      bgt     0b
-
-    ldmfd   sp!, {r8-r9}
-    ldmfd   sp!, {r4-r7,pc}
-    .cfi_endproc
-
-
-#ifdef HAVE_ARMV6
-
-.macro modulate rp mbr mg t0 t1 t2
-    and     \t0, \rp, #0x001f
-    and     \t1, \rp, #0x03e0
-    and     \t2, \rp, #0x7c00
-    smulbb  \t0, \t0, \mbr       @ -> 0000 0000 0000 orrr  rrxx xxxx xxxx xxxx
-    smulbt  \t1, \t1, \mg        @ -> 0000 000o gggg gxxx  xxxx xxxx xxx0 0000
-    smulbt  \t2, \t2, \mbr       @ -> 00ob bbbb xxxx xxxx  xxxx xx00 0000 0000
-    ands    \rp, \rp, #0x8000    @ retain msb + semi-transparency test
-    usat    \t0, #5, \t0, asr #14
-    usat    \t1, #5, \t1, asr #19
-    usat    \t2, #5, \t2, asr #24
-    orr     \rp, \rp, \t0
-    orr     \rp, \rp, \t1, lsl #5
-    orr     \rp, \rp, \t2, lsl #10
-.endm
-
-@ http://www.slack.net/~ant/info/rgb_mixing.html
-@ p0 = (p0 + p1) / 2; p1 |= 0x8000
-@ msb of input p0 is assumed to be set
-.macro semitrans0 p0 p1 t
-    eor     \t,  \p0, \p1
-    and     \t,  \t, #0x0420
-    sub     \p0, \p0, \t
-    orr     \p1, \p1, #0x8000
-    uhadd16 \p0, \p0, \p1
-.endm
-
-.macro poly_4bpp_asm_m name semitrans
-.global \name @ (void *d, const struct gpu_unai_inner_t *inn, int count)
-\name:
-    .cfi_startproc
-    stmfd   sp!, {r4-r11,lr}
-    .cfi_def_cfa_offset 4*9
-    .cfi_rel_offset lr, 4*8
-    poly_4bpp_init v_\name 1
-0:
-    mov     r12,r4, lsr #13
-    subs    r2, r2, #1
-    bmi     1f
-    ldr     lr, [r1, r12, lsl #2]
-    lsr     r12,r4, #8
-    and     r12,r12,#0x1c
-    sub     r12,r12,#1
-    mov     r12,lr, ror r12
-    add     r4, r4, r5
-    and     r12,r12,#0x1e
-    and     r4, r4, r6
-    ldrh    r12,[r3, r12]
-    add     r0, r0, #2
+    @ end
+.else
      tst     r12,r12
      beq     0b
+.if \light && \semit != 1
      modulate r12, r10, r11, r7, r8, lr
-.if \semitrans < 0
-    @ no semi-transparency
-.elseif \semitrans == 0
-    ldrhne  r7, [r0, #-2]
-    strheq  r12,[r0, #-2]
-    beq     0b
+.endif
+.if \semit == 0
+    ldrhmi  r7, [r0, #-2]
+    strhpl  r12,[r0, #-2]
+    bpl     0b
      semitrans0 r12, r7, lr
  .endif
      strh    r12,[r0, #-2]
      b       0b
+.endif                         @ \light || \semit >= 0
  1:
      ldmfd   sp!, {r4-r11,pc}
  
  v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+.if \light || \semit >= 0
      sub     sp, sp, #4*2
+    stmia   sp, {r5,r6}
      .cfi_def_cfa_offset 4*(9+2)
      .cfi_rel_offset lr, 4*(8+2)
+.endif
      ldr     r9, [r1, #0x14]    @ v_msk
      ldr     r1, [r1]           @ src
      mov     r8, r12            @ v_inc
-    mov     r12,r4, lsr #13
      and     r9, r9, #0xff<<10  @ v_msk_final
-    stmia   sp, {r5,r6}
+.if !\light && \semit < 0
+    and     lr, r7, r9
+    mov     r12,r4, lsr #(13 - (\bpp / 8 * 3))
+    add     lr, r1, lr, lsl #1
+.endif
  0:
+.if \light || \semit >= 0
      and     lr, r7, r9
-    mov     r12,r4, lsr #13
+    mov     r12,r4, lsr #(13 - (\bpp / 8 * 3))
      add     lr, r1, lr, lsl #1
      subs    r2, r2, #1
      bmi     1f
+.endif
+.if \bpp == 4
      ldr     lr, [lr, r12, lsl #2]
      lsr     r12,r4, #8
      and     r12,r12,#0x1c
@@ -386,32 +525,60 @@ v_\name: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
      mov     r12,lr, ror r12
      add     r4, r4, r5
      and     r12,r12,#0x1e
+.else
+    ldrb    r12,[lr, r12]
+    add     r4, r4, r5
+    add     r12,r12,r12
+.endif
      and     r4, r4, r6
-    ldrh    r12,[r3, r12]
+    ldrsh   r12,[r3, r12]
      add     r0, r0, #2
      add     r7, r7, r8
+.if !\light && \semit < 0
+    and     lr, r7, r9
+    tst     r12,r12
+    add     lr, r1, lr, lsl #1
+    strhne  r12,[r0, #-2]
+    mov     r12,r4, lsr #(13 - (\bpp / 8 * 3))
+    subs    r2, r2, #1
+    bgt     0b
+    @ end
+.else
      tst     r12,r12
      beq     0b
+.if \light && \semit != 1
      modulate r12, r10, r11, r5, r6, lr
-.if \semitrans < 0
-    @ no semi-transparency
-.elseif \semitrans == 0
-    ldrhne  r7, [r0, #-2]
-    strheq  r12,[r0, #-2]
-    beq     0b
-    semitrans0 r12, r7, lr
+.endif
+.if \semit == 0
+    ldrhmi  r6, [r0, #-2]
+    strhpl  r12,[r0, #-2]
+    ldmiapl sp, {r5,r6}
+    bpl     0b
+    semitrans0 r12, r6, lr
  .endif
      strh    r12,[r0, #-2]
      ldmia   sp, {r5,r6}
      b       0b
+.endif                         @ \light || \semit >= 0
  1:
+.if \light || \semit >= 0
      add     sp, sp, #4*2
+.endif
      ldmfd   sp!, {r4-r11,pc}
      .cfi_endproc
  .endm
  
-poly_4bpp_asm_m poly_4bpp_l_asm,    -1
-poly_4bpp_asm_m poly_4bpp_l_st0_asm, 0
+poly_4_8bpp_asm_m poly_4bpp_asm,       4, 0, -1
+poly_4_8bpp_asm_m poly_8bpp_asm,       8, 0, -1
+
+#ifdef HAVE_ARMV6
+
+poly_4_8bpp_asm_m poly_4bpp_l0_st0_asm, 4, 0,  0
+poly_4_8bpp_asm_m poly_4bpp_l1_std_asm, 4, 1, -1
+poly_4_8bpp_asm_m poly_4bpp_l1_st0_asm, 4, 1,  0
+poly_4_8bpp_asm_m poly_8bpp_l0_st0_asm, 8, 0,  0
+poly_4_8bpp_asm_m poly_8bpp_l1_std_asm, 8, 1, -1
+poly_4_8bpp_asm_m poly_8bpp_l1_st0_asm, 8, 1,  0
  
  #endif // HAVE_ARMV6
  
diff --git a/plugins/gpu_unai/gpu_arm.h b/plugins/gpu_unai/gpu_arm.h

index 027aa53..6b8c81a 100644 (file)
--- a/plugins/gpu_unai/gpu_arm.h
+++ b/plugins/gpu_unai/gpu_arm.h
@@ -6,17 +6,49 @@ extern "C" {
  #endif
  
  struct gpu_unai_inner_t;
-struct spriteDriverArg;
  
  void sprite_driver_4bpp_asm(void *pPixel, const u8 *pTxt_base,
-       u32 count, const struct spriteDriverArg *arg);
+       u32 count, const struct gpu_unai_inner_t *inn);
  void sprite_driver_8bpp_asm(void *pPixel, const u8 *pTxt_base,
-       u32 count, const struct spriteDriverArg *arg);
+       u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_16bpp_asm(void *pPixel, const void *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
  void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
  
  void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
-void poly_4bpp_l_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
-void poly_4bpp_l_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_8bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+
+void sprite_driver_4bpp_l0_std_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l0_std_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+
+#ifdef HAVE_ARMV6
+
+void sprite_driver_4bpp_l0_st0_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_4bpp_l1_std_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_4bpp_l1_st0_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_4bpp_l1_st1_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l0_st0_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l1_std_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l1_st0_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+void sprite_driver_8bpp_l1_st1_asm(void *pPixel, const u8 *pTxt_base,
+       u32 count, const struct gpu_unai_inner_t *inn);
+void poly_4bpp_l0_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_l1_std_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_l1_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_8bpp_l0_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_8bpp_l1_std_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_8bpp_l1_st0_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+
+#endif // HAVE_ARMV6
  
  #ifdef __cplusplus
  }
diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h

index 87324b9..14d6644 100644 (file)
--- a/plugins/gpu_unai/gpu_inner.h
+++ b/plugins/gpu_unai/gpu_inner.h
@@ -362,19 +362,12 @@ const PT gpuTileSpanDrivers[32] = {
  ///////////////////////////////////////////////////////////////////////////////
  //  GPU Sprites innerloops generator
  
-// warning: gpu_arm.S asm uses this, update it if you change this
-typedef struct spriteDriverArg {
-       const le16_t *CBA;             // 00
-       u32 u0, v0, u0_mask, v0_mask;  // 04 08 0c 10
-       s32 y0, y1, lines, li;         // 14
-} spriteDriverArg;
-
  typedef void (*PS)(le16_t *pPixel, u32 count, const u8 *pTxt,
-       const spriteDriverArg *arg);
+       const gpu_unai_inner_t &inn);
  
  template<int CF>
  static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
-       const spriteDriverArg *arg)
+       const gpu_unai_inner_t &inn)
  {
         // Blend func can save an operation if it knows uSrc MSB is unset.
         //  Untextured prims can always skip (source color always comes with MSB=0).
@@ -383,25 +376,26 @@ static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt
  
         uint_fast16_t uSrc, uDst, srcMSB;
         bool should_blend;
-       u32 u0_mask = arg->u0_mask;
+       u32 u0_mask = inn.u_msk >> 10;
  
         u8 r5, g5, b5;
         if (CF_LIGHT) {
-               r5 = gpu_unai.inn.r5;
-               g5 = gpu_unai.inn.g5;
-               b5 = gpu_unai.inn.b5;
+               r5 = inn.r5;
+               g5 = inn.g5;
+               b5 = inn.b5;
         }
  
+       const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = inn.CBA;
+       const u32 v0_mask = inn.v_msk >> 10;
+       s32 y0 = inn.y0, y1 = inn.y1, li = inn.ilace_mask;
+       u32 u0_ = inn.u, v0 = inn.v;
+
         if (CF_TEXTMODE==3) {
-               // Texture is accessed byte-wise, so adjust mask if 16bpp
+               // Texture is accessed byte-wise, so adjust to 16bpp
+               u0_ <<= 1;
                 u0_mask <<= 1;
         }
  
-       const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = arg->CBA;
-       const u32 v0_mask = arg->v0_mask;
-       s32 y0 = arg->y0, y1 = arg->y1, li = arg->li;
-       u32 u0_ = arg->u0, v0 = arg->v0;
-
         for (; y0 < y1; ++y0, pPixel += FRAME_WIDTH, ++v0)
         {
           if (y0 & li) continue;
@@ -454,39 +448,45 @@ endsprite:
  #ifdef __arm__
  #include "gpu_arm.h"
  
-static void Sprite4bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
-        const spriteDriverArg *arg)
+template<int CF>
+static void SpriteMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+        const gpu_unai_inner_t &inn)
  {
  #if 1
-       s32 lines = arg->lines;
-       u32 u1m = arg->u0 + count - 1, v1m = arg->v0 + lines - 1;
-       if (u1m == (u1m & arg->u0_mask) && v1m == (v1m & arg->v0_mask)) {
-               pTxt_base += arg->u0 / 2 + arg->v0 * 2048;
-               sprite_driver_4bpp_asm(pPixel, pTxt_base, count, arg);
-       }
-       else
+  s32 lines = inn.y1 - inn.y0;
+  u32 u1m = inn.u + count - 1, v1m = inn.v + lines - 1;
+  if (u1m == (u1m & (inn.u_msk >> 10)) && v1m == (v1m & (inn.v_msk >> 10))) {
+    const u8 *pTxt = pTxt_base + inn.v * 2048;
+    switch (CF) {
+    case 0x20: sprite_driver_4bpp_asm (pPixel, pTxt + inn.u / 2, count, &inn); return;
+    case 0x40: sprite_driver_8bpp_asm (pPixel, pTxt + inn.u,     count, &inn); return;
+    case 0x60: sprite_driver_16bpp_asm(pPixel, pTxt + inn.u * 2, count, &inn); return;
+    }
+  }
+  if (v1m == (v1m & (inn.v_msk >> 10))) {
+    const u8 *pTxt = pTxt_base + inn.v * 2048;
+    switch (CF) {
+    case 0x20: sprite_driver_4bpp_l0_std_asm(pPixel, pTxt, count, &inn); return;
+    case 0x40: sprite_driver_8bpp_l0_std_asm(pPixel, pTxt, count, &inn); return;
+#ifdef HAVE_ARMV6
+    case 0x21: sprite_driver_4bpp_l1_std_asm(pPixel, pTxt, count, &inn); return;
+    case 0x22: sprite_driver_4bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return;
+    case 0x23: sprite_driver_4bpp_l1_st0_asm(pPixel, pTxt, count, &inn); return;
+    case 0x2b: sprite_driver_4bpp_l1_st1_asm(pPixel, pTxt, count, &inn); return;
+    case 0x41: sprite_driver_8bpp_l1_std_asm(pPixel, pTxt, count, &inn); return;
+    case 0x42: sprite_driver_8bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return;
+    case 0x43: sprite_driver_8bpp_l1_st0_asm(pPixel, pTxt, count, &inn); return;
+    case 0x4b: sprite_driver_8bpp_l1_st1_asm(pPixel, pTxt, count, &inn); return;
  #endif
-               gpuSpriteDriverFn<0x20>(pPixel, count, pTxt_base, arg);
-}
-
-static void Sprite8bppMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
-        const spriteDriverArg *arg)
-{
-#if 1
-       s32 lines = arg->lines;
-       u32 u1m = arg->u0 + count - 1, v1m = arg->v0 + lines - 1;
-       if (u1m == (u1m & arg->u0_mask) && v1m == (v1m & arg->v0_mask)) {
-               pTxt_base += arg->u0 + arg->v0 * 2048;
-               sprite_driver_8bpp_asm(pPixel, pTxt_base, count, arg);
-       }
-       else
+    }
+  }
  #endif
-               gpuSpriteDriverFn<0x40>(pPixel, count, pTxt_base, arg);
+  gpuSpriteDriverFn<CF>(pPixel, count, pTxt_base, inn);
  }
  #endif // __arm__
  
  static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base,
-       const spriteDriverArg *arg)
+       const gpu_unai_inner_t &inn)
  {
         #ifdef ENABLE_GPU_LOG_SUPPORT
                 fprintf(stdout,"SpriteNULL()\n");
@@ -502,29 +502,32 @@ static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base,
  #define TI(cf) gpuSpriteDriverFn<(cf)>
  #define TN     SpriteNULL
  #ifdef __arm__
-#define TA4(cf) Sprite4bppMaybeAsm
-#define TA8(cf) Sprite8bppMaybeAsm
+#define TA(cf) SpriteMaybeAsm<(cf)>
+#else
+#define TA(cf) TI(cf)
+#endif
+#ifdef HAVE_ARMV6
+#define TA6(cf) SpriteMaybeAsm<(cf)>
  #else
-#define TA4(cf) TI(cf)
-#define TA8(cf) TI(cf)
+#define TA6(cf) TI(cf)
  #endif
  #define TIBLOCK(ub) \
-       TN,             TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
-       TN,             TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
-       TN,             TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
-       TN,             TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
-       TA4((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
-       TN,             TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
-       TN,             TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
-       TN,             TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
-       TA8((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
-       TN,             TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
-       TN,             TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
-       TN,             TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
-       TI((ub)|0x60),  TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
-       TN,             TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
-       TN,             TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
-       TN,             TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f)
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TA((ub)|0x20), TA6((ub)|0x21),TA6((ub)|0x22),TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TN,            TN,            TI((ub)|0x2a), TA6((ub)|0x2b),TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
+       TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
+       TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
+       TA((ub)|0x40), TA6((ub)|0x41),TA6((ub)|0x42),TA6((ub)|0x43),TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+       TN,            TN,            TI((ub)|0x4a), TA6((ub)|0x4b),TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
+       TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
+       TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
+       TA((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+       TN,            TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
+       TN,            TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
+       TN,            TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f)
  
  const PS gpuSpriteDrivers[256] = {
         TIBLOCK(0<<8), TIBLOCK(1<<8)
@@ -533,8 +536,8 @@ const PS gpuSpriteDrivers[256] = {
  #undef TI
  #undef TN
  #undef TIBLOCK
-#undef TA4
-#undef TA8
+#undef TA
+#undef TA6
  
  ///////////////////////////////////////////////////////////////////////////////
  //  GPU Polygon innerloops generator
@@ -566,7 +569,7 @@ static noinline void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32
         const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
         bool should_blend;
  
-       u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.blit_mask;
+       u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.inn.blit_mask;
  
         if (!CF_TEXTMODE)
         {
@@ -758,9 +761,16 @@ template<int CF>
  static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
  {
         switch (CF) {
-       case 0x20: poly_4bpp_asm      (pDst, &gpu_unai.inn, count); break;
-       case 0x21: poly_4bpp_l_asm    (pDst, &gpu_unai.inn, count); break;
-       case 0x23: poly_4bpp_l_st0_asm(pDst, &gpu_unai.inn, count); break;
+       case 0x20: poly_4bpp_asm       (pDst, &gpu_unai.inn, count); break;
+       case 0x40: poly_8bpp_asm       (pDst, &gpu_unai.inn, count); break;
+#ifdef HAVE_ARMV6
+       case 0x21: poly_4bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break;
+       case 0x22: poly_4bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break;
+       case 0x23: poly_4bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break;
+       case 0x41: poly_8bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break;
+       case 0x42: poly_8bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break;
+       case 0x43: poly_8bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break;
+#endif
         default:   gpuPolySpanFn<CF>(gpu_unai, pDst, count);
         }
  }
@@ -795,11 +805,11 @@ typedef void (*PP)(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count);
         TN,            TN,            TI((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
         TN,            TN,            TI((ub)|0x12), TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
         TN,            TN,            TI((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
-       TA((ub)|0x20), TA6((ub)|0x21),TI((ub)|0x22), TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TA((ub)|0x20), TA6((ub)|0x21),TA6((ub)|0x22),TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
         TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
         TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
         TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
-       TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+       TA((ub)|0x40), TA6((ub)|0x41),TA6((ub)|0x42),TA6((ub)|0x43),TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
         TN,            TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
         TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
         TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
diff --git a/plugins/gpu_unai/gpu_raster_polygon.h b/plugins/gpu_unai/gpu_raster_polygon.h

index 1457afd..0479d0f 100644 (file)
--- a/plugins/gpu_unai/gpu_raster_polygon.h
+++ b/plugins/gpu_unai/gpu_raster_polygon.h
@@ -352,9 +352,9 @@ void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
                                 continue;
  
                         le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)];
-                       int li=gpu_unai.ilace_mask;
-                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+                       int li=gpu_unai.inn.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
  
                         for (; loop1; --loop1, ya++, PixelBase += FRAME_WIDTH,
                                         x3 += dx3, x4 += dx4 )
@@ -663,9 +663,9 @@ void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
                                 continue;
  
                         le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)];
-                       int li=gpu_unai.ilace_mask;
-                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+                       int li=gpu_unai.inn.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
  
                         for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
                                         x3 += dx3, x4 += dx4,
@@ -1008,9 +1008,9 @@ void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad
                                 continue;
  
                         le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)];
-                       int li=gpu_unai.ilace_mask;
-                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+                       int li=gpu_unai.inn.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
  
                         for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
                                         x3 += dx3, x4 += dx4,
@@ -1403,9 +1403,9 @@ void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_qua
                                 continue;
  
                         le16_t* PixelBase = &gpu_unai.vram[FRAME_OFFSET(0, ya)];
-                       int li=gpu_unai.ilace_mask;
-                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+                       int li=gpu_unai.inn.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
  
                         for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
                                         x3 += dx3, x4 += dx4,
diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h

index 13d783e..e314e97 100644 (file)
--- a/plugins/gpu_unai/gpu_raster_sprite.h
+++ b/plugins/gpu_unai/gpu_raster_sprite.h
@@ -61,31 +61,16 @@ void gpuDrawS(PtrUnion packet, const PS gpuSpriteDriver, s32 *w_out, s32 *h_out)
         *w_out = x1;
         *h_out = y1 - y0;
  
+       le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
+
         gpu_unai.inn.r5 = packet.U1[0] >> 3;
         gpu_unai.inn.g5 = packet.U1[1] >> 3;
         gpu_unai.inn.b5 = packet.U1[2] >> 3;
-
-       le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
-       const int li=gpu_unai.ilace_mask;
-       //const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-       //const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
-       unsigned int tmode = gpu_unai.TEXT_MODE >> 5;
-       u8* pTxt_base = (u8*)gpu_unai.inn.TBA;
-
-       // Texture is accessed byte-wise, so adjust idx if 16bpp
-       if (tmode == 3) u0 <<= 1;
-
-       spriteDriverArg arg;
-       arg.CBA = gpu_unai.inn.CBA;
-       arg.u0 = u0;
-       arg.v0 = v0;
-       arg.u0_mask = gpu_unai.TextureWindow[2];
-       arg.v0_mask = gpu_unai.TextureWindow[3];
-       arg.y0 = y0;
-       arg.y1 = y1;
-       arg.lines = y1 - y0;
-       arg.li = li;
-       gpuSpriteDriver(Pixel, x1, pTxt_base, &arg);
+       gpu_unai.inn.u = u0;
+       gpu_unai.inn.v = v0;
+       gpu_unai.inn.y0 = y0;
+       gpu_unai.inn.y1 = y1;
+       gpuSpriteDriver(Pixel, x1, (u8 *)gpu_unai.inn.TBA, gpu_unai.inn);
  }
  
  void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver, s32 *w_out, s32 *h_out)
@@ -118,9 +103,9 @@ void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver, s32 *w_out, s32 *h_ou
  
         const u16 Data = GPU_RGB16(le32_to_u32(packet.U4[0]));
         le16_t *Pixel = &gpu_unai.vram[FRAME_OFFSET(x0, y0)];
-       const int li=gpu_unai.ilace_mask;
-       const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
-       const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+       const int li=gpu_unai.inn.ilace_mask;
+       const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+       const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
  
         for (; y0<y1; ++y0) {
                 if (!(y0&li) && (y0&pi)!=pif)
diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h

index 722041a..fb30eec 100644 (file)
--- a/plugins/gpu_unai/gpu_unai.h
+++ b/plugins/gpu_unai/gpu_unai.h
@@ -204,9 +204,16 @@ struct gpu_unai_inner_t {
         // 22.10 Fixed-pt texture coords, mask, scanline advance
         // NOTE: U,V are no longer packed together into one u32, this proved to be
         //  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
-       u32 u, v;                 // 08
-       u32 u_msk, v_msk;         // 10
-       s32 u_inc, v_inc;         // 18
+       u32 u, v;                 // 08 not fractional for sprites
+       u32 u_msk, v_msk;         // 10 always 22.10
+       union {
+         struct {
+           s32 u_inc, v_inc;     // 18 poly uv increment, 22.10
+         };
+         struct {
+           s32 y0, y1;           // 18 sprite y range
+         };
+       };
  
         // Color for flat-shaded, texture-blended prims
         u8  r5, g5, b5, pad5;     // 20 5-bit light for undithered prims
@@ -222,6 +229,19 @@ struct gpu_unai_inner_t {
  
         // Color for flat-shaded, untextured prims
         u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+
+       u8 blit_mask;           // Determines what pixels to skip when rendering.
+                               //  Only useful on low-resolution devices using
+                               //  a simple pixel-dropping downscaler for PS1
+                               //  high-res modes. See 'pixel_skip' option.
+
+       u8 ilace_mask;          // Determines what lines to skip when rendering.
+                               //  Normally 0 when PS1 240 vertical res is in
+                               //  use and ilace_force is 0. When running in
+                               //  PS1 480 vertical res on a low-resolution
+                               //  device (320x240), will usually be set to 1
+                               //  so odd lines are not rendered. (Unless future
+                               //  full-screen scaling option is in use ..TODO)
  };
  
  struct gpu_unai_t {
@@ -297,20 +317,6 @@ struct gpu_unai_t {
         // End of inner Loop parameters
         ////////////////////////////////////////////////////////////////////////////
  
-
-       u8 blit_mask;           // Determines what pixels to skip when rendering.
-                               //  Only useful on low-resolution devices using
-                               //  a simple pixel-dropping downscaler for PS1
-                               //  high-res modes. See 'pixel_skip' option.
-
-       u8 ilace_mask;          // Determines what lines to skip when rendering.
-                               //  Normally 0 when PS1 240 vertical res is in
-                               //  use and ilace_force is 0. When running in
-                               //  PS1 480 vertical res on a low-resolution
-                               //  device (320x240), will usually be set to 1
-                               //  so odd lines are not rendered. (Unless future
-                               //  full-screen scaling option is in use ..TODO)
-
         bool prog_ilace_flag;   // Tracks successive frames for 'prog_ilace' option
  
         u8 BLEND_MODE;
diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp

index 53a1b1d..9f72611 100644 (file)
--- a/plugins/gpu_unai/gpulib_if.cpp
+++ b/plugins/gpu_unai/gpulib_if.cpp
@@ -161,7 +161,7 @@ static uint16_t *get_downscale_buffer(int *x, int *y, int *w, int *h, int *vram_
      lines = *h;
  
      // Ensure start at a non-skipped line
-    while (*y & gpu_unai.ilace_mask) ++*y;
+    while (*y & gpu_unai.inn.ilace_mask) ++*y;
    }
  
    unsigned int fb_offset_src = (*y * dstride + *x) & fb_mask;
@@ -252,7 +252,7 @@ int renderer_init(void)
    // sprite-span functions, perhaps unnecessarily. No Abe Oddysey hack was
    // present in latest PCSX4ALL sources we were using.
    //gpu_unai.config.enableAbbeyHack = gpu_unai_config_ext.abe_hack;
-  gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+  gpu_unai.inn.ilace_mask = gpu_unai.config.ilace_force;
  
  #if defined(GPU_UNAI_USE_INT_DIV_MULTINV) || (!defined(GPU_UNAI_NO_OLD) && !defined(GPU_UNAI_USE_FLOATMATH))
    // s_invTable
@@ -285,13 +285,13 @@ void renderer_finish(void)
  
  void renderer_notify_res_change(void)
  {
-  gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+  gpu_unai.inn.ilace_mask = gpu_unai.config.ilace_force;
  
  #ifndef HAVE_PRE_ARMV7 /* XXX */
    if (gpu_unai.config.scale_hires)
  #endif
    {
-    gpu_unai.ilace_mask |= !!(gpu.status & PSX_GPU_STATUS_INTERLACE);
+    gpu_unai.inn.ilace_mask |= !!(gpu.status & PSX_GPU_STATUS_INTERLACE);
    }
  
    /*
author	notaz <notasas@gmail.com>
	Sun, 1 Dec 2024 16:14:18 +0000 (18:14 +0200)
committer	notaz <notasas@gmail.com>
	Mon, 2 Dec 2024 23:36:01 +0000 (01:36 +0200)
Makefile		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_arm.S		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_arm.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_inner.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_raster_polygon.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_raster_sprite.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpu_unai.h		patch \| blob \| blame \| history
plugins/gpu_unai/gpulib_if.cpp		patch \| blob \| blame \| history