.cfi_endproc
-.global poly_4bpp_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
-poly_4bpp_asm:
- .cfi_startproc
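+@ Common prologue for the 4bpp span routines: loads clut, u/v, masks and
+@ increments from gpu_unai_inner_t (r1) and branches to \v_target when
+@ v_inc != 0; with \need_rgb it also loads the 8-bit flat light (r8,g8,b8
+@ at +0x24) pre-shifted into r10/r11 for the modulate macro below.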
+.macro poly_4bpp_init v_target need_rgb
add r12, r1, #4
- stmfd sp!, {r4-r7,lr}
- .cfi_def_cfa_offset 4*5
- .cfi_rel_offset lr, 4*4
ldmia r12, {r3, r4, r7, r12, lr} @ clut, u, v, u_msk, v_msk
ldr r5, [r1, #0x18] @ u_inc
+.if \need_rgb
+ ldr r10,[r1, #0x24] @ rbg
+.endif
mov r6, r12
ldr r12,[r1, #0x1c] @ v_inc
+.if \need_rgb
+ mov r10,r10,lsl #7 @ 0bbb bbbb 0ggg gggg 0rrr rrrr r000 0000
+ bic r10,r10,#1<<23
+ bic r10,r10,#1<<15
+ mov r11,r10,lsl #8
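+    @ bits 15/23 (the g/b light lsbs) would land in the sign position of the
+    @ 16-bit halves used below; smulbb/smulbt multiply signed, so clear them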
+.endif
and r4, r4, r6
and lr, lr, r7 @ v_msk & v
and lr, lr, #0xff<<10
tst r12,r12
- bne poly_4bpp_asm_v
+ bne \v_target
ldr r1, [r1] @ src
mov r7, r4, lsr #13
add r1, r1, lr, lsl #1
add r12,r1, r7, lsl #2
pld_ r12,#2048
+.endm
+
+.global poly_4bpp_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
+poly_4bpp_asm:
+ .cfi_startproc
+ stmfd sp!, {r4-r7,lr}
+ .cfi_def_cfa_offset 4*5
+ .cfi_rel_offset lr, 4*4
+ poly_4bpp_init poly_4bpp_v_asm 0
0:
ldr lr, [r1, r7, lsl #2]
lsr r12,r4, #8
ldmfd sp!, {r4-r7,pc}
-poly_4bpp_asm_v: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+poly_4bpp_v_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
stmfd sp!, {r8-r9}
+ .cfi_def_cfa_offset 4*7
+ .cfi_rel_offset lr, 4*6
ldr r9, [r1, #0x14] @ v_msk
ldr r1, [r1] @ src
mov r8, r12 @ v_inc
ldmfd sp!, {r4-r7,pc}
.cfi_endproc
+
+#ifdef HAVE_ARMV6
+
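+@ modulate \rp (bgr555) by the 8-bit flat light prepared by poly_4bpp_init:
+@ \mbr carries r8<<7 in its low half and (roughly) b8<<7 in its high half,
+@ \mg carries (roughly) g8<<7 in its high half.  Per channel this computes
+@ out5 = sat5((in5 * light8) >> 7) and keeps the mask bit; a C sketch of the
+@ intent (the g/b light values lose their lsb, cross-component bits stay
+@ below the usat windows):
+@   r = min(31, (( p        & 0x1f) * r8) >> 7);
+@   g = min(31, (((p >>  5) & 0x1f) * g8) >> 7);
+@   b = min(31, (((p >> 10) & 0x1f) * b8) >> 7);
+@   out = (p & 0x8000) | (b << 10) | (g << 5) | r;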
+.macro modulate rp mbr mg t0 t1 t2
+ and \t0, \rp, #0x001f
+ and \t1, \rp, #0x03e0
+ and \t2, \rp, #0x7c00
+ smulbb \t0, \t0, \mbr @ -> 0000 0000 0000 orrr rrxx xxxx xxxx xxxx
+ smulbt \t1, \t1, \mg @ -> 0000 000o gggg gxxx xxxx xxxx xxx0 0000
+ smulbt \t2, \t2, \mbr @ -> 00ob bbbb xxxx xxxx xxxx xx00 0000 0000
+ and \rp, \rp, #0x8000
+ usat \t0, #5, \t0, asr #14
+ usat \t1, #5, \t1, asr #19
+ usat \t2, #5, \t2, asr #24
+ orr \rp, \rp, \t0
+ orr \rp, \rp, \t1, lsl #5
+ orr \rp, \rp, \t2, lsl #10
+.endm
+
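+@ 4bpp textured span with flat light modulation; used in place of the C
+@ gpuPolySpanFn for the 4bpp+light case (CF 0x21). Zero (transparent) texels
+@ are skipped before modulation.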
+.global poly_4bpp_l_asm @ (void *d, const struct gpu_unai_inner_t *inn, int count)
+poly_4bpp_l_asm:
+ .cfi_startproc
+ stmfd sp!, {r4-r11,lr}
+ .cfi_def_cfa_offset 4*9
+ .cfi_rel_offset lr, 4*8
+ poly_4bpp_init poly_4bpp_lv_asm 1
+0:
+ mov r12,r4, lsr #13
+ subs r2, r2, #1
+ bmi 1f
+ ldr lr, [r1, r12, lsl #2]
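+    @ (u>>8)&0x1c is the texel's bit offset within the word; ror by (offset-1)
+    @ plus "and #0x1e" yields texel*2, a byte offset into the halfword clut
+    @ (offset 0 wraps to ror #31 == lsl #1, which the mask still handles)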
+ lsr r12,r4, #8
+ and r12,r12,#0x1c
+ sub r12,r12,#1
+ mov r12,lr, ror r12
+ add r4, r4, r5
+ and r12,r12,#0x1e
+ and r4, r4, r6
+ ldrh r12,[r3, r12]
+ add r0, r0, #2
+ tst r12,r12
+ beq 0b
+ modulate r12, r10, r11, r7, r8, lr
+ strh r12,[r0, #-2]
+ b 0b
+1:
+ ldmfd sp!, {r4-r11,pc}
+
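+@ v_inc != 0 variant: v (r7) is stepped and re-masked every pixel; u_inc and
+@ u_msk (r5/r6) are spilled to the stack so modulate can use them as temps,
+@ and are reloaded after each lit pixel is written.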
+poly_4bpp_lv_asm: @ r3=clut, r4=u, r5=u_inc, r6=u_msk, r7=v, lr=v_masked
+ sub sp, sp, #4*2
+ .cfi_def_cfa_offset 4*(9+2)
+ .cfi_rel_offset lr, 4*(8+2)
+ ldr r9, [r1, #0x14] @ v_msk
+ ldr r1, [r1] @ src
+ mov r8, r12 @ v_inc
+ mov r12,r4, lsr #13
+ and r9, r9, #0xff<<10 @ v_msk_final
+ stmia sp, {r5,r6}
+0:
+ and lr, r7, r9
+ mov r12,r4, lsr #13
+ add lr, r1, lr, lsl #1
+ subs r2, r2, #1
+ bmi 1f
+ ldr lr, [lr, r12, lsl #2]
+ lsr r12,r4, #8
+ and r12,r12,#0x1c
+ sub r12,r12,#1
+ mov r12,lr, ror r12
+ add r4, r4, r5
+ and r12,r12,#0x1e
+ and r4, r4, r6
+ ldrh r12,[r3, r12]
+ add r0, r0, #2
+ add r7, r7, r8
+ tst r12,r12
+ beq 0b
+ modulate r12, r10, r11, r5, r6, lr
+ strh r12,[r0, #-2]
+ ldmia sp, {r5,r6}
+ b 0b
+1:
+ add sp, sp, #4*2
+ ldmfd sp!, {r4-r11,pc}
+ .cfi_endproc
+
+#endif // HAVE_ARMV6
+
@ vim:filetype=armasm
void sprite_4bpp_x16_asm(void *d, const void *s, void *pal, int lines);
void poly_4bpp_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
+void poly_4bpp_l_asm(void *d, const struct gpu_unai_inner_t *inn, int count);
#ifdef __cplusplus
}
#include "gpu_inner_quantization.h"
#include "gpu_inner_light.h"
+#include "arm_features.h"
#ifdef __arm__
#include "gpu_inner_blend_arm.h"
#include "gpu_inner_light_arm.h"
}
#ifdef __arm__
-static void PolySpan4bppAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+template<int CF>
+static void PolySpanAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
{
- poly_4bpp_asm(pDst, &gpu_unai.inn, count);
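+  // the CF cases here must match the entries routed to TA/TA6 in the
+  // dispatch table below (0x20: 4bpp texture, 0x21: 4bpp texture + light)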
+ switch (CF) {
+ case 0x20: poly_4bpp_asm (pDst, &gpu_unai.inn, count); break;
+ case 0x21: poly_4bpp_l_asm(pDst, &gpu_unai.inn, count); break;
+ }
}
#endif
#define TI(cf) gpuPolySpanFn<(cf)>
#define TN PolyNULL
#ifdef __arm__
-#define TA4(cf) PolySpan4bppAsm
+#define TA(cf) PolySpanAsm<(cf)>
#else
-#define TA4(cf) TI(cf)
+#define TA(cf) TI(cf)
+#endif
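+// TA6: asm that needs ARMv6 (usat/smulxy); plain C template otherwise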
+#ifdef HAVE_ARMV6
+#define TA6(cf) PolySpanAsm<(cf)>
+#else
+#define TA6(cf) TI(cf)
#endif
#define TIBLOCK(ub) \
TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
TN, TN, TI((ub)|0x0a), TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \
TN, TN, TI((ub)|0x12), TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \
TN, TN, TI((ub)|0x1a), TI((ub)|0x1b), TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \
- TA4((ub)|0x20),TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+ TA((ub)|0x20), TA6((ub)|0x21),TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \
TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \
#undef TI
#undef TN
#undef TIBLOCK
-#undef TA4
-#undef TA8
+#undef TA
+#undef TA6
#endif /* __GPU_UNAI_GPU_INNER_H__ */
#ifndef _OP_LIGHT_ARM_H_
#define _OP_LIGHT_ARM_H_
+#include "arm_features.h"
+
////////////////////////////////////////////////////////////////////////////////
// Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet
//
// u16 output: mbbbbbgggggrrrrr
// Where 'X' are fixed-pt bits.
////////////////////////////////////////////////////////////////////////////////
+#ifdef HAVE_ARMV6
+// clang emits smulbb on its own but gcc does not, so provide it via inline asm
+GPU_INLINE int_fast16_t smulbb(int_fast16_t a, int_fast16_t b)
+{
+ int_fast16_t r;
+ asm("smulbb %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
+ return r;
+}
+
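+// per channel: out5 = min(31, (in5 * light5) >> 4); the mask bit passes through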
+GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+{
+  // on v6 we have single-cycle mul and sat, which beat the lut
+ int_fast16_t r = smulbb(uSrc & 0x001f, r5);
+ int_fast16_t g = smulbb(uSrc & 0x03e0, g5);
+ int_fast16_t b = smulbb(uSrc & 0x7c00, b5);
+ asm volatile("usat %0, #5, %0, asr #4" : "=r"(r) : "0"(r));
+ asm volatile("usat %0, #5, %0, asr #9" : "=r"(g) : "0"(g));
+ asm volatile("usat %0, #5, %0, asr #14" : "=r"(b) : "0"(b));
+ return (uSrc & 0x8000) | (b << 10) | (g << 5) | r;
+}
+#else
GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
{
uint_fast16_t out = 0x03E0;
: "cc");
return out;
}
+#endif
////////////////////////////////////////////////////////////////////////////////
// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
u32 u_msk, v_msk; // 10
s32 u_inc, v_inc; // 18
+ // Color for flat-shaded, texture-blended prims
+  u8 r5, g5, b5, pad5;  // 20: 5-bit light for undithered prims
+  u8 r8, g8, b8, pad8;  // 24: 8-bit light for dithered prims
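+  // note: the span asm loads these by fixed offset (e.g. the 8-bit light at
+  // +0x24), so keep the offset comments here in sync with it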
+
// Color for Gouraud-shaded prims
// Fixed-pt 8.8 rgb triplet
// Packed fixed-pt 8.3:8.3:8.2 rgb triplet
gcol_t gCol;
gcol_t gInc; // Increment along scanline for gCol
- // Color for flat-shaded, texture-blended prims
- u8 r5, g5, b5; // 5-bit light for undithered prims
- u8 r8, g8, b8; // 8-bit light for dithered prims
-
// Color for flat-shaded, untextured prims
u16 PixelData; // bgr555 color for untextured flat-shaded polys
};