#include <assert.h>
#include <stdint.h>

/* Pull in the public prototypes whenever the project header is reachable. */
#if !defined(__has_include)
#include "cspace.h"
#elif __has_include("cspace.h")
#include "cspace.h"
#endif

/*
 * LE16TOHx2(): convert two little-endian 16-bit values packed in one
 * 32-bit word to host order. A no-op on little-endian hosts.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define SWAP16(x) __builtin_bswap16(x)
/* cast before shifting: the swapped half is promoted to int otherwise */
#define LE16TOHx2(x) (((unsigned int)SWAP16((x) >> 16) << 16) | SWAP16(x))
#else
#define LE16TOHx2(x) (x)
#endif

/*
 * bgr555_to_rgb565() - convert 16-bit pixels from the 555 layout to 565.
 *
 * For every 16-bit source pixel:
 *   source bits  0..4  -> destination bits 11..15
 *   source bits  5..9  -> destination bits  6..10 (bit 5 is left zero)
 *   source bits 10..14 -> destination bits  0..4
 *   source bit  15 is discarded.
 *
 * dst_:  output buffer
 * src_:  input buffer
 * bytes: amount of pixel data to convert, in bytes
 *
 * Three build variants are selected below: an external implementation
 * (HAVE_bgr555_to_rgb565), a GCC/clang vector-extension version for
 * little-endian targets, and a portable 32-bit-at-a-time fallback.
 */
#if defined(HAVE_bgr555_to_rgb565)

/* have bgr555_to_rgb565 somewhere else */

#elif ((defined(__clang_major__) && __clang_major__ >= 4) \
    || (defined(__GNUC__) && __GNUC__ >= 5)) \
    && __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__

/* shift-and-insert: NEON has dedicated instructions, otherwise OR it in */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define gsli(d_, s_, n_) d_ = vsliq_n_u16(d_, s_, n_)
#define gsri(d_, s_, n_) d_ = vsriq_n_u16(d_, s_, n_)
#else
#define gsli(d_, s_, n_) d_ |= (s_) << (n_)
#define gsri(d_, s_, n_) d_ |= (s_) >> (n_)
#endif

/* 8 x uint16_t vectors: 16-byte aligned, and a 2-byte aligned alias */
typedef uint16_t gvu16 __attribute__((vector_size(16),aligned(16)));
typedef uint16_t gvu16u __attribute__((vector_size(16),aligned(2)));
#define gdup(v_) {v_, v_, v_, v_, v_, v_, v_, v_}

/* Convert 8 pixels at once; d_ is overwritten, s_ is only read. */
#define do_one_simd(d_, s_, c0x07c0_) do { \
  gvu16 s1 = s_ << 1; \
  d_ = s1 & c0x07c0_; \
  gsli(d_, s_, 11); \
  gsri(d_, s1, 11); \
} while (0)

/* Convert a single pixel (scalar tail of the vector loop). */
static inline uint16_t bgr555_px_to_rgb565(uint16_t s)
{
  uint16_t d = (uint16_t)(s << 1);

  return (uint16_t)((d & 0x07c0) | (d << 10) | (d >> 11));
}

/*
 * Vector variant. Both pointers and the byte count must be even (checked
 * by the assert); the buffers must not overlap (__restrict__).
 */
void bgr555_to_rgb565(void * __restrict__ dst_,
  const void * __restrict__ src_, int bytes)
{
  const uint16_t * __restrict__ src = src_;
  uint16_t * __restrict__ dst = dst_;
  gvu16 c0x07c0 = gdup(0x07c0);

  assert(!(((uintptr_t)dst | (uintptr_t)src | bytes) & 1));

  // align the destination
  // The head step moves a whole 16-byte vector, so it is only safe when
  // at least that much data remains; shorter runs use the scalar loop.
  if (((uintptr_t)dst & 0x0e) && bytes >= 16)
  {
    uintptr_t left = 0x10 - ((uintptr_t)dst & 0x0e);
    gvu16 d, s = *(const gvu16u *)src;
    do_one_simd(d, s, c0x07c0);
    *(gvu16u *)dst = d;
    // advance only up to the 16-byte boundary; the overlapping pixels
    // are simply converted again by the aligned loop
    dst += left / 2;
    src += left / 2;
    bytes -= left;
  }
  // go
  for (; bytes >= 16; dst += 8, src += 8, bytes -= 16)
  {
    gvu16 d, s = *(const gvu16u *)src;
    do_one_simd(d, s, c0x07c0);
    *(gvu16 *)dst = d;
    __builtin_prefetch(src + 128/2);
  }
  // finish it
  for (; bytes > 0; dst++, src++, bytes -= 2)
    *dst = bgr555_px_to_rgb565(*src);
}
#undef do_one_simd

#else

/*
 * Portable variant: handles two pixels per 32-bit word, so only
 * bytes / 4 words are converted (a trailing single pixel is not).
 */
void bgr555_to_rgb565(void *dst_, const void *src_, int bytes)
{
	// source can be misaligned, but it's very rare, so just force
	const unsigned int *src = (const void *)((intptr_t)src_ & ~3);
	unsigned int *dst = dst_;
	unsigned int p, r, g, b;
	int x; // signed, so a negative byte count converts nothing

	for (x = 0; x < bytes / 4; x++) {
		p = LE16TOHx2(src[x]);

		r = (p & 0x001f001f) << 11;
		g = (p & 0x03e003e0) << 1;
		b = (p & 0x7c007c00) >> 10;

		dst[x] = r | g | b;
	}
}

#endif
int r0, g0, b0, r1, g1, b1; int y0, y1, u, v; - for (; pixels > 0; src += 2, dst++, pixels -= 2) + for (; pixels > 1; src += 2, dst++, pixels -= 2) { b0 = (src[0] >> 10) & 0x1f; g0 = (src[0] >> 5) & 0x1f;