+#if defined(HAVE_bgr555_to_rgb565)
+
+/* bgr555_to_rgb565 is already provided elsewhere; nothing to define here */
+
+#elif ((defined(__clang_major__) && __clang_major__ >= 4) \
+ || (defined(__GNUC__) && __GNUC__ >= 5)) \
+ && __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+
+#include <stdint.h>
+#include <assert.h>
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) /* NEON available: map gsli/gsri onto the shift-and-insert intrinsics */
+#include <arm_neon.h>
+#define gsli(d_, s_, n_) d_ = vsliq_n_u16(d_, s_, n_) /* shift each lane of s_ left by n_ and insert it into d_ (VSLI) */
+#define gsri(d_, s_, n_) d_ = vsriq_n_u16(d_, s_, n_) /* shift each lane of s_ right by n_ and insert it into d_ (VSRI) */
+#else /* no NEON: plain shift-and-OR; this matches the insert forms only while the receiving bits of d_ are still zero */
+#define gsli(d_, s_, n_) d_ |= s_ << n_ /* NOTE(review): arguments are not parenthesized - pass simple operands only */
+#define gsri(d_, s_, n_) d_ |= s_ >> n_ /* same caveat as gsli above */
+#endif
+
+typedef uint16_t gvu16 __attribute__((vector_size(16),aligned(16))); /* 16-byte vector of uint16_t (8 lanes), 16-byte aligned */
+typedef uint16_t gvu16u __attribute__((vector_size(16),aligned(2))); /* same 8-lane vector but only 2-byte aligned; presumably for accesses through unaligned pointers - confirm at the use site */
+#define gdup(v_) {v_, v_, v_, v_, v_, v_, v_, v_} /* brace initializer with v_ repeated in all 8 lanes; v_ is expanded 8 times, so it must be free of side effects */
+#define do_one(s) ({ /* scalar conversion of one 16-bit pixel; GNU statement expression whose value is the uint16_t result */ \
+ uint16_t d_ = (s) << 1; d_ = (d_ & 0x07c0) | (d_ << 10) | (d_ >> 11); d_; /* after the shift by 1: bits 6-10 stay, bits 1-5 go to 11-15, bits 11-15 go to 0-4 */ \
+}) /* the top bit of s is dropped by the uint16_t truncation and bit 5 of the result is always 0 */
+#define do_one_simd(d_, s_, c0x07c0_) { /* vector form of do_one: stores the converted lanes of s_ into d_ */ \
+ gvu16 s1 = s_ << 1; /* every lane shifted left by one, as in do_one */ \
+ d_ = s1 & c0x07c0_; /* keeps bits 6-10, assuming c0x07c0_ holds 0x07c0 in every lane - TODO confirm at the call site */ \
+ gsli(d_, s_, 11); /* s_ shifted by 11 equals s1 shifted by 10: the low 5 bits of s_ land in bits 11-15 */ \
+ gsri(d_, s1, 11); /* bits 11-15 of s1 land in bits 0-4 */ \
+} /* NOTE(review): expands to a bare block that declares s1 and reads s_ twice - pass plain variables, not expressions */
+
+void bgr555_to_rgb565(void * __restrict__ dst_, const void * __restrict__ src_, int bytes)