2 * (C) Gražvydas "notaz" Ignotas, 2011,2012,2022
4 * This work is licensed under the terms of any of these licenses
6 * - GNU GPL, version 2 or later.
7 * - GNU LGPL, version 2.1 or later.
8 * See the COPYING file in the top-level directory.
13 #include "compiler_features.h"
16 * note: these are intended for testing and should be avoided
17 * in favor of NEON version or platform-specific conversion
/*
 * LE16TOHx2(x): x is a 32-bit word that holds two little-endian 16-bit
 * values; the result holds the same two values in host byte order.
 * On big-endian hosts each 16-bit half is byte-swapped in place,
 * on other hosts the word is passed through unchanged.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define SWAP16(x) __builtin_bswap16(x)
/*
 * The cast keeps the << 16 in unsigned arithmetic: a swapped half with
 * its top bit set would otherwise overflow the promoted signed int.
 */
#define LE16TOHx2(x) (((uint32_t)SWAP16((x) >> 16) << 16) | SWAP16(x))
#else
#define LE16TOHx2(x) (x)
#endif
/*
 * Implementation selection for bgr555_to_rgb565():
 *  - HAVE_bgr555_to_rgb565 defined: this branch provides nothing;
 *  - otherwise, with clang >= 4 or GCC >= 5 and a byte order that is
 *    not big-endian, the branch that follows is compiled.
 */
27 #if defined(HAVE_bgr555_to_rgb565)
29 /* have bgr555_to_rgb565 somewhere else */
31 #elif ((defined(__clang_major__) && __clang_major__ >= 4) \
32 || (defined(__GNUC__) && __GNUC__ >= 5)) \
33 && __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
/*
 * gsli()/gsri(): merge s_ shifted left/right by n_ into d_.
 * When NEON is available they expand to vsliq_n_u16/vsriq_n_u16;
 * the other pair of definitions ORs the shifted value into d_.
 * Arguments are not parenthesized and d_ is assigned to, so pass plain
 * identifiers only.
 * NOTE(review): the OR form matches the NEON insert form only if the
 * bits being filled in d_ are already zero - confirm at the call sites.
 */
37 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
39 #define gsli(d_, s_, n_) d_ = vsliq_n_u16(d_, s_, n_)
40 #define gsri(d_, s_, n_) d_ = vsriq_n_u16(d_, s_, n_)
42 #define gsli(d_, s_, n_) d_ |= s_ << n_
43 #define gsri(d_, s_, n_) d_ |= s_ >> n_
/* vector of 8 x uint16_t (16 bytes) with 16-byte alignment */
46 typedef uint16_t gvu16 __attribute__((vector_size(16),aligned(16)));
/* same 8 x uint16_t vector but with only 2-byte alignment; the
 * converter reads its source through this type */
47 typedef uint16_t gvu16u __attribute__((vector_size(16),aligned(2)));
/* initializer list that repeats v_ in all 8 lanes */
48 #define gdup(v_) {v_, v_, v_, v_, v_, v_, v_, v_}
/*
 * do_one(s): convert one 16-bit pixel (GNU statement expression).
 * Source bits 0-4 end up in bits 11-15, bits 5-9 in bits 6-10 and
 * bits 10-14 in bits 0-4; result bit 5 is zero and source bit 15 is
 * dropped by the initial shift.
 *
 * do_one_simd(d_, s_, c0x07c0_): vector counterpart; the converter
 * passes a gvu16 destination, a gvu16 source and 0x07c0 in every lane.
 */
49 #define do_one(s) ({ \
50 uint16_t d_ = (s) << 1; d_ = (d_ & 0x07c0) | (d_ << 10) | (d_ >> 11); d_; \
52 #define do_one_simd(d_, s_, c0x07c0_) { \
/*
 * Vector-extension version of bgr555_to_rgb565().
 * Converts 16-bit pixels read from src_ into dst_; bytes is the amount
 * of data in bytes. dst_, src_ and bytes must all be even (asserted).
 * The buffers are declared __restrict__, so they must not overlap.
 */
59 void bgr555_to_rgb565(void * __restrict__ dst_, const void * __restrict__ src_, int bytes)
61 const uint16_t * __restrict__ src = src_;
62 uint16_t * __restrict__ dst = dst_;
// 0x07c0 in every lane, handed to do_one_simd()
63 gvu16 c0x07c0 = gdup(0x07c0);
// bit 0 of both addresses and of the byte count must be clear
65 assert(!(((uintptr_t)dst | (uintptr_t)src | bytes) & 1));
67 // align the destination
// taken when dst is not on a 16-byte boundary
68 if ((uintptr_t)dst & 0x0e)
// number of bytes from dst up to the next 16-byte boundary
70 uintptr_t left = 0x10 - ((uintptr_t)dst & 0x0e);
// 16-byte load through the 2-byte-aligned vector type
71 gvu16 d, s = *(const gvu16u *)src;
72 do_one_simd(d, s, c0x07c0);
// main loop: 8 pixels (16 bytes) per iteration
79 for (; bytes >= 16; dst += 8, src += 8, bytes -= 16)
81 gvu16 d, s = *(const gvu16u *)src;
82 do_one_simd(d, s, c0x07c0);
// prefetch the source 64 pixels (128 bytes) ahead
84 __builtin_prefetch(src + 128/2);
// remaining data, one 16-bit pixel per iteration
87 for (; bytes > 0; dst++, src++, bytes -= 2)
/*
 * Generic bgr555_to_rgb565(): processes two 16-bit pixels per 32-bit
 * word. The loop runs bytes / 4 times, so a trailing odd pixel is not
 * visited.
 */
95 void bgr555_to_rgb565(void *dst_, const void *src_, int bytes)
97 // source can be misaligned, but it's very rare, so just force
// (the two low address bits of src_ are cleared)
98 const unsigned int *src = (const void *)((intptr_t)src_ & ~3);
99 unsigned int *dst = dst_;
100 unsigned int x, p, r, g, b;
// NOTE(review): x is unsigned, so bytes / 4 is converted to unsigned
// for the comparison; a negative bytes would give a huge count -
// confirm callers never pass one
102 for (x = 0; x < bytes / 4; x++) {
// load one word and bring both 16-bit halves to host byte order
103 p = LE16TOHx2(src[x]);
// bits 0-4 of each half move up to bits 11-15
105 r = (p & 0x001f001f) << 11;
// bits 5-9 of each half move up to bits 6-10
106 g = (p & 0x03e003e0) << 1;
// bits 10-14 of each half move down to bits 0-4
107 b = (p & 0x7c007c00) >> 10;
115 #ifndef HAVE_bgr888_to_x
/*
 * Pack 3-byte pixels from src_ into 16-bit pixels, writing two of them
 * per 32-bit store to dst_. Each iteration consumes 6 source bytes;
 * fewer than 6 remaining bytes are left unconverted.
 * attr_weak is defined outside this file - presumably it makes the
 * symbol weak so another definition can take over; verify.
 */
117 void attr_weak bgr888_to_rgb565(void *dst_, const void *src_, int bytes)
119 const unsigned char *src = src_;
120 unsigned int *dst = dst_;
// components of the first (r1/g1/b1) and second (r2/g2/b2) pixel
121 unsigned int r1, g1, b1, r2, g2, b2;
123 for (; bytes >= 6; bytes -= 6, src += 6, dst++) {
// each 16-bit half is assembled as (r << 8) | (g << 3) | (b >> 3);
// big-endian: pixel 1 in the upper half, pixel 2 in the lower half
130 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
131 *dst = (r1 << 24) | (g1 << 19) | (b1 << 13) |
132 (r2 << 8) | (g2 << 3) | (b2 >> 3);
// otherwise: pixel 2 in the upper half, pixel 1 in the lower half
134 *dst = (r2 << 24) | (g2 << 19) | (b2 << 13) |
135 (r1 << 8) | (g1 << 3) | (b1 >> 3);
/* Placeholder: takes the usual (dst, src, bytes) arguments and does nothing. */
void rgb888_to_rgb565(void *dst, const void *src, int bytes)
{
	(void)dst;
	(void)src;
	(void)bytes;
}
/* Placeholder: takes the usual (dst, src, bytes) arguments and does nothing. */
void bgr888_to_rgb888(void *dst, const void *src, int bytes)
{
	(void)dst;
	(void)src;
	(void)bytes;
}
144 #endif // HAVE_bgr888_to_x
/*
 * Expand 16-bit pixels to 32-bit pixels.
 *
 * dst_  - output, one uint32_t per source pixel
 * src_  - input, 16-bit pixels
 * bytes - size of the input in bytes; a trailing odd byte is ignored
 *
 * Source bits 0-4 go to output bits 19-23, bits 5-9 to bits 11-15 and
 * bits 10-14 to bits 3-7. The top 3 bits of every 8-bit channel are then
 * copied into its low 3 bits so that a full 5-bit value becomes 0xff.
 * Source bit 15 is ignored and output bits 24-31 are zero.
 */
void bgr555_to_xrgb8888(void * __restrict__ dst_, const void * __restrict__ src_, int bytes)
{
	const uint16_t * __restrict__ src = src_;
	uint32_t * __restrict__ dst = dst_;

	for (; bytes >= 2; bytes -= 2, src++, dst++)
	{
		// widen first: shifting the promoted (signed) int left by 19
		// overflows for any pixel with bit 12 or above set
		const uint32_t p = *src;
		uint32_t t = ((p << 19) | (p >> 7)) & 0xf800f8;
		t |= (p << 6) & 0xf800;
		*dst = t | ((t >> 5) & 0x070707);
	}
}
/*
 * Widen 3-byte pixels to 32-bit pixels.
 * Every group of three input bytes b0, b1, b2 becomes the word
 * (b0 << 16) | (b1 << 8) | b2; bits 24-31 stay zero.
 * bytes is the input size; an incomplete trailing group is ignored.
 */
void bgr888_to_xrgb8888(void * __restrict__ dst_, const void * __restrict__ src_, int bytes)
{
	const uint8_t * __restrict__ in = src_;
	uint32_t * __restrict__ out = dst_;
	int remaining = bytes;

	while (remaining >= 3) {
		*out = (in[0] << 16) | (in[1] << 8) | in[2];
		out++;
		in += 3;
		remaining -= 3;
	}
}
/* luma contribution of each 5-bit component value, set up by bgr_to_uyvy_init() */
static int yuv_ry[32];
static int yuv_gy[32];
static int yuv_by[32];

/* chroma tables with 64 entries each; users index them with a +32 bias */
static unsigned char yuv_u[32 * 2];
static unsigned char yuv_v[32 * 2];

/* one precomputed entry per 15-bit pixel value: 8-bit y and 24-bit vyu */
static struct uyvy {
	uint32_t y:8;
	uint32_t vyu:24;
} yuv_uyvy[32768];
/*
 * Build the lookup tables used by the *_to_uyvy converters: the
 * per-component luma tables yuv_ry/yuv_gy/yuv_by and the combined
 * per-pixel table yuv_uyvy. Takes no arguments and returns nothing.
 */
173 void bgr_to_uyvy_init(void)
// local luma remap table, read only while yuv_uyvy is being built
175 unsigned char yuv_y[256];
178 /* init yuv converter:
179 y0 = (int)((0.299f * r0) + (0.587f * g0) + (0.114f * b0));
180 y1 = (int)((0.299f * r1) + (0.587f * g1) + (0.114f * b1));
181 u = (int)(8 * 0.565f * (b0 - y0)) + 128;
182 v = (int)(8 * 0.713f * (r0 - y0)) + 128;
// luma weights for every 5-bit component value, scaled by 65536
// and rounded to nearest
184 for (i = 0; i < 32; i++) {
185 yuv_ry[i] = (int)(0.299f * i * 65536.0f + 0.5f);
186 yuv_gy[i] = (int)(0.587f * i * 65536.0f + 0.5f);
187 yuv_by[i] = (int)(0.114f * i * 65536.0f + 0.5f);
// i walks the signed difference range -32..31
189 for (i = -32; i < 32; i++) {
// scaled difference, offset by 128 (0.565 factor)
190 v = (int)(8 * 0.565f * i) + 128;
// scaled difference, offset by 128 (0.713 factor)
196 v = (int)(8 * 0.713f * i) + 128;
203 // valid Y range seems to be 16..235
// NOTE(review): the divisor is 32 here but 31 in rgb565_to_uyvy(); with
// / 32 no index in 0..31 reaches 235 - confirm which one is intended
204 for (i = 0; i < 256; i++) {
205 yuv_y[i] = 16 + 219 * i / 32;
207 // everything combined into one large array for speed
208 for (i = 0; i < 32768; i++) {
// index bits 0-4 are r, bits 5-9 are g, bits 10-14 are b
209 int r = (i >> 0) & 0x1f, g = (i >> 5) & 0x1f, b = (i >> 10) & 0x1f;
// sum the weighted components and drop the 16 fractional bits
210 int y = (yuv_ry[r] + yuv_gy[g] + yuv_by[b]) >> 16;
211 yuv_uyvy[i].y = yuv_y[y];
// NOTE(review): the big-endian line indexes yuv_v with b-y and yuv_u
// with r-y, the opposite of the other line - confirm this is intended
212 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
213 yuv_uyvy[i].vyu = (yuv_v[b-y + 32] << 16) | (yuv_y[y] << 8) | yuv_u[r-y + 32];
215 yuv_uyvy[i].vyu = (yuv_v[r-y + 32] << 16) | (yuv_y[y] << 8) | yuv_u[b-y + 32];
/*
 * Convert 16-bit pixels from s into 32-bit words at d, one word per
 * pair of pixels: (y1 << 24) | (v << 16) | (y0 << 8) | u.
 * pixels is the number of source pixels.
 * Reads yuv_ry/yuv_gy/yuv_by, so bgr_to_uyvy_init() presumably has to
 * run first - verify against callers.
 */
220 void rgb565_to_uyvy(void *d, const void *s, int pixels)
222 unsigned int *dst = d;
223 const unsigned short *src = s;
// point at the middle of the 64-entry tables so that signed indexes
// from -32 to 31 stay inside them
224 const unsigned char *yu = yuv_u + 32;
225 const unsigned char *yv = yuv_v + 32;
226 int r0, g0, b0, r1, g1, b1;
// NOTE(review): the loop continues while pixels > 0 but always reads
// src[1], so an odd count still touches one extra pixel - confirm
// callers pass an even count
229 for (; pixels > 0; src += 2, dst++, pixels -= 2)
// bits 11-15 of the first pixel
231 r0 = (src[0] >> 11) & 0x1f;
// bits 6-10 of the first pixel
232 g0 = (src[0] >> 6) & 0x1f;
// same fields of the second pixel
234 r1 = (src[1] >> 11) & 0x1f;
235 g1 = (src[1] >> 6) & 0x1f;
// weighted sum from the tables, dropping the 16 fractional bits
237 y0 = (yuv_ry[r0] + yuv_gy[g0] + yuv_by[b0]) >> 16;
238 y1 = (yuv_ry[r1] + yuv_gy[g1] + yuv_by[b1]) >> 16;
241 // valid Y range seems to be 16..235
242 y0 = 16 + 219 * y0 / 31;
243 y1 = 16 + 219 * y1 / 31;
// NOTE(review): if y1 is a signed int, y1 << 24 overflows for values
// of 128 and above - consider an unsigned type
245 *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u;
/*
 * Convert 16-bit pixels from s into 32-bit words at d through the
 * precomputed yuv_uyvy table (indexed with the low 15 bits of a pixel).
 * Pixels are handled in groups of 4; a remainder of fewer than 4 is not
 * converted. There are two loops:
 *  - one word per pixel: y and vyu both come from the same pixel;
 *  - one word per pixel pair: vyu from the first pixel of the pair,
 *    y from the second.
 * Which loop runs is presumably chosen by x2 - verify.
 * NOTE(review): y is an 8-bit bit-field, so it looks like it promotes
 * to int and y << 24 overflows for values of 128 and above - confirm.
 */
249 void bgr555_to_uyvy(void *d, const void *s, int pixels, int x2)
252 const uint16_t *src = s;
// 4 pixels in, 4 words out
256 for (i = pixels; i >= 4; src += 4, dst += 4, i -= 4)
// table entries of the four pixels (bit 15 is masked off)
258 const struct uyvy *uyvy0 = yuv_uyvy + (src[0] & 0x7fff);
259 const struct uyvy *uyvy1 = yuv_uyvy + (src[1] & 0x7fff);
260 const struct uyvy *uyvy2 = yuv_uyvy + (src[2] & 0x7fff);
261 const struct uyvy *uyvy3 = yuv_uyvy + (src[3] & 0x7fff);
// big-endian: y in the low byte, vyu in the upper 24 bits
262 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
263 dst[0] = uyvy0->y | (uyvy0->vyu << 8);
264 dst[1] = uyvy1->y | (uyvy1->vyu << 8);
265 dst[2] = uyvy2->y | (uyvy2->vyu << 8);
266 dst[3] = uyvy3->y | (uyvy3->vyu << 8);
// otherwise: y in the top byte, vyu in the lower 24 bits
268 dst[0] = (uyvy0->y << 24) | uyvy0->vyu;
269 dst[1] = (uyvy1->y << 24) | uyvy1->vyu;
270 dst[2] = (uyvy2->y << 24) | uyvy2->vyu;
271 dst[3] = (uyvy3->y << 24) | uyvy3->vyu;
// 4 pixels in, 2 words out
275 for (i = pixels; i >= 4; src += 4, dst += 2, i -= 4)
277 const struct uyvy *uyvy0 = yuv_uyvy + (src[0] & 0x7fff);
278 const struct uyvy *uyvy1 = yuv_uyvy + (src[1] & 0x7fff);
279 const struct uyvy *uyvy2 = yuv_uyvy + (src[2] & 0x7fff);
280 const struct uyvy *uyvy3 = yuv_uyvy + (src[3] & 0x7fff);
// y of the odd pixel is combined with vyu of the even pixel
281 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
282 dst[0] = uyvy1->y | (uyvy0->vyu << 8);
283 dst[1] = uyvy3->y | (uyvy2->vyu << 8);
285 dst[0] = (uyvy1->y << 24) | uyvy0->vyu;
286 dst[1] = (uyvy3->y << 24) | uyvy2->vyu;
/*
 * Convert 3-byte pixels from s into 32-bit words at d.
 * Pixels are consumed in pairs (6 source bytes); a trailing single
 * pixel is not converted. There are two loops:
 *  - two words per pair: each pixel produces its own word with its y
 *    stored twice plus its own u and v;
 *  - one word per pair: y0 and y1 plus u and v of the first pixel only.
 * Which loop runs is presumably chosen by x2 - verify.
 * Reads yuv_u/yuv_v, so bgr_to_uyvy_init() presumably has to run
 * first - verify against callers.
 * NOTE(review): y0/y1 are int, so y << 24 overflows for values of 128
 * and above - consider unsigned arithmetic.
 */
292 void bgr888_to_uyvy(void *d, const void *s, int pixels, int x2)
294 unsigned int *dst = d;
295 const unsigned char *src8 = s;
// point at the middle of the 64-entry tables so that signed indexes
// from -32 to 31 stay inside them
296 const unsigned char *yu = yuv_u + 32;
297 const unsigned char *yv = yuv_v + 32;
298 int r0, g0, b0, r1, g1, b1;
299 int y0, y1, u0, u1, v0, v1;
// dst is advanced by the two stores in the body
302 for (; pixels >= 2; src8 += 3*2, pixels -= 2)
// bytes 0..2 are the first pixel, bytes 3..5 the second
304 r0 = src8[0], g0 = src8[1], b0 = src8[2];
305 r1 = src8[3], g1 = src8[4], b1 = src8[5];
// weights are 0.299 / 0.587 / 0.114 scaled by 65536
306 y0 = (r0 * 19595 + g0 * 38470 + b0 * 7471) >> 16;
307 y1 = (r1 * 19595 + g1 * 38470 + b1 * 7471) >> 16;
// the 8-bit difference is divided by 8 to fit the table index range
308 u0 = yu[(b0 - y0) / 8];
309 u1 = yu[(b1 - y1) / 8];
310 v0 = yv[(r0 - y0) / 8];
311 v1 = yv[(r1 - y1) / 8];
// rescale 0..255 to 16..235
312 y0 = 16 + 219 * y0 / 255;
313 y1 = 16 + 219 * y1 / 255;
// one output word per source pixel
315 *dst++ = (y0 << 24) | (v0 << 16) | (y0 << 8) | u0;
316 *dst++ = (y1 << 24) | (v1 << 16) | (y1 << 8) | u1;
// one output word per pixel pair
320 for (; pixels >= 2; src8 += 3*2, dst++, pixels -= 2)
322 r0 = src8[0], g0 = src8[1], b0 = src8[2];
323 r1 = src8[3], g1 = src8[4], b1 = src8[5];
324 y0 = (r0 * 19595 + g0 * 38470 + b0 * 7471) >> 16;
325 y1 = (r1 * 19595 + g1 * 38470 + b1 * 7471) >> 16;
// chroma is taken from the first pixel of the pair only
326 u0 = yu[(b0 - y0) / 8];
327 v0 = yv[(r0 - y0) / 8];
328 y0 = 16 + 219 * y0 / 255;
329 y1 = 16 + 219 * y1 / 255;
331 *dst = (y1 << 24) | (v0 << 16) | (y0 << 8) | u0;