| 1 | /* |
| 2 | * (C) Gražvydas "notaz" Ignotas, 2011,2012,2022 |
| 3 | * |
| 4 | * This work is licensed under the terms of any of these licenses |
| 5 | * (at your option): |
| 6 | * - GNU GPL, version 2 or later. |
| 7 | * - GNU LGPL, version 2.1 or later. |
| 8 | * See the COPYING file in the top-level directory. |
| 9 | */ |
| 10 | |
| 11 | #include <stdint.h> |
| 12 | #include "cspace.h" |
| 13 | #include "compiler_features.h" |
| 14 | |
| 15 | /* |
| 16 | * note: these are intended for testing and should be avoided |
| 17 | * in favor of NEON version or platform-specific conversion |
| 18 | */ |
| 19 | |
/*
 * LE16TOHx2(x): x holds two packed 16-bit little-endian values; convert both
 * halves to host byte order.  On little-endian hosts this is the identity.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define SWAP16(x) __builtin_bswap16(x)
/*
 * The swapped halves are widened to uint32_t before combining: the uint16_t
 * result of the swap would otherwise be promoted to int, and shifting it
 * left by 16 could reach the sign bit.
 */
#define LE16TOHx2(x) \
  (((uint32_t)SWAP16((x) >> 16) << 16) | (uint32_t)SWAP16(x))
#else
#define LE16TOHx2(x) (x)
#endif
| 26 | |
| 27 | #if defined(HAVE_bgr555_to_rgb565) |
| 28 | |
| 29 | /* have bgr555_to_rgb565 somewhere else */ |
| 30 | |
| 31 | #elif ((defined(__clang_major__) && __clang_major__ >= 4) \ |
| 32 | || (defined(__GNUC__) && __GNUC__ >= 5)) \ |
| 33 | && __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ |
| 34 | |
| 35 | #include <assert.h> |
| 36 | |
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
/* NEON shift-left/right-and-insert intrinsics */
#define gsli(d_, s_, n_) d_ = vsliq_n_u16(d_, s_, n_)
#define gsri(d_, s_, n_) d_ = vsriq_n_u16(d_, s_, n_)
#else
/* generic form: plain OR, so the target bits of d_ must already be clear */
#define gsli(d_, s_, n_) d_ |= s_ << n_
#define gsri(d_, s_, n_) d_ |= s_ >> n_
#endif

/* 8 x uint16_t vector; gvu16u only requires 2-byte alignment */
typedef uint16_t gvu16 __attribute__((vector_size(16),aligned(16)));
typedef uint16_t gvu16u __attribute__((vector_size(16),aligned(2)));
#define gdup(v_) {v_, v_, v_, v_, v_, v_, v_, v_}
/* convert a single pixel: green stays in the middle (bits 6..10, lowest
 * green bit of the result is 0), bits 0..4 and 10..14 swap ends */
#define do_one(s) ({ \
  uint16_t d_ = (s) << 1; d_ = (d_ & 0x07c0) | (d_ << 10) | (d_ >> 11); d_; \
})
/* same conversion on 8 pixels at once */
#define do_one_simd(d_, s_, c0x07c0_) { \
  gvu16 s1 = s_ << 1; \
  d_ = s1 & c0x07c0_; \
  gsli(d_, s_, 11); \
  gsri(d_, s1, 11); \
}

/*
 * Convert 15-bit pixels (5 bits per channel, bit 15 ignored) to 16-bit 5:6:5
 * pixels with the low and high channels swapped.
 *
 * dst_, src_: non-overlapping buffers, both 2-byte aligned
 * bytes:      size of the source data in bytes, must be even
 *
 * Exactly `bytes` bytes are read and written; nothing outside that range is
 * touched.
 */
void bgr555_to_rgb565(void * __restrict__ dst_, const void * __restrict__ src_, int bytes)
{
  const uint16_t * __restrict__ src = src_;
  uint16_t * __restrict__ dst = dst_;
  gvu16 c0x07c0 = gdup(0x07c0);

  assert(!(((uintptr_t)dst | (uintptr_t)src | bytes) & 1));

  // align the destination
  // The head step always loads and stores a whole 16-byte vector, so it may
  // only be used when at least 16 bytes remain; shorter runs are left to the
  // scalar loop at the end.
  if (((uintptr_t)dst & 0x0e) && bytes >= 16)
  {
    uintptr_t left = 0x10 - ((uintptr_t)dst & 0x0e);
    gvu16 d, s = *(const gvu16u *)src;
    do_one_simd(d, s, c0x07c0);
    *(gvu16u *)dst = d;
    // only advance to the alignment boundary; the pixels between there and
    // the end of this vector are simply converted again by the main loop
    dst += left / 2;
    src += left / 2;
    bytes -= left;
  }
  // go: aligned stores, possibly unaligned loads
  for (; bytes >= 16; dst += 8, src += 8, bytes -= 16)
  {
    gvu16 d, s = *(const gvu16u *)src;
    do_one_simd(d, s, c0x07c0);
    *(gvu16 *)dst = d;
    __builtin_prefetch(src + 128/2);
  }
  // finish it: remaining pixels one at a time
  for (; bytes > 0; dst++, src++, bytes -= 2)
    *dst = do_one(*src);
}
#undef do_one
#undef do_one_simd
| 92 | |
| 93 | #else |
| 94 | |
/*
 * Portable fallback: convert 15-bit pixels (5 bits per channel) to 16-bit
 * 5:6:5 pixels with the low and high channels swapped, two pixels per 32-bit
 * word.
 *
 * dst_:  destination, accessed as 32-bit words
 * src_:  source pixels, stored little-endian; the pointer is rounded down to
 *        a 4-byte boundary
 * bytes: size of the source data in bytes; values <= 0 convert nothing
 */
void bgr555_to_rgb565(void *dst_, const void *src_, int bytes)
{
  // source can be misaligned, but it's very rare, so just force
  const unsigned int *src = (const void *)((intptr_t)src_ & ~3);
  unsigned int *dst = dst_;
  unsigned int p, r, g, b;
  int x, words;

  // a negative count must not turn into a huge unsigned loop bound
  if (bytes <= 0)
    return;
  words = bytes / 4;

  for (x = 0; x < words; x++) {
    p = LE16TOHx2(src[x]);

    r = (p & 0x001f001f) << 11;
    g = (p & 0x03e003e0) << 1;
    b = (p & 0x7c007c00) >> 10;

    dst[x] = r | g | b;
  }

  // odd pixel count: convert the last 16-bit pixel on its own
  if (bytes & 2) {
    const uint16_t *src16 = (const uint16_t *)(src + words);
    uint16_t *dst16 = (uint16_t *)(dst + words);

    p = LE16TOHx2((unsigned int)src16[0]);
    dst16[0] = (uint16_t)(((p & 0x001f) << 11) | ((p & 0x03e0) << 1)
                          | ((p & 0x7c00) >> 10));
  }
}
| 112 | |
| 113 | #endif |
| 114 | |
| 115 | #ifndef HAVE_bgr888_to_x |
| 116 | |
| 117 | void attr_weak bgr888_to_rgb565(void *dst_, const void *src_, int bytes) |
| 118 | { |
| 119 | const unsigned char *src = src_; |
| 120 | unsigned int *dst = dst_; |
| 121 | unsigned int r1, g1, b1, r2, g2, b2; |
| 122 | |
| 123 | for (; bytes >= 6; bytes -= 6, src += 6, dst++) { |
| 124 | r1 = src[0] & 0xf8; |
| 125 | g1 = src[1] & 0xfc; |
| 126 | b1 = src[2] & 0xf8; |
| 127 | r2 = src[3] & 0xf8; |
| 128 | g2 = src[4] & 0xfc; |
| 129 | b2 = src[5] & 0xf8; |
| 130 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
| 131 | *dst = (r1 << 24) | (g1 << 19) | (b1 << 13) | |
| 132 | (r2 << 8) | (g2 << 3) | (b2 >> 3); |
| 133 | #else |
| 134 | *dst = (r2 << 24) | (g2 << 19) | (b2 << 13) | |
| 135 | (r1 << 8) | (g1 << 3) | (b1 >> 3); |
| 136 | #endif |
| 137 | } |
| 138 | } |
| 139 | |
| 140 | // TODO? |
/* Not implemented: does nothing and leaves dst untouched. */
void rgb888_to_rgb565(void *dst, const void *src, int bytes)
{
  (void)dst;
  (void)src;
  (void)bytes;
}
/* Not implemented: does nothing and leaves dst untouched. */
void bgr888_to_rgb888(void *dst, const void *src, int bytes)
{
  (void)dst;
  (void)src;
  (void)bytes;
}
| 143 | |
| 144 | #endif // HAVE_bgr888_to_x |
| 145 | |
/*
 * Expand 15-bit pixels to 32-bit pixels with 8 bits per channel.
 *
 * Source bits 0..4 go to result bits 16..23, bits 5..9 to result bits 8..15
 * and bits 10..14 to result bits 0..7; source bit 15 is ignored and result
 * bits 24..31 are zero.  The top 3 bits of every 5-bit channel are copied
 * into the low 3 bits of its byte so that 0x1f expands to 0xff.
 *
 * dst_:  receives one uint32_t per source pixel
 * src_:  16-bit source pixels
 * bytes: size of the source data in bytes; a trailing odd byte is ignored
 */
void bgr555_to_xrgb8888(void * __restrict__ dst_, const void * __restrict__ src_, int bytes)
{
  const uint16_t * __restrict__ src = src_;
  uint32_t * __restrict__ dst = dst_;

  for (; bytes >= 2; bytes -= 2, src++, dst++)
  {
    // work in uint32_t: a uint16_t operand would be promoted to int, and
    // shifting it left by 19 overflows for pixels with bit 12 or above set
    const uint32_t s = *src;
    uint32_t t = ((s << 19) | (s >> 7)) & 0xf800f8;
    t |= (s << 6) & 0xf800;
    *dst = t | ((t >> 5) & 0x070707);
  }
}
| 158 | |
/*
 * Widen 3-byte pixels to 32-bit pixels: byte 0 lands in bits 16..23, byte 1
 * in bits 8..15 and byte 2 in bits 0..7; bits 24..31 are zero.  Leftover
 * source bytes that do not form a whole pixel are ignored.
 */
void bgr888_to_xrgb8888(void * __restrict__ dst_, const void * __restrict__ src_, int bytes)
{
  const uint8_t * __restrict__ in = src_;
  uint32_t * __restrict__ out = dst_;

  while (bytes >= 3)
  {
    const uint32_t hi = in[0];
    const uint32_t mid = in[1];
    const uint32_t lo = in[2];

    *out++ = (hi << 16) | (mid << 8) | lo;
    in += 3;
    bytes -= 3;
  }
}
| 167 | |
/* YUV stuff */

/* luma contribution of each 5-bit channel value, scaled by 65536 */
static int yuv_ry[32], yuv_gy[32], yuv_by[32];
/* chroma lookup, indexed by (channel - luma) + 32 */
static unsigned char yuv_u[32 * 2], yuv_v[32 * 2];
/* one precomputed entry per 15-bit pixel value: scaled luma, plus the
 * V, Y and U bytes packed into 24 bits */
static struct uyvy { uint32_t y:8; uint32_t vyu:24; } yuv_uyvy[32768];

/*
 * Fill the lookup tables used by the *_to_uyvy converters.
 * Must be called before any of them is used.
 */
void bgr_to_uyvy_init(void)
{
  unsigned char yuv_y[32];
  int i, v;

  /* init yuv converter:
    y0 = (int)((0.299f * r0) + (0.587f * g0) + (0.114f * b0));
    y1 = (int)((0.299f * r1) + (0.587f * g1) + (0.114f * b1));
    u = (int)(8 * 0.565f * (b0 - y0)) + 128;
    v = (int)(8 * 0.713f * (r0 - y0)) + 128;
  */
  for (i = 0; i < 32; i++) {
    yuv_ry[i] = (int)(0.299f * i * 65536.0f + 0.5f);
    yuv_gy[i] = (int)(0.587f * i * 65536.0f + 0.5f);
    yuv_by[i] = (int)(0.114f * i * 65536.0f + 0.5f);
  }
  for (i = -32; i < 32; i++) {
    v = (int)(8 * 0.565f * i) + 128;
    if (v < 0)
      v = 0;
    if (v > 255)
      v = 255;
    yuv_u[i + 32] = v;
    v = (int)(8 * 0.713f * i) + 128;
    if (v < 0)
      v = 0;
    if (v > 255)
      v = 255;
    yuv_v[i + 32] = v;
  }
  // valid Y range seems to be 16..235
  // The 5-bit luma only takes values 0..31, so divide by 31 to make the
  // maximum land on 235, matching the scaling used by rgb565_to_uyvy().
  for (i = 0; i < 32; i++) {
    yuv_y[i] = 16 + 219 * i / 31;
  }
  // everything combined into one large array for speed
  for (i = 0; i < 32768; i++) {
    int r = (i >> 0) & 0x1f, g = (i >> 5) & 0x1f, b = (i >> 10) & 0x1f;
    int y = (yuv_ry[r] + yuv_gy[g] + yuv_by[b]) >> 16;
    yuv_uyvy[i].y = yuv_y[y];
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    yuv_uyvy[i].vyu = (yuv_v[b-y + 32] << 16) | (yuv_y[y] << 8) | yuv_u[r-y + 32];
#else
    yuv_uyvy[i].vyu = (yuv_v[r-y + 32] << 16) | (yuv_y[y] << 8) | yuv_u[b-y + 32];
#endif
  }
}
| 219 | |
| 220 | void rgb565_to_uyvy(void *d, const void *s, int pixels) |
| 221 | { |
| 222 | unsigned int *dst = d; |
| 223 | const unsigned short *src = s; |
| 224 | const unsigned char *yu = yuv_u + 32; |
| 225 | const unsigned char *yv = yuv_v + 32; |
| 226 | int r0, g0, b0, r1, g1, b1; |
| 227 | int y0, y1, u, v; |
| 228 | |
| 229 | for (; pixels > 0; src += 2, dst++, pixels -= 2) |
| 230 | { |
| 231 | r0 = (src[0] >> 11) & 0x1f; |
| 232 | g0 = (src[0] >> 6) & 0x1f; |
| 233 | b0 = src[0] & 0x1f; |
| 234 | r1 = (src[1] >> 11) & 0x1f; |
| 235 | g1 = (src[1] >> 6) & 0x1f; |
| 236 | b1 = src[1] & 0x1f; |
| 237 | y0 = (yuv_ry[r0] + yuv_gy[g0] + yuv_by[b0]) >> 16; |
| 238 | y1 = (yuv_ry[r1] + yuv_gy[g1] + yuv_by[b1]) >> 16; |
| 239 | u = yu[b0 - y0]; |
| 240 | v = yv[r0 - y0]; |
| 241 | // valid Y range seems to be 16..235 |
| 242 | y0 = 16 + 219 * y0 / 31; |
| 243 | y1 = 16 + 219 * y1 / 31; |
| 244 | |
| 245 | *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u; |
| 246 | } |
| 247 | } |
| 248 | |
| 249 | void bgr555_to_uyvy(void *d, const void *s, int pixels, int x2) |
| 250 | { |
| 251 | uint32_t *dst = d; |
| 252 | const uint16_t *src = s; |
| 253 | int i; |
| 254 | |
| 255 | if (x2) { |
| 256 | for (i = pixels; i >= 4; src += 4, dst += 4, i -= 4) |
| 257 | { |
| 258 | const struct uyvy *uyvy0 = yuv_uyvy + (src[0] & 0x7fff); |
| 259 | const struct uyvy *uyvy1 = yuv_uyvy + (src[1] & 0x7fff); |
| 260 | const struct uyvy *uyvy2 = yuv_uyvy + (src[2] & 0x7fff); |
| 261 | const struct uyvy *uyvy3 = yuv_uyvy + (src[3] & 0x7fff); |
| 262 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
| 263 | dst[0] = uyvy0->y | (uyvy0->vyu << 8); |
| 264 | dst[1] = uyvy1->y | (uyvy1->vyu << 8); |
| 265 | dst[2] = uyvy2->y | (uyvy2->vyu << 8); |
| 266 | dst[3] = uyvy3->y | (uyvy3->vyu << 8); |
| 267 | #else |
| 268 | dst[0] = (uyvy0->y << 24) | uyvy0->vyu; |
| 269 | dst[1] = (uyvy1->y << 24) | uyvy1->vyu; |
| 270 | dst[2] = (uyvy2->y << 24) | uyvy2->vyu; |
| 271 | dst[3] = (uyvy3->y << 24) | uyvy3->vyu; |
| 272 | #endif |
| 273 | } |
| 274 | } else { |
| 275 | for (i = pixels; i >= 4; src += 4, dst += 2, i -= 4) |
| 276 | { |
| 277 | const struct uyvy *uyvy0 = yuv_uyvy + (src[0] & 0x7fff); |
| 278 | const struct uyvy *uyvy1 = yuv_uyvy + (src[1] & 0x7fff); |
| 279 | const struct uyvy *uyvy2 = yuv_uyvy + (src[2] & 0x7fff); |
| 280 | const struct uyvy *uyvy3 = yuv_uyvy + (src[3] & 0x7fff); |
| 281 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
| 282 | dst[0] = uyvy1->y | (uyvy0->vyu << 8); |
| 283 | dst[1] = uyvy3->y | (uyvy2->vyu << 8); |
| 284 | #else |
| 285 | dst[0] = (uyvy1->y << 24) | uyvy0->vyu; |
| 286 | dst[1] = (uyvy3->y << 24) | uyvy2->vyu; |
| 287 | #endif |
| 288 | } |
| 289 | } |
| 290 | } |
| 291 | |
| 292 | void bgr888_to_uyvy(void *d, const void *s, int pixels, int x2) |
| 293 | { |
| 294 | unsigned int *dst = d; |
| 295 | const unsigned char *src8 = s; |
| 296 | const unsigned char *yu = yuv_u + 32; |
| 297 | const unsigned char *yv = yuv_v + 32; |
| 298 | int r0, g0, b0, r1, g1, b1; |
| 299 | int y0, y1, u0, u1, v0, v1; |
| 300 | |
| 301 | if (x2) { |
| 302 | for (; pixels >= 2; src8 += 3*2, pixels -= 2) |
| 303 | { |
| 304 | r0 = src8[0], g0 = src8[1], b0 = src8[2]; |
| 305 | r1 = src8[3], g1 = src8[4], b1 = src8[5]; |
| 306 | y0 = (r0 * 19595 + g0 * 38470 + b0 * 7471) >> 16; |
| 307 | y1 = (r1 * 19595 + g1 * 38470 + b1 * 7471) >> 16; |
| 308 | u0 = yu[(b0 - y0) / 8]; |
| 309 | u1 = yu[(b1 - y1) / 8]; |
| 310 | v0 = yv[(r0 - y0) / 8]; |
| 311 | v1 = yv[(r1 - y1) / 8]; |
| 312 | y0 = 16 + 219 * y0 / 255; |
| 313 | y1 = 16 + 219 * y1 / 255; |
| 314 | |
| 315 | *dst++ = (y0 << 24) | (v0 << 16) | (y0 << 8) | u0; |
| 316 | *dst++ = (y1 << 24) | (v1 << 16) | (y1 << 8) | u1; |
| 317 | } |
| 318 | } |
| 319 | else { |
| 320 | for (; pixels >= 2; src8 += 3*2, dst++, pixels -= 2) |
| 321 | { |
| 322 | r0 = src8[0], g0 = src8[1], b0 = src8[2]; |
| 323 | r1 = src8[3], g1 = src8[4], b1 = src8[5]; |
| 324 | y0 = (r0 * 19595 + g0 * 38470 + b0 * 7471) >> 16; |
| 325 | y1 = (r1 * 19595 + g1 * 38470 + b1 * 7471) >> 16; |
| 326 | u0 = yu[(b0 - y0) / 8]; |
| 327 | v0 = yv[(r0 - y0) / 8]; |
| 328 | y0 = 16 + 219 * y0 / 255; |
| 329 | y1 = 16 + 219 * y1 / 255; |
| 330 | |
| 331 | *dst = (y1 << 24) | (v0 << 16) | (y0 << 8) | u0; |
| 332 | } |
| 333 | } |
| 334 | } |