cspace: generic implementation with vector extensions
[pcsx_rearmed.git] / frontend / cspace.c
CommitLineData
c9099d02 1/*
d639fa7f 2 * (C) Gražvydas "notaz" Ignotas, 2011,2012,2022
c9099d02 3 *
4 * This work is licensed under the terms of any of these licenses
5 * (at your option):
6 * - GNU GPL, version 2 or later.
7 * - GNU LGPL, version 2.1 or later.
8 * See the COPYING file in the top-level directory.
9 */
10
a80ae4a0 11#include "cspace.h"
12
4ea7de6a 13/*
14 * note: these are intended for testing and should be avoided
15 * in favor of NEON version or platform-specific conversion
16 */
17
ae8f89db
PC
/*
 * LE16TOHx2(x): x holds two packed little-endian 16-bit values; convert
 * each half to host byte order while keeping the halves in place.
 * On little-endian hosts this is the identity.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define SWAP16(x) __builtin_bswap16(x)
/*
 * The swapped upper half is widened to unsigned before shifting: the
 * 16-bit result would otherwise be promoted to int and a value with
 * bit 15 set could not be shifted left by 16 without overflowing.
 */
#define LE16TOHx2(x) (((unsigned int)SWAP16((x) >> 16) << 16) | SWAP16(x))
#else
#define LE16TOHx2(x) (x)
#endif
24
d639fa7f 25#if defined(HAVE_bgr555_to_rgb565)
26
27/* have bgr555_to_rgb565 somewhere else */
28
29#elif ((defined(__clang_major__) && __clang_major__ >= 4) \
30 || (defined(__GNUC__) && __GNUC__ >= 5)) \
31 && __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
32
33#include <stdint.h>
34#include <assert.h>
35
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
/* use the NEON shift-and-insert intrinsics when available */
#define gsli(d_, s_, n_) d_ = vsliq_n_u16(d_, s_, n_)
#define gsri(d_, s_, n_) d_ = vsriq_n_u16(d_, s_, n_)
#else
/* generic vector-extension fallback: shift, then OR into the destination */
#define gsli(d_, s_, n_) d_ |= s_ << n_
#define gsri(d_, s_, n_) d_ |= s_ >> n_
#endif

/* eight 16-bit pixels; gvu16u is the same vector with only 2-byte alignment */
typedef uint16_t gvu16 __attribute__((vector_size(16),aligned(16)));
typedef uint16_t gvu16u __attribute__((vector_size(16),aligned(2)));
#define gdup(v_) {v_, v_, v_, v_, v_, v_, v_, v_}
/*
 * Scalar conversion of one pixel: the low 5 bits move to bits 11..15,
 * bits 5..9 move to bits 6..10 and bits 10..14 move to bits 0..4.
 * Bit 15 of the source is discarded and bit 5 of the result is zero.
 */
#define do_one(s) ({ \
  uint16_t d_ = (s) << 1; d_ = (d_ & 0x07c0) | (d_ << 10) | (d_ >> 11); d_; \
})
/* same conversion applied to all eight lanes of s_, result stored in d_ */
#define do_one_simd(d_, s_, c0x07c0_) { \
  gvu16 s1 = s_ << 1; \
  d_ = s1 & c0x07c0_; \
  gsli(d_, s_, 11); \
  gsri(d_, s1, 11); \
}

/*
 * Convert `bytes` bytes of 16-bit pixels from src_ to dst_, swapping the
 * two outer 5-bit channels (see do_one above).
 *
 * dst_ and src_ must be 2-byte aligned and `bytes` must be even (asserted).
 * The buffers must not overlap. Nothing is read or written outside the
 * first `bytes` bytes of either buffer.
 */
void bgr555_to_rgb565(void * __restrict__ dst_, const void * __restrict__ src_, int bytes)
{
  const uint16_t * __restrict__ src = src_;
  uint16_t * __restrict__ dst = dst_;
  gvu16 c0x07c0 = gdup(0x07c0);

  assert(!(((uintptr_t)dst | (uintptr_t)src | bytes) & 1));

  // align the destination
  // The head always loads and stores one whole vector, so it may only be
  // used when at least 16 bytes remain; shorter runs go straight to the
  // scalar tail below, which keeps all accesses inside the buffers.
  if (((uintptr_t)dst & 0x0e) && bytes >= 16)
  {
    int left = 0x10 - (int)((uintptr_t)dst & 0x0e);
    gvu16 d, s = *(const gvu16u *)src;
    do_one_simd(d, s, c0x07c0);
    *(gvu16u *)dst = d;
    // only `left` bytes are consumed; the rest of this vector is simply
    // converted again by the aligned loop
    dst += left / 2;
    src += left / 2;
    bytes -= left;
  }
  // go: dst is 16-byte aligned here whenever bytes >= 16
  for (; bytes >= 16; dst += 8, src += 8, bytes -= 16)
  {
    gvu16 d, s = *(const gvu16u *)src;
    do_one_simd(d, s, c0x07c0);
    *(gvu16 *)dst = d;
    __builtin_prefetch(src + 128/2);
  }
  // finish it
  for (; bytes > 0; dst++, src++, bytes -= 2)
    *dst = do_one(*src);
}
#undef do_one
#undef do_one_simd
91
92#else
c9099d02 93
/*
 * Portable fallback: convert `bytes` bytes of little-endian 16-bit pixels,
 * moving bits 0..4 to bits 11..15, bits 5..9 to bits 6..10 and bits 10..14
 * to bits 0..4. Output pixels are stored in host byte order.
 *
 * Pixels are processed two at a time through 32-bit words; when the pixel
 * count is odd the last pixel is converted separately. A zero or negative
 * `bytes` does nothing.
 */
void bgr555_to_rgb565(void *dst_, const void *src_, int bytes)
{
	const unsigned int *src = src_;
	unsigned int *dst = dst_;
	unsigned int p, r, g, b;
	int pairs, x;

	if (bytes <= 0)
		return;

	/* signed count and index, so the bound cannot wrap around */
	pairs = bytes / 4;
	for (x = 0; x < pairs; x++) {
		p = LE16TOHx2(src[x]);

		r = (p & 0x001f001f) << 11;
		g = (p & 0x03e003e0) << 1;
		b = (p & 0x7c007c00) >> 10;

		dst[x] = r | g | b;
	}

	/* odd pixel count: one 16-bit pixel is left after the word loop */
	if (bytes & 2) {
		const unsigned short *src16 = (const unsigned short *)(src + pairs);
		unsigned short *dst16 = (unsigned short *)(dst + pairs);

		/* with a zero upper half the macro only byte-swaps the low 16 bits */
		p = LE16TOHx2((unsigned int)*src16) & 0xffff;

		r = (p & 0x001f) << 11;
		g = (p & 0x03e0) << 1;
		b = (p & 0x7c00) >> 10;

		*dst16 = (unsigned short)(r | g | b);
	}
}
110
d57557c0 111#endif
112
57467c77 113#ifndef HAVE_bgr888_to_x
d57557c0 114
/*
 * Pack 24-bit pixels into 16-bit 5:6:5 pixels, two pixels per 32-bit store.
 *
 * For each 3-byte pixel the top 5 bits of byte 0 land in bits 11..15, the
 * top 6 bits of byte 1 in bits 5..10 and the top 5 bits of byte 2 in bits
 * 0..4. Input is consumed in 6-byte groups; any remainder shorter than
 * 6 bytes is left unconverted.
 */
void bgr888_to_rgb565(void *dst_, const void *src_, int bytes)
{
	const unsigned char *in = src_;
	unsigned int *out = dst_;

	while (bytes >= 6) {
		unsigned int first, second;

		first  = ((in[0] & 0xf8u) << 8) | ((in[1] & 0xfcu) << 3) | (in[2] >> 3);
		second = ((in[3] & 0xf8u) << 8) | ((in[4] & 0xfcu) << 3) | (in[5] >> 3);

		/* keep the first pixel at the lower address on either byte order */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
		*out = (first << 16) | second;
#else
		*out = (second << 16) | first;
#endif

		in += 6;
		out++;
		bytes -= 6;
	}
}
137
// TODO? the two conversions below are not implemented here;
// they are empty placeholders that leave both buffers untouched
void rgb888_to_rgb565(void *dst, const void *src, int bytes)
{
	(void)dst;
	(void)src;
	(void)bytes;
}

void bgr888_to_rgb888(void *dst, const void *src, int bytes)
{
	(void)dst;
	(void)src;
	(void)bytes;
}
a80ae4a0 141
c9099d02 142#endif // HAVE_bgr888_to_x
143
/* YUV stuff */

/* per-channel luma contribution for each 5-bit level, scaled by 65536 */
static int yuv_ry[32], yuv_gy[32], yuv_by[32];
/* chroma lookup for differences -32..31, stored at index (difference + 32) */
static unsigned char yuv_u[32 * 2], yuv_v[32 * 2];

/* limit a value to the 0..255 range of a table entry */
static unsigned char uyvy_clamp_u8(int value)
{
	if (value < 0)
		return 0;
	if (value > 255)
		return 255;
	return (unsigned char)value;
}

/*
 * Fill the lookup tables used by the *_to_uyvy converters.
 *
 * The tables implement, for 5-bit channel values:
 *   y = (int)((0.299f * r) + (0.587f * g) + (0.114f * b));
 *   u = (int)(8 * 0.565f * (b - y)) + 128;
 *   v = (int)(8 * 0.713f * (r - y)) + 128;
 * with u and v clamped to 0..255.
 */
void bgr_to_uyvy_init(void)
{
	int level, slot;

	/* luma weights, rounded to nearest after scaling by 65536 */
	for (level = 0; level < 32; level++) {
		yuv_ry[level] = (int)(0.299f * level * 65536.0f + 0.5f);
		yuv_gy[level] = (int)(0.587f * level * 65536.0f + 0.5f);
		yuv_by[level] = (int)(0.114f * level * 65536.0f + 0.5f);
	}

	/* chroma: slot 32 corresponds to a zero difference */
	for (slot = 0; slot < 32 * 2; slot++) {
		int diff = slot - 32;

		yuv_u[slot] = uyvy_clamp_u8((int)(8 * 0.565f * diff) + 128);
		yuv_v[slot] = uyvy_clamp_u8((int)(8 * 0.713f * diff) + 128);
	}
}
178
5b9aa749 179void rgb565_to_uyvy(void *d, const void *s, int pixels)
180{
181 unsigned int *dst = d;
182 const unsigned short *src = s;
183 const unsigned char *yu = yuv_u + 32;
184 const unsigned char *yv = yuv_v + 32;
185 int r0, g0, b0, r1, g1, b1;
186 int y0, y1, u, v;
187
188 for (; pixels > 0; src += 2, dst++, pixels -= 2)
189 {
190 r0 = (src[0] >> 11) & 0x1f;
191 g0 = (src[0] >> 6) & 0x1f;
192 b0 = src[0] & 0x1f;
193 r1 = (src[1] >> 11) & 0x1f;
194 g1 = (src[1] >> 6) & 0x1f;
195 b1 = src[1] & 0x1f;
196 y0 = (yuv_ry[r0] + yuv_gy[g0] + yuv_by[b0]) >> 16;
197 y1 = (yuv_ry[r1] + yuv_gy[g1] + yuv_by[b1]) >> 16;
198 u = yu[b0 - y0];
199 v = yv[r0 - y0];
200 // valid Y range seems to be 16..235
201 y0 = 16 + 219 * y0 / 31;
202 y1 = 16 + 219 * y1 / 31;
203
204 *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u;
205 }
206}
207
c9099d02 208void bgr555_to_uyvy(void *d, const void *s, int pixels)
209{
210 unsigned int *dst = d;
211 const unsigned short *src = s;
212 const unsigned char *yu = yuv_u + 32;
213 const unsigned char *yv = yuv_v + 32;
214 int r0, g0, b0, r1, g1, b1;
215 int y0, y1, u, v;
216
217 for (; pixels > 0; src += 2, dst++, pixels -= 2)
218 {
219 b0 = (src[0] >> 10) & 0x1f;
220 g0 = (src[0] >> 5) & 0x1f;
221 r0 = src[0] & 0x1f;
222 b1 = (src[1] >> 10) & 0x1f;
223 g1 = (src[1] >> 5) & 0x1f;
224 r1 = src[1] & 0x1f;
225 y0 = (yuv_ry[r0] + yuv_gy[g0] + yuv_by[b0]) >> 16;
226 y1 = (yuv_ry[r1] + yuv_gy[g1] + yuv_by[b1]) >> 16;
227 u = yu[b0 - y0];
228 v = yv[r0 - y0];
c9099d02 229 y0 = 16 + 219 * y0 / 31;
230 y1 = 16 + 219 * y1 / 31;
231
232 *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u;
233 }
234}
235
236void bgr888_to_uyvy(void *d, const void *s, int pixels)
237{
238 unsigned int *dst = d;
239 const unsigned char *src8 = s;
240 const unsigned char *yu = yuv_u + 32;
241 const unsigned char *yv = yuv_v + 32;
242 int r0, g0, b0, r1, g1, b1;
243 int y0, y1, u, v;
244
245 for (; pixels > 0; src8 += 3*2, dst++, pixels -= 2)
246 {
247 r0 = src8[0], g0 = src8[1], b0 = src8[2];
248 r1 = src8[3], g1 = src8[4], b1 = src8[5];
249 y0 = (r0 * 19595 + g0 * 38470 + b0 * 7471) >> 16;
250 y1 = (r1 * 19595 + g1 * 38470 + b1 * 7471) >> 16;
251 u = yu[(b0 - y0) / 8];
252 v = yv[(r0 - y0) / 8];
253 y0 = 16 + 219 * y0 / 255;
254 y1 = 16 + 219 * y1 / 255;
255
256 *dst = (y1 << 24) | (v << 16) | (y0 << 8) | u;
257 }
258}