libretro: disable spu thread by default, with option to reenable
[pcsx_rearmed.git] / libretro-common / encodings / encoding_utf.c
CommitLineData
07a88422 1/* Copyright (C) 2010-2020 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (encoding_utf.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#include <stdint.h>
24#include <stdlib.h>
25#include <stddef.h>
26#include <string.h>
27
28#include <boolean.h>
29#include <compat/strl.h>
30#include <retro_inline.h>
31
32#include <encodings/utf.h>
33
34#if defined(_WIN32) && !defined(_XBOX)
35#include <windows.h>
36#elif defined(_XBOX)
37#include <xtl.h>
38#endif
39
/* Yields the char currently addressed by *string, then advances *string
 * by one. 'string' must be a pointer to a char pointer. */
#define UTF8_WALKBYTE(string) (*(*(string))++)
41
/* Counts the consecutive 1 bits at the top of 'c', starting from the
 * most significant bit. The result is in the range 0..8. */
static unsigned leading_ones(uint8_t c)
{
   unsigned count = 0;
   uint8_t  mask;

   /* Slide a single-bit mask downwards until a 0 bit is met
    * or all eight bits have been visited. */
   for (mask = 0x80; mask != 0 && (c & mask) != 0; mask >>= 1)
      count++;

   return count;
}
53
/* Decodes UTF-8 bytes from 'in' (at most in_size bytes) into code points
 * stored in 'out' (at most out_chars entries).
 *
 * Deliberately simple: continuation bytes are not validated, so the input
 * is expected to be synchronized and complete. Decoding stops early on a
 * lead byte with exactly one or more than six leading 1 bits, or when a
 * sequence would run past in_size.
 *
 * Returns the number of code points written to 'out'. */
size_t utf8_conv_utf32(uint32_t *out, size_t out_chars,
      const char *in, size_t in_size)
{
   size_t converted = 0;

   for (; in_size != 0 && out_chars != 0; out_chars--, converted++)
   {
      unsigned k;
      unsigned trail;
      uint32_t codepoint;
      uint8_t  lead   = *in++;
      unsigned prefix = leading_ones(lead);

      /* A lone continuation byte (one leading 1) or an
       * over-long prefix cannot start a sequence. */
      if (prefix == 1 || prefix > 6)
         break;

      /* Number of continuation bytes that follow the lead byte. */
      trail = (prefix == 0) ? 0 : prefix - 1;

      /* The whole sequence must fit in the remaining input. */
      if (in_size < 1 + trail)
         break;

      /* Payload bits of the lead byte land above all continuation bits. */
      codepoint = (lead & ((1 << (7 - prefix)) - 1)) << (6 * trail);

      /* Each continuation byte contributes six bits, highest first. */
      for (k = 0; k < trail; k++, in++)
         codepoint |= (*in & 0x3f) << ((trail - 1 - k) * 6);

      *out++   = codepoint;
      in_size -= 1 + trail;
   }

   return converted;
}
90
/* Converts in_size UTF-16 code units from 'in' to UTF-8.
 *
 * When 'out' is NULL nothing is stored and only the required number of
 * bytes is counted, so the function can be used as a sizing pass.
 * No capacity for 'out' is passed in: the caller must provide a buffer
 * large enough for the whole result.
 *
 * *out_chars always receives the number of bytes produced (or counted)
 * so far. Returns true when all input was converted, false when an
 * unpaired surrogate was encountered. */
bool utf16_conv_utf8(uint8_t *out, size_t *out_chars,
      const uint16_t *in, size_t in_size)
{
   /* Lead-byte markers indexed by (number of continuation bytes - 1). */
   static const uint8_t lead_marker[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
   size_t written  = 0;
   size_t consumed = 0;
   bool   ok       = true;

   while (consumed < in_size)
   {
      unsigned trail;
      uint32_t cp = in[consumed++];

      /* ASCII maps to a single byte. */
      if (cp < 0x80)
      {
         if (out)
            out[written] = (uint8_t)cp;
         written++;
         continue;
      }

      /* Surrogate range: needs a high unit followed by a low unit. */
      if (cp >= 0xD800 && cp < 0xE000)
      {
         uint32_t low;

         if (cp >= 0xDC00 || consumed == in_size)
         {
            ok = false;
            break;
         }

         low = in[consumed++];
         if (low < 0xDC00 || low >= 0xE000)
         {
            ok = false;
            break;
         }

         cp = (((cp - 0xD800) << 10) | (low - 0xDC00)) + 0x10000;
      }

      /* Find how many continuation bytes the value needs (1..5). */
      trail = 1;
      while (trail < 5 && cp >= (((uint32_t)1) << (trail * 5 + 6)))
         trail++;

      /* Lead byte: marker plus the topmost payload bits. */
      if (out)
         out[written] = (uint8_t)(lead_marker[trail - 1]
               + (cp >> (6 * trail)));
      written++;

      /* Continuation bytes, most significant six-bit group first. */
      for (; trail > 0; trail--)
      {
         if (out)
            out[written] = (uint8_t)(0x80
                  + ((cp >> (6 * (trail - 1))) & 0x3F));
         written++;
      }
   }

   *out_chars = written;
   return ok;
}
150
/* Acts mostly like strlcpy.
 *
 * Copies up to 'chars' UTF-8 characters from 's' into 'd',
 * but never more than d_len bytes including the terminator.
 *
 * Always NUL-terminates 'd' when d is non-NULL and d_len > 0.
 * Does not copy half a character.
 *
 * Returns the number of bytes copied, not counting the terminator.
 * Returns 0 without touching 'd' when d is NULL or d_len is 0, and
 * stores an empty string when 's' is NULL.
 *
 * 's' is assumed to be valid, NUL-terminated UTF-8.
 * Use only if 'chars' is considerably less than 'd_len'. */
size_t utf8cpy(char *d, size_t d_len, const char *s, size_t chars)
{
   const uint8_t *sb     = (const uint8_t*)s;
   const uint8_t *sb_org = sb;
   size_t n_bytes;

   /* Without room for the terminator nothing can be stored; this
    * also keeps 'd_len - 1' below from wrapping around. */
   if (!d || d_len == 0)
      return 0;

   if (!s)
   {
      d[0] = '\0';
      return 0;
   }

   /* Advance over 'chars' characters: one lead byte followed by
    * any number of continuation bytes (10xxxxxx). */
   while (*sb && chars-- > 0)
   {
      sb++;
      while ((*sb & 0xC0) == 0x80)
         sb++;
   }

   /* Too long for the destination: cut at the byte limit, then back
    * up to a character boundary so no sequence is split. */
   if ((size_t)(sb - sb_org) > d_len - 1 /* NUL */)
   {
      sb = sb_org + d_len - 1;
      while (sb > sb_org && (*sb & 0xC0) == 0x80)
         sb--;
   }

   n_bytes = (size_t)(sb - sb_org);
   memcpy(d, sb_org, n_bytes);
   d[n_bytes] = '\0';

   return n_bytes;
}
188
/* Returns a pointer 'chars' UTF-8 characters into 'str'.
 *
 * A character is one lead byte plus any continuation bytes (10xxxxxx)
 * that follow it. Skipping stops at the NUL terminator, so asking for
 * more characters than the string holds returns a pointer to the
 * terminator instead of reading past it.
 *
 * Returns 'str' unchanged when 'str' is NULL or 'chars' is 0. */
const char *utf8skip(const char *str, size_t chars)
{
   const uint8_t *strb = (const uint8_t*)str;

   if (!str)
      return str;

   while (chars > 0 && *strb)
   {
      strb++;
      while ((*strb & 0xC0) == 0x80)
         strb++;
      chars--;
   }

   return (const char*)strb;
}
206
/* Counts the characters in a NUL-terminated UTF-8 string by counting
 * every byte that is not a continuation byte (10xxxxxx).
 * Returns 0 for a NULL pointer. */
size_t utf8len(const char *string)
{
   size_t count = 0;
   const char *p;

   if (string == NULL)
      return count;

   for (p = string; *p != '\0'; p++)
      count += ((*p & 0xC0) != 0x80) ? 1 : 0;

   return count;
}
222
/* Decodes one code point starting at *string and advances *string past
 * the bytes consumed (1 to 4, chosen from the first byte alone).
 *
 * Does not validate the input, returns garbage if it's not UTF-8. */
uint32_t utf8_walk(const char **string)
{
   const uint8_t *cursor = (const uint8_t*)*string;
   uint8_t  lead         = *cursor++;
   uint32_t cp;

   if (lead < 0x80)
   {
      /* Single-byte (ASCII) character. */
      cp = lead;
   }
   else
   {
      /* Every multi-byte form has at least one continuation byte. */
      cp = *cursor++ & 0x3F;

      if (lead < 0xE0)
      {
         /* Two bytes: five payload bits in the lead byte. */
         cp |= (uint32_t)(lead & 31) << 6;
      }
      else
      {
         cp = (cp << 6) | (*cursor++ & 0x3F);

         if (lead < 0xF0)
         {
            /* Three bytes: four payload bits in the lead byte. */
            cp |= (uint32_t)(lead & 15) << 12;
         }
         else
         {
            /* Four bytes: three payload bits in the lead byte. */
            cp  = (cp << 6) | (*cursor++ & 0x3F);
            cp |= (uint32_t)(lead & 7) << 18;
         }
      }
   }

   *string = (const char*)cursor;
   return cp;
}
246
/* Converts the zero-terminated UTF-16 string 'in' to UTF-8 in a freshly
 * malloc'd buffer stored in *utf_data.
 *
 * The buffer is sized from a counting pass plus one extra byte; the
 * terminator itself is not written here. On return from the second pass
 * *dest_len holds the number of UTF-8 bytes produced.
 *
 * Returns false when the allocation fails or the conversion reports an
 * error. *utf_data may be non-NULL in the latter case and is still owned
 * by the caller. */
static bool utf16_to_char(uint8_t **utf_data,
      size_t *dest_len, const uint16_t *in)
{
   unsigned units;

   /* Length of the input in 16-bit units, terminator excluded. */
   for (units = 0; in[units] != '\0'; units++)
      ;

   /* Counting pass: NULL output only measures the UTF-8 size. */
   utf16_conv_utf8(NULL, dest_len, in, units);
   *dest_len += 1;

   *utf_data = (uint8_t*)malloc(*dest_len);
   if (!*utf_data)
      return false;

   return utf16_conv_utf8(*utf_data, dest_len, in, units);
}
263
/* Converts the zero-terminated UTF-16 string 'in' to UTF-8 and copies
 * the result into 's' with strlcpy, using 'len' as the size of 's'.
 *
 * Returns true on success. On failure 's' is left untouched and
 * false is returned. */
bool utf16_to_char_string(const uint16_t *in, char *s, size_t len)
{
   uint8_t *utf8_buf = NULL;
   size_t   utf8_len = 0;

   if (!utf16_to_char(&utf8_buf, &utf8_len, in))
   {
      /* The helper may have allocated before failing. */
      free(utf8_buf);
      return false;
   }

   /* The helper reserves one byte beyond utf8_len for the terminator. */
   utf8_buf[utf8_len] = 0;
   strlcpy(s, (const char*)utf8_buf, len);

   free(utf8_buf);
   return true;
}
281
282#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
283/* Returned pointer MUST be freed by the caller if non-NULL. */
284static char *mb_to_mb_string_alloc(const char *str,
285 enum CodePage cp_in, enum CodePage cp_out)
286{
287 wchar_t *path_buf_wide = NULL;
288 int path_buf_wide_len = MultiByteToWideChar(cp_in, 0, str, -1, NULL, 0);
289
290 /* Windows 95 will return 0 from these functions with
291 * a UTF8 codepage set without MSLU.
292 *
293 * From an unknown MSDN version (others omit this info):
294 * - CP_UTF8 Windows 98/Me, Windows NT 4.0 and later:
295 * Translate using UTF-8. When this is set, dwFlags must be zero.
296 * - Windows 95: Under the Microsoft Layer for Unicode,
297 * MultiByteToWideChar also supports CP_UTF7 and CP_UTF8.
298 */
299
300 if (!path_buf_wide_len)
301 return strdup(str);
302
303 path_buf_wide = (wchar_t*)
304 calloc(path_buf_wide_len + sizeof(wchar_t), sizeof(wchar_t));
305
306 if (path_buf_wide)
307 {
308 MultiByteToWideChar(cp_in, 0,
309 str, -1, path_buf_wide, path_buf_wide_len);
310
311 if (*path_buf_wide)
312 {
313 int path_buf_len = WideCharToMultiByte(cp_out, 0,
314 path_buf_wide, -1, NULL, 0, NULL, NULL);
315
316 if (path_buf_len)
317 {
318 char *path_buf = (char*)
319 calloc(path_buf_len + sizeof(char), sizeof(char));
320
321 if (path_buf)
322 {
323 WideCharToMultiByte(cp_out, 0,
324 path_buf_wide, -1, path_buf,
325 path_buf_len, NULL, NULL);
326
327 free(path_buf_wide);
328
329 if (*path_buf)
330 return path_buf;
331
332 free(path_buf);
333 return NULL;
334 }
335 }
336 else
337 {
338 free(path_buf_wide);
339 return strdup(str);
340 }
341 }
342
343 free(path_buf_wide);
344 }
345
346 return NULL;
347}
348#endif
349
/* Returns a newly allocated copy of the UTF-8 string 'str' converted to
 * the local code page. The conversion only happens on non-UNICODE,
 * non-Xbox Windows builds; everywhere else the string is duplicated
 * unchanged. Returns NULL for a NULL or empty input.
 *
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* utf8_to_local_string_alloc(const char *str)
{
   if (!str || !*str)
      return NULL;

#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
   return mb_to_mb_string_alloc(str, CODEPAGE_UTF8, CODEPAGE_LOCAL);
#else
   /* assume string needs no modification if not on Windows */
   return strdup(str);
#endif
}
364
/* Returns a newly allocated copy of the local-code-page string 'str'
 * converted to UTF-8. The conversion only happens on non-UNICODE,
 * non-Xbox Windows builds; everywhere else the string is duplicated
 * unchanged. Returns NULL for a NULL or empty input.
 *
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* local_to_utf8_string_alloc(const char *str)
{
   if (!str || !*str)
      return NULL;

#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
   return mb_to_mb_string_alloc(str, CODEPAGE_LOCAL, CODEPAGE_UTF8);
#else
   /* assume string needs no modification if not on Windows */
   return strdup(str);
#endif
}
379
/* Converts the UTF-8 string 'str' to a newly allocated wide string.
 *
 * On Windows the conversion uses MultiByteToWideChar with CP_UTF8 and
 * falls back to CP_ACP when the UTF-8 size query returns 0. Elsewhere it
 * uses mbstowcs, which assumes the current locale is UTF-8.
 *
 * Returns NULL for a NULL or empty input, on allocation failure,
 * or when the conversion fails.
 *
 * Returned pointer MUST be freed by the caller if non-NULL. */
wchar_t* utf8_to_utf16_string_alloc(const char *str)
{
   wchar_t *buf = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   {
      UINT code_page = CP_UTF8;
      int  len       = MultiByteToWideChar(code_page, 0, str, -1, NULL, 0);

      /* fallback to ANSI codepage instead */
      if (!len)
      {
         code_page = CP_ACP;
         len       = MultiByteToWideChar(code_page, 0, str, -1, NULL, 0);
      }

      /* Neither code page could size the string. */
      if (len <= 0)
         return NULL;

      buf = (wchar_t*)calloc(len, sizeof(wchar_t));

      if (!buf)
         return NULL;

      /* MultiByteToWideChar signals failure with 0, never a
       * negative value, so a '< 0' test would miss it. */
      if (MultiByteToWideChar(code_page, 0, str, -1, buf, len) <= 0)
      {
         free(buf);
         return NULL;
      }
   }
#else
   {
      /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */
      size_t len = mbstowcs(NULL, str, 0);

      /* (size_t)-1 marks an invalid multibyte sequence. */
      if (len == (size_t)-1)
         return NULL;

      len++; /* room for the terminator */

      buf = (wchar_t*)calloc(len, sizeof(wchar_t));

      if (!buf)
         return NULL;

      if (mbstowcs(buf, str, len) == (size_t)-1)
      {
         free(buf);
         return NULL;
      }
   }
#endif

   return buf;
}
451
/* Converts the wide string 'str' to a newly allocated multibyte string.
 *
 * On Windows the conversion uses WideCharToMultiByte with CP_UTF8 and
 * falls back to CP_ACP when the UTF-8 size query returns 0. Elsewhere it
 * uses wcstombs, which assumes the current locale is UTF-8.
 *
 * Returns NULL for a NULL or empty input, on allocation failure,
 * or when the conversion fails.
 *
 * Returned pointer MUST be freed by the caller if non-NULL. */
char* utf16_to_utf8_string_alloc(const wchar_t *str)
{
   char *buf = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   {
      UINT code_page = CP_UTF8;
      int  len       = WideCharToMultiByte(code_page,
            0, str, -1, NULL, 0, NULL, NULL);

      /* fallback to ANSI codepage instead */
      if (!len)
      {
         code_page = CP_ACP;
         len       = WideCharToMultiByte(code_page,
               0, str, -1, NULL, 0, NULL, NULL);
      }

      /* Neither code page could size the string; do not hand
       * out a zero-length allocation. */
      if (len <= 0)
         return NULL;

      buf = (char*)calloc(len, sizeof(char));

      if (!buf)
         return NULL;

      /* WideCharToMultiByte signals failure with 0, never a
       * negative value, so a '< 0' test would miss it. */
      if (WideCharToMultiByte(code_page,
               0, str, -1, buf, len, NULL, NULL) <= 0)
      {
         free(buf);
         return NULL;
      }
   }
#else
   {
      /* NOTE: For now, assume non-Windows platforms'
       * locale is already UTF-8. */
      size_t len = wcstombs(NULL, str, 0);

      /* (size_t)-1 marks a wide character that cannot be converted. */
      if (len == (size_t)-1)
         return NULL;

      len++; /* room for the terminator */

      buf = (char*)calloc(len, sizeof(char));

      if (!buf)
         return NULL;

      if (wcstombs(buf, str, len) == (size_t)-1)
      {
         free(buf);
         return NULL;
      }
   }
#endif

   return buf;
}