cdrom: fix a copy-paste mistake
[pcsx_rearmed.git] / deps / libretro-common / encodings / encoding_utf.c
CommitLineData
3719602c
PC
1/* Copyright (C) 2010-2020 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (encoding_utf.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#include <stdint.h>
24#include <stdlib.h>
25#include <stddef.h>
26#include <string.h>
27
28#include <boolean.h>
29#include <compat/strl.h>
30#include <retro_inline.h>
31
32#include <encodings/utf.h>
33
34#if defined(_WIN32) && !defined(_XBOX)
35#include <windows.h>
36#elif defined(_XBOX)
37#include <xtl.h>
38#endif
39
40#define UTF8_WALKBYTE(string) (*((*(string))++))
41
/* Counts the run of consecutive set bits at the top of @c,
 * starting from the most significant bit. */
static unsigned leading_ones(uint8_t c)
{
   unsigned count;

   for (count = 0; c & 0x80; count++)
      c <<= 1;

   return count;
}

/**
 * utf8_conv_utf32:
 *
 * Decodes UTF-8 bytes from @in into code points stored in @out.
 * Decoding stops when either @in_size bytes have been consumed,
 * @out_chars code points have been written, a lead byte is
 * invalid, or a sequence would run past the end of the input.
 *
 * Continuation bytes are not validated; the input is assumed to
 * be properly synchronized and terminated.
 *
 * @return Number of code points written to @out.
 **/
size_t utf8_conv_utf32(uint32_t *out, size_t out_chars,
      const char *in, size_t in_size)
{
   size_t written = 0;

   while (in_size && out_chars)
   {
      unsigned k;
      unsigned tail_bytes;
      uint32_t code;
      uint8_t lead  = *in++;
      unsigned ones = leading_ones(lead);

      /* A single leading one is a continuation byte (desync);
       * more than six leading ones is not a valid lead byte. */
      if (ones == 1 || ones > 6)
         break;

      tail_bytes = ones ? ones - 1 : 0;

      /* The whole sequence must fit in the remaining input. */
      if (1 + tail_bytes > in_size)
         break;

      /* Payload bits of the lead byte, then six bits
       * from each byte that follows. */
      code = lead & ((1 << (7 - ones)) - 1);
      for (k = 0; k < tail_bytes; k++)
         code = (code << 6) | (*in++ & 0x3f);

      *out++   = code;
      in_size -= 1 + tail_bytes;
      out_chars--;
      written++;
   }

   return written;
}
93
/**
 * utf16_conv_utf8:
 *
 * Converts @in_size UTF-16 code units from @in to UTF-8.
 * When @out is NULL nothing is written and only the required
 * byte count is computed.
 *
 * @out_chars always receives the number of bytes produced
 * (or that would have been produced) up to the point where
 * conversion stopped.
 *
 * @return true if the whole input was converted, false if an
 * unpaired or misordered surrogate was encountered.
 **/
bool utf16_conv_utf8(uint8_t *out, size_t *out_chars,
      const uint16_t *in, size_t in_size)
{
   /* Lead-byte markers for sequences with 1..5 trailing bytes. */
   static const uint8_t lead_marks[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
   size_t rd = 0;
   size_t wr = 0;
   bool ok   = true;

   while (rd != in_size)
   {
      unsigned tail;
      uint32_t cp = in[rd++];

      /* ASCII maps to a single byte. */
      if (cp < 0x80)
      {
         if (out)
            out[wr] = (uint8_t)cp;
         wr++;
         continue;
      }

      /* Surrogate range: a high surrogate must be followed
       * by a low surrogate. */
      if (cp >= 0xD800 && cp < 0xE000)
      {
         uint32_t low;

         if (cp >= 0xDC00 || rd == in_size)
         {
            ok = false;
            break;
         }

         low = in[rd++];
         if (low < 0xDC00 || low >= 0xE000)
         {
            ok = false;
            break;
         }

         cp = (((cp - 0xD800) << 10) | (low - 0xDC00)) + 0x10000;
      }

      /* Number of continuation bytes needed for this value. */
      tail = 1;
      while (tail < 5 && cp >= (((uint32_t)1) << (tail * 5 + 6)))
         tail++;

      if (out)
         out[wr] = (uint8_t)(lead_marks[tail - 1] + (cp >> (6 * tail)));
      wr++;

      /* Continuation bytes, most significant six bits first. */
      while (tail != 0)
      {
         tail--;
         if (out)
            out[wr] = (uint8_t)(0x80 + ((cp >> (6 * tail)) & 0x3F));
         wr++;
      }
   }

   *out_chars = wr;
   return ok;
}
158
/**
 * utf8cpy:
 *
 * Acts mostly like strlcpy.
 *
 * Copies up to @chars UTF-8 characters from @s into @d,
 * but never more than @d_len bytes including the terminator.
 *
 * Always NUL-terminates when something is copied. Does not copy
 * half a character. @s is assumed to be valid UTF-8.
 * Use only if @chars is considerably less than @d_len.
 *
 * If @s is NULL, @d is NULL, or @d_len is 0, nothing is written
 * and 0 is returned.
 *
 * @return Number of bytes copied, not counting the terminator.
 **/
size_t utf8cpy(char *d, size_t d_len, const char *s, size_t chars)
{
   const uint8_t *sb     = (const uint8_t*)s;
   const uint8_t *sb_org = sb;
   size_t bytes;

   if (!s)
      return 0;

   /* Without room for even the terminator there is nothing
    * safe to write; d_len - 1 below would otherwise wrap. */
   if (!d || !d_len)
      return 0;

   /* Advance over up to @chars characters: one lead byte
    * plus any continuation bytes that follow it. */
   while (*sb && chars)
   {
      sb++;
      while ((*sb & 0xC0) == 0x80)
         sb++;
      chars--;
   }

   if ((size_t)(sb - sb_org) > d_len - 1 /* NUL */)
   {
      /* Too long: cut at the byte limit, then back up to the
       * start of the character that would have been split. */
      sb = sb_org + d_len - 1;
      while (sb > sb_org && (*sb & 0xC0) == 0x80)
         sb--;
   }

   bytes = (size_t)(sb - sb_org);
   memcpy(d, sb_org, bytes);
   d[bytes] = '\0';

   return bytes;
}
200
/**
 * utf8skip:
 *
 * Advances past @chars UTF-8 characters in @str. A character
 * is one byte plus any continuation bytes (10xxxxxx) that
 * immediately follow it.
 *
 * Stops early at the NUL terminator, so asking for more
 * characters than the string holds yields a pointer to the
 * terminator instead of reading past the end.
 *
 * Leaf function.
 *
 * @return Pointer into @str after the skipped characters;
 * @str itself if it is NULL or @chars is 0.
 **/
const char *utf8skip(const char *str, size_t chars)
{
   const uint8_t *strb = (const uint8_t*)str;

   if (!str)
      return str;

   while (chars && *strb)
   {
      strb++;
      while ((*strb & 0xC0) == 0x80)
         strb++;
      chars--;
   }

   return (const char*)strb;
}
223
/**
 * utf8len:
 *
 * Counts the characters in a NUL-terminated UTF-8 string by
 * counting every byte that is not a continuation byte
 * (10xxxxxx).
 *
 * Leaf function.
 *
 * @return Number of characters, or 0 if @string is NULL.
 **/
size_t utf8len(const char *string)
{
   const char *p;
   size_t count = 0;

   if (!string)
      return 0;

   for (p = string; *p; p++)
   {
      if ((*p & 0xC0) == 0x80)
         continue;
      count++;
   }

   return count;
}
244
/**
 * utf8_walk:
 *
 * Decodes one character at *@string and advances *@string
 * past the bytes it consumed. The lead byte alone decides how
 * many bytes are read: one below 0x80, two below 0xE0,
 * three below 0xF0, four otherwise.
 *
 * Does not validate the input.
 *
 * Leaf function.
 *
 * @return The decoded code point. Returns garbage if the
 * input is not UTF-8.
 **/
uint32_t utf8_walk(const char **string)
{
   const char *cur = *string;
   uint8_t lead    = *cur++;
   uint32_t acc;

   if (lead < 128)
   {
      *string = cur;
      return lead;
   }

   /* Every multi-byte form has at least one trailing byte. */
   acc = *cur++ & 0x3F;

   if (lead >= 0xE0)
   {
      acc = (acc << 6) | (*cur++ & 0x3F);

      if (lead >= 0xF0)
      {
         acc  = (acc << 6) | (*cur++ & 0x3F);
         acc |= (lead & 7) << 18;
      }
      else
         acc |= (lead & 15) << 12;
   }
   else
      acc |= (lead & 31) << 6;

   *string = cur;
   return acc;
}
276
/* Converts the zero-terminated UTF-16 string @in into a newly
 * malloc'd UTF-8 buffer stored in *@utf_data.
 *
 * A first sizing pass computes the byte count; the buffer is
 * allocated one byte larger than that. On return from the second
 * pass *@dest_len holds the number of bytes written (the extra
 * byte is not counted and not written here).
 *
 * Returns the result of the conversion, or false if the
 * allocation failed. *@utf_data may be non-NULL even when false
 * is returned; the caller is responsible for freeing it. */
static bool utf16_to_char(uint8_t **utf_data,
      size_t *dest_len, const uint16_t *in)
{
   unsigned units;

   /* Length of the input in UTF-16 code units. */
   for (units = 0; in[units] != '\0'; units++)
      ;

   /* Sizing pass; its status is intentionally not inspected. */
   utf16_conv_utf8(NULL, dest_len, in, units);
   *dest_len += 1;

   *utf_data = (uint8_t*)malloc(*dest_len);
   if (!*utf_data)
      return false;

   return utf16_conv_utf8(*utf_data, dest_len, in, units);
}
289
/**
 * utf16_to_char_string:
 *
 * Converts the zero-terminated UTF-16 string @in to UTF-8 and
 * copies the result into @s with strlcpy, using @len as the
 * size passed to strlcpy.
 *
 * @s is left untouched when the conversion fails.
 *
 * @return true if the conversion succeeded, false otherwise.
 **/
bool utf16_to_char_string(const uint16_t *in, char *s, size_t len)
{
   uint8_t *converted   = NULL;
   size_t converted_len = 0;
   bool ok              = utf16_to_char(&converted, &converted_len, in);

   if (ok)
   {
      /* The helper leaves one spare byte past the converted
       * data; use it for the terminator. */
      converted[converted_len] = 0;
      strlcpy(s, (const char*)converted, len);
   }

   /* The temporary buffer is released on both paths. */
   free(converted);

   return ok;
}
310
#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
/**
 * mb_to_mb_string_alloc:
 *
 * Re-encodes @str from code page @cp_in to code page @cp_out by
 * going through a temporary wide-character buffer.
 *
 * When either size query reports 0, a plain copy of @str is
 * returned instead (Windows 95 returns 0 from these functions
 * for a UTF-8 code page without MSLU).
 *
 * @return Returned pointer MUST be freed by the caller if non-NULL.
 * NULL on allocation failure or when a conversion yields an
 * empty string.
 **/
static char *mb_to_mb_string_alloc(const char *str,
      enum CodePage cp_in, enum CodePage cp_out)
{
   wchar_t *wide = NULL;
   char *narrow  = NULL;
   int narrow_len;
   int wide_len  = MultiByteToWideChar(cp_in, 0, str, -1, NULL, 0);

   /* Size query failed: hand back an unconverted copy. */
   if (!wide_len)
      return strdup(str);

   wide = (wchar_t*)calloc(wide_len + sizeof(wchar_t), sizeof(wchar_t));
   if (!wide)
      return NULL;

   MultiByteToWideChar(cp_in, 0, str, -1, wide, wide_len);

   /* Nothing was produced by the first conversion. */
   if (!*wide)
   {
      free(wide);
      return NULL;
   }

   narrow_len = WideCharToMultiByte(cp_out, 0,
         wide, -1, NULL, 0, NULL, NULL);

   /* Second size query failed: same fallback as above. */
   if (!narrow_len)
   {
      free(wide);
      return strdup(str);
   }

   narrow = (char*)calloc(narrow_len + sizeof(char), sizeof(char));
   if (!narrow)
   {
      free(wide);
      return NULL;
   }

   WideCharToMultiByte(cp_out, 0,
         wide, -1, narrow, narrow_len, NULL, NULL);
   free(wide);

   if (*narrow)
      return narrow;

   free(narrow);
   return NULL;
}
#endif
380
/**
 * utf8_to_local_string_alloc:
 *
 * Produces a newly allocated copy of the UTF-8 string @str in
 * the local code page. On builds other than non-UNICODE desktop
 * Windows the string is duplicated unchanged.
 *
 * @return Returned pointer MUST be freed by the caller if non-NULL.
 * NULL if @str is NULL or empty.
 **/
char* utf8_to_local_string_alloc(const char *str)
{
   if (!str || !*str)
      return NULL;

#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
   return mb_to_mb_string_alloc(str, CODEPAGE_UTF8, CODEPAGE_LOCAL);
#else
   /* Assume string needs no modification if not on Windows */
   return strdup(str);
#endif
}
396
/**
 * local_to_utf8_string_alloc:
 *
 * Produces a newly allocated UTF-8 copy of @str, which is taken
 * to be in the local code page. On builds other than non-UNICODE
 * desktop Windows the string is duplicated unchanged.
 *
 * @return Returned pointer MUST be freed by the caller if non-NULL.
 * NULL if @str is NULL or empty.
 **/
char *local_to_utf8_string_alloc(const char *str)
{
   if (!str || !*str)
      return NULL;

#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
   return mb_to_mb_string_alloc(str, CODEPAGE_LOCAL, CODEPAGE_UTF8);
#else
   /* Assume string needs no modification if not on Windows */
   return strdup(str);
#endif
}
412
/**
 * utf8_to_utf16_string_alloc:
 *
 * Converts the UTF-8 string @str into a newly allocated,
 * zero-terminated wide-character string.
 *
 * On Windows the conversion uses CP_UTF8 and falls back to the
 * ANSI code page if the UTF-8 size query fails. Elsewhere
 * mbstowcs() is used, on the assumption that the locale is
 * already UTF-8.
 *
 * @return Returned pointer MUST be freed by the caller if non-NULL.
 * NULL if @str is NULL or empty, or if allocation or conversion
 * fails.
 **/
wchar_t* utf8_to_utf16_string_alloc(const char *str)
{
#ifdef _WIN32
   int len      = 0;
#else
   size_t len   = 0;
#endif
   wchar_t *buf = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   {
      UINT code_page = CP_UTF8;

      /* Fallback to ANSI codepage instead */
      if (!(len = MultiByteToWideChar(code_page, 0, str, -1, NULL, 0)))
      {
         code_page = CP_ACP;
         len       = MultiByteToWideChar(code_page, 0, str, -1, NULL, 0);
      }

      if (len <= 0)
         return NULL;

      if (!(buf = (wchar_t*)calloc(len, sizeof(wchar_t))))
         return NULL;

      /* MultiByteToWideChar reports failure by returning 0;
       * it never returns a negative value. */
      if (!MultiByteToWideChar(code_page, 0, str, -1, buf, len))
      {
         free(buf);
         return NULL;
      }
   }
#else
   /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */
   len = mbstowcs(NULL, str, 0);
   if (len == (size_t)-1)
      return NULL;

   /* Room for the terminating zero. */
   len++;

   if (!(buf = (wchar_t*)calloc(len, sizeof(wchar_t))))
      return NULL;

   if (mbstowcs(buf, str, len) == (size_t)-1)
   {
      free(buf);
      return NULL;
   }
#endif

   return buf;
}
474
/**
 * utf16_to_utf8_string_alloc:
 *
 * Converts the zero-terminated wide-character string @str into
 * a newly allocated, NUL-terminated multibyte string.
 *
 * On Windows the conversion uses CP_UTF8 and falls back to the
 * ANSI code page if the UTF-8 size query fails. Elsewhere
 * wcstombs() is used, on the assumption that the locale is
 * already UTF-8.
 *
 * @return Returned pointer MUST be freed by the caller if non-NULL.
 * NULL if @str is NULL or empty, or if allocation or conversion
 * fails.
 **/
char* utf16_to_utf8_string_alloc(const wchar_t *str)
{
#ifdef _WIN32
   int len    = 0;
#else
   size_t len = 0;
#endif
   char *buf  = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   {
      UINT code_page = CP_UTF8;

      /* fallback to ANSI codepage instead */
      if (!(len = WideCharToMultiByte(code_page,
                  0, str, -1, NULL, 0, NULL, NULL)))
      {
         code_page = CP_ACP;
         len       = WideCharToMultiByte(code_page,
               0, str, -1, NULL, 0, NULL, NULL);
      }

      /* Both size queries failed: do not hand back a
       * zero-sized buffer as if it were a string. */
      if (len <= 0)
         return NULL;

      if (!(buf = (char*)calloc(len, sizeof(char))))
         return NULL;

      /* WideCharToMultiByte reports failure by returning 0;
       * it never returns a negative value. */
      if (!WideCharToMultiByte(code_page,
               0, str, -1, buf, len, NULL, NULL))
      {
         free(buf);
         return NULL;
      }
   }
#else
   /* NOTE: For now, assume non-Windows platforms'
    * locale is already UTF-8. */
   len = wcstombs(NULL, str, 0);
   if (len == (size_t)-1)
      return NULL;

   /* Room for the terminating NUL. */
   len++;

   if (!(buf = (char*)calloc(len, sizeof(char))))
      return NULL;

   if (wcstombs(buf, str, len) == (size_t)-1)
   {
      free(buf);
      return NULL;
   }
#endif

   return buf;
}