Commit | Line | Data |
---|---|---|
3719602c PC |
1 | /* Copyright (C) 2010-2020 The RetroArch team |
2 | * | |
3 | * --------------------------------------------------------------------------------------- | |
4 | * The following license statement only applies to this file (encoding_utf.c). | |
5 | * --------------------------------------------------------------------------------------- | |
6 | * | |
7 | * Permission is hereby granted, free of charge, | |
8 | * to any person obtaining a copy of this software and associated documentation files (the "Software"), | |
9 | * to deal in the Software without restriction, including without limitation the rights to | |
10 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
11 | * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | |
12 | * | |
13 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | |
14 | * | |
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
16 | * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
19 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
21 | */ | |
22 | ||
23 | #include <stdint.h> | |
24 | #include <stdlib.h> | |
25 | #include <stddef.h> | |
26 | #include <string.h> | |
27 | ||
28 | #include <boolean.h> | |
29 | #include <compat/strl.h> | |
30 | #include <retro_inline.h> | |
31 | ||
32 | #include <encodings/utf.h> | |
33 | ||
34 | #if defined(_WIN32) && !defined(_XBOX) | |
35 | #include <windows.h> | |
36 | #elif defined(_XBOX) | |
37 | #include <xtl.h> | |
38 | #endif | |
39 | ||
/* Yields the byte currently under the cursor and then advances the cursor
 * by one. @cursor is a pointer to the byte pointer being walked. */
#define UTF8_WALKBYTE(cursor) (*((*(cursor))++))
41 | ||
/**
 * leading_ones:
 * @c : byte to inspect
 *
 * Counts the run of set bits in @c, starting at the most
 * significant bit and stopping at the first clear bit.
 *
 * @return Length of that run (0 through 8).
 **/
static unsigned leading_ones(uint8_t c)
{
   unsigned count = 0;
   uint8_t  mask;

   /* Slide a one-bit probe from bit 7 downwards until it
    * either falls off the byte or hits a clear bit. */
   for (mask = 0x80; mask && (c & mask); mask >>= 1)
      count++;

   return count;
}
53 | ||
/**
 * utf8_conv_utf32:
 * @out       : destination array of decoded code points
 * @out_chars : capacity of @out, in elements
 * @in        : UTF-8 encoded input
 * @in_size   : number of bytes available at @in
 *
 * Simple decoder. Assumes the input is properly synchronized
 * and terminated; continuation bytes are not validated, only
 * their low six bits are used. Lead bytes announcing up to
 * six-byte sequences are accepted.
 *
 * Decoding stops when either buffer is exhausted, when a
 * lead byte is a continuation byte or has more than six
 * leading one bits, or when a sequence would run past @in_size.
 *
 * @return Number of code points written to @out.
 **/
size_t utf8_conv_utf32(uint32_t *out, size_t out_chars,
      const char *in, size_t in_size)
{
   size_t written = 0;

   while (in_size && out_chars)
   {
      unsigned k;
      unsigned trail;
      uint32_t code;
      uint8_t  lead = *in++;
      unsigned ones = leading_ones(lead);

      /* 10xxxxxx in lead position means we are desynchronized;
       * seven or more leading ones is never a valid lead byte. */
      if (ones == 1 || ones > 6)
         break;

      /* Number of continuation bytes that follow the lead byte. */
      trail = (ones > 0) ? ones - 1 : 0;

      /* The whole sequence must fit into the remaining input. */
      if (trail + 1 > in_size)
         break;

      /* Payload bits of the lead byte become the top bits of the result. */
      code = (lead & ((1 << (7 - ones)) - 1)) << (6 * trail);

      /* Each continuation byte contributes six bits, highest first. */
      for (k = trail; k > 0; k--, in++)
         code |= (*in & 0x3f) << (6 * (k - 1));

      *out++   = code;
      in_size -= trail + 1;
      out_chars--;
      written++;
   }

   return written;
}
93 | ||
/**
 * utf16_conv_utf8:
 * @out       : destination for the UTF-8 bytes, or NULL to only
 *              count how many bytes would be produced
 * @out_chars : receives the number of bytes produced (or counted)
 * @in        : UTF-16 code units to convert
 * @in_size   : number of code units at @in
 *
 * Leaf function.
 *
 * Converts UTF-16 to UTF-8, combining surrogate pairs. No
 * terminator is written and @out is not bounds-checked, so the
 * caller must size it beforehand (e.g. with a NULL @out pass).
 *
 * @return true if all of @in was converted; false if an unpaired
 * or out-of-order surrogate was met, in which case @out_chars
 * holds the number of bytes produced before the bad unit.
 **/
bool utf16_conv_utf8(uint8_t *out, size_t *out_chars,
      const uint16_t *in, size_t in_size)
{
   /* Marker bits of the lead byte, indexed by (continuation count - 1). */
   static const uint8_t lead_prefix[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
   size_t rd = 0;
   size_t wr = 0;

   while (rd < in_size)
   {
      unsigned trail;
      uint32_t cp = in[rd++];

      /* ASCII maps to a single byte. */
      if (cp < 0x80)
      {
         if (out)
            out[wr] = (uint8_t)cp;
         wr++;
         continue;
      }

      /* Surrogate range: must be a high surrogate followed by a low one. */
      if (cp >= 0xD800 && cp < 0xE000)
      {
         uint32_t low;

         if (cp >= 0xDC00 || rd == in_size)
         {
            *out_chars = wr;
            return false;
         }

         low = in[rd++];

         if (low < 0xDC00 || low >= 0xE000)
         {
            *out_chars = wr;
            return false;
         }

         cp = (((cp - 0xD800) << 10) | (low - 0xDC00)) + 0x10000;
      }

      /* Find how many continuation bytes the value needs:
       * 1 for < 2^11, 2 for < 2^16, 3 for < 2^21, ... */
      trail = 1;
      while (trail < 5 && cp >= (((uint32_t)1) << (trail * 5 + 6)))
         trail++;

      /* Lead byte: marker bits plus the topmost payload bits. */
      if (out)
         out[wr] = (uint8_t)(lead_prefix[trail - 1] + (cp >> (6 * trail)));
      wr++;

      /* Continuation bytes, six payload bits each, highest first. */
      for (; trail > 0; trail--)
      {
         if (out)
            out[wr] = (uint8_t)(0x80 + ((cp >> (6 * (trail - 1))) & 0x3F));
         wr++;
      }
   }

   *out_chars = wr;
   return true;
}
158 | ||
/**
 * utf8cpy:
 * @d     : destination buffer
 * @d_len : size of @d in bytes, including room for the terminator
 * @s     : source string, assumed to be valid UTF-8
 * @chars : maximum number of UTF-8 characters to copy
 *
 * Acts mostly like strlcpy.
 *
 * Copies the given number of UTF-8 characters,
 * but at most @d_len - 1 bytes.
 *
 * Always NUL terminates when @d is non-NULL and @d_len is
 * non-zero. Does not copy half a character.
 * If @d is NULL or @d_len is 0, nothing is written.
 * If @s is NULL, @d becomes the empty string.
 * Use only if @chars is considerably less than @d_len.
 *
 * @return Number of bytes copied, not counting the terminator.
 **/
size_t utf8cpy(char *d, size_t d_len, const char *s, size_t chars)
{
   const uint8_t *sb     = (const uint8_t*)s;
   const uint8_t *sb_org = sb;
   size_t copy_len       = 0;

   /* Without room for even the terminator there is nothing we may
    * write; this also keeps d_len - 1 below from wrapping around. */
   if (!d || !d_len)
      return 0;

   if (!s)
   {
      *d = '\0';
      return 0;
   }

   /* Advance over up to @chars characters: one lead byte plus
    * any continuation bytes (10xxxxxx) that follow it. */
   while (*sb && chars-- > 0)
   {
      sb++;
      while ((*sb & 0xC0) == 0x80)
         sb++;
   }

   if ((size_t)(sb - sb_org) > d_len - 1 /* NUL */)
   {
      /* Too long: cut at the byte limit, then back up to the start
       * of the character we landed in so it is dropped entirely.
       * Never back up beyond the start of the source. */
      sb = sb_org + d_len - 1;
      while (sb > sb_org && (*sb & 0xC0) == 0x80)
         sb--;
   }

   copy_len = (size_t)(sb - sb_org);

   memcpy(d, sb_org, copy_len);
   d[copy_len] = '\0';

   return copy_len;
}
200 | ||
/**
 * utf8skip:
 * @str   : UTF-8 string to advance through (may be NULL)
 * @chars : number of characters to skip
 *
 * Leaf function
 *
 * Steps over @chars UTF-8 characters, where a character is one
 * byte plus any continuation bytes (10xxxxxx) following it.
 * Skipping stops early at the terminating NUL, so the result
 * never points past the end of the string.
 *
 * @return Pointer into @str after the skipped characters,
 * or NULL if @str is NULL.
 **/
const char *utf8skip(const char *str, size_t chars)
{
   const uint8_t *strb = (const uint8_t*)str;

   if (!str)
      return NULL;

   /* Stop at the terminator even if more characters were requested. */
   while (chars && *strb)
   {
      strb++;
      while ((*strb & 0xC0) == 0x80)
         strb++;
      chars--;
   }

   return (const char*)strb;
}
223 | ||
/**
 * utf8len:
 * @string : NUL-terminated string to measure (may be NULL)
 *
 * Leaf function.
 *
 * Counts every byte that is not a continuation byte (10xxxxxx),
 * which for valid UTF-8 is the number of characters.
 *
 * @return Character count, or 0 if @string is NULL.
 **/
size_t utf8len(const char *string)
{
   const char *p;
   size_t count = 0;

   if (!string)
      return 0;

   for (p = string; *p; p++)
   {
      /* Only bytes that start a character are counted. */
      if ((*p & 0xC0) != 0x80)
         count++;
   }

   return count;
}
244 | ||
/**
 * utf8_walk:
 * @string : in/out cursor; advanced past the decoded character
 *
 * Does not validate the input.
 *
 * Leaf function.
 *
 * Decodes the one- to four-byte sequence under the cursor. The
 * lead byte alone decides how many further bytes are consumed,
 * and only the low six bits of each of those bytes are used.
 *
 * @return The decoded code point. Returns garbage if it's not UTF-8.
 **/
uint32_t utf8_walk(const char **string)
{
   const uint8_t *cursor = (const uint8_t*)*string;
   uint8_t lead          = *cursor++;
   uint32_t head         = 0;
   uint32_t code         = 0;
   unsigned trail        = 0;
   unsigned k            = 0;

   /* Plain ASCII: a single byte. */
   if (lead < 0x80)
   {
      *string = (const char*)cursor;
      return lead;
   }

   /* Pick the number of following bytes and the payload
    * bits of the lead byte from the lead byte's value. */
   if (lead >= 0xF0)
   {
      trail = 3;
      head  = lead & 0x07;
   }
   else if (lead >= 0xE0)
   {
      trail = 2;
      head  = lead & 0x0F;
   }
   else
   {
      trail = 1;
      head  = lead & 0x1F;
   }

   for (k = 0; k < trail; k++)
      code = (code << 6) | (uint32_t)(*cursor++ & 0x3F);

   *string = (const char*)cursor;

   /* Lead-byte payload sits above all continuation payloads. */
   return code | (head << (6 * trail));
}
276 | ||
/**
 * utf16_to_char:
 * @utf_data : receives a malloc'd buffer holding the UTF-8 bytes
 *             (NULL if the allocation failed)
 * @dest_len : receives the number of UTF-8 bytes written on
 *             success; the buffer has room for one more byte
 * @in       : zero-terminated UTF-16 input
 *
 * Measures @in, allocates a buffer one byte larger than the
 * converted size and converts into it. No terminator is written
 * here. The buffer may be allocated even when false is returned,
 * so the caller should free *@utf_data in every case.
 *
 * @return Result of the conversion, or false if allocation failed.
 **/
static bool utf16_to_char(uint8_t **utf_data,
      size_t *dest_len, const uint16_t *in)
{
   unsigned units = 0;

   /* Length of the input in UTF-16 code units. */
   for (; in[units] != '\0'; units++)
   {
   }

   /* Sizing pass: no output buffer, only count the bytes needed,
    * then reserve one extra byte beyond that. */
   utf16_conv_utf8(NULL, dest_len, in, units);
   *dest_len += 1;

   *utf_data = (uint8_t*)malloc(*dest_len);

   if (!*utf_data)
      return false;

   return utf16_conv_utf8(*utf_data, dest_len, in, units);
}
289 | ||
/**
 * utf16_to_char_string:
 * @in  : zero-terminated UTF-16 input
 * @s   : destination buffer for the UTF-8 result
 * @len : size of @s in bytes
 *
 * Converts @in to UTF-8 in a temporary heap buffer and copies
 * the result into @s with strlcpy(). @s is left untouched when
 * the conversion fails.
 *
 * @return true if the conversion succeeded, otherwise false.
 **/
bool utf16_to_char_string(const uint16_t *in, char *s, size_t len)
{
   uint8_t *converted   = NULL;
   size_t converted_len = 0;

   if (!utf16_to_char(&converted, &converted_len, in))
   {
      /* The helper may have allocated before failing. */
      free(converted);
      return false;
   }

   /* The helper leaves one spare byte for the terminator. */
   converted[converted_len] = 0;
   strlcpy(s, (const char*)converted, len);

   free(converted);

   return true;
}
310 | ||
#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
/**
 * mb_to_mb_string_alloc:
 * @str    : NUL-terminated input string encoded in @cp_in
 * @cp_in  : code page @str is encoded in
 * @cp_out : code page to convert to
 *
 * Converts @str from one multibyte code page to another by
 * going through a temporary wide-character string. If either
 * size query reports 0, a plain copy of @str is returned instead.
 *
 * @return Returned pointer MUST be freed by the caller if non-NULL.
 * NULL is returned on allocation failure or when a conversion
 * step produces an empty string.
 **/
static char *mb_to_mb_string_alloc(const char *str,
      enum CodePage cp_in, enum CodePage cp_out)
{
   char    *narrow     = NULL;
   wchar_t *wide       = NULL;
   int      narrow_len = 0;
   int      wide_len   = MultiByteToWideChar(cp_in, 0, str, -1, NULL, 0);

   /* Windows 95 will return 0 from these functions with
    * a UTF8 codepage set without MSLU.
    *
    * From an unknown MSDN version (others omit this info):
    *   - CP_UTF8 Windows 98/Me, Windows NT 4.0 and later:
    *     Translate using UTF-8. When this is set, dwFlags must be zero.
    *   - Windows 95: Under the Microsoft Layer for Unicode,
    *     MultiByteToWideChar also supports CP_UTF7 and CP_UTF8.
    */

   if (!wide_len)
      return strdup(str);

   wide = (wchar_t*)calloc(wide_len + sizeof(wchar_t), sizeof(wchar_t));

   if (!wide)
      return NULL;

   MultiByteToWideChar(cp_in, 0, str, -1, wide, wide_len);

   /* An empty intermediate string means there is nothing to convert. */
   if (!*wide)
   {
      free(wide);
      return NULL;
   }

   narrow_len = WideCharToMultiByte(cp_out, 0,
         wide, -1, NULL, 0, NULL, NULL);

   /* Target code page unavailable: hand back the input unchanged. */
   if (!narrow_len)
   {
      free(wide);
      return strdup(str);
   }

   narrow = (char*)calloc(narrow_len + sizeof(char), sizeof(char));

   if (narrow)
      WideCharToMultiByte(cp_out, 0,
            wide, -1, narrow,
            narrow_len, NULL, NULL);

   free(wide);

   if (narrow && *narrow)
      return narrow;

   /* Allocation failed or the result came out empty. */
   free(narrow);
   return NULL;
}
#endif
380 | ||
/**
 * utf8_to_local_string_alloc:
 * @str : NUL-terminated UTF-8 string (may be NULL)
 *
 * On non-UNICODE desktop Windows builds the string is converted
 * from UTF-8 to the local code page; on every other build it is
 * simply duplicated.
 *
 * @return Returned pointer MUST be freed by the caller if non-NULL.
 * NULL is returned when @str is NULL or empty.
 **/
char* utf8_to_local_string_alloc(const char *str)
{
   if (!str || !*str)
      return NULL;

#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
   return mb_to_mb_string_alloc(str, CODEPAGE_UTF8, CODEPAGE_LOCAL);
#else
   /* Assume string needs no modification if not on Windows */
   return strdup(str);
#endif
}
396 | ||
/**
 * local_to_utf8_string_alloc:
 * @str : NUL-terminated string in the local code page (may be NULL)
 *
 * On non-UNICODE desktop Windows builds the string is converted
 * from the local code page to UTF-8; on every other build it is
 * simply duplicated.
 *
 * @return Returned pointer MUST be freed by the caller if non-NULL.
 * NULL is returned when @str is NULL or empty.
 **/
char *local_to_utf8_string_alloc(const char *str)
{
   if (!str || !*str)
      return NULL;

#if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
   return mb_to_mb_string_alloc(str, CODEPAGE_LOCAL, CODEPAGE_UTF8);
#else
   /* Assume string needs no modification if not on Windows */
   return strdup(str);
#endif
}
412 | ||
/**
 * utf8_to_utf16_string_alloc:
 * @str : NUL-terminated UTF-8 string (may be NULL)
 *
 * Converts @str to a newly allocated, zero-terminated wide string.
 * On Windows the conversion is tried with the UTF-8 code page
 * first and falls back to the ANSI code page if the size query
 * fails. Elsewhere mbstowcs() is used.
 *
 * @return Returned pointer MUST be freed by the caller if non-NULL.
 * NULL is returned when @str is NULL or empty, when memory cannot
 * be allocated, or when the conversion fails.
 **/
wchar_t* utf8_to_utf16_string_alloc(const char *str)
{
#ifdef _WIN32
   int len        = 0;
   UINT code_page = CP_UTF8;
#else
   size_t len     = 0;
#endif
   wchar_t *buf   = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   len = MultiByteToWideChar(code_page, 0, str, -1, NULL, 0);

   if (len <= 0)
   {
      /* Fallback to ANSI codepage instead */
      code_page = CP_ACP;
      len       = MultiByteToWideChar(code_page, 0, str, -1, NULL, 0);
   }

   if (len <= 0)
      return NULL;

   if (!(buf = (wchar_t*)calloc(len, sizeof(wchar_t))))
      return NULL;

   /* MultiByteToWideChar signals failure with 0, never with a
    * negative value, so that is what has to be tested for. */
   if (MultiByteToWideChar(code_page, 0, str, -1, buf, len) <= 0)
   {
      free(buf);
      return NULL;
   }
#else
   /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */
   len = mbstowcs(NULL, str, 0);

   /* Invalid multibyte sequence for the current locale. */
   if (len == (size_t)-1)
      return NULL;

   /* Room for the terminator. */
   len++;

   if (!(buf = (wchar_t*)calloc(len, sizeof(wchar_t))))
      return NULL;

   if (mbstowcs(buf, str, len) == (size_t)-1)
   {
      free(buf);
      return NULL;
   }
#endif

   return buf;
}
474 | ||
/**
 * utf16_to_utf8_string_alloc:
 * @str : zero-terminated wide string (may be NULL)
 *
 * Converts @str to a newly allocated, NUL-terminated multibyte
 * string. On Windows the conversion is tried with the UTF-8 code
 * page first and falls back to the ANSI code page if the size
 * query fails. Elsewhere wcstombs() is used.
 *
 * @return Returned pointer MUST be freed by the caller if non-NULL.
 * NULL is returned when @str is NULL or empty, when memory cannot
 * be allocated, or when the conversion fails.
 **/
char* utf16_to_utf8_string_alloc(const wchar_t *str)
{
#ifdef _WIN32
   int len        = 0;
   UINT code_page = CP_UTF8;
#else
   size_t len     = 0;
#endif
   char *buf      = NULL;

   if (!str || !*str)
      return NULL;

#ifdef _WIN32
   len = WideCharToMultiByte(code_page,
         0, str, -1, NULL, 0, NULL, NULL);

   if (len <= 0)
   {
      /* fallback to ANSI codepage instead */
      code_page = CP_ACP;
      len       = WideCharToMultiByte(code_page,
            0, str, -1, NULL, 0, NULL, NULL);
   }

   /* Neither code page could size the result; do not hand
    * back a zero-byte buffer as if it were a string. */
   if (len <= 0)
      return NULL;

   if (!(buf = (char*)calloc(len, sizeof(char))))
      return NULL;

   /* WideCharToMultiByte signals failure with 0, never with a
    * negative value, so that is what has to be tested for. */
   if (WideCharToMultiByte(code_page,
         0, str, -1, buf, len, NULL, NULL) <= 0)
   {
      free(buf);
      return NULL;
   }
#else
   /* NOTE: For now, assume non-Windows platforms'
    * locale is already UTF-8. */
   len = wcstombs(NULL, str, 0);

   /* A wide character has no representation in the current locale. */
   if (len == (size_t)-1)
      return NULL;

   /* Room for the terminator. */
   len++;

   if (!(buf = (char*)calloc(len, sizeof(char))))
      return NULL;

   if (wcstombs(buf, str, len) == (size_t)-1)
   {
      free(buf);
      return NULL;
   }
#endif

   return buf;
}