libretro-common/encodings/encoding_utf.c

   1 /* Copyright  (C) 2010-2020 The RetroArch team
   2  *
   3  * ---------------------------------------------------------------------------------------
   4  * The following license statement only applies to this file (encoding_utf.c).
   5  * ---------------------------------------------------------------------------------------
   6  *
   7  * Permission is hereby granted, free of charge,
   8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation the rights to
  10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
  11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
  16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include <stdint.h>
  24 #include <stdlib.h>
  25 #include <stddef.h>
  26 #include <string.h>
  27
  28 #include <boolean.h>
  29 #include <compat/strl.h>
  30 #include <retro_inline.h>
  31
  32 #include <encodings/utf.h>
  33
  34 #if defined(_WIN32) && !defined(_XBOX)
  35 #include <windows.h>
  36 #elif defined(_XBOX)
  37 #include <xtl.h>
  38 #endif
  39
/* Yields the byte currently pointed to by *string and then advances
 * *string by one. 'string' is a pointer to the cursor pointer. */
#define UTF8_WALKBYTE(string) (*((*(string))++))
  41
  42 static unsigned leading_ones(uint8_t c)
  43 {
  44    unsigned ones = 0;
  45    while (c & 0x80)
  46    {
  47       ones++;
  48       c <<= 1;
  49    }
  50
  51    return ones;
  52 }
  53
  54 /* Simple implementation. Assumes the sequence is
  55  * properly synchronized and terminated. */
  56
  57 size_t utf8_conv_utf32(uint32_t *out, size_t out_chars,
  58       const char *in, size_t in_size)
  59 {
  60    unsigned i;
  61    size_t ret = 0;
  62    while (in_size && out_chars)
  63    {
  64       unsigned extra, shift;
  65       uint32_t c;
  66       uint8_t first = *in++;
  67       unsigned ones = leading_ones(first);
  68
  69       if (ones > 6 || ones == 1) /* Invalid or desync. */
  70          break;
  71
  72       extra = ones ? ones - 1 : ones;
  73       if (1 + extra > in_size) /* Overflow. */
  74          break;
  75
  76       shift = (extra - 1) * 6;
  77       c     = (first & ((1 << (7 - ones)) - 1)) << (6 * extra);
  78
  79       for (i = 0; i < extra; i++, in++, shift -= 6)
  80          c |= (*in & 0x3f) << shift;
  81
  82       *out++ = c;
  83       in_size -= 1 + extra;
  84       out_chars--;
  85       ret++;
  86    }
  87
  88    return ret;
  89 }
  90
  91 bool utf16_conv_utf8(uint8_t *out, size_t *out_chars,
  92      const uint16_t *in, size_t in_size)
  93 {
  94    size_t out_pos            = 0;
  95    size_t in_pos             = 0;
  96    static const
  97       uint8_t utf8_limits[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  98
  99    for (;;)
 100    {
 101       unsigned num_adds;
 102       uint32_t value;
 103
 104       if (in_pos == in_size)
 105       {
 106          *out_chars = out_pos;
 107          return true;
 108       }
 109       value = in[in_pos++];
 110       if (value < 0x80)
 111       {
 112          if (out)
 113             out[out_pos] = (char)value;
 114          out_pos++;
 115          continue;
 116       }
 117
 118       if (value >= 0xD800 && value < 0xE000)
 119       {
 120          uint32_t c2;
 121
 122          if (value >= 0xDC00 || in_pos == in_size)
 123             break;
 124          c2 = in[in_pos++];
 125          if (c2 < 0xDC00 || c2 >= 0xE000)
 126             break;
 127          value = (((value - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000;
 128       }
 129
 130       for (num_adds = 1; num_adds < 5; num_adds++)
 131          if (value < (((uint32_t)1) << (num_adds * 5 + 6)))
 132             break;
 133       if (out)
 134          out[out_pos] = (char)(utf8_limits[num_adds - 1]
 135                + (value >> (6 * num_adds)));
 136       out_pos++;
 137       do
 138       {
 139          num_adds--;
 140          if (out)
 141             out[out_pos] = (char)(0x80
 142                   + ((value >> (6 * num_adds)) & 0x3F));
 143          out_pos++;
 144       }while (num_adds != 0);
 145    }
 146
 147    *out_chars = out_pos;
 148    return false;
 149 }
 150
 151 /* Acts mostly like strlcpy.
 152  *
 153  * Copies the given number of UTF-8 characters,
 154  * but at most d_len bytes.
 155  *
 156  * Always NULL terminates.
 157  * Does not copy half a character.
 158  *
 159  * Returns number of bytes. 's' is assumed valid UTF-8.
 160  * Use only if 'chars' is considerably less than 'd_len'. */
 161 size_t utf8cpy(char *d, size_t d_len, const char *s, size_t chars)
 162 {
 163    const uint8_t *sb     = (const uint8_t*)s;
 164    const uint8_t *sb_org = sb;
 165
 166    if (!s)
 167       return 0;
 168
 169    while (*sb && chars-- > 0)
 170    {
 171       sb++;
 172       while ((*sb & 0xC0) == 0x80)
 173          sb++;
 174    }
 175
 176    if ((size_t)(sb - sb_org) > d_len-1 /* NUL */)
 177    {
 178       sb = sb_org + d_len-1;
 179       while ((*sb & 0xC0) == 0x80)
 180          sb--;
 181    }
 182
 183    memcpy(d, sb_org, sb-sb_org);
 184    d[sb-sb_org] = '\0';
 185
 186    return sb-sb_org;
 187 }
 188
 189 const char *utf8skip(const char *str, size_t chars)
 190 {
 191    const uint8_t *strb = (const uint8_t*)str;
 192
 193    if (!chars)
 194       return str;
 195
 196    do
 197    {
 198       strb++;
 199       while ((*strb & 0xC0)==0x80)
 200          strb++;
 201       chars--;
 202    }while (chars);
 203
 204    return (const char*)strb;
 205 }
 206
 207 size_t utf8len(const char *string)
 208 {
 209    size_t ret = 0;
 210
 211    if (!string)
 212       return 0;
 213
 214    while (*string)
 215    {
 216       if ((*string & 0xC0) != 0x80)
 217          ret++;
 218       string++;
 219    }
 220    return ret;
 221 }
 222
 223 /* Does not validate the input, returns garbage if it's not UTF-8. */
 224 uint32_t utf8_walk(const char **string)
 225 {
 226    uint8_t first = UTF8_WALKBYTE(string);
 227    uint32_t ret  = 0;
 228
 229    if (first < 128)
 230       return first;
 231
 232    ret    = (ret << 6) | (UTF8_WALKBYTE(string) & 0x3F);
 233    if (first >= 0xE0)
 234    {
 235       ret = (ret << 6) | (UTF8_WALKBYTE(string) & 0x3F);
 236       if (first >= 0xF0)
 237       {
 238          ret = (ret << 6) | (UTF8_WALKBYTE(string) & 0x3F);
 239          return ret | (first & 7) << 18;
 240       }
 241       return ret | (first & 15) << 12;
 242    }
 243
 244    return ret | (first & 31) << 6;
 245 }
 246
 247 static bool utf16_to_char(uint8_t **utf_data,
 248       size_t *dest_len, const uint16_t *in)
 249 {
 250    unsigned len    = 0;
 251
 252    while (in[len] != '\0')
 253       len++;
 254
 255    utf16_conv_utf8(NULL, dest_len, in, len);
 256    *dest_len  += 1;
 257    *utf_data   = (uint8_t*)malloc(*dest_len);
 258    if (*utf_data == 0)
 259       return false;
 260
 261    return utf16_conv_utf8(*utf_data, dest_len, in, len);
 262 }
 263
 264 bool utf16_to_char_string(const uint16_t *in, char *s, size_t len)
 265 {
 266    size_t     dest_len  = 0;
 267    uint8_t *utf16_data  = NULL;
 268    bool            ret  = utf16_to_char(&utf16_data, &dest_len, in);
 269
 270    if (ret)
 271    {
 272       utf16_data[dest_len] = 0;
 273       strlcpy(s, (const char*)utf16_data, len);
 274    }
 275
 276    free(utf16_data);
 277    utf16_data = NULL;
 278
 279    return ret;
 280 }
 281
 282 #if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
 283 /* Returned pointer MUST be freed by the caller if non-NULL. */
 284 static char *mb_to_mb_string_alloc(const char *str,
 285       enum CodePage cp_in, enum CodePage cp_out)
 286 {
 287    wchar_t *path_buf_wide = NULL;
 288    int path_buf_wide_len  = MultiByteToWideChar(cp_in, 0, str, -1, NULL, 0);
 289
 290    /* Windows 95 will return 0 from these functions with
 291     * a UTF8 codepage set without MSLU.
 292     *
 293     * From an unknown MSDN version (others omit this info):
 294     *   - CP_UTF8 Windows 98/Me, Windows NT 4.0 and later:
 295     *   Translate using UTF-8. When this is set, dwFlags must be zero.
 296     *   - Windows 95: Under the Microsoft Layer for Unicode,
 297     *   MultiByteToWideChar also supports CP_UTF7 and CP_UTF8.
 298     */
 299
 300    if (!path_buf_wide_len)
 301       return strdup(str);
 302
 303    path_buf_wide = (wchar_t*)
 304       calloc(path_buf_wide_len + sizeof(wchar_t), sizeof(wchar_t));
 305
 306    if (path_buf_wide)
 307    {
 308       MultiByteToWideChar(cp_in, 0,
 309             str, -1, path_buf_wide, path_buf_wide_len);
 310
 311       if (*path_buf_wide)
 312       {
 313          int path_buf_len = WideCharToMultiByte(cp_out, 0,
 314                path_buf_wide, -1, NULL, 0, NULL, NULL);
 315
 316          if (path_buf_len)
 317          {
 318             char *path_buf = (char*)
 319                calloc(path_buf_len + sizeof(char), sizeof(char));
 320
 321             if (path_buf)
 322             {
 323                WideCharToMultiByte(cp_out, 0,
 324                      path_buf_wide, -1, path_buf,
 325                      path_buf_len, NULL, NULL);
 326
 327                free(path_buf_wide);
 328
 329                if (*path_buf)
 330                   return path_buf;
 331
 332                free(path_buf);
 333                return NULL;
 334             }
 335          }
 336          else
 337          {
 338             free(path_buf_wide);
 339             return strdup(str);
 340          }
 341       }
 342
 343       free(path_buf_wide);
 344    }
 345
 346    return NULL;
 347 }
 348 #endif
 349
 350 /* Returned pointer MUST be freed by the caller if non-NULL. */
 351 char* utf8_to_local_string_alloc(const char *str)
 352 {
 353    if (str && *str)
 354    {
 355 #if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
 356       return mb_to_mb_string_alloc(str, CODEPAGE_UTF8, CODEPAGE_LOCAL);
 357 #else
 358       /* assume string needs no modification if not on Windows */
 359       return strdup(str);
 360 #endif
 361    }
 362    return NULL;
 363 }
 364
 365 /* Returned pointer MUST be freed by the caller if non-NULL. */
 366 char* local_to_utf8_string_alloc(const char *str)
 367 {
 368    if (str && *str)
 369    {
 370 #if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
 371       return mb_to_mb_string_alloc(str, CODEPAGE_LOCAL, CODEPAGE_UTF8);
 372 #else
 373       /* assume string needs no modification if not on Windows */
 374       return strdup(str);
 375 #endif
 376    }
 377    return NULL;
 378 }
 379
 380 /* Returned pointer MUST be freed by the caller if non-NULL. */
 381 wchar_t* utf8_to_utf16_string_alloc(const char *str)
 382 {
 383 #ifdef _WIN32
 384    int len        = 0;
 385    int out_len    = 0;
 386 #else
 387    size_t len     = 0;
 388    size_t out_len = 0;
 389 #endif
 390    wchar_t *buf   = NULL;
 391
 392    if (!str || !*str)
 393       return NULL;
 394
 395 #ifdef _WIN32
 396    len = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0);
 397
 398    if (len)
 399    {
 400       buf = (wchar_t*)calloc(len, sizeof(wchar_t));
 401
 402       if (!buf)
 403          return NULL;
 404
 405       out_len = MultiByteToWideChar(CP_UTF8, 0, str, -1, buf, len);
 406    }
 407    else
 408    {
 409       /* fallback to ANSI codepage instead */
 410       len = MultiByteToWideChar(CP_ACP, 0, str, -1, NULL, 0);
 411
 412       if (len)
 413       {
 414          buf = (wchar_t*)calloc(len, sizeof(wchar_t));
 415
 416          if (!buf)
 417             return NULL;
 418
 419          out_len = MultiByteToWideChar(CP_ACP, 0, str, -1, buf, len);
 420       }
 421    }
 422
 423    if (out_len < 0)
 424    {
 425       free(buf);
 426       return NULL;
 427    }
 428 #else
 429    /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */
 430    len = mbstowcs(NULL, str, 0) + 1;
 431
 432    if (len)
 433    {
 434       buf = (wchar_t*)calloc(len, sizeof(wchar_t));
 435
 436       if (!buf)
 437          return NULL;
 438
 439       out_len = mbstowcs(buf, str, len);
 440    }
 441
 442    if (out_len == (size_t)-1)
 443    {
 444       free(buf);
 445       return NULL;
 446    }
 447 #endif
 448
 449    return buf;
 450 }
 451
 452 /* Returned pointer MUST be freed by the caller if non-NULL. */
 453 char* utf16_to_utf8_string_alloc(const wchar_t *str)
 454 {
 455 #ifdef _WIN32
 456    int len        = 0;
 457 #else
 458    size_t len     = 0;
 459 #endif
 460    char *buf      = NULL;
 461
 462    if (!str || !*str)
 463       return NULL;
 464
 465 #ifdef _WIN32
 466    {
 467       UINT code_page = CP_UTF8;
 468       len            = WideCharToMultiByte(code_page,
 469             0, str, -1, NULL, 0, NULL, NULL);
 470
 471       /* fallback to ANSI codepage instead */
 472       if (!len)
 473       {
 474          code_page   = CP_ACP;
 475          len         = WideCharToMultiByte(code_page,
 476                0, str, -1, NULL, 0, NULL, NULL);
 477       }
 478
 479       buf = (char*)calloc(len, sizeof(char));
 480
 481       if (!buf)
 482          return NULL;
 483
 484       if (WideCharToMultiByte(code_page,
 485             0, str, -1, buf, len, NULL, NULL) < 0)
 486       {
 487          free(buf);
 488          return NULL;
 489       }
 490    }
 491 #else
 492    /* NOTE: For now, assume non-Windows platforms'
 493     * locale is already UTF-8. */
 494    len = wcstombs(NULL, str, 0) + 1;
 495
 496    if (len)
 497    {
 498       buf = (char*)calloc(len, sizeof(char));
 499
 500       if (!buf)
 501          return NULL;
 502
 503       if (wcstombs(buf, str, len) == (size_t)-1)
 504       {
 505          free(buf);
 506          return NULL;
 507       }
 508    }
 509 #endif
 510
 511    return buf;
 512 }