| 1 | /* |
| 2 | * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> |
| 3 | * |
| 4 | * This program is free software; you can redistribute it and/or modify |
| 5 | * it under the terms of the GNU General Public License as published by |
| 6 | * the Free Software Foundation; either version 2 of the License, or |
| 7 | * (at your option) any later version. |
| 8 | * |
| 9 | * This program is distributed in the hope that it will be useful, |
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 12 | * GNU General Public License for more details. |
| 13 | * |
| 14 | * You should have received a copy of the GNU General Public License along |
| 15 | * with this program; if not, write to the Free Software Foundation, Inc., |
| 16 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| 17 | */ |
| 18 | |
| 19 | /* |
| 20 | * See the corresponding header file for a description of the functions |
| 21 | * that this file provides. |
| 22 | * |
| 23 | * This was first written for Ogg Vorbis but could be of general use. |
| 24 | * |
| 25 | * The only deliberate assumption about data sizes is that a short has |
| 26 | * at least 16 bits, but this code has only been tested on systems with |
| 27 | * 8-bit char, 16-bit short and 32-bit int. |
| 28 | */ |
| 29 | |
| 30 | #ifdef HAVE_CONFIG_H |
| 31 | # include <config.h> |
| 32 | #endif |
| 33 | |
| 34 | #if !defined _WIN32 && !defined HAVE_ICONV /* should be && defined USE_CHARSET_CONVERT */ |
| 35 | |
| 36 | #include <stdlib.h> |
| 37 | |
| 38 | #include "share/alloc.h" |
| 39 | #include "charset.h" |
| 40 | |
| 41 | #include "charmaps.h" |
| 42 | |
/*
 * This is like the standard strcasecmp, but it does not depend
 * on the locale. Locale-dependent functions can be dangerous:
 * we once had a bug involving strcasecmp("iso", "ISO") in a
 * Turkish locale!
 *
 * Only the ASCII letters 'A'..'Z' are folded; every other byte is
 * compared as an unsigned char. The result is zero when the strings
 * are equal ignoring ASCII case, and otherwise the difference of the
 * first pair of folded bytes that differ. Folding is done to lower
 * case, which is what POSIX specifies for strcasecmp in the POSIX
 * locale, so the sign is consistent: "a" < "B" and "A" < "b".
 */

static int ascii_strcasecmp(const char *s1, const char *s2)
{
	unsigned char c1, c2;

	for (;; s1++, s2++) {
		c1 = (unsigned char)*s1;
		if ('A' <= c1 && c1 <= 'Z')
			c1 += 'a' - 'A';
		c2 = (unsigned char)*s2;
		if ('A' <= c2 && c2 <= 'Z')
			c2 += 'a' - 'A';
		/* Stop at the first difference or at the common terminator. */
		if (c1 != c2 || !c1)
			break;
	}
	/* Compare the folded bytes, not the raw ones, so the sign is meaningful. */
	return (int)c1 - (int)c2;
}
| 74 | |
| 75 | /* |
| 76 | * UTF-8 equivalents of the C library's wctomb() and mbtowc(). |
| 77 | */ |
| 78 | |
/*
 * Decode one UTF-8 sequence from the first N bytes of S.
 *
 * Returns the number of bytes consumed (1..6), 0 if S is null, N is
 * zero or the first byte is NUL, and -1 for a malformed, truncated or
 * overlong sequence. The decoded value is stored through PWC (when PWC
 * is not null) only on success or for a NUL byte.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n)
{
	const unsigned char *bytes = (const unsigned char *)s;
	unsigned char lead;
	int len, code, pos;

	if (!s || !n)
		return 0;

	lead = bytes[0];

	/* Single-byte (ASCII) case, including the terminating NUL. */
	if (lead < 0x80) {
		if (pwc)
			*pwc = lead;
		return lead != 0;
	}

	/*
	 * 0x80..0xbf are continuation bytes and 0xc0/0xc1 could only start
	 * an overlong two-byte form, so none of them may lead a sequence.
	 */
	if (lead < 0xc2)
		return -1;

	/* The lead byte determines the total sequence length. */
	if (lead < 0xe0)
		len = 2;
	else if (lead < 0xf0)
		len = 3;
	else if (lead < 0xf8)
		len = 4;
	else if (lead < 0xfc)
		len = 5;
	else if (lead < 0xfe)
		len = 6;
	else
		return -1;

	if (n < (size_t)len)
		return -1;

	/* Payload bits of the lead byte, then six bits per continuation byte. */
	code = lead & ((1 << (7 - len)) - 1);
	for (pos = 1; pos < len; pos++) {
		if ((bytes[pos] & 0xc0) != 0x80)
			return -1;
		code = (code << 6) | (bytes[pos] & 0x3f);
	}

	/*
	 * Reject values that would have fitted in a shorter sequence.
	 * (For two-byte sequences this can never trigger, because lead
	 * bytes below 0xc2 were already refused.)
	 */
	if (code < (1 << (5 * len - 4)))
		return -1;

	if (pwc)
		*pwc = code;
	return len;
}
| 129 | |
/*
 * Encode WC1 as UTF-8 into the buffer S.
 *
 * Returns the number of bytes written (1..6), 0 if S is null, and -1
 * (writing nothing) if WC1 is negative, i.e. does not fit in 31 bits.
 */
int utf8_wctomb(char *s, int wc1)
{
	/* Marker bits of the first byte, indexed by (sequence length - 1). */
	static const unsigned char lead_bits[6] = {
		0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};
	unsigned int code = wc1;
	int len, pos;

	if (!s)
		return 0;

	/* Pick the shortest form that can hold the value. */
	if (code < (1u << 7))
		len = 1;
	else if (code < (1u << 11))
		len = 2;
	else if (code < (1u << 16))
		len = 3;
	else if (code < (1u << 21))
		len = 4;
	else if (code < (1u << 26))
		len = 5;
	else if (code < (1u << 31))
		len = 6;
	else
		return -1;

	/* Fill the continuation bytes from the end, six bits at a time. */
	for (pos = len - 1; pos > 0; pos--) {
		s[pos] = 0x80 | (code & 0x3f);
		code >>= 6;
	}
	/* Whatever is left belongs in the lead byte. */
	s[0] = lead_bits[len - 1] | code;

	return len;
}
| 178 | |
| 179 | /* |
| 180 | * The charset "object" and methods. |
| 181 | */ |
| 182 | |
/*
 * A character set is a pair of conversion callbacks plus an opaque
 * table pointer that is handed back to them on every call.
 */
struct charset {
	/* Value reported by charset_max(); charset_convert() reserves this many output bytes per input byte. */
	int max;
	/* Decode one character from at most LEN bytes at SRC into *OUT. */
	int (*mbtowc)(void *map, int *out, const char *src, size_t len);
	/* Encode the character CODE into DST. */
	int (*wctomb)(void *map, char *dst, int code);
	/* Passed as the first argument of both callbacks. */
	void *map;
};

/* Decode one character using CHARSET's mbtowc callback. */
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
{
	return charset->mbtowc(charset->map, pwc, s, n);
}

/* Encode one character using CHARSET's wctomb callback. */
int charset_wctomb(struct charset *charset, char *s, int wc)
{
	return charset->wctomb(charset->map, s, wc);
}

/* Return the MAX field of CHARSET. */
int charset_max(struct charset *charset)
{
	return charset->max;
}
| 204 | |
| 205 | /* |
| 206 | * Implementation of UTF-8. |
| 207 | */ |
| 208 | |
/* charset callback: decode via utf8_mbtowc(); the table argument is ignored. */
static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
{
	int consumed = utf8_mbtowc(pwc, s, n);

	(void)map;
	return consumed;
}
| 214 | |
/* charset callback: encode via utf8_wctomb(); the table argument is ignored. */
static int wctomb_utf8(void *map, char *s, int wc)
{
	int written = utf8_wctomb(s, wc);

	(void)map;
	return written;
}
| 220 | |
| 221 | /* |
| 222 | * Implementation of US-ASCII. |
| 223 | * Probably on most architectures this compiles to less than 256 bytes |
| 224 | * of code, so we can save space by not having a table for this one. |
| 225 | */ |
| 226 | |
/*
 * charset callback: decode one US-ASCII byte.
 * Returns 1 for a non-NUL ASCII byte, 0 for NUL or when there is no
 * input (S null or N zero), and -1 for a byte above 0x7f.
 */
static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
{
	int code;

	(void)map;

	if (s == NULL || n == 0)
		return 0;

	code = (unsigned char)*s;
	if (code > 0x7f)
		return -1;

	if (pwc != NULL)
		*pwc = code;
	return code != 0;
}
| 241 | |
/*
 * charset callback: encode WC as a single US-ASCII byte.
 * Returns 1 on success, 0 when S is null, and -1 (writing nothing)
 * when WC is outside 0..0x7f.
 */
static int wctomb_ascii(void *map, char *s, int wc)
{
	(void)map;

	if (s == NULL)
		return 0;
	if (wc < 0 || wc > 0x7f)
		return -1;

	s[0] = wc;
	return 1;
}
| 252 | |
| 253 | /* |
| 254 | * Implementation of ISO-8859-1. |
| 255 | * Probably on most architectures this compiles to less than 256 bytes |
| 256 | * of code, so we can save space by not having a table for this one. |
| 257 | */ |
| 258 | |
/*
 * charset callback: decode one ISO-8859-1 byte, whose value is the
 * character code itself.
 * Returns 1 for a non-NUL byte, 0 for NUL or when there is no input
 * (S null or N zero), and -1 if the byte value exceeds 0xff.
 */
static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
{
	int code;

	(void)map;

	if (s == NULL || n == 0)
		return 0;

	code = (unsigned char)*s;
	/* Can only trigger where a char is wider than eight bits. */
	if (code > 0xff)
		return -1;

	if (pwc != NULL)
		*pwc = code;
	return code != 0;
}
| 273 | |
/*
 * charset callback: encode WC as a single ISO-8859-1 byte.
 * Returns 1 on success, 0 when S is null, and -1 (writing nothing)
 * when WC is outside 0..0xff.
 */
static int wctomb_iso1(void *map, char *s, int wc)
{
	(void)map;

	if (s == NULL)
		return 0;
	if (wc < 0 || wc > 0xff)
		return -1;

	s[0] = wc;
	return 1;
}
| 284 | |
| 285 | /* |
| 286 | * Implementation of any 8-bit charset. |
| 287 | */ |
| 288 | |
/*
 * Conversion tables for one 8-bit charset.
 */
struct map {
	/* Forward table with 256 entries: byte value -> character code, 0xffff meaning "unmapped". */
	const unsigned short *from;
	/* Hash index over FROM used for encoding; null until wctomb_8bit() builds it. */
	struct inverse_map *to;
};

/*
 * charset callback: decode one byte through the forward table.
 * Returns 1 for a mapped non-NUL character, 0 when the byte maps to
 * character 0 or there is no input (S null or N zero), and -1 when the
 * byte is unmapped.
 */
static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
{
	struct map *map = map1;
	unsigned short wc;

	if (!n || !s)
		return 0;
	wc = map->from[(unsigned char)*s];
	if (wc == 0xffff)
		return -1;
	if (pwc)
		*pwc = (int)wc;
	return wc ? 1 : 0;
}

/*
 * For the inverse map we use a hash table, which has the advantages
 * of small constant memory requirement and simple memory allocation,
 * but the disadvantage of slow conversion in the worst case.
 * If you need real-time performance while letting a potentially
 * malicious user define their own map, then the method used in
 * linux/drivers/char/consolemap.c would be more appropriate.
 */

struct inverse_map {
	/* first[h]: byte value at the head of the chain for hash h. */
	unsigned char first[256];
	/* next[b]: byte value following b in its chain; 0 ends the chain. */
	unsigned char next[256];
};

/*
 * The simple hash is good enough for this application.
 * Use the alternative trivial hashes for testing.
 */
#define HASH(i) ((i) & 0xff)
/* #define HASH(i) 0 */
/* #define HASH(i) 99 */

/*
 * Build the hash index for the forward table FROM.
 * Returns a malloc'd inverse map, or 0 if the allocation fails.
 */
static struct inverse_map *make_inverse_map(const unsigned short *from)
{
	struct inverse_map *to;
	char used[256];
	int i, j, k;

	to = malloc(sizeof *to);
	if (!to)
		return 0;
	for (i = 0; i < 256; i++)
		to->first[i] = to->next[i] = used[i] = 0;

	/*
	 * Insert from the highest byte downwards, always at the head of
	 * the chain. Byte 0 is therefore inserted last and can only ever
	 * be a chain head, which leaves 0 free to act as the end-of-chain
	 * marker in NEXT.
	 */
	for (i = 255; i >= 0; i--)
		if (from[i] != 0xffff) {
			k = HASH(from[i]);
			to->next[i] = to->first[k];
			to->first[k] = i;
			used[k] = 1;
		}

	/*
	 * Point the empty buckets at an empty list: any byte whose NEXT
	 * is 0 gives a one-element chain, so a lookup in an empty bucket
	 * stops after a single comparison.
	 */
	for (i = 0; i < 256; i++)
		if (!to->next[i])
			break;
	if (i < 256)
		for (j = 0; j < 256; j++)
			if (!used[j])
				to->first[j] = i;

	return to;
}

/*
 * charset callback: encode WC1 as one byte of the 8-bit charset.
 * Returns 1 on success, 0 when S is null, and -1 (writing nothing)
 * when the charset has no byte for WC1.
 */
static int wctomb_8bit(void *map1, char *s, int wc1)
{
	struct map *map = map1;
	unsigned short wc = wc1;
	int i;

	if (!s)
		return 0;

	/*
	 * Only 16-bit codes can appear in the table, and 0xffff is the
	 * "unmapped" marker rather than a character: letting it through
	 * would match an unmapped slot below and emit a byte that the
	 * charset does not define.
	 */
	if ((wc1 & ~0xffff) || wc == 0xffff)
		return -1;

	/*
	 * Build the inverse map on first use. (Skip this step to test
	 * the case where malloc fails.)
	 */
	if (!map->to)
		map->to = make_inverse_map(map->from);

	if (map->to) {
		/* Use the inverse map: walk the chain for this hash value. */
		i = map->to->first[HASH(wc)];
		for (;;) {
			if (map->from[i] == wc) {
				*s = i;
				return 1;
			}
			if (!(i = map->to->next[i]))
				break;
		}
	}
	else {
		/* We don't have an inverse map, so do a linear search. */
		for (i = 0; i < 256; i++)
			if (map->from[i] == wc) {
				*s = i;
				return 1;
			}
	}

	return -1;
}
| 401 | |
| 402 | /* |
| 403 | * The "constructor" charset_find(). |
| 404 | */ |
| 405 | |
| 406 | struct charset charset_utf8 = { |
| 407 | 6, |
| 408 | &mbtowc_utf8, |
| 409 | &wctomb_utf8, |
| 410 | 0 |
| 411 | }; |
| 412 | |
| 413 | struct charset charset_iso1 = { |
| 414 | 1, |
| 415 | &mbtowc_iso1, |
| 416 | &wctomb_iso1, |
| 417 | 0 |
| 418 | }; |
| 419 | |
| 420 | struct charset charset_ascii = { |
| 421 | 1, |
| 422 | &mbtowc_ascii, |
| 423 | &wctomb_ascii, |
| 424 | 0 |
| 425 | }; |
| 426 | |
| 427 | struct charset *charset_find(const char *code) |
| 428 | { |
| 429 | int i; |
| 430 | |
| 431 | /* Find good (MIME) name. */ |
| 432 | for (i = 0; names[i].bad; i++) |
| 433 | if (!ascii_strcasecmp(code, names[i].bad)) { |
| 434 | code = names[i].good; |
| 435 | break; |
| 436 | } |
| 437 | |
| 438 | /* Recognise some charsets for which we avoid using a table. */ |
| 439 | if (!ascii_strcasecmp(code, "UTF-8")) |
| 440 | return &charset_utf8; |
| 441 | if (!ascii_strcasecmp(code, "US-ASCII")) |
| 442 | return &charset_ascii; |
| 443 | if (!ascii_strcasecmp(code, "ISO-8859-1")) |
| 444 | return &charset_iso1; |
| 445 | |
| 446 | /* Look for a mapping for a simple 8-bit encoding. */ |
| 447 | for (i = 0; maps[i].name; i++) |
| 448 | if (!ascii_strcasecmp(code, maps[i].name)) { |
| 449 | if (!maps[i].charset) { |
| 450 | maps[i].charset = malloc(sizeof(struct charset)); |
| 451 | if (maps[i].charset) { |
| 452 | struct map *map = malloc(sizeof(struct map)); |
| 453 | if (!map) { |
| 454 | free(maps[i].charset); |
| 455 | maps[i].charset = 0; |
| 456 | } |
| 457 | else { |
| 458 | maps[i].charset->max = 1; |
| 459 | maps[i].charset->mbtowc = &mbtowc_8bit; |
| 460 | maps[i].charset->wctomb = &wctomb_8bit; |
| 461 | maps[i].charset->map = map; |
| 462 | map->from = maps[i].map; |
| 463 | map->to = 0; /* inverse mapping is created when required */ |
| 464 | } |
| 465 | } |
| 466 | } |
| 467 | return maps[i].charset; |
| 468 | } |
| 469 | |
| 470 | return 0; |
| 471 | } |
| 472 | |
| 473 | /* |
| 474 | * Function to convert a buffer from one encoding to another. |
| 475 | * Invalid bytes are replaced by '#', and characters that are |
| 476 | * not available in the target encoding are replaced by '?'. |
| 477 | * Each of TO and TOLEN may be zero, if the result is not needed. |
| 478 | * The output buffer is null-terminated, so it is all right to |
| 479 | * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0). |
| 480 | */ |
| 481 | |
| 482 | int charset_convert(const char *fromcode, const char *tocode, |
| 483 | const char *from, size_t fromlen, |
| 484 | char **to, size_t *tolen) |
| 485 | { |
| 486 | int ret = 0; |
| 487 | struct charset *charset1, *charset2; |
| 488 | char *tobuf, *p, *newbuf; |
| 489 | int i, j, wc; |
| 490 | |
| 491 | charset1 = charset_find(fromcode); |
| 492 | charset2 = charset_find(tocode); |
| 493 | if (!charset1 || !charset2 ) |
| 494 | return -1; |
| 495 | |
| 496 | tobuf = safe_malloc_mul2add_(fromlen, /*times*/charset2->max, /*+*/1); |
| 497 | if (!tobuf) |
| 498 | return -2; |
| 499 | |
| 500 | for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) { |
| 501 | i = charset_mbtowc(charset1, &wc, from, fromlen); |
| 502 | if (!i) |
| 503 | i = 1; |
| 504 | else if (i == -1) { |
| 505 | i = 1; |
| 506 | wc = '#'; |
| 507 | ret = 2; |
| 508 | } |
| 509 | j = charset_wctomb(charset2, p, wc); |
| 510 | if (j == -1) { |
| 511 | if (!ret) |
| 512 | ret = 1; |
| 513 | j = charset_wctomb(charset2, p, '?'); |
| 514 | if (j == -1) |
| 515 | j = 0; |
| 516 | } |
| 517 | } |
| 518 | |
| 519 | if (tolen) |
| 520 | *tolen = p - tobuf; |
| 521 | *p++ = '\0'; |
| 522 | if (to) { |
| 523 | newbuf = realloc(tobuf, p - tobuf); |
| 524 | *to = newbuf ? newbuf : tobuf; |
| 525 | } |
| 526 | else |
| 527 | free(tobuf); |
| 528 | |
| 529 | return ret; |
| 530 | } |
| 531 | |
#endif /* !defined _WIN32 && !defined HAVE_ICONV */