ce188d4d |
1 | /* |
2 | * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> |
3 | * |
4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by |
6 | * the Free Software Foundation; either version 2 of the License, or |
7 | * (at your option) any later version. |
8 | * |
9 | * This program is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. |
13 | * |
14 | * You should have received a copy of the GNU General Public License along |
15 | * with this program; if not, write to the Free Software Foundation, Inc., |
16 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
17 | */ |
18 | |
19 | /* |
20 | * See the corresponding header file for a description of the functions |
21 | * that this file provides. |
22 | * |
23 | * This was first written for Ogg Vorbis but could be of general use. |
24 | * |
25 | * The only deliberate assumption about data sizes is that a short has |
26 | * at least 16 bits, but this code has only been tested on systems with |
27 | * 8-bit char, 16-bit short and 32-bit int. |
28 | */ |
29 | |
30 | #ifdef HAVE_CONFIG_H |
31 | # include <config.h> |
32 | #endif |
33 | |
34 | #if !defined _WIN32 && !defined HAVE_ICONV /* should be && defined USE_CHARSET_CONVERT */ |
35 | |
36 | #include <stdlib.h> |
37 | |
38 | #include "share/alloc.h" |
39 | #include "charset.h" |
40 | |
41 | #include "charmaps.h" |
42 | |
43 | /* |
44 | * This is like the standard strcasecmp, but it does not depend |
45 | * on the locale. Locale-dependent functions can be dangerous: |
46 | * we once had a bug involving strcasecmp("iso", "ISO") in a |
47 | * Turkish locale! |
48 | * |
49 | * (I'm not really sure what the official standard says |
50 | * about the sign of strcasecmp("Z", "["), but usually |
51 | * we're only interested in whether it's zero.) |
52 | */ |
53 | |
/*
 * Locale-independent, ASCII-only case-insensitive string comparison.
 *
 * Returns 0 when the strings are equal ignoring the case of 'a'..'z',
 * and a non-zero value otherwise.  The value is the difference of the
 * raw (not case-folded) bytes at the first mismatch, so only the
 * zero / non-zero distinction should be relied upon.
 */
static int ascii_strcasecmp(const char *s1, const char *s2)
{
	while (*s1 && *s2) {
		if (*s1 != *s2) {
			char a = *s1;
			char b = *s2;

			/* Fold lower case to upper case, ASCII letters only. */
			if (a >= 'a' && a <= 'z')
				a += 'A' - 'a';
			if (b >= 'a' && b <= 'z')
				b += 'A' - 'a';
			if (a != b)
				break;
		}
		s1++;
		s2++;
	}
	return (unsigned char)*s1 - (unsigned char)*s2;
}
74 | |
75 | /* |
76 | * UTF-8 equivalents of the C library's wctomb() and mbtowc(). |
77 | */ |
78 | |
/*
 * Decode one UTF-8 sequence from the first N bytes of S.
 *
 * On success the code point is stored through PWC (if non-null) and the
 * number of bytes consumed (1..6) is returned.  A NUL byte is decoded
 * as code point 0 but reported with return value 0.  Returns 0 when S
 * is null or N is 0, and -1 for an invalid, truncated or overlong
 * sequence.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n)
{
	unsigned char lead;
	int value, len, pos;

	if (!s || !n)
		return 0;

	lead = *s;

	/* One-byte form. */
	if (lead < 0x80) {
		if (pwc)
			*pwc = lead;
		return lead != 0;
	}

	/* Stray continuation byte, or a lead byte that can only be overlong. */
	if (lead < 0xc2)
		return -1;

	/* Two-byte form: cannot be overlong once lead >= 0xc2. */
	if (lead < 0xe0) {
		if (n < 2 || (s[1] & 0xc0) != 0x80)
			return -1;
		if (pwc)
			*pwc = ((lead & 0x1f) << 6) | (s[1] & 0x3f);
		return 2;
	}

	/* Longer forms: the lead byte determines the total length. */
	if (lead < 0xf0)
		len = 3;
	else if (lead < 0xf8)
		len = 4;
	else if (lead < 0xfc)
		len = 5;
	else if (lead < 0xfe)
		len = 6;
	else
		return -1;

	if (n < (size_t)len)
		return -1;

	/* The lead byte contributes its low (7 - len) bits. */
	value = lead & ((1 << (7 - len)) - 1);
	for (pos = 1; pos < len; pos++) {
		if ((s[pos] & 0xc0) != 0x80)
			return -1;
		value = (value << 6) | (s[pos] & 0x3f);
	}

	/* Reject values that would have fitted in a shorter sequence. */
	if (value < (1 << (5 * len - 4)))
		return -1;

	if (pwc)
		*pwc = value;
	return len;
}
129 | |
/*
 * Encode code point WC1 as UTF-8 into S.
 *
 * Returns the number of bytes written (1..6), 0 when S is null, and -1
 * when WC1 is negative (it does not fit in 31 bits once reinterpreted
 * as unsigned).  S must have room for the bytes written; no terminator
 * is added.
 */
int utf8_wctomb(char *s, int wc1)
{
	/* Lead-byte marker for each sequence length; index 0 is unused. */
	static const unsigned char lead[7] = {
		0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};
	unsigned int code = wc1;
	int len, pos;

	if (!s)
		return 0;

	if (code < (1u << 7))
		len = 1;
	else if (code < (1u << 11))
		len = 2;
	else if (code < (1u << 16))
		len = 3;
	else if (code < (1u << 21))
		len = 4;
	else if (code < (1u << 26))
		len = 5;
	else if (code < (1u << 31))
		len = 6;
	else
		return -1;

	/* Fill continuation bytes from the end, six payload bits each. */
	for (pos = len - 1; pos > 0; pos--) {
		s[pos] = 0x80 | (code & 0x3f);
		code >>= 6;
	}
	/* Whatever remains fits in the lead byte's payload bits. */
	s[0] = lead[len] | code;

	return len;
}
178 | |
179 | /* |
180 | * The charset "object" and methods. |
181 | */ |
182 | |
/*
 * A charset is a pair of conversion callbacks plus the opaque state
 * pointer that is handed back to them on every call.
 */
struct charset {
	/* Value reported by charset_max(); charset_convert() uses it as the
	 * per-input-byte multiplier when sizing its output buffer. */
	int max;
	/* Decode one character from at most n bytes of s into *pwc. */
	int (*mbtowc)(void *map, int *pwc, const char *s, size_t n);
	/* Encode wc into s. */
	int (*wctomb)(void *map, char *s, int wc);
	/* Passed as the first argument of both callbacks. */
	void *map;
};

/* Dispatch to the charset's decoder, passing along its private state. */
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
{
	void *state = charset->map;

	return charset->mbtowc(state, pwc, s, n);
}

/* Dispatch to the charset's encoder, passing along its private state. */
int charset_wctomb(struct charset *charset, char *s, int wc)
{
	void *state = charset->map;

	return charset->wctomb(state, s, wc);
}

/* Return the charset's `max` field. */
int charset_max(struct charset *charset)
{
	return charset->max;
}
204 | |
205 | /* |
206 | * Implementation of UTF-8. |
207 | */ |
208 | |
/* Charset decode callback for UTF-8; the state pointer is not used. */
static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
{
	int consumed = utf8_mbtowc(pwc, s, n);

	(void)map;
	return consumed;
}
214 | |
/* Charset encode callback for UTF-8; the state pointer is not used. */
static int wctomb_utf8(void *map, char *s, int wc)
{
	int written = utf8_wctomb(s, wc);

	(void)map;
	return written;
}
220 | |
221 | /* |
222 | * Implementation of US-ASCII. |
223 | * Probably on most architectures this compiles to less than 256 bytes |
224 | * of code, so we can save space by not having a table for this one. |
225 | */ |
226 | |
/*
 * Decode one US-ASCII byte.
 *
 * Returns 1 for a non-NUL byte, 0 for a NUL byte (still stored through
 * PWC) or when S is null / N is 0, and -1 for a byte above 0x7f.
 */
static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
{
	unsigned char byte;

	(void)map;
	if (!s || !n)
		return 0;

	byte = (unsigned char)*s;
	if (byte > 0x7f)
		return -1;

	if (pwc)
		*pwc = byte;
	return byte != 0;
}
241 | |
/*
 * Encode WC as a single US-ASCII byte.
 *
 * Returns 1 on success, 0 when S is null, and -1 when WC is outside
 * 0..0x7f (including negative values).
 */
static int wctomb_ascii(void *map, char *s, int wc)
{
	(void)map;
	if (!s)
		return 0;
	if (wc < 0 || wc > 0x7f)
		return -1;
	s[0] = wc;
	return 1;
}
252 | |
253 | /* |
254 | * Implementation of ISO-8859-1. |
255 | * Probably on most architectures this compiles to less than 256 bytes |
256 | * of code, so we can save space by not having a table for this one. |
257 | */ |
258 | |
/*
 * Decode one ISO-8859-1 byte: the code point equals the byte value.
 *
 * Returns 1 for a non-NUL byte, 0 for a NUL byte (still stored through
 * PWC) or when S is null / N is 0, and -1 if the byte value exceeds
 * 0xff (only possible where char is wider than 8 bits).
 */
static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
{
	int value;

	(void)map;
	if (!s || !n)
		return 0;

	value = (unsigned char)*s;
	if (value > 0xff)
		return -1;

	if (pwc)
		*pwc = value;
	return value != 0;
}
273 | |
/*
 * Encode WC as a single ISO-8859-1 byte.
 *
 * Returns 1 on success, 0 when S is null, and -1 when WC is outside
 * 0..0xff (including negative values).
 */
static int wctomb_iso1(void *map, char *s, int wc)
{
	(void)map;
	if (!s)
		return 0;
	if (wc < 0 || wc > 0xff)
		return -1;
	s[0] = wc;
	return 1;
}
284 | |
285 | /* |
286 | * Implementation of any 8-bit charset. |
287 | */ |
288 | |
/*
 * Per-charset state for a table-driven 8-bit encoding.
 */
struct map {
	/* Forward table indexed by byte value (0..255); an entry of 0xffff
	 * marks a byte that has no mapping. */
	const unsigned short *from;
	/* Reverse lookup structure.  Null until wctomb_8bit() builds it on
	 * first use, and still null afterwards if that allocation failed. */
	struct inverse_map *to;
};

/*
 * Decode one byte through the forward table.
 *
 * Returns 1 for a byte mapped to a non-zero code, 0 for a byte mapped
 * to code 0 (still stored through PWC) or when S is null / N is 0, and
 * -1 for an unmapped byte.
 */
static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
{
	struct map *map = map1;
	unsigned short wc;

	if (!n || !s)
		return 0;
	wc = map->from[(unsigned char)*s];
	if (wc == 0xffff)
		return -1;
	if (pwc)
		*pwc = (int)wc;
	return wc ? 1 : 0;
}

/*
 * For the inverse map we use a hash table, which has the advantages
 * of small constant memory requirement and simple memory allocation,
 * but the disadvantage of slow conversion in the worst case.
 * If you need real-time performance while letting a potentially
 * malicious user define their own map, then the method used in
 * linux/drivers/char/consolemap.c would be more appropriate.
 */

struct inverse_map {
	unsigned char first[256];	/* bucket -> first byte value in its chain */
	unsigned char next[256];	/* byte value -> next in chain, 0 ends it */
};

/*
 * The simple hash is good enough for this application.
 * Use the alternative trivial hashes for testing.
 */
#define HASH(i) ((i) & 0xff)
/* #define HASH(i) 0 */
/* #define HASH(i) 99 */

/*
 * Build the reverse lookup structure for the forward table FROM.
 *
 * Returns a malloc()ed inverse_map, or a null pointer if the allocation
 * fails.  Unmapped bytes (0xffff) are not inserted in any chain.
 *
 * A `next` value of 0 terminates a chain.  That is unambiguous because
 * bytes are inserted in descending order, each at the head of its
 * chain, so byte 0 can only ever be the head of a chain and never
 * appears as a successor.
 */
static struct inverse_map *make_inverse_map(const unsigned short *from)
{
	struct inverse_map *to;
	char used[256];
	int i, j, k;

	to = malloc(sizeof *to);
	if (!to)
		return NULL;
	for (i = 0; i < 256; i++)
		to->first[i] = to->next[i] = used[i] = 0;
	for (i = 255; i >= 0; i--)
		if (from[i] != 0xffff) {
			k = HASH(from[i]);
			to->next[i] = to->first[k];
			to->first[k] = i;
			used[k] = 1;
		}

	/*
	 * Point the empty buckets at an empty list: pick the first byte
	 * whose `next` is 0 so that a lookup landing in an unused bucket
	 * inspects exactly one entry and then stops.
	 */
	for (i = 0; i < 256; i++)
		if (!to->next[i])
			break;
	if (i < 256)
		for (j = 0; j < 256; j++)
			if (!used[j])
				to->first[j] = i;

	return to;
}

/*
 * Encode WC1 as a single byte using the charset's table.
 *
 * Returns 1 and stores the byte in *S on success, 0 when S is null,
 * and -1 when WC1 has no representation in this charset.  The inverse
 * map is created on the first call; if it cannot be allocated, a
 * linear scan of the forward table is used instead.
 */
static int wctomb_8bit(void *map1, char *s, int wc1)
{
	struct map *map = map1;
	unsigned short wc = wc1;
	int i;

	if (!s)
		return 0;

	if (wc1 & ~0xffff)
		return -1;

	/*
	 * 0xffff is the "unmapped" sentinel in the forward table, so it can
	 * never be a real character of the charset.  Without this check the
	 * comparisons below would match an unmapped byte and report success.
	 */
	if (wc == 0xffff)
		return -1;

	if (1) /* Change 1 to 0 to test the case where malloc fails. */
		if (!map->to)
			map->to = make_inverse_map(map->from);

	if (map->to) {
		/* Use the inverse map. */
		i = map->to->first[HASH(wc)];
		for (;;) {
			if (map->from[i] == wc) {
				*s = i;
				return 1;
			}
			if (!(i = map->to->next[i]))
				break;
		}
	}
	else {
		/* We don't have an inverse map, so do a linear search. */
		for (i = 0; i < 256; i++)
			if (map->from[i] == wc) {
				*s = i;
				return 1;
			}
	}

	return -1;
}
401 | |
402 | /* |
403 | * The "constructor" charset_find(). |
404 | */ |
405 | |
406 | struct charset charset_utf8 = { |
407 | 6, |
408 | &mbtowc_utf8, |
409 | &wctomb_utf8, |
410 | 0 |
411 | }; |
412 | |
413 | struct charset charset_iso1 = { |
414 | 1, |
415 | &mbtowc_iso1, |
416 | &wctomb_iso1, |
417 | 0 |
418 | }; |
419 | |
420 | struct charset charset_ascii = { |
421 | 1, |
422 | &mbtowc_ascii, |
423 | &wctomb_ascii, |
424 | 0 |
425 | }; |
426 | |
427 | struct charset *charset_find(const char *code) |
428 | { |
429 | int i; |
430 | |
431 | /* Find good (MIME) name. */ |
432 | for (i = 0; names[i].bad; i++) |
433 | if (!ascii_strcasecmp(code, names[i].bad)) { |
434 | code = names[i].good; |
435 | break; |
436 | } |
437 | |
438 | /* Recognise some charsets for which we avoid using a table. */ |
439 | if (!ascii_strcasecmp(code, "UTF-8")) |
440 | return &charset_utf8; |
441 | if (!ascii_strcasecmp(code, "US-ASCII")) |
442 | return &charset_ascii; |
443 | if (!ascii_strcasecmp(code, "ISO-8859-1")) |
444 | return &charset_iso1; |
445 | |
446 | /* Look for a mapping for a simple 8-bit encoding. */ |
447 | for (i = 0; maps[i].name; i++) |
448 | if (!ascii_strcasecmp(code, maps[i].name)) { |
449 | if (!maps[i].charset) { |
450 | maps[i].charset = malloc(sizeof(struct charset)); |
451 | if (maps[i].charset) { |
452 | struct map *map = malloc(sizeof(struct map)); |
453 | if (!map) { |
454 | free(maps[i].charset); |
455 | maps[i].charset = 0; |
456 | } |
457 | else { |
458 | maps[i].charset->max = 1; |
459 | maps[i].charset->mbtowc = &mbtowc_8bit; |
460 | maps[i].charset->wctomb = &wctomb_8bit; |
461 | maps[i].charset->map = map; |
462 | map->from = maps[i].map; |
463 | map->to = 0; /* inverse mapping is created when required */ |
464 | } |
465 | } |
466 | } |
467 | return maps[i].charset; |
468 | } |
469 | |
470 | return 0; |
471 | } |
472 | |
473 | /* |
474 | * Function to convert a buffer from one encoding to another. |
475 | * Invalid bytes are replaced by '#', and characters that are |
476 | * not available in the target encoding are replaced by '?'. |
477 | * Each of TO and TOLEN may be zero, if the result is not needed. |
478 | * The output buffer is null-terminated, so it is all right to |
479 | * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0). |
480 | */ |
481 | |
482 | int charset_convert(const char *fromcode, const char *tocode, |
483 | const char *from, size_t fromlen, |
484 | char **to, size_t *tolen) |
485 | { |
486 | int ret = 0; |
487 | struct charset *charset1, *charset2; |
488 | char *tobuf, *p, *newbuf; |
489 | int i, j, wc; |
490 | |
491 | charset1 = charset_find(fromcode); |
492 | charset2 = charset_find(tocode); |
493 | if (!charset1 || !charset2 ) |
494 | return -1; |
495 | |
496 | tobuf = safe_malloc_mul2add_(fromlen, /*times*/charset2->max, /*+*/1); |
497 | if (!tobuf) |
498 | return -2; |
499 | |
500 | for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) { |
501 | i = charset_mbtowc(charset1, &wc, from, fromlen); |
502 | if (!i) |
503 | i = 1; |
504 | else if (i == -1) { |
505 | i = 1; |
506 | wc = '#'; |
507 | ret = 2; |
508 | } |
509 | j = charset_wctomb(charset2, p, wc); |
510 | if (j == -1) { |
511 | if (!ret) |
512 | ret = 1; |
513 | j = charset_wctomb(charset2, p, '?'); |
514 | if (j == -1) |
515 | j = 0; |
516 | } |
517 | } |
518 | |
519 | if (tolen) |
520 | *tolen = p - tobuf; |
521 | *p++ = '\0'; |
522 | if (to) { |
523 | newbuf = realloc(tobuf, p - tobuf); |
524 | *to = newbuf ? newbuf : tobuf; |
525 | } |
526 | else |
527 | free(tobuf); |
528 | |
529 | return ret; |
530 | } |
531 | |
532 | #endif /* USE_CHARSET_ICONV */ |