ce188d4d |
1 | /* |
2 | * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> |
3 | * |
4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by |
6 | * the Free Software Foundation; either version 2 of the License, or |
7 | * (at your option) any later version. |
8 | * |
9 | * This program is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. |
13 | * |
14 | * You should have received a copy of the GNU General Public License along |
15 | * with this program; if not, write to the Free Software Foundation, Inc., |
16 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
17 | */ |
18 | |
19 | #ifdef HAVE_CONFIG_H |
20 | # include <config.h> |
21 | #endif |
22 | |
23 | #if !defined _WIN32 && defined HAVE_ICONV |
24 | |
25 | #include <assert.h> |
26 | #include <errno.h> |
27 | #include <iconv.h> |
28 | #include <stdlib.h> |
29 | #include <string.h> |
30 | |
31 | #include "iconvert.h" |
32 | #include "share/alloc.h" |
33 | #include "share/safe_str.h" |
34 | |
/*
 * Return nonzero if NAME is exactly "UTF-8", compared without regard to
 * ASCII case.  The comparison is done byte by byte (not with strcasecmp(),
 * which is locale-dependent) and stops at the first mismatch, so it never
 * reads past the terminator of a short name.
 */
static int is_utf8_name(const char *name)
{
	static const char upper[] = "UTF-8";
	static const char lower[] = "utf-8";
	size_t i;

	/* sizeof includes the terminator, so the length is checked as well */
	for (i = 0; i < sizeof(upper); i++) {
		if (name[i] != upper[i] && name[i] != lower[i])
			return 0;
	}
	return 1;
}

/*
 * Convert data from one encoding to another.
 *
 *   fromcode, tocode : iconv encoding names of the input and the output
 *   from, fromlen    : input bytes and their count
 *   to               : if non-NULL, receives a newly allocated,
 *                      NUL-terminated result that the caller must free()
 *   tolen            : if non-NULL, receives the result length in bytes,
 *                      not counting the terminating NUL
 *
 * Neither *to nor *tolen is written when a negative value is returned.
 *
 * Return:
 *
 *  -2 : memory allocation failed
 *  -1 : unknown encoding
 *   0 : data was converted exactly
 *   1 : data was converted inexactly
 *   2 : data was invalid (but still converted)
 *
 * Invalid input bytes are replaced by '#'; characters that cannot be
 * represented in the target encoding are replaced by '?'.
 *
 * We convert in two steps, via UTF-8, as this is the only
 * reliable way of distinguishing between invalid input
 * and valid input which iconv refuses to transliterate.
 * We convert from UTF-8 twice, because we have no way of
 * knowing whether the conversion was exact if iconv returns
 * E2BIG (due to a bug in the specification of iconv).
 * An alternative approach is to assume that the output of
 * iconv is never more than 4 times as long as the input,
 * but I prefer to avoid that assumption if possible.
 */

int iconvert(const char *fromcode, const char *tocode,
	     const char *from, size_t fromlen,
	     char **to, size_t *tolen)
{
	int ret = 0;
	iconv_t cd1, cd2;
	char *ib;
	char *ob;
	char *utfbuf = NULL, *outbuf, *newbuf;
	size_t utflen, outlen, ibl, obl, k;
	char tbuf[2048];

	cd1 = iconv_open("UTF-8", fromcode);
	if (cd1 == (iconv_t)(-1))
		return -1;

	/* cd2 stays invalid when the target is UTF-8: no second step is needed */
	cd2 = (iconv_t)(-1);
	if (!is_utf8_name(tocode)) {
		char *tocode1;
		/* room for tocode, the 10 characters of "//TRANSLIT" and the NUL */
		size_t dest_len = strlen(tocode) + 11;
		/*
		 * Try using this non-standard feature of glibc and libiconv.
		 * This is deliberately not a config option as people often
		 * change their iconv library without rebuilding applications.
		 */
		tocode1 = safe_malloc_(dest_len);
		if (!tocode1)
			goto fail;

		safe_strncpy(tocode1, tocode, dest_len);
		safe_strncat(tocode1, "//TRANSLIT", dest_len);
		cd2 = iconv_open(tocode1, "UTF-8");
		free(tocode1);

		/*
		 * Fall back to the plain target name.  The source must be
		 * UTF-8 here too, because cd2 is only ever fed the UTF-8
		 * intermediate buffer, never the original input.
		 */
		if (cd2 == (iconv_t)(-1))
			cd2 = iconv_open(tocode, "UTF-8");

		if (cd2 == (iconv_t)(-1)) {
			iconv_close(cd1);
			return -1;
		}
	}

	utflen = 1; /*fromlen * 2 + 1; XXX */
	utfbuf = malloc(utflen);
	if (!utfbuf)
		goto fail;

	/* Convert to UTF-8 */
	ib = (char *)from; /* iconv() takes char **, but does not modify the input */
	ibl = fromlen;
	ob = utfbuf;
	obl = utflen;
	for (;;) {
		k = iconv(cd1, &ib, &ibl, &ob, &obl);
		assert((!k && !ibl) ||
		       (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
		       (k == (size_t)(-1) &&
			(errno == EILSEQ || errno == EINVAL) && ibl));
		if (!ibl)
			break;
		/*
		 * Fewer than 6 free bytes is treated as "output buffer full"
		 * (presumably the longest UTF-8 sequence iconv might emit);
		 * otherwise the stop was caused by bad input.
		 */
		if (obl < 6) {
			/* Enlarge the buffer */
			if (utflen * 2 < utflen) /* overflow check */
				goto fail;
			utflen *= 2;
			newbuf = realloc(utfbuf, utflen);
			if (!newbuf)
				goto fail;
			ob = (ob - utfbuf) + newbuf;
			obl = utflen - (ob - newbuf);
			utfbuf = newbuf;
		}
		else {
			/* Invalid input: skip one byte, emit '#', reset the state */
			ib++, ibl--;
			*ob++ = '#', obl--;
			ret = 2;
			iconv(cd1, NULL, NULL, NULL, NULL);
		}
	}

	if (cd2 == (iconv_t)(-1)) {
		/* The target encoding was UTF-8 */
		utflen = ob - utfbuf;
		if (to) {
			/*
			 * Plain realloc() leaves utfbuf valid on failure, so the
			 * fail path frees it exactly once regardless of how the
			 * project's realloc wrappers treat their argument.
			 * utflen + 1 cannot wrap: utflen never exceeds the size
			 * of the existing allocation, which is a power of two.
			 */
			newbuf = realloc(utfbuf, utflen + 1);
			if (!newbuf)
				goto fail;
			newbuf[utflen] = '\0';
			*to = newbuf;
		}
		else {
			free(utfbuf);
		}
		if (tolen)
			*tolen = utflen;
		iconv_close(cd1);
		return ret;
	}

	/*
	 * Truncate the buffer to be tidy.  realloc(p, 0) may free p and
	 * return NULL, so an empty result is left alone; a failed shrink is
	 * harmless because the original buffer remains valid.
	 */
	utflen = ob - utfbuf;
	if (utflen > 0) {
		newbuf = realloc(utfbuf, utflen);
		if (newbuf)
			utfbuf = newbuf;
	}

	/* Convert from UTF-8 to discover how long the output is */
	outlen = 0;
	ib = utfbuf;
	ibl = utflen;
	while (ibl) {
		/* tbuf is only scratch space: the output is counted, not kept */
		ob = tbuf;
		obl = sizeof(tbuf);
		k = iconv(cd2, &ib, &ibl, &ob, &obl);
		assert((k != (size_t)(-1) && !ibl) ||
		       (k == (size_t)(-1) && errno == E2BIG && ibl) ||
		       (k == (size_t)(-1) && errno == EILSEQ && ibl));
		if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
			/* Replace one character */
			char *tb = "?";
			size_t tbl = 1;

			outlen += ob - tbuf;
			ob = tbuf;
			obl = sizeof(tbuf);
			k = iconv(cd2, &tb, &tbl, &ob, &obl);
			assert((!k && !tbl) ||
			       (k == (size_t)(-1) && errno == EILSEQ && tbl));
			/*
			 * Skip the lead byte and its continuation bytes
			 * (10xxxxxx) only, so that following characters are
			 * still converted.
			 */
			for (++ib, --ibl; ibl && (*ib & 0xC0) == 0x80; ib++, ibl--)
				;
		}
		outlen += ob - tbuf;
	}
	/* Count whatever is needed to return to the initial shift state */
	ob = tbuf;
	obl = sizeof(tbuf);
	k = iconv(cd2, NULL, NULL, &ob, &obl);
	assert(!k);
	outlen += ob - tbuf;

	/* Convert from UTF-8 for real */
	outbuf = safe_malloc_add_2op_(outlen, /*+*/1);
	if (!outbuf)
		goto fail;
	ib = utfbuf;
	ibl = utflen;
	ob = outbuf;
	obl = outlen;
	while (ibl) {
		k = iconv(cd2, &ib, &ibl, &ob, &obl);
		assert((k != (size_t)(-1) && !ibl) ||
		       (k == (size_t)(-1) && errno == EILSEQ && ibl));
		/* nonzero k: irreversible conversions were made, or an error */
		if (k && !ret)
			ret = 1;
		if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
			/* Replace one character */
			char *tb = "?";
			size_t tbl = 1;

			k = iconv(cd2, &tb, &tbl, &ob, &obl);
			assert((!k && !tbl) ||
			       (k == (size_t)(-1) && errno == EILSEQ && tbl));
			/* Must skip exactly as the sizing pass did */
			for (++ib, --ibl; ibl && (*ib & 0xC0) == 0x80; ib++, ibl--)
				;
		}
	}
	k = iconv(cd2, NULL, NULL, &ob, &obl);
	assert(!k);
	assert(!obl);
	*ob = '\0';

	free(utfbuf);
	iconv_close(cd1);
	iconv_close(cd2);
	if (tolen)
		*tolen = outlen;
	if (to)
		*to = outbuf;
	else
		free(outbuf);
	return ret;

fail:
	free(utfbuf); /* free(NULL) is a no-op */
	iconv_close(cd1);
	if (cd2 != (iconv_t)(-1))
		iconv_close(cd2);
	return -2;
}
253 | |
254 | #endif /* HAVE_ICONV */ |