mutt stable branch with some hacks
1/*
2 * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 */
18
19/*
20 * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
21 */
22
23#if HAVE_CONFIG_H
24# include "config.h"
25#endif
26
27#include "mutt.h"
28#include "mbyte.h"
29#include "charset.h"
30
31#include <errno.h>
32
33#include <ctype.h>
34
35#ifndef EILSEQ
36#define EILSEQ EINVAL
37#endif
38
/* Non-zero while the charset selected through mutt_set_charset() is UTF-8. */
int Charset_is_utf8 = 0;
#ifndef HAVE_WC_FUNCS
/* Non-zero while the selected charset is one of the Japanese encodings
 * recognized by mutt_set_charset(). */
static int charset_is_ja = 0;
/* iconv descriptors between the selected charset and UTF-8;
 * (iconv_t)(-1) means "not open". */
static iconv_t charset_to_utf8 = (iconv_t)(-1);
static iconv_t charset_from_utf8 = (iconv_t)(-1);
#endif
45
46void mutt_set_charset (char *charset)
47{
48 char buffer[STRING];
49
50 mutt_canonical_charset (buffer, sizeof (buffer), charset);
51
52 Charset_is_utf8 = 0;
53#ifndef HAVE_WC_FUNCS
54 charset_is_ja = 0;
55 if (charset_to_utf8 != (iconv_t)(-1))
56 {
57 iconv_close (charset_to_utf8);
58 charset_to_utf8 = (iconv_t)(-1);
59 }
60 if (charset_from_utf8 != (iconv_t)(-1))
61 {
62 iconv_close (charset_from_utf8);
63 charset_from_utf8 = (iconv_t)(-1);
64 }
65#endif
66
67 if (mutt_is_utf8 (buffer))
68 Charset_is_utf8 = 1;
69#ifndef HAVE_WC_FUNCS
70 else if (!ascii_strcasecmp(buffer, "euc-jp") || !ascii_strcasecmp(buffer, "shift_jis")
71 || !ascii_strcasecmp(buffer, "cp932") || !ascii_strcasecmp(buffer, "eucJP-ms"))
72 {
73 charset_is_ja = 1;
74
75 /* Note flags=0 to skip charset-hooks: User masters the $charset
76 * name, and we are sure of our "utf-8" constant. So there is no
77 * possibility of wrong name that we would want to try to correct
78 * with a charset-hook. Or rather: If $charset was wrong, we would
79 * want to try to correct... $charset directly.
80 */
81 charset_to_utf8 = mutt_iconv_open ("utf-8", charset, 0);
82 charset_from_utf8 = mutt_iconv_open (charset, "utf-8", 0);
83 }
84#endif
85
86#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
87 bind_textdomain_codeset(PACKAGE, buffer);
88#endif
89}
90
91#ifndef HAVE_WC_FUNCS
92
93/*
94 * For systems that don't have them, we provide here our own
95 * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
96 * Instead of using the locale, as these functions normally would,
97 * we use Mutt's Charset variable. We support 3 types of charset:
98 * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
99 * (2) For UTF-8, wchar_t uses UCS.
100 * (3) For stateless Japanese encodings, we use UCS and convert
101 * via UTF-8 using iconv.
102 * Unfortunately, we can't handle non-stateless encodings.
103 */
104
105static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
106{
107 char buf[MB_LEN_MAX+1];
108 ICONV_CONST char *ib;
109 char *ob;
110 size_t ibl, obl, r;
111
112 if (s)
113 {
114 ibl = mutt_wctoutf8 (buf, wc, sizeof (buf));
115 if (ibl == (size_t)(-1))
116 return (size_t)(-1);
117 ib = buf;
118 ob = s;
119 obl = MB_LEN_MAX;
120 r = iconv (cd, &ib, &ibl, &ob, &obl);
121 }
122 else
123 {
124 ib = "";
125 ibl = 1;
126 ob = buf;
127 obl = sizeof (buf);
128 r = iconv (cd, &ib, &ibl, &ob, &obl);
129 }
130 return ob - s;
131}
132
133size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
134{
135 /* We only handle stateless encodings, so we can ignore ps. */
136
137 if (Charset_is_utf8)
138 return mutt_wctoutf8 (s, wc, MB_LEN_MAX);
139 else if (charset_from_utf8 != (iconv_t)(-1))
140 return wcrtomb_iconv (s, wc, charset_from_utf8);
141 else
142 {
143 if (!s)
144 return 1;
145 if (wc < 0x100)
146 {
147 *s = wc;
148 return 1;
149 }
150 errno = EILSEQ;
151 return (size_t)(-1);
152 }
153}
154
155size_t mbrtowc_iconv (wchar_t *pwc, const char *s, size_t n,
156 mbstate_t *ps, iconv_t cd)
157{
158 static mbstate_t mbstate;
159 ICONV_CONST char *ib, *ibmax;
160 char *ob, *t;
161 size_t ibl, obl, k, r;
162 char bufi[8], bufo[6];
163
164 if (!n)
165 return (size_t)(-2);
166
167 t = memchr (ps, 0, sizeof (*ps));
168 k = t ? (t - (char *)ps) : sizeof (*ps);
169 if (k > sizeof (bufi))
170 k = 0;
171 if (k)
172 {
173 /* use the buffer for input */
174 memcpy (bufi, ps, k);
175 ib = bufi;
176 ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
177 memcpy (bufi + k, s, ibmax - bufi - k);
178 }
179 else
180 {
181 /* use the real input */
182 ib = (ICONV_CONST char*) s;
183 ibmax = (ICONV_CONST char*) s + n;
184 }
185
186 ob = bufo;
187 obl = sizeof (bufo);
188 ibl = 1;
189
190 for (;;)
191 {
192 r = iconv (cd, &ib, &ibl, &ob, &obl);
193 if (ob > bufo && (!k || ib > bufi + k))
194 {
195 /* we have a character */
196 memset (ps, 0, sizeof (*ps));
197 utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
198 return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
199 }
200 else if (!r || (r == (size_t)(-1) && errno == EINVAL))
201 {
202 if (ib + ibl < ibmax)
203 /* try using more input */
204 ++ibl;
205 else if (k && ib > bufi + k && bufi + k + n > ibmax)
206 {
207 /* switch to using real input */
208 ib = (ICONV_CONST char*) s + (ib - bufi - k);
209 ibmax = (ICONV_CONST char*) s + n;
210 k = 0;
211 ++ibl;
212 }
213 else
214 {
215 /* save the state and give up */
216 memset (ps, 0, sizeof (*ps));
217 if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */
218 memcpy (ps, ib, ibl);
219 return (size_t)(-2);
220 }
221 }
222 else
223 {
224 /* bad input */
225 errno = EILSEQ;
226 return (size_t)(-1);
227 }
228 }
229}
230
231size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
232{
233 static mbstate_t mbstate;
234
235 if (!ps)
236 ps = &mbstate;
237
238 if (Charset_is_utf8)
239 return utf8rtowc (pwc, s, n, ps);
240 else if (charset_to_utf8 != (iconv_t)(-1))
241 return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
242 else
243 {
244 if (!s)
245 {
246 memset(ps, 0, sizeof(*ps));
247 return 0;
248 }
249 if (!n)
250 return (size_t)-2;
251 if (pwc)
252 *pwc = (wchar_t)(unsigned char)*s;
253 return (*s != 0);
254 }
255}
256
257int iswprint (wint_t wc)
258{
259 if (Charset_is_utf8 || charset_is_ja)
260 return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
261 else
262 return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
263}
264
265int iswspace (wint_t wc)
266{
267 if (Charset_is_utf8 || charset_is_ja)
268 return (9 <= wc && wc <= 13) || wc == 32;
269 else
270 return (0 <= wc && wc < 256) ? isspace (wc) : 0;
271}
272
/*
 * Map a UCS code point to upper case.
 * The mapping is only implemented for code points below 0x130;
 * anything it does not know is returned unchanged.
 */
static wint_t towupper_ucs (wint_t x)
{
  /* a-z, and the Latin-1 lowercase letters except the division sign */
  if (0x60 < x && x < 0x7b)
    return x - 32;
  if (0xe0 <= x && x < 0xff && x != 0xf7)
    return x - 32;

  /* in this range the uppercase letter is the even code point of a pair */
  if (0x100 <= x && x < 0x130)
    return x - (x & 1);

  /* the two Latin-1 letters whose uppercase form lies outside Latin-1 */
  if (x == 0xb5)
    return 0x39c;
  if (x == 0xff)
    return 0x178;

  return x;
}
287
/*
 * Report whether the UCS code point x is an uppercase letter.
 *
 * The set matches what towlower_ucs() maps to a different character:
 * A-Z, the Latin-1 capitals 0xc0-0xde except the multiplication sign
 * 0xd7, and the even code points of the range 0x100-0x12f.
 * Returns 1 for uppercase, 0 otherwise.
 */
static int iswupper_ucs (wint_t x)
{
  /* Only works for x < 0x130 */
  if (0x40 < x && x < 0x5b)
    return 1;

  if (0xc0 <= x && x < 0xdf && x != 0xd7)
    return 1;

  /* upper/lower pairs: the even code point is the capital letter */
  if (0x100 <= x && x < 0x130)
    return !(x & 1);

  return 0;
}
304
/*
 * Map a UCS code point to lower case.
 * The mapping is only implemented for code points below 0x130;
 * anything it does not know is returned unchanged.
 */
static wint_t towlower_ucs (wint_t x)
{
  /* A-Z, and the Latin-1 capitals except the multiplication sign */
  if (0x40 < x && x < 0x5b)
    return x + 32;
  if (0xc0 <= x && x < 0xdf && x != 0xd7)
    return x + 32;

  /* in this range the lowercase letter is the odd code point of a pair */
  if (0x100 <= x && x < 0x130)
    return x | 1;

  return x;
}
315
/*
 * Report whether the UCS code point wc is a letter or a digit.
 * Everything from 0x100 up is accepted without further inspection,
 * so the answer is only meaningful for small code points.
 */
static int iswalnum_ucs (wint_t wc)
{
  wint_t folded;

  /* Only works for x < 0x220 */
  if (wc >= 0x100)
    return 1;

  /* Latin-1 letters: all of 0xc0-0xff but the two arithmetic signs */
  if (wc >= 0xc0)
    return (wc != 0xd7 && wc != 0xf7);

  /* Latin-1 symbols: only the ordinal indicators and the micro sign */
  if (wc >= 0xa0)
    return (wc == 0xaa || wc == 0xb5 || wc == 0xba);

  /* clearing bit 0x20 maps a-z onto A-Z */
  if (wc >= 0x3a)
  {
    folded = wc & ~0x20;
    return (0x40 < folded && folded < 0x5b);
  }

  /* what is left are the digits and the characters below them */
  return (wc >= 0x30);
}
332
/*
 * Report whether the UCS code point wc is a letter.
 * Everything from 0x100 up is accepted without further inspection,
 * so the answer is only meaningful for small code points.
 */
static int iswalpha_ucs (wint_t wc)
{
  wint_t folded;

  /* Only works for x < 0x220 */
  if (wc >= 0x100)
    return 1;

  /* Latin-1 letters: all of 0xc0-0xff but the two arithmetic signs */
  if (wc >= 0xc0)
    return (wc != 0xd7 && wc != 0xf7);

  /* Latin-1 symbols: only the ordinal indicators and the micro sign */
  if (wc >= 0xa0)
    return (wc == 0xaa || wc == 0xb5 || wc == 0xba);

  /* digits and everything below them are not letters */
  if (wc < 0x3a)
    return 0;

  /* clearing bit 0x20 maps a-z onto A-Z */
  folded = wc & ~0x20;
  return (0x40 < folded && folded < 0x5b);
}
347
348wint_t towupper (wint_t wc)
349{
350 if (Charset_is_utf8 || charset_is_ja)
351 return towupper_ucs (wc);
352 else
353 return (0 <= wc && wc < 256) ? toupper (wc) : wc;
354}
355
356wint_t towlower (wint_t wc)
357{
358 if (Charset_is_utf8 || charset_is_ja)
359 return towlower_ucs (wc);
360 else
361 return (0 <= wc && wc < 256) ? tolower (wc) : wc;
362}
363
364int iswalnum (wint_t wc)
365{
366 if (Charset_is_utf8 || charset_is_ja)
367 return iswalnum_ucs (wc);
368 else
369 return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
370}
371
372int iswalpha (wint_t wc)
373{
374 if (Charset_is_utf8 || charset_is_ja)
375 return iswalpha_ucs (wc);
376 else
377 return (0 <= wc && wc < 256) ? isalpha (wc) : 0;
378}
379
380int iswupper (wint_t wc)
381{
382 if (Charset_is_utf8 || charset_is_ja)
383 return iswupper_ucs (wc);
384 else
385 return (0 <= wc && wc < 256) ? isupper (wc) : 0;
386}
387
/*
 * l10n for Japanese:
 *   The symbols, Greek and Cyrillic letters that are part of JIS X 0208
 *   (the Japanese Kanji character set) occupy two columns.
 *
 * Returns 2 for a code point inside one of the coarse ranges below and
 * -1 for everything else, which tells the caller to fall back to the
 * normal width lookup.
 */
int wcwidth_ja (wchar_t ucs)
{
  /* rough ranges, good enough for a quick check */
  static const struct
  {
    wchar_t first;
    wchar_t last;
  } wide[] = {
    { 0x00a1, 0x00fe },   /* Latin-1 Supplement */
    { 0x0391, 0x0451 },   /* Greek and Cyrillic */
    { 0x2010, 0x266f },   /* Symbols */
    { 0x3000, 0x3020 }    /* CJK Symbols and Punctuation */
  };
  size_t i;

  if (ucs >= 0x3021)
    return -1; /* continue with the normal check */

  for (i = 0; i < sizeof (wide) / sizeof (wide[0]); i++)
  {
    if (ucs >= wide[i].first && ucs <= wide[i].last)
      return 2;
  }

  return -1;
}
406
407int wcwidth_ucs(wchar_t ucs);
408
409int wcwidth (wchar_t wc)
410{
411 if (!Charset_is_utf8)
412 {
413 if (!charset_is_ja)
414 {
415 /* 8-bit case */
416 if (!wc)
417 return 0;
418 else if ((0 <= wc && wc < 256) && IsPrint (wc))
419 return 1;
420 else
421 return -1;
422 }
423 else
424 {
425 /* Japanese */
426 int k = wcwidth_ja (wc);
427 if (k != -1)
428 return k;
429 }
430 }
431 return wcwidth_ucs (wc);
432}
433
/*
 * Restartable UTF-8 decoder with the calling convention of mbrtowc().
 *
 *  pwc  receives the decoded character; may be NULL
 *  s    input bytes; NULL resets the state and returns 0
 *  n    number of bytes available at s
 *  _ps  conversion state, used as a single wchar_t; NULL selects an
 *       internal static state
 *
 * While a character is incomplete the state holds the bits collected
 * so far, with the number of continuation bytes still expected after
 * the next one kept in the low three bits (those bit positions are
 * only filled in by the final continuation byte).
 *
 * Returns the number of bytes consumed by this call, 0 for NUL,
 * (size_t)-2 if the input ends inside a character and (size_t)-1 with
 * errno set on malformed or overlong input.
 */
size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps)
{
  static wchar_t internal_state;
  wchar_t *state = (wchar_t *)_ps;
  size_t used = 1;
  unsigned char byte;
  wchar_t acc;
  int count;
  int lead_mask;

  if (!state)
    state = &internal_state;

  if (!s)
  {
    *state = 0;
    return 0;
  }

  if (!n)
    return (size_t)-2;

  if (*state)
  {
    /* resume a character started by an earlier call */
    acc = *state & 0x7fffffff;
    count = acc & 7; /* if count > 4 it will be caught below */
  }
  else
  {
    byte = (unsigned char)*s;

    if (byte < 0x80)
    {
      /* plain ASCII */
      if (pwc)
        *pwc = byte;
      return (byte != 0);
    }

    /* 0x80-0xbf are continuation bytes, 0xc0/0xc1 can only start an
     * overlong form and 0xfe/0xff never occur in UTF-8 */
    if (byte < 0xc2 || byte >= 0xfe)
    {
      errno = EILSEQ;
      return (size_t)-1;
    }

    if (byte < 0xe0)
    {
      lead_mask = 0x1f;
      count = 0;
    }
    else if (byte < 0xf0)
    {
      lead_mask = 0x0f;
      count = 1;
    }
    else if (byte < 0xf8)
    {
      lead_mask = 0x07;
      count = 2;
    }
    else if (byte < 0xfc)
    {
      lead_mask = 0x03;
      count = 3;
    }
    else
    {
      lead_mask = 0x01;
      count = 4;
    }

    /* payload bits of the lead byte in their final position, plus the
     * counter in the low bits */
    acc = ((byte & lead_mask) << (6 * (count + 1))) + count;
    ++s;
    --n;
    ++used;
  }

  while (n)
  {
    byte = (unsigned char)*s;

    if (!(0x80 <= byte && byte < 0xc0))
    {
      /* not a continuation byte */
      errno = EILSEQ;
      return (size_t)-1;
    }

    acc |= (byte & 0x3f) << (6 * count);

    if (!count)
    {
      /* that was the last byte of the character */
      if (pwc)
        *pwc = acc;
      *state = 0;
      return acc ? used : 0;
    }

    --count;
    --acc;

    /* after the second byte the value must already exceed what a
     * shorter encoding could hold, otherwise the form is overlong */
    if (!(acc >> (11+count*5)))
    {
      errno = count < 4 ? EILSEQ : EINVAL;
      return (size_t)-1;
    }

    ++s;
    --n;
    ++used;
  }

  /* ran out of input: remember where we are */
  *state = acc;
  return (size_t)-2;
}
520
521#endif /* !HAVE_WC_FUNCS */
522
523wchar_t replacement_char (void)
524{
525 return Charset_is_utf8 ? 0xfffd : '?';
526}
527
/*
 * Report whether wc is one of the code points that this file filters
 * out because they corrupt the display.
 * Returns 1 for such a code point, 0 for anything else.
 */
int is_display_corrupting_utf8 (wchar_t wc)
{
  /* bidi markers: #3827 */
  if (wc == (wchar_t)0x200e || wc == (wchar_t)0x200f)
    return 1;

  /* soft hyphen: #3848 */
  if (wc == (wchar_t)0x00ad)
    return 1;

  /* zero width no-break space */
  if (wc == (wchar_t)0xfeff)
    return 1;

  /* misc directional markers */
  if (wc >= (wchar_t)0x2066 && wc <= (wchar_t)0x2069)
    return 1;

  /* misc directional markers: #3854 */
  if (wc >= (wchar_t)0x202a && wc <= (wchar_t)0x202e)
    return 1;

  return 0;
}
542
543int mutt_filter_unprintable (char **s)
544{
545 BUFFER *b = NULL;
546 wchar_t wc;
547 size_t k, k2;
548 char scratch[MB_LEN_MAX + 1];
549 char *p = *s;
550 mbstate_t mbstate1, mbstate2;
551
552 b = mutt_buffer_new ();
553 memset (&mbstate1, 0, sizeof (mbstate1));
554 memset (&mbstate2, 0, sizeof (mbstate2));
555 for (; (k = mbrtowc (&wc, p, MB_LEN_MAX, &mbstate1)); p += k)
556 {
557 if (k == (size_t)(-1) || k == (size_t)(-2))
558 {
559 k = 1;
560 memset (&mbstate1, 0, sizeof (mbstate1));
561 wc = replacement_char();
562 }
563 if (!IsWPrint (wc))
564 wc = '?';
565 else if (Charset_is_utf8 &&
566 is_display_corrupting_utf8 (wc))
567 continue;
568 k2 = wcrtomb (scratch, wc, &mbstate2);
569 scratch[k2] = '\0';
570 mutt_buffer_addstr (b, scratch);
571 }
572 FREE (s); /* __FREE_CHECKED__ */
573 *s = b->data ? b->data : safe_calloc (1, 1);
574 FREE (&b);
575 return 0;
576}