mutt stable branch with some hacks
at jcs 576 lines 13 kB view raw
1/* 2 * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org> 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 17 */ 18 19/* 20 * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>. 21 */ 22 23#if HAVE_CONFIG_H 24# include "config.h" 25#endif 26 27#include "mutt.h" 28#include "mbyte.h" 29#include "charset.h" 30 31#include <errno.h> 32 33#include <ctype.h> 34 35#ifndef EILSEQ 36#define EILSEQ EINVAL 37#endif 38 39int Charset_is_utf8 = 0; 40#ifndef HAVE_WC_FUNCS 41static int charset_is_ja = 0; 42static iconv_t charset_to_utf8 = (iconv_t)(-1); 43static iconv_t charset_from_utf8 = (iconv_t)(-1); 44#endif 45 46void mutt_set_charset (char *charset) 47{ 48 char buffer[STRING]; 49 50 mutt_canonical_charset (buffer, sizeof (buffer), charset); 51 52 Charset_is_utf8 = 0; 53#ifndef HAVE_WC_FUNCS 54 charset_is_ja = 0; 55 if (charset_to_utf8 != (iconv_t)(-1)) 56 { 57 iconv_close (charset_to_utf8); 58 charset_to_utf8 = (iconv_t)(-1); 59 } 60 if (charset_from_utf8 != (iconv_t)(-1)) 61 { 62 iconv_close (charset_from_utf8); 63 charset_from_utf8 = (iconv_t)(-1); 64 } 65#endif 66 67 if (mutt_is_utf8 (buffer)) 68 Charset_is_utf8 = 1; 69#ifndef HAVE_WC_FUNCS 70 else if (!ascii_strcasecmp(buffer, "euc-jp") || !ascii_strcasecmp(buffer, "shift_jis") 71 || !ascii_strcasecmp(buffer, "cp932") || !ascii_strcasecmp(buffer, "eucJP-ms")) 72 { 73 charset_is_ja = 1; 74 75 /* Note flags=0 to skip charset-hooks: User masters the $charset 76 * name, and we are sure of our "utf-8" constant. So there is no 77 * possibility of wrong name that we would want to try to correct 78 * with a charset-hook. Or rather: If $charset was wrong, we would 79 * want to try to correct... $charset directly. 80 */ 81 charset_to_utf8 = mutt_iconv_open ("utf-8", charset, 0); 82 charset_from_utf8 = mutt_iconv_open (charset, "utf-8", 0); 83 } 84#endif 85 86#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS) 87 bind_textdomain_codeset(PACKAGE, buffer); 88#endif 89} 90 91#ifndef HAVE_WC_FUNCS 92 93/* 94 * For systems that don't have them, we provide here our own 95 * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth(). 96 * Instead of using the locale, as these functions normally would, 97 * we use Mutt's Charset variable. We support 3 types of charset: 98 * (1) For 8-bit charsets, wchar_t uses the same encoding as char. 99 * (2) For UTF-8, wchar_t uses UCS. 100 * (3) For stateless Japanese encodings, we use UCS and convert 101 * via UTF-8 using iconv. 102 * Unfortunately, we can't handle non-stateless encodings. 103 */ 104 105static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd) 106{ 107 char buf[MB_LEN_MAX+1]; 108 ICONV_CONST char *ib; 109 char *ob; 110 size_t ibl, obl, r; 111 112 if (s) 113 { 114 ibl = mutt_wctoutf8 (buf, wc, sizeof (buf)); 115 if (ibl == (size_t)(-1)) 116 return (size_t)(-1); 117 ib = buf; 118 ob = s; 119 obl = MB_LEN_MAX; 120 r = iconv (cd, &ib, &ibl, &ob, &obl); 121 } 122 else 123 { 124 ib = ""; 125 ibl = 1; 126 ob = buf; 127 obl = sizeof (buf); 128 r = iconv (cd, &ib, &ibl, &ob, &obl); 129 } 130 return ob - s; 131} 132 133size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps) 134{ 135 /* We only handle stateless encodings, so we can ignore ps. */ 136 137 if (Charset_is_utf8) 138 return mutt_wctoutf8 (s, wc, MB_LEN_MAX); 139 else if (charset_from_utf8 != (iconv_t)(-1)) 140 return wcrtomb_iconv (s, wc, charset_from_utf8); 141 else 142 { 143 if (!s) 144 return 1; 145 if (wc < 0x100) 146 { 147 *s = wc; 148 return 1; 149 } 150 errno = EILSEQ; 151 return (size_t)(-1); 152 } 153} 154 155size_t mbrtowc_iconv (wchar_t *pwc, const char *s, size_t n, 156 mbstate_t *ps, iconv_t cd) 157{ 158 static mbstate_t mbstate; 159 ICONV_CONST char *ib, *ibmax; 160 char *ob, *t; 161 size_t ibl, obl, k, r; 162 char bufi[8], bufo[6]; 163 164 if (!n) 165 return (size_t)(-2); 166 167 t = memchr (ps, 0, sizeof (*ps)); 168 k = t ? (t - (char *)ps) : sizeof (*ps); 169 if (k > sizeof (bufi)) 170 k = 0; 171 if (k) 172 { 173 /* use the buffer for input */ 174 memcpy (bufi, ps, k); 175 ib = bufi; 176 ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi)); 177 memcpy (bufi + k, s, ibmax - bufi - k); 178 } 179 else 180 { 181 /* use the real input */ 182 ib = (ICONV_CONST char*) s; 183 ibmax = (ICONV_CONST char*) s + n; 184 } 185 186 ob = bufo; 187 obl = sizeof (bufo); 188 ibl = 1; 189 190 for (;;) 191 { 192 r = iconv (cd, &ib, &ibl, &ob, &obl); 193 if (ob > bufo && (!k || ib > bufi + k)) 194 { 195 /* we have a character */ 196 memset (ps, 0, sizeof (*ps)); 197 utf8rtowc (pwc, bufo, ob - bufo, &mbstate); 198 return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0; 199 } 200 else if (!r || (r == (size_t)(-1) && errno == EINVAL)) 201 { 202 if (ib + ibl < ibmax) 203 /* try using more input */ 204 ++ibl; 205 else if (k && ib > bufi + k && bufi + k + n > ibmax) 206 { 207 /* switch to using real input */ 208 ib = (ICONV_CONST char*) s + (ib - bufi - k); 209 ibmax = (ICONV_CONST char*) s + n; 210 k = 0; 211 ++ibl; 212 } 213 else 214 { 215 /* save the state and give up */ 216 memset (ps, 0, sizeof (*ps)); 217 if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */ 218 memcpy (ps, ib, ibl); 219 return (size_t)(-2); 220 } 221 } 222 else 223 { 224 /* bad input */ 225 errno = EILSEQ; 226 return (size_t)(-1); 227 } 228 } 229} 230 231size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) 232{ 233 static mbstate_t mbstate; 234 235 if (!ps) 236 ps = &mbstate; 237 238 if (Charset_is_utf8) 239 return utf8rtowc (pwc, s, n, ps); 240 else if (charset_to_utf8 != (iconv_t)(-1)) 241 return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8); 242 else 243 { 244 if (!s) 245 { 246 memset(ps, 0, sizeof(*ps)); 247 return 0; 248 } 249 if (!n) 250 return (size_t)-2; 251 if (pwc) 252 *pwc = (wchar_t)(unsigned char)*s; 253 return (*s != 0); 254 } 255} 256 257int iswprint (wint_t wc) 258{ 259 if (Charset_is_utf8 || charset_is_ja) 260 return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc); 261 else 262 return (0 <= wc && wc < 256) ? IsPrint (wc) : 0; 263} 264 265int iswspace (wint_t wc) 266{ 267 if (Charset_is_utf8 || charset_is_ja) 268 return (9 <= wc && wc <= 13) || wc == 32; 269 else 270 return (0 <= wc && wc < 256) ? isspace (wc) : 0; 271} 272 273static wint_t towupper_ucs (wint_t x) 274{ 275 /* Only works for x < 0x130 */ 276 if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7)) 277 return x - 32; 278 else if (0x100 <= x && x < 0x130) 279 return x & ~1; 280 else if (x == 0xb5) 281 return 0x39c; 282 else if (x == 0xff) 283 return 0x178; 284 else 285 return x; 286} 287 288static int iswupper_ucs (wint_t x) 289{ 290 /* Only works for x < 0x130 */ 291 if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7)) 292 return 0; 293 else if ((0x40 < x && x < 0x5b) || (0xbf < x && x < 0xde)) 294 return 1; 295 else if (0x100 <= x && x < 0x130) 296 return 1; 297 else if (x == 0xb5) 298 return 1; 299 else if (x == 0xff) 300 return 0; 301 else 302 return 0; 303} 304 305static wint_t towlower_ucs (wint_t x) 306{ 307 /* Only works for x < 0x130 */ 308 if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7)) 309 return x + 32; 310 else if (0x100 <= x && x < 0x130) 311 return x | 1; 312 else 313 return x; 314} 315 316static int iswalnum_ucs (wint_t wc) 317{ 318 /* Only works for x < 0x220 */ 319 if (wc >= 0x100) 320 return 1; 321 else if (wc < 0x30) 322 return 0; 323 else if (wc < 0x3a) 324 return 1; 325 else if (wc < 0xa0) 326 return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b); 327 else if (wc < 0xc0) 328 return (wc == 0xaa || wc == 0xb5 || wc == 0xba); 329 else 330 return !(wc == 0xd7 || wc == 0xf7); 331} 332 333static int iswalpha_ucs (wint_t wc) 334{ 335 /* Only works for x < 0x220 */ 336 if (wc >= 0x100) 337 return 1; 338 else if (wc < 0x3a) 339 return 0; 340 else if (wc < 0xa0) 341 return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b); 342 else if (wc < 0xc0) 343 return (wc == 0xaa || wc == 0xb5 || wc == 0xba); 344 else 345 return !(wc == 0xd7 || wc == 0xf7); 346} 347 348wint_t towupper (wint_t wc) 349{ 350 if (Charset_is_utf8 || charset_is_ja) 351 return towupper_ucs (wc); 352 else 353 return (0 <= wc && wc < 256) ? toupper (wc) : wc; 354} 355 356wint_t towlower (wint_t wc) 357{ 358 if (Charset_is_utf8 || charset_is_ja) 359 return towlower_ucs (wc); 360 else 361 return (0 <= wc && wc < 256) ? tolower (wc) : wc; 362} 363 364int iswalnum (wint_t wc) 365{ 366 if (Charset_is_utf8 || charset_is_ja) 367 return iswalnum_ucs (wc); 368 else 369 return (0 <= wc && wc < 256) ? isalnum (wc) : 0; 370} 371 372int iswalpha (wint_t wc) 373{ 374 if (Charset_is_utf8 || charset_is_ja) 375 return iswalpha_ucs (wc); 376 else 377 return (0 <= wc && wc < 256) ? isalpha (wc) : 0; 378} 379 380int iswupper (wint_t wc) 381{ 382 if (Charset_is_utf8 || charset_is_ja) 383 return iswupper_ucs (wc); 384 else 385 return (0 <= wc && wc < 256) ? isupper (wc) : 0; 386} 387 388/* 389 * l10n for Japanese: 390 * Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji 391 * Character Set, have a column width of 2. 392 */ 393int wcwidth_ja (wchar_t ucs) 394{ 395 if (ucs >= 0x3021) 396 return -1; /* continue with the normal check */ 397 /* a rough range for quick check */ 398 if ((ucs >= 0x00a1 && ucs <= 0x00fe) || /* Latin-1 Supplement */ 399 (ucs >= 0x0391 && ucs <= 0x0451) || /* Greek and Cyrillic */ 400 (ucs >= 0x2010 && ucs <= 0x266f) || /* Symbols */ 401 (ucs >= 0x3000 && ucs <= 0x3020)) /* CJK Symbols and Punctuation */ 402 return 2; 403 else 404 return -1; 405} 406 407int wcwidth_ucs(wchar_t ucs); 408 409int wcwidth (wchar_t wc) 410{ 411 if (!Charset_is_utf8) 412 { 413 if (!charset_is_ja) 414 { 415 /* 8-bit case */ 416 if (!wc) 417 return 0; 418 else if ((0 <= wc && wc < 256) && IsPrint (wc)) 419 return 1; 420 else 421 return -1; 422 } 423 else 424 { 425 /* Japanese */ 426 int k = wcwidth_ja (wc); 427 if (k != -1) 428 return k; 429 } 430 } 431 return wcwidth_ucs (wc); 432} 433 434size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps) 435{ 436 static wchar_t mbstate; 437 wchar_t *ps = (wchar_t *)_ps; 438 size_t k = 1; 439 unsigned char c; 440 wchar_t wc; 441 int count; 442 443 if (!ps) 444 ps = &mbstate; 445 446 if (!s) 447 { 448 *ps = 0; 449 return 0; 450 } 451 if (!n) 452 return (size_t)-2; 453 454 if (!*ps) 455 { 456 c = (unsigned char)*s; 457 if (c < 0x80) 458 { 459 if (pwc) 460 *pwc = c; 461 return (c != 0); 462 } 463 else if (c < 0xc2) 464 { 465 errno = EILSEQ; 466 return (size_t)-1; 467 } 468 else if (c < 0xe0) 469 wc = ((c & 0x1f) << 6) + (count = 0); 470 else if (c < 0xf0) 471 wc = ((c & 0x0f) << 12) + (count = 1); 472 else if (c < 0xf8) 473 wc = ((c & 0x07) << 18) + (count = 2); 474 else if (c < 0xfc) 475 wc = ((c & 0x03) << 24) + (count = 3); 476 else if (c < 0xfe) 477 wc = ((c & 0x01) << 30) + (count = 4); 478 else 479 { 480 errno = EILSEQ; 481 return (size_t)-1; 482 } 483 ++s, --n, ++k; 484 } 485 else 486 { 487 wc = *ps & 0x7fffffff; 488 count = wc & 7; /* if count > 4 it will be caught below */ 489 } 490 491 for (; n; ++s, --n, ++k) 492 { 493 c = (unsigned char)*s; 494 if (0x80 <= c && c < 0xc0) 495 { 496 wc |= (c & 0x3f) << (6 * count); 497 if (!count) 498 { 499 if (pwc) 500 *pwc = wc; 501 *ps = 0; 502 return wc ? k : 0; 503 } 504 --count, --wc; 505 if (!(wc >> (11+count*5))) 506 { 507 errno = count < 4 ? EILSEQ : EINVAL; 508 return (size_t)-1; 509 } 510 } 511 else 512 { 513 errno = EILSEQ; 514 return (size_t)-1; 515 } 516 } 517 *ps = wc; 518 return (size_t)-2; 519} 520 521#endif /* !HAVE_WC_FUNCS */ 522 523wchar_t replacement_char (void) 524{ 525 return Charset_is_utf8 ? 0xfffd : '?'; 526} 527 528int is_display_corrupting_utf8 (wchar_t wc) 529{ 530 if (wc == (wchar_t)0x200f || /* bidi markers: #3827 */ 531 wc == (wchar_t)0x200e || 532 wc == (wchar_t)0x00ad || /* soft hyphen: #3848 */ 533 wc == (wchar_t)0xfeff || /* zero width no-break space */ 534 (wc >= (wchar_t)0x2066 && /* misc directional markers */ 535 wc <= (wchar_t)0x2069) || 536 (wc >= (wchar_t)0x202a && /* misc directional markers: #3854 */ 537 wc <= (wchar_t)0x202e)) 538 return 1; 539 else 540 return 0; 541} 542 543int mutt_filter_unprintable (char **s) 544{ 545 BUFFER *b = NULL; 546 wchar_t wc; 547 size_t k, k2; 548 char scratch[MB_LEN_MAX + 1]; 549 char *p = *s; 550 mbstate_t mbstate1, mbstate2; 551 552 b = mutt_buffer_new (); 553 memset (&mbstate1, 0, sizeof (mbstate1)); 554 memset (&mbstate2, 0, sizeof (mbstate2)); 555 for (; (k = mbrtowc (&wc, p, MB_LEN_MAX, &mbstate1)); p += k) 556 { 557 if (k == (size_t)(-1) || k == (size_t)(-2)) 558 { 559 k = 1; 560 memset (&mbstate1, 0, sizeof (mbstate1)); 561 wc = replacement_char(); 562 } 563 if (!IsWPrint (wc)) 564 wc = '?'; 565 else if (Charset_is_utf8 && 566 is_display_corrupting_utf8 (wc)) 567 continue; 568 k2 = wcrtomb (scratch, wc, &mbstate2); 569 scratch[k2] = '\0'; 570 mutt_buffer_addstr (b, scratch); 571 } 572 FREE (s); /* __FREE_CHECKED__ */ 573 *s = b->data ? b->data : safe_calloc (1, 1); 574 FREE (&b); 575 return 0; 576}