mbyte.c at jcs · jcs.org/mutt

jcs.org / mutt
fork atom
mutt stable branch with some hacks
fork atom
mutt / mbyte.c
at jcs 576 lines 13 kB view raw
wrap content
Kevin McCarthy Clean up code indentation. 7y ago
248c2ee8
  1/*
  2 * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
  3 *
  4 *     This program is free software; you can redistribute it and/or modify
  5 *     it under the terms of the GNU General Public License as published by
  6 *     the Free Software Foundation; either version 2 of the License, or
  7 *     (at your option) any later version.
  8 *
  9 *     This program is distributed in the hope that it will be useful,
 10 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 *     GNU General Public License for more details.
 13 *
 14 *     You should have received a copy of the GNU General Public License
 15 *     along with this program; if not, write to the Free Software
 16 *     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 17 */
 18
 19/*
 20 * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
 21 */
 22
 23#if HAVE_CONFIG_H
 24# include "config.h"
 25#endif
 26
 27#include "mutt.h"
 28#include "mbyte.h"
 29#include "charset.h"
 30
 31#include <errno.h>
 32
 33#include <ctype.h>
 34
 35#ifndef EILSEQ
 36#define EILSEQ EINVAL
 37#endif
 38
 39int Charset_is_utf8 = 0;
 40#ifndef HAVE_WC_FUNCS
 41static int charset_is_ja = 0;
 42static iconv_t charset_to_utf8 = (iconv_t)(-1);
 43static iconv_t charset_from_utf8 = (iconv_t)(-1);
 44#endif
 45
 46void mutt_set_charset (char *charset)
 47{
 48  char buffer[STRING];
 49
 50  mutt_canonical_charset (buffer, sizeof (buffer), charset);
 51
 52  Charset_is_utf8 = 0;
 53#ifndef HAVE_WC_FUNCS
 54  charset_is_ja = 0;
 55  if (charset_to_utf8 != (iconv_t)(-1))
 56  {
 57    iconv_close (charset_to_utf8);
 58    charset_to_utf8 = (iconv_t)(-1);
 59  }
 60  if (charset_from_utf8 != (iconv_t)(-1))
 61  {
 62    iconv_close (charset_from_utf8);
 63    charset_from_utf8 = (iconv_t)(-1);
 64  }
 65#endif
 66
 67  if (mutt_is_utf8 (buffer))
 68    Charset_is_utf8 = 1;
 69#ifndef HAVE_WC_FUNCS
 70  else if (!ascii_strcasecmp(buffer, "euc-jp") || !ascii_strcasecmp(buffer, "shift_jis")
 71           || !ascii_strcasecmp(buffer, "cp932") || !ascii_strcasecmp(buffer, "eucJP-ms"))
 72  {
 73    charset_is_ja = 1;
 74
 75    /* Note flags=0 to skip charset-hooks: User masters the $charset
 76     * name, and we are sure of our "utf-8" constant. So there is no
 77     * possibility of wrong name that we would want to try to correct
 78     * with a charset-hook. Or rather: If $charset was wrong, we would
 79     * want to try to correct... $charset directly.
 80     */
 81    charset_to_utf8 = mutt_iconv_open ("utf-8", charset, 0);
 82    charset_from_utf8 = mutt_iconv_open (charset, "utf-8", 0);
 83  }
 84#endif
 85
 86#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
 87  bind_textdomain_codeset(PACKAGE, buffer);
 88#endif
 89}
 90
 91#ifndef HAVE_WC_FUNCS
 92
 93/*
 94 * For systems that don't have them, we provide here our own
 95 * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
 96 * Instead of using the locale, as these functions normally would,
 97 * we use Mutt's Charset variable. We support 3 types of charset:
 98 * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
 99 * (2) For UTF-8, wchar_t uses UCS.
100 * (3) For stateless Japanese encodings, we use UCS and convert
101 *     via UTF-8 using iconv.
102 * Unfortunately, we can't handle non-stateless encodings.
103 */
104
105static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
106{
107  char buf[MB_LEN_MAX+1];
108  ICONV_CONST char *ib;
109  char *ob;
110  size_t ibl, obl, r;
111
112  if (s)
113  {
114    ibl = mutt_wctoutf8 (buf, wc, sizeof (buf));
115    if (ibl == (size_t)(-1))
116      return (size_t)(-1);
117    ib = buf;
118    ob = s;
119    obl = MB_LEN_MAX;
120    r = iconv (cd, &ib, &ibl, &ob, &obl);
121  }
122  else
123  {
124    ib = "";
125    ibl = 1;
126    ob = buf;
127    obl = sizeof (buf);
128    r = iconv (cd, &ib, &ibl, &ob, &obl);
129  }
130  return ob - s;
131}
132
133size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
134{
135  /* We only handle stateless encodings, so we can ignore ps. */
136
137  if (Charset_is_utf8)
138    return mutt_wctoutf8 (s, wc, MB_LEN_MAX);
139  else if (charset_from_utf8 != (iconv_t)(-1))
140    return wcrtomb_iconv (s, wc, charset_from_utf8);
141  else
142  {
143    if (!s)
144      return 1;
145    if (wc < 0x100)
146    {
147      *s = wc;
148      return 1;
149    }
150    errno = EILSEQ;
151    return (size_t)(-1);
152  }
153}
154
155size_t mbrtowc_iconv (wchar_t *pwc, const char *s, size_t n,
156		      mbstate_t *ps, iconv_t cd)
157{
158  static mbstate_t mbstate;
159  ICONV_CONST char *ib, *ibmax;
160  char *ob, *t;
161  size_t ibl, obl, k, r;
162  char bufi[8], bufo[6];
163
164  if (!n)
165    return (size_t)(-2);
166
167  t = memchr (ps, 0, sizeof (*ps));
168  k = t ? (t - (char *)ps) : sizeof (*ps);
169  if (k > sizeof (bufi))
170    k = 0;
171  if (k)
172  {
173    /* use the buffer for input */
174    memcpy (bufi, ps, k);
175    ib = bufi;
176    ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
177    memcpy (bufi + k, s, ibmax - bufi - k);
178  }
179  else
180  {
181    /* use the real input */
182    ib = (ICONV_CONST char*) s;
183    ibmax = (ICONV_CONST char*) s + n;
184  }
185
186  ob = bufo;
187  obl = sizeof (bufo);
188  ibl = 1;
189
190  for (;;)
191  {
192    r = iconv (cd, &ib, &ibl, &ob, &obl);
193    if (ob > bufo && (!k || ib > bufi + k))
194    {
195      /* we have a character */
196      memset (ps, 0, sizeof (*ps));
197      utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
198      return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
199    }
200    else if (!r || (r == (size_t)(-1) && errno == EINVAL))
201    {
202      if (ib + ibl < ibmax)
203	/* try using more input */
204	++ibl;
205      else if (k && ib > bufi + k && bufi + k + n > ibmax)
206      {
207	/* switch to using real input */
208	ib = (ICONV_CONST char*) s + (ib - bufi - k);
209	ibmax = (ICONV_CONST char*) s + n;
210	k = 0;
211	++ibl;
212      }
213      else
214      {
215	/* save the state and give up */
216	memset (ps, 0, sizeof (*ps));
217	if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */
218	  memcpy (ps, ib, ibl);
219	return (size_t)(-2);
220      }
221    }
222    else
223    {
224      /* bad input */
225      errno = EILSEQ;
226      return (size_t)(-1);
227    }
228  }
229}
230
231size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
232{
233  static mbstate_t mbstate;
234
235  if (!ps)
236    ps = &mbstate;
237
238  if (Charset_is_utf8)
239    return utf8rtowc (pwc, s, n, ps);
240  else if (charset_to_utf8 != (iconv_t)(-1))
241    return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
242  else
243  {
244    if (!s)
245    {
246      memset(ps, 0, sizeof(*ps));
247      return 0;
248    }
249    if (!n)
250      return (size_t)-2;
251    if (pwc)
252      *pwc = (wchar_t)(unsigned char)*s;
253    return (*s != 0);
254  }
255}
256
257int iswprint (wint_t wc)
258{
259  if (Charset_is_utf8 || charset_is_ja)
260    return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
261  else
262    return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
263}
264
265int iswspace (wint_t wc)
266{
267  if (Charset_is_utf8 || charset_is_ja)
268    return (9 <= wc && wc <= 13) || wc == 32;
269  else
270    return (0 <= wc && wc < 256) ? isspace (wc) : 0;
271}
272
/*
 * Upper-case mapping for UCS code points.  Only works for x < 0x130.
 *
 * The cases below are mutually exclusive, so their order does not matter:
 *   0xb5            -> 0x39c
 *   0xff            -> 0x178
 *   0x100..0x12f    -> low bit cleared
 *   0x61..0x7a and 0xe0..0xfe (except 0xf7) -> x - 32
 * Everything else is returned unchanged.
 */
static wint_t towupper_ucs (wint_t x)
{
  if (x == 0xb5)
    return 0x39c;

  if (x == 0xff)
    return 0x178;

  if (0x100 <= x && x < 0x130)
    return x & ~1;

  if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
    return x - 32;

  return x;
}
287
/*
 * Upper-case test for UCS code points.  Only works for x < 0x130.
 *
 * The ranges mirror the ones towlower_ucs() actually changes, so a code
 * point is reported as upper case exactly when it has a distinct
 * lower-case form in that table:
 *   0x41..0x5a                      ASCII capitals
 *   0xc0..0xde except 0xd7          Latin-1 capitals (0xd7 is the
 *                                   multiplication sign)
 *   even values in 0x100..0x12f     Latin Extended-A capitals; the odd
 *                                   values are their lower-case partners
 * Everything else, including 0xb5 (micro sign) and 0xff, is not upper
 * case.
 *
 * Returns 1 for upper case, 0 otherwise.
 */
static int iswupper_ucs (wint_t x)
{
  if (0x40 < x && x < 0x5b)
    return 1;

  if (0xc0 <= x && x < 0xdf && x != 0xd7)
    return 1;

  if (0x100 <= x && x < 0x130)
    return !(x & 1);

  return 0;
}
304
/*
 * Lower-case mapping for UCS code points.  Only works for x < 0x130.
 *
 *   0x100..0x12f                           -> low bit set
 *   0x41..0x5a and 0xc0..0xde (except 0xd7) -> x + 32
 * Everything else is returned unchanged.
 */
static wint_t towlower_ucs (wint_t x)
{
  if (0x100 <= x && x < 0x130)
    return x | 1;

  if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
    return x + 32;

  return x;
}
315
/*
 * Alphanumeric test for UCS code points.  Only works for x < 0x220.
 *
 * Checked from the top of the range downwards:
 *   >= 0x100      always alphanumeric
 *   0xc0..0xff    alphanumeric except 0xd7 and 0xf7
 *   0xa0..0xbf    only 0xaa, 0xb5 and 0xba
 *   0x3a..0x9f    ASCII letters only
 *   below 0x3a    the digits 0x30..0x39
 */
static int iswalnum_ucs (wint_t wc)
{
  wint_t folded;

  if (wc >= 0x100)
    return 1;

  if (wc >= 0xc0)
    return (wc != 0xd7 && wc != 0xf7);

  if (wc >= 0xa0)
    return (wc == 0xaa || wc == 0xb5 || wc == 0xba);

  if (wc >= 0x3a)
  {
    /* clearing bit 0x20 folds a..z onto A..Z */
    folded = wc & ~0x20;
    return (0x40 < folded && folded < 0x5b);
  }

  return (wc >= 0x30);
}
332
/*
 * Alphabetic test for UCS code points.  Only works for x < 0x220.
 *
 * Checked from the top of the range downwards:
 *   >= 0x100      always alphabetic
 *   0xc0..0xff    alphabetic except 0xd7 and 0xf7
 *   0xa0..0xbf    only 0xaa, 0xb5 and 0xba
 *   0x3a..0x9f    ASCII letters only
 *   below 0x3a    never alphabetic
 */
static int iswalpha_ucs (wint_t wc)
{
  wint_t folded;

  if (wc >= 0x100)
    return 1;

  if (wc >= 0xc0)
    return (wc != 0xd7 && wc != 0xf7);

  if (wc >= 0xa0)
    return (wc == 0xaa || wc == 0xb5 || wc == 0xba);

  if (wc >= 0x3a)
  {
    /* clearing bit 0x20 folds a..z onto A..Z */
    folded = wc & ~0x20;
    return (0x40 < folded && folded < 0x5b);
  }

  return 0;
}
347
348wint_t towupper (wint_t wc)
349{
350  if (Charset_is_utf8 || charset_is_ja)
351    return towupper_ucs (wc);
352  else
353    return (0 <= wc && wc < 256) ? toupper (wc) : wc;
354}
355
356wint_t towlower (wint_t wc)
357{
358  if (Charset_is_utf8 || charset_is_ja)
359    return towlower_ucs (wc);
360  else
361    return (0 <= wc && wc < 256) ? tolower (wc) : wc;
362}
363
364int iswalnum (wint_t wc)
365{
366  if (Charset_is_utf8 || charset_is_ja)
367    return iswalnum_ucs (wc);
368  else
369    return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
370}
371
372int iswalpha (wint_t wc)
373{
374  if (Charset_is_utf8 || charset_is_ja)
375    return iswalpha_ucs (wc);
376  else
377    return (0 <= wc && wc < 256) ? isalpha (wc) : 0;
378}
379
380int iswupper (wint_t wc)
381{
382  if (Charset_is_utf8 || charset_is_ja)
383    return iswupper_ucs (wc);
384  else
385    return (0 <= wc && wc < 256) ? isupper (wc) : 0;
386}
387
/*
 * Japanese localisation of the column-width lookup:
 *   symbols, Greek and Cyrillic characters that appear in JIS X 0208
 *   (the Japanese Kanji character set) occupy two columns.
 *
 * Returns 2 for code points inside the coarse ranges below, and -1 for
 * everything else, which tells the caller to continue with the normal
 * width check.  Code points from 0x3021 upwards always yield -1.
 */
int wcwidth_ja (wchar_t ucs)
{
  int in_wide_range;

  if (ucs >= 0x3021)
    return -1; /* left to the normal check */

  /* deliberately rough ranges, chosen for a quick test */
  in_wide_range =
    (0x00a1 <= ucs && ucs <= 0x00fe) ||   /* Latin-1 Supplement */
    (0x0391 <= ucs && ucs <= 0x0451) ||   /* Greek and Cyrillic */
    (0x2010 <= ucs && ucs <= 0x266f) ||   /* Symbols */
    (0x3000 <= ucs && ucs <= 0x3020);     /* CJK Symbols and Punctuation */

  return in_wide_range ? 2 : -1;
}
406
407int wcwidth_ucs(wchar_t ucs);
408
409int wcwidth (wchar_t wc)
410{
411  if (!Charset_is_utf8)
412  {
413    if (!charset_is_ja)
414    {
415      /* 8-bit case */
416      if (!wc)
417	return 0;
418      else if ((0 <= wc && wc < 256) && IsPrint (wc))
419	return 1;
420      else
421	return -1;
422    }
423    else
424    {
425      /* Japanese */
426      int k = wcwidth_ja (wc);
427      if (k != -1)
428	return k;
429    }
430  }
431  return wcwidth_ucs (wc);
432}
433
/*
 * Restartable UTF-8 decoder with an mbrtowc()-like interface.
 *
 *   pwc  receives the decoded character; may be NULL.
 *   s    input bytes; NULL clears the conversion state and returns 0.
 *   n    number of bytes available at s.
 *   _ps  conversion state, accessed here as a single wchar_t; a
 *        function-private static is used when it is NULL.
 *
 * The state word holds the partially assembled character with the number
 * of continuation bytes still owed, minus one, kept in its low bits.
 *
 * Returns the number of bytes consumed by this call, 0 for a decoded NUL,
 * (size_t)-2 when the input ends inside a sequence (progress is saved in
 * the state), or (size_t)-1 with errno set for malformed input.
 */
size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps)
{
  static wchar_t private_state;
  wchar_t *state = (wchar_t *)_ps;
  size_t consumed = 1;
  unsigned char byte;
  wchar_t acc;
  int pending;

  if (!state)
    state = &private_state;

  if (!s)
  {
    *state = 0;
    return 0;
  }
  if (n == 0)
    return (size_t)-2;

  if (*state)
  {
    /* resume a sequence started by an earlier call */
    acc = *state & 0x7fffffff;
    pending = acc & 7; /* a value above 4 is rejected in the loop below */
  }
  else
  {
    byte = (unsigned char)*s;

    if (byte < 0x80)
    {
      /* plain ASCII, complete in one byte */
      if (pwc)
        *pwc = byte;
      return byte ? 1 : 0;
    }

    if (byte < 0xc2 || byte >= 0xfe)
    {
      /* not a valid lead byte */
      errno = EILSEQ;
      return (size_t)-1;
    }

    /* lead byte: keep its payload bits and the continuation count */
    if (byte < 0xe0)
    {
      pending = 0;
      acc = ((byte & 0x1f) << 6) + pending;
    }
    else if (byte < 0xf0)
    {
      pending = 1;
      acc = ((byte & 0x0f) << 12) + pending;
    }
    else if (byte < 0xf8)
    {
      pending = 2;
      acc = ((byte & 0x07) << 18) + pending;
    }
    else if (byte < 0xfc)
    {
      pending = 3;
      acc = ((byte & 0x03) << 24) + pending;
    }
    else
    {
      pending = 4;
      acc = ((byte & 0x01) << 30) + pending;
    }

    s++;
    n--;
    consumed++;
  }

  while (n)
  {
    byte = (unsigned char)*s;

    if (byte < 0x80 || byte >= 0xc0)
    {
      /* expected a continuation byte */
      errno = EILSEQ;
      return (size_t)-1;
    }

    acc |= (byte & 0x3f) << (6 * pending);
    if (pending == 0)
    {
      /* last continuation byte: the character is complete */
      if (pwc)
        *pwc = acc;
      *state = 0;
      return acc ? consumed : 0;
    }

    /* one fewer byte owed; the count in acc's low bits drops with it */
    pending--;
    acc--;
    if (!(acc >> (11 + pending * 5)))
    {
      errno = pending < 4 ? EILSEQ : EINVAL;
      return (size_t)-1;
    }

    s++;
    n--;
    consumed++;
  }

  /* ran out of input mid-sequence: remember how far we got */
  *state = acc;
  return (size_t)-2;
}
520
521#endif /* !HAVE_WC_FUNCS */
522
523wchar_t replacement_char (void)
524{
525  return Charset_is_utf8 ? 0xfffd : '?';
526}
527
/*
 * Report whether wc is one of the code points this file treats as
 * corrupting the display.
 *
 * Returns 1 for:
 *   0x200e, 0x200f    bidi markers (#3827)
 *   0x00ad            soft hyphen (#3848)
 *   0xfeff            zero width no-break space
 *   0x2066..0x2069    misc directional markers
 *   0x202a..0x202e    misc directional markers (#3854)
 * and 0 for everything else.
 */
int is_display_corrupting_utf8 (wchar_t wc)
{
  if (wc == (wchar_t)0x200e || wc == (wchar_t)0x200f)
    return 1;

  if (wc == (wchar_t)0x00ad || wc == (wchar_t)0xfeff)
    return 1;

  if ((wchar_t)0x2066 <= wc && wc <= (wchar_t)0x2069)
    return 1;

  if ((wchar_t)0x202a <= wc && wc <= (wchar_t)0x202e)
    return 1;

  return 0;
}
542
543int mutt_filter_unprintable (char **s)
544{
545  BUFFER *b = NULL;
546  wchar_t wc;
547  size_t k, k2;
548  char scratch[MB_LEN_MAX + 1];
549  char *p = *s;
550  mbstate_t mbstate1, mbstate2;
551
552  b = mutt_buffer_new ();
553  memset (&mbstate1, 0, sizeof (mbstate1));
554  memset (&mbstate2, 0, sizeof (mbstate2));
555  for (; (k = mbrtowc (&wc, p, MB_LEN_MAX, &mbstate1)); p += k)
556  {
557    if (k == (size_t)(-1) || k == (size_t)(-2))
558    {
559      k = 1;
560      memset (&mbstate1, 0, sizeof (mbstate1));
561      wc = replacement_char();
562    }
563    if (!IsWPrint (wc))
564      wc = '?';
565    else if (Charset_is_utf8 &&
566             is_display_corrupting_utf8 (wc))
567      continue;
568    k2 = wcrtomb (scratch, wc, &mbstate2);
569    scratch[k2] = '\0';
570    mutt_buffer_addstr (b, scratch);
571  }
572  FREE (s);  /* __FREE_CHECKED__ */
573  *s = b->data ? b->data : safe_calloc (1, 1);
574  FREE (&b);
575  return 0;
576}