charset.c at master · jcs.org/mutt

jcs.org / mutt
fork atom
mutt stable branch with some hacks
fork atom
mutt / charset.c
at master 679 lines 17 kB view raw
wrap content
Derek Martin Change M_* symbols to MUTT_* 10y ago
5067b384
  1/*
  2 * Copyright (C) 1999-2002,2007 Thomas Roessler <roessler@does-not-exist.org>
  3 *
  4 *     This program is free software; you can redistribute it
  5 *     and/or modify it under the terms of the GNU General Public
  6 *     License as published by the Free Software Foundation; either
  7 *     version 2 of the License, or (at your option) any later
  8 *     version.
  9 *
 10 *     This program is distributed in the hope that it will be
 11 *     useful, but WITHOUT ANY WARRANTY; without even the implied
 12 *     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 13 *     PURPOSE.  See the GNU General Public License for more
 14 *     details.
 15 *
 16 *     You should have received a copy of the GNU General Public
 17 *     License along with this program; if not, write to the Free
 18 *     Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 19 *     Boston, MA  02110-1301, USA.
 20 */
 21
 22#if HAVE_CONFIG_H
 23# include "config.h"
 24#endif
 25
 26#include <string.h>
 27#include <stdio.h>
 28#include <stdlib.h>
 29
 30#include <ctype.h>
 31
 32#include <sys/types.h>
 33#include <dirent.h>
 34#include <unistd.h>
 35#include <errno.h>
 36
 37#include "mutt.h"
 38#include "charset.h"
 39
 40#ifndef EILSEQ
 41# define EILSEQ EINVAL
 42#endif
 43
 44/* 
 45 * The following list has been created manually from the data under:
 46 * http://www.isi.edu/in-notes/iana/assignments/character-sets
 47 * Last update: 2000-09-07
 48 *
 49 * Note that it includes only the subset of character sets for which
 50 * a preferred MIME name is given.
 51 */
 52
 53static const struct 
 54{
 55  const char *key;
 56  const char *pref;
 57}
 58PreferredMIMENames[] = 
 59{
 60  { "ansi_x3.4-1968", 	"us-ascii"     	},
 61  { "iso-ir-6",		"us-ascii" 	},
 62  { "iso_646.irv:1991",	"us-ascii" 	},
 63  { "ascii",		"us-ascii" 	},
 64  { "iso646-us",	"us-ascii" 	},
 65  { "us",		"us-ascii" 	},
 66  { "ibm367",		"us-ascii" 	},
 67  { "cp367",		"us-ascii" 	},
 68  { "csASCII",		"us-ascii" 	},
 69  
 70  { "csISO2022KR",	"iso-2022-kr" 	},
 71  { "csEUCKR",		"euc-kr"      	},
 72  { "csISO2022JP",	"iso-2022-jp"	},
 73  { "csISO2022JP2",	"iso-2022-jp-2" },
 74
 75  { "ISO_8859-1:1987",	"iso-8859-1"	},
 76  { "iso-ir-100",	"iso-8859-1"	},
 77  { "iso_8859-1",	"iso-8859-1"	},
 78  { "latin1",		"iso-8859-1"	},
 79  { "l1",		"iso-8859-1"	},
 80  { "IBM819",		"iso-8859-1"	},
 81  { "CP819",		"iso-8859-1"	},
 82  { "csISOLatin1",	"iso-8859-1"	},
 83  
 84  { "ISO_8859-2:1987",	"iso-8859-2"	},
 85  { "iso-ir-101",	"iso-8859-2"	},
 86  { "iso_8859-2",	"iso-8859-2"	},
 87  { "latin2",		"iso-8859-2"	},
 88  { "l2",		"iso-8859-2"	},
 89  { "csISOLatin2",	"iso-8859-2"	},
 90  
 91  { "ISO_8859-3:1988",	"iso-8859-3"	},
 92  { "iso-ir-109",	"iso-8859-3"	},
 93  { "ISO_8859-3",	"iso-8859-3"	},
 94  { "latin3",		"iso-8859-3"	},
 95  { "l3",		"iso-8859-3"	},
 96  { "csISOLatin3",	"iso-8859-3"	},
 97
 98  { "ISO_8859-4:1988",	"iso-8859-4"	},
 99  { "iso-ir-110",	"iso-8859-4"	},
100  { "ISO_8859-4",	"iso-8859-4"	},
101  { "latin4",		"iso-8859-4"	},
102  { "l4",		"iso-8859-4"	},
103  { "csISOLatin4",	"iso-8859-4"	},
104
105  { "ISO_8859-6:1987",	"iso-8859-6"	},
106  { "iso-ir-127",	"iso-8859-6"	},
107  { "iso_8859-6",	"iso-8859-6"	},
108  { "ECMA-114",		"iso-8859-6"	},
109  { "ASMO-708",		"iso-8859-6"	},
110  { "arabic",		"iso-8859-6"	},
111  { "csISOLatinArabic",	"iso-8859-6"	},
112  
113  { "ISO_8859-7:1987",	"iso-8859-7"	},
114  { "iso-ir-126",	"iso-8859-7"	},
115  { "ISO_8859-7",	"iso-8859-7"	},
116  { "ELOT_928",		"iso-8859-7"	},
117  { "ECMA-118",		"iso-8859-7"	},
118  { "greek",		"iso-8859-7"	},
119  { "greek8",		"iso-8859-7"	},
120  { "csISOLatinGreek",	"iso-8859-7"	},
121  
122  { "ISO_8859-8:1988",	"iso-8859-8"	},
123  { "iso-ir-138",	"iso-8859-8"	},
124  { "ISO_8859-8",	"iso-8859-8"	},
125  { "hebrew",		"iso-8859-8"	},
126  { "csISOLatinHebrew",	"iso-8859-8"	},
127
128  { "ISO_8859-5:1988",	"iso-8859-5"	},
129  { "iso-ir-144",	"iso-8859-5"	},
130  { "ISO_8859-5",	"iso-8859-5"	},
131  { "cyrillic",		"iso-8859-5"	},
132  { "csISOLatinCyrillic", "iso-8859-5"	},
133
134  { "ISO_8859-9:1989",	"iso-8859-9"	},
135  { "iso-ir-148",	"iso-8859-9"	},
136  { "ISO_8859-9",	"iso-8859-9"	},
137  { "latin5",		"iso-8859-9"	}, /* this is not a bug */
138  { "l5",		"iso-8859-9"	},
139  { "csISOLatin5",	"iso-8859-9"	},
140  
141  { "ISO_8859-10:1992",	"iso-8859-10"	},
142  { "iso-ir-157",	"iso-8859-10"	},
143  { "latin6",		"iso-8859-10"	}, /* this is not a bug */
144  { "l6",		"iso-8859-10"	},
145  { "csISOLatin6",	"iso-8859-10"	}, 
146  
147  { "csKOI8r",		"koi8-r"	},
148  
149  { "MS_Kanji",		"Shift_JIS"	}, /* Note the underscore! */
150  { "csShiftJis",	"Shift_JIS"	},
151  
152  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
153      			"euc-jp"	},
154  { "csEUCPkdFmtJapanese", 
155      			"euc-jp"	},
156  
157  { "csGB2312",		"gb2312"	},
158  { "csbig5",		"big5"		},
159
160  /* 
161   * End of official brain damage.  What follows has been taken
162   * from glibc's localedata files. 
163   */
164
165  { "iso_8859-13",	"iso-8859-13"	},
166  { "iso-ir-179",	"iso-8859-13"	},
167  { "latin7",		"iso-8859-13"	}, /* this is not a bug */
168  { "l7",		"iso-8859-13"	},
169  
170  { "iso_8859-14",	"iso-8859-14"	},
171  { "latin8",		"iso-8859-14"	}, /* this is not a bug */
172  { "l8",		"iso-8859-14"	},
173
174  { "iso_8859-15",	"iso-8859-15"	},
175  { "latin9",		"iso-8859-15"	}, /* this is not a bug */
176
177  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
178  { "latin0",           "iso-8859-15"   }, /* this is not a bug */
179  
180  { "iso_8859-16",      "iso-8859-16"   },
181  { "latin10",          "iso-8859-16"   }, /* this is not a bug */
182  
183  /* 
184   * David Champion <dgc@uchicago.edu> has observed this with
185   * nl_langinfo under SunOS 5.8. 
186   */
187  
188  { "646",		"us-ascii"	},
189  
190  /* 
191   * http://www.sun.com/software/white-papers/wp-unicode/
192   */
193
194  { "eucJP",		"euc-jp"	},
195  { "PCK",		"Shift_JIS"	},
196  { "ko_KR-euc",	"euc-kr"	},
197  { "zh_TW-big5",	"big5"		},
198
199  /* seems to be common on some systems */
200
201  { "sjis",		"Shift_JIS"	},
202  { "euc-jp-ms",	"eucJP-ms"	},
203
204
205  /*
206   * If you happen to encounter system-specific brain-damage with
207   * respect to character set naming, please add it above this
208   * comment, and submit a patch to <mutt-dev@mutt.org>. 
209   */
210  
211  /* End of aliases.  Please keep this line last. */
212  
213  { NULL, 		NULL		}
214};
215
216#ifdef HAVE_LANGINFO_CODESET
217# include <langinfo.h>
218
219
220void mutt_set_langinfo_charset (void)
221{
222  char buff[LONG_STRING];
223  char buff2[LONG_STRING];
224  
225  strfcpy (buff, nl_langinfo (CODESET), sizeof (buff));
226  mutt_canonical_charset (buff2, sizeof (buff2), buff);
227  
228  /* finally, set $charset */
229  if (!(Charset = safe_strdup (buff2)))
230    Charset = safe_strdup ("iso-8859-1");
231}
232
233#else
234
235void mutt_set_langinfo_charset (void)
236{
237  Charset = safe_strdup ("iso-8859-1");
238}
239
240#endif
241
242/* this first ties off any charset extension such as //TRANSLIT,
243   canonicalizes the charset and re-adds the extension */
244void mutt_canonical_charset (char *dest, size_t dlen, const char *name)
245{
246  size_t i;
247  char *p, *ext;
248  char in[LONG_STRING], scratch[LONG_STRING];
249
250  strfcpy (in, name, sizeof (in));
251  if ((ext = strchr (in, '/')))
252    *ext++ = 0;
253
254  if (!ascii_strcasecmp (in, "utf-8") || !ascii_strcasecmp (in, "utf8"))
255  {
256    strfcpy (dest, "utf-8", dlen);
257    goto out;
258  }
259
260  /* catch some common iso-8859-something misspellings */
261  if (!ascii_strncasecmp (in, "8859", 4) && in[4] != '-')
262    snprintf (scratch, sizeof (scratch), "iso-8859-%s", in +4);
263  else if (!ascii_strncasecmp (in, "8859-", 5))
264    snprintf (scratch, sizeof (scratch), "iso-8859-%s", in + 5);
265  else if (!ascii_strncasecmp (in, "iso8859", 7) && in[7] != '-')
266    snprintf (scratch, sizeof (scratch), "iso_8859-%s", in + 7);
267  else if (!ascii_strncasecmp (in, "iso8859-", 8))
268    snprintf (scratch, sizeof (scratch), "iso_8859-%s", in + 8);
269  else
270    strfcpy (scratch, in, sizeof (scratch));
271
272  for (i = 0; PreferredMIMENames[i].key; i++)
273    if (!ascii_strcasecmp (scratch, PreferredMIMENames[i].key) ||
274	!mutt_strcasecmp (scratch, PreferredMIMENames[i].key))
275    {
276      strfcpy (dest, PreferredMIMENames[i].pref, dlen);
277      goto out;
278    }
279
280  strfcpy (dest, scratch, dlen);
281
282  /* for cosmetics' sake, transform to lowercase. */
283  for (p = dest; *p; p++)
284    *p = ascii_tolower (*p);
285
286out:
287  if (ext && *ext)
288  {
289    safe_strcat (dest, dlen, "/");
290    safe_strcat (dest, dlen, ext);
291  }
292}
293
294int mutt_chscmp (const char *s, const char *chs)
295{
296  char buffer[STRING];
297  int a, b;
298
299  if (!s) return 0;
300
301  /* charsets may have extensions mutt_canonical_charset()
302     leaves intact; we expect `chs' to originate from mutt
303     code, not user input (i.e. `chs' does _not_ have any
304     extension)
305     we simply check if the shorter string is a prefix for
306     the longer */
307  mutt_canonical_charset (buffer, sizeof (buffer), s);
308  a = mutt_strlen (buffer);
309  b = mutt_strlen (chs);
310  return !ascii_strncasecmp (a > b ? buffer : chs,
311			     a > b ? chs : buffer, MIN(a,b));
312}
313
314char *mutt_get_default_charset ()
315{
316  static char fcharset[SHORT_STRING];
317  const char *c = AssumedCharset;
318  const char *c1;
319
320  if (c && *c) {
321    c1 = strchr (c, ':');
322    strfcpy (fcharset, c, c1 ? (c1 - c + 1) : sizeof (fcharset));
323    return fcharset;
324  }
325  return strcpy (fcharset, "us-ascii"); /* __STRCPY_CHECKED__ */
326}
327
328#ifndef HAVE_ICONV
329
330iconv_t iconv_open (const char *tocode, const char *fromcode)
331{
332  return (iconv_t)(-1);
333}
334
335size_t iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
336	      char **outbuf, size_t *outbytesleft)
337{
338  return 0;
339}
340
341int iconv_close (iconv_t cd)
342{
343  return 0;
344}
345
346#endif /* !HAVE_ICONV */
347
348
349/*
350 * Like iconv_open, but canonicalises the charsets, applies
351 * charset-hooks, recanonicalises, and finally applies iconv-hooks.
352 * Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM
353 * applies them to fromcode. Callers should use flags=0 when fromcode
354 * can safely be considered true, either some constant, or some value
355 * provided by the user; MUTT_ICONV_HOOK_FROM should be used only when
356 * fromcode is unsure, taken from a possibly wrong incoming MIME label,
357 * or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions
358 * in some setups. Note: By design charset-hooks should never be, and
359 * are never, applied to tocode. Highlight note: The top-well-named
360 * MUTT_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.
361 */
362
363iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags)
364{
365  char tocode1[SHORT_STRING];
366  char fromcode1[SHORT_STRING];
367  char *tocode2, *fromcode2;
368  char *tmp;
369
370  iconv_t cd;
371
372  /* transform to MIME preferred charset names */
373  mutt_canonical_charset (tocode1, sizeof (tocode1), tocode);
374  mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode);
375
376  /* maybe apply charset-hooks and recanonicalise fromcode,
377   * but only when caller asked us to sanitize a potentialy wrong
378   * charset name incoming from the wild exterior. */
379  if ((flags & MUTT_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1)))
380    mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp);
381
382  /* always apply iconv-hooks to suit system's iconv tastes */
383  tocode2 = mutt_iconv_hook (tocode1);
384  tocode2 = (tocode2) ? tocode2 : tocode1;
385  fromcode2 = mutt_iconv_hook (fromcode1);
386  fromcode2 = (fromcode2) ? fromcode2 : fromcode1;
387
388  /* call system iconv with names it appreciates */
389  if ((cd = iconv_open (tocode2, fromcode2)) != (iconv_t) -1)
390    return cd;
391  
392  return (iconv_t) -1;
393}
394
395
396/*
397 * Like iconv, but keeps going even when the input is invalid
398 * If you're supplying inrepls, the source charset should be stateless;
399 * if you're supplying an outrepl, the target charset should be.
400 */
401
402size_t mutt_iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
403		   char **outbuf, size_t *outbytesleft,
404		   ICONV_CONST char **inrepls, const char *outrepl)
405{
406  size_t ret = 0, ret1;
407  ICONV_CONST char *ib = *inbuf;
408  size_t ibl = *inbytesleft;
409  char *ob = *outbuf;
410  size_t obl = *outbytesleft;
411
412  for (;;)
413  {
414    ret1 = iconv (cd, &ib, &ibl, &ob, &obl);
415    if (ret1 != (size_t)-1)
416      ret += ret1;
417    if (ibl && obl && errno == EILSEQ)
418    {
419      if (inrepls)
420      {
421	/* Try replacing the input */
422	ICONV_CONST char **t;
423	for (t = inrepls; *t; t++)
424	{
425	  ICONV_CONST char *ib1 = *t;
426	  size_t ibl1 = strlen (*t);
427	  char *ob1 = ob;
428	  size_t obl1 = obl;
429	  iconv (cd, &ib1, &ibl1, &ob1, &obl1);
430	  if (!ibl1)
431	  {
432	    ++ib, --ibl;
433	    ob = ob1, obl = obl1;
434	    ++ret;
435	    break;
436	  }
437	}
438	if (*t)
439	  continue;
440      }
441      /* Replace the output */
442      if (!outrepl)
443	outrepl = "?";
444      iconv (cd, 0, 0, &ob, &obl);
445      if (obl)
446      {
447	int n = strlen (outrepl);
448	if (n > obl)
449	{
450	  outrepl = "?";
451	  n = 1;
452	}
453	memcpy (ob, outrepl, n);
454	++ib, --ibl;
455	ob += n, obl -= n;
456	++ret;
457	iconv (cd, 0, 0, 0, 0); /* for good measure */
458	continue;
459      }
460    }
461    *inbuf = ib, *inbytesleft = ibl;
462    *outbuf = ob, *outbytesleft = obl;
463    return ret;
464  }
465}
466
467
468/*
469 * Convert a string
470 * Used in rfc2047.c, rfc2231.c, crypt-gpgme.c, mutt_idna.c, and more.
471 * Parameter flags is given as-is to mutt_iconv_open(). See there
472 * for its meaning and usage policy.
473 */
474
475int mutt_convert_string (char **ps, const char *from, const char *to, int flags)
476{
477  iconv_t cd;
478  ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 };
479  char *s = *ps;
480
481  if (!s || !*s)
482    return 0;
483
484  if (to && from && (cd = mutt_iconv_open (to, from, flags)) != (iconv_t)-1)
485  {
486    int len;
487    ICONV_CONST char *ib;
488    char *buf, *ob;
489    size_t ibl, obl;
490    ICONV_CONST char **inrepls = 0;
491    char *outrepl = 0;
492
493    if (mutt_is_utf8 (to))
494      outrepl = "\357\277\275";
495    else if (mutt_is_utf8 (from))
496      inrepls = repls;
497    else
498      outrepl = "?";
499      
500    len = strlen (s);
501    ib = s, ibl = len + 1;
502    obl = MB_LEN_MAX * ibl;
503    ob = buf = safe_malloc (obl + 1);
504    
505    mutt_iconv (cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
506    iconv_close (cd);
507
508    *ob = '\0';
509
510    FREE (ps);		/* __FREE_CHECKED__ */
511    *ps = buf;
512    
513    mutt_str_adjust (ps);
514    return 0;
515  }
516  else
517    return -1;
518}
519
520
521/*
522 * FGETCONV stuff for converting a file while reading it
523 * Used in sendlib.c for converting from mutt's Charset
524 */
525
526struct fgetconv_s
527{
528  FILE *file;
529  iconv_t cd;
530  char bufi[512];
531  char bufo[512];
532  char *p;
533  char *ob;
534  char *ib;
535  size_t ibl;
536  ICONV_CONST char **inrepls;
537};
538
539struct fgetconv_not
540{
541  FILE *file;
542  iconv_t cd;
543};
544
545/*
546 * Parameter flags is given as-is to mutt_iconv_open(). See there
547 * for its meaning and usage policy.
548 */
549FGETCONV *fgetconv_open (FILE *file, const char *from, const char *to, int flags)
550{
551  struct fgetconv_s *fc;
552  iconv_t cd = (iconv_t)-1;
553  static ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 };
554
555  if (from && to)
556    cd = mutt_iconv_open (to, from, flags);
557
558  if (cd != (iconv_t)-1)
559  {
560    fc = safe_malloc (sizeof (struct fgetconv_s));
561    fc->p = fc->ob = fc->bufo;
562    fc->ib = fc->bufi;
563    fc->ibl = 0;
564    fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1;
565  }
566  else
567    fc = safe_malloc (sizeof (struct fgetconv_not));
568  fc->file = file;
569  fc->cd = cd;
570  return (FGETCONV *)fc;
571}
572
573char *fgetconvs (char *buf, size_t l, FGETCONV *_fc)
574{
575  int c;
576  size_t r;
577  
578  for (r = 0; r + 1 < l;)
579  {
580    if ((c = fgetconv (_fc)) == EOF)
581      break;
582    buf[r++] = (char) c;
583    if (c == '\n') 
584      break;
585  }
586  buf[r] = '\0';
587  
588  if (r) 
589    return buf;
590  else 
591    return NULL;
592}
593
594int fgetconv (FGETCONV *_fc)
595{
596  struct fgetconv_s *fc = (struct fgetconv_s *)_fc;
597
598  if (!fc)
599    return EOF;
600  if (fc->cd == (iconv_t)-1)
601    return fgetc (fc->file);
602  if (!fc->p)
603    return EOF;
604  if (fc->p < fc->ob)
605    return (unsigned char)*(fc->p)++;
606
607  /* Try to convert some more */
608  fc->p = fc->ob = fc->bufo;
609  if (fc->ibl)
610  {
611    size_t obl = sizeof (fc->bufo);
612    iconv (fc->cd, (ICONV_CONST char **)&fc->ib, &fc->ibl, &fc->ob, &obl);
613    if (fc->p < fc->ob)
614      return (unsigned char)*(fc->p)++;
615  }
616
617  /* If we trusted iconv a bit more, we would at this point
618   * ask why it had stopped converting ... */
619
620  /* Try to read some more */
621  if (fc->ibl == sizeof (fc->bufi) ||
622      (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof (fc->bufi)))
623  {
624    fc->p = 0;
625    return EOF;
626  }
627  if (fc->ibl)
628    memcpy (fc->bufi, fc->ib, fc->ibl);
629  fc->ib = fc->bufi;
630  fc->ibl += fread (fc->ib + fc->ibl, 1, sizeof (fc->bufi) - fc->ibl, fc->file);
631
632  /* Try harder this time to convert some */
633  if (fc->ibl)
634  {
635    size_t obl = sizeof (fc->bufo);
636    mutt_iconv (fc->cd, (ICONV_CONST char **)&fc->ib, &fc->ibl, &fc->ob, &obl,
637		fc->inrepls, 0);
638    if (fc->p < fc->ob)
639      return (unsigned char)*(fc->p)++;
640  }
641
642  /* Either the file has finished or one of the buffers is too small */
643  fc->p = 0;
644  return EOF;
645}
646
647void fgetconv_close (FGETCONV **_fc)
648{
649  struct fgetconv_s *fc = (struct fgetconv_s *) *_fc;
650
651  if (fc->cd != (iconv_t)-1)
652    iconv_close (fc->cd);
653  FREE (_fc);		/* __FREE_CHECKED__ */
654}
655
656int mutt_check_charset (const char *s, int strict)
657{
658  int i;
659  iconv_t cd;
660
661  if (mutt_is_utf8 (s))
662    return 0;
663
664  if (!strict)
665    for (i = 0; PreferredMIMENames[i].key; i++)
666    {
667      if (ascii_strcasecmp (PreferredMIMENames[i].key, s) == 0 ||
668	  ascii_strcasecmp (PreferredMIMENames[i].pref, s) == 0)
669	return 0;
670    }
671
672  if ((cd = mutt_iconv_open (s, s, 0)) != (iconv_t)(-1))
673  {
674    iconv_close (cd);
675    return 0;
676  }
677
678  return -1;
679}