1 files changed, 662 insertions, 0 deletions
diff --git a/gettext-tools/src/po-charset.c b/gettext-tools/src/po-charset.c
new file mode 100644
index 0000000..4c0dcdb
--- /dev/null
+++ b/gettext-tools/src/po-charset.c
@@ -0,0 +1,662 @@
+/* Charset handling while reading PO files.
+   Copyright (C) 2001-2007, 2010 Free Software Foundation, Inc.
+   Written by Bruno Haible <haible@clisp.cons.org>, 2001.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+#include <alloca.h>
+
+/* Specification.  */
+#include "po-charset.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "xmalloca.h"
+#include "xvasprintf.h"
+#include "po-xerror.h"
+#include "basename.h"
+#include "progname.h"
+#include "c-strstr.h"
+#include "c-strcase.h"
+#include "gettext.h"
+
+#define _(str) gettext (str)
+
+#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
+
+static const char ascii[] = "ASCII";
+
+/* The canonicalized encoding name for ASCII.  */
+const char *po_charset_ascii = ascii;
+
+static const char utf8[] = "UTF-8";
+
+/* The canonicalized encoding name for UTF-8.  */
+const char *po_charset_utf8 = utf8;
+
+/* Canonicalize an encoding name.  */
+const char *
+po_charset_canonicalize (const char *charset)
+{
+  /* The list of charsets supported by glibc's iconv() and by the portable
+     iconv() across platforms.  Taken from intl/config.charset.  */
+  static const char *standard_charsets[] =
+  {
+    ascii, "ANSI_X3.4-1968", "US-ASCII",        /* i = 0..2 */
+    "ISO-8859-1", "ISO_8859-1",                 /* i = 3, 4 */
+    "ISO-8859-2", "ISO_8859-2",
+    "ISO-8859-3", "ISO_8859-3",
+    "ISO-8859-4", "ISO_8859-4",
+    "ISO-8859-5", "ISO_8859-5",
+    "ISO-8859-6", "ISO_8859-6",
+    "ISO-8859-7", "ISO_8859-7",
+    "ISO-8859-8", "ISO_8859-8",
+    "ISO-8859-9", "ISO_8859-9",
+    "ISO-8859-13", "ISO_8859-13",
+    "ISO-8859-14", "ISO_8859-14",
+    "ISO-8859-15", "ISO_8859-15",               /* i = 25, 26 */
+    "KOI8-R",
+    "KOI8-U",
+    "KOI8-T",
+    "CP850",
+    "CP866",
+    "CP874",
+    "CP932",
+    "CP949",
+    "CP950",
+    "CP1250",
+    "CP1251",
+    "CP1252",
+    "CP1253",
+    "CP1254",
+    "CP1255",
+    "CP1256",
+    "CP1257",
+    "GB2312",
+    "EUC-JP",
+    "EUC-KR",
+    "EUC-TW",
+    "BIG5",
+    "BIG5-HKSCS",
+    "GBK",
+    "GB18030",
+    "SHIFT_JIS",
+    "JOHAB",
+    "TIS-620",
+    "VISCII",
+    "GEORGIAN-PS",
+    utf8
+  };
+  size_t i;
+
+  for (i = 0; i < SIZEOF (standard_charsets); i++)
+    if (c_strcasecmp (charset, standard_charsets[i]) == 0)
+      return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
+  return NULL;
+}
+
+/* Test for ASCII compatibility.  */
+bool
+po_charset_ascii_compatible (const char *canon_charset)
+{
+  /* There are only a few exceptions to ASCII compatibility.  */
+  if (strcmp (canon_charset, "SHIFT_JIS") == 0
+      || strcmp (canon_charset, "JOHAB") == 0
+      || strcmp (canon_charset, "VISCII") == 0)
+    return false;
+  else
+    return true;
+}
+
+/* Test for a weird encoding, i.e. an encoding which has double-byte
+   characters ending in 0x5C.  */
+bool po_is_charset_weird (const char *canon_charset)
+{
+  static const char *weird_charsets[] =
+  {
+    "BIG5",
+    "BIG5-HKSCS",
+    "GBK",
+    "GB18030",
+    "SHIFT_JIS",
+    "JOHAB"
+  };
+  size_t i;
+
+  for (i = 0; i < SIZEOF (weird_charsets); i++)
+    if (strcmp (canon_charset, weird_charsets[i]) == 0)
+      return true;
+  return false;
+}
+
+/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
+   An encoding has CJK structure if every valid character stream is composed
+   of single bytes in the range 0x{00..7F} and of byte pairs in the range
+   0x{80..FF}{30..FF}.  */
+bool po_is_charset_weird_cjk (const char *canon_charset)
+{
+  static const char *weird_cjk_charsets[] =
+  {                     /* single bytes   double bytes       */
+    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
+    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
+    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
+    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
+    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
+    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
+  };
+  size_t i;
+
+  for (i = 0; i < SIZEOF (weird_cjk_charsets); i++)
+    if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
+      return true;
+  return false;
+}
+
+/* Hardcoded iterator functions for all kinds of encodings.
+   We could also implement a general iterator function with iconv(),
+   but we need a fast one.  */
+
+/* Character iterator for 8-bit encodings.  */
+static size_t
+char_iterator (const char *s)
+{
+  return 1;
+}
+
+/* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
+/* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
+static size_t
+euc_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if (c >= 0xa1 && c < 0xff)
+    {
+      unsigned char c2 = s[1];
+      if (c2 >= 0xa1 && c2 < 0xff)
+        return 2;
+    }
+  return 1;
+}
+
+/* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.  */
+static size_t
+euc_jp_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if (c >= 0xa1 && c < 0xff)
+    {
+      unsigned char c2 = s[1];
+      if (c2 >= 0xa1 && c2 < 0xff)
+        return 2;
+    }
+  else if (c == 0x8e)
+    {
+      unsigned char c2 = s[1];
+      if (c2 >= 0xa1 && c2 < 0xe0)
+        return 2;
+    }
+  else if (c == 0x8f)
+    {
+      unsigned char c2 = s[1];
+      if (c2 >= 0xa1 && c2 < 0xff)
+        {
+          unsigned char c3 = s[2];
+          if (c3 >= 0xa1 && c3 < 0xff)
+            return 3;
+        }
+    }
+  return 1;
+}
+
+/* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.  */
+static size_t
+euc_tw_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if (c >= 0xa1 && c < 0xff)
+    {
+      unsigned char c2 = s[1];
+      if (c2 >= 0xa1 && c2 < 0xff)
+        return 2;
+    }
+  else if (c == 0x8e)
+    {
+      unsigned char c2 = s[1];
+      if (c2 >= 0xa1 && c2 <= 0xb0)
+        {
+          unsigned char c3 = s[2];
+          if (c3 >= 0xa1 && c3 < 0xff)
+            {
+              unsigned char c4 = s[3];
+              if (c4 >= 0xa1 && c4 < 0xff)
+                return 4;
+            }
+        }
+    }
+  return 1;
+}
+
+/* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.  */
+static size_t
+big5_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if (c >= 0xa1 && c < 0xff)
+    {
+      unsigned char c2 = s[1];
+      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
+        return 2;
+    }
+  return 1;
+}
+
+/* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.  */
+static size_t
+big5hkscs_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if (c >= 0x88 && c < 0xff)
+    {
+      unsigned char c2 = s[1];
+      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
+        return 2;
+    }
+  return 1;
+}
+
+/* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
+   libiconv/lib/gbk.h.  */
+static size_t
+gbk_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if (c >= 0x81 && c < 0xff)
+    {
+      unsigned char c2 = s[1];
+      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
+        return 2;
+    }
+  return 1;
+}
+
+/* Character iterator for GB18030.  See libiconv/lib/gb18030.h.  */
+static size_t
+gb18030_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if (c >= 0x81 && c < 0xff)
+    {
+      unsigned char c2 = s[1];
+      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
+        return 2;
+    }
+  if (c >= 0x81 && c <= 0x84)
+    {
+      unsigned char c2 = s[1];
+      if (c2 >= 0x30 && c2 <= 0x39)
+        {
+          unsigned char c3 = s[2];
+          if (c3 >= 0x81 && c3 < 0xff)
+            {
+              unsigned char c4 = s[3];
+              if (c4 >= 0x30 && c4 <= 0x39)
+                return 4;
+            }
+        }
+    }
+  return 1;
+}
+
+/* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.  */
+static size_t
+shift_jis_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xf9))
+    {
+      unsigned char c2 = s[1];
+      if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc))
+        return 2;
+    }
+  return 1;
+}
+
+/* Character iterator for JOHAB.  See libiconv/lib/johab.h and
+   libiconv/lib/johab_hangul.h.  */
+static size_t
+johab_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if (c >= 0x84 && c <= 0xd3)
+    {
+      unsigned char c2 = s[1];
+      if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff))
+        return 2;
+    }
+  else if (c >= 0xd9 && c <= 0xf9)
+    {
+      unsigned char c2 = s[1];
+      if ((c2 >= 0x31 && c2 <= 0x7e) || (c2 >= 0x91 && c2 <= 0xfe))
+        return 2;
+    }
+  return 1;
+}
+
+/* Character iterator for UTF-8.  See libiconv/lib/utf8.h.  */
+static size_t
+utf8_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if (c >= 0xc2)
+    {
+      if (c < 0xe0)
+        {
+          unsigned char c2 = s[1];
+          if (c2 >= 0x80 && c2 < 0xc0)
+            return 2;
+        }
+      else if (c < 0xf0)
+        {
+          unsigned char c2 = s[1];
+          if (c2 >= 0x80 && c2 < 0xc0)
+            {
+              unsigned char c3 = s[2];
+              if (c3 >= 0x80 && c3 < 0xc0)
+                return 3;
+            }
+        }
+      else if (c < 0xf8)
+        {
+          unsigned char c2 = s[1];
+          if (c2 >= 0x80 && c2 < 0xc0)
+            {
+              unsigned char c3 = s[2];
+              if (c3 >= 0x80 && c3 < 0xc0)
+                {
+                  unsigned char c4 = s[3];
+                  if (c4 >= 0x80 && c4 < 0xc0)
+                    return 4;
+                }
+            }
+        }
+    }
+  return 1;
+}
+
+/* Returns a character iterator for a given encoding.
+   Given a pointer into a string, it returns the number occupied by the next
+   single character.  If the piece of string is not valid or if the *s == '\0',
+   it returns 1.  */
+character_iterator_t
+po_charset_character_iterator (const char *canon_charset)
+{
+  if (canon_charset == utf8)
+    return utf8_character_iterator;
+  if (strcmp (canon_charset, "GB2312") == 0
+      || strcmp (canon_charset, "EUC-KR") == 0)
+    return euc_character_iterator;
+  if (strcmp (canon_charset, "EUC-JP") == 0)
+    return euc_jp_character_iterator;
+  if (strcmp (canon_charset, "EUC-TW") == 0)
+    return euc_tw_character_iterator;
+  if (strcmp (canon_charset, "BIG5") == 0)
+    return big5_character_iterator;
+  if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
+    return big5hkscs_character_iterator;
+  if (strcmp (canon_charset, "GBK") == 0)
+    return gbk_character_iterator;
+  if (strcmp (canon_charset, "GB18030") == 0)
+    return gb18030_character_iterator;
+  if (strcmp (canon_charset, "SHIFT_JIS") == 0)
+    return shift_jis_character_iterator;
+  if (strcmp (canon_charset, "JOHAB") == 0)
+    return johab_character_iterator;
+  return char_iterator;
+}
+
+
+/* The PO file's encoding, as specified in the header entry.  */
+const char *po_lex_charset;
+
+#if HAVE_ICONV
+/* Converter from the PO file's encoding to UTF-8.  */
+iconv_t po_lex_iconv;
+#endif
+/* If no converter is available, some information about the structure of the
+   PO file's encoding.  */
+bool po_lex_weird_cjk;
+
+void
+po_lex_charset_init ()
+{
+  po_lex_charset = NULL;
+#if HAVE_ICONV
+  po_lex_iconv = (iconv_t)(-1);
+#endif
+  po_lex_weird_cjk = false;
+}
+
+void
+po_lex_charset_set (const char *header_entry, const char *filename)
+{
+  /* Verify the validity of CHARSET.  It is necessary
+     1. for the correct treatment of multibyte characters containing
+        0x5C bytes in the PO lexer,
+     2. so that at run time, gettext() can call iconv() to convert
+        msgstr.  */
+  const char *charsetstr = c_strstr (header_entry, "charset=");
+
+  if (charsetstr != NULL)
+    {
+      size_t len;
+      char *charset;
+      const char *canon_charset;
+
+      charsetstr += strlen ("charset=");
+      len = strcspn (charsetstr, " \t\n");
+      charset = (char *) xmalloca (len + 1);
+      memcpy (charset, charsetstr, len);
+      charset[len] = '\0';
+
+      canon_charset = po_charset_canonicalize (charset);
+      if (canon_charset == NULL)
+        {
+          /* Don't warn for POT files, because POT files usually contain
+             only ASCII msgids.  */
+          size_t filenamelen = strlen (filename);
+
+          if (!(filenamelen >= 4
+                && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
+                && strcmp (charset, "CHARSET") == 0))
+            {
+              char *warning_message =
+                xasprintf (_("\
+Charset \"%s\" is not a portable encoding name.\n\
+Message conversion to user's charset might not work.\n"),
+                           charset);
+              po_xerror (PO_SEVERITY_WARNING, NULL,
+                         filename, (size_t)(-1), (size_t)(-1), true,
+                         warning_message);
+              free (warning_message);
+            }
+        }
+      else
+        {
+          const char *envval;
+
+          po_lex_charset = canon_charset;
+#if HAVE_ICONV
+          if (po_lex_iconv != (iconv_t)(-1))
+            iconv_close (po_lex_iconv);
+#endif
+
+          /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
+             don't know about multibyte encodings, and require a spurious
+             backslash after every multibyte character whose last byte is
+             0x5C.  Some programs, like vim, distribute PO files in this
+             broken format.  GNU msgfmt must continue to support this old
+             PO file format when the Makefile requests it.  */
+          envval = getenv ("OLD_PO_FILE_INPUT");
+          if (envval != NULL && *envval != '\0')
+            {
+              /* Assume the PO file is in old format, with extraneous
+                 backslashes.  */
+#if HAVE_ICONV
+              po_lex_iconv = (iconv_t)(-1);
+#endif
+              po_lex_weird_cjk = false;
+            }
+          else
+            {
+              /* Use iconv() to parse multibyte characters.  */
+#if HAVE_ICONV
+              /* Avoid glibc-2.1 bug with EUC-KR.  */
+# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
+     && !defined _LIBICONV_VERSION
+              if (strcmp (po_lex_charset, "EUC-KR") == 0)
+                po_lex_iconv = (iconv_t)(-1);
+              else
+# endif
+              /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
+                 GBK, GB18030.  */
+# if defined __sun && !defined _LIBICONV_VERSION
+              if (   strcmp (po_lex_charset, "GB2312") == 0
+                  || strcmp (po_lex_charset, "EUC-TW") == 0
+                  || strcmp (po_lex_charset, "BIG5") == 0
+                  || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
+                  || strcmp (po_lex_charset, "GBK") == 0
+                  || strcmp (po_lex_charset, "GB18030") == 0)
+                po_lex_iconv = (iconv_t)(-1);
+              else
+# endif
+              po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
+              if (po_lex_iconv == (iconv_t)(-1))
+                {
+                  char *warning_message;
+                  const char *recommendation;
+                  const char *note;
+                  char *whole_message;
+
+                  warning_message =
+                    xasprintf (_("\
+Charset \"%s\" is not supported. %s relies on iconv(),\n\
+and iconv() does not support \"%s\".\n"),
+                               po_lex_charset, basename (program_name),
+                               po_lex_charset);
+
+# if !defined _LIBICONV_VERSION
+                  recommendation = _("\
+Installing GNU libiconv and then reinstalling GNU gettext\n\
+would fix this problem.\n");
+# else
+                  recommendation = "";
+# endif
+
+                  /* Test for a charset which has double-byte characters
+                     ending in 0x5C.  For these encodings, the string parser
+                     is likely to be confused if it can't see the character
+                     boundaries.  */
+                  po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
+                  if (po_is_charset_weird (po_lex_charset)
+                      && !po_lex_weird_cjk)
+                    note = _("Continuing anyway, expect parse errors.");
+                  else
+                    note = _("Continuing anyway.");
+
+                  whole_message =
+                    xasprintf ("%s%s%s\n",
+                               warning_message, recommendation, note);
+
+                  po_xerror (PO_SEVERITY_WARNING, NULL,
+                             filename, (size_t)(-1), (size_t)(-1), true,
+                             whole_message);
+
+                  free (whole_message);
+                  free (warning_message);
+                }
+#else
+              /* Test for a charset which has double-byte characters
+                 ending in 0x5C.  For these encodings, the string parser
+                 is likely to be confused if it can't see the character
+                 boundaries.  */
+              po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
+              if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
+                {
+                  char *warning_message;
+                  const char *recommendation;
+                  const char *note;
+                  char *whole_message;
+
+                  warning_message =
+                    xasprintf (_("\
+Charset \"%s\" is not supported. %s relies on iconv().\n\
+This version was built without iconv().\n"),
+                               po_lex_charset, basename (program_name));
+
+                  recommendation = _("\
+Installing GNU libiconv and then reinstalling GNU gettext\n\
+would fix this problem.\n");
+
+                  note = _("Continuing anyway, expect parse errors.");
+
+                  whole_message =
+                    xasprintf ("%s%s%s\n",
+                               warning_message, recommendation, note);
+
+                  po_xerror (PO_SEVERITY_WARNING, NULL,
+                             filename, (size_t)(-1), (size_t)(-1), true,
+                             whole_message);
+
+                  free (whole_message);
+                  free (warning_message);
+                }
+#endif
+            }
+        }
+      freea (charset);
+    }
+  else
+    {
+      /* Don't warn for POT files, because POT files usually contain
+         only ASCII msgids.  */
+      size_t filenamelen = strlen (filename);
+
+      if (!(filenamelen >= 4
+            && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
+        po_xerror (PO_SEVERITY_WARNING,
+                   NULL, filename, (size_t)(-1), (size_t)(-1), true,
+                   _("\
+Charset missing in header.\n\
+Message conversion to user's charset will not work.\n"));
+    }
+}
+
+void
+po_lex_charset_close ()
+{
+  po_lex_charset = NULL;
+#if HAVE_ICONV
+  if (po_lex_iconv != (iconv_t)(-1))
+    {
+      iconv_close (po_lex_iconv);
+      po_lex_iconv = (iconv_t)(-1);
+    }
+#endif
+  po_lex_weird_cjk = false;
+}