summaryrefslogtreecommitdiff
path: root/gettext-tools/src/po-charset.c
diff options
context:
space:
mode:
Diffstat (limited to 'gettext-tools/src/po-charset.c')
-rw-r--r--gettext-tools/src/po-charset.c662
1 files changed, 662 insertions, 0 deletions
diff --git a/gettext-tools/src/po-charset.c b/gettext-tools/src/po-charset.c
new file mode 100644
index 0000000..4c0dcdb
--- /dev/null
+++ b/gettext-tools/src/po-charset.c
@@ -0,0 +1,662 @@
+/* Charset handling while reading PO files.
+ Copyright (C) 2001-2007, 2010 Free Software Foundation, Inc.
+ Written by Bruno Haible <haible@clisp.cons.org>, 2001.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+#include <alloca.h>
+
+/* Specification. */
+#include "po-charset.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "xmalloca.h"
+#include "xvasprintf.h"
+#include "po-xerror.h"
+#include "basename.h"
+#include "progname.h"
+#include "c-strstr.h"
+#include "c-strcase.h"
+#include "gettext.h"
+
+#define _(str) gettext (str)
+
+/* Number of elements of the array A.  A must be a real array, not a
+   pointer.  The argument is parenthesized in the expansion so that any
+   array-valued expression can be passed safely.  */
+#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
+
+static const char ascii[] = "ASCII";
+
+/* The canonicalized encoding name for ASCII.  It points to the same
+ storage that po_charset_canonicalize returns for every ASCII alias. */
+const char *po_charset_ascii = ascii;
+
+static const char utf8[] = "UTF-8";
+
+/* The canonicalized encoding name for UTF-8.  It points to the same
+ storage that po_charset_canonicalize returns for "UTF-8", and that
+ po_charset_character_iterator compares its argument against. */
+const char *po_charset_utf8 = utf8;
+
+/* Canonicalize an encoding name.
+   CHARSET is compared case-insensitively against the table of known
+   names.  The return value is the canonical spelling (a pointer into
+   static storage) or NULL if CHARSET is not one of the known names.  */
+const char *
+po_charset_canonicalize (const char *charset)
+{
+  /* The list of charsets supported by glibc's iconv() and by the portable
+     iconv() across platforms.  Taken from intl/config.charset.
+     Layout: entries 0..2 are aliases of ASCII; entries 3..26 are pairs
+     (canonical name, alias); every later entry is its own canonical
+     name.  */
+  static const char *standard_charsets[] =
+  {
+    ascii, "ANSI_X3.4-1968", "US-ASCII",        /* i = 0..2 */
+    "ISO-8859-1", "ISO_8859-1",                 /* i = 3, 4 */
+    "ISO-8859-2", "ISO_8859-2",
+    "ISO-8859-3", "ISO_8859-3",
+    "ISO-8859-4", "ISO_8859-4",
+    "ISO-8859-5", "ISO_8859-5",
+    "ISO-8859-6", "ISO_8859-6",
+    "ISO-8859-7", "ISO_8859-7",
+    "ISO-8859-8", "ISO_8859-8",
+    "ISO-8859-9", "ISO_8859-9",
+    "ISO-8859-13", "ISO_8859-13",
+    "ISO-8859-14", "ISO_8859-14",
+    "ISO-8859-15", "ISO_8859-15",               /* i = 25, 26 */
+    "KOI8-R",
+    "KOI8-U",
+    "KOI8-T",
+    "CP850",
+    "CP866",
+    "CP874",
+    "CP932",
+    "CP949",
+    "CP950",
+    "CP1250",
+    "CP1251",
+    "CP1252",
+    "CP1253",
+    "CP1254",
+    "CP1255",
+    "CP1256",
+    "CP1257",
+    "GB2312",
+    "EUC-JP",
+    "EUC-KR",
+    "EUC-TW",
+    "BIG5",
+    "BIG5-HKSCS",
+    "GBK",
+    "GB18030",
+    "SHIFT_JIS",
+    "JOHAB",
+    "TIS-620",
+    "VISCII",
+    "GEORGIAN-PS",
+    utf8
+  };
+  size_t pos = 0;
+
+  while (pos < SIZEOF (standard_charsets))
+    {
+      if (c_strcasecmp (charset, standard_charsets[pos]) == 0)
+        {
+          size_t canon;
+
+          if (pos < 3)
+            /* Any ASCII alias maps to the first entry.  */
+            canon = 0;
+          else if (pos < 27)
+            /* In a (name, alias) pair, step back from the alias at the
+               even index to the name at the odd index.  */
+            canon = pos - ((pos - 3) % 2);
+          else
+            canon = pos;
+          return standard_charsets[canon];
+        }
+      pos++;
+    }
+  return NULL;
+}
+
+/* Test for ASCII compatibility.
+   CANON_CHARSET is a canonical encoding name.  Returns false for the
+   encodings listed below and true for everything else.  */
+bool
+po_charset_ascii_compatible (const char *canon_charset)
+{
+  /* There are only a few exceptions to ASCII compatibility.  */
+  static const char *const incompatible[] =
+  {
+    "SHIFT_JIS",
+    "JOHAB",
+    "VISCII"
+  };
+  size_t k;
+
+  for (k = 0; k < sizeof incompatible / sizeof incompatible[0]; k++)
+    {
+      if (strcmp (canon_charset, incompatible[k]) == 0)
+        return false;
+    }
+  return true;
+}
+
+/* Test for a weird encoding, i.e. an encoding which has double-byte
+   characters ending in 0x5C.
+   CANON_CHARSET is a canonical encoding name; the comparison is exact.  */
+bool
+po_is_charset_weird (const char *canon_charset)
+{
+  static const char *weird_charsets[] =
+  {
+    "BIG5",
+    "BIG5-HKSCS",
+    "GBK",
+    "GB18030",
+    "SHIFT_JIS",
+    "JOHAB"
+  };
+  const char **entry = weird_charsets;
+  const char **table_end = weird_charsets + SIZEOF (weird_charsets);
+
+  while (entry != table_end)
+    {
+      if (strcmp (canon_charset, *entry) == 0)
+        return true;
+      entry++;
+    }
+  return false;
+}
+
+/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
+   An encoding has CJK structure if every valid character stream is composed
+   of single bytes in the range 0x{00..7F} and of byte pairs in the range
+   0x{80..FF}{30..FF}.
+   CANON_CHARSET is a canonical encoding name; the comparison is exact.  */
+bool
+po_is_charset_weird_cjk (const char *canon_charset)
+{
+  static const char *weird_cjk_charsets[] =
+  {                     /* single bytes   double bytes       */
+    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
+    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
+    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
+    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
+    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
+    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
+  };
+  const char **entry = weird_cjk_charsets;
+  const char **table_end = weird_cjk_charsets + SIZEOF (weird_cjk_charsets);
+
+  while (entry != table_end)
+    {
+      if (strcmp (canon_charset, *entry) == 0)
+        return true;
+      entry++;
+    }
+  return false;
+}
+
+/* Hardcoded iterator functions for all kinds of encodings.
+ We could also implement a general iterator function with iconv(),
+ but we need a fast one. */
+
+/* Character iterator for 8-bit encodings: every character occupies
+   exactly one byte, so the contents of S are never inspected.  */
+static size_t
+char_iterator (const char *s)
+{
+  (void) s;
+  return 1;
+}
+
+/* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
+/* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
+/* Returns 2 when S starts with two bytes that are both in 0xA1..0xFE,
+   otherwise 1.  The second byte is only read after the first one has
+   been accepted.  */
+static size_t
+euc_character_iterator (const char *s)
+{
+  unsigned char lead = (unsigned char) s[0];
+  unsigned char trail;
+
+  if (lead < 0xa1 || lead > 0xfe)
+    return 1;
+
+  trail = (unsigned char) s[1];
+  return (trail >= 0xa1 && trail <= 0xfe) ? 2 : 1;
+}
+
+/* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.
+   Recognized forms (anything else counts as a single byte):
+     0x{A1..FE}{A1..FE}          2 bytes
+     0x8E{A1..DF}                2 bytes
+     0x8F{A1..FE}{A1..FE}        3 bytes
+   Later bytes are only read after the earlier ones have been accepted.  */
+static size_t
+euc_jp_character_iterator (const char *s)
+{
+  const unsigned char *p = (const unsigned char *) s;
+  unsigned char lead = p[0];
+
+  if (lead == 0x8e)
+    return (p[1] >= 0xa1 && p[1] <= 0xdf) ? 2 : 1;
+
+  if (lead == 0x8f)
+    {
+      if (p[1] >= 0xa1 && p[1] <= 0xfe
+          && p[2] >= 0xa1 && p[2] <= 0xfe)
+        return 3;
+      return 1;
+    }
+
+  if (lead >= 0xa1 && lead <= 0xfe
+      && p[1] >= 0xa1 && p[1] <= 0xfe)
+    return 2;
+
+  return 1;
+}
+
+/* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.
+   Recognized forms (anything else counts as a single byte):
+     0x{A1..FE}{A1..FE}                  2 bytes
+     0x8E{A1..B0}{A1..FE}{A1..FE}        4 bytes
+   Later bytes are only read after the earlier ones have been accepted.  */
+static size_t
+euc_tw_character_iterator (const char *s)
+{
+  const unsigned char *p = (const unsigned char *) s;
+  unsigned char lead = p[0];
+
+  if (lead == 0x8e)
+    {
+      if (p[1] >= 0xa1 && p[1] <= 0xb0
+          && p[2] >= 0xa1 && p[2] <= 0xfe
+          && p[3] >= 0xa1 && p[3] <= 0xfe)
+        return 4;
+      return 1;
+    }
+
+  if (lead >= 0xa1 && lead <= 0xfe
+      && p[1] >= 0xa1 && p[1] <= 0xfe)
+    return 2;
+
+  return 1;
+}
+
+/* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.
+   Returns 2 for a lead byte in 0xA1..0xFE followed by a trail byte in
+   0x40..0x7E or 0xA1..0xFE, otherwise 1.  */
+static size_t
+big5_character_iterator (const char *s)
+{
+  unsigned char lead = (unsigned char) s[0];
+  unsigned char trail;
+
+  if (!(lead >= 0xa1 && lead <= 0xfe))
+    return 1;
+
+  trail = (unsigned char) s[1];
+  if (trail >= 0x40 && trail <= 0x7e)
+    return 2;
+  if (trail >= 0xa1 && trail <= 0xfe)
+    return 2;
+  return 1;
+}
+
+/* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.
+   Returns 2 for a lead byte in 0x88..0xFE followed by a trail byte in
+   0x40..0x7E or 0xA1..0xFE, otherwise 1.  */
+static size_t
+big5hkscs_character_iterator (const char *s)
+{
+  unsigned char lead = (unsigned char) s[0];
+  unsigned char trail;
+
+  if (!(lead >= 0x88 && lead <= 0xfe))
+    return 1;
+
+  trail = (unsigned char) s[1];
+  if (trail >= 0x40 && trail <= 0x7e)
+    return 2;
+  if (trail >= 0xa1 && trail <= 0xfe)
+    return 2;
+  return 1;
+}
+
+/* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
+   libiconv/lib/gbk.h.
+   Returns 2 for a lead byte in 0x81..0xFE followed by a trail byte in
+   0x40..0xFE other than 0x7F, otherwise 1.  */
+static size_t
+gbk_character_iterator (const char *s)
+{
+  unsigned char lead = (unsigned char) s[0];
+  unsigned char trail;
+
+  if (lead < 0x81 || lead == 0xff)
+    return 1;
+
+  trail = (unsigned char) s[1];
+  if (trail < 0x40 || trail == 0x7f || trail == 0xff)
+    return 1;
+  return 2;
+}
+
+/* Character iterator for GB18030.  See libiconv/lib/gb18030.h.
+   Recognized forms (anything else counts as a single byte):
+     0x{81..FE}{40..7E,80..FE}                   2 bytes
+     0x{81..FE}{30..39}{81..FE}{30..39}          4 bytes
+   The four-byte form is accepted for every lead byte in 0x81..0xFE, not
+   only for 0x81..0x84, so that the four-byte codes of characters outside
+   the BMP (lead bytes 0x90..0xE3) are stepped over as one character
+   instead of four separate bytes.
+   Later bytes are only read after the earlier ones have been accepted,
+   so the terminating NUL is never passed.  */
+static size_t
+gb18030_character_iterator (const char *s)
+{
+  unsigned char c = *s;
+  if (c >= 0x81 && c < 0xff)
+    {
+      unsigned char c2 = s[1];
+      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
+        return 2;
+      if (c2 >= 0x30 && c2 <= 0x39)
+        {
+          unsigned char c3 = s[2];
+          if (c3 >= 0x81 && c3 < 0xff)
+            {
+              unsigned char c4 = s[3];
+              if (c4 >= 0x30 && c4 <= 0x39)
+                return 4;
+            }
+        }
+    }
+  return 1;
+}
+
+/* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.
+   Returns 2 for a lead byte in 0x81..0x9F or 0xE0..0xF9 followed by a
+   trail byte in 0x40..0xFC other than 0x7F, otherwise 1.  */
+static size_t
+shift_jis_character_iterator (const char *s)
+{
+  unsigned char lead = (unsigned char) s[0];
+  unsigned char trail;
+  int lead_ok = (lead >= 0x81 && lead <= 0x9f)
+                || (lead >= 0xe0 && lead <= 0xf9);
+
+  if (!lead_ok)
+    return 1;
+
+  trail = (unsigned char) s[1];
+  if (trail < 0x40 || trail == 0x7f || trail > 0xfc)
+    return 1;
+  return 2;
+}
+
+/* Character iterator for JOHAB.  See libiconv/lib/johab.h and
+   libiconv/lib/johab_hangul.h.
+   Recognized two-byte forms (anything else counts as a single byte):
+     0x{84..D3}{41..7E,81..FE}
+     0x{D9..F9}{31..7E,91..FE}  */
+static size_t
+johab_character_iterator (const char *s)
+{
+  unsigned char lead = (unsigned char) s[0];
+  unsigned char trail;
+
+  if (lead >= 0x84 && lead <= 0xd3)
+    {
+      trail = (unsigned char) s[1];
+      return ((trail >= 0x41 && trail <= 0x7e)
+              || (trail >= 0x81 && trail <= 0xfe)) ? 2 : 1;
+    }
+
+  if (lead >= 0xd9 && lead <= 0xf9)
+    {
+      trail = (unsigned char) s[1];
+      return ((trail >= 0x31 && trail <= 0x7e)
+              || (trail >= 0x91 && trail <= 0xfe)) ? 2 : 1;
+    }
+
+  return 1;
+}
+
+/* Character iterator for UTF-8.  See libiconv/lib/utf8.h.
+   The lead byte selects the expected length (0xC2..0xDF: 2, 0xE0..0xEF: 3,
+   0xF0..0xF7: 4); the sequence is accepted when every following byte is
+   a continuation byte 0x80..0xBF.  Anything else counts as a single
+   byte.  Checking stops at the first byte that is not a continuation
+   byte, so the terminating NUL is never passed.  */
+static size_t
+utf8_character_iterator (const char *s)
+{
+  const unsigned char *p = (const unsigned char *) s;
+  unsigned char lead = p[0];
+  size_t expected;
+  size_t k;
+
+  if (lead < 0xc2)
+    return 1;
+  else if (lead < 0xe0)
+    expected = 2;
+  else if (lead < 0xf0)
+    expected = 3;
+  else if (lead < 0xf8)
+    expected = 4;
+  else
+    return 1;
+
+  for (k = 1; k < expected; k++)
+    {
+      /* A continuation byte has the bit pattern 10xxxxxx.  */
+      if ((p[k] & 0xc0) != 0x80)
+        return 1;
+    }
+  return expected;
+}
+
+/* Returns a character iterator for a given encoding.
+   CANON_CHARSET is a canonical encoding name; it must not be NULL.
+   Encodings without a dedicated iterator get the 8-bit iterator.
+   The returned iterator, given a pointer into a string, returns the
+   number of bytes occupied by the next single character.  If the piece
+   of string is not valid or if *s == '\0', it returns 1.  */
+character_iterator_t
+po_charset_character_iterator (const char *canon_charset)
+{
+  /* Names produced by po_charset_canonicalize share the storage of
+     utf8[], so the pointer comparison is the fast path.  The strcmp
+     fallback makes an equal "UTF-8" string that lives elsewhere select
+     the UTF-8 iterator too, consistently with the other encodings,
+     instead of silently falling through to the 8-bit iterator.  */
+  if (canon_charset == utf8 || strcmp (canon_charset, utf8) == 0)
+    return utf8_character_iterator;
+  if (strcmp (canon_charset, "GB2312") == 0
+      || strcmp (canon_charset, "EUC-KR") == 0)
+    return euc_character_iterator;
+  if (strcmp (canon_charset, "EUC-JP") == 0)
+    return euc_jp_character_iterator;
+  if (strcmp (canon_charset, "EUC-TW") == 0)
+    return euc_tw_character_iterator;
+  if (strcmp (canon_charset, "BIG5") == 0)
+    return big5_character_iterator;
+  if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
+    return big5hkscs_character_iterator;
+  if (strcmp (canon_charset, "GBK") == 0)
+    return gbk_character_iterator;
+  if (strcmp (canon_charset, "GB18030") == 0)
+    return gb18030_character_iterator;
+  if (strcmp (canon_charset, "SHIFT_JIS") == 0)
+    return shift_jis_character_iterator;
+  if (strcmp (canon_charset, "JOHAB") == 0)
+    return johab_character_iterator;
+  return char_iterator;
+}
+
+
+/* The PO file's encoding, as specified in the header entry.
+ NULL after po_lex_charset_init and po_lex_charset_close; set by
+ po_lex_charset_set to a canonical name when the header names a
+ recognized charset. */
+const char *po_lex_charset;
+
+#if HAVE_ICONV
+/* Converter from the PO file's encoding to UTF-8.
+ (iconv_t)(-1) means that no converter is open. */
+iconv_t po_lex_iconv;
+#endif
+/* If no converter is available, some information about the structure of the
+ PO file's encoding. po_lex_charset_set fills it from
+ po_is_charset_weird_cjk in that case. */
+bool po_lex_weird_cjk;
+
+/* Put the lexer's charset state into its initial condition: no charset
+   known, no converter open, no special CJK handling.  The converter
+   variable is overwritten without being closed, so this is meant for
+   first-time initialization; po_lex_charset_close releases an open
+   converter.  */
+void
+po_lex_charset_init (void)
+{
+  po_lex_charset = NULL;
+#if HAVE_ICONV
+  po_lex_iconv = (iconv_t)(-1);
+#endif
+  po_lex_weird_cjk = false;
+}
+
+void
+po_lex_charset_set (const char *header_entry, const char *filename)
+{
+ /* Look for a "charset=" value in HEADER_ENTRY, canonicalize it and,
+ if it is a known charset, store it in po_lex_charset and set up
+ po_lex_iconv / po_lex_weird_cjk for the PO lexer. FILENAME is used
+ in the warnings reported through po_xerror and to recognize POT
+ files by their ".pot" suffix.
+
+ Verify the validity of CHARSET. It is necessary
+ 1. for the correct treatment of multibyte characters containing
+ 0x5C bytes in the PO lexer,
+ 2. so that at run time, gettext() can call iconv() to convert
+ msgstr. */
+ const char *charsetstr = c_strstr (header_entry, "charset=");
+
+ if (charsetstr != NULL)
+ {
+ size_t len;
+ char *charset;
+ const char *canon_charset;
+
+ charsetstr += strlen ("charset=");
+ len = strcspn (charsetstr, " \t\n"); /* The value ends at the first
+ space, tab or newline, or at the end of the string. */
+ charset = (char *) xmalloca (len + 1);
+ memcpy (charset, charsetstr, len);
+ charset[len] = '\0';
+
+ canon_charset = po_charset_canonicalize (charset);
+ if (canon_charset == NULL)
+ {
+ /* Don't warn for POT files, because POT files usually contain
+ only ASCII msgids. The warning is suppressed only when the
+ file name ends in ".pot" and the charset is still the literal
+ placeholder "CHARSET". */
+ size_t filenamelen = strlen (filename);
+
+ if (!(filenamelen >= 4
+ && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
+ && strcmp (charset, "CHARSET") == 0))
+ {
+ char *warning_message =
+ xasprintf (_("\
+Charset \"%s\" is not a portable encoding name.\n\
+Message conversion to user's charset might not work.\n"),
+ charset);
+ po_xerror (PO_SEVERITY_WARNING, NULL,
+ filename, (size_t)(-1), (size_t)(-1), true,
+ warning_message);
+ free (warning_message);
+ }
+ }
+ else
+ {
+ const char *envval;
+
+ po_lex_charset = canon_charset;
+#if HAVE_ICONV
+ if (po_lex_iconv != (iconv_t)(-1))
+ iconv_close (po_lex_iconv); /* Release a converter that is still
+ open; every path below assigns po_lex_iconv anew. */
+#endif
+
+ /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
+ don't know about multibyte encodings, and require a spurious
+ backslash after every multibyte character whose last byte is
+ 0x5C. Some programs, like vim, distribute PO files in this
+ broken format. GNU msgfmt must continue to support this old
+ PO file format when the Makefile requests it. */
+ envval = getenv ("OLD_PO_FILE_INPUT");
+ if (envval != NULL && *envval != '\0')
+ {
+ /* Assume the PO file is in old format, with extraneous
+ backslashes. */
+#if HAVE_ICONV
+ po_lex_iconv = (iconv_t)(-1);
+#endif
+ po_lex_weird_cjk = false;
+ }
+ else
+ {
+ /* Use iconv() to parse multibyte characters. */
+#if HAVE_ICONV
+ /* Avoid glibc-2.1 bug with EUC-KR. */
+# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
+ && !defined _LIBICONV_VERSION
+ if (strcmp (po_lex_charset, "EUC-KR") == 0)
+ po_lex_iconv = (iconv_t)(-1);
+ else
+# endif
+ /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
+ GBK, GB18030. */
+# if defined __sun && !defined _LIBICONV_VERSION
+ if ( strcmp (po_lex_charset, "GB2312") == 0
+ || strcmp (po_lex_charset, "EUC-TW") == 0
+ || strcmp (po_lex_charset, "BIG5") == 0
+ || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
+ || strcmp (po_lex_charset, "GBK") == 0
+ || strcmp (po_lex_charset, "GB18030") == 0)
+ po_lex_iconv = (iconv_t)(-1);
+ else
+# endif
+ po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
+ if (po_lex_iconv == (iconv_t)(-1))
+ {
+ char *warning_message;
+ const char *recommendation;
+ const char *note;
+ char *whole_message;
+
+ warning_message =
+ xasprintf (_("\
+Charset \"%s\" is not supported. %s relies on iconv(),\n\
+and iconv() does not support \"%s\".\n"),
+ po_lex_charset, basename (program_name),
+ po_lex_charset);
+
+# if !defined _LIBICONV_VERSION
+ recommendation = _("\
+Installing GNU libiconv and then reinstalling GNU gettext\n\
+would fix this problem.\n");
+# else
+ recommendation = "";
+# endif
+
+ /* Test for a charset which has double-byte characters
+ ending in 0x5C. For these encodings, the string parser
+ is likely to be confused if it can't see the character
+ boundaries. */
+ po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
+ if (po_is_charset_weird (po_lex_charset)
+ && !po_lex_weird_cjk)
+ note = _("Continuing anyway, expect parse errors.");
+ else
+ note = _("Continuing anyway.");
+
+ whole_message =
+ xasprintf ("%s%s%s\n",
+ warning_message, recommendation, note);
+
+ po_xerror (PO_SEVERITY_WARNING, NULL,
+ filename, (size_t)(-1), (size_t)(-1), true,
+ whole_message);
+
+ free (whole_message);
+ free (warning_message);
+ }
+#else
+ /* Test for a charset which has double-byte characters
+ ending in 0x5C. For these encodings, the string parser
+ is likely to be confused if it can't see the character
+ boundaries. Without iconv(), a warning is only issued for
+ a weird charset that lacks CJK structure. */
+ po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
+ if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
+ {
+ char *warning_message;
+ const char *recommendation;
+ const char *note;
+ char *whole_message;
+
+ warning_message =
+ xasprintf (_("\
+Charset \"%s\" is not supported. %s relies on iconv().\n\
+This version was built without iconv().\n"),
+ po_lex_charset, basename (program_name));
+
+ recommendation = _("\
+Installing GNU libiconv and then reinstalling GNU gettext\n\
+would fix this problem.\n");
+
+ note = _("Continuing anyway, expect parse errors.");
+
+ whole_message =
+ xasprintf ("%s%s%s\n",
+ warning_message, recommendation, note);
+
+ po_xerror (PO_SEVERITY_WARNING, NULL,
+ filename, (size_t)(-1), (size_t)(-1), true,
+ whole_message);
+
+ free (whole_message);
+ free (warning_message);
+ }
+#endif
+ }
+ }
+ freea (charset); /* Pairs with the xmalloca call above. */
+ }
+ else
+ {
+ /* Don't warn for POT files, because POT files usually contain
+ only ASCII msgids. Here the ".pot" suffix alone suppresses the
+ warning. */
+ size_t filenamelen = strlen (filename);
+
+ if (!(filenamelen >= 4
+ && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
+ po_xerror (PO_SEVERITY_WARNING,
+ NULL, filename, (size_t)(-1), (size_t)(-1), true,
+ _("\
+Charset missing in header.\n\
+Message conversion to user's charset will not work.\n"));
+ }
+}
+
+/* Forget the PO file's charset: close the converter if one is open and
+   return the lexer's charset state to its initial condition.  Calling
+   it when no converter is open is harmless.  */
+void
+po_lex_charset_close (void)
+{
+  po_lex_charset = NULL;
+#if HAVE_ICONV
+  if (po_lex_iconv != (iconv_t)(-1))
+    {
+      iconv_close (po_lex_iconv);
+      /* Mark the converter as closed so that it is not closed twice.  */
+      po_lex_iconv = (iconv_t)(-1);
+    }
+#endif
+  po_lex_weird_cjk = false;
+}