diff options
Diffstat (limited to 'gettext-tools/src/po-charset.c')
-rw-r--r-- | gettext-tools/src/po-charset.c | 662 |
1 files changed, 662 insertions, 0 deletions
diff --git a/gettext-tools/src/po-charset.c b/gettext-tools/src/po-charset.c new file mode 100644 index 0000000..4c0dcdb --- /dev/null +++ b/gettext-tools/src/po-charset.c @@ -0,0 +1,662 @@ +/* Charset handling while reading PO files. + Copyright (C) 2001-2007, 2010 Free Software Foundation, Inc. + Written by Bruno Haible <haible@clisp.cons.org>, 2001. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include <alloca.h> + +/* Specification. */ +#include "po-charset.h" + +#include <stdlib.h> +#include <string.h> + +#include "xmalloca.h" +#include "xvasprintf.h" +#include "po-xerror.h" +#include "basename.h" +#include "progname.h" +#include "c-strstr.h" +#include "c-strcase.h" +#include "gettext.h" + +#define _(str) gettext (str) + +#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) + +static const char ascii[] = "ASCII"; + +/* The canonicalized encoding name for ASCII. */ +const char *po_charset_ascii = ascii; + +static const char utf8[] = "UTF-8"; + +/* The canonicalized encoding name for UTF-8. */ +const char *po_charset_utf8 = utf8; + +/* Canonicalize an encoding name. */ +const char * +po_charset_canonicalize (const char *charset) +{ + /* The list of charsets supported by glibc's iconv() and by the portable + iconv() across platforms. Taken from intl/config.charset. 
*/ + static const char *standard_charsets[] = + { + ascii, "ANSI_X3.4-1968", "US-ASCII", /* i = 0..2 */ + "ISO-8859-1", "ISO_8859-1", /* i = 3, 4 */ + "ISO-8859-2", "ISO_8859-2", + "ISO-8859-3", "ISO_8859-3", + "ISO-8859-4", "ISO_8859-4", + "ISO-8859-5", "ISO_8859-5", + "ISO-8859-6", "ISO_8859-6", + "ISO-8859-7", "ISO_8859-7", + "ISO-8859-8", "ISO_8859-8", + "ISO-8859-9", "ISO_8859-9", + "ISO-8859-13", "ISO_8859-13", + "ISO-8859-14", "ISO_8859-14", + "ISO-8859-15", "ISO_8859-15", /* i = 25, 26 */ + "KOI8-R", + "KOI8-U", + "KOI8-T", + "CP850", + "CP866", + "CP874", + "CP932", + "CP949", + "CP950", + "CP1250", + "CP1251", + "CP1252", + "CP1253", + "CP1254", + "CP1255", + "CP1256", + "CP1257", + "GB2312", + "EUC-JP", + "EUC-KR", + "EUC-TW", + "BIG5", + "BIG5-HKSCS", + "GBK", + "GB18030", + "SHIFT_JIS", + "JOHAB", + "TIS-620", + "VISCII", + "GEORGIAN-PS", + utf8 + }; + size_t i; + + for (i = 0; i < SIZEOF (standard_charsets); i++) + if (c_strcasecmp (charset, standard_charsets[i]) == 0) + return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i]; + return NULL; +} + +/* Test for ASCII compatibility. */ +bool +po_charset_ascii_compatible (const char *canon_charset) +{ + /* There are only a few exceptions to ASCII compatibility. */ + if (strcmp (canon_charset, "SHIFT_JIS") == 0 + || strcmp (canon_charset, "JOHAB") == 0 + || strcmp (canon_charset, "VISCII") == 0) + return false; + else + return true; +} + +/* Test for a weird encoding, i.e. an encoding which has double-byte + characters ending in 0x5C. */ +bool po_is_charset_weird (const char *canon_charset) +{ + static const char *weird_charsets[] = + { + "BIG5", + "BIG5-HKSCS", + "GBK", + "GB18030", + "SHIFT_JIS", + "JOHAB" + }; + size_t i; + + for (i = 0; i < SIZEOF (weird_charsets); i++) + if (strcmp (canon_charset, weird_charsets[i]) == 0) + return true; + return false; +} + +/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure. 
/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
   An encoding has CJK structure if every valid character stream is composed
   of single bytes in the range 0x{00..7F} and of byte pairs in the range
   0x{80..FF}{30..FF}.  */
bool
po_is_charset_weird_cjk (const char *canon_charset)
{
  static const char *weird_cjk_charsets[] =
  {                     /* single bytes   double bytes       */
    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
  };
  const size_t count =
    sizeof (weird_cjk_charsets) / sizeof (weird_cjk_charsets[0]);
  size_t k = 0;

  while (k < count)
    {
      if (strcmp (canon_charset, weird_cjk_charsets[k]) == 0)
        return true;
      k++;
    }
  return false;
}

/* Hardcoded iterator functions for all kinds of encodings.
   We could also implement a general iterator function with iconv(),
   but we need a fast one.  */

/* Character iterator for 8-bit encodings: every character is one byte.  */
static size_t
char_iterator (const char *s)
{
  (void) s;
  return 1;
}

/* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
/* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
static size_t
euc_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  /* A double-byte character has both bytes in 0xA1..0xFE.  The second byte
     is looked at only when the first one qualifies, so the terminating NUL
     is never stepped over.  */
  if (p[0] >= 0xa1 && p[0] <= 0xfe
      && p[1] >= 0xa1 && p[1] <= 0xfe)
    return 2;
  return 1;
}

/* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.  */
static size_t
euc_jp_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];

  if (lead == 0x8e)
    /* 0x8E introduces a two-byte character with trail byte 0xA1..0xDF.  */
    return (p[1] >= 0xa1 && p[1] <= 0xdf) ? 2 : 1;

  if (lead == 0x8f)
    /* 0x8F introduces a three-byte character; both following bytes must be
       in 0xA1..0xFE.  */
    return (p[1] >= 0xa1 && p[1] <= 0xfe
            && p[2] >= 0xa1 && p[2] <= 0xfe) ? 3 : 1;

  if (lead >= 0xa1 && lead <= 0xfe
      && p[1] >= 0xa1 && p[1] <= 0xfe)
    return 2;

  return 1;
}
/* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.
   Returns the number of bytes of the character starting at S: 2 for a byte
   pair in 0x{A1..FE}{A1..FE}, 4 for 0x8E 0x{A1..B0} 0x{A1..FE} 0x{A1..FE},
   and 1 otherwise (including invalid input and the terminating NUL).  */
static size_t
euc_tw_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0xa1 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if (c2 >= 0xa1 && c2 < 0xff)
        return 2;
    }
  else if (c == 0x8e)
    {
      unsigned char c2 = s[1];
      if (c2 >= 0xa1 && c2 <= 0xb0)
        {
          unsigned char c3 = s[2];
          if (c3 >= 0xa1 && c3 < 0xff)
            {
              unsigned char c4 = s[3];
              if (c4 >= 0xa1 && c4 < 0xff)
                return 4;
            }
        }
    }
  return 1;
}

/* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.
   Returns 2 for a lead byte 0xA1..0xFE followed by a trail byte in
   0x40..0x7E or 0xA1..0xFE, and 1 otherwise.  */
static size_t
big5_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0xa1 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
        return 2;
    }
  return 1;
}

/* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.
   Same as BIG5, except that the lead byte range starts at 0x88.  */
static size_t
big5hkscs_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0x88 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
        return 2;
    }
  return 1;
}

/* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
   libiconv/lib/gbk.h.
   Returns 2 for a lead byte 0x81..0xFE followed by a trail byte in
   0x40..0x7E or 0x80..0xFE, and 1 otherwise.  */
static size_t
gbk_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0x81 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
        return 2;
    }
  return 1;
}

/* Character iterator for GB18030.  See libiconv/lib/gb18030.h.
   Returns 2 for a GBK-style byte pair, 4 for a four-byte sequence
   0x{81..FE}{30..39}{81..FE}{30..39}, and 1 otherwise.
   The four-byte form is accepted for every lead byte 0x81..0xFE, not only
   0x81..0x84: characters outside the BMP are encoded with lead bytes
   0x90..0xE3 (e.g. U+10000 is 0x90 0x30 0x81 0x30) and must not be split
   into single bytes.  */
static size_t
gb18030_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0x81 && c < 0xff)
    {
      unsigned char c2 = s[1];
      /* Two-byte form.  */
      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
        return 2;
      /* Four-byte form: the second and fourth bytes are ASCII digits.
         Each further byte is read only after the previous one matched, so
         the terminating NUL is never stepped over.  */
      if (c2 >= 0x30 && c2 <= 0x39)
        {
          unsigned char c3 = s[2];
          if (c3 >= 0x81 && c3 < 0xff)
            {
              unsigned char c4 = s[3];
              if (c4 >= 0x30 && c4 <= 0x39)
                return 4;
            }
        }
    }
  return 1;
}
/* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.
   Returns 2 for a lead byte in 0x81..0x9F or 0xE0..0xF9 followed by a trail
   byte in 0x40..0x7E or 0x80..0xFC, and 1 otherwise.  */
static size_t
shift_jis_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];
  unsigned char trail;

  if (!((lead >= 0x81 && lead <= 0x9f) || (lead >= 0xe0 && lead <= 0xf9)))
    return 1;

  trail = p[1];
  if ((trail >= 0x40 && trail <= 0x7e) || (trail >= 0x80 && trail <= 0xfc))
    return 2;
  return 1;
}

/* Character iterator for JOHAB.  See libiconv/lib/johab.h and
   libiconv/lib/johab_hangul.h.
   There are two lead byte ranges, each with its own set of valid trail
   bytes; anything else counts as a single byte.  */
static size_t
johab_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];

  if (lead >= 0x84 && lead <= 0xd3)
    {
      unsigned char trail = p[1];
      int trail_ok = (trail >= 0x41 && trail <= 0x7e)
                     || (trail >= 0x81 && trail <= 0xfe);
      return trail_ok ? 2 : 1;
    }

  if (lead >= 0xd9 && lead <= 0xf9)
    {
      unsigned char trail = p[1];
      int trail_ok = (trail >= 0x31 && trail <= 0x7e)
                     || (trail >= 0x91 && trail <= 0xfe);
      return trail_ok ? 2 : 1;
    }

  return 1;
}

/* Character iterator for UTF-8.  See libiconv/lib/utf8.h.
   The lead byte determines the expected length (0xC2..0xDF: 2,
   0xE0..0xEF: 3, 0xF0..0xF7: 4); that length is returned only if all the
   following bytes are continuation bytes 0x80..0xBF.  Otherwise 1.  */
static size_t
utf8_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];
  size_t expected;
  size_t k;

  if (lead < 0xc2 || lead >= 0xf8)
    return 1;

  if (lead < 0xe0)
    expected = 2;
  else if (lead < 0xf0)
    expected = 3;
  else
    expected = 4;

  /* Stop at the first byte that is not a continuation byte; this includes
     the terminating NUL, so no byte beyond it is ever read.  */
  for (k = 1; k < expected; k++)
    if (p[k] < 0x80 || p[k] >= 0xc0)
      return 1;

  return expected;
}
*/ +character_iterator_t +po_charset_character_iterator (const char *canon_charset) +{ + if (canon_charset == utf8) + return utf8_character_iterator; + if (strcmp (canon_charset, "GB2312") == 0 + || strcmp (canon_charset, "EUC-KR") == 0) + return euc_character_iterator; + if (strcmp (canon_charset, "EUC-JP") == 0) + return euc_jp_character_iterator; + if (strcmp (canon_charset, "EUC-TW") == 0) + return euc_tw_character_iterator; + if (strcmp (canon_charset, "BIG5") == 0) + return big5_character_iterator; + if (strcmp (canon_charset, "BIG5-HKSCS") == 0) + return big5hkscs_character_iterator; + if (strcmp (canon_charset, "GBK") == 0) + return gbk_character_iterator; + if (strcmp (canon_charset, "GB18030") == 0) + return gb18030_character_iterator; + if (strcmp (canon_charset, "SHIFT_JIS") == 0) + return shift_jis_character_iterator; + if (strcmp (canon_charset, "JOHAB") == 0) + return johab_character_iterator; + return char_iterator; +} + + +/* The PO file's encoding, as specified in the header entry. */ +const char *po_lex_charset; + +#if HAVE_ICONV +/* Converter from the PO file's encoding to UTF-8. */ +iconv_t po_lex_iconv; +#endif +/* If no converter is available, some information about the structure of the + PO file's encoding. */ +bool po_lex_weird_cjk; + +void +po_lex_charset_init () +{ + po_lex_charset = NULL; +#if HAVE_ICONV + po_lex_iconv = (iconv_t)(-1); +#endif + po_lex_weird_cjk = false; +} + +void +po_lex_charset_set (const char *header_entry, const char *filename) +{ + /* Verify the validity of CHARSET. It is necessary + 1. for the correct treatment of multibyte characters containing + 0x5C bytes in the PO lexer, + 2. so that at run time, gettext() can call iconv() to convert + msgstr. 
*/ + const char *charsetstr = c_strstr (header_entry, "charset="); + + if (charsetstr != NULL) + { + size_t len; + char *charset; + const char *canon_charset; + + charsetstr += strlen ("charset="); + len = strcspn (charsetstr, " \t\n"); + charset = (char *) xmalloca (len + 1); + memcpy (charset, charsetstr, len); + charset[len] = '\0'; + + canon_charset = po_charset_canonicalize (charset); + if (canon_charset == NULL) + { + /* Don't warn for POT files, because POT files usually contain + only ASCII msgids. */ + size_t filenamelen = strlen (filename); + + if (!(filenamelen >= 4 + && memcmp (filename + filenamelen - 4, ".pot", 4) == 0 + && strcmp (charset, "CHARSET") == 0)) + { + char *warning_message = + xasprintf (_("\ +Charset \"%s\" is not a portable encoding name.\n\ +Message conversion to user's charset might not work.\n"), + charset); + po_xerror (PO_SEVERITY_WARNING, NULL, + filename, (size_t)(-1), (size_t)(-1), true, + warning_message); + free (warning_message); + } + } + else + { + const char *envval; + + po_lex_charset = canon_charset; +#if HAVE_ICONV + if (po_lex_iconv != (iconv_t)(-1)) + iconv_close (po_lex_iconv); +#endif + + /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35 + don't know about multibyte encodings, and require a spurious + backslash after every multibyte character whose last byte is + 0x5C. Some programs, like vim, distribute PO files in this + broken format. GNU msgfmt must continue to support this old + PO file format when the Makefile requests it. */ + envval = getenv ("OLD_PO_FILE_INPUT"); + if (envval != NULL && *envval != '\0') + { + /* Assume the PO file is in old format, with extraneous + backslashes. */ +#if HAVE_ICONV + po_lex_iconv = (iconv_t)(-1); +#endif + po_lex_weird_cjk = false; + } + else + { + /* Use iconv() to parse multibyte characters. */ +#if HAVE_ICONV + /* Avoid glibc-2.1 bug with EUC-KR. 
*/ +# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \ + && !defined _LIBICONV_VERSION + if (strcmp (po_lex_charset, "EUC-KR") == 0) + po_lex_iconv = (iconv_t)(-1); + else +# endif + /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, + GBK, GB18030. */ +# if defined __sun && !defined _LIBICONV_VERSION + if ( strcmp (po_lex_charset, "GB2312") == 0 + || strcmp (po_lex_charset, "EUC-TW") == 0 + || strcmp (po_lex_charset, "BIG5") == 0 + || strcmp (po_lex_charset, "BIG5-HKSCS") == 0 + || strcmp (po_lex_charset, "GBK") == 0 + || strcmp (po_lex_charset, "GB18030") == 0) + po_lex_iconv = (iconv_t)(-1); + else +# endif + po_lex_iconv = iconv_open ("UTF-8", po_lex_charset); + if (po_lex_iconv == (iconv_t)(-1)) + { + char *warning_message; + const char *recommendation; + const char *note; + char *whole_message; + + warning_message = + xasprintf (_("\ +Charset \"%s\" is not supported. %s relies on iconv(),\n\ +and iconv() does not support \"%s\".\n"), + po_lex_charset, basename (program_name), + po_lex_charset); + +# if !defined _LIBICONV_VERSION + recommendation = _("\ +Installing GNU libiconv and then reinstalling GNU gettext\n\ +would fix this problem.\n"); +# else + recommendation = ""; +# endif + + /* Test for a charset which has double-byte characters + ending in 0x5C. For these encodings, the string parser + is likely to be confused if it can't see the character + boundaries. */ + po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset); + if (po_is_charset_weird (po_lex_charset) + && !po_lex_weird_cjk) + note = _("Continuing anyway, expect parse errors."); + else + note = _("Continuing anyway."); + + whole_message = + xasprintf ("%s%s%s\n", + warning_message, recommendation, note); + + po_xerror (PO_SEVERITY_WARNING, NULL, + filename, (size_t)(-1), (size_t)(-1), true, + whole_message); + + free (whole_message); + free (warning_message); + } +#else + /* Test for a charset which has double-byte characters + ending in 0x5C. 
For these encodings, the string parser + is likely to be confused if it can't see the character + boundaries. */ + po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset); + if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk) + { + char *warning_message; + const char *recommendation; + const char *note; + char *whole_message; + + warning_message = + xasprintf (_("\ +Charset \"%s\" is not supported. %s relies on iconv().\n\ +This version was built without iconv().\n"), + po_lex_charset, basename (program_name)); + + recommendation = _("\ +Installing GNU libiconv and then reinstalling GNU gettext\n\ +would fix this problem.\n"); + + note = _("Continuing anyway, expect parse errors."); + + whole_message = + xasprintf ("%s%s%s\n", + warning_message, recommendation, note); + + po_xerror (PO_SEVERITY_WARNING, NULL, + filename, (size_t)(-1), (size_t)(-1), true, + whole_message); + + free (whole_message); + free (warning_message); + } +#endif + } + } + freea (charset); + } + else + { + /* Don't warn for POT files, because POT files usually contain + only ASCII msgids. */ + size_t filenamelen = strlen (filename); + + if (!(filenamelen >= 4 + && memcmp (filename + filenamelen - 4, ".pot", 4) == 0)) + po_xerror (PO_SEVERITY_WARNING, + NULL, filename, (size_t)(-1), (size_t)(-1), true, + _("\ +Charset missing in header.\n\ +Message conversion to user's charset will not work.\n")); + } +} + +void +po_lex_charset_close () +{ + po_lex_charset = NULL; +#if HAVE_ICONV + if (po_lex_iconv != (iconv_t)(-1)) + { + iconv_close (po_lex_iconv); + po_lex_iconv = (iconv_t)(-1); + } +#endif + po_lex_weird_cjk = false; +} |