diff options
Diffstat (limited to 'gettext-tools/src/read-stringtable.c')
-rw-r--r-- | gettext-tools/src/read-stringtable.c | 963 |
1 files changed, 963 insertions, 0 deletions
diff --git a/gettext-tools/src/read-stringtable.c b/gettext-tools/src/read-stringtable.c new file mode 100644 index 0000000..27b8860 --- /dev/null +++ b/gettext-tools/src/read-stringtable.c @@ -0,0 +1,963 @@ +/* Reading NeXTstep/GNUstep .strings files. + Copyright (C) 2003, 2005-2007, 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2003. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +/* Specification. */ +#include "read-stringtable.h" + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "error.h" +#include "error-progname.h" +#include "read-catalog-abstract.h" +#include "xalloc.h" +#include "xvasprintf.h" +#include "po-xerror.h" +#include "unistr.h" +#include "gettext.h" + +#define _(str) gettext (str) + +/* The format of NeXTstep/GNUstep .strings files is documented in + gnustep-base-1.8.0/Tools/make_strings/Using.txt + and in the comments of method propertyListFromStringsFileFormat in + gnustep-base-1.8.0/Source/NSString.m + In summary, it's a Objective-C like file with pseudo-assignments of the form + "key" = "value"; + where the key is the msgid and the value is the msgstr. + + The implementation of the parser of .strings files is in + gnustep-base-1.8.0/Source/NSString.m + function GSPropertyListFromStringsFormat + (indirectly called from NSBundle's method localizedStringForKey). + + A test case is in + gnustep-base-1.8.0/Testing/English.lproj/NXStringTable.example + */ + +/* Handling of comments: We copy all comments from the .strings file to + the PO file. This is not really needed; it's a service for translators + who don't like PO files and prefer to maintain the .strings file. */ + + +/* Real filename, used in error messages about the input file. */ +static const char *real_file_name; + +/* File name and line number. */ +extern lex_pos_ty gram_pos; + +/* The input file stream. */ +static FILE *fp; + + +/* Phase 1: Read a byte. + Max. 4 pushback characters. */ + +static unsigned char phase1_pushback[4]; +static int phase1_pushback_length; + +static int +phase1_getc () +{ + int c; + + if (phase1_pushback_length) + return phase1_pushback[--phase1_pushback_length]; + + c = getc (fp); + + if (c == EOF) + { + if (ferror (fp)) + { + const char *errno_description = strerror (errno); + po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, + xasprintf ("%s: %s", + xasprintf (_("error while reading \"%s\""), + real_file_name), + errno_description)); + } + return EOF; + } + + return c; +} + +static void +phase1_ungetc (int c) +{ + if (c != EOF) + phase1_pushback[phase1_pushback_length++] = c; +} + + +/* Phase 2: Read an UCS-4 character. + Max. 2 pushback characters. */ + +/* End-of-file indicator for functions returning an UCS-4 character. */ +#define UEOF -1 + +static int phase2_pushback[4]; +static int phase2_pushback_length; + +/* The input file can be in Unicode encoding (UCS-2BE, UCS-2LE, UTF-8, each + with a BOM!), or otherwise the locale-dependent default encoding is used. + Since we don't want to depend on the locale here, we use ISO-8859-1 + instead. */ +enum enc +{ + enc_undetermined, + enc_ucs2be, + enc_ucs2le, + enc_utf8, + enc_iso8859_1 +}; +static enum enc encoding; + +static int +phase2_getc () +{ + if (phase2_pushback_length) + return phase2_pushback[--phase2_pushback_length]; + + if (encoding == enc_undetermined) + { + /* Determine the input file's encoding. */ + int c0, c1; + + c0 = phase1_getc (); + if (c0 == EOF) + return UEOF; + c1 = phase1_getc (); + if (c1 == EOF) + { + phase1_ungetc (c0); + encoding = enc_iso8859_1; + } + else if (c0 == 0xfe && c1 == 0xff) + encoding = enc_ucs2be; + else if (c0 == 0xff && c1 == 0xfe) + encoding = enc_ucs2le; + else + { + int c2; + + c2 = phase1_getc (); + if (c2 == EOF) + { + phase1_ungetc (c1); + phase1_ungetc (c0); + encoding = enc_iso8859_1; + } + else if (c0 == 0xef && c1 == 0xbb && c2 == 0xbf) + encoding = enc_utf8; + else + { + phase1_ungetc (c2); + phase1_ungetc (c1); + phase1_ungetc (c0); + encoding = enc_iso8859_1; + } + } + } + + switch (encoding) + { + case enc_ucs2be: + /* Read an UCS-2BE encoded character. */ + { + int c0, c1; + + c0 = phase1_getc (); + if (c0 == EOF) + return UEOF; + c1 = phase1_getc (); + if (c1 == EOF) + return UEOF; + return (c0 << 8) + c1; + } + + case enc_ucs2le: + /* Read an UCS-2LE encoded character. */ + { + int c0, c1; + + c0 = phase1_getc (); + if (c0 == EOF) + return UEOF; + c1 = phase1_getc (); + if (c1 == EOF) + return UEOF; + return c0 + (c1 << 8); + } + + case enc_utf8: + /* Read an UTF-8 encoded character. */ + { + unsigned char buf[6]; + unsigned int count; + int c; + ucs4_t uc; + + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[0] = c; + count = 1; + + if (buf[0] >= 0xc0) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[1] = c; + count = 2; + + if (buf[0] >= 0xe0 + && ((buf[1] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[2] = c; + count = 3; + + if (buf[0] >= 0xf0 + && ((buf[2] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[3] = c; + count = 4; + + if (buf[0] >= 0xf8 + && ((buf[3] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[4] = c; + count = 5; + + if (buf[0] >= 0xfc + && ((buf[4] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[5] = c; + count = 6; + } + } + } + } + } + + u8_mbtouc (&uc, buf, count); + return uc; + } + + case enc_iso8859_1: + /* Read an ISO-8859-1 encoded character. */ + { + int c = phase1_getc (); + + if (c == EOF) + return UEOF; + return c; + } + + default: + abort (); + } +} + +static void +phase2_ungetc (int c) +{ + if (c != UEOF) + phase2_pushback[phase2_pushback_length++] = c; +} + + +/* Phase 3: Read an UCS-4 character, with line number handling. */ + +static int +phase3_getc () +{ + int c = phase2_getc (); + + if (c == '\n') + gram_pos.line_number++; + + return c; +} + +static void +phase3_ungetc (int c) +{ + if (c == '\n') + --gram_pos.line_number; + phase2_ungetc (c); +} + + +/* Convert from UCS-4 to UTF-8. */ +static char * +conv_from_ucs4 (const int *buffer, size_t buflen) +{ + unsigned char *utf8_string; + size_t pos; + unsigned char *q; + + /* Each UCS-4 word needs 6 bytes at worst. */ + utf8_string = XNMALLOC (6 * buflen + 1, unsigned char); + + for (pos = 0, q = utf8_string; pos < buflen; ) + { + unsigned int uc; + int n; + + uc = buffer[pos++]; + n = u8_uctomb (q, uc, 6); + assert (n > 0); + q += n; + } + *q = '\0'; + assert (q - utf8_string <= 6 * buflen); + + return (char *) utf8_string; +} + + +/* Parse a string enclosed in double-quotes. Input is UCS-4 encoded. + Return the string in UTF-8 encoding, or NULL if the input doesn't represent + a valid string enclosed in double-quotes. */ +static char * +parse_escaped_string (const int *string, size_t length) +{ + static int *buffer; + static size_t bufmax; + static size_t buflen; + const int *string_limit = string + length; + int c; + + if (string == string_limit) + return NULL; + c = *string++; + if (c != '"') + return NULL; + buflen = 0; + for (;;) + { + if (string == string_limit) + return NULL; + c = *string++; + if (c == '"') + break; + if (c == '\\') + { + if (string == string_limit) + return NULL; + c = *string++; + if (c >= '0' && c <= '7') + { + unsigned int n = 0; + int j = 0; + for (;;) + { + n = n * 8 + (c - '0'); + if (++j == 3) + break; + if (string == string_limit) + break; + c = *string; + if (!(c >= '0' && c <= '7')) + break; + string++; + } + c = n; + } + else if (c == 'u' || c == 'U') + { + unsigned int n = 0; + int j; + for (j = 0; j < 4; j++) + { + if (string == string_limit) + break; + c = *string; + if (c >= '0' && c <= '9') + n = n * 16 + (c - '0'); + else if (c >= 'A' && c <= 'F') + n = n * 16 + (c - 'A' + 10); + else if (c >= 'a' && c <= 'f') + n = n * 16 + (c - 'a' + 10); + else + break; + string++; + } + c = n; + } + else + switch (c) + { + case 'a': c = '\a'; break; + case 'b': c = '\b'; break; + case 't': c = '\t'; break; + case 'r': c = '\r'; break; + case 'n': c = '\n'; break; + case 'v': c = '\v'; break; + case 'f': c = '\f'; break; + } + } + if (buflen >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax * sizeof (int)); + } + buffer[buflen++] = c; + } + + return conv_from_ucs4 (buffer, buflen); +} + + +/* Accumulating flag comments. */ + +static char *special_comment; + +static inline void +special_comment_reset () +{ + if (special_comment != NULL) + free (special_comment); + special_comment = NULL; +} + +static void +special_comment_add (const char *flag) +{ + if (special_comment == NULL) + special_comment = xstrdup (flag); + else + { + size_t total_len = strlen (special_comment) + 2 + strlen (flag) + 1; + special_comment = xrealloc (special_comment, total_len); + strcat (special_comment, ", "); + strcat (special_comment, flag); + } +} + +static inline void +special_comment_finish () +{ + if (special_comment != NULL) + { + po_callback_comment_special (special_comment); + free (special_comment); + special_comment = NULL; + } +} + + +/* Accumulating comments. */ + +static int *buffer; +static size_t bufmax; +static size_t buflen; +static bool next_is_obsolete; +static bool next_is_fuzzy; +static char *fuzzy_msgstr; +static bool expect_fuzzy_msgstr_as_c_comment; +static bool expect_fuzzy_msgstr_as_cxx_comment; + +static inline void +comment_start () +{ + buflen = 0; +} + +static inline void +comment_add (int c) +{ + if (buflen >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax * sizeof (int)); + } + buffer[buflen++] = c; +} + +static inline void +comment_line_end (size_t chars_to_remove, bool test_for_fuzzy_msgstr) +{ + char *line; + + buflen -= chars_to_remove; + /* Drop trailing white space, but not EOLs. */ + while (buflen >= 1 + && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) + --buflen; + + /* At special positions we interpret a comment of the form + = "escaped string" + with an optional trailing semicolon as being the fuzzy msgstr, not a + regular comment. */ + if (test_for_fuzzy_msgstr + && buflen > 2 && buffer[0] == '=' && buffer[1] == ' ' + && (fuzzy_msgstr = + parse_escaped_string (buffer + 2, + buflen - (buffer[buflen - 1] == ';') - 2))) + return; + + line = conv_from_ucs4 (buffer, buflen); + + if (strcmp (line, "Flag: untranslated") == 0) + { + special_comment_add ("fuzzy"); + next_is_fuzzy = true; + } + else if (strcmp (line, "Flag: unmatched") == 0) + next_is_obsolete = true; + else if (strlen (line) >= 6 && memcmp (line, "Flag: ", 6) == 0) + special_comment_add (line + 6); + else if (strlen (line) >= 9 && memcmp (line, "Comment: ", 9) == 0) + /* A comment extracted from the source. */ + po_callback_comment_dot (line + 9); + else + { + char *last_colon; + unsigned long number; + char *endp; + + if (strlen (line) >= 6 && memcmp (line, "File: ", 6) == 0 + && (last_colon = strrchr (line + 6, ':')) != NULL + && *(last_colon + 1) != '\0' + && (number = strtoul (last_colon + 1, &endp, 10), *endp == '\0')) + { + /* A "File: <filename>:<number>" type comment. */ + *last_colon = '\0'; + po_callback_comment_filepos (line + 6, number); + } + else + po_callback_comment (line); + } +} + + +/* Phase 4: Replace each comment that is not inside a string with a space + character. */ + +static int +phase4_getc () +{ + int c; + + c = phase3_getc (); + if (c != '/') + return c; + c = phase3_getc (); + switch (c) + { + default: + phase3_ungetc (c); + return '/'; + + case '*': + /* C style comment. */ + { + bool last_was_star; + size_t trailing_stars; + bool seen_newline; + + comment_start (); + last_was_star = false; + trailing_stars = 0; + seen_newline = false; + /* Drop additional stars at the beginning of the comment. */ + for (;;) + { + c = phase3_getc (); + if (c != '*') + break; + last_was_star = true; + } + phase3_ungetc (c); + for (;;) + { + c = phase3_getc (); + if (c == UEOF) + break; + /* We skip all leading white space, but not EOLs. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + switch (c) + { + case '\n': + seen_newline = true; + comment_line_end (1, false); + comment_start (); + last_was_star = false; + trailing_stars = 0; + continue; + + case '*': + last_was_star = true; + trailing_stars++; + continue; + + case '/': + if (last_was_star) + { + /* Drop additional stars at the end of the comment. */ + comment_line_end (trailing_stars + 1, + expect_fuzzy_msgstr_as_c_comment + && !seen_newline); + break; + } + /* FALLTHROUGH */ + + default: + last_was_star = false; + trailing_stars = 0; + continue; + } + break; + } + return ' '; + } + + case '/': + /* C++ style comment. */ + comment_start (); + for (;;) + { + c = phase3_getc (); + if (c == '\n' || c == UEOF) + break; + /* We skip all leading white space, but not EOLs. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + } + comment_line_end (0, expect_fuzzy_msgstr_as_cxx_comment); + return '\n'; + } +} + +static inline void +phase4_ungetc (int c) +{ + phase3_ungetc (c); +} + + +/* Return true if a character is considered as whitespace. */ +static bool +is_whitespace (int c) +{ + return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f' + || c == '\b'); +} + +/* Return true if a character needs quoting, i.e. cannot be used in unquoted + tokens. */ +static bool +is_quotable (int c) +{ + if ((c >= '0' && c <= '9') + || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) + return false; + switch (c) + { + case '!': case '#': case '$': case '%': case '&': case '*': + case '+': case '-': case '.': case '/': case ':': case '?': + case '@': case '|': case '~': case '_': case '^': + return false; + default: + return true; + } +} + + +/* Read a key or value string. + Return the string in UTF-8 encoding, or NULL if no string is seen. + Return the start position of the string in *pos. */ +static char * +read_string (lex_pos_ty *pos) +{ + static int *buffer; + static size_t bufmax; + static size_t buflen; + int c; + + /* Skip whitespace before the string. */ + do + c = phase4_getc (); + while (is_whitespace (c)); + + if (c == UEOF) + /* No more string. */ + return NULL; + + *pos = gram_pos; + buflen = 0; + if (c == '"') + { + /* Read a string enclosed in double-quotes. */ + for (;;) + { + c = phase3_getc (); + if (c == UEOF || c == '"') + break; + if (c == '\\') + { + c = phase3_getc (); + if (c == UEOF) + break; + if (c >= '0' && c <= '7') + { + unsigned int n = 0; + int j = 0; + for (;;) + { + n = n * 8 + (c - '0'); + if (++j == 3) + break; + c = phase3_getc (); + if (!(c >= '0' && c <= '7')) + { + phase3_ungetc (c); + break; + } + } + c = n; + } + else if (c == 'u' || c == 'U') + { + unsigned int n = 0; + int j; + for (j = 0; j < 4; j++) + { + c = phase3_getc (); + if (c >= '0' && c <= '9') + n = n * 16 + (c - '0'); + else if (c >= 'A' && c <= 'F') + n = n * 16 + (c - 'A' + 10); + else if (c >= 'a' && c <= 'f') + n = n * 16 + (c - 'a' + 10); + else + { + phase3_ungetc (c); + break; + } + } + c = n; + } + else + switch (c) + { + case 'a': c = '\a'; break; + case 'b': c = '\b'; break; + case 't': c = '\t'; break; + case 'r': c = '\r'; break; + case 'n': c = '\n'; break; + case 'v': c = '\v'; break; + case 'f': c = '\f'; break; + } + } + if (buflen >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax * sizeof (int)); + } + buffer[buflen++] = c; + } + if (c == UEOF) + po_xerror (PO_SEVERITY_ERROR, NULL, + real_file_name, gram_pos.line_number, (size_t)(-1), false, + _("warning: unterminated string")); + } + else + { + /* Read a token outside quotes. */ + if (is_quotable (c)) + po_xerror (PO_SEVERITY_ERROR, NULL, + real_file_name, gram_pos.line_number, (size_t)(-1), false, + _("warning: syntax error")); + for (; c != UEOF && !is_quotable (c); c = phase4_getc ()) + { + if (buflen >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax * sizeof (int)); + } + buffer[buflen++] = c; + } + } + + return conv_from_ucs4 (buffer, buflen); +} + + +/* Read a .strings file from a stream, and dispatch to the various + abstract_catalog_reader_class_ty methods. */ +static void +stringtable_parse (abstract_catalog_reader_ty *pop, FILE *file, + const char *real_filename, const char *logical_filename) +{ + fp = file; + real_file_name = real_filename; + gram_pos.file_name = xstrdup (real_file_name); + gram_pos.line_number = 1; + encoding = enc_undetermined; + expect_fuzzy_msgstr_as_c_comment = false; + expect_fuzzy_msgstr_as_cxx_comment = false; + + for (;;) + { + char *msgid; + lex_pos_ty msgid_pos; + char *msgstr; + lex_pos_ty msgstr_pos; + int c; + + /* Prepare for next msgid/msgstr pair. */ + special_comment_reset (); + next_is_obsolete = false; + next_is_fuzzy = false; + fuzzy_msgstr = NULL; + + /* Read the key and all the comments preceding it. */ + msgid = read_string (&msgid_pos); + if (msgid == NULL) + break; + + special_comment_finish (); + + /* Skip whitespace. */ + do + c = phase4_getc (); + while (is_whitespace (c)); + + /* Expect a '=' or ';'. */ + if (c == UEOF) + { + po_xerror (PO_SEVERITY_ERROR, NULL, + real_file_name, gram_pos.line_number, (size_t)(-1), false, + _("warning: unterminated key/value pair")); + break; + } + if (c == ';') + { + /* "key"; is an abbreviation for "key"=""; and does not + necessarily designate an untranslated entry. */ + msgstr = xstrdup (""); + msgstr_pos = msgid_pos; + po_callback_message (NULL, msgid, &msgid_pos, NULL, + msgstr, strlen (msgstr) + 1, &msgstr_pos, + NULL, NULL, NULL, + false, next_is_obsolete); + } + else if (c == '=') + { + /* Read the value. */ + msgstr = read_string (&msgstr_pos); + if (msgstr == NULL) + { + po_xerror (PO_SEVERITY_ERROR, NULL, + real_file_name, gram_pos.line_number, (size_t)(-1), + false, _("warning: unterminated key/value pair")); + break; + } + + /* Skip whitespace. But for fuzzy key/value pairs, look for the + tentative msgstr in the form of a C style comment. */ + expect_fuzzy_msgstr_as_c_comment = next_is_fuzzy; + do + { + c = phase4_getc (); + if (fuzzy_msgstr != NULL) + expect_fuzzy_msgstr_as_c_comment = false; + } + while (is_whitespace (c)); + expect_fuzzy_msgstr_as_c_comment = false; + + /* Expect a ';'. */ + if (c == ';') + { + /* But for fuzzy key/value pairs, look for the tentative msgstr + in the form of a C++ style comment. */ + if (fuzzy_msgstr == NULL && next_is_fuzzy) + { + do + c = phase3_getc (); + while (c == ' '); + phase3_ungetc (c); + + expect_fuzzy_msgstr_as_cxx_comment = true; + c = phase4_getc (); + phase4_ungetc (c); + expect_fuzzy_msgstr_as_cxx_comment = false; + } + if (fuzzy_msgstr != NULL && strcmp (msgstr, msgid) == 0) + msgstr = fuzzy_msgstr; + + /* A key/value pair. */ + po_callback_message (NULL, msgid, &msgid_pos, NULL, + msgstr, strlen (msgstr) + 1, &msgstr_pos, + NULL, NULL, NULL, + false, next_is_obsolete); + } + else + { + po_xerror (PO_SEVERITY_ERROR, NULL, + real_file_name, gram_pos.line_number, (size_t)(-1), + false, _("\ +warning: syntax error, expected ';' after string")); + break; + } + } + else + { + po_xerror (PO_SEVERITY_ERROR, NULL, + real_file_name, gram_pos.line_number, (size_t)(-1), false, + _("\ +warning: syntax error, expected '=' or ';' after string")); + break; + } + } + + fp = NULL; + real_file_name = NULL; + gram_pos.line_number = 0; +} + +const struct catalog_input_format input_format_stringtable = +{ + stringtable_parse, /* parse */ + true /* produces_utf8 */ +}; |