summaryrefslogtreecommitdiff
path: root/gettext-tools/src/read-properties.c
diff options
context:
space:
mode:
author: Lorry Tar Creator <lorry-tar-importer@baserock.org> 2014-12-24 07:38:37 +0000
committer: <> 2015-02-02 12:02:29 +0000
commit: 482840e61f86ca321838a91e902c41d40c098bbb (patch)
tree: 01ea2e242fd2792d19fe192476601587901db794 /gettext-tools/src/read-properties.c
download: gettext-tarball-482840e61f86ca321838a91e902c41d40c098bbb.tar.gz
Imported from /home/lorry/working-area/delta_gettext-tarball/gettext-0.19.4.tar.xz.gettext-0.19.4
Diffstat (limited to 'gettext-tools/src/read-properties.c')
-rw-r--r-- gettext-tools/src/read-properties.c 560
1 files changed, 560 insertions, 0 deletions
diff --git a/gettext-tools/src/read-properties.c b/gettext-tools/src/read-properties.c
new file mode 100644
index 0000000..0c64730
--- /dev/null
+++ b/gettext-tools/src/read-properties.c
@@ -0,0 +1,560 @@
+/* Reading Java .properties files.
+ Copyright (C) 2003, 2005-2007, 2009 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2003.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+/* Specification. */
+#include "read-properties.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "error.h"
+#include "error-progname.h"
+#include "message.h"
+#include "read-catalog-abstract.h"
+#include "xalloc.h"
+#include "xvasprintf.h"
+#include "po-xerror.h"
+#include "msgl-ascii.h"
+#include "unistr.h"
+#include "gettext.h"
+
+#define _(str) gettext (str)
+
+/* For compiling this file in C++ mode. */
+#ifdef __cplusplus
+# define this thiss
+#endif
+
+
+/* The format of the Java .properties files is documented in the JDK
+ documentation for class java.util.Properties. In the case of .properties
+ files for PropertyResourceBundle, each non-comment line contains a
+ key/value pair in the form "key = value" or "key : value" or "key value",
+ where the key is the msgid and the value is the msgstr. Messages with
+ plurals are not supported in this format. */
+
+/* Handling of comments: We copy all comments from the .properties file to
+ the PO file. This is not really needed; it's a service for translators
+ who don't like PO files and prefer to maintain the .properties file. */
+
+/* Real filename, used in error messages about the input file.
+   Set by properties_parse for the duration of a parse and reset to NULL
+   when the parse is finished. */
+static const char *real_file_name;
+
+/* File name and line number.
+   Defined in another translation unit. The line number is initialized to 1
+   by properties_parse and kept up to date by phase2_getc/phase2_ungetc. */
+extern lex_pos_ty gram_pos;
+
+/* The input file stream.
+   Set by properties_parse for the duration of a parse and reset to NULL
+   when the parse is finished. */
+static FILE *fp;
+
+
+/* Phase 1: Fetch the next raw byte (an ISO-8859-1 character) from fp.
+   Returns EOF at end of input. A read error is reported through po_xerror
+   with fatal severity.
+   Max. 1 pushback character.  */
+
+static int
+phase1_getc ()
+{
+  int ch = getc (fp);
+
+  if (ch != EOF)
+    return ch;
+
+  /* EOF from getc means either end of file or a read error; only the
+     latter is worth a diagnostic.  */
+  if (ferror (fp))
+    {
+      /* Fetch the errno text first, before any other call can clobber
+         errno.  */
+      const char *errno_description = strerror (errno);
+      char *what =
+        xasprintf (_("error while reading \"%s\""), real_file_name);
+
+      po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
+                 xasprintf ("%s: %s", what, errno_description));
+    }
+
+  return EOF;
+}
+
+/* Push C back onto the input stream so that the next phase1_getc returns
+   it again. EOF is silently ignored.  */
+static inline void
+phase1_ungetc (int c)
+{
+  if (c == EOF)
+    return;
+
+  ungetc (c, fp);
+}
+
+
+/* Phase 2: Deliver ISO-8859-1 characters with every CR/LF pair folded into
+   a single LF, and keep gram_pos.line_number in sync with the newlines
+   handed out.
+   Max. 2 pushback characters.  */
+
+static unsigned char phase2_pushback[2];
+static int phase2_pushback_length;
+
+static int
+phase2_getc ()
+{
+  int ch;
+
+  if (phase2_pushback_length != 0)
+    {
+      /* Serve pushed-back characters first, most recent one first.  */
+      phase2_pushback_length--;
+      ch = phase2_pushback[phase2_pushback_length];
+    }
+  else
+    {
+      ch = phase1_getc ();
+
+      if (ch == '\r')
+        {
+          /* A CR only disappears when an LF follows it directly; a lone CR
+             is passed through unchanged.  */
+          int next = phase1_getc ();
+
+          if (next != '\n')
+            phase1_ungetc (next);
+          else
+            ch = next;
+        }
+    }
+
+  if (ch == '\n')
+    gram_pos.line_number++;
+
+  return ch;
+}
+
+/* Push C back so that the next phase2_getc returns it again. Undoes the
+   line counting done by phase2_getc for a newline. EOF is ignored.
+   The pushback buffer holds at most 2 characters; no bounds check is
+   performed here.  */
+static void
+phase2_ungetc (int c)
+{
+  if (c == EOF)
+    return;
+
+  if (c == '\n')
+    gram_pos.line_number--;
+
+  phase2_pushback[phase2_pushback_length] = c;
+  phase2_pushback_length++;
+}
+
+
+/* Phase 3: Deliver ISO-8859-1 characters as phase 2 does, and additionally
+   join continuation lines: a backslash immediately followed by a newline
+   is dropped together with the whitespace that starts the next line.
+   A backslash that is not followed by a newline is returned as is.
+   Max. 1 pushback character.  */
+
+static int
+phase3_getc ()
+{
+  int c = phase2_getc ();
+
+  while (c == '\\')
+    {
+      int next = phase2_getc ();
+
+      if (next != '\n')
+        {
+          /* An ordinary backslash: hand it out and keep what follows for
+             the next call.  */
+          phase2_ungetc (next);
+          break;
+        }
+
+      /* Skip the backslash-newline and all whitespace that follows it.
+         The first character after that may itself start another
+         continuation, hence the enclosing loop.  */
+      for (c = phase2_getc ();
+           c == ' ' || c == '\t' || c == '\r' || c == '\f';
+           c = phase2_getc ())
+        ;
+    }
+
+  return c;
+}
+
+/* Push C back so that the next phase3_getc returns it again.  */
+static inline void
+phase3_ungetc (int c)
+{
+  phase2_ungetc (c);
+}
+
+
+/* Phase 4: Read a UTF-16 code unit, treating CR/LF like a single LF,
+   with handling of continuation lines and of backslash escapes:
+   \t, \n, \r, \f yield the corresponding control character, \uxxxx yields
+   the code unit with hexadecimal value xxxx, and a backslash before any
+   other character yields that character.
+   Returns -1 at end of input.
+   If \u is not followed by four hexadecimal digits, a diagnostic is
+   issued, the offending character is pushed back, and 'u' is returned.  */
+
+static int
+phase4_getuc ()
+{
+  int c = phase3_getc ();
+
+  if (c == EOF)
+    return -1;
+  if (c != '\\')
+    return c;
+
+  /* C now holds the character after the backslash.  */
+  c = phase3_getc ();
+  switch (c)
+    {
+    case 't':
+      return '\t';
+    case 'n':
+      return '\n';
+    case 'r':
+      return '\r';
+    case 'f':
+      return '\f';
+    case 'u':
+      break;
+    default:
+      return c;
+    }
+
+  {
+    unsigned int value = 0;
+    int remaining;
+
+    for (remaining = 4; remaining > 0; remaining--)
+      {
+        int digit = phase3_getc ();
+        int nibble;
+
+        if (digit >= '0' && digit <= '9')
+          nibble = digit - '0';
+        else if (digit >= 'A' && digit <= 'F')
+          nibble = digit - 'A' + 10;
+        else if (digit >= 'a' && digit <= 'f')
+          nibble = digit - 'a' + 10;
+        else
+          {
+            phase3_ungetc (digit);
+            po_xerror (PO_SEVERITY_ERROR, NULL,
+                       real_file_name, gram_pos.line_number, (size_t)(-1),
+                       false, _("warning: invalid \\uxxxx syntax for Unicode character"));
+            return 'u';
+          }
+
+        value = (value << 4) + nibble;
+      }
+
+    return value;
+  }
+}
+
+
+/* Converts a string from ISO-8859-1 encoding to UTF-8 encoding.
+   If STRING consists of ASCII characters only, STRING itself is returned.
+   Otherwise the result is freshly allocated and STRING is left untouched;
+   in that case the result is distinct from STRING.  */
+static char *
+conv_from_iso_8859_1 (char *string)
+{
+  size_t length;
+  size_t i;
+  unsigned char *utf8_string;
+  unsigned char *out;
+
+  if (is_ascii_string (string))
+    return string;
+
+  length = strlen (string);
+  /* Each ISO-8859-1 character needs 2 bytes at worst.  */
+  utf8_string = XNMALLOC (2 * length + 1, unsigned char);
+  out = utf8_string;
+
+  for (i = 0; i < length; i++)
+    {
+      unsigned int uc = (unsigned char) string[i];
+      int n = u8_uctomb (out, uc, 6);
+
+      assert (n > 0);
+      out += n;
+    }
+  *out = '\0';
+  assert (out - utf8_string <= 2 * length);
+
+  return (char *) utf8_string;
+}
+
+
+/* Parses exactly four hexadecimal digits starting at S.
+   Returns true and stores their value in *VALUEP on success. Returns false
+   otherwise; in that case no byte beyond the first non-hexadecimal one is
+   read, so S may be shorter than four characters.  */
+static bool
+conv_from_java_hex4 (const char *s, unsigned int *valuep)
+{
+  unsigned int value = 0;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    {
+      int c1 = (unsigned char) s[i];
+
+      if (c1 >= '0' && c1 <= '9')
+        value = (value << 4) + (c1 - '0');
+      else if (c1 >= 'A' && c1 <= 'F')
+        value = (value << 4) + (c1 - 'A' + 10);
+      else if (c1 >= 'a' && c1 <= 'f')
+        value = (value << 4) + (c1 - 'a' + 10);
+      else
+        return false;
+    }
+
+  *valuep = value;
+  return true;
+}
+
+/* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
+   encoding. May destructively modify the argument string.
+   A \uxxxx\uyyyy pair denoting a high and a low surrogate is combined into
+   a single character. Escapes that do not denote an encodable character
+   (malformed hex digits, an unpaired high surrogate, a lone low surrogate)
+   are copied through literally, byte by byte.
+   Returns STRING.  */
+static char *
+conv_from_java (char *string)
+{
+  /* This conversion can only shrink the string, never increase its size:
+     a 6-byte escape produces at most 3 bytes, a 12-byte surrogate pair
+     produces 4 bytes. So there is no need to xmalloc the result freshly.  */
+  const char *p = string;
+  unsigned char *q = (unsigned char *) string;
+
+  while (*p != '\0')
+    {
+      unsigned int n;
+
+      if (p[0] == '\\' && p[1] == 'u' && conv_from_java_hex4 (p + 2, &n))
+        {
+          unsigned int uc = 0;
+          size_t consumed = 0;
+
+          if (n >= 0xd800 && n < 0xdc00)
+            {
+              unsigned int m;
+
+              /* A high surrogate is only meaningful when a low surrogate
+                 escape follows it immediately.  */
+              if (p[6] == '\\' && p[7] == 'u'
+                  && conv_from_java_hex4 (p + 8, &m)
+                  && m >= 0xdc00 && m < 0xe000)
+                {
+                  /* Combine two UTF-16 words to a character.  */
+                  uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
+                  consumed = 12;
+                }
+            }
+          else
+            {
+              uc = n;
+              consumed = 6;
+            }
+
+          if (consumed > 0)
+            {
+              /* u8_uctomb fails (returns a negative value) for code points
+                 that cannot be encoded, such as a lone low surrogate.
+                 Never advance Q by a failure code; fall back to copying
+                 the escape literally instead.  */
+              int count = u8_uctomb (q, uc, 6);
+
+              if (count > 0)
+                {
+                  q += count;
+                  p += consumed;
+                  continue;
+                }
+            }
+        }
+
+      /* Not a convertible escape: copy just one byte.  */
+      *q++ = (unsigned char) *p++;
+    }
+  *q = '\0';
+  return string;
+}
+
+
+/* Reads a key (IN_KEY true) or a value (IN_KEY false) from the input.
+   Returns the string as a freshly allocated UTF-8 string, or NULL if only
+   whitespace was found before the end of the logical line.
+   Where parsing stops:
+     - when NULL is returned: after the end of the logical line,
+     - for a key: after the whitespace, and the '=' or ':' separator if
+       there is one, that follow the key,
+     - for a value: after the end of the logical line.  */
+
+static char *
+read_escaped_string (bool in_key)
+{
+  static unsigned short *buffer;
+  static size_t bufmax;
+  static size_t buflen;
+  unsigned char *utf8_string;
+  unsigned char *out;
+  size_t pos;
+  int c;
+
+  /* Leading whitespace never belongs to the string.  */
+  for (;;)
+    {
+      c = phase3_getc ();
+      if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
+        break;
+    }
+
+  /* Empty string.  */
+  if (c == '\n' || c == EOF)
+    return NULL;
+
+  /* Accumulate the string as UTF-16 code units first and convert to UTF-8
+     only at the end: a string can contain surrogates like \uD800\uDF00,
+     which must be combined into a single UTF-8 character.  */
+  buflen = 0;
+  for (;;)
+    {
+      if (in_key
+          && (c == ' ' || c == '\t' || c == '\r' || c == '\f'
+              || c == '=' || c == ':'))
+        {
+          /* The key is complete. Skip the whitespace after it, then at
+             most one '=' or ':' separator.  */
+          while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
+            c = phase3_getc ();
+          if (c != '=' && c != ':')
+            phase3_ungetc (c);
+          break;
+        }
+
+      /* C was only a lookahead. Re-read it through phase 4, so that
+         escape sequences get decoded.  */
+      phase3_ungetc (c);
+      c = phase4_getuc ();
+      if (c < 0)
+        break;
+
+      /* Append the code unit, growing the buffer in steps of 100.  */
+      if (buflen >= bufmax)
+        {
+          bufmax += 100;
+          buffer = xrealloc (buffer, bufmax * sizeof (unsigned short));
+        }
+      buffer[buflen] = c;
+      buflen++;
+
+      c = phase3_getc ();
+      if (c == '\n' || c == EOF)
+        {
+          /* For a key, leave the line end in place, so that the following
+             read of the value sees it and returns NULL.  */
+          if (in_key)
+            phase3_ungetc (c);
+          break;
+        }
+    }
+
+  /* Now convert from UTF-16 to UTF-8.
+     Each UTF-16 word needs 3 bytes at worst.  */
+  utf8_string = XNMALLOC (3 * buflen + 1, unsigned char);
+  out = utf8_string;
+  pos = 0;
+  while (pos < buflen)
+    {
+      ucs4_t uc;
+      int n;
+
+      pos += u16_mbtouc (&uc, buffer + pos, buflen - pos);
+      n = u8_uctomb (out, uc, 6);
+      assert (n > 0);
+      out += n;
+    }
+  *out = '\0';
+  assert (out - utf8_string <= 3 * buflen);
+
+  return (char *) utf8_string;
+}
+
+
+/* Read a .properties file from a stream, and dispatch to the various
+   abstract_catalog_reader_class_ty methods.
+   FILE is the stream to read from. REAL_FILENAME is used in error messages
+   and as the file name of the recorded positions. THIS and
+   LOGICAL_FILENAME are not used by this function.
+   Each comment line is passed to po_callback_comment_dispatcher; each
+   key/value pair is passed to po_callback_message, with the key as msgid
+   and the value as msgstr.  */
+static void
+properties_parse (abstract_catalog_reader_ty *this, FILE *file,
+                  const char *real_filename, const char *logical_filename)
+{
+  fp = file;
+  real_file_name = real_filename;
+  gram_pos.file_name = xstrdup (real_file_name);
+  gram_pos.line_number = 1;
+
+  for (;;)
+    {
+      int c;
+      bool comment;
+      bool hidden;
+
+      c = phase2_getc ();
+
+      if (c == EOF)
+        break;
+
+      comment = false;
+      hidden = false;
+      if (c == '#')
+        comment = true;
+      else if (c == '!')
+        {
+          /* For compatibility with write-properties.c, we treat '!' not
+             followed by space as a fuzzy or untranslated message.  */
+          int c2 = phase2_getc ();
+          if (c2 == ' ' || c2 == '\n' || c2 == EOF)
+            comment = true;
+          else
+            hidden = true;
+          phase2_ungetc (c2);
+        }
+      else
+        phase2_ungetc (c);
+
+      if (comment)
+        {
+          /* A comment line. The line buffer is static and reused for
+             every comment line.  */
+          static char *buffer;
+          static size_t bufmax;
+          static size_t buflen;
+          char *utf8_comment;
+
+          buflen = 0;
+          for (;;)
+            {
+              c = phase2_getc ();
+
+              /* Grow before testing for the line end, so that there is
+                 always room for the terminating NUL.  */
+              if (buflen >= bufmax)
+                {
+                  bufmax += 100;
+                  buffer = xrealloc (buffer, bufmax);
+                }
+
+              if (c == EOF || c == '\n')
+                break;
+
+              buffer[buflen++] = c;
+            }
+          buffer[buflen] = '\0';
+
+          utf8_comment = conv_from_iso_8859_1 (buffer);
+          po_callback_comment_dispatcher (conv_from_java (utf8_comment));
+          /* For a non-ASCII comment, conv_from_iso_8859_1 returned a fresh
+             allocation rather than BUFFER. The dispatcher cannot retain
+             the pointer it is given, since in the ASCII case that pointer
+             is BUFFER, which is overwritten by the next comment line. So
+             release the temporary copy here instead of leaking it.  */
+          if (utf8_comment != buffer)
+            free (utf8_comment);
+        }
+      else
+        {
+          /* A key/value pair.  */
+          char *msgid;
+          lex_pos_ty msgid_pos;
+
+          msgid_pos = gram_pos;
+          msgid = read_escaped_string (true);
+          if (msgid == NULL)
+            /* Skip blank line.  */
+            ;
+          else
+            {
+              char *msgstr;
+              lex_pos_ty msgstr_pos;
+              bool force_fuzzy;
+
+              msgstr_pos = gram_pos;
+              msgstr = read_escaped_string (false);
+              if (msgstr == NULL)
+                msgstr = xstrdup ("");
+
+              /* Be sure to make the message fuzzy if it was commented out
+                 and if it is not already header/fuzzy/untranslated.  */
+              force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
+
+              po_callback_message (NULL, msgid, &msgid_pos, NULL,
+                                   msgstr, strlen (msgstr) + 1, &msgstr_pos,
+                                   NULL, NULL, NULL,
+                                   force_fuzzy, false);
+            }
+        }
+    }
+
+  fp = NULL;
+  real_file_name = NULL;
+  gram_pos.line_number = 0;
+}
+
+/* Input format descriptor for Java .properties files: it pairs the parser
+   properties_parse with the flag telling that the strings it delivers are
+   already in UTF-8 (read_escaped_string returns UTF-8 strings). */
+const struct catalog_input_format input_format_properties =
+{
+ properties_parse, /* parse */
+ true /* produces_utf8 */
+};