summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Bruno Haible <bruno@clisp.org> 2023-03-12 01:39:57 +0100
committer Bruno Haible <bruno@clisp.org> 2023-03-14 02:57:28 +0100
commit 589731cc1b49bfbb2467b102fba8e1afd2277eef (patch)
tree 335a8c3c8837dfc54af97e4645ed687fadd2a82a
parent 20e263b57a0bb446f1b4fbb0d4979a1db178be62 (diff)
download gettext-589731cc1b49bfbb2467b102fba8e1afd2277eef.tar.gz
xgettext: Fix abort when outputting a msgid that has an invalid UTF-8 character.
* gettext-tools/src/xg-encoding.c: Include unistr.h.
(non_utf8_error_message): New function.
(from_current_source_encoding): When xgettext_current_source_encoding is
"UTF-8", check that the string is well-formed UTF-8.
* gettext-tools/tests/xgettext-c-8: New file.
* gettext-tools/tests/xgettext-python-5: New file.
* gettext-tools/tests/xgettext-elisp-3: New file.
* gettext-tools/tests/xgettext-librep-3: New file.
* gettext-tools/tests/xgettext-awk-3: New file.
* gettext-tools/tests/xgettext-lua-3: New file.
* gettext-tools/tests/xgettext-vala-4: New file.
* gettext-tools/tests/xgettext-php-5: New file.
* gettext-tools/tests/Makefile.am (TESTS): Add them.
-rw-r--r-- gettext-tools/src/xg-encoding.c 54
-rw-r--r-- gettext-tools/tests/Makefile.am 15
-rwxr-xr-x gettext-tools/tests/xgettext-awk-3 18
-rwxr-xr-x gettext-tools/tests/xgettext-c-8 18
-rwxr-xr-x gettext-tools/tests/xgettext-elisp-3 18
-rwxr-xr-x gettext-tools/tests/xgettext-librep-3 18
-rwxr-xr-x gettext-tools/tests/xgettext-lua-3 18
-rwxr-xr-x gettext-tools/tests/xgettext-php-5 20
-rwxr-xr-x gettext-tools/tests/xgettext-python-5 18
-rwxr-xr-x gettext-tools/tests/xgettext-vala-4 18
10 files changed, 206 insertions, 9 deletions
diff --git a/gettext-tools/src/xg-encoding.c b/gettext-tools/src/xg-encoding.c
index 11793368e..d06587c85 100644
--- a/gettext-tools/src/xg-encoding.c
+++ b/gettext-tools/src/xg-encoding.c
@@ -1,5 +1,5 @@
/* Keeping track of the encoding of strings to be extracted.
- Copyright (C) 2001-2019 Free Software Foundation, Inc.
+ Copyright (C) 2001-2023 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -27,6 +27,7 @@
#include "msgl-ascii.h"
#include "msgl-iconv.h"
#include "po-charset.h"
+#include "unistr.h"
#include "xalloc.h"
#include "xerror.h"
#include "xvasprintf.h"
@@ -90,6 +91,42 @@ non_ascii_error_message (lexical_context_ty lcontext,
return errmsg;
}
+/* Error message about non-UTF-8 character in a specific lexical context.
+   LCONTEXT selects the wording ("Character at ...", "Comment at or
+   before ..." or "String at ..."); any other value aborts the program.
+   FILE_NAME and LINE_NUMBER give the position that the message mentions;
+   a LINE_NUMBER of (size_t)(-1) means that only FILE_NAME is mentioned.
+   Returns the message as built by xasprintf.  */
+static char *
+non_utf8_error_message (lexical_context_ty lcontext,
+                        const char *file_name, size_t line_number)
+{
+  /* Room for ':', the up to 20 decimal digits of a 64-bit unsigned long,
+     and the terminating NUL.  */
+  char buffer[22];
+  char *errmsg;
+
+  if (line_number == (size_t)(-1))
+    buffer[0] = '\0';
+  else
+    /* Format as unsigned, so that a huge line number is not shown as a
+       negative one, and bound the write to the size of the buffer.  */
+    snprintf (buffer, sizeof (buffer), ":%lu", (unsigned long) line_number);
+
+  switch (lcontext)
+    {
+    case lc_outside:
+      errmsg =
+        xasprintf (_("Character at %s%s is not UTF-8 encoded."),
+                   file_name, buffer);
+      break;
+    case lc_comment:
+      errmsg =
+        xasprintf (_("Comment at or before %s%s is not UTF-8 encoded."),
+                   file_name, buffer);
+      break;
+    case lc_string:
+      errmsg =
+        xasprintf (_("String at %s%s is not UTF-8 encoded."),
+                   file_name, buffer);
+      break;
+    default:
+      /* Not a lexical context this function knows how to describe.  */
+      abort ();
+    }
+  return errmsg;
+}
+
/* Convert the given string from xgettext_current_source_encoding to
the output file encoding (i.e. ASCII or UTF-8).
The resulting string is either the argument string, or freshly allocated.
@@ -112,7 +149,20 @@ from_current_source_encoding (const char *string,
exit (EXIT_FAILURE);
}
}
- else if (xgettext_current_source_encoding != po_charset_utf8)
+ else if (xgettext_current_source_encoding == po_charset_utf8)
+ {
+ if (u8_check ((uint8_t *) string, strlen (string)) != NULL)
+ {
+ multiline_error (xstrdup (""),
+ xasprintf ("%s\n%s\n",
+ non_utf8_error_message (lcontext,
+ file_name,
+ line_number),
+ _("Please specify the source encoding through --from-code.")));
+ exit (EXIT_FAILURE);
+ }
+ }
+ else
{
#if HAVE_ICONV
struct conversion_context context;
diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am
index a9595dfc3..b0d96c83e 100644
--- a/gettext-tools/tests/Makefile.am
+++ b/gettext-tools/tests/Makefile.am
@@ -82,10 +82,10 @@ TESTS = gettext-1 gettext-2 \
xgettext-13 xgettext-14 xgettext-15 xgettext-16 xgettext-17 \
xgettext-18 \
xgettext-appdata-1 \
- xgettext-awk-1 xgettext-awk-2 \
+ xgettext-awk-1 xgettext-awk-2 xgettext-awk-3 \
xgettext-awk-stackovfl-1 xgettext-awk-stackovfl-2 \
xgettext-c-2 xgettext-c-3 xgettext-c-4 xgettext-c-5 xgettext-c-6 \
- xgettext-c-7 \
+ xgettext-c-7 xgettext-c-8 \
xgettext-c-comment-1 xgettext-c-comment-2 xgettext-c-comment-3 \
xgettext-c-comment-4 xgettext-c-comment-5 xgettext-c-comment-6 \
xgettext-c-escape-1 xgettext-c-escape-2 xgettext-c-escape-3 \
@@ -100,7 +100,7 @@ TESTS = gettext-1 gettext-2 \
xgettext-csharp-stackovfl-1 xgettext-csharp-stackovfl-2 \
xgettext-csharp-stackovfl-3 xgettext-csharp-stackovfl-4 \
xgettext-desktop-1 xgettext-desktop-2 \
- xgettext-elisp-1 xgettext-elisp-2 \
+ xgettext-elisp-1 xgettext-elisp-2 xgettext-elisp-3 \
xgettext-elisp-stackovfl-1 xgettext-elisp-stackovfl-2 \
xgettext-elisp-stackovfl-3 xgettext-elisp-stackovfl-4 \
xgettext-glade-1 xgettext-glade-2 xgettext-glade-3 xgettext-glade-4 \
@@ -118,11 +118,11 @@ TESTS = gettext-1 gettext-2 \
xgettext-javascript-stackovfl-3 xgettext-javascript-stackovfl-4 \
xgettext-javascript-stackovfl-5 xgettext-javascript-stackovfl-6 \
xgettext-javascript-stackovfl-7 xgettext-javascript-stackovfl-8 \
- xgettext-librep-1 xgettext-librep-2 \
+ xgettext-librep-1 xgettext-librep-2 xgettext-librep-3 \
xgettext-librep-stackovfl-1 xgettext-librep-stackovfl-2 \
xgettext-lisp-1 xgettext-lisp-2 \
xgettext-lisp-stackovfl-1 xgettext-lisp-stackovfl-2 \
- xgettext-lua-1 xgettext-lua-2 \
+ xgettext-lua-1 xgettext-lua-2 xgettext-lua-3 \
xgettext-lua-stackovfl-1 xgettext-lua-stackovfl-2 \
xgettext-lua-stackovfl-3 xgettext-lua-stackovfl-4 \
xgettext-objc-1 xgettext-objc-2 \
@@ -131,6 +131,7 @@ TESTS = gettext-1 gettext-2 \
xgettext-perl-stackovfl-1 xgettext-perl-stackovfl-2 \
xgettext-perl-stackovfl-3 xgettext-perl-stackovfl-4 \
xgettext-php-1 xgettext-php-2 xgettext-php-3 xgettext-php-4 \
+ xgettext-php-5 \
xgettext-php-stackovfl-1 xgettext-php-stackovfl-2 \
xgettext-php-stackovfl-3 xgettext-php-stackovfl-4 \
xgettext-po-1 xgettext-po-2 xgettext-po-3 xgettext-po-4 \
@@ -138,7 +139,7 @@ TESTS = gettext-1 gettext-2 \
xgettext-properties-4 \
xgettext-rst-1 xgettext-rst-2 \
xgettext-python-1 xgettext-python-2 xgettext-python-3 \
- xgettext-python-4 \
+ xgettext-python-4 xgettext-python-5 \
xgettext-python-stackovfl-1 xgettext-python-stackovfl-2 \
xgettext-python-stackovfl-3 xgettext-python-stackovfl-4 \
xgettext-ruby-1 \
@@ -155,7 +156,7 @@ TESTS = gettext-1 gettext-2 \
xgettext-tcl-1 xgettext-tcl-2 xgettext-tcl-3 xgettext-tcl-4 \
xgettext-tcl-stackovfl-1 xgettext-tcl-stackovfl-2 \
xgettext-tcl-stackovfl-3 xgettext-tcl-stackovfl-4 \
- xgettext-vala-1 xgettext-vala-2 xgettext-vala-3 \
+ xgettext-vala-1 xgettext-vala-2 xgettext-vala-3 xgettext-vala-4 \
xgettext-vala-stackovfl-1 xgettext-vala-stackovfl-2 \
xgettext-ycp-1 xgettext-ycp-2 xgettext-ycp-3 xgettext-ycp-4 \
xgettext-ycp-stackovfl-1 xgettext-ycp-stackovfl-2 \
diff --git a/gettext-tools/tests/xgettext-awk-3 b/gettext-tools/tests/xgettext-awk-3
new file mode 100755
index 000000000..c52151792
--- /dev/null
+++ b/gettext-tools/tests/xgettext-awk-3
@@ -0,0 +1,18 @@
+#! /bin/sh
+# Source the shared test helpers; path_prepend_ presumably puts . and ../src
+# at the front of PATH (defined outside this file -- confirm in init.sh).
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test awk support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+# Expected outcome: xgettext exits with status 1 and prints a diagnostic
+# containing 'is not UTF-8 encoded'.
+
+# The here-document delimiter is quoted (\EOF), so the shell writes the
+# backslash escape to the file verbatim.
+cat <<\EOF > xg-a-3.awk
+_"\xE0"
+EOF
+
+# Default to plain 'xgettext' unless XGETTEXT is already set.
+: ${XGETTEXT=xgettext}
+# LANGUAGE is emptied and LC_ALL=C is set, presumably so that the diagnostic
+# is not translated and the grep below can match it.  stderr is captured.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-a-3.tmp xg-a-3.awk 2>xg-a-3.err
+# Save the exit status immediately; the next command would overwrite $?.
+result=$?
+# Echo the captured diagnostics to stdout.
+cat xg-a-3.err
+# The run must have failed with exit status 1 ...
+test $result = 1 || Exit 1
+# ... and must have reported the invalid UTF-8.
+grep 'is not UTF-8 encoded' xg-a-3.err >/dev/null || Exit 1
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-c-8 b/gettext-tools/tests/xgettext-c-8
new file mode 100755
index 000000000..6a828b5e6
--- /dev/null
+++ b/gettext-tools/tests/xgettext-c-8
@@ -0,0 +1,18 @@
+#! /bin/sh
+# Source the shared test helpers; path_prepend_ presumably puts . and ../src
+# at the front of PATH (defined outside this file -- confirm in init.sh).
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test C support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+# Expected outcome: xgettext exits with status 1 and prints a diagnostic
+# containing 'is not UTF-8 encoded'.
+
+# The here-document delimiter is quoted (\EOF), so the shell writes the
+# backslash escape to the file verbatim.
+cat <<\EOF > xg-c-8.c
+gettext("\xE0")
+EOF
+
+# Default to plain 'xgettext' unless XGETTEXT is already set.
+: ${XGETTEXT=xgettext}
+# LANGUAGE is emptied and LC_ALL=C is set, presumably so that the diagnostic
+# is not translated and the grep below can match it.  stderr is captured.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-c-8.tmp xg-c-8.c 2>xg-c-8.err
+# Save the exit status immediately; the next command would overwrite $?.
+result=$?
+# Echo the captured diagnostics to stdout.
+cat xg-c-8.err
+# The run must have failed with exit status 1 ...
+test $result = 1 || Exit 1
+# ... and must have reported the invalid UTF-8.
+grep 'is not UTF-8 encoded' xg-c-8.err >/dev/null || Exit 1
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-elisp-3 b/gettext-tools/tests/xgettext-elisp-3
new file mode 100755
index 000000000..6d27e148d
--- /dev/null
+++ b/gettext-tools/tests/xgettext-elisp-3
@@ -0,0 +1,18 @@
+#! /bin/sh
+# Source the shared test helpers; path_prepend_ presumably puts . and ../src
+# at the front of PATH (defined outside this file -- confirm in init.sh).
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test EmacsLisp support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+# Expected outcome: xgettext exits with status 1 and prints a diagnostic
+# containing 'is not UTF-8 encoded'.
+
+# The here-document delimiter is quoted (\EOF), so the shell writes the
+# backslash escape to the file verbatim.
+cat <<\EOF > xg-el-3.el
+(_ "\xE0")
+EOF
+
+# Default to plain 'xgettext' unless XGETTEXT is already set.
+: ${XGETTEXT=xgettext}
+# LANGUAGE is emptied and LC_ALL=C is set, presumably so that the diagnostic
+# is not translated and the grep below can match it.  stderr is captured.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-el-3.tmp xg-el-3.el 2>xg-el-3.err
+# Save the exit status immediately; the next command would overwrite $?.
+result=$?
+# Echo the captured diagnostics to stdout.
+cat xg-el-3.err
+# The run must have failed with exit status 1 ...
+test $result = 1 || Exit 1
+# ... and must have reported the invalid UTF-8.
+grep 'is not UTF-8 encoded' xg-el-3.err >/dev/null || Exit 1
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-librep-3 b/gettext-tools/tests/xgettext-librep-3
new file mode 100755
index 000000000..05d459fdc
--- /dev/null
+++ b/gettext-tools/tests/xgettext-librep-3
@@ -0,0 +1,18 @@
+#! /bin/sh
+# Source the shared test helpers; path_prepend_ presumably puts . and ../src
+# at the front of PATH (defined outside this file -- confirm in init.sh).
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test librep support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+# Expected outcome: xgettext exits with status 1 and prints a diagnostic
+# containing 'is not UTF-8 encoded'.
+
+# The here-document delimiter is quoted (\EOF), so the shell writes the
+# backslash escape to the file verbatim.
+cat <<\EOF > xg-lr-3.jl
+(_ "\xE0")
+EOF
+
+# Default to plain 'xgettext' unless XGETTEXT is already set.
+: ${XGETTEXT=xgettext}
+# LANGUAGE is emptied and LC_ALL=C is set, presumably so that the diagnostic
+# is not translated and the grep below can match it.  stderr is captured.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-lr-3.tmp xg-lr-3.jl 2>xg-lr-3.err
+# Save the exit status immediately; the next command would overwrite $?.
+result=$?
+# Echo the captured diagnostics to stdout.
+cat xg-lr-3.err
+# The run must have failed with exit status 1 ...
+test $result = 1 || Exit 1
+# ... and must have reported the invalid UTF-8.
+grep 'is not UTF-8 encoded' xg-lr-3.err >/dev/null || Exit 1
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-lua-3 b/gettext-tools/tests/xgettext-lua-3
new file mode 100755
index 000000000..bd736edfe
--- /dev/null
+++ b/gettext-tools/tests/xgettext-lua-3
@@ -0,0 +1,18 @@
+#! /bin/sh
+# Source the shared test helpers; path_prepend_ presumably puts . and ../src
+# at the front of PATH (defined outside this file -- confirm in init.sh).
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test Lua support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+# Expected outcome: xgettext exits with status 1 and prints a diagnostic
+# containing 'is not UTF-8 encoded'.
+
+# The here-document delimiter is quoted (\EOF), so the shell writes the
+# backslash escape to the file verbatim.
+cat <<\EOF > xg-lu-3.lua
+_("\xE0")
+EOF
+
+# Default to plain 'xgettext' unless XGETTEXT is already set.
+: ${XGETTEXT=xgettext}
+# LANGUAGE is emptied and LC_ALL=C is set, presumably so that the diagnostic
+# is not translated and the grep below can match it.  stderr is captured.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-lu-3.tmp xg-lu-3.lua 2>xg-lu-3.err
+# Save the exit status immediately; the next command would overwrite $?.
+result=$?
+# Echo the captured diagnostics to stdout.
+cat xg-lu-3.err
+# The run must have failed with exit status 1 ...
+test $result = 1 || Exit 1
+# ... and must have reported the invalid UTF-8.
+grep 'is not UTF-8 encoded' xg-lu-3.err >/dev/null || Exit 1
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-php-5 b/gettext-tools/tests/xgettext-php-5
new file mode 100755
index 000000000..6c8a8e044
--- /dev/null
+++ b/gettext-tools/tests/xgettext-php-5
@@ -0,0 +1,20 @@
+#! /bin/sh
+# Source the shared test helpers; path_prepend_ presumably puts . and ../src
+# at the front of PATH (defined outside this file -- confirm in init.sh).
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test PHP support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+# Expected outcome: xgettext exits with status 1 and prints a diagnostic
+# containing 'is not UTF-8 encoded'.
+
+# The here-document delimiter is quoted (\EOF), so the shell writes the
+# backslash escape to the file verbatim.
+cat <<\EOF > xg-ph-5.php
+<?
+_("\xE0")
+?>
+EOF
+
+# Default to plain 'xgettext' unless XGETTEXT is already set.
+: ${XGETTEXT=xgettext}
+# LANGUAGE is emptied and LC_ALL=C is set, presumably so that the diagnostic
+# is not translated and the grep below can match it.  stderr is captured.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-ph-5.tmp xg-ph-5.php 2>xg-ph-5.err
+# Save the exit status immediately; the next command would overwrite $?.
+result=$?
+# Echo the captured diagnostics to stdout.
+cat xg-ph-5.err
+# The run must have failed with exit status 1 ...
+test $result = 1 || Exit 1
+# ... and must have reported the invalid UTF-8.
+grep 'is not UTF-8 encoded' xg-ph-5.err >/dev/null || Exit 1
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-python-5 b/gettext-tools/tests/xgettext-python-5
new file mode 100755
index 000000000..f480b4a28
--- /dev/null
+++ b/gettext-tools/tests/xgettext-python-5
@@ -0,0 +1,18 @@
+#! /bin/sh
+# Source the shared test helpers; path_prepend_ presumably puts . and ../src
+# at the front of PATH (defined outside this file -- confirm in init.sh).
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test Python support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+# Expected outcome: xgettext exits with status 1 and prints a diagnostic
+# containing 'is not UTF-8 encoded'.
+
+# The here-document delimiter is quoted (\EOF), so the shell writes the
+# backslash escape to the file verbatim.
+cat <<\EOF > xg-py-5.py
+_("\xE0")
+EOF
+
+# Default to plain 'xgettext' unless XGETTEXT is already set.
+: ${XGETTEXT=xgettext}
+# LANGUAGE is emptied and LC_ALL=C is set, presumably so that the diagnostic
+# is not translated and the grep below can match it.  stderr is captured.
+# NOTE(review): no --from-code=UTF-8 is passed here, unlike in the tests for
+# the other languages -- presumably intentional for Python; confirm.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-py-5.tmp xg-py-5.py 2>xg-py-5.err
+# Save the exit status immediately; the next command would overwrite $?.
+result=$?
+# Echo the captured diagnostics to stdout.
+cat xg-py-5.err
+# The run must have failed with exit status 1 ...
+test $result = 1 || Exit 1
+# ... and must have reported the invalid UTF-8.
+grep 'is not UTF-8 encoded' xg-py-5.err >/dev/null || Exit 1
+
+exit 0
diff --git a/gettext-tools/tests/xgettext-vala-4 b/gettext-tools/tests/xgettext-vala-4
new file mode 100755
index 000000000..be1910c9b
--- /dev/null
+++ b/gettext-tools/tests/xgettext-vala-4
@@ -0,0 +1,18 @@
+#! /bin/sh
+# Source the shared test helpers; path_prepend_ presumably puts . and ../src
+# at the front of PATH (defined outside this file -- confirm in init.sh).
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test Vala support: strings with hexadecimal escape sequences that are
+# invalid UTF-8.
+# Expected outcome: xgettext exits with status 1 and prints a diagnostic
+# containing 'is not UTF-8 encoded'.
+
+# The here-document delimiter is quoted (\EOF), so the shell writes the
+# backslash escape to the file verbatim.
+cat <<\EOF > xg-vala-4.vala
+_("\xE0")
+EOF
+
+# Default to plain 'xgettext' unless XGETTEXT is already set.
+: ${XGETTEXT=xgettext}
+# LANGUAGE is emptied and LC_ALL=C is set, presumably so that the diagnostic
+# is not translated and the grep below can match it.  stderr is captured.
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-vala-4.tmp xg-vala-4.vala 2>xg-vala-4.err
+# Save the exit status immediately; the next command would overwrite $?.
+result=$?
+# Echo the captured diagnostics to stdout.
+cat xg-vala-4.err
+# The run must have failed with exit status 1 ...
+test $result = 1 || Exit 1
+# ... and must have reported the invalid UTF-8.
+grep 'is not UTF-8 encoded' xg-vala-4.err >/dev/null || Exit 1
+
+exit 0