diff options
| | | |
|---|---|---|
| author | Bruno Haible <bruno@clisp.org> | 2023-03-12 01:39:57 +0100 |
| committer | Bruno Haible <bruno@clisp.org> | 2023-03-14 02:57:28 +0100 |
| commit | 589731cc1b49bfbb2467b102fba8e1afd2277eef (patch) | |
| tree | 335a8c3c8837dfc54af97e4645ed687fadd2a82a | |
| parent | 20e263b57a0bb446f1b4fbb0d4979a1db178be62 (diff) | |
| download | gettext-589731cc1b49bfbb2467b102fba8e1afd2277eef.tar.gz | |
xgettext: Fix abort when outputting a msgid that has an invalid UTF-8 character.
* gettext-tools/src/xg-encoding.c: Include unistr.h.
(non_utf8_error_message): New function.
(from_current_source_encoding): When xgettext_current_source_encoding is
"UTF-8", check that the string is well-formed UTF-8.
* gettext-tools/tests/xgettext-c-8: New file.
* gettext-tools/tests/xgettext-python-5: New file.
* gettext-tools/tests/xgettext-elisp-3: New file.
* gettext-tools/tests/xgettext-librep-3: New file.
* gettext-tools/tests/xgettext-awk-3: New file.
* gettext-tools/tests/xgettext-lua-3: New file.
* gettext-tools/tests/xgettext-vala-4: New file.
* gettext-tools/tests/xgettext-php-5: New file.
* gettext-tools/tests/Makefile.am (TESTS): Add them.
-rw-r--r-- | gettext-tools/src/xg-encoding.c | 54 |
-rw-r--r-- | gettext-tools/tests/Makefile.am | 15 |
-rwxr-xr-x | gettext-tools/tests/xgettext-awk-3 | 18 |
-rwxr-xr-x | gettext-tools/tests/xgettext-c-8 | 18 |
-rwxr-xr-x | gettext-tools/tests/xgettext-elisp-3 | 18 |
-rwxr-xr-x | gettext-tools/tests/xgettext-librep-3 | 18 |
-rwxr-xr-x | gettext-tools/tests/xgettext-lua-3 | 18 |
-rwxr-xr-x | gettext-tools/tests/xgettext-php-5 | 20 |
-rwxr-xr-x | gettext-tools/tests/xgettext-python-5 | 18 |
-rwxr-xr-x | gettext-tools/tests/xgettext-vala-4 | 18 |
10 files changed, 206 insertions, 9 deletions
diff --git a/gettext-tools/src/xg-encoding.c b/gettext-tools/src/xg-encoding.c index 11793368e..d06587c85 100644 --- a/gettext-tools/src/xg-encoding.c +++ b/gettext-tools/src/xg-encoding.c @@ -1,5 +1,5 @@ /* Keeping track of the encoding of strings to be extracted. - Copyright (C) 2001-2019 Free Software Foundation, Inc. + Copyright (C) 2001-2023 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,6 +27,7 @@ #include "msgl-ascii.h" #include "msgl-iconv.h" #include "po-charset.h" +#include "unistr.h" #include "xalloc.h" #include "xerror.h" #include "xvasprintf.h" @@ -90,6 +91,42 @@ non_ascii_error_message (lexical_context_ty lcontext, return errmsg; } +/* Error message about non-UTF-8 character in a specific lexical context. */ +static char * +non_utf8_error_message (lexical_context_ty lcontext, + const char *file_name, size_t line_number) +{ + char buffer[21]; + char *errmsg; + + if (line_number == (size_t)(-1)) + buffer[0] = '\0'; + else + sprintf (buffer, ":%ld", (long) line_number); + + switch (lcontext) + { + case lc_outside: + errmsg = + xasprintf (_("Character at %s%s is not UTF-8 encoded."), + file_name, buffer); + break; + case lc_comment: + errmsg = + xasprintf (_("Comment at or before %s%s is not UTF-8 encoded."), + file_name, buffer); + break; + case lc_string: + errmsg = + xasprintf (_("String at %s%s is not UTF-8 encoded."), + file_name, buffer); + break; + default: + abort (); + } + return errmsg; +} + /* Convert the given string from xgettext_current_source_encoding to the output file encoding (i.e. ASCII or UTF-8). The resulting string is either the argument string, or freshly allocated. 
@@ -112,7 +149,20 @@ from_current_source_encoding (const char *string, exit (EXIT_FAILURE); } } - else if (xgettext_current_source_encoding != po_charset_utf8) + else if (xgettext_current_source_encoding == po_charset_utf8) + { + if (u8_check ((uint8_t *) string, strlen (string)) != NULL) + { + multiline_error (xstrdup (""), + xasprintf ("%s\n%s\n", + non_utf8_error_message (lcontext, + file_name, + line_number), + _("Please specify the source encoding through --from-code."))); + exit (EXIT_FAILURE); + } + } + else { #if HAVE_ICONV struct conversion_context context; diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index a9595dfc3..b0d96c83e 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -82,10 +82,10 @@ TESTS = gettext-1 gettext-2 \ xgettext-13 xgettext-14 xgettext-15 xgettext-16 xgettext-17 \ xgettext-18 \ xgettext-appdata-1 \ - xgettext-awk-1 xgettext-awk-2 \ + xgettext-awk-1 xgettext-awk-2 xgettext-awk-3 \ xgettext-awk-stackovfl-1 xgettext-awk-stackovfl-2 \ xgettext-c-2 xgettext-c-3 xgettext-c-4 xgettext-c-5 xgettext-c-6 \ - xgettext-c-7 \ + xgettext-c-7 xgettext-c-8 \ xgettext-c-comment-1 xgettext-c-comment-2 xgettext-c-comment-3 \ xgettext-c-comment-4 xgettext-c-comment-5 xgettext-c-comment-6 \ xgettext-c-escape-1 xgettext-c-escape-2 xgettext-c-escape-3 \ @@ -100,7 +100,7 @@ TESTS = gettext-1 gettext-2 \ xgettext-csharp-stackovfl-1 xgettext-csharp-stackovfl-2 \ xgettext-csharp-stackovfl-3 xgettext-csharp-stackovfl-4 \ xgettext-desktop-1 xgettext-desktop-2 \ - xgettext-elisp-1 xgettext-elisp-2 \ + xgettext-elisp-1 xgettext-elisp-2 xgettext-elisp-3 \ xgettext-elisp-stackovfl-1 xgettext-elisp-stackovfl-2 \ xgettext-elisp-stackovfl-3 xgettext-elisp-stackovfl-4 \ xgettext-glade-1 xgettext-glade-2 xgettext-glade-3 xgettext-glade-4 \ @@ -118,11 +118,11 @@ TESTS = gettext-1 gettext-2 \ xgettext-javascript-stackovfl-3 xgettext-javascript-stackovfl-4 \ xgettext-javascript-stackovfl-5 
xgettext-javascript-stackovfl-6 \ xgettext-javascript-stackovfl-7 xgettext-javascript-stackovfl-8 \ - xgettext-librep-1 xgettext-librep-2 \ + xgettext-librep-1 xgettext-librep-2 xgettext-librep-3 \ xgettext-librep-stackovfl-1 xgettext-librep-stackovfl-2 \ xgettext-lisp-1 xgettext-lisp-2 \ xgettext-lisp-stackovfl-1 xgettext-lisp-stackovfl-2 \ - xgettext-lua-1 xgettext-lua-2 \ + xgettext-lua-1 xgettext-lua-2 xgettext-lua-3 \ xgettext-lua-stackovfl-1 xgettext-lua-stackovfl-2 \ xgettext-lua-stackovfl-3 xgettext-lua-stackovfl-4 \ xgettext-objc-1 xgettext-objc-2 \ @@ -131,6 +131,7 @@ TESTS = gettext-1 gettext-2 \ xgettext-perl-stackovfl-1 xgettext-perl-stackovfl-2 \ xgettext-perl-stackovfl-3 xgettext-perl-stackovfl-4 \ xgettext-php-1 xgettext-php-2 xgettext-php-3 xgettext-php-4 \ + xgettext-php-5 \ xgettext-php-stackovfl-1 xgettext-php-stackovfl-2 \ xgettext-php-stackovfl-3 xgettext-php-stackovfl-4 \ xgettext-po-1 xgettext-po-2 xgettext-po-3 xgettext-po-4 \ @@ -138,7 +139,7 @@ TESTS = gettext-1 gettext-2 \ xgettext-properties-4 \ xgettext-rst-1 xgettext-rst-2 \ xgettext-python-1 xgettext-python-2 xgettext-python-3 \ - xgettext-python-4 \ + xgettext-python-4 xgettext-python-5 \ xgettext-python-stackovfl-1 xgettext-python-stackovfl-2 \ xgettext-python-stackovfl-3 xgettext-python-stackovfl-4 \ xgettext-ruby-1 \ @@ -155,7 +156,7 @@ TESTS = gettext-1 gettext-2 \ xgettext-tcl-1 xgettext-tcl-2 xgettext-tcl-3 xgettext-tcl-4 \ xgettext-tcl-stackovfl-1 xgettext-tcl-stackovfl-2 \ xgettext-tcl-stackovfl-3 xgettext-tcl-stackovfl-4 \ - xgettext-vala-1 xgettext-vala-2 xgettext-vala-3 \ + xgettext-vala-1 xgettext-vala-2 xgettext-vala-3 xgettext-vala-4 \ xgettext-vala-stackovfl-1 xgettext-vala-stackovfl-2 \ xgettext-ycp-1 xgettext-ycp-2 xgettext-ycp-3 xgettext-ycp-4 \ xgettext-ycp-stackovfl-1 xgettext-ycp-stackovfl-2 \ diff --git a/gettext-tools/tests/xgettext-awk-3 b/gettext-tools/tests/xgettext-awk-3 new file mode 100755 index 000000000..c52151792 --- /dev/null +++ 
b/gettext-tools/tests/xgettext-awk-3 @@ -0,0 +1,18 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test awk support: strings with hexadecimal escape sequences that are +# invalid UTF-8. + +cat <<\EOF > xg-a-3.awk +_"\xE0" +EOF + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-a-3.tmp xg-a-3.awk 2>xg-a-3.err +result=$? +cat xg-a-3.err +test $result = 1 || Exit 1 +grep 'is not UTF-8 encoded' xg-a-3.err >/dev/null || Exit 1 + +exit 0 diff --git a/gettext-tools/tests/xgettext-c-8 b/gettext-tools/tests/xgettext-c-8 new file mode 100755 index 000000000..6a828b5e6 --- /dev/null +++ b/gettext-tools/tests/xgettext-c-8 @@ -0,0 +1,18 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test C support: strings with hexadecimal escape sequences that are +# invalid UTF-8. + +cat <<\EOF > xg-c-8.c +gettext("\xE0") +EOF + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-c-8.tmp xg-c-8.c 2>xg-c-8.err +result=$? +cat xg-c-8.err +test $result = 1 || Exit 1 +grep 'is not UTF-8 encoded' xg-c-8.err >/dev/null || Exit 1 + +exit 0 diff --git a/gettext-tools/tests/xgettext-elisp-3 b/gettext-tools/tests/xgettext-elisp-3 new file mode 100755 index 000000000..6d27e148d --- /dev/null +++ b/gettext-tools/tests/xgettext-elisp-3 @@ -0,0 +1,18 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test EmacsLisp support: strings with hexadecimal escape sequences that are +# invalid UTF-8. + +cat <<\EOF > xg-el-3.el +(_ "\xE0") +EOF + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-el-3.tmp xg-el-3.el 2>xg-el-3.err +result=$? 
+cat xg-el-3.err +test $result = 1 || Exit 1 +grep 'is not UTF-8 encoded' xg-el-3.err >/dev/null || Exit 1 + +exit 0 diff --git a/gettext-tools/tests/xgettext-librep-3 b/gettext-tools/tests/xgettext-librep-3 new file mode 100755 index 000000000..05d459fdc --- /dev/null +++ b/gettext-tools/tests/xgettext-librep-3 @@ -0,0 +1,18 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test librep support: strings with hexadecimal escape sequences that are +# invalid UTF-8. + +cat <<\EOF > xg-lr-3.jl +(_ "\xE0") +EOF + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-lr-3.tmp xg-lr-3.jl 2>xg-lr-3.err +result=$? +cat xg-lr-3.err +test $result = 1 || Exit 1 +grep 'is not UTF-8 encoded' xg-lr-3.err >/dev/null || Exit 1 + +exit 0 diff --git a/gettext-tools/tests/xgettext-lua-3 b/gettext-tools/tests/xgettext-lua-3 new file mode 100755 index 000000000..bd736edfe --- /dev/null +++ b/gettext-tools/tests/xgettext-lua-3 @@ -0,0 +1,18 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test Lua support: strings with hexadecimal escape sequences that are +# invalid UTF-8. + +cat <<\EOF > xg-lu-3.lua +_("\xE0") +EOF + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-lu-3.tmp xg-lu-3.lua 2>xg-lu-3.err +result=$? +cat xg-lu-3.err +test $result = 1 || Exit 1 +grep 'is not UTF-8 encoded' xg-lu-3.err >/dev/null || Exit 1 + +exit 0 diff --git a/gettext-tools/tests/xgettext-php-5 b/gettext-tools/tests/xgettext-php-5 new file mode 100755 index 000000000..6c8a8e044 --- /dev/null +++ b/gettext-tools/tests/xgettext-php-5 @@ -0,0 +1,20 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test PHP support: strings with hexadecimal escape sequences that are +# invalid UTF-8. + +cat <<\EOF > xg-ph-3.php +<? 
+_("\xE0") +?> +EOF + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-ph-3.tmp xg-ph-3.php 2>xg-ph-3.err +result=$? +cat xg-ph-3.err +test $result = 1 || Exit 1 +grep 'is not UTF-8 encoded' xg-ph-3.err >/dev/null || Exit 1 + +exit 0 diff --git a/gettext-tools/tests/xgettext-python-5 b/gettext-tools/tests/xgettext-python-5 new file mode 100755 index 000000000..f480b4a28 --- /dev/null +++ b/gettext-tools/tests/xgettext-python-5 @@ -0,0 +1,18 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test Python support: strings with hexadecimal escape sequences that are +# invalid UTF-8. + +cat <<\EOF > xg-py-5.py +_("\xE0") +EOF + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-py-5.tmp xg-py-5.py 2>xg-py-5.err +result=$? +cat xg-py-5.err +test $result = 1 || Exit 1 +grep 'is not UTF-8 encoded' xg-py-5.err >/dev/null || Exit 1 + +exit 0 diff --git a/gettext-tools/tests/xgettext-vala-4 b/gettext-tools/tests/xgettext-vala-4 new file mode 100755 index 000000000..be1910c9b --- /dev/null +++ b/gettext-tools/tests/xgettext-vala-4 @@ -0,0 +1,18 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test Vala support: strings with hexadecimal escape sequences that are +# invalid UTF-8. + +cat <<\EOF > xg-vala-4.vala +_("\xE0") +EOF + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --from-code=UTF-8 -d xg-vala-4.tmp xg-vala-4.vala 2>xg-vala-4.err +result=$? +cat xg-vala-4.err +test $result = 1 || Exit 1 +grep 'is not UTF-8 encoded' xg-vala-4.err >/dev/null || Exit 1 + +exit 0 |