diff options
author | Bruno Haible <bruno@clisp.org> | 2023-03-15 21:57:56 +0100 |
---|---|---|
committer | Bruno Haible <bruno@clisp.org> | 2023-03-15 23:25:33 +0100 |
commit | b49d82c60787d5d8549b352bbb73757f7f27ff0a (patch) | |
tree | 99babe18926311aaf2e35f7e66d37a173d05c16d | |
parent | 093929b1d90a98bf0986ded8b629a9d232a85e2e (diff) | |
download | gettext-b49d82c60787d5d8549b352bbb73757f7f27ff0a.tar.gz |
xgettext: In language Perl, avoid a crash when NUL is used as delimiter.
* gettext-tools/src/str-desc.h: New file.
* gettext-tools/src/str-desc.c: New file.
* gettext-tools/src/FILES: Describe them.
* gettext-tools/src/Makefile.am (noinst_HEADERS): Add str-desc.h.
(libgettextsrc_la_SOURCES): Add str-desc.c.
* gettext-tools/src/msgl-ascii.h: Include str-desc.h.
(is_ascii_string_desc): New declaration.
* gettext-tools/src/msgl-ascii.c (is_ascii_string_desc): New function.
* gettext-tools/src/msgl-iconv.h: Include str-desc.h.
(convert_string_desc_directly): New declaration.
* gettext-tools/src/msgl-iconv.c: Include str-desc.h.
(convert_string_desc_directly): New function.
* gettext-tools/src/xg-encoding.h: Include str-desc.h.
(string_desc_from_current_source_encoding): New declaration.
* gettext-tools/src/xg-encoding.c (string_desc_from_current_source_encoding):
New function.
* gettext-tools/src/x-perl.c: Include str-desc.h, c-ctype.h.
(extract_quotelike_pass1): Return a string_desc_ty instead of a 'char *'.
(extract_quotelike_pass1_utf8): Likewise.
(extract_quotelike, extract_triple_quotelike): Update.
(interpolate_keywords): Take a string_desc_ty instead of a 'const char *' as
argument.
(x_perl_prelex): Update.
* gettext-tools/libgettextpo/Makefile.am (libgettextpo_la_AUXSOURCES): Add
str-desc.c.
* gettext-tools/tests/xgettext-perl-9: New file.
* gettext-tools/tests/Makefile.am (TESTS): Add it.
-rw-r--r-- | gettext-tools/libgettextpo/Makefile.am | 1 | ||||
-rw-r--r-- | gettext-tools/src/FILES | 4 | ||||
-rw-r--r-- | gettext-tools/src/Makefile.am | 3 | ||||
-rw-r--r-- | gettext-tools/src/msgl-ascii.c | 13 | ||||
-rw-r--r-- | gettext-tools/src/msgl-ascii.h | 5 | ||||
-rw-r--r-- | gettext-tools/src/msgl-iconv.c | 19 | ||||
-rw-r--r-- | gettext-tools/src/msgl-iconv.h | 6 | ||||
-rw-r--r-- | gettext-tools/src/str-desc.c | 331 | ||||
-rw-r--r-- | gettext-tools/src/str-desc.h | 140 | ||||
-rw-r--r-- | gettext-tools/src/x-perl.c | 144 | ||||
-rw-r--r-- | gettext-tools/src/xg-encoding.c | 61 | ||||
-rw-r--r-- | gettext-tools/src/xg-encoding.h | 11 | ||||
-rw-r--r-- | gettext-tools/tests/Makefile.am | 1 | ||||
-rwxr-xr-x | gettext-tools/tests/xgettext-perl-9 | 56 |
14 files changed, 729 insertions, 66 deletions
diff --git a/gettext-tools/libgettextpo/Makefile.am b/gettext-tools/libgettextpo/Makefile.am index ee9ed39c4..b9c9fb630 100644 --- a/gettext-tools/libgettextpo/Makefile.am +++ b/gettext-tools/libgettextpo/Makefile.am @@ -52,6 +52,7 @@ libgettextpo_la_AUXSOURCES = \ ../src/dir-list.c \ ../src/message.c \ ../src/pos.c \ + ../src/str-desc.c \ ../src/msgl-ascii.c \ ../src/po-error.c \ ../src/po-xerror.c \ diff --git a/gettext-tools/src/FILES b/gettext-tools/src/FILES index 90471bccd..a119f6fa0 100644 --- a/gettext-tools/src/FILES +++ b/gettext-tools/src/FILES @@ -9,6 +9,10 @@ str-list.h str-list.c A list-of-immutable-strings type. +str-desc.h +str-desc.c + A string descriptor type, for strings that may contain NULs. + dir-list.h dir-list.c Management of the list of directories where PO files are diff --git a/gettext-tools/src/Makefile.am b/gettext-tools/src/Makefile.am index 59af8aaa7..3d5e2b989 100644 --- a/gettext-tools/src/Makefile.am +++ b/gettext-tools/src/Makefile.am @@ -39,7 +39,7 @@ noinst_HEADERS = \ pos.h message.h po-error.h po-xerror.h po-gram.h po-charset.h \ po-lex.h open-catalog.h read-catalog-abstract.h read-catalog.h \ read-po.h read-properties.h read-stringtable.h \ - str-list.h \ + str-desc.h str-list.h \ write-catalog.h write-po.h write-properties.h write-stringtable.h \ dir-list.h file-list.h po-gram-gen.h po-gram-gen2.h cldr-plural.h \ cldr-plural-exp.h locating-rule.h its.h search-path.h \ @@ -172,6 +172,7 @@ FORMAT_SOURCE += \ # libgettextsrc contains all code that is needed by at least two programs. libgettextsrc_la_SOURCES = \ $(COMMON_SOURCE) \ + str-desc.c \ read-catalog.c \ write-catalog.c write-properties.c write-stringtable.c write-po.c \ msgl-ascii.c \ diff --git a/gettext-tools/src/msgl-ascii.c b/gettext-tools/src/msgl-ascii.c index edc11405a..cabad7295 100644 --- a/gettext-tools/src/msgl-ascii.c +++ b/gettext-tools/src/msgl-ascii.c @@ -1,5 +1,5 @@ /* Message list test for ASCII character set. - Copyright (C) 2001-2002, 2005-2006 Free Software Foundation, Inc. + Copyright (C) 2001-2002, 2005-2006, 2023 Free Software Foundation, Inc. Written by Bruno Haible <haible@clisp.cons.org>, 2001. This program is free software: you can redistribute it and/or modify @@ -39,6 +39,17 @@ is_ascii_string (const char *string) } bool +is_ascii_string_desc (string_desc_ty string) +{ + size_t len = string_desc_length (string); + size_t i; + for (i = 0; i < len; i++) + if (!c_isascii ((unsigned char) string_desc_char_at (string, i))) + return false; + return true; +} + +bool is_ascii_string_list (string_list_ty *slp) { size_t i; diff --git a/gettext-tools/src/msgl-ascii.h b/gettext-tools/src/msgl-ascii.h index 590f7307e..54b024022 100644 --- a/gettext-tools/src/msgl-ascii.h +++ b/gettext-tools/src/msgl-ascii.h @@ -1,5 +1,5 @@ /* Message list test for ASCII character set. - Copyright (C) 2001-2003, 2005 Free Software Foundation, Inc. + Copyright (C) 2001-2003, 2005, 2023 Free Software Foundation, Inc. Written by Bruno Haible <haible@clisp.cons.org>, 2001. This program is free software: you can redistribute it and/or modify @@ -18,6 +18,7 @@ #ifndef _MSGL_ASCII_H #define _MSGL_ASCII_H +#include "str-desc.h" #include "message.h" #include <stdbool.h> @@ -31,6 +32,8 @@ extern "C" { extern bool is_ascii_string (const char *string); extern bool + is_ascii_string_desc (string_desc_ty string); +extern bool is_ascii_string_list (string_list_ty *slp); extern bool is_ascii_message (message_ty *mp); diff --git a/gettext-tools/src/msgl-iconv.c b/gettext-tools/src/msgl-iconv.c index 6bafd9506..178fd2dbe 100644 --- a/gettext-tools/src/msgl-iconv.c +++ b/gettext-tools/src/msgl-iconv.c @@ -1,5 +1,5 @@ /* Message list charset and locale charset handling. - Copyright (C) 2001-2003, 2005-2009, 2019-2021 Free Software Foundation, Inc. + Copyright (C) 2001-2003, 2005-2009, 2019-2023 Free Software Foundation, Inc. Written by Bruno Haible <haible@clisp.cons.org>, 2001. This program is free software: you can redistribute it and/or modify @@ -35,6 +35,7 @@ #include "noreturn.h" #include "progname.h" #include "basename-lgpl.h" +#include "str-desc.h" #include "message.h" #include "po-charset.h" #include "xstriconv.h" @@ -90,6 +91,22 @@ convert_string_directly (iconv_t cd, const char *string, return NULL; } +string_desc_ty +convert_string_desc_directly (iconv_t cd, string_desc_ty string, + const struct conversion_context* context) +{ + char *result = NULL; + size_t resultlen = 0; + + if (xmem_cd_iconv (string_desc_data (string), string_desc_length (string), + cd, &result, &resultlen) == 0) + return string_desc_new_addr (resultlen, result); + + conversion_error (context); + /* NOTREACHED */ + return string_desc_new (0); +} + static char * convert_string (const iconveh_t *cd, const char *string, const struct conversion_context* context) diff --git a/gettext-tools/src/msgl-iconv.h b/gettext-tools/src/msgl-iconv.h index bbc1e5347..774d995a4 100644 --- a/gettext-tools/src/msgl-iconv.h +++ b/gettext-tools/src/msgl-iconv.h @@ -1,5 +1,5 @@ /* Message list character set conversion. - Copyright (C) 2001-2003, 2005-2006, 2009 Free Software Foundation, Inc. + Copyright (C) 2001-2003, 2005-2006, 2009, 2023 Free Software Foundation, Inc. Written by Bruno Haible <haible@clisp.cons.org>, 2001. This program is free software: you can redistribute it and/or modify @@ -23,6 +23,7 @@ #include <iconv.h> #endif +#include "str-desc.h" #include "message.h" @@ -46,6 +47,9 @@ struct conversion_context Assumes that either FROM_CODE or TO_CODE is UTF-8. */ extern char *convert_string_directly (iconv_t cd, const char *string, const struct conversion_context* context); +extern string_desc_ty + convert_string_desc_directly (iconv_t cd, string_desc_ty string, + const struct conversion_context* context); #endif diff --git a/gettext-tools/src/str-desc.c b/gettext-tools/src/str-desc.c new file mode 100644 index 000000000..b6d101efa --- /dev/null +++ b/gettext-tools/src/str-desc.c @@ -0,0 +1,331 @@ +/* GNU gettext - internationalization aids + Copyright (C) 2023 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2023. */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +/* Specification. */ +#include "str-desc.h" + +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> + +#include "xalloc.h" + + +/* ==== Side-effect-free operations on string descriptors ==== */ + +size_t +string_desc_length (string_desc_ty s) +{ + return s.nbytes; +} + +char +string_desc_char_at (string_desc_ty s, size_t i) +{ + if (!(i < s.nbytes)) + /* Invalid argument. */ + abort (); + return s.data[i]; +} + +const char * +string_desc_data (string_desc_ty s) +{ + return s.data; +} + +bool +string_desc_is_empty (string_desc_ty s) +{ + return s.nbytes == 0; +} + +bool +string_desc_startswith (string_desc_ty s, string_desc_ty prefix) +{ + return (s.nbytes >= prefix.nbytes + && (prefix.nbytes == 0 + || memcmp (s.data, prefix.data, prefix.nbytes) == 0)); +} + +bool +string_desc_endswith (string_desc_ty s, string_desc_ty suffix) +{ + return (s.nbytes >= suffix.nbytes + && (suffix.nbytes == 0 + || memcmp (s.data + (s.nbytes - suffix.nbytes), suffix.data, + suffix.nbytes) == 0)); +} + +int +string_desc_cmp (string_desc_ty a, string_desc_ty b) +{ + if (a.nbytes > b.nbytes) + { + if (b.nbytes == 0) + return 1; + return (memcmp (a.data, b.data, b.nbytes) < 0 ? -1 : 1); + } + else if (a.nbytes < b.nbytes) + { + if (a.nbytes == 0) + return -1; + return (memcmp (a.data, b.data, a.nbytes) > 0 ? 1 : -1); + } + else /* a.nbytes == b.nbytes */ + { + if (a.nbytes == 0) + return 0; + return memcmp (a.data, b.data, a.nbytes); + } +} + +ptrdiff_t +string_desc_index (string_desc_ty s, char c) +{ + if (s.nbytes > 0) + { + void *found = memchr (s.data, (unsigned char) c, s.nbytes); + if (found != NULL) + return (char *) found - s.data; + } + return -1; +} + +ptrdiff_t +string_desc_last_index (string_desc_ty s, char c) +{ + if (s.nbytes > 0) + { + void *found = memrchr (s.data, (unsigned char) c, s.nbytes); + if (found != NULL) + return (char *) found - s.data; + } + return -1; +} + +ptrdiff_t +string_desc_contains (string_desc_ty haystack, string_desc_ty needle) +{ + if (needle.nbytes == 0) + return 0; + void *found = + memmem (haystack.data, haystack.nbytes, needle.data, needle.nbytes); + if (found != NULL) + return (char *) found - haystack.data; + else + return -1; +} + +string_desc_ty +string_desc_from_c (const char *s) +{ + string_desc_ty result; + + result.nbytes = strlen (s); + result.data = (char *) s; + + return result; +} + +string_desc_ty +string_desc_substring (string_desc_ty s, size_t start, size_t end) +{ + string_desc_ty result; + + if (!(start <= end)) + /* Invalid arguments. */ + abort (); + + result.nbytes = end - start; + result.data = s.data + start; + + return result; +} + + +/* ==== Memory-allocating operations on string descriptors ==== */ + +string_desc_ty +string_desc_new (size_t n) +{ + string_desc_ty result; + + result.nbytes = n; + if (n == 0) + result.data = NULL; + else + result.data = (char *) xmalloc (n); + + return result; +} + +string_desc_ty +string_desc_new_addr (size_t n, char *addr) +{ + string_desc_ty result; + + result.nbytes = n; + if (n == 0) + result.data = NULL; + else + result.data = addr; + + return result; +} + +string_desc_ty +string_desc_new_filled (size_t n, char c) +{ + string_desc_ty result; + + result.nbytes = n; + if (n == 0) + result.data = NULL; + else + { + result.data = (char *) xmalloc (n); + memset (result.data, (unsigned char) c, n); + } + + return result; +} + +string_desc_ty +string_desc_copy (string_desc_ty s) +{ + string_desc_ty result; + size_t n = s.nbytes; + + result.nbytes = n; + if (n == 0) + result.data = NULL; + else + { + result.data = (char *) xmalloc (n); + memcpy (result.data, s.data, n); + } + + return result; +} + +string_desc_ty +string_desc_concat (size_t n, string_desc_ty string1, ...) +{ + if (n == 0) + /* Invalid argument. */ + abort (); + + size_t total = 0; + total += string1.nbytes; + if (n > 1) + { + va_list other_strings; + size_t i; + + va_start (other_strings, string1); + for (i = --n; i > 0; i--) + { + string_desc_ty arg = va_arg (other_strings, string_desc_ty); + total += arg.nbytes; + } + va_end (other_strings); + } + + char *combined = (char *) xmalloc (total); + size_t pos = 0; + memcpy (combined, string1.data, string1.nbytes); + pos += string1.nbytes; + if (n > 1) + { + va_list other_strings; + size_t i; + + va_start (other_strings, string1); + for (i = --n; i > 0; i--) + { + string_desc_ty arg = va_arg (other_strings, string_desc_ty); + if (arg.nbytes > 0) + memcpy (combined + pos, arg.data, arg.nbytes); + pos += arg.nbytes; + } + va_end (other_strings); + } + + string_desc_ty result; + result.nbytes = total; + result.data = combined; + + return result; +} + +char * +string_desc_c (string_desc_ty s) +{ + size_t n = s.nbytes; + char *result = (char *) xmalloc (n + 1); + if (n > 0) + memcpy (result, s.data, n); + result[n] = '\0'; + + return result; +} + + +/* ==== Operations with side effects on string descriptors ==== */ + +void +string_desc_set_char_at (string_desc_ty s, size_t i, char c) +{ + if (!(i < s.nbytes)) + /* Invalid argument. */ + abort (); + s.data[i] = c; +} + +void +string_desc_fill (string_desc_ty s, size_t start, size_t end, char c) +{ + if (!(start <= end)) + /* Invalid arguments. */ + abort (); + + if (start < end) + memset (s.data + start, (unsigned char) c, end - start); +} + +void +string_desc_overwrite (string_desc_ty s, size_t start, string_desc_ty t) +{ + if (!(start + t.nbytes <= s.nbytes)) + /* Invalid arguments. */ + abort (); + + if (t.nbytes > 0) + memcpy (s.data + start, t.data, t.nbytes); +} + +void +string_desc_free (string_desc_ty s) +{ + free (s.data); +} diff --git a/gettext-tools/src/str-desc.h b/gettext-tools/src/str-desc.h new file mode 100644 index 000000000..932641926 --- /dev/null +++ b/gettext-tools/src/str-desc.h @@ -0,0 +1,140 @@ +/* GNU gettext - internationalization aids + Copyright (C) 2023 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2023. */ + +#ifndef _STR_DESC_H +#define _STR_DESC_H 1 + +/* Get size_t, ptrdiff_t. */ +#include <stddef.h> + +/* Get bool. */ +#include <stdbool.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Type describing a string that may contain NUL bytes. + It's merely a descriptor of an array of bytes. */ +typedef struct string_desc_ty string_desc_ty; +struct string_desc_ty +{ + size_t nbytes; + char *data; +}; + +/* String descriptors can be passed and returned by value. */ + + +/* ==== Side-effect-free operations on string descriptors ==== */ + +/* Return the length of the string S. */ +extern size_t string_desc_length (string_desc_ty s); + +/* Return the byte at index I of string S. + I must be < length(S). */ +extern char string_desc_char_at (string_desc_ty s, size_t i); + +/* Return a read-only view of the bytes of S. */ +extern const char * string_desc_data (string_desc_ty s); + +/* Return true if S is the empty string. */ +extern bool string_desc_is_empty (string_desc_ty s); + +/* Return true if S starts with PREFIX. */ +extern bool string_desc_startswith (string_desc_ty s, string_desc_ty prefix); + +/* Return true if S ends with SUFFIX. */ +extern bool string_desc_endswith (string_desc_ty s, string_desc_ty suffix); + +/* Return > 0, == 0, or < 0 if A > B, A == B, A < B. + This uses a lexicographic ordering, where the bytes are compared as + 'unsigned char'. */ +extern int string_desc_cmp (string_desc_ty a, string_desc_ty b); + +/* Return the index of the first occurrence of C in S, + or -1 if there is none. */ +extern ptrdiff_t string_desc_index (string_desc_ty s, char c); + +/* Return the index of the last occurrence of C in S, + or -1 if there is none. */ +extern ptrdiff_t string_desc_last_index (string_desc_ty s, char c); + +/* Return the index of the first occurrence of NEEDLE in HAYSTACK, + or -1 if there is none. */ +extern ptrdiff_t string_desc_contains (string_desc_ty haystack, string_desc_ty needle); + +/* Return a string that represents the C string S, of length strlen (S). */ +extern string_desc_ty string_desc_from_c (const char *s); + +/* Return the substring of S, starting at offset START and ending at offset END. + START must be <= END. + The result is of length END - START. + The result must not be freed (since its storage is part of the storage + of S). */ +extern string_desc_ty string_desc_substring (string_desc_ty s, size_t start, size_t end); + + +/* ==== Memory-allocating operations on string descriptors ==== */ + +/* Return a string of length N, with uninitialized contents. */ +extern string_desc_ty string_desc_new (size_t n); + +/* Return a string of length N, at the given memory address. */ +extern string_desc_ty string_desc_new_addr (size_t n, char *addr); + +/* Return a string of length N, filled with C. */ +extern string_desc_ty string_desc_new_filled (size_t n, char c); + +/* Return a copy of string S. */ +extern string_desc_ty string_desc_copy (string_desc_ty s); + +/* Return the concatenation of N strings. N must be > 0. */ +extern string_desc_ty string_desc_concat (size_t n, string_desc_ty string1, ...); + +/* Return a copy of string S, as a NUL-terminated C string. */ +extern char * string_desc_c (string_desc_ty s); + + +/* ==== Operations with side effects on string descriptors ==== */ + +/* Overwrite the byte at index I of string S with C. + I must be < length(S). */ +extern void string_desc_set_char_at (string_desc_ty s, size_t i, char c); + +/* Fill part of S, starting at offset START and ending at offset END, + with copies of C. + START must be <= END. */ +extern void string_desc_fill (string_desc_ty s, size_t start, size_t end, char c); + +/* Overwrite part of S with T, starting at offset START. + START + length(T) must be <= length (S). */ +extern void string_desc_overwrite (string_desc_ty s, size_t start, string_desc_ty t); + +/* Free S. */ +extern void string_desc_free (string_desc_ty s); + + +#ifdef __cplusplus +} +#endif + + +#endif /* _STR_DESC_H */ diff --git a/gettext-tools/src/x-perl.c b/gettext-tools/src/x-perl.c index 428d9e486..30d4f9d42 100644 --- a/gettext-tools/src/x-perl.c +++ b/gettext-tools/src/x-perl.c @@ -32,6 +32,7 @@ #include "attribute.h" #include "message.h" #include "rc-str-list.h" +#include "str-desc.h" #include "xgettext.h" #include "xg-pos.h" #include "xg-encoding.h" @@ -43,6 +44,7 @@ #include "error.h" #include "error-progname.h" #include "xalloc.h" +#include "c-ctype.h" #include "po-charset.h" #include "unistr.h" #include "uniname.h" @@ -671,7 +673,7 @@ free_token (token_ty *tp) of the semantics of the construct. Return the complete string, including the starting and the trailing delimiter, with backslashes removed where appropriate. */ -static char * +static string_desc_ty extract_quotelike_pass1 (int delim) { /* This function is called recursively. No way to allocate stuff @@ -720,17 +722,16 @@ extract_quotelike_pass1 (int delim) if (c == counter_delim || c == EOF) { buffer[bufpos++] = counter_delim; /* will be stripped off later */ - buffer[bufpos++] = '\0'; #if DEBUG_PERL - fprintf (stderr, "PASS1: %s\n", buffer); + fprintf (stderr, "PASS1: %.*s\n", bufpos, buffer); #endif - return buffer; + return string_desc_new_addr (bufpos, buffer); } if (nested && c == delim) { - char *inner = extract_quotelike_pass1 (delim); - size_t len = strlen (inner); + string_desc_ty inner = extract_quotelike_pass1 (delim); + size_t len = string_desc_length (inner); /* Ensure room for len + 1 bytes. */ if (bufpos + len >= bufmax) @@ -740,8 +741,8 @@ extract_quotelike_pass1 (int delim) while (bufpos + len >= bufmax); buffer = xrealloc (buffer, bufmax); } - strcpy (buffer + bufpos, inner); - free (inner); + memcpy (buffer + bufpos, string_desc_data (inner), len); + string_desc_free (inner); bufpos += len; } else if (c == '\\') @@ -772,15 +773,15 @@ extract_quotelike_pass1 (int delim) /* Like extract_quotelike_pass1, but return the complete string in UTF-8 encoding. */ -static char * +static string_desc_ty extract_quotelike_pass1_utf8 (int delim) { - char *string = extract_quotelike_pass1 (delim); - char *utf8_string = - from_current_source_encoding (string, lc_string, logical_file_name, - line_number); - if (utf8_string != string) - free (string); + string_desc_ty string = extract_quotelike_pass1 (delim); + string_desc_ty utf8_string = + string_desc_from_current_source_encoding (string, lc_string, + logical_file_name, line_number); + if (utf8_string.data != string.data) + string_desc_free (string); return utf8_string; } @@ -800,7 +801,7 @@ static int nesting_depth; /* Forward declaration of local functions. */ -static void interpolate_keywords (message_list_ty *mlp, const char *string, +static void interpolate_keywords (message_list_ty *mlp, string_desc_ty string, int lineno); static token_ty *x_perl_lex (message_list_ty *mlp); static void x_perl_unlex (token_ty *tp); @@ -876,16 +877,15 @@ extract_oct (const char *string, size_t len, unsigned int *result) static void extract_quotelike (token_ty *tp, int delim) { - char *string = extract_quotelike_pass1_utf8 (delim); - size_t len = strlen (string); + string_desc_ty string = extract_quotelike_pass1_utf8 (delim); + size_t len = string_desc_length (string); tp->type = token_type_string; /* Take the string without the delimiters at the start and at the end. */ if (!(len >= 2)) abort (); - string[len - 1] = '\0'; - tp->string = xstrdup (string + 1); - free (string); + tp->string = string_desc_c (string_desc_substring (string, 1, len - 1)); + string_desc_free (string); tp->comment = add_reference (savable_comment); } @@ -897,14 +897,14 @@ static void extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim, bool interpolate) { - char *string; + string_desc_ty string; tp->type = token_type_regex_op; string = extract_quotelike_pass1_utf8 (delim); if (interpolate) interpolate_keywords (mlp, string, line_number); - free (string); + string_desc_free (string); if (delim == '(' || delim == '<' || delim == '{' || delim == '[') { @@ -921,7 +921,7 @@ extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim, string = extract_quotelike_pass1_utf8 (delim); if (interpolate) interpolate_keywords (mlp, string, line_number); - free (string); + string_desc_free (string); } /* Perform pass 3 of quotelike extraction (interpolation). @@ -1691,13 +1691,15 @@ extract_variable (message_list_ty *mlp, token_ty *tp, int first) variables inside a double-quoted string that may interpolate to some keyword hash (reference). The string is UTF-8 encoded. */ static void -interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) +interpolate_keywords (message_list_ty *mlp, string_desc_ty string, int lineno) { static char *buffer; static int bufmax = 0; int bufpos = 0; flag_context_ty context; - int c; + size_t length; + size_t index; + char c; bool maybe_hash_deref = false; enum parser_state { @@ -1747,6 +1749,9 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) state = initial; context = null_context; + length = string_desc_length (string); + index = 0; + token.type = token_type_string; token.sub_type = string_type_qq; token.line_number = line_number; @@ -1757,10 +1762,11 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) pos.file_name = logical_file_name; pos.line_number = lineno; - while ((c = (unsigned char) *string++) != '\0') + while (index < length) { void *keyword_value; + c = string_desc_char_at (string, index++); if (state == initial) bufpos = 0; @@ -1779,12 +1785,12 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) switch (c) { case '\\': - c = (unsigned char) *string++; - if (c == '\0') + if (index == length) { nesting_depth--; return; } + c = string_desc_char_at (string, index++); break; case '$': buffer[bufpos++] = '$'; @@ -1807,7 +1813,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) state = two_dollars; break; default: - if (c == '_' || c == ':' || c == '\'' || c >= 0x80 + if (!c_isascii ((unsigned char) c) + || c == '_' || c == ':' || c == '\'' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) @@ -1821,7 +1828,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) } break; case two_dollars: - if (c == '_' || c == ':' || c == '\'' || c >= 0x80 + if (!c_isascii ((unsigned char) c) + || c == '_' || c == ':' || c == '\'' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) @@ -1874,7 +1882,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) state = initial; break; default: - if (c == '_' || c == ':' || c == '\'' || c >= 0x80 + if (!c_isascii ((unsigned char) c) + || c == '_' || c == ':' || c == '\'' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) @@ -1926,7 +1935,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) state = dquote; break; default: - if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80 + if (!c_isascii ((unsigned char) c) + || c == '_' || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { pos.line_number = lineno; @@ -1959,19 +1969,23 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) state = wait_rbrace; break; case '\\': - if (string[0] == '\"') + if (index == length) { - buffer[bufpos++] = string++[0]; - } - else if (string[0]) - { - buffer[bufpos++] = '\\'; - buffer[bufpos++] = string++[0]; + context = null_context; + state = initial; } else { - context = null_context; - state = initial; + c = string_desc_char_at (string, index++); + if (c == '\"') + { + buffer[bufpos++] = c; + } + else + { + buffer[bufpos++] = '\\'; + buffer[bufpos++] = c; + } } break; default: @@ -1986,19 +2000,23 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) state = wait_rbrace; break; case '\\': - if (string[0] == '\'') - { - buffer[bufpos++] = string++[0]; - } - else if (string[0]) + if (index == length) { - buffer[bufpos++] = '\\'; - buffer[bufpos++] = string++[0]; + context = null_context; + state = initial; } else { - context = null_context; - state = initial; + c = string_desc_char_at (string, index++); + if (c == '\'') + { + buffer[bufpos++] = c; + } + else + { + buffer[bufpos++] = '\\'; + buffer[bufpos++] = c; + } } break; default: @@ -2007,7 +2025,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) } break; case barekey: - if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80 + if (!c_isascii ((unsigned char) c) + || c == '_' || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { buffer[bufpos++] = c; @@ -2343,7 +2362,8 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp) } extract_quotelike (tp, delim); if (delim != '\'') - interpolate_keywords (mlp, tp->string, line_number); + interpolate_keywords (mlp, string_desc_from_c (tp->string), + line_number); free (tp->string); drop_reference (tp->comment); tp->type = token_type_regex_op; @@ -2397,7 +2417,8 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp) case 'x': tp->type = token_type_string; tp->sub_type = string_type_qq; - interpolate_keywords (mlp, tp->string, line_number); + interpolate_keywords (mlp, string_desc_from_c (tp->string), + line_number); break; case 'r': drop_reference (tp->comment); @@ -2432,13 +2453,15 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp) case '"': extract_quotelike (tp, c); tp->sub_type = string_type_qq; - interpolate_keywords (mlp, tp->string, line_number); + interpolate_keywords (mlp, string_desc_from_c (tp->string), + line_number); return; case '`': extract_quotelike (tp, c); tp->sub_type = string_type_qq; - interpolate_keywords (mlp, tp->string, line_number); + interpolate_keywords (mlp, string_desc_from_c (tp->string), + line_number); return; case '\'': @@ -2535,7 +2558,8 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp) tp->type = token_type_string; tp->sub_type = string_type_qq; tp->line_number = line_number + 1; - interpolate_keywords (mlp, tp->string, tp->line_number); + interpolate_keywords (mlp, string_desc_from_c (tp->string), + tp->line_number); return; } else if ((c >= 'A' && c <= 'Z') @@ -2577,7 +2601,8 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp) tp->sub_type = string_type_qq; tp->comment = add_reference (savable_comment); tp->line_number = line_number + 1; - interpolate_keywords (mlp, tp->string, tp->line_number); + interpolate_keywords (mlp, string_desc_from_c (tp->string), + tp->line_number); return; } } @@ -2618,7 +2643,8 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp) if (prefer_regexp_over_division (tp->last_type)) { extract_quotelike (tp, c); - interpolate_keywords (mlp, tp->string, line_number); + interpolate_keywords (mlp, string_desc_from_c (tp->string), + line_number); free (tp->string); drop_reference (tp->comment); tp->type = token_type_regex_op; diff --git a/gettext-tools/src/xg-encoding.c b/gettext-tools/src/xg-encoding.c index b36fe148f..4bcd76e48 100644 --- a/gettext-tools/src/xg-encoding.c +++ b/gettext-tools/src/xg-encoding.c @@ -163,7 +163,7 @@ from_current_source_encoding (const char *string, } else if (xgettext_current_source_encoding == po_charset_utf8) { - if (u8_check ((uint8_t *) string, strlen (string)) != NULL) + if (u8_check ((const uint8_t *) string, strlen (string)) != NULL) { multiline_error (xstrdup (""), xasprintf ("%s\n%s\n", @@ -197,3 +197,62 @@ from_current_source_encoding (const char *string, return (char *) string; } + +/* Like from_current_source_encoding, for a string that may contain NULs. */ +string_desc_ty +string_desc_from_current_source_encoding (string_desc_ty string, + lexical_context_ty lcontext, + const char *file_name, + size_t line_number) +{ + if (xgettext_current_source_encoding == po_charset_ascii) + { + if (!is_ascii_string_desc (string)) + { + multiline_error (xstrdup (""), + xasprintf ("%s\n%s\n", + non_ascii_error_message (lcontext, + file_name, + line_number), + _("Please specify the source encoding through --from-code."))); + exit (EXIT_FAILURE); + } + } + else if (xgettext_current_source_encoding == po_charset_utf8) + { + if (u8_check ((const uint8_t *) string_desc_data (string), + string_desc_length (string)) + != NULL) + { + multiline_error (xstrdup (""), + xasprintf ("%s\n%s\n", + non_utf8_error_message (lcontext, + file_name, + line_number), + _("Please specify the source encoding through --from-code."))); + exit (EXIT_FAILURE); + } + } + else + { +#if HAVE_ICONV + struct conversion_context context; + + context.from_code = xgettext_current_source_encoding; + context.to_code = po_charset_utf8; + context.from_filename = file_name; + context.message = NULL; + + string = convert_string_desc_directly (xgettext_current_source_iconv, + string, &context); +#else + /* If we don't have iconv(), the only supported values for + xgettext_global_source_encoding and thus also for + xgettext_current_source_encoding are ASCII and UTF-8. + convert_string_desc_directly() should not be called in this case. */ + abort (); +#endif + } + + return string; +} diff --git a/gettext-tools/src/xg-encoding.h b/gettext-tools/src/xg-encoding.h index 9ef2da76f..f09665d82 100644 --- a/gettext-tools/src/xg-encoding.h +++ b/gettext-tools/src/xg-encoding.h @@ -1,5 +1,5 @@ /* Keeping track of the encoding of strings to be extracted. - Copyright (C) 2001-2019 Free Software Foundation, Inc. + Copyright (C) 2001-2023 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,6 +23,8 @@ #include <iconv.h> #endif +#include "str-desc.h" + #ifdef __cplusplus extern "C" { @@ -79,6 +81,13 @@ extern char *from_current_source_encoding (const char *string, const char *file_name, size_t line_number); +/* Like from_current_source_encoding, for a string that may contain NULs. */ +extern string_desc_ty + string_desc_from_current_source_encoding (string_desc_ty string, + lexical_context_ty lcontext, + const char *file_name, + size_t line_number); + #ifdef __cplusplus } diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index 95638e9ec..6b30f9a00 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -130,6 +130,7 @@ TESTS = gettext-1 gettext-2 \ xgettext-objc-1 xgettext-objc-2 \ xgettext-perl-1 xgettext-perl-2 xgettext-perl-3 xgettext-perl-4 \ xgettext-perl-5 xgettext-perl-6 xgettext-perl-7 xgettext-perl-8 \ + xgettext-perl-9 \ xgettext-perl-stackovfl-1 xgettext-perl-stackovfl-2 \ xgettext-perl-stackovfl-3 xgettext-perl-stackovfl-4 \ xgettext-php-1 xgettext-php-2 xgettext-php-3 xgettext-php-4 \ diff --git a/gettext-tools/tests/xgettext-perl-9 b/gettext-tools/tests/xgettext-perl-9 new file mode 100755 index 000000000..89462f8b5 --- /dev/null +++ b/gettext-tools/tests/xgettext-perl-9 @@ -0,0 +1,56 @@ +#!/bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test of Perl support: use of NUL as delimiter. + +printf '"\0' > xg-pl-9a.pl +printf "'"'\0' > xg-pl-9b.pl +printf '/\0' > xg-pl-9c.pl +printf '?\0' > xg-pl-9d.pl +printf '`\0' > xg-pl-9e.pl +printf 'm\0' > xg-pl-9f.pl +printf 'q\0' > xg-pl-9g.pl + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9a.pl 2>xg-pl-9.err +result=$? +cat xg-pl-9.err +test $result = 0 || Exit 1 + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9b.pl 2>xg-pl-9.err +result=$? +cat xg-pl-9.err +test $result = 0 || Exit 1 + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9c.pl 2>xg-pl-9.err +result=$? +cat xg-pl-9.err +test $result = 0 || Exit 1 + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9d.pl 2>xg-pl-9.err +result=$? +cat xg-pl-9.err +test $result = 0 || Exit 1 + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9e.pl 2>xg-pl-9.err +result=$? +cat xg-pl-9.err +test $result = 0 || Exit 1 + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9f.pl 2>xg-pl-9.err +result=$? +cat xg-pl-9.err +test $result = 0 || Exit 1 + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9g.pl 2>xg-pl-9.err +result=$? +cat xg-pl-9.err +test $result = 0 || Exit 1 + +exit 0 |