summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruno Haible <bruno@clisp.org>2023-03-15 21:57:56 +0100
committerBruno Haible <bruno@clisp.org>2023-03-15 23:25:33 +0100
commitb49d82c60787d5d8549b352bbb73757f7f27ff0a (patch)
tree99babe18926311aaf2e35f7e66d37a173d05c16d
parent093929b1d90a98bf0986ded8b629a9d232a85e2e (diff)
downloadgettext-b49d82c60787d5d8549b352bbb73757f7f27ff0a.tar.gz
xgettext: In language Perl, avoid a crash when NUL is used as delimiter.
* gettext-tools/src/str-desc.h: New file. * gettext-tools/src/str-desc.c: New file. * gettext-tools/src/FILES: Describe them. * gettext-tools/src/Makefile.am (noinst_HEADERS): Add str-desc.h. (libgettextsrc_la_SOURCES): Add str-desc.c. * gettext-tools/src/msgl-ascii.h: Include str-desc.h. (is_ascii_string_desc): New declaration. * gettext-tools/src/msgl-ascii.c (is_ascii_string_desc): New function. * gettext-tools/src/msgl-iconv.h: Include str-desc.h. (convert_string_desc_directly): New declaration. * gettext-tools/src/msgl-iconv.c: Include str-desc.h. (convert_string_desc_directly): New function. * gettext-tools/src/xg-encoding.h: Include str-desc.h. (string_desc_from_current_source_encoding): New declaration. * gettext-tools/src/xg-encoding.c (string_desc_from_current_source_encoding): New function. * gettext-tools/src/x-perl.c: Include str-desc.h, c-ctype.h. (extract_quotelike_pass1): Return a string_desc_ty instead of a 'char *'. (extract_quotelike_pass1_utf8): Likewise. (extract_quotelike, extract_triple_quotelike): Update. (interpolate_keywords): Take a string_desc_ty instead of a 'const char *' as argument. (x_perl_prelex): Update. * gettext-tools/libgettextpo/Makefile.am (libgettextpo_la_AUXSOURCES): Add str-desc.c. * gettext-tools/tests/xgettext-perl-9: New file. * gettext-tools/tests/Makefile.am (TESTS): Add it.
-rw-r--r--gettext-tools/libgettextpo/Makefile.am1
-rw-r--r--gettext-tools/src/FILES4
-rw-r--r--gettext-tools/src/Makefile.am3
-rw-r--r--gettext-tools/src/msgl-ascii.c13
-rw-r--r--gettext-tools/src/msgl-ascii.h5
-rw-r--r--gettext-tools/src/msgl-iconv.c19
-rw-r--r--gettext-tools/src/msgl-iconv.h6
-rw-r--r--gettext-tools/src/str-desc.c331
-rw-r--r--gettext-tools/src/str-desc.h140
-rw-r--r--gettext-tools/src/x-perl.c144
-rw-r--r--gettext-tools/src/xg-encoding.c61
-rw-r--r--gettext-tools/src/xg-encoding.h11
-rw-r--r--gettext-tools/tests/Makefile.am1
-rwxr-xr-xgettext-tools/tests/xgettext-perl-956
14 files changed, 729 insertions, 66 deletions
diff --git a/gettext-tools/libgettextpo/Makefile.am b/gettext-tools/libgettextpo/Makefile.am
index ee9ed39c4..b9c9fb630 100644
--- a/gettext-tools/libgettextpo/Makefile.am
+++ b/gettext-tools/libgettextpo/Makefile.am
@@ -52,6 +52,7 @@ libgettextpo_la_AUXSOURCES = \
../src/dir-list.c \
../src/message.c \
../src/pos.c \
+ ../src/str-desc.c \
../src/msgl-ascii.c \
../src/po-error.c \
../src/po-xerror.c \
diff --git a/gettext-tools/src/FILES b/gettext-tools/src/FILES
index 90471bccd..a119f6fa0 100644
--- a/gettext-tools/src/FILES
+++ b/gettext-tools/src/FILES
@@ -9,6 +9,10 @@ str-list.h
str-list.c
A list-of-immutable-strings type.
+str-desc.h
+str-desc.c
+ A string descriptor type, for strings that may contain NULs.
+
dir-list.h
dir-list.c
Management of the list of directories where PO files are
diff --git a/gettext-tools/src/Makefile.am b/gettext-tools/src/Makefile.am
index 59af8aaa7..3d5e2b989 100644
--- a/gettext-tools/src/Makefile.am
+++ b/gettext-tools/src/Makefile.am
@@ -39,7 +39,7 @@ noinst_HEADERS = \
pos.h message.h po-error.h po-xerror.h po-gram.h po-charset.h \
po-lex.h open-catalog.h read-catalog-abstract.h read-catalog.h \
read-po.h read-properties.h read-stringtable.h \
- str-list.h \
+ str-desc.h str-list.h \
write-catalog.h write-po.h write-properties.h write-stringtable.h \
dir-list.h file-list.h po-gram-gen.h po-gram-gen2.h cldr-plural.h \
cldr-plural-exp.h locating-rule.h its.h search-path.h \
@@ -172,6 +172,7 @@ FORMAT_SOURCE += \
# libgettextsrc contains all code that is needed by at least two programs.
libgettextsrc_la_SOURCES = \
$(COMMON_SOURCE) \
+ str-desc.c \
read-catalog.c \
write-catalog.c write-properties.c write-stringtable.c write-po.c \
msgl-ascii.c \
diff --git a/gettext-tools/src/msgl-ascii.c b/gettext-tools/src/msgl-ascii.c
index edc11405a..cabad7295 100644
--- a/gettext-tools/src/msgl-ascii.c
+++ b/gettext-tools/src/msgl-ascii.c
@@ -1,5 +1,5 @@
/* Message list test for ASCII character set.
- Copyright (C) 2001-2002, 2005-2006 Free Software Foundation, Inc.
+ Copyright (C) 2001-2002, 2005-2006, 2023 Free Software Foundation, Inc.
Written by Bruno Haible <haible@clisp.cons.org>, 2001.
This program is free software: you can redistribute it and/or modify
@@ -39,6 +39,17 @@ is_ascii_string (const char *string)
}
bool
+is_ascii_string_desc (string_desc_ty string)
+{
+ size_t len = string_desc_length (string);
+ size_t i;
+ for (i = 0; i < len; i++)
+ if (!c_isascii ((unsigned char) string_desc_char_at (string, i)))
+ return false;
+ return true;
+}
+
+bool
is_ascii_string_list (string_list_ty *slp)
{
size_t i;
diff --git a/gettext-tools/src/msgl-ascii.h b/gettext-tools/src/msgl-ascii.h
index 590f7307e..54b024022 100644
--- a/gettext-tools/src/msgl-ascii.h
+++ b/gettext-tools/src/msgl-ascii.h
@@ -1,5 +1,5 @@
/* Message list test for ASCII character set.
- Copyright (C) 2001-2003, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2001-2003, 2005, 2023 Free Software Foundation, Inc.
Written by Bruno Haible <haible@clisp.cons.org>, 2001.
This program is free software: you can redistribute it and/or modify
@@ -18,6 +18,7 @@
#ifndef _MSGL_ASCII_H
#define _MSGL_ASCII_H
+#include "str-desc.h"
#include "message.h"
#include <stdbool.h>
@@ -31,6 +32,8 @@ extern "C" {
extern bool
is_ascii_string (const char *string);
extern bool
+ is_ascii_string_desc (string_desc_ty string);
+extern bool
is_ascii_string_list (string_list_ty *slp);
extern bool
is_ascii_message (message_ty *mp);
diff --git a/gettext-tools/src/msgl-iconv.c b/gettext-tools/src/msgl-iconv.c
index 6bafd9506..178fd2dbe 100644
--- a/gettext-tools/src/msgl-iconv.c
+++ b/gettext-tools/src/msgl-iconv.c
@@ -1,5 +1,5 @@
/* Message list charset and locale charset handling.
- Copyright (C) 2001-2003, 2005-2009, 2019-2021 Free Software Foundation, Inc.
+ Copyright (C) 2001-2003, 2005-2009, 2019-2023 Free Software Foundation, Inc.
Written by Bruno Haible <haible@clisp.cons.org>, 2001.
This program is free software: you can redistribute it and/or modify
@@ -35,6 +35,7 @@
#include "noreturn.h"
#include "progname.h"
#include "basename-lgpl.h"
+#include "str-desc.h"
#include "message.h"
#include "po-charset.h"
#include "xstriconv.h"
@@ -90,6 +91,22 @@ convert_string_directly (iconv_t cd, const char *string,
return NULL;
}
+string_desc_ty
+convert_string_desc_directly (iconv_t cd, string_desc_ty string,
+ const struct conversion_context* context)
+{
+ char *result = NULL;
+ size_t resultlen = 0;
+
+ if (xmem_cd_iconv (string_desc_data (string), string_desc_length (string),
+ cd, &result, &resultlen) == 0)
+ return string_desc_new_addr (resultlen, result);
+
+ conversion_error (context);
+ /* NOTREACHED */
+ return string_desc_new (0);
+}
+
static char *
convert_string (const iconveh_t *cd, const char *string,
const struct conversion_context* context)
diff --git a/gettext-tools/src/msgl-iconv.h b/gettext-tools/src/msgl-iconv.h
index bbc1e5347..774d995a4 100644
--- a/gettext-tools/src/msgl-iconv.h
+++ b/gettext-tools/src/msgl-iconv.h
@@ -1,5 +1,5 @@
/* Message list character set conversion.
- Copyright (C) 2001-2003, 2005-2006, 2009 Free Software Foundation, Inc.
+ Copyright (C) 2001-2003, 2005-2006, 2009, 2023 Free Software Foundation, Inc.
Written by Bruno Haible <haible@clisp.cons.org>, 2001.
This program is free software: you can redistribute it and/or modify
@@ -23,6 +23,7 @@
#include <iconv.h>
#endif
+#include "str-desc.h"
#include "message.h"
@@ -46,6 +47,9 @@ struct conversion_context
Assumes that either FROM_CODE or TO_CODE is UTF-8. */
extern char *convert_string_directly (iconv_t cd, const char *string,
const struct conversion_context* context);
+extern string_desc_ty
+ convert_string_desc_directly (iconv_t cd, string_desc_ty string,
+ const struct conversion_context* context);
#endif
diff --git a/gettext-tools/src/str-desc.c b/gettext-tools/src/str-desc.c
new file mode 100644
index 000000000..b6d101efa
--- /dev/null
+++ b/gettext-tools/src/str-desc.c
@@ -0,0 +1,331 @@
+/* GNU gettext - internationalization aids
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* Written by Bruno Haible <bruno@clisp.org>, 2023. */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+/* Specification. */
+#include "str-desc.h"
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xalloc.h"
+
+
+/* ==== Side-effect-free operations on string descriptors ==== */
+
+size_t
+string_desc_length (string_desc_ty s)
+{
+ return s.nbytes;
+}
+
+char
+string_desc_char_at (string_desc_ty s, size_t i)
+{
+ if (!(i < s.nbytes))
+ /* Invalid argument. */
+ abort ();
+ return s.data[i];
+}
+
+const char *
+string_desc_data (string_desc_ty s)
+{
+ return s.data;
+}
+
+bool
+string_desc_is_empty (string_desc_ty s)
+{
+ return s.nbytes == 0;
+}
+
+bool
+string_desc_startswith (string_desc_ty s, string_desc_ty prefix)
+{
+ return (s.nbytes >= prefix.nbytes
+ && (prefix.nbytes == 0
+ || memcmp (s.data, prefix.data, prefix.nbytes) == 0));
+}
+
+bool
+string_desc_endswith (string_desc_ty s, string_desc_ty suffix)
+{
+ return (s.nbytes >= suffix.nbytes
+ && (suffix.nbytes == 0
+ || memcmp (s.data + (s.nbytes - suffix.nbytes), suffix.data,
+ suffix.nbytes) == 0));
+}
+
+int
+string_desc_cmp (string_desc_ty a, string_desc_ty b)
+{
+ if (a.nbytes > b.nbytes)
+ {
+ if (b.nbytes == 0)
+ return 1;
+ return (memcmp (a.data, b.data, b.nbytes) < 0 ? -1 : 1);
+ }
+ else if (a.nbytes < b.nbytes)
+ {
+ if (a.nbytes == 0)
+ return -1;
+ return (memcmp (a.data, b.data, a.nbytes) > 0 ? 1 : -1);
+ }
+ else /* a.nbytes == b.nbytes */
+ {
+ if (a.nbytes == 0)
+ return 0;
+ return memcmp (a.data, b.data, a.nbytes);
+ }
+}
+
+ptrdiff_t
+string_desc_index (string_desc_ty s, char c)
+{
+ if (s.nbytes > 0)
+ {
+ void *found = memchr (s.data, (unsigned char) c, s.nbytes);
+ if (found != NULL)
+ return (char *) found - s.data;
+ }
+ return -1;
+}
+
+ptrdiff_t
+string_desc_last_index (string_desc_ty s, char c)
+{
+ if (s.nbytes > 0)
+ {
+ void *found = memrchr (s.data, (unsigned char) c, s.nbytes);
+ if (found != NULL)
+ return (char *) found - s.data;
+ }
+ return -1;
+}
+
+ptrdiff_t
+string_desc_contains (string_desc_ty haystack, string_desc_ty needle)
+{
+ if (needle.nbytes == 0)
+ return 0;
+ void *found =
+ memmem (haystack.data, haystack.nbytes, needle.data, needle.nbytes);
+ if (found != NULL)
+ return (char *) found - haystack.data;
+ else
+ return -1;
+}
+
+string_desc_ty
+string_desc_from_c (const char *s)
+{
+ string_desc_ty result;
+
+ result.nbytes = strlen (s);
+ result.data = (char *) s;
+
+ return result;
+}
+
+string_desc_ty
+string_desc_substring (string_desc_ty s, size_t start, size_t end)
+{
+ string_desc_ty result;
+
+ if (!(start <= end))
+ /* Invalid arguments. */
+ abort ();
+
+ result.nbytes = end - start;
+ result.data = s.data + start;
+
+ return result;
+}
+
+
+/* ==== Memory-allocating operations on string descriptors ==== */
+
+string_desc_ty
+string_desc_new (size_t n)
+{
+ string_desc_ty result;
+
+ result.nbytes = n;
+ if (n == 0)
+ result.data = NULL;
+ else
+ result.data = (char *) xmalloc (n);
+
+ return result;
+}
+
+string_desc_ty
+string_desc_new_addr (size_t n, char *addr)
+{
+ string_desc_ty result;
+
+ result.nbytes = n;
+ if (n == 0)
+ result.data = NULL;
+ else
+ result.data = addr;
+
+ return result;
+}
+
+string_desc_ty
+string_desc_new_filled (size_t n, char c)
+{
+ string_desc_ty result;
+
+ result.nbytes = n;
+ if (n == 0)
+ result.data = NULL;
+ else
+ {
+ result.data = (char *) xmalloc (n);
+ memset (result.data, (unsigned char) c, n);
+ }
+
+ return result;
+}
+
+string_desc_ty
+string_desc_copy (string_desc_ty s)
+{
+ string_desc_ty result;
+ size_t n = s.nbytes;
+
+ result.nbytes = n;
+ if (n == 0)
+ result.data = NULL;
+ else
+ {
+ result.data = (char *) xmalloc (n);
+ memcpy (result.data, s.data, n);
+ }
+
+ return result;
+}
+
+string_desc_ty
+string_desc_concat (size_t n, string_desc_ty string1, ...)
+{
+  if (n == 0)
+    /* Invalid argument: N counts STRING1 too, so it must be >= 1. */
+    abort ();
+
+  /* Two passes over the N arguments: first sum the lengths, then copy. */
+  size_t total = string1.nbytes;
+  if (n > 1)
+    {
+      va_list other_strings;
+      size_t i;
+
+      va_start (other_strings, string1);
+      for (i = n - 1; i > 0; i--) /* Keep N intact for the second pass. */
+        {
+          string_desc_ty arg = va_arg (other_strings, string_desc_ty);
+          total += arg.nbytes;
+        }
+      va_end (other_strings);
+    }
+
+  char *combined = (char *) xmalloc (total);
+  size_t pos = string1.nbytes;
+  if (pos > 0) /* An empty string may have data == NULL. */
+    memcpy (combined, string1.data, pos);
+  if (n > 1)
+    {
+      va_list other_strings;
+      size_t i;
+
+      va_start (other_strings, string1);
+      for (i = n - 1; i > 0; i--) /* Same N - 1 arguments as in pass 1. */
+        {
+          string_desc_ty arg = va_arg (other_strings, string_desc_ty);
+          if (arg.nbytes > 0)
+            memcpy (combined + pos, arg.data, arg.nbytes);
+          pos += arg.nbytes;
+        }
+      va_end (other_strings);
+    }
+
+  string_desc_ty result;
+  result.nbytes = total;
+  result.data = combined;
+
+  return result;
+}
+
+char *
+string_desc_c (string_desc_ty s)
+{
+ size_t n = s.nbytes;
+ char *result = (char *) xmalloc (n + 1);
+ if (n > 0)
+ memcpy (result, s.data, n);
+ result[n] = '\0';
+
+ return result;
+}
+
+
+/* ==== Operations with side effects on string descriptors ==== */
+
+void
+string_desc_set_char_at (string_desc_ty s, size_t i, char c)
+{
+ if (!(i < s.nbytes))
+ /* Invalid argument. */
+ abort ();
+ s.data[i] = c;
+}
+
+void
+string_desc_fill (string_desc_ty s, size_t start, size_t end, char c)
+{
+  if (!(start <= end && end <= s.nbytes))
+    /* Invalid arguments: [START, END) must be a range inside S. */
+    abort ();
+
+  if (start < end)
+    memset (s.data + start, (unsigned char) c, end - start);
+}
+
+void
+string_desc_overwrite (string_desc_ty s, size_t start, string_desc_ty t)
+{
+  if (!(start <= s.nbytes && t.nbytes <= s.nbytes - start))
+    /* Invalid arguments.  (Checked without computing START + length(T),
+       which could wrap around in size_t.)  */ abort ();
+
+  if (t.nbytes > 0)
+    memcpy (s.data + start, t.data, t.nbytes);
+}
+
+void
+string_desc_free (string_desc_ty s)
+{
+ free (s.data);
+}
diff --git a/gettext-tools/src/str-desc.h b/gettext-tools/src/str-desc.h
new file mode 100644
index 000000000..932641926
--- /dev/null
+++ b/gettext-tools/src/str-desc.h
@@ -0,0 +1,140 @@
+/* GNU gettext - internationalization aids
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* Written by Bruno Haible <bruno@clisp.org>, 2023. */
+
+#ifndef _STR_DESC_H
+#define _STR_DESC_H 1
+
+/* Get size_t, ptrdiff_t. */
+#include <stddef.h>
+
+/* Get bool. */
+#include <stdbool.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Type describing a string that may contain NUL bytes.
+ It's merely a descriptor of an array of bytes. */
+typedef struct string_desc_ty string_desc_ty;
+struct string_desc_ty
+{
+ size_t nbytes;
+ char *data;
+};
+
+/* String descriptors can be passed and returned by value. */
+
+
+/* ==== Side-effect-free operations on string descriptors ==== */
+
+/* Return the length of the string S. */
+extern size_t string_desc_length (string_desc_ty s);
+
+/* Return the byte at index I of string S.
+ I must be < length(S). */
+extern char string_desc_char_at (string_desc_ty s, size_t i);
+
+/* Return a read-only view of the bytes of S. */
+extern const char * string_desc_data (string_desc_ty s);
+
+/* Return true if S is the empty string. */
+extern bool string_desc_is_empty (string_desc_ty s);
+
+/* Return true if S starts with PREFIX. */
+extern bool string_desc_startswith (string_desc_ty s, string_desc_ty prefix);
+
+/* Return true if S ends with SUFFIX. */
+extern bool string_desc_endswith (string_desc_ty s, string_desc_ty suffix);
+
+/* Return > 0, == 0, or < 0 if A > B, A == B, A < B.
+ This uses a lexicographic ordering, where the bytes are compared as
+ 'unsigned char'. */
+extern int string_desc_cmp (string_desc_ty a, string_desc_ty b);
+
+/* Return the index of the first occurrence of C in S,
+ or -1 if there is none. */
+extern ptrdiff_t string_desc_index (string_desc_ty s, char c);
+
+/* Return the index of the last occurrence of C in S,
+ or -1 if there is none. */
+extern ptrdiff_t string_desc_last_index (string_desc_ty s, char c);
+
+/* Return the index of the first occurrence of NEEDLE in HAYSTACK,
+ or -1 if there is none. */
+extern ptrdiff_t string_desc_contains (string_desc_ty haystack, string_desc_ty needle);
+
+/* Return a string that represents the C string S, of length strlen (S). */
+extern string_desc_ty string_desc_from_c (const char *s);
+
+/* Return the substring of S, starting at offset START and ending at offset END.
+ START must be <= END.
+ The result is of length END - START.
+ The result must not be freed (since its storage is part of the storage
+ of S). */
+extern string_desc_ty string_desc_substring (string_desc_ty s, size_t start, size_t end);
+
+
+/* ==== Memory-allocating operations on string descriptors ==== */
+
+/* Return a string of length N, with uninitialized contents. */
+extern string_desc_ty string_desc_new (size_t n);
+
+/* Return a string of length N, at the given memory address. */
+extern string_desc_ty string_desc_new_addr (size_t n, char *addr);
+
+/* Return a string of length N, filled with C. */
+extern string_desc_ty string_desc_new_filled (size_t n, char c);
+
+/* Return a copy of string S. */
+extern string_desc_ty string_desc_copy (string_desc_ty s);
+
+/* Return the concatenation of N strings. N must be > 0. */
+extern string_desc_ty string_desc_concat (size_t n, string_desc_ty string1, ...);
+
+/* Return a copy of string S, as a NUL-terminated C string. */
+extern char * string_desc_c (string_desc_ty s);
+
+
+/* ==== Operations with side effects on string descriptors ==== */
+
+/* Overwrite the byte at index I of string S with C.
+ I must be < length(S). */
+extern void string_desc_set_char_at (string_desc_ty s, size_t i, char c);
+
+/* Fill part of S, starting at offset START and ending at offset END,
+ with copies of C.
+ START must be <= END. */
+extern void string_desc_fill (string_desc_ty s, size_t start, size_t end, char c);
+
+/* Overwrite part of S with T, starting at offset START.
+ START + length(T) must be <= length (S). */
+extern void string_desc_overwrite (string_desc_ty s, size_t start, string_desc_ty t);
+
+/* Free S. */
+extern void string_desc_free (string_desc_ty s);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _STR_DESC_H */
diff --git a/gettext-tools/src/x-perl.c b/gettext-tools/src/x-perl.c
index 428d9e486..30d4f9d42 100644
--- a/gettext-tools/src/x-perl.c
+++ b/gettext-tools/src/x-perl.c
@@ -32,6 +32,7 @@
#include "attribute.h"
#include "message.h"
#include "rc-str-list.h"
+#include "str-desc.h"
#include "xgettext.h"
#include "xg-pos.h"
#include "xg-encoding.h"
@@ -43,6 +44,7 @@
#include "error.h"
#include "error-progname.h"
#include "xalloc.h"
+#include "c-ctype.h"
#include "po-charset.h"
#include "unistr.h"
#include "uniname.h"
@@ -671,7 +673,7 @@ free_token (token_ty *tp)
of the semantics of the construct. Return the complete string,
including the starting and the trailing delimiter, with backslashes
removed where appropriate. */
-static char *
+static string_desc_ty
extract_quotelike_pass1 (int delim)
{
/* This function is called recursively. No way to allocate stuff
@@ -720,17 +722,16 @@ extract_quotelike_pass1 (int delim)
if (c == counter_delim || c == EOF)
{
buffer[bufpos++] = counter_delim; /* will be stripped off later */
- buffer[bufpos++] = '\0';
#if DEBUG_PERL
- fprintf (stderr, "PASS1: %s\n", buffer);
+ fprintf (stderr, "PASS1: %.*s\n", bufpos, buffer);
#endif
- return buffer;
+ return string_desc_new_addr (bufpos, buffer);
}
if (nested && c == delim)
{
- char *inner = extract_quotelike_pass1 (delim);
- size_t len = strlen (inner);
+ string_desc_ty inner = extract_quotelike_pass1 (delim);
+ size_t len = string_desc_length (inner);
/* Ensure room for len + 1 bytes. */
if (bufpos + len >= bufmax)
@@ -740,8 +741,8 @@ extract_quotelike_pass1 (int delim)
while (bufpos + len >= bufmax);
buffer = xrealloc (buffer, bufmax);
}
- strcpy (buffer + bufpos, inner);
- free (inner);
+ memcpy (buffer + bufpos, string_desc_data (inner), len);
+ string_desc_free (inner);
bufpos += len;
}
else if (c == '\\')
@@ -772,15 +773,15 @@ extract_quotelike_pass1 (int delim)
/* Like extract_quotelike_pass1, but return the complete string in UTF-8
encoding. */
-static char *
+static string_desc_ty
extract_quotelike_pass1_utf8 (int delim)
{
- char *string = extract_quotelike_pass1 (delim);
- char *utf8_string =
- from_current_source_encoding (string, lc_string, logical_file_name,
- line_number);
- if (utf8_string != string)
- free (string);
+ string_desc_ty string = extract_quotelike_pass1 (delim);
+ string_desc_ty utf8_string =
+ string_desc_from_current_source_encoding (string, lc_string,
+ logical_file_name, line_number);
+ if (utf8_string.data != string.data)
+ string_desc_free (string);
return utf8_string;
}
@@ -800,7 +801,7 @@ static int nesting_depth;
/* Forward declaration of local functions. */
-static void interpolate_keywords (message_list_ty *mlp, const char *string,
+static void interpolate_keywords (message_list_ty *mlp, string_desc_ty string,
int lineno);
static token_ty *x_perl_lex (message_list_ty *mlp);
static void x_perl_unlex (token_ty *tp);
@@ -876,16 +877,15 @@ extract_oct (const char *string, size_t len, unsigned int *result)
static void
extract_quotelike (token_ty *tp, int delim)
{
- char *string = extract_quotelike_pass1_utf8 (delim);
- size_t len = strlen (string);
+ string_desc_ty string = extract_quotelike_pass1_utf8 (delim);
+ size_t len = string_desc_length (string);
tp->type = token_type_string;
/* Take the string without the delimiters at the start and at the end. */
if (!(len >= 2))
abort ();
- string[len - 1] = '\0';
- tp->string = xstrdup (string + 1);
- free (string);
+ tp->string = string_desc_c (string_desc_substring (string, 1, len - 1));
+ string_desc_free (string);
tp->comment = add_reference (savable_comment);
}
@@ -897,14 +897,14 @@ static void
extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
bool interpolate)
{
- char *string;
+ string_desc_ty string;
tp->type = token_type_regex_op;
string = extract_quotelike_pass1_utf8 (delim);
if (interpolate)
interpolate_keywords (mlp, string, line_number);
- free (string);
+ string_desc_free (string);
if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
{
@@ -921,7 +921,7 @@ extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
string = extract_quotelike_pass1_utf8 (delim);
if (interpolate)
interpolate_keywords (mlp, string, line_number);
- free (string);
+ string_desc_free (string);
}
/* Perform pass 3 of quotelike extraction (interpolation).
@@ -1691,13 +1691,15 @@ extract_variable (message_list_ty *mlp, token_ty *tp, int first)
variables inside a double-quoted string that may interpolate to
some keyword hash (reference). The string is UTF-8 encoded. */
static void
-interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
+interpolate_keywords (message_list_ty *mlp, string_desc_ty string, int lineno)
{
static char *buffer;
static int bufmax = 0;
int bufpos = 0;
flag_context_ty context;
- int c;
+ size_t length;
+ size_t index;
+ char c;
bool maybe_hash_deref = false;
enum parser_state
{
@@ -1747,6 +1749,9 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
state = initial;
context = null_context;
+ length = string_desc_length (string);
+ index = 0;
+
token.type = token_type_string;
token.sub_type = string_type_qq;
token.line_number = line_number;
@@ -1757,10 +1762,11 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
pos.file_name = logical_file_name;
pos.line_number = lineno;
- while ((c = (unsigned char) *string++) != '\0')
+ while (index < length)
{
void *keyword_value;
+ c = string_desc_char_at (string, index++);
if (state == initial)
bufpos = 0;
@@ -1779,12 +1785,12 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
switch (c)
{
case '\\':
- c = (unsigned char) *string++;
- if (c == '\0')
+ if (index == length)
{
nesting_depth--;
return;
}
+ c = string_desc_char_at (string, index++);
break;
case '$':
buffer[bufpos++] = '$';
@@ -1807,7 +1813,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
state = two_dollars;
break;
default:
- if (c == '_' || c == ':' || c == '\'' || c >= 0x80
+ if (!c_isascii ((unsigned char) c)
+ || c == '_' || c == ':' || c == '\''
|| (c >= 'A' && c <= 'Z')
|| (c >= 'a' && c <= 'z')
|| (c >= '0' && c <= '9'))
@@ -1821,7 +1828,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
}
break;
case two_dollars:
- if (c == '_' || c == ':' || c == '\'' || c >= 0x80
+ if (!c_isascii ((unsigned char) c)
+ || c == '_' || c == ':' || c == '\''
|| (c >= 'A' && c <= 'Z')
|| (c >= 'a' && c <= 'z')
|| (c >= '0' && c <= '9'))
@@ -1874,7 +1882,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
state = initial;
break;
default:
- if (c == '_' || c == ':' || c == '\'' || c >= 0x80
+ if (!c_isascii ((unsigned char) c)
+ || c == '_' || c == ':' || c == '\''
|| (c >= 'A' && c <= 'Z')
|| (c >= 'a' && c <= 'z')
|| (c >= '0' && c <= '9'))
@@ -1926,7 +1935,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
state = dquote;
break;
default:
- if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
+ if (!c_isascii ((unsigned char) c)
+ || c == '_' || (c >= '0' && c <= '9')
|| (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
{
pos.line_number = lineno;
@@ -1959,19 +1969,23 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
state = wait_rbrace;
break;
case '\\':
- if (string[0] == '\"')
+ if (index == length)
{
- buffer[bufpos++] = string++[0];
- }
- else if (string[0])
- {
- buffer[bufpos++] = '\\';
- buffer[bufpos++] = string++[0];
+ context = null_context;
+ state = initial;
}
else
{
- context = null_context;
- state = initial;
+ c = string_desc_char_at (string, index++);
+ if (c == '\"')
+ {
+ buffer[bufpos++] = c;
+ }
+ else
+ {
+ buffer[bufpos++] = '\\';
+ buffer[bufpos++] = c;
+ }
}
break;
default:
@@ -1986,19 +2000,23 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
state = wait_rbrace;
break;
case '\\':
- if (string[0] == '\'')
- {
- buffer[bufpos++] = string++[0];
- }
- else if (string[0])
+ if (index == length)
{
- buffer[bufpos++] = '\\';
- buffer[bufpos++] = string++[0];
+ context = null_context;
+ state = initial;
}
else
{
- context = null_context;
- state = initial;
+ c = string_desc_char_at (string, index++);
+ if (c == '\'')
+ {
+ buffer[bufpos++] = c;
+ }
+ else
+ {
+ buffer[bufpos++] = '\\';
+ buffer[bufpos++] = c;
+ }
}
break;
default:
@@ -2007,7 +2025,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
}
break;
case barekey:
- if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
+ if (!c_isascii ((unsigned char) c)
+ || c == '_' || (c >= '0' && c <= '9')
|| (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
{
buffer[bufpos++] = c;
@@ -2343,7 +2362,8 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp)
}
extract_quotelike (tp, delim);
if (delim != '\'')
- interpolate_keywords (mlp, tp->string, line_number);
+ interpolate_keywords (mlp, string_desc_from_c (tp->string),
+ line_number);
free (tp->string);
drop_reference (tp->comment);
tp->type = token_type_regex_op;
@@ -2397,7 +2417,8 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp)
case 'x':
tp->type = token_type_string;
tp->sub_type = string_type_qq;
- interpolate_keywords (mlp, tp->string, line_number);
+ interpolate_keywords (mlp, string_desc_from_c (tp->string),
+ line_number);
break;
case 'r':
drop_reference (tp->comment);
@@ -2432,13 +2453,15 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp)
case '"':
extract_quotelike (tp, c);
tp->sub_type = string_type_qq;
- interpolate_keywords (mlp, tp->string, line_number);
+ interpolate_keywords (mlp, string_desc_from_c (tp->string),
+ line_number);
return;
case '`':
extract_quotelike (tp, c);
tp->sub_type = string_type_qq;
- interpolate_keywords (mlp, tp->string, line_number);
+ interpolate_keywords (mlp, string_desc_from_c (tp->string),
+ line_number);
return;
case '\'':
@@ -2535,7 +2558,8 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp)
tp->type = token_type_string;
tp->sub_type = string_type_qq;
tp->line_number = line_number + 1;
- interpolate_keywords (mlp, tp->string, tp->line_number);
+ interpolate_keywords (mlp, string_desc_from_c (tp->string),
+ tp->line_number);
return;
}
else if ((c >= 'A' && c <= 'Z')
@@ -2577,7 +2601,8 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp)
tp->sub_type = string_type_qq;
tp->comment = add_reference (savable_comment);
tp->line_number = line_number + 1;
- interpolate_keywords (mlp, tp->string, tp->line_number);
+ interpolate_keywords (mlp, string_desc_from_c (tp->string),
+ tp->line_number);
return;
}
}
@@ -2618,7 +2643,8 @@ x_perl_prelex (message_list_ty *mlp, token_ty *tp)
if (prefer_regexp_over_division (tp->last_type))
{
extract_quotelike (tp, c);
- interpolate_keywords (mlp, tp->string, line_number);
+ interpolate_keywords (mlp, string_desc_from_c (tp->string),
+ line_number);
free (tp->string);
drop_reference (tp->comment);
tp->type = token_type_regex_op;
diff --git a/gettext-tools/src/xg-encoding.c b/gettext-tools/src/xg-encoding.c
index b36fe148f..4bcd76e48 100644
--- a/gettext-tools/src/xg-encoding.c
+++ b/gettext-tools/src/xg-encoding.c
@@ -163,7 +163,7 @@ from_current_source_encoding (const char *string,
}
else if (xgettext_current_source_encoding == po_charset_utf8)
{
- if (u8_check ((uint8_t *) string, strlen (string)) != NULL)
+ if (u8_check ((const uint8_t *) string, strlen (string)) != NULL)
{
multiline_error (xstrdup (""),
xasprintf ("%s\n%s\n",
@@ -197,3 +197,62 @@ from_current_source_encoding (const char *string,
return (char *) string;
}
+
+/* Like from_current_source_encoding, for a string that may contain NULs.
+   STRING is a descriptor (data pointer plus length) of text in
+   xgettext_current_source_encoding.
+   - If that encoding is ASCII or UTF-8, STRING is only checked (via
+     is_ascii_string_desc or u8_check, respectively) and then returned as is.
+     If the check fails, an error message built from LCONTEXT, FILE_NAME and
+     LINE_NUMBER is reported through multiline_error and the program exits
+     with EXIT_FAILURE.
+   - For any other encoding, the result of convert_string_desc_directly with
+     target encoding po_charset_utf8 is returned; without HAVE_ICONV this
+     case calls abort ().
+   NOTE(review): who owns the storage of a converted result is not visible
+   here; check convert_string_desc_directly before freeing it.  */
+string_desc_ty
+string_desc_from_current_source_encoding (string_desc_ty string,
+ lexical_context_ty lcontext,
+ const char *file_name,
+ size_t line_number)
+{
+ if (xgettext_current_source_encoding == po_charset_ascii)
+ {
+ if (!is_ascii_string_desc (string))
+ {
+ multiline_error (xstrdup (""),
+ xasprintf ("%s\n%s\n",
+ non_ascii_error_message (lcontext,
+ file_name,
+ line_number),
+ _("Please specify the source encoding through --from-code.")));
+ exit (EXIT_FAILURE);
+ }
+ }
+ else if (xgettext_current_source_encoding == po_charset_utf8)
+ {
+ if (u8_check ((const uint8_t *) string_desc_data (string),
+ string_desc_length (string)) /* explicit length, no strlen: NULs are data */
+ != NULL) /* non-NULL result: u8_check flagged part of the data */
+ {
+ multiline_error (xstrdup (""),
+ xasprintf ("%s\n%s\n",
+ non_utf8_error_message (lcontext,
+ file_name,
+ line_number),
+ _("Please specify the source encoding through --from-code.")));
+ exit (EXIT_FAILURE);
+ }
+ }
+ else
+ {
+#if HAVE_ICONV
+ struct conversion_context context;
+
+ context.from_code = xgettext_current_source_encoding;
+ context.to_code = po_charset_utf8;
+ context.from_filename = file_name;
+ context.message = NULL;
+
+ string = convert_string_desc_directly (xgettext_current_source_iconv,
+ string, &context);
+#else
+ /* If we don't have iconv(), the only supported values for
+ xgettext_global_source_encoding and thus also for
+ xgettext_current_source_encoding are ASCII and UTF-8.
+ convert_string_desc_directly() should not be called in this case. */
+ abort ();
+#endif
+ }
+
+ return string; /* the unchanged input, or the converted descriptor */
+}
diff --git a/gettext-tools/src/xg-encoding.h b/gettext-tools/src/xg-encoding.h
index 9ef2da76f..f09665d82 100644
--- a/gettext-tools/src/xg-encoding.h
+++ b/gettext-tools/src/xg-encoding.h
@@ -1,5 +1,5 @@
/* Keeping track of the encoding of strings to be extracted.
- Copyright (C) 2001-2019 Free Software Foundation, Inc.
+ Copyright (C) 2001-2023 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -23,6 +23,8 @@
#include <iconv.h>
#endif
+#include "str-desc.h"
+
#ifdef __cplusplus
extern "C" {
@@ -79,6 +81,13 @@ extern char *from_current_source_encoding (const char *string,
const char *file_name,
size_t line_number);
+/* Like from_current_source_encoding, for a string that may contain NULs.
+   STRING is checked (ASCII and UTF-8 source encodings) or converted to
+   UTF-8 (other source encodings), and the resulting descriptor is returned.
+   In the ASCII and UTF-8 cases, a failed check is reported with a message
+   built from LCONTEXT, FILE_NAME and LINE_NUMBER and terminates the
+   program.  */
+extern string_desc_ty
+ string_desc_from_current_source_encoding (string_desc_ty string,
+ lexical_context_ty lcontext,
+ const char *file_name,
+ size_t line_number);
+
#ifdef __cplusplus
}
diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am
index 95638e9ec..6b30f9a00 100644
--- a/gettext-tools/tests/Makefile.am
+++ b/gettext-tools/tests/Makefile.am
@@ -130,6 +130,7 @@ TESTS = gettext-1 gettext-2 \
xgettext-objc-1 xgettext-objc-2 \
xgettext-perl-1 xgettext-perl-2 xgettext-perl-3 xgettext-perl-4 \
xgettext-perl-5 xgettext-perl-6 xgettext-perl-7 xgettext-perl-8 \
+ xgettext-perl-9 \
xgettext-perl-stackovfl-1 xgettext-perl-stackovfl-2 \
xgettext-perl-stackovfl-3 xgettext-perl-stackovfl-4 \
xgettext-php-1 xgettext-php-2 xgettext-php-3 xgettext-php-4 \
diff --git a/gettext-tools/tests/xgettext-perl-9 b/gettext-tools/tests/xgettext-perl-9
new file mode 100755
index 000000000..89462f8b5
--- /dev/null
+++ b/gettext-tools/tests/xgettext-perl-9
@@ -0,0 +1,56 @@
+#!/bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test of Perl support: use of NUL as delimiter.
+
+printf '"\0' > xg-pl-9a.pl  # a double quote followed by a NUL byte
+printf "'"'\0' > xg-pl-9b.pl  # a single quote followed by a NUL byte
+printf '/\0' > xg-pl-9c.pl  # a slash followed by a NUL byte
+printf '?\0' > xg-pl-9d.pl  # a question mark followed by a NUL byte
+printf '`\0' > xg-pl-9e.pl  # a backquote followed by a NUL byte
+printf 'm\0' > xg-pl-9f.pl  # the letter m followed by a NUL byte
+printf 'q\0' > xg-pl-9g.pl  # the letter q followed by a NUL byte
+
+: ${XGETTEXT=xgettext}  # default XGETTEXT to xgettext if it is unset
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9a.pl 2>xg-pl-9.err
+result=$?  # exit status of xgettext, saved before cat replaces it
+cat xg-pl-9.err  # show what xgettext wrote to stderr
+test $result = 0 || Exit 1  # run Exit 1 unless the exit status of xgettext was 0
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9b.pl 2>xg-pl-9.err
+result=$?
+cat xg-pl-9.err
+test $result = 0 || Exit 1
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9c.pl 2>xg-pl-9.err
+result=$?
+cat xg-pl-9.err
+test $result = 0 || Exit 1
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9d.pl 2>xg-pl-9.err
+result=$?
+cat xg-pl-9.err
+test $result = 0 || Exit 1
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9e.pl 2>xg-pl-9.err
+result=$?
+cat xg-pl-9.err
+test $result = 0 || Exit 1
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9f.pl 2>xg-pl-9.err
+result=$?
+cat xg-pl-9.err
+test $result = 0 || Exit 1
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-pl-9.tmp xg-pl-9g.pl 2>xg-pl-9.err
+result=$?
+cat xg-pl-9.err
+test $result = 0 || Exit 1
+
+exit 0