summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Carlo Marcelo Arenas Belón <carenas@gmail.com> 2023-04-20 18:37:20 -0700
committer: Jim Meyering <meyering@meta.com> 2023-04-30 00:01:41 -0700
commit: fa4e6c8a771554111c5890c5db2c80214cc2bcc9 (patch)
tree: 6536de0a9910508d8036d3b62677e9a699daf93b
parent: 8d3afeebcc2bdf2e8fd4ed1c5256e54be95f36a1 (diff)
download: grep-fa4e6c8a771554111c5890c5db2c80214cc2bcc9.tar.gz
pcre: work around a PCRE2_MATCH_INVALID_UTF bug
PCRE2 has a bug when using PCRE2_MATCH_INVALID_UTF: it would sometimes fail to match patterns using negated classes like \W and \D. * NEWS (Bug fixes): Mention it. * src/pcresearch.c: Restrict impact of the bug. Do not use the problematic flag with broken versions of PCRE2. Also, generate locale tables only for single-byte locales, as the PCRE2 documentation recommends this. * tests/Makefile.am (TESTS): Add the new test's file name. * tests/pcre-utf8-bug224: New file, to test for this.
-rw-r--r-- NEWS 5
-rw-r--r-- src/pcresearch.c 29
-rw-r--r-- tests/Makefile.am 1
-rwxr-xr-x tests/pcre-utf8-bug224 31
4 files changed, 54 insertions, 12 deletions
diff --git a/NEWS b/NEWS
index 995d14ef..3085b2a6 100644
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,11 @@ GNU grep NEWS -*- outline -*-
when running on 32-bit x86 and ARM hosts using glibc 2.34+.
[bug introduced in grep 3.9]
+ grep -P no longer fails to match patterns using negated classes
+ like \D or \W when linked with PCRE2 10.34 or newer.
+ [bug introduced in grep 3.8]
+
+
** Changes in behavior
grep --version now prints a line describing the version of PCRE2 it uses.
diff --git a/src/pcresearch.c b/src/pcresearch.c
index e867f49f..44262ac6 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -39,6 +39,15 @@
# define PCRE2_EXTRA_ASCII_BSD 0
#endif
+/* Use PCRE2_MATCH_INVALID_UTF if supported and not buggy;
+ see <https://github.com/PCRE2Project/pcre2/issues/224>.
+ Assume the bug will be fixed after PCRE2 10.42. */
+#if defined PCRE2_MATCH_INVALID_UTF && 10 < PCRE2_MAJOR + (42 < PCRE2_MINOR)
+enum { MATCH_INVALID_UTF = PCRE2_MATCH_INVALID_UTF };
+#else
+enum { MATCH_INVALID_UTF = 0 };
+#endif
+
struct pcre_comp
{
/* General context for PCRE operations. */
@@ -130,16 +139,11 @@ jit_exec (struct pcre_comp *pc, char const *subject, idx_t search_bytes,
}
}
-/* Return true if E is an error code for bad UTF-8, and if pcre2_match
- could return E because PCRE lacks PCRE2_MATCH_INVALID_UTF. */
+/* Return true if E is an error code for bad UTF-8. */
static bool
bad_utf8_from_pcre2 (int e)
{
-#ifdef PCRE2_MATCH_INVALID_UTF
- return false;
-#else
return PCRE2_ERROR_UTF8_ERR21 <= e && e <= PCRE2_ERROR_UTF8_ERR1;
-#endif
}
/* Compile the -P style PATTERN, containing SIZE bytes that are
@@ -168,6 +172,9 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
flags |= PCRE2_UTF;
+ /* If supported, consider invalid UTF-8 as a barrier not an error. */
+ flags |= MATCH_INVALID_UTF;
+
/* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP
so that \d does not have the undesirable effect of matching
non-ASCII digits. Otherwise (i.e., with PCRE2 10.42 and earlier),
@@ -180,10 +187,6 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
/* Do not match individual code units but only UTF-8. */
flags |= PCRE2_NEVER_BACKSLASH_C;
#endif
-#ifdef PCRE2_MATCH_INVALID_UTF
- /* Consider invalid UTF-8 as a barrier, instead of error. */
- flags |= PCRE2_MATCH_INVALID_UTF;
-#endif
}
/* FIXME: Remove this restriction. */
@@ -226,7 +229,9 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
size = re_size;
}
- pcre2_set_character_tables (ccontext, pcre2_maketables (gcontext));
+ if (!localeinfo.multibyte)
+ pcre2_set_character_tables (ccontext, pcre2_maketables (gcontext));
+
pc->cre = pcre2_compile ((PCRE2_SPTR) pattern, size, flags,
&ec, &e, ccontext);
if (!pc->cre)
@@ -313,7 +318,7 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
e = jit_exec (pc, subject, line_end - subject,
search_offset, options);
- if (!bad_utf8_from_pcre2 (e))
+ if (MATCH_INVALID_UTF || !bad_utf8_from_pcre2 (e))
break;
idx_t valid_bytes = pcre2_get_startchar (pc->data);
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 7718f24a..9b4422eb 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -155,6 +155,7 @@ TESTS = \
pcre-jitstack \
pcre-o \
pcre-utf8 \
+ pcre-utf8-bug224 \
pcre-utf8-w \
pcre-w \
pcre-wx-backref \
diff --git a/tests/pcre-utf8-bug224 b/tests/pcre-utf8-bug224
new file mode 100755
index 00000000..78845017
--- /dev/null
+++ b/tests/pcre-utf8-bug224
@@ -0,0 +1,31 @@
+#!/bin/sh
+# Ensure negated Perl classes match multibyte characters in UTF mode.
+#
+# Copyright (C) 2023 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_en_utf8_locale_
+LC_ALL=en_US.UTF-8
+export LC_ALL
+require_pcre_
+
+echo . | grep -qP '(*UTF).' 2>/dev/null \
+ || skip_ 'PCRE unicode support is compiled out'
+
+fail=0
+
+# U+0091, a two-byte C1 control character (not 'ñ', U+00F1, which is \303\261)
+printf '\302\221\n' > in || framework_failure_
+grep -P '\D' in > out || fail=1
+compare in out || fail=1
+
+# '𝄞' - U+1D11E MUSICAL SYMBOL G CLEF
+printf '\360\235\204\236\n' > in || framework_failure_
+grep -P '\W' in > out || fail=1
+compare in out || fail=1
+
+Exit $fail