diff options
author | Carlo Marcelo Arenas Belón <carenas@gmail.com> | 2023-04-20 18:37:20 -0700 |
---|---|---|
committer | Jim Meyering <meyering@meta.com> | 2023-04-30 00:01:41 -0700 |
commit | fa4e6c8a771554111c5890c5db2c80214cc2bcc9 (patch) | |
tree | 6536de0a9910508d8036d3b62677e9a699daf93b | |
parent | 8d3afeebcc2bdf2e8fd4ed1c5256e54be95f36a1 (diff) | |
download | grep-fa4e6c8a771554111c5890c5db2c80214cc2bcc9.tar.gz |
pcre: work around a PCRE2_MATCH_INVALID_UTF bug
PCRE2 has a bug when using PCRE2_MATCH_INVALID_UTF: it would
sometimes fail to match patterns using negative classes
like \W and \D.
* NEWS (Bug fixes): Mention it.
* src/pcresearch.c: Restrict impact of the bug.
Do not use the problematic flag with broken versions of PCRE2.
Also, generate locale tables only for single-byte locales,
as the PCRE2 documentation recommends this.
* tests/Makefile.am (TESTS): Add the new test, pcre-utf8-bug224.
* tests/pcre-utf8-bug224: New file, to test for this.
-rw-r--r-- | NEWS | 5 | ||||
-rw-r--r-- | src/pcresearch.c | 29 | ||||
-rw-r--r-- | tests/Makefile.am | 1 | ||||
-rwxr-xr-x | tests/pcre-utf8-bug224 | 31 |
4 files changed, 54 insertions, 12 deletions
@@ -16,6 +16,11 @@ GNU grep NEWS -*- outline -*- when running on 32-bit x86 and ARM hosts using glibc 2.34+. [bug introduced in grep 3.9] + grep -P no longer fails to match patterns using negated classes + like \D or \W when linked with PCRE2 10.34 or newer. + [bug introduced in grep 3.8] + + ** Changes in behavior grep --version now prints a line describing the version of PCRE2 it uses. diff --git a/src/pcresearch.c b/src/pcresearch.c index e867f49f..44262ac6 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -39,6 +39,15 @@ # define PCRE2_EXTRA_ASCII_BSD 0 #endif +/* Use PCRE2_MATCH_INVALID_UTF if supported and not buggy; + see <https://github.com/PCRE2Project/pcre2/issues/224>. + Assume the bug will be fixed after PCRE2 10.42. */ +#if defined PCRE2_MATCH_INVALID_UTF && 10 < PCRE2_MAJOR + (42 < PCRE2_MINOR) +enum { MATCH_INVALID_UTF = PCRE2_MATCH_INVALID_UTF }; +#else +enum { MATCH_INVALID_UTF = 0 }; +#endif + struct pcre_comp { /* General context for PCRE operations. */ @@ -130,16 +139,11 @@ jit_exec (struct pcre_comp *pc, char const *subject, idx_t search_bytes, } } -/* Return true if E is an error code for bad UTF-8, and if pcre2_match - could return E because PCRE lacks PCRE2_MATCH_INVALID_UTF. */ +/* Return true if E is an error code for bad UTF-8. */ static bool bad_utf8_from_pcre2 (int e) { -#ifdef PCRE2_MATCH_INVALID_UTF - return false; -#else return PCRE2_ERROR_UTF8_ERR21 <= e && e <= PCRE2_ERROR_UTF8_ERR1; -#endif } /* Compile the -P style PATTERN, containing SIZE bytes that are @@ -168,6 +172,9 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) flags |= PCRE2_UTF; + /* If supported, consider invalid UTF-8 as a barrier not an error. */ + flags |= MATCH_INVALID_UTF; + /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP so that \d does not have the undesirable effect of matching non-ASCII digits. 
Otherwise (i.e., with PCRE2 10.42 and earlier), @@ -180,10 +187,6 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) /* Do not match individual code units but only UTF-8. */ flags |= PCRE2_NEVER_BACKSLASH_C; #endif -#ifdef PCRE2_MATCH_INVALID_UTF - /* Consider invalid UTF-8 as a barrier, instead of error. */ - flags |= PCRE2_MATCH_INVALID_UTF; -#endif } /* FIXME: Remove this restriction. */ @@ -226,7 +229,9 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) size = re_size; } - pcre2_set_character_tables (ccontext, pcre2_maketables (gcontext)); + if (!localeinfo.multibyte) + pcre2_set_character_tables (ccontext, pcre2_maketables (gcontext)); + pc->cre = pcre2_compile ((PCRE2_SPTR) pattern, size, flags, &ec, &e, ccontext); if (!pc->cre) @@ -313,7 +318,7 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size, e = jit_exec (pc, subject, line_end - subject, search_offset, options); - if (!bad_utf8_from_pcre2 (e)) + if (MATCH_INVALID_UTF || !bad_utf8_from_pcre2 (e)) break; idx_t valid_bytes = pcre2_get_startchar (pc->data); diff --git a/tests/Makefile.am b/tests/Makefile.am index 7718f24a..9b4422eb 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -155,6 +155,7 @@ TESTS = \ pcre-jitstack \ pcre-o \ pcre-utf8 \ + pcre-utf8-bug224 \ pcre-utf8-w \ pcre-w \ pcre-wx-backref \ diff --git a/tests/pcre-utf8-bug224 b/tests/pcre-utf8-bug224 new file mode 100755 index 00000000..78845017 --- /dev/null +++ b/tests/pcre-utf8-bug224 @@ -0,0 +1,31 @@ +#!/bin/sh +# Ensure negated Perl classes match multibyte characters in UTF mode. +# +# Copyright (C) 2023 Free Software Foundation, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. + +. "${srcdir=.}/init.sh"; path_prepend_ ../src +require_en_utf8_locale_ +LC_ALL=en_US.UTF-8 +export LC_ALL +require_pcre_ + +echo . 
| grep -qP '(*UTF).' 2>/dev/null \ + || skip_ 'PCRE unicode support is compiled out' + +fail=0 + +# 'ñ' - U+00F1 LATIN SMALL LETTER N WITH TILDE +printf '\302\221\n' > in || framework_failure_ +grep -P '\D' in > out || fail=1 +compare in out || fail=1 + +# '𝄞' - U+1D11E MUSICAL SYMBOL G CLEF +printf '\360\235\204\236\n' > in || framework_failure_ +grep -P '\W' in > out || fail=1 +compare in out || fail=1 + +Exit $fail |