From c83ffc197ec483c6f44f907346f34127ec044ef0 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Sat, 18 Mar 2023 08:28:36 -0700 Subject: grep: -P (--perl-regexp) \d: match only ASCII digits Prior to grep-3.9, the PCRE matcher had always treated \d just like [0-9]. grep-3.9's fix for \w and \b mistakenly relaxed \d to also match multibyte digits. * src/grep.c (P_MATCHER_INDEX): Define enum. (pcre_pattern_expand_backslash_d): New function. (main): Call it for -P. * NEWS (Bug fixes): Mention it. * doc/grep.texi: Document it: with -P, \d matches only ASCII digits. Provide a PCRE documentation URL and an example of how to use (?s) with -z. * tests/pcre-ascii-digits: New test. * tests/Makefile.am (TESTS): Add that file name. Reported as https://bugs.gnu.org/62267 --- NEWS | 10 ++++++ doc/grep.texi | 31 +++++++++++++++++++ src/grep.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++- tests/Makefile.am | 1 + tests/pcre-ascii-digits | 31 +++++++++++++++++++ 5 files changed, 154 insertions(+), 1 deletion(-) create mode 100755 tests/pcre-ascii-digits diff --git a/NEWS b/NEWS index 803e14b3..a24cebd8 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,16 @@ GNU grep NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** Bug fixes + + With -P, \d now matches only ASCII digits, regardless of PCRE + options/modes. The changes in grep-3.9 to make \b and \w work + properly had the undesirable side effect of making \d also match + e.g., the Arabic digits: ٠١٢٣٤٥٦٧٨٩. With grep-3.9, -P '\d+' + would match that ten-digit (20-byte) string. Now, to match such + a digit, you would use \p{Nd}. + [bug introduced in grep 3.9] + * Noteworthy changes in release 3.9 (2023-03-05) [stable] diff --git a/doc/grep.texi b/doc/grep.texi index 621beaf5..eaad6e17 100644 --- a/doc/grep.texi +++ b/doc/grep.texi @@ -1141,6 +1141,37 @@ combined with the @option{-z} (@option{--null-data}) option, and note that @samp{grep@ -P} may warn of unimplemented features. @xref{Other Options}. +For documentation, refer to @url{https://www.pcre.org/}, with these caveats: +@itemize +@item +@samp{\d} always matches only the ten ASCII digits, regardless of locale or +in-regexp directives like @samp{(?aD)}. +Use @samp{\p@{Nd@}} if you require to match non-ASCII digits. +Once pcre2 support for @samp{(?aD)} is widespread enough, +we expect to make that the default, so it will be overridable. +@c Using pcre2 git commit pcre2-10.40-112-g6277357, this demonstrates how +@c we'll prefix with (?aD) to make \d's ASCII-only behavior the default: +@c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '(?aD)^\d+' <<< '٠١٢٣٤٥٦٧٨٩' +@c [Exit 1] +@c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '^\d+' <<< '٠١٢٣٤٥٦٧٨٩' +@c ٠١٢٣٤٥٦٧٨٩ + +@item +By default, @command{grep} applies each regexp to a line at a time, +so the @samp{(?s)} directive (making @samp{.} match line breaks) +is generally ineffective. +However, with @option{-z} (@option{--null-data}) it can work: +@example +$ printf 'a\nb\n' |grep -zP '(?s)a.b' +a +b +@end example +But beware: with the @option{-z} (@option{--null-data}) and a file +containing no NUL byte, grep must read the entire file into memory +before processing any of it. +Thus, it will exhaust memory and fail for some large files. +@end itemize + @end table diff --git a/src/grep.c b/src/grep.c index 7547b641..6ba881ef 100644 --- a/src/grep.c +++ b/src/grep.c @@ -2089,7 +2089,8 @@ static struct #endif }; /* Keep these in sync with the 'matchers' table. */ -enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 }; +enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0, + P_MATCHER_INDEX = 6 }; /* Return the index of the matcher corresponding to M if available. MATCHER is the index of the previous matcher, or -1 if none. @@ -2378,6 +2379,80 @@ fgrep_to_grep_pattern (char **keys_p, idx_t *len_p) *len_p = p - new_keys; } +/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII + digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise + match non-ASCII digits in some locales. Use \p{Nd} if you require to match + those. */ +static void +pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p) +{ + idx_t len = *len_p; + char *keys = *keys_p; + mbstate_t mb_state = { 0 }; + char *new_keys = xnmalloc (len / 2 + 1, 5); + char *p = new_keys; + bool prev_backslash = false; + + for (ptrdiff_t n; len; keys += n, len -= n) + { + n = mb_clen (keys, len, &mb_state); + switch (n) + { + case -2: + n = len; + FALLTHROUGH; + default: + if (prev_backslash) + { + prev_backslash = false; + *p++ = '\\'; + } + p = mempcpy (p, keys, n); + break; + + case -1: + if (prev_backslash) + { + prev_backslash = false; + *p++ = '\\'; + } + memset (&mb_state, 0, sizeof mb_state); + n = 1; + FALLTHROUGH; + case 1: + if (prev_backslash) + { + prev_backslash = false; + switch (*keys) + { + case 'd': + p = mempcpy (p, "[0-9]", 5); + break; + default: + *p++ = '\\'; + *p++ = *keys; + break; + } + } + else + { + if (*keys == '\\') + prev_backslash = true; + else + *p++ = *keys; + } + break; + } + } + + if (prev_backslash) + *p++ = '\\'; + *p = '\n'; + free (*keys_p); + *keys_p = new_keys; + *len_p = p - new_keys; +} + /* If it is easy, convert the MATCHER-style patterns KEYS (of size *LEN_P) to -F style, update *LEN_P to a possibly-smaller value, and return F_MATCHER_INDEX. If not, leave KEYS and *LEN_P alone and @@ -2970,6 +3045,11 @@ main (int argc, char **argv) matcher = try_fgrep_pattern (matcher, keys, &keycc); } + /* If -P, replace each \d with [0-9]. + Those who want to match non-ASCII digits must use \p{Nd}. */ + if (matcher == P_MATCHER_INDEX) + pcre_pattern_expand_backslash_d (&keys, &keycc); + execute = matchers[matcher].execute; compiled_pattern = matchers[matcher].compile (keys, keycc, matchers[matcher].syntax, diff --git a/tests/Makefile.am b/tests/Makefile.am index a47cf5c0..f195c8da 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -139,6 +139,7 @@ TESTS = \ options \ pcre \ pcre-abort \ + pcre-ascii-digits \ pcre-context \ pcre-count \ pcre-infloop \ diff --git a/tests/pcre-ascii-digits b/tests/pcre-ascii-digits new file mode 100755 index 00000000..ae713f7a --- /dev/null +++ b/tests/pcre-ascii-digits @@ -0,0 +1,31 @@ +#!/bin/sh +# Ensure that grep -P's \d matches only the 10 ASCII digits. +# With, grep-3.9, \d would match e.g., the multibyte Arabic digits. +# +# Copyright (C) 2023 Free Software Foundation, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. + +. "${srcdir=.}/init.sh"; path_prepend_ ../src +require_en_utf8_locale_ +LC_ALL=en_US.UTF-8 +export LC_ALL +require_pcre_ + +echo . | grep -qP '(*UTF).' 2>/dev/null \ + || skip_ 'PCRE unicode support is compiled out' + +fail=0 + +# $ printf %s ٠١٢٣٤٥٦٧٨٩|od -An -to1 -w10 |sed 's/ /\\/g'; : arabic digits +# \331\240\331\241\331\242\331\243\331\244 +# \331\245\331\246\331\247\331\250\331\251 +printf '\331\240\331\241\331\242\331\243\331\244' > in || framework_failure_ +printf '\331\245\331\246\331\247\331\250\331\251' >> in || framework_failure_ + +grep -P '\d+' in > out && fail=1 +compare /dev/null out || fail=1 + +Exit $fail -- cgit v1.2.1