diff options
author | Jim Meyering <meyering@fb.com> | 2023-03-18 23:25:03 -0700 |
---|---|---|
committer | Jim Meyering <meyering@meta.com> | 2023-03-19 13:36:23 -0700 |
commit | 98ee05b4ddfee5c1db2248bdb060a2cd64bf75fa (patch) | |
tree | 3f698eb54e1cd35347d16ad14270261e3c0a95d3 | |
parent | 99330c2b1dc8b619dff8a5a6a35f524d382508c8 (diff) | |
download | grep-98ee05b4ddfee5c1db2248bdb060a2cd64bf75fa.tar.gz |
grep: -P (--perl-regexp) \D once again works like [^0-9]
* NEWS: Mention \D, too.
* doc/grep.texi: Likewise.
* src/pcresearch.c (pcre_pattern_expand_backslash_d): Handle \D.
Also, ifdef-out this new function and its call site when not needed.
* tests/pcre-ascii-digits: Test \D, too.
Tighten one test by using returns_ 1.
Add comments and tests that work only with 10.43 and newer.
Paul Eggert raised the issue of \D in https://bugs.gnu.org/62267#8
-rw-r--r-- | NEWS | 2 | ||||
-rw-r--r-- | doc/grep.texi | 20 | ||||
-rw-r--r-- | src/pcresearch.c | 14 | ||||
-rwxr-xr-x | tests/pcre-ascii-digits | 33 |
4 files changed, 51 insertions, 18 deletions
@@ -9,7 +9,7 @@ GNU grep NEWS -*- outline -*- properly had the undesirable side effect of making \d also match e.g., the Arabic digits: ٠١٢٣٤٥٦٧٨٩. With grep-3.9, -P '\d+' would match that ten-digit (20-byte) string. Now, to match such - a digit, you would use \p{Nd}. + a digit, you would use \p{Nd}. Similarly, \D is now mapped to [^0-9]. [bug introduced in grep 3.9] diff --git a/doc/grep.texi b/doc/grep.texi index 8a0aef51..7a00adda 100644 --- a/doc/grep.texi +++ b/doc/grep.texi @@ -1144,21 +1144,15 @@ combined with the @option{-z} (@option{--null-data}) option, and note that For documentation, refer to @url{https://www.pcre.org/}, with these caveats: @itemize @item -@samp{\d} matches only the ten ASCII digits, regardless of locale. +@samp{\d} matches only the ten ASCII digits +(and @samp{\D} matches the complement), regardless of locale. Use @samp{\p@{Nd@}} to also match non-ASCII digits. -When @command{grep} is built with PCRE2 10.42 and earlier, @samp{\d} -ignores in-regexp directives like @samp{(?aD)} and matches only ASCII -digits regardless of these directives. However, later versions of -PCRE2 likely will fix this, and the plan is for @command{grep} to -respect those directives if possible. -@c Using PCRE2 git commit pcre2-10.40-112-g6277357, this demonstrates -@c the equivalent of how grep could use PCRE2_EXTRA_ASCII_BSD to make \d's -@c ASCII-only behavior the default: -@c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '(?aD)^\d+' <<< '٠١٢٣٤٥٦٧٨٩' -@c [Exit 1] -@c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '^\d+' <<< '٠١٢٣٤٥٦٧٨٩' -@c ٠١٢٣٤٥٦٧٨٩ +When @command{grep} is built with PCRE2 10.42 and earlier, +@samp{\d} and @samp{\D} ignore in-regexp directives like @samp{(?aD)} +and work like @samp{[0-9]} and @samp{[^0-9]} respectively. +However, later versions of PCRE2 likely will fix this, +and the plan is for @command{grep} to respect those directives if possible. 
@item Although PCRE tracks the syntax and semantics of Perl's regular diff --git a/src/pcresearch.c b/src/pcresearch.c index d3701816..34b2aeb9 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -133,10 +133,13 @@ bad_utf8_from_pcre2 (int e) #endif } +#if ! PCRE2_EXTRA_ASCII_BSD /* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise match non-ASCII digits in some locales. Use \p{Nd} if you require to match - those. */ + those. Similarly, replace each \D with [^0-9]. + FIXME: remove in 2025, or whenever we no longer accommodate pcre2-10.42 + and prior. */ static void pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p) { @@ -182,6 +185,9 @@ pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p) case 'd': p = mempcpy (p, "[0-9]", 5); break; + case 'D': + p = mempcpy (p, "[^0-9]", 6); + break; default: *p++ = '\\'; *p++ = *keys; @@ -206,6 +212,7 @@ pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p) *keys_p = new_keys; *len_p = p - new_keys; } +#endif /* Compile the -P style PATTERN, containing SIZE bytes that are followed by '\n'. Return a description of the compiled pattern. */ @@ -213,8 +220,9 @@ pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p) void * Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) { - if (! PCRE2_EXTRA_ASCII_BSD) - pcre_pattern_expand_backslash_d (&pattern, &size); +#if ! PCRE2_EXTRA_ASCII_BSD + pcre_pattern_expand_backslash_d (&pattern, &size); +#endif PCRE2_SIZE e; int ec; diff --git a/tests/pcre-ascii-digits b/tests/pcre-ascii-digits index ae713f7a..de9fe383 100755 --- a/tests/pcre-ascii-digits +++ b/tests/pcre-ascii-digits @@ -1,6 +1,7 @@ #!/bin/sh # Ensure that grep -P's \d matches only the 10 ASCII digits. # With, grep-3.9, \d would match e.g., the multibyte Arabic digits. +# The same applied to \D. # # Copyright (C) 2023 Free Software Foundation, Inc. 
# @@ -24,8 +25,38 @@ fail=0 # \331\245\331\246\331\247\331\250\331\251 printf '\331\240\331\241\331\242\331\243\331\244' > in || framework_failure_ printf '\331\245\331\246\331\247\331\250\331\251' >> in || framework_failure_ +printf '\n' >> in || framework_failure_ -grep -P '\d+' in > out && fail=1 +# Ensure that \d matches no character. +returns_ 1 grep -P '\d' in > out || fail=1 compare /dev/null out || fail=1 +# Ensure that ^\D+$ matches the entire line. +grep -P '^\D+$' in > out || fail=1 +compare in out || fail=1 + +# When built with PCRE2 10.43 and newer, one may use (?aD) and (?-aD) +# to toggle between modes. (?aD) is the default (making \d == [0-9]). +# (?-aD) relaxes \d, making it match "all" digits. +# Use mixed digits as input: Arabic 0 and ASCII 4: ٠4 +printf '\331\2404\n' > in2 || framework_failure_ + +returns_ 1 grep -P '\d\d' in2 > out || fail=1 +compare /dev/null out || fail=1 + +# The following tests work only when built with 10.43 or newer, +# with which, grep accepts the mode-setting '(?aD)': +if echo 0 | grep -qP '(?aD)\d'; then + + grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1 + compare in2 out || fail=1 + + returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1 + compare /dev/null out || fail=1 + +else + warn_ 'skipped some tests: use PCRE2 10.43 or newer to enable' \ + 'support for e.g., (?aD) and (?-aD)' +fi + Exit $fail |