grep: fix -P [\d] by fixing \w only if PCRE2 10.43

Our prepass-based fixes for the -P \d bug have caused repeated further bugs. Avoid the need for a prepass, by using PCRE2_UCP only if PCRE2_EXTRA_ASCII_BSD is also supported. Since the -P \w bug was present from grep 2.5 through 3.8 it’s OK if we wait a little longer to fix it.
* NEWS: Mention this.
* src/pcresearch.c (pcre_pattern_expand_backslash_d): Remove. Remove its use.
(Pcompile): Use PCRE2_UCP only if PCRE2_EXTRA_ASCII_BSD.
* tests/pcre-ascii-digits, tests/pcre-utf8-w: Skip tests on older PCRE2 implementations.
author: Paul Eggert <eggert@cs.ucla.edu> 2023-04-01 13:55:26 -0700
committer: Paul Eggert <eggert@cs.ucla.edu> 2023-04-02 09:47:16 -0700
commit: c63a0950ff852c94e27d14b6d0eea001eddb7de1 (patch)
tree: 8d704d100948c3194c174e7d01182a2ffabc0464
parent: 1d59f1b342e1ec681b87cb21788ec04ebd7a1c75 (diff)
download: grep-c63a0950ff852c94e27d14b6d0eea001eddb7de1.tar.gz
4 files changed, 27 insertions, 99 deletions
diff --git a/NEWS b/NEWS
index 400c2566..6ebade3f 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,14 @@ GNU grep NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** Bug fixes
+
+  With -P, patterns like [\d] now work again.  The fix relies on PCRE2
+  support for the PCRE2_EXTRA_ASCII_BSD flag planned for PCRE2 10.43.
+  With PCRE2 version 10.42 or earlier, behavior reverts to that of
+  grep 3.8, in that patterns like \w and \b use ASCII rather than
+  Unicode interpretations.
+
 
 * Noteworthy changes in release 3.10 (2023-03-22) [stable]
 
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 34b2aeb9..e77509c4 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -133,97 +133,12 @@ bad_utf8_from_pcre2 (int e)
 #endif
 }
 
-#if ! PCRE2_EXTRA_ASCII_BSD
-/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
-   digits.  Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
-   match non-ASCII digits in some locales.  Use \p{Nd} if you require to match
-   those.  Similarly, replace each \D with [^0-9].
-   FIXME: remove in 2025, or whenever we no longer accommodate pcre2-10.42
-   and prior.  */
-static void
-pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
-{
-  idx_t len = *len_p;
-  char *keys = *keys_p;
-  mbstate_t mb_state = { 0 };
-  char *new_keys = xnmalloc (len / 2 + 1, 5);
-  char *p = new_keys;
-  bool prev_backslash = false;
-
-  for (ptrdiff_t n; len; keys += n, len -= n)
-    {
-      n = mb_clen (keys, len, &mb_state);
-      switch (n)
-        {
-        case -2:
-          n = len;
-          FALLTHROUGH;
-        default:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              *p++ = '\\';
-            }
-          p = mempcpy (p, keys, n);
-          break;
-
-        case -1:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              *p++ = '\\';
-            }
-          memset (&mb_state, 0, sizeof mb_state);
-          n = 1;
-          FALLTHROUGH;
-        case 1:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              switch (*keys)
-                {
-                case 'd':
-                  p = mempcpy (p, "[0-9]", 5);
-                  break;
-                case 'D':
-                  p = mempcpy (p, "[^0-9]", 6);
-                  break;
-                default:
-                  *p++ = '\\';
-                  *p++ = *keys;
-                  break;
-                }
-            }
-          else
-            {
-              if (*keys == '\\')
-                prev_backslash = true;
-              else
-                *p++ = *keys;
-            }
-          break;
-        }
-    }
-
-  if (prev_backslash)
-    *p++ = '\\';
-  *p = '\n';
-  free (*keys_p);
-  *keys_p = new_keys;
-  *len_p = p - new_keys;
-}
-#endif
-
 /* Compile the -P style PATTERN, containing SIZE bytes that are
    followed by '\n'.  Return a description of the compiled pattern.  */
 
 void *
 Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
 {
-#if ! PCRE2_EXTRA_ASCII_BSD
-  pcre_pattern_expand_backslash_d (&pattern, &size);
-#endif
-
   PCRE2_SIZE e;
   int ec;
   int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
@@ -241,7 +156,17 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
              _("-P supports only unibyte locales on this platform"));
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
-      flags |= (PCRE2_UTF | PCRE2_UCP);
+
+      flags |= PCRE2_UTF;
+
+      /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP
+         so that \d does not have the undesirable effect of matching
+         non-ASCII digits.  Otherwise (i.e., with PCRE2 10.42 and earlier),
+         escapes like \w have only their ASCII interpretations,
+         but that's better than the confusion that would ensue if \d
+         matched non-ASCII digits.  */
+      flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0;
+
 #if 0
       /* Do not match individual code units but only UTF-8.  */
       flags |= PCRE2_NEVER_BACKSLASH_C;
diff --git a/tests/pcre-ascii-digits b/tests/pcre-ascii-digits
index de9fe383..9dfc0fae 100755
--- a/tests/pcre-ascii-digits
+++ b/tests/pcre-ascii-digits
@@ -17,6 +17,8 @@ require_pcre_
 
 echo . | grep -qP '(*UTF).' 2>/dev/null \
   || skip_ 'PCRE unicode support is compiled out'
+echo 0 | grep -qP '(?aD)\d' \
+  || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
 
 fail=0
 
@@ -44,19 +46,10 @@ printf '\331\2404\n' > in2 || framework_failure_
 returns_ 1 grep -P '\d\d' in2 > out || fail=1
 compare /dev/null out || fail=1
 
-# The following tests work only when built with 10.43 or newer,
-# with which, grep accepts the mode-setting '(?aD)':
-if echo 0 | grep -qP '(?aD)\d'; then
+grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
+compare in2 out || fail=1
 
-  grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
-  compare in2 out || fail=1
-
-  returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
-  compare /dev/null out || fail=1
-
-else
-  warn_ 'skipped some tests: use PCRE2 10.43 or newer to enable' \
-    'support for e.g., (?aD) and (?-aD)'
-fi
+returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
+compare /dev/null out || fail=1
 
 Exit $fail
diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w
index a88ace4d..aa347840 100755
--- a/tests/pcre-utf8-w
+++ b/tests/pcre-utf8-w
@@ -16,6 +16,8 @@ require_pcre_
 
 echo . | grep -qP '(*UTF).' 2>/dev/null \
   || skip_ 'PCRE unicode support is compiled out'
+echo 0 | grep -qP '(?aD)\d' \
+  || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
 
 fail=0
author	Paul Eggert <eggert@cs.ucla.edu>	2023-04-01 13:55:26 -0700
committer	Paul Eggert <eggert@cs.ucla.edu>	2023-04-02 09:47:16 -0700
commit	c63a0950ff852c94e27d14b6d0eea001eddb7de1 (patch)
tree	8d704d100948c3194c174e7d01182a2ffabc0464
parent	1d59f1b342e1ec681b87cb21788ec04ebd7a1c75 (diff)
download	grep-c63a0950ff852c94e27d14b6d0eea001eddb7de1.tar.gz