summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paul Eggert <eggert@cs.ucla.edu> 2023-04-01 13:55:26 -0700
committer Paul Eggert <eggert@cs.ucla.edu> 2023-04-02 09:47:16 -0700
commit c63a0950ff852c94e27d14b6d0eea001eddb7de1 (patch)
tree 8d704d100948c3194c174e7d01182a2ffabc0464
parent 1d59f1b342e1ec681b87cb21788ec04ebd7a1c75 (diff)
download grep-c63a0950ff852c94e27d14b6d0eea001eddb7de1.tar.gz
grep: fix -P [\d] by fixing \w only if PCRE2 10.43
Our prepass-based fixes for the -P \d bug have caused repeated further bugs. Avoid the need for a prepass, by using PCRE2_UCP only if PCRE2_EXTRA_ASCII_BSD is also supported. Since the -P \w bug was present from grep 2.5 through 3.8 it’s OK if we wait a little longer to fix it. * NEWS: Mention this. * src/pcresearch.c (pcre_pattern_expand_backslash_d): Remove. Remove its use. (Pcompile): Use PCRE2_UCP only if PCRE2_EXTRA_ASCII_BSD. * tests/pcre-ascii-digits, tests/pcre-utf8-w: Skip tests on older PCRE2 implementations.
-rw-r--r-- NEWS 8
-rw-r--r-- src/pcresearch.c 97
-rwxr-xr-x tests/pcre-ascii-digits 19
-rwxr-xr-x tests/pcre-utf8-w 2
4 files changed, 27 insertions, 99 deletions
diff --git a/NEWS b/NEWS
index 400c2566..6ebade3f 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,14 @@ GNU grep NEWS -*- outline -*-
* Noteworthy changes in release ?.? (????-??-??) [?]
+** Bug fixes
+
+ With -P, patterns like [\d] now work again. The fix relies on PCRE2
+ support for the PCRE2_EXTRA_ASCII_BSD flag planned for PCRE2 10.43.
+ With PCRE2 version 10.42 or earlier, behavior reverts to that of
+ grep 3.8, in that patterns like \w and \b use ASCII rather than
+ Unicode interpretations.
+
* Noteworthy changes in release 3.10 (2023-03-22) [stable]
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 34b2aeb9..e77509c4 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -133,97 +133,12 @@ bad_utf8_from_pcre2 (int e)
#endif
}
-#if ! PCRE2_EXTRA_ASCII_BSD
-/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
- digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
- match non-ASCII digits in some locales. Use \p{Nd} if you require to match
- those. Similarly, replace each \D with [^0-9].
- FIXME: remove in 2025, or whenever we no longer accommodate pcre2-10.42
- and prior. */
-static void
-pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
-{
- idx_t len = *len_p;
- char *keys = *keys_p;
- mbstate_t mb_state = { 0 };
- char *new_keys = xnmalloc (len / 2 + 1, 5);
- char *p = new_keys;
- bool prev_backslash = false;
-
- for (ptrdiff_t n; len; keys += n, len -= n)
- {
- n = mb_clen (keys, len, &mb_state);
- switch (n)
- {
- case -2:
- n = len;
- FALLTHROUGH;
- default:
- if (prev_backslash)
- {
- prev_backslash = false;
- *p++ = '\\';
- }
- p = mempcpy (p, keys, n);
- break;
-
- case -1:
- if (prev_backslash)
- {
- prev_backslash = false;
- *p++ = '\\';
- }
- memset (&mb_state, 0, sizeof mb_state);
- n = 1;
- FALLTHROUGH;
- case 1:
- if (prev_backslash)
- {
- prev_backslash = false;
- switch (*keys)
- {
- case 'd':
- p = mempcpy (p, "[0-9]", 5);
- break;
- case 'D':
- p = mempcpy (p, "[^0-9]", 6);
- break;
- default:
- *p++ = '\\';
- *p++ = *keys;
- break;
- }
- }
- else
- {
- if (*keys == '\\')
- prev_backslash = true;
- else
- *p++ = *keys;
- }
- break;
- }
- }
-
- if (prev_backslash)
- *p++ = '\\';
- *p = '\n';
- free (*keys_p);
- *keys_p = new_keys;
- *len_p = p - new_keys;
-}
-#endif
-
/* Compile the -P style PATTERN, containing SIZE bytes that are
followed by '\n'. Return a description of the compiled pattern. */
void *
Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
{
-#if ! PCRE2_EXTRA_ASCII_BSD
- pcre_pattern_expand_backslash_d (&pattern, &size);
-#endif
-
PCRE2_SIZE e;
int ec;
int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
@@ -241,7 +156,17 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
_("-P supports only unibyte locales on this platform"));
if (! localeinfo.using_utf8)
die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
- flags |= (PCRE2_UTF | PCRE2_UCP);
+
+ flags |= PCRE2_UTF;
+
+ /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP
+ so that \d does not have the undesirable effect of matching
+ non-ASCII digits. Otherwise (i.e., with PCRE2 10.42 and earlier),
+ escapes like \w have only their ASCII interpretations,
+ but that's better than the confusion that would ensue if \d
+ matched non-ASCII digits. */
+ flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0;
+
#if 0
/* Do not match individual code units but only UTF-8. */
flags |= PCRE2_NEVER_BACKSLASH_C;
diff --git a/tests/pcre-ascii-digits b/tests/pcre-ascii-digits
index de9fe383..9dfc0fae 100755
--- a/tests/pcre-ascii-digits
+++ b/tests/pcre-ascii-digits
@@ -17,6 +17,8 @@ require_pcre_
echo . | grep -qP '(*UTF).' 2>/dev/null \
|| skip_ 'PCRE unicode support is compiled out'
+echo 0 | grep -qP '(?aD)\d' \
+ || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
fail=0
@@ -44,19 +46,10 @@ printf '\331\2404\n' > in2 || framework_failure_
returns_ 1 grep -P '\d\d' in2 > out || fail=1
compare /dev/null out || fail=1
-# The following tests work only when built with 10.43 or newer,
-# with which, grep accepts the mode-setting '(?aD)':
-if echo 0 | grep -qP '(?aD)\d'; then
+grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
+compare in2 out || fail=1
- grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
- compare in2 out || fail=1
-
- returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
- compare /dev/null out || fail=1
-
-else
- warn_ 'skipped some tests: use PCRE2 10.43 or newer to enable' \
- 'support for e.g., (?aD) and (?-aD)'
-fi
+returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
+compare /dev/null out || fail=1
Exit $fail
diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w
index a88ace4d..aa347840 100755
--- a/tests/pcre-utf8-w
+++ b/tests/pcre-utf8-w
@@ -16,6 +16,8 @@ require_pcre_
echo . | grep -qP '(*UTF).' 2>/dev/null \
|| skip_ 'PCRE unicode support is compiled out'
+echo 0 | grep -qP '(?aD)\d' \
+ || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
fail=0