summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Norihiro Tanaka <noritnk@kcn.ne.jp> 2014-10-19 10:40:18 +0900
committer: Jim Meyering <meyering@fb.com> 2014-10-19 18:21:52 -0700
commit: f5cd191a624f33237d7618d1c24829ad501bc5c0 (patch)
tree: 7ed3220e00d5c8a3a82382de20b347ed235e323c
parent: 423bdd5d9405af4836d9359426ceedf24cf18ceb (diff)
download: grep-f5cd191a624f33237d7618d1c24829ad501bc5c0.tar.gz
dfa: process all MBCSET constructs via glibc's matcher
The DFA matcher does not support collating symbols or equivalence classes, so ensure that any MBCSET reference is handled by the glibc matcher. dfa.c already handled this in one case, but not the other, so that a command like "printf '\0' | src/grep -aE '^\s?$'" would mistakenly end up using dfa.c's match_mb_charset function rather than glibc's matcher.
* src/dfa.c (dfaexec_main): Move that code into the State_transition macro. This renders match_mb_charset unused by grep.
* tests/multibyte-white-space: Add a test to exercise the just-rendered-inaccessible code path.
-rw-r--r-- src/dfa.c 20
-rwxr-xr-x tests/multibyte-white-space 10
2 files changed, 20 insertions, 10 deletions
diff --git a/src/dfa.c b/src/dfa.c
index 58a4b832..de836891 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3338,20 +3338,20 @@ dfaexec_main (struct dfa *d, char const *begin, char *end,
continue;
}
- /* Falling back to the glibc matcher in this case gives
- better performance (up to 25% better on [a-z], for
- example) and enables support for collating symbols and
- equivalence classes. */
- if (d->states[s].has_mbcset && backref)
- {
- *backref = 1;
- goto done;
- }
-
/* The following code is used twice.
Use a macro to avoid the risk that they diverge. */
#define State_transition() \
do { \
+ /* Falling back to the glibc matcher in this case gives \
+ better performance (up to 25% better on [a-z], for \
+ example) and enables support for collating symbols and \
+ equivalence classes. */ \
+ if (d->states[s].has_mbcset && backref) \
+ { \
+ *backref = 1; \
+ goto done; \
+ } \
+ \
/* Can match with a multibyte character (and multi-character \
collating element). Transition table might be updated. */ \
s = transit_state (d, s, &p, (unsigned char *) end); \
diff --git a/tests/multibyte-white-space b/tests/multibyte-white-space
index c9b3d1fa..58166431 100755
--- a/tests/multibyte-white-space
+++ b/tests/multibyte-white-space
@@ -73,4 +73,14 @@ for i in $utf8_space_characters; do
|| { warn_ "$i vs. \\S FAILED"; fail=1; }
done
+
+# This is a separate test, only nominally related to \s.
+# It is solely to get coverage of a code path (exercising dfa.c's
+# match_mb_charset function) that would have otherwise been untouched.
+# However, as of the change-set adding this new test, match_mb_charset
+# is unreachable via grep.
+printf '\0' | grep -aE '^\s?$' > out 2>&1
+test $? = 1 || fail=1
+compare /dev/null out
+
Exit $fail