summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Norihiro Tanaka <noritnk@kcn.ne.jp> 2014-10-25 01:46:01 +0900
committer Jim Meyering <meyering@fb.com> 2014-10-28 13:05:27 -0700
commit f66dafc2181bf997f8e7192ad49d3d6ec9dc2b87 (patch)
tree 297b2308c20f28542c98535650a85d70fab812a7
parent 1519c4e5e4bf68ec348bfe4261f78768710aa985 (diff)
download grep-f66dafc2181bf997f8e7192ad49d3d6ec9dc2b87.tar.gz
dfa: make \w and \W work in multibyte locales
Reported by Jaroslav Skarvada in: http://bugs.gnu.org/18817 Now, \w and \W are supported not only in single-byte locales but also in multibyte locales. * src/dfa.c (PUSH_LEX_STATE, POP_LEX_STATE): Move definitions "up", so they are not within the function. (lex): Make \w and \W work in a multibyte locale, the same way we made \s and \S work. * tests/word-multibyte: New test for this change. * tests/Makefile.am: Add a rule to build new test. * NEWS (Bug fixes): Mention it.
-rw-r--r-- NEWS 3
-rw-r--r-- src/dfa.c 61
-rw-r--r-- tests/Makefile.am 1
-rw-r--r-- tests/word-multibyte 23
4 files changed, 67 insertions, 21 deletions
diff --git a/NEWS b/NEWS
index 94eeeeba..183b7f0b 100644
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,9 @@ GNU grep NEWS -*- outline -*-
** Bug fixes
+ grep no longer mishandles patterns that contain \w or \W in multibyte
+ locales.
+
grep would fail to count newlines internally when operating in non-UTF8
multibyte locales, leading it to print potentially many lines that did
not match. E.g., the command, "seq 10 | env LC_ALL=zh_CN src/grep -n .."
diff --git a/src/dfa.c b/src/dfa.c
index 5b9d154a..e0fc120c 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1249,6 +1249,20 @@ parse_bracket_exp (void)
return CSET + charclass_index (ccl);
}
+#define PUSH_LEX_STATE(s) \
+ do \
+ { \
+ char const *lexptr_saved = lexptr; \
+ size_t lexleft_saved = lexleft; \
+ lexptr = (s); \
+ lexleft = strlen (lexptr)
+
+#define POP_LEX_STATE() \
+ lexptr = lexptr_saved; \
+ lexleft = lexleft_saved; \
+ } \
+ while (0)
+
static token
lex (void)
{
@@ -1496,20 +1510,6 @@ lex (void)
return lasttok = CSET + charclass_index (ccl);
}
-#define PUSH_LEX_STATE(s) \
- do \
- { \
- char const *lexptr_saved = lexptr; \
- size_t lexleft_saved = lexleft; \
- lexptr = (s); \
- lexleft = strlen (lexptr)
-
-#define POP_LEX_STATE() \
- lexptr = lexptr_saved; \
- lexleft = lexleft_saved; \
- } \
- while (0)
-
/* FIXME: see if optimizing this, as is done with ANYCHAR and
add_utf8_anychar, makes sense. */
@@ -1529,14 +1529,33 @@ lex (void)
case 'W':
if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- zeroset (ccl);
- for (c2 = 0; c2 < NOTCHAR; ++c2)
- if (IS_WORD_CONSTITUENT (c2))
- setbit (c2, ccl);
- if (c == 'W')
- notset (ccl);
+
+ if (!dfa->multibyte)
+ {
+ zeroset (ccl);
+ for (c2 = 0; c2 < NOTCHAR; ++c2)
+ if (IS_WORD_CONSTITUENT (c2))
+ setbit (c2, ccl);
+ if (c == 'W')
+ notset (ccl);
+ laststart = false;
+ return lasttok = CSET + charclass_index (ccl);
+ }
+
+ /* FIXME: see if optimizing this, as is done with ANYCHAR and
+ add_utf8_anychar, makes sense. */
+
+ /* \w and \W are documented to be equivalent to [_[:alnum:]] and
+ [^_[:alnum:]] respectively, so tell the lexer to process those
+ strings, each minus its "already processed" '['. */
+ PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]");
+
+ lasttok = parse_bracket_exp ();
+
+ POP_LEX_STATE ();
+
laststart = false;
- return lasttok = CSET + charclass_index (ccl);
+ return lasttok;
case '[':
if (backslash)
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f6f051c1..c006e582 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -114,6 +114,7 @@ TESTS = \
warn-char-classes \
word-delim-multibyte \
word-multi-file \
+ word-multibyte \
yesno
EXTRA_DIST = \
diff --git a/tests/word-multibyte b/tests/word-multibyte
new file mode 100644
index 00000000..e067a374
--- /dev/null
+++ b/tests/word-multibyte
@@ -0,0 +1,23 @@
+#!/bin/sh
+# This would fail for grep-2.20
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+require_en_utf8_locale_
+
+printf '\xc3\xa1\n' > in || framework_failure_
+LC_ALL=en_US.UTF-8
+export LC_ALL
+
+fail=0
+
+for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do
+ out=out1-$LOC
+ LC_ALL=$LOC grep '\w' in >$out || fail=1
+ compare in $out || fail=1
+
+ out=out2-$LOC
+ LC_ALL=$LOC grep '\W' in >$out && fail=1
+ compare /dev/null $out || fail=1
+done
+
+Exit $fail