summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jim Meyering <meyering@fb.com> 2019-11-16 09:24:00 -0800
committer: Jim Meyering <meyering@fb.com> 2019-11-16 10:37:25 -0800
commit: 090a4dbe03951e427f03f83be424caacc3303799 (patch)
tree: abd36d3ea0298bda6f92cb1047b33d86c61240ce
parent: a37a439a7fc45afecbefa814999b5c2511b49e23 (diff)
download: grep-090a4dbe03951e427f03f83be424caacc3303799.tar.gz
grep: avoid false -Fw match in non-UTF8 multibyte locales
For example, this command would erroneously print its input line:
  echo ab | LC_CTYPE=ja_JP.eucjp grep -Fw b
This arose when the "memrchr" search for a preceding newline failed:
in that case, MB_START was not adjusted and was initially the same
as BEG, so wordchar_prev mistakenly returned 0.
* src/kwsearch.c (Fexecute): Set MB_START also when there is no
preceding newline.
* NEWS (Bug fixes): Mention it.
* tests/mb-non-UTF8-word-boundary: New file. Test for the bug.
* tests/Makefile.am (TESTS): Add it.
Reported by NIDE, Naoyuki in https://bugs.gnu.org/38223.
-rw-r--r-- NEWS 5
-rw-r--r-- src/kwsearch.c 8
-rw-r--r-- tests/Makefile.am 1
-rwxr-xr-x tests/mb-non-UTF8-word-boundary 29
4 files changed, 40 insertions, 3 deletions
diff --git a/NEWS b/NEWS
index c31c6000..4d682055 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,11 @@ GNU grep NEWS -*- outline -*-
** Bug fixes
+ grep -Fw can no longer false match in non-UTF8 multibyte locales
+ For example, this command would erroneously print its input line:
+ echo ab | LC_CTYPE=ja_JP.eucjp grep -Fw b
+ [Bug#38223 introduced in grep 3.0]
+
The exit status of 'grep -L' is no longer incorrect when standard
output is /dev/null.
[Bug#37716 introduced in grep 3.2]
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 42567e98..65e056c6 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -223,12 +223,14 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
if (! match_words)
goto success;
+ /* We need a preceding mb_start pointer. Use the beginning of line
+ if there is a preceding newline, else BUF. */
+ char const *bol = memrchr (mb_start, eol, beg - mb_start);
+ mb_start = bol ? bol + 1 : buf;
+
/* Succeed if the preceding and following characters are word
constituents. If the following character is not a word
constituent, keep trying with shorter matches. */
- char const *bol = memrchr (mb_start, eol, beg - mb_start);
- if (bol)
- mb_start = bol + 1;
if (! wordchar_prev (mb_start, beg, buf + size))
for (;;)
{
diff --git a/tests/Makefile.am b/tests/Makefile.am
index a70c994b..d82501e7 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -123,6 +123,7 @@ TESTS = \
mb-dot-newline \
mb-non-UTF8-overrun \
mb-non-UTF8-performance \
+ mb-non-UTF8-word-boundary \
multibyte-white-space \
multiple-begin-or-end-line \
null-byte \
diff --git a/tests/mb-non-UTF8-word-boundary b/tests/mb-non-UTF8-word-boundary
new file mode 100755
index 00000000..a29193d2
--- /dev/null
+++ b/tests/mb-non-UTF8-word-boundary
@@ -0,0 +1,29 @@
+#!/bin/sh
+# grep -Fw could false-match when using a non-UTF8 multibyte locale.
+
+# Copyright 2019 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_JP_EUC_locale_
+
+fail=0
+
+echo ab > in || framework_failure_
+
+# This would mistakenly print its input line from grep-3.0..3.3
+returns_ 1 grep -Fw b in || fail=1
+
+Exit $fail