summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Paul Eggert <eggert@cs.ucla.edu> 2014-09-14 13:49:18 -0700
committer Paul Eggert <eggert@cs.ucla.edu> 2014-09-16 18:23:49 -0700
commit cd36abd46c5e0768606979ea75a51732062f5624 (patch)
tree 430b623978b2dc2c95bee90b99b6dee185deb9cd
parent 564a06e761ac06c4a0bcd91ce5060118d35bf912 (diff)
download grep-cd36abd46c5e0768606979ea75a51732062f5624.tar.gz
grep: treat a file as binary if its prefix contains encoding errors
* NEWS:
* doc/grep.texi (File and Directory Selection): Document this.
* src/grep.c (buffer_encoding, buffer_textbin): New functions.
(file_textbin): Rename from file_is_binary. Now returns 3-way value.
All callers changed.
(file_textbin, grep): Check the input more carefully for text vs
binary data.
(contains_encoding_error): Remove; use replaced by buffer_encoding.
* tests/backref-multibyte-slow:
* tests/high-bit-range:
* tests/invalid-multibyte-infloop: Use -a, since the input is now
considered to be binary.
* tests/invalid-multibyte-infloop: Add a check for new behavior.
-rw-r--r-- NEWS 4
-rw-r--r-- doc/grep.texi 3
-rw-r--r-- src/grep.c 126
-rwxr-xr-x tests/backref-multibyte-slow 2
-rwxr-xr-x tests/high-bit-range 2
-rwxr-xr-x tests/invalid-multibyte-infloop 14
6 files changed, 106 insertions, 45 deletions
diff --git a/NEWS b/NEWS
index 36bb48fd..9377d7d6 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,10 @@ GNU grep NEWS -*- outline -*-
Performance has improved for very long strings in patterns.
+ If a file contains data improperly encoded for the current locale,
+ and this is discovered before any of the file's contents are output,
+ grep now treats the file as binary.
+
grep -P no longer reports an error and exits when given invalid UTF-8 data.
Instead, it considers the data to be non-matching.
diff --git a/doc/grep.texi b/doc/grep.texi
index c8e4acdd..14bd69e1 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -592,7 +592,8 @@ this is equivalent to the @samp{--binary-files=text} option.
@item --binary-files=@var{type}
@opindex --binary-files
@cindex binary files
-If a file's allocation metadata or its first few bytes
+If a file's allocation metadata,
+or if its data read before a line is selected for output,
indicate that the file contains binary data,
assume that the file is of type @var{type}.
By default, @var{type} is @samp{binary},
diff --git a/src/grep.c b/src/grep.c
index 1e0cc6d1..ccba1b63 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -437,50 +437,74 @@ clean_up_stdout (void)
close_stdout ();
}
-/* Return true if a file is known to be binary for the purpose of 'grep'.
+/* Return 1 if BUF (of size SIZE) contains text, -1 if it contains
+ binary data, and 0 if the answer depends on what comes immediately
+ after BUF. */
+static int
+buffer_textbin (char const *buf, size_t size)
+{
+ mbstate_t mbs = { 0 };
+ size_t charlen;
+ char badbyte = eolbyte ? '\0' : '\200';
+ char const *p;
+
+ for (p = buf; p < buf + size; p += charlen)
+ {
+ if (*p == badbyte)
+ return -1;
+ charlen = mbrlen (p, buf + size - p, &mbs);
+ if ((size_t) -2 <= charlen)
+ return charlen == (size_t) -2 ? 0 : -1;
+ charlen += !charlen;
+ }
+
+ return 1;
+}
+
+/* Return 1 if a file is known to be text for the purpose of 'grep'.
+ Return -1 if it is known to be binary, 0 if unknown.
BUF, of size BUFSIZE, is the initial buffer read from the file with
descriptor FD and status ST. */
-static bool
-file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
+static int
+file_textbin (char const *buf, size_t bufsize, int fd, struct stat const *st)
{
#ifndef SEEK_HOLE
enum { SEEK_HOLE = SEEK_END };
#endif
- /* If -z, test only whether the initial buffer contains '\200';
- knowing about holes won't help. */
- if (! eolbyte)
- return memchr (buf, '\200', bufsize) != 0;
+ int textbin = buffer_textbin (buf, bufsize);
+ if (textbin < 0)
+ return textbin;
- /* If the initial buffer contains a null byte, guess that the file
- is binary. */
- if (memchr (buf, '\0', bufsize))
- return true;
-
- /* If the file has holes, it must contain a null byte somewhere. */
- if (SEEK_HOLE != SEEK_END && usable_st_size (st))
+ if (usable_st_size (st))
{
- off_t cur = bufsize;
- if (O_BINARY || fd == STDIN_FILENO)
- {
- cur = lseek (fd, 0, SEEK_CUR);
- if (cur < 0)
- return false;
- }
+ if (st->st_size <= bufsize)
+ return 2 * textbin - 1;
- /* Look for a hole after the current location. */
- off_t hole_start = lseek (fd, cur, SEEK_HOLE);
- if (0 <= hole_start)
+ /* If the file has holes, it must contain a null byte somewhere. */
+ if (SEEK_HOLE != SEEK_END && eolbyte)
{
- if (lseek (fd, cur, SEEK_SET) < 0)
- suppressible_error (filename, errno);
- if (hole_start < st->st_size)
- return true;
+ off_t cur = bufsize;
+ if (O_BINARY || fd == STDIN_FILENO)
+ {
+ cur = lseek (fd, 0, SEEK_CUR);
+ if (cur < 0)
+ return 0;
+ }
+
+ /* Look for a hole after the current location. */
+ off_t hole_start = lseek (fd, cur, SEEK_HOLE);
+ if (0 <= hole_start)
+ {
+ if (lseek (fd, cur, SEEK_SET) < 0)
+ suppressible_error (filename, errno);
+ if (hole_start < st->st_size)
+ return -1;
+ }
}
}
- /* Guess that the file does not contain binary data. */
- return false;
+ return 0;
}
/* Convert STR to a nonnegative integer, storing the result in *OUT.
@@ -1100,7 +1124,7 @@ static intmax_t
grep (int fd, struct stat const *st)
{
intmax_t nlines, i;
- bool not_text;
+ int textbin;
size_t residue, save;
char oldc;
char *beg;
@@ -1129,13 +1153,18 @@ grep (int fd, struct stat const *st)
return 0;
}
- not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet)
- || binary_files == WITHOUT_MATCH_BINARY_FILES)
- && file_is_binary (bufbeg, buflim - bufbeg, fd, st));
- if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
- return 0;
- done_on_match |= not_text;
- out_quiet |= not_text;
+ if (binary_files == TEXT_BINARY_FILES)
+ textbin = 1;
+ else
+ {
+ textbin = file_textbin (bufbeg, buflim - bufbeg, fd, st);
+ if (textbin < 0)
+ {
+ if (binary_files == WITHOUT_MATCH_BINARY_FILES)
+ return 0;
+ done_on_match = out_quiet = true;
+ }
+ }
for (;;)
{
@@ -1187,8 +1216,13 @@ grep (int fd, struct stat const *st)
}
/* Detect whether leading context is adjacent to previous output. */
- if (beg != lastout)
- lastout = 0;
+ if (lastout)
+ {
+ if (!textbin)
+ textbin = 1;
+ if (beg != lastout)
+ lastout = 0;
+ }
/* Handle some details and read more data to scan. */
save = residue + lim - beg;
@@ -1201,6 +1235,16 @@ grep (int fd, struct stat const *st)
suppressible_error (filename, errno);
goto finish_grep;
}
+
+ /* If the file's textbin has not been determined yet, assume
+ it's binary if the next input buffer suggests so. */
+ if (! textbin && buffer_textbin (bufbeg, buflim - bufbeg) < 0)
+ {
+ textbin = -1;
+ if (binary_files == WITHOUT_MATCH_BINARY_FILES)
+ return 0;
+ done_on_match = out_quiet = true;
+ }
}
if (residue)
{
@@ -1214,7 +1258,7 @@ grep (int fd, struct stat const *st)
finish_grep:
done_on_match = done_on_match_0;
out_quiet = out_quiet_0;
- if ((not_text & ~out_quiet) && nlines != 0)
+ if (textbin < 0 && !out_quiet && nlines != 0)
printf (_("Binary file %s matches\n"), filename);
return nlines;
}
diff --git a/tests/backref-multibyte-slow b/tests/backref-multibyte-slow
index ffebb6b2..d447a4a9 100755
--- a/tests/backref-multibyte-slow
+++ b/tests/backref-multibyte-slow
@@ -21,7 +21,7 @@ max_seconds=$(LC_ALL=C perl -le 'use Time::HiRes qw(time); my $s = time();
for LOC in en_US.UTF-8; do
out=out-$LOC
- LC_ALL=$LOC timeout ${max_seconds}s grep -E '^([a-z]).\1$' in > $out 2>&1
+ LC_ALL=$LOC timeout ${max_seconds}s grep -aE '^([a-z]).\1$' in > $out 2>&1
test $? = 0 || fail=1
compare $out in || fail=1
done
diff --git a/tests/high-bit-range b/tests/high-bit-range
index 74b6e659..76c33100 100755
--- a/tests/high-bit-range
+++ b/tests/high-bit-range
@@ -21,7 +21,7 @@
fail=0
printf '\201\n' > in || framework_failure_
-grep "$(printf '[\201]')" in > out || fail=1
+grep -a "$(printf '[\201]')" in > out || fail=1
compare out in || fail=1
diff --git a/tests/invalid-multibyte-infloop b/tests/invalid-multibyte-infloop
index b28bc532..d7c6165d 100755
--- a/tests/invalid-multibyte-infloop
+++ b/tests/invalid-multibyte-infloop
@@ -14,7 +14,7 @@ encode AA > input
fail=0
# Before 2.15, this would infloop.
-LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out
+LC_ALL=en_US.UTF-8 timeout 3 grep -aF $(encode A) input > out
status=$?
if test $status -eq 0; then
compare input out
@@ -24,4 +24,16 @@ else
test $status -eq 2
fi || fail=1
+echo 'Binary file input matches' >binary-file-matches
+
+LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out
+status=$?
+if test $status -eq 0; then
+ compare binary-file-matches out
+elif test $status -eq 1; then
+ compare_dev_null_ /dev/null out
+else
+ test $status -eq 2
+fi || fail=1
+
Exit $fail