summary refs log tree commit diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-03-04 16:51:13 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-03-04 16:51:13 +0000
commit 0ee79eee4dd20323cece37a871bc2cf1daf59698 (patch)
tree fc2f0603b9ac75f39c2f775611cc63164f5bb03c
parent 1a75f94473f2731c1f85923b881b97ceaa9f0d73 (diff)
download pcre-0ee79eee4dd20323cece37a871bc2cf1daf59698.tar.gz
Add support for binary files to pcregrep.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@947 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 3
-rwxr-xr-x RunGrepTest 32
-rw-r--r-- doc/pcregrep.1 32
-rw-r--r-- pcregrep.c 70
-rw-r--r-- testdata/grepbinary bin 0 -> 45 bytes
5 files changed, 129 insertions, 8 deletions
diff --git a/ChangeLog b/ChangeLog
index 2f9d466..eb5f025 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -69,6 +69,9 @@ Version 8.31
18. Added --file-list option to pcregrep.
+19. Added binary file support to pcregrep, including the -a, --binary-files,
+ -I, and --text options.
+
Version 8.30 04-February-2012
-----------------------------
diff --git a/RunGrepTest b/RunGrepTest
index 4fa5ed3..706c777 100755
--- a/RunGrepTest
+++ b/RunGrepTest
@@ -415,6 +415,38 @@ echo "---------------------------- Test 85 -----------------------------" >>test
(cd $srcdir; $valgrind $pcregrep --file-list=./testdata/grepfilelist "dolor" ./testdata/grepinput3) >>testtry 2>&1
echo "RC=$?" >>testtry
+echo "---------------------------- Test 86 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 87 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep "cat" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 88 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep -v "cat" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 89 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep -I "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 90 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep --binary-files=without-match "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 91 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep -a "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 92 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep --binary-files=text "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 93 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep --text "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
# Now compare the results.
$cf $srcdir/testdata/grepoutput testtry
diff --git a/doc/pcregrep.1 b/doc/pcregrep.1
index 055b7c8..1706d85 100644
--- a/doc/pcregrep.1
+++ b/doc/pcregrep.1
@@ -95,6 +95,15 @@ appropriate support is not present, files are treated as plain text. The
standard input is always so treated.
.
.
+.SH "BINARY FILES"
+.rs
+.sp
+By default, a file that contains a binary zero byte within the first 1024 bytes
+is identified as a binary file, and is processed specially. (GNU grep also
+identifies binary files in this manner.) See the \fB--binary-files\fP option
+for a means of changing the way binary files are handled.
+.
+.
.SH OPTIONS
.rs
.sp
@@ -117,6 +126,10 @@ group of lines, unless they are in fact contiguous in the input file. The value
of \fInumber\fP is expected to be relatively small. However, \fBpcregrep\fP
guarantees to have up to 8K of following text available for context output.
.TP
+\fB-a\fP, \fB--text\fP
+Treat binary files as text. This is equivalent to
+\fB--binary-files\fP=\fItext\fP.
+.TP
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
Output \fInumber\fP lines of context before each matching line. If filenames
and/or line numbers are being output, a hyphen separator is used instead of a
@@ -125,6 +138,17 @@ group of lines, unless they are in fact contiguous in the input file. The value
of \fInumber\fP is expected to be relatively small. However, \fBpcregrep\fP
guarantees to have up to 8K of preceding text available for context output.
.TP
+\fB--binary-files=\fP\fIword\fP
+Specify how binary files are to be processed. If the word is "binary" (the
+default), pattern matching is performed on binary files, but the only output is
+"Binary file <name> matches" when a match succeeds. If the word is "text",
+which is equivalent to the \fB-a\fP or \fB--text\fP option, binary files are
+processed in the same way as any other file. In this case, when a match
+succeeds, the output may be binary garbage, which can have nasty effects if
+sent to a terminal. If the word is "without-match", which is equivalent to the
+\fB-I\fP option, binary files are not processed at all; they are assumed not to
+be of interest.
+.TP
\fB--buffer-size=\fP\fInumber\fP
Set the parameter that controls how much memory is used for buffering files
that are being scanned.
@@ -265,6 +289,10 @@ If a line number is also being output, it follows the file name.
Output a help message, giving brief details of the command options and file
type support, and then exit.
.TP
+\fB-I\fP
+Treat binary files as never matching. This is equivalent to
+\fB--binary-files\fP=\fIwithout-match\fP.
+.TP
\fB-i\fP, \fB--ignore-case\fP
Ignore upper/lower case distinctions during comparisons.
.TP
@@ -493,7 +521,7 @@ convert this to an appropriate sequence if the output is sent to a file.
.rs
.sp
Many of the short and long forms of \fBpcregrep\fP's options are the same
-as in the GNU \fBgrep\fP program (version 2.5.4). Any long option of the form
+as in the GNU \fBgrep\fP program. Any long option of the form
\fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP
(PCRE terminology). However, the \fB--file-list\fP, \fB--file-offsets\fP,
\fB--include-dir\fP, \fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP,
@@ -588,6 +616,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 28 February 2012
+Last updated: 04 March 2012
Copyright (c) 1997-2012 University of Cambridge.
.fi
diff --git a/pcregrep.c b/pcregrep.c
index 7149425..be07f61 100644
--- a/pcregrep.c
+++ b/pcregrep.c
@@ -104,6 +104,10 @@ enum { DEE_READ, DEE_SKIP };
enum { EL_LF, EL_CR, EL_CRLF, EL_ANY, EL_ANYCRLF };
+/* Binary file options */
+
+enum { BIN_BINARY, BIN_NOMATCH, BIN_TEXT };
+
/* In newer versions of gcc, with FORTIFY_SOURCE set (the default in some
environments), a warning is issued if the value of fwrite() is ignored.
Unfortunately, casting to (void) does not suppress the warning. To get round
@@ -160,6 +164,7 @@ static pcre *exclude_dir_compiled = NULL;
static int after_context = 0;
static int before_context = 0;
+static int binary_files = BIN_BINARY;
static int both_context = 0;
static int bufthird = PCREGREP_BUFSIZE;
static int bufsize = 3*PCREGREP_BUFSIZE;
@@ -197,7 +202,7 @@ static BOOL utf8 = FALSE;
/* Structure for options and list of them */
enum { OP_NODATA, OP_STRING, OP_OP_STRING, OP_NUMBER, OP_LONGNUMBER,
- OP_OP_NUMBER, OP_PATLIST };
+ OP_OP_NUMBER, OP_PATLIST, OP_BINFILES };
typedef struct option_item {
int type;
@@ -227,12 +232,15 @@ used to identify them. */
#define N_BUFSIZE (-15)
#define N_NOJIT (-16)
#define N_FILE_LIST (-17)
+#define N_BINARY_FILES (-18)
static option_item optionlist[] = {
- { OP_NODATA, N_NULL, NULL, "", " terminate options" },
+ { OP_NODATA, N_NULL, NULL, "", "terminate options" },
{ OP_NODATA, N_HELP, NULL, "help", "display this help and exit" },
{ OP_NUMBER, 'A', &after_context, "after-context=number", "set number of following context lines" },
+ { OP_NODATA, 'a', NULL, "text", "treat binary files as text" },
{ OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" },
+ { OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" },
{ OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer size parameter" },
{ OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" },
{ OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" },
@@ -247,6 +255,7 @@ static option_item optionlist[] = {
{ OP_NODATA, N_FOFFSETS, NULL, "file-offsets", "output file offsets, not text" },
{ OP_NODATA, 'H', NULL, "with-filename", "force the prefixing filename on output" },
{ OP_NODATA, 'h', NULL, "no-filename", "suppress the prefixing filename on output" },
+ { OP_NODATA, 'I', NULL, "", "treat binary files as not matching (ignore)" },
{ OP_NODATA, 'i', NULL, "ignore-case", "ignore case distinctions" },
#ifdef SUPPORT_PCREGREP_JIT
{ OP_NODATA, N_NOJIT, NULL, "no-jit", "do not use just-in-time compiler optimization" },
@@ -1047,6 +1056,7 @@ char *lastmatchrestart = NULL;
char *ptr = main_buffer;
char *endptr;
size_t bufflength;
+BOOL binary = FALSE;
BOOL endhyphenpending = FALSE;
BOOL input_line_buffered = line_buffered;
FILE *in = NULL; /* Ensure initialized */
@@ -1094,6 +1104,17 @@ else
endptr = main_buffer + bufflength;
+/* Unless binary-files=text, see if we have a binary file. This uses the same
+rule as GNU grep, namely, a search for a binary zero byte near the start of the
+file. */
+
+if (binary_files != BIN_TEXT)
+ {
+ binary =
+ memchr(main_buffer, 0, (bufflength > 1024)? 1024 : bufflength) != NULL;
+ if (binary && binary_files == BIN_NOMATCH) return 1;
+ }
+
/* Loop while the current pointer is not at the end of the file. For large
files, endptr will be at the end of the buffer when we are in the middle of the
file, but ptr will never get there, because as soon as it gets over 2/3 of the
@@ -1209,6 +1230,16 @@ while (ptr < endptr)
/* Just count if just counting is wanted. */
if (count_only) count++;
+
+ /* When handling a binary file and binary-files==binary, the "binary"
+ variable will be set true (it's false in all other cases). In this
+ situation we just want to output the file name. No need to scan further. */
+
+ else if (binary)
+ {
+ fprintf(stdout, "Binary file %s matches\n", filename);
+ return 0;
+ }
/* If all we want is a file name, there is no need to scan any more lines
in the file. */
@@ -1845,11 +1876,18 @@ for (op = optionlist; op->one_char != 0; op++)
contains an underscore. */
if (strchr(op->long_name, '_') != NULL) continue;
+
+ if (op->one_char > 0 && (op->long_name)[0] == 0)
+ n = 31 - printf(" -%c", op->one_char);
+ else
+ {
+ if (op->one_char > 0) sprintf(s, "-%c,", op->one_char);
+ else strcpy(s, " ");
+ n = 31 - printf(" %s --%s", s, op->long_name);
+ }
- if (op->one_char > 0) sprintf(s, "-%c,", op->one_char); else strcpy(s, " ");
- n = 31 - printf(" %s --%s", s, op->long_name);
if (n < 1) n = 1;
- printf("%.*s%s\n", n, " ", op->help_text);
+ printf("%.*s%s\n", n, " ", op->help_text);
}
printf("\nNumbers may be followed by K or M, e.g. --buffer-size=100K.\n");
@@ -1880,9 +1918,11 @@ switch(letter)
case N_LBUFFER: line_buffered = TRUE; break;
case N_LOFFSETS: line_offsets = number = TRUE; break;
case N_NOJIT: study_options &= ~PCRE_STUDY_JIT_COMPILE; break;
+ case 'a': binary_files = BIN_TEXT; break;
case 'c': count_only = TRUE; break;
case 'F': process_options |= PO_FIXED_STRINGS; break;
case 'H': filenames = FN_FORCE; break;
+ case 'I': binary_files = BIN_NOMATCH; break;
case 'h': filenames = FN_NONE; break;
case 'i': options |= PCRE_CASELESS; break;
case 'l': omit_zero_count = TRUE; filenames = FN_MATCH_ONLY; break;
@@ -2316,7 +2356,7 @@ for (i = 1; i < argc; i++)
/* If the option type is OP_PATLIST, it's the -e option, which can be called
multiple times to create a list of patterns. */
-
+
if (op->type == OP_PATLIST)
{
if (cmd_pattern_count >= MAX_PATTERN_COUNT)
@@ -2327,6 +2367,24 @@ for (i = 1; i < argc; i++)
}
patterns[cmd_pattern_count++] = option_data;
}
+
+ /* Handle OP_BINFILES */
+
+ else if (op->type == OP_BINFILES)
+ {
+ if (strcmp(option_data, "binary") == 0)
+ binary_files = BIN_BINARY;
+ else if (strcmp(option_data, "without-match") == 0)
+ binary_files = BIN_NOMATCH;
+ else if (strcmp(option_data, "text") == 0)
+ binary_files = BIN_TEXT;
+ else
+ {
+ fprintf(stderr, "pcregrep: unknown value \"%s\" for binary-files\n",
+ option_data);
+ pcregrep_exit(usage(2));
+ }
+ }
/* Otherwise, deal with single string or numeric data values. */
diff --git a/testdata/grepbinary b/testdata/grepbinary
new file mode 100644
index 0000000..5efa130
--- /dev/null
+++ b/testdata/grepbinary
Binary files differ