diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-03-04 16:51:13 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-03-04 16:51:13 +0000 |
commit | 0ee79eee4dd20323cece37a871bc2cf1daf59698 (patch) | |
tree | fc2f0603b9ac75f39c2f775611cc63164f5bb03c | |
parent | 1a75f94473f2731c1f85923b881b97ceaa9f0d73 (diff) | |
download | pcre-0ee79eee4dd20323cece37a871bc2cf1daf59698.tar.gz |
Add support for binary files to pcregrep.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@947 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 3 | ||||
-rwxr-xr-x | RunGrepTest | 32 | ||||
-rw-r--r-- | doc/pcregrep.1 | 32 | ||||
-rw-r--r-- | pcregrep.c | 70 | ||||
-rw-r--r-- | testdata/grepbinary | bin | 0 -> 45 bytes |
5 files changed, 129 insertions, 8 deletions
@@ -69,6 +69,9 @@ Version 8.31 18. Added --file-list option to pcregrep. +19. Added binary file support to pcregrep, including the -a, --binary-files, + -I, and --text options. + Version 8.30 04-February-2012 ----------------------------- diff --git a/RunGrepTest b/RunGrepTest index 4fa5ed3..706c777 100755 --- a/RunGrepTest +++ b/RunGrepTest @@ -415,6 +415,38 @@ echo "---------------------------- Test 85 -----------------------------" >>test (cd $srcdir; $valgrind $pcregrep --file-list=./testdata/grepfilelist "dolor" ./testdata/grepinput3) >>testtry 2>&1 echo "RC=$?" >>testtry +echo "---------------------------- Test 86 -----------------------------" >>testtry +(cd $srcdir; $valgrind $pcregrep "dog" ./testdata/grepbinary) >>testtry 2>&1 +echo "RC=$?" >>testtry + +echo "---------------------------- Test 87 -----------------------------" >>testtry +(cd $srcdir; $valgrind $pcregrep "cat" ./testdata/grepbinary) >>testtry 2>&1 +echo "RC=$?" >>testtry + +echo "---------------------------- Test 88 -----------------------------" >>testtry +(cd $srcdir; $valgrind $pcregrep -v "cat" ./testdata/grepbinary) >>testtry 2>&1 +echo "RC=$?" >>testtry + +echo "---------------------------- Test 89 -----------------------------" >>testtry +(cd $srcdir; $valgrind $pcregrep -I "dog" ./testdata/grepbinary) >>testtry 2>&1 +echo "RC=$?" >>testtry + +echo "---------------------------- Test 90 -----------------------------" >>testtry +(cd $srcdir; $valgrind $pcregrep --binary-files=without-match "dog" ./testdata/grepbinary) >>testtry 2>&1 +echo "RC=$?" >>testtry + +echo "---------------------------- Test 91 -----------------------------" >>testtry +(cd $srcdir; $valgrind $pcregrep -a "dog" ./testdata/grepbinary) >>testtry 2>&1 +echo "RC=$?" >>testtry + +echo "---------------------------- Test 92 -----------------------------" >>testtry +(cd $srcdir; $valgrind $pcregrep --binary-files=text "dog" ./testdata/grepbinary) >>testtry 2>&1 +echo "RC=$?" 
>>testtry + +echo "---------------------------- Test 93 -----------------------------" >>testtry +(cd $srcdir; $valgrind $pcregrep --text "dog" ./testdata/grepbinary) >>testtry 2>&1 +echo "RC=$?" >>testtry + # Now compare the results. $cf $srcdir/testdata/grepoutput testtry diff --git a/doc/pcregrep.1 b/doc/pcregrep.1 index 055b7c8..1706d85 100644 --- a/doc/pcregrep.1 +++ b/doc/pcregrep.1 @@ -95,6 +95,15 @@ appropriate support is not present, files are treated as plain text. The standard input is always so treated. . . +.SH "BINARY FILES" +.rs +.sp +By default, a file that contains a binary zero byte within the first 1024 bytes +is identified as a binary file, and is processed specially. (GNU grep also +identifies binary files in this manner.) See the \fB--binary-files\fP option +for a means of changing the way binary files are handled. +. +. .SH OPTIONS .rs .sp @@ -117,6 +126,10 @@ group of lines, unless they are in fact contiguous in the input file. The value of \fInumber\fP is expected to be relatively small. However, \fBpcregrep\fP guarantees to have up to 8K of following text available for context output. .TP +\fB-a\fP, \fB--text\fP +Treat binary files as text. This is equivalent to +\fB--binary-files\fP=\fItext\fP. +.TP \fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP Output \fInumber\fP lines of context before each matching line. If filenames and/or line numbers are being output, a hyphen separator is used instead of a @@ -125,6 +138,17 @@ group of lines, unless they are in fact contiguous in the input file. The value of \fInumber\fP is expected to be relatively small. However, \fBpcregrep\fP guarantees to have up to 8K of preceding text available for context output. .TP +\fB--binary-files=\fP\fIword\fP +Specify how binary files are to be processed. If the word is "binary" (the +default), pattern matching is performed on binary files, but the only output is +"Binary file <name> matches" when a match succeeds. 
If the word is "text", +which is equivalent to the \fB-a\fP or \fB--text\fP option, binary files are +processed in the same way as any other file. In this case, when a match +succeeds, the output may be binary garbage, which can have nasty effects if +sent to a terminal. If the word is "without-match", which is equivalent to the +\fB-I\fP option, binary files are not processed at all; they are assumed not to +be of interest. +.TP \fB--buffer-size=\fP\fInumber\fP Set the parameter that controls how much memory is used for buffering files that are being scanned. @@ -265,6 +289,10 @@ If a line number is also being output, it follows the file name. Output a help message, giving brief details of the command options and file type support, and then exit. .TP +\fB-I\fP +Treat binary files as never matching. This is equivalent to +\fB--binary-files\fP=\fIwithout-match\fP. +.TP \fB-i\fP, \fB--ignore-case\fP Ignore upper/lower case distinctions during comparisons. .TP @@ -493,7 +521,7 @@ convert this to an appropriate sequence if the output is sent to a file. .rs .sp Many of the short and long forms of \fBpcregrep\fP's options are the same -as in the GNU \fBgrep\fP program (version 2.5.4). Any long option of the form +as in the GNU \fBgrep\fP program. Any long option of the form \fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP (PCRE terminology). However, the \fB--file-list\fP, \fB--file-offsets\fP, \fB--include-dir\fP, \fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, @@ -588,6 +616,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 28 February 2012 +Last updated: 04 March 2012 Copyright (c) 1997-2012 University of Cambridge. 
.fi @@ -104,6 +104,10 @@ enum { DEE_READ, DEE_SKIP }; enum { EL_LF, EL_CR, EL_CRLF, EL_ANY, EL_ANYCRLF }; +/* Binary file options */ + +enum { BIN_BINARY, BIN_NOMATCH, BIN_TEXT }; + /* In newer versions of gcc, with FORTIFY_SOURCE set (the default in some environments), a warning is issued if the value of fwrite() is ignored. Unfortunately, casting to (void) does not suppress the warning. To get round @@ -160,6 +164,7 @@ static pcre *exclude_dir_compiled = NULL; static int after_context = 0; static int before_context = 0; +static int binary_files = BIN_BINARY; static int both_context = 0; static int bufthird = PCREGREP_BUFSIZE; static int bufsize = 3*PCREGREP_BUFSIZE; @@ -197,7 +202,7 @@ static BOOL utf8 = FALSE; /* Structure for options and list of them */ enum { OP_NODATA, OP_STRING, OP_OP_STRING, OP_NUMBER, OP_LONGNUMBER, - OP_OP_NUMBER, OP_PATLIST }; + OP_OP_NUMBER, OP_PATLIST, OP_BINFILES }; typedef struct option_item { int type; @@ -227,12 +232,15 @@ used to identify them. */ #define N_BUFSIZE (-15) #define N_NOJIT (-16) #define N_FILE_LIST (-17) +#define N_BINARY_FILES (-18) static option_item optionlist[] = { - { OP_NODATA, N_NULL, NULL, "", " terminate options" }, + { OP_NODATA, N_NULL, NULL, "", "terminate options" }, { OP_NODATA, N_HELP, NULL, "help", "display this help and exit" }, { OP_NUMBER, 'A', &after_context, "after-context=number", "set number of following context lines" }, + { OP_NODATA, 'a', NULL, "text", "treat binary files as text" }, { OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" }, + { OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" }, { OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer size parameter" }, { OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" }, { OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" }, @@ -247,6 +255,7 @@ static option_item 
optionlist[] = { { OP_NODATA, N_FOFFSETS, NULL, "file-offsets", "output file offsets, not text" }, { OP_NODATA, 'H', NULL, "with-filename", "force the prefixing filename on output" }, { OP_NODATA, 'h', NULL, "no-filename", "suppress the prefixing filename on output" }, + { OP_NODATA, 'I', NULL, "", "treat binary files as not matching (ignore)" }, { OP_NODATA, 'i', NULL, "ignore-case", "ignore case distinctions" }, #ifdef SUPPORT_PCREGREP_JIT { OP_NODATA, N_NOJIT, NULL, "no-jit", "do not use just-in-time compiler optimization" }, @@ -1047,6 +1056,7 @@ char *lastmatchrestart = NULL; char *ptr = main_buffer; char *endptr; size_t bufflength; +BOOL binary = FALSE; BOOL endhyphenpending = FALSE; BOOL input_line_buffered = line_buffered; FILE *in = NULL; /* Ensure initialized */ @@ -1094,6 +1104,17 @@ else endptr = main_buffer + bufflength; +/* Unless binary-files=text, see if we have a binary file. This uses the same +rule as GNU grep, namely, a search for a binary zero byte near the start of the +file. */ + +if (binary_files != BIN_TEXT) + { + binary = + memchr(main_buffer, 0, (bufflength > 1024)? 1024 : bufflength) != NULL; + if (binary && binary_files == BIN_NOMATCH) return 1; + } + /* Loop while the current pointer is not at the end of the file. For large files, endptr will be at the end of the buffer when we are in the middle of the file, but ptr will never get there, because as soon as it gets over 2/3 of the @@ -1209,6 +1230,16 @@ while (ptr < endptr) /* Just count if just counting is wanted. */ if (count_only) count++; + + /* When handling a binary file and binary-files==binary, the "binary" + variable will be set true (it's false in all other cases). In this + situation we just want to output the file name. No need to scan further. */ + + else if (binary) + { + fprintf(stdout, "Binary file %s matches\n", filename); + return 0; + } /* If all we want is a file name, there is no need to scan any more lines in the file. 
*/ @@ -1845,11 +1876,18 @@ for (op = optionlist; op->one_char != 0; op++) contains an underscore. */ if (strchr(op->long_name, '_') != NULL) continue; + + if (op->one_char > 0 && (op->long_name)[0] == 0) + n = 31 - printf(" -%c", op->one_char); + else + { + if (op->one_char > 0) sprintf(s, "-%c,", op->one_char); + else strcpy(s, " "); + n = 31 - printf(" %s --%s", s, op->long_name); + } - if (op->one_char > 0) sprintf(s, "-%c,", op->one_char); else strcpy(s, " "); - n = 31 - printf(" %s --%s", s, op->long_name); if (n < 1) n = 1; - printf("%.*s%s\n", n, " ", op->help_text); + printf("%.*s%s\n", n, " ", op->help_text); } printf("\nNumbers may be followed by K or M, e.g. --buffer-size=100K.\n"); @@ -1880,9 +1918,11 @@ switch(letter) case N_LBUFFER: line_buffered = TRUE; break; case N_LOFFSETS: line_offsets = number = TRUE; break; case N_NOJIT: study_options &= ~PCRE_STUDY_JIT_COMPILE; break; + case 'a': binary_files = BIN_TEXT; break; case 'c': count_only = TRUE; break; case 'F': process_options |= PO_FIXED_STRINGS; break; case 'H': filenames = FN_FORCE; break; + case 'I': binary_files = BIN_NOMATCH; break; case 'h': filenames = FN_NONE; break; case 'i': options |= PCRE_CASELESS; break; case 'l': omit_zero_count = TRUE; filenames = FN_MATCH_ONLY; break; @@ -2316,7 +2356,7 @@ for (i = 1; i < argc; i++) /* If the option type is OP_PATLIST, it's the -e option, which can be called multiple times to create a list of patterns. 
*/ - + if (op->type == OP_PATLIST) { if (cmd_pattern_count >= MAX_PATTERN_COUNT) @@ -2327,6 +2367,24 @@ for (i = 1; i < argc; i++) } patterns[cmd_pattern_count++] = option_data; } + + /* Handle OP_BINFILES */ + + else if (op->type == OP_BINFILES) + { + if (strcmp(option_data, "binary") == 0) + binary_files = BIN_BINARY; + else if (strcmp(option_data, "without-match") == 0) + binary_files = BIN_NOMATCH; + else if (strcmp(option_data, "text") == 0) + binary_files = BIN_TEXT; + else + { + fprintf(stderr, "pcregrep: unknown value \"%s\" for binary-files\n", + option_data); + pcregrep_exit(usage(2)); + } + } /* Otherwise, deal with single string or numeric data values. */ diff --git a/testdata/grepbinary b/testdata/grepbinary Binary files differ new file mode 100644 index 0000000..5efa130 --- /dev/null +++ b/testdata/grepbinary |