diff options
-rw-r--r-- | CMakeLists.txt | 6 | ||||
-rw-r--r-- | ChangeLog | 4 | ||||
-rw-r--r-- | README | 18 | ||||
-rwxr-xr-x | RunGrepTest | 2 | ||||
-rw-r--r-- | config-cmake.h.in | 1 | ||||
-rw-r--r-- | configure.ac | 108 | ||||
-rw-r--r-- | doc/pcre2build.3 | 23 | ||||
-rw-r--r-- | doc/pcre2grep.1 | 60 | ||||
-rw-r--r-- | src/config.h.generic | 25 | ||||
-rw-r--r-- | src/config.h.in | 17 | ||||
-rw-r--r-- | src/pcre2grep.c | 185 | ||||
-rw-r--r-- | testdata/grepoutput | 4 |
12 files changed, 286 insertions, 167 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 1987acb..034d230 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,6 +76,7 @@ # a new option instead of being unconditional. # 2016-10-05 PH fixed a typo (PCRE should be PCRE2) in above patch # fix by David Gaussmann +# 2016-10-07 PH added PCRE2GREP_MAX_BUFSIZE PROJECT(PCRE2 C) @@ -148,7 +149,10 @@ SET(PCRE2_MATCH_LIMIT_RECURSION "MATCH_LIMIT" CACHE STRING "Default limit on internal recursion. See MATCH_LIMIT_RECURSION in config.h.in for details.") SET(PCRE2GREP_BUFSIZE "20480" CACHE STRING - "Buffer size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.") + "Buffer starting size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.") + +SET(PCRE2GREP_MAX_BUFSIZE "1048576" CACHE STRING + "Buffer maximum size parameter for pcre2grep. See PCRE2GREP_MAX_BUFSIZE in config.h.in for details.") SET(PCRE2_NEWLINE "LF" CACHE STRING "What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).") @@ -61,6 +61,10 @@ escape sequence for a character whose code point was greater than \x{ff}. 9. Change 19 for 10.22 had a typo (PCRE_STATIC_RUNTIME should be PCRE2_STATIC_RUNTIME). Fix from David Gaussmann. +10. Added --max-buffer-size to pcre2grep, to allow for automatic buffer +expansion when long lines are encountered. Original patch by Dmitry +Cherniachenko. + Version 10.22 29-July-2016 -------------------------- @@ -339,12 +339,22 @@ library. They are also documented in the pcre2build man page. Of course, the relevant libraries must be installed on your system. -. The default size (in bytes) of the internal buffer used by pcre2grep can be - set by, for example: +. The default starting size (in bytes) of the internal buffer used by pcre2grep + can be set by, for example: --with-pcre2grep-bufsize=51200 - The value must be a plain integer. The default is 20480. + The value must be a plain integer. The default is 20480. 
The amount of memory + used by pcre2grep is actually three times this number, to allow for "before" + and "after" lines. + +. The default maximum size of pcre2grep's internal buffer can be set by, for + example: + + --with-pcre2grep-max-bufsize=2097152 + + The default is either 1048576 or the value of --with-pcre2grep-bufsize, + whichever is the larger. . It is possible to compile pcre2test so that it links with the libreadline or libedit libraries, by specifying, respectively, @@ -845,4 +855,4 @@ The distribution should contain the files listed below. Philip Hazel Email local part: ph10 Email domain: cam.ac.uk -Last updated: 01 April 2016 +Last updated: 07 October 2016 diff --git a/RunGrepTest b/RunGrepTest index a3e1312..8042531 100755 --- a/RunGrepTest +++ b/RunGrepTest @@ -440,7 +440,7 @@ echo "---------------------------- Test 82 -----------------------------" >>test echo "RC=$?" >>testtrygrep echo "---------------------------- Test 83 -----------------------------" >>testtrygrep -(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1 +(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=10 --max-buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1 echo "RC=$?" 
>>testtrygrep echo "---------------------------- Test 84 -----------------------------" >>testtrygrep diff --git a/config-cmake.h.in b/config-cmake.h.in index 0cfd8b1..6ea4cc9 100644 --- a/config-cmake.h.in +++ b/config-cmake.h.in @@ -41,6 +41,7 @@ #define NEWLINE_DEFAULT @NEWLINE_DEFAULT@ #define PARENS_NEST_LIMIT @PCRE2_PARENS_NEST_LIMIT@ #define PCRE2GREP_BUFSIZE @PCRE2GREP_BUFSIZE@ +#define PCRE2GREP_MAX_BUFSIZE @PCRE2GREP_MAX_BUFSIZE@ #define MAX_NAME_SIZE 32 #define MAX_NAME_COUNT 10000 diff --git a/configure.ac b/configure.ac index 9b98000..5b472f3 100644 --- a/configure.ac +++ b/configure.ac @@ -240,9 +240,15 @@ AC_ARG_ENABLE(pcre2grep-libbz2, # Handle --with-pcre2grep-bufsize=N AC_ARG_WITH(pcre2grep-bufsize, AS_HELP_STRING([--with-pcre2grep-bufsize=N], - [pcre2grep buffer size (default=20480, minimum=8192)]), + [pcre2grep initial buffer size (default=20480, minimum=8192)]), , with_pcre2grep_bufsize=20480) +# Handle --with-pcre2grep-max-bufsize=N +AC_ARG_WITH(pcre2grep-max-bufsize, + AS_HELP_STRING([--with-pcre2grep-max-bufsize=N], + [pcre2grep maximum buffer size (default=1048576, minimum=8192)]), + , with_pcre2grep_max_bufsize=1048576) + # Handle --enable-pcre2test-libedit AC_ARG_ENABLE(pcre2test-libedit, AS_HELP_STRING([--enable-pcre2test-libedit], @@ -608,15 +614,30 @@ if test $with_pcre2grep_bufsize -lt 8192 ; then with_pcre2grep_bufsize="8192" else if test $? -gt 1 ; then - AC_MSG_ERROR([Bad value for --with-pcre2grep-bufsize]) + AC_MSG_ERROR([Bad value for --with-pcre2grep-bufsize]) + fi +fi + +if test $with_pcre2grep_max_bufsize -lt $with_pcre2grep_bufsize ; then + with_pcre2grep_max_bufsize="$with_pcre2grep_bufsize" +else + if test $? -gt 1 ; then + AC_MSG_ERROR([Bad value for --with-pcre2grep-max-bufsize]) fi fi AC_DEFINE_UNQUOTED([PCRE2GREP_BUFSIZE], [$with_pcre2grep_bufsize], [ - The value of PCRE2GREP_BUFSIZE determines the size of buffer used by pcre2grep - to hold parts of the file it is searching. This is also the minimum value. 
- The actual amount of memory used by pcre2grep is three times this number, - because it allows for the buffering of "before" and "after" lines.]) + The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by + pcre2grep to hold parts of the file it is searching. The buffer will be + expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing very + long lines. The actual amount of memory used by pcre2grep is three times this + number, because it allows for the buffering of "before" and "after" lines.]) + +AC_DEFINE_UNQUOTED([PCRE2GREP_MAX_BUFSIZE], [$with_pcre2grep_max_bufsize], [ + The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer + used by pcre2grep to hold parts of the file it is searching. The actual + amount of memory used by pcre2grep is three times this number, because it + allows for the buffering of "before" and "after" lines.]) if test "$enable_pcre2test_libedit" = "yes"; then AC_DEFINE([SUPPORT_LIBEDIT], [], [ @@ -906,43 +927,44 @@ cat <<EOF $PACKAGE-$VERSION configuration summary: - Install prefix .................. : ${prefix} - C preprocessor .................. : ${CPP} - C compiler ...................... : ${CC} - Linker .......................... : ${LD} - C preprocessor flags ............ : ${CPPFLAGS} - C compiler flags ................ : ${CFLAGS} ${VISIBILITY_CFLAGS} - Linker flags .................... : ${LDFLAGS} - Extra libraries ................. : ${LIBS} - - Build 8-bit pcre2 library ....... : ${enable_pcre2_8} - Build 16-bit pcre2 library ...... : ${enable_pcre2_16} - Build 32-bit pcre2 library ...... : ${enable_pcre2_32} - Include debugging code .......... : ${enable_debug} - Enable JIT compiling support .... : ${enable_jit} - Enable Unicode support .......... : ${enable_unicode} - Newline char/sequence ........... : ${enable_newline} - \R matches only ANYCRLF ......... : ${enable_bsr_anycrlf} - \C is disabled .................. 
: ${enable_never_backslash_C} - EBCDIC coding ................... : ${enable_ebcdic} - EBCDIC code for NL .............. : ${ebcdic_nl_code} - Rebuild char tables ............. : ${enable_rebuild_chartables} - Use stack recursion ............. : ${enable_stack_for_recursion} - Internal link size .............. : ${with_link_size} - Nested parentheses limit ........ : ${with_parens_nest_limit} - Match limit ..................... : ${with_match_limit} - Match limit recursion ........... : ${with_match_limit_recursion} - Build shared libs ............... : ${enable_shared} - Build static libs ............... : ${enable_static} - Use JIT in pcre2grep ............ : ${enable_pcre2grep_jit} - Enable callouts in pcre2grep .... : ${enable_pcre2grep_callout} - Buffer size for pcre2grep ....... : ${with_pcre2grep_bufsize} - Link pcre2grep with libz ........ : ${enable_pcre2grep_libz} - Link pcre2grep with libbz2 ...... : ${enable_pcre2grep_libbz2} - Link pcre2test with libedit ..... : ${enable_pcre2test_libedit} - Link pcre2test with libreadline . : ${enable_pcre2test_libreadline} - Valgrind support ................ : ${enable_valgrind} - Code coverage ................... : ${enable_coverage} + Install prefix ..................... : ${prefix} + C preprocessor ..................... : ${CPP} + C compiler ......................... : ${CC} + Linker ............................. : ${LD} + C preprocessor flags ............... : ${CPPFLAGS} + C compiler flags ................... : ${CFLAGS} ${VISIBILITY_CFLAGS} + Linker flags ....................... : ${LDFLAGS} + Extra libraries .................... : ${LIBS} + + Build 8-bit pcre2 library .......... : ${enable_pcre2_8} + Build 16-bit pcre2 library ......... : ${enable_pcre2_16} + Build 32-bit pcre2 library ......... : ${enable_pcre2_32} + Include debugging code ............. : ${enable_debug} + Enable JIT compiling support ....... : ${enable_jit} + Enable Unicode support ............. 
: ${enable_unicode} + Newline char/sequence .............. : ${enable_newline} + \R matches only ANYCRLF ............ : ${enable_bsr_anycrlf} + \C is disabled ..................... : ${enable_never_backslash_C} + EBCDIC coding ...................... : ${enable_ebcdic} + EBCDIC code for NL ................. : ${ebcdic_nl_code} + Rebuild char tables ................ : ${enable_rebuild_chartables} + Use stack recursion ................ : ${enable_stack_for_recursion} + Internal link size ................. : ${with_link_size} + Nested parentheses limit ........... : ${with_parens_nest_limit} + Match limit ........................ : ${with_match_limit} + Match limit recursion .............. : ${with_match_limit_recursion} + Build shared libs .................. : ${enable_shared} + Build static libs .................. : ${enable_static} + Use JIT in pcre2grep ............... : ${enable_pcre2grep_jit} + Enable callouts in pcre2grep ....... : ${enable_pcre2grep_callout} + Initial buffer size for pcre2grep .. : ${with_pcre2grep_bufsize} + Maximum buffer size for pcre2grep .. : ${with_pcre2grep_max_bufsize} + Link pcre2grep with libz ........... : ${enable_pcre2grep_libz} + Link pcre2grep with libbz2 ......... : ${enable_pcre2grep_libbz2} + Link pcre2test with libedit ........ : ${enable_pcre2test_libedit} + Link pcre2test with libreadline .... : ${enable_pcre2test_libreadline} + Valgrind support ................... : ${enable_valgrind} + Code coverage ...................... : ${enable_coverage} EOF diff --git a/doc/pcre2build.3 b/doc/pcre2build.3 index 11b1c57..6088248 100644 --- a/doc/pcre2build.3 +++ b/doc/pcre2build.3 @@ -1,4 +1,4 @@ -.TH PCRE2BUILD 3 "01 April 2016" "PCRE2 10.22" +.TH PCRE2BUILD 3 "07 October 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) . @@ -385,16 +385,19 @@ they are not. 
.sp \fBpcre2grep\fP uses an internal buffer to hold a "window" on the file it is scanning, in order to be able to output "before" and "after" lines when it -finds a match. The size of the buffer is controlled by a parameter whose -default value is 20K. The buffer itself is three times this size, but because -of the way it is used for holding "before" lines, the longest line that is -guaranteed to be processable is the parameter size. You can change the default -parameter value by adding, for example, +finds a match. The starting size of the buffer is controlled by a parameter +whose default value is 20K. The buffer itself is three times this size, but +because of the way it is used for holding "before" lines, the longest line that +is guaranteed to be processable is the parameter size. If a longer line is +encountered, \fBpcre2grep\fP automatically expands the buffer, up to a +specified maximum size, whose default is 1M or the starting size, whichever is +the larger. You can change the default parameter values by adding, for example, .sp - --with-pcre2grep-bufsize=50K + --with-pcre2grep-bufsize=51200 + --with-pcre2grep-max-bufsize=2097152 .sp -to the \fBconfigure\fP command. The caller of \fPpcre2grep\fP can override this -value by using --buffer-size on the command line. +to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override +these values by using --buffer-size and --max-buffer-size on the command line. . . .SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT" @@ -532,6 +535,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 01 April 2016 +Last updated: 07 October 2016 Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1 index 6d27780..82be375 100644 --- a/doc/pcre2grep.1 +++ b/doc/pcre2grep.1 @@ -1,4 +1,4 @@ -.TH PCRE2GREP 1 "19 June 2016" "PCRE2 10.22" +.TH PCRE2GREP 1 "11 October 2016" "PCRE2 10.23" .SH NAME pcre2grep - a grep with Perl-compatible regular expressions. 
.SH SYNOPSIS @@ -52,11 +52,18 @@ span line boundaries. What defines a line boundary is controlled by the \fB-N\fP (\fB--newline\fP) option. .P The amount of memory used for buffering files that are being scanned is -controlled by a parameter that can be set by the \fB--buffer-size\fP option. -The default value for this parameter is specified when \fBpcre2grep\fP is -built, with the default default being 20K. A block of memory three times this -size is used (to allow for buffering "before" and "after" lines). An error -occurs if a line overflows the buffer. +controlled by parameters that can be set by the \fB--buffer-size\fP and +\fB--max-buffer-size\fP options. The first of these sets the size of buffer +that is obtained at the start of processing. If an input file contains very +long lines, a larger buffer may be needed; this is handled by automatically +extending the buffer, up to the limit specified by \fB--max-buffer-size\fP. The +default values for these parameters are specified when \fBpcre2grep\fP is +built, with the default defaults being 20K and 1M respectively. An error occurs +if a line is too long and the buffer can no longer be expanded. +.P +The block of memory that is actually used is three times the "buffer size", to +allow for buffering "before" and "after" lines. If the buffer size is too +small, fewer than requested "before" and "after" lines may be output. .P Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater. BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern @@ -126,24 +133,29 @@ command line starts with a hyphen but is not an option. This allows for the processing of patterns and file names that start with hyphens. .TP \fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP -Output \fInumber\fP lines of context after each matching line. If file names -and/or line numbers are being output, a hyphen separator is used instead of a -colon for the context lines. 
A line containing "--" is output between each -group of lines, unless they are in fact contiguous in the input file. The value -of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP -guarantees to have up to 8K of following text available for context output. +Output up to \fInumber\fP lines of context after each matching line. Fewer +lines are output if the next match or the end of the file is reached, or if the +processing buffer size has been set too small. If file names and/or line +numbers are being output, a hyphen separator is used instead of a colon for the +context lines. A line containing "--" is output between each group of lines, +unless they are in fact contiguous in the input file. The value of \fInumber\fP +is expected to be relatively small. However, \fBpcre2grep\fP guarantees to have +up to 8K of following text available for context output. .TP \fB-a\fP, \fB--text\fP Treat binary files as text. This is equivalent to \fB--binary-files\fP=\fItext\fP. .TP \fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP -Output \fInumber\fP lines of context before each matching line. If file names -and/or line numbers are being output, a hyphen separator is used instead of a -colon for the context lines. A line containing "--" is output between each -group of lines, unless they are in fact contiguous in the input file. The value -of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP -guarantees to have up to 8K of preceding text available for context output. +Output up to \fInumber\fP lines of context before each matching line. Fewer +lines are output if the previous match or the start of the file is within +\fInumber\fP lines, or if the processing buffer size has been set too small. If +file names and/or line numbers are being output, a hyphen separator is used +instead of a colon for the context lines. 
A line containing "--" is output +between each group of lines, unless they are in fact contiguous in the input +file. The value of \fInumber\fP is expected to be relatively small. However, +\fBpcre2grep\fP guarantees to have up to 8K of preceding text available for +context output. .TP \fB--binary-files=\fP\fIword\fP Specify how binary files are to be processed. If the word is "binary" (the @@ -158,8 +170,9 @@ be of interest and are skipped without causing any output or affecting the return code. .TP \fB--buffer-size=\fP\fInumber\fP -Set the parameter that controls how much memory is used for buffering files -that are being scanned. +Set the parameter that controls how much memory is obtained at the start of +processing for buffering files that are being scanned. See also +\fB--max-buffer-size\fP below. .TP \fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP Output \fInumber\fP lines of context both before and after each matching line. @@ -432,6 +445,11 @@ of use only if it is set smaller than \fB--match-limit\fP. There are no short forms for these options. The default settings are specified when the PCRE2 library is compiled, with the default default being 10 million. .TP +\fB--max-buffer-size=\fP\fInumber\fP +This limits the expansion of the processing buffer, whose initial size can be +set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no +smaller than the starting buffer size. +.TP \fB-M\fP, \fB--multiline\fP Allow patterns to match more than one line. When this option is given, patterns may usefully contain literal newline characters and internal occurrences of ^ @@ -757,6 +775,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 19 June 2016 +Last updated: 11 October 2016 Copyright (c) 1997-2016 University of Cambridge. 
.fi diff --git a/src/config.h.generic b/src/config.h.generic index 8a71be0..2dc0864 100644 --- a/src/config.h.generic +++ b/src/config.h.generic @@ -206,7 +206,7 @@ sure both macros are undefined; an emulation function will then be used. */ #define PACKAGE_NAME "PCRE2" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "PCRE2 10.22" +#define PACKAGE_STRING "PCRE2 10.23-RC1" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "pcre2" @@ -215,7 +215,7 @@ sure both macros are undefined; an emulation function will then be used. */ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "10.22" +#define PACKAGE_VERSION "10.23-RC1" /* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested parentheses (of any kind) in a pattern. This limits the amount of system @@ -224,15 +224,24 @@ sure both macros are undefined; an emulation function will then be used. */ #define PARENS_NEST_LIMIT 250 #endif -/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by - pcre2grep to hold parts of the file it is searching. This is also the - minimum value. The actual amount of memory used by pcre2grep is three times - this number, because it allows for the buffering of "before" and "after" - lines. */ +/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by + pcre2grep to hold parts of the file it is searching. The buffer will be + expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing + very long lines. The actual amount of memory used by pcre2grep is three + times this number, because it allows for the buffering of "before" and + "after" lines. */ #ifndef PCRE2GREP_BUFSIZE #define PCRE2GREP_BUFSIZE 20480 #endif +/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer + used by pcre2grep to hold parts of the file it is searching. 
The actual + amount of memory used by pcre2grep is three times this number, because it + allows for the buffering of "before" and "after" lines. */ +#ifndef PCRE2GREP_MAX_BUFSIZE +#define PCRE2GREP_MAX_BUFSIZE 1048576 +#endif + /* Define to any value to include debugging code. */ /* #undef PCRE2_DEBUG */ @@ -299,7 +308,7 @@ sure both macros are undefined; an emulation function will then be used. */ /* #undef SUPPORT_VALGRIND */ /* Version number of package */ -#define VERSION "10.22" +#define VERSION "10.23-RC1" /* Define to empty if `const' does not conform to ANSI C. */ /* #undef const */ diff --git a/src/config.h.in b/src/config.h.in index d4821af..477efc2 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -207,13 +207,20 @@ sure both macros are undefined; an emulation function will then be used. */ stack that is used while compiling a pattern. */ #undef PARENS_NEST_LIMIT -/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by - pcre2grep to hold parts of the file it is searching. This is also the - minimum value. The actual amount of memory used by pcre2grep is three times - this number, because it allows for the buffering of "before" and "after" - lines. */ +/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by + pcre2grep to hold parts of the file it is searching. The buffer will be + expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing + very long lines. The actual amount of memory used by pcre2grep is three + times this number, because it allows for the buffering of "before" and + "after" lines. */ #undef PCRE2GREP_BUFSIZE +/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer + used by pcre2grep to hold parts of the file it is searching. The actual + amount of memory used by pcre2grep is three times this number, because it + allows for the buffering of "before" and "after" lines. 
*/ +#undef PCRE2GREP_MAX_BUFSIZE + /* to make a symbol visible */ #undef PCRE2POSIX_EXP_DECL diff --git a/src/pcre2grep.c b/src/pcre2grep.c index 49747c0..aa8c5c2 100644 --- a/src/pcre2grep.c +++ b/src/pcre2grep.c @@ -173,6 +173,7 @@ static int before_context = 0; static int binary_files = BIN_BINARY; static int both_context = 0; static int bufthird = PCRE2GREP_BUFSIZE; +static int max_bufthird = PCRE2GREP_MAX_BUFSIZE; static int bufsize = 3*PCRE2GREP_BUFSIZE; static int endlinetype; @@ -344,6 +345,7 @@ used to identify them. */ #define N_EXCLUDE_FROM (-19) #define N_INCLUDE_FROM (-20) #define N_OM_SEPARATOR (-21) +#define N_MAX_BUFSIZE (-22) static option_item optionlist[] = { { OP_NODATA, N_NULL, NULL, "", "terminate options" }, @@ -352,7 +354,8 @@ static option_item optionlist[] = { { OP_NODATA, 'a', NULL, "text", "treat binary files as text" }, { OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" }, { OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" }, - { OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer size parameter" }, + { OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer starting size" }, + { OP_NUMBER, N_MAX_BUFSIZE,&max_bufthird, "max-buffer-size=number", "set processing buffer maximum size" }, { OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" }, { OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" }, { OP_NUMBER, 'C', &both_context, "context=number", "set number of context lines, before & after" }, @@ -952,8 +955,9 @@ for (op = optionlist; op->one_char != 0; op++) printf("%.*s%s" STDOUT_NL, n, " ", op->help_text); } -printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --buffer-size=100K." STDOUT_NL); +printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --max-buffer-size=100K." 
STDOUT_NL); printf("The default value for --buffer-size is %d." STDOUT_NL, PCRE2GREP_BUFSIZE); +printf("The default value for --max-buffer-size is %d." STDOUT_NL, PCRE2GREP_MAX_BUFSIZE); printf("When reading patterns or file names from a file, trailing white" STDOUT_NL); printf("space is removed and blank lines are ignored." STDOUT_NL); printf("The maximum size of any pattern is %d bytes." STDOUT_NL, MAXPATLEN); @@ -1100,12 +1104,12 @@ return om; * Read one line of input * *************************************************/ -/* Normally, input is read using fread() into a large buffer, so many lines may -be read at once. However, doing this for tty input means that no output appears -until a lot of input has been typed. Instead, tty input is handled line by -line. We cannot use fgets() for this, because it does not stop at a binary -zero, and therefore there is no way of telling how many characters it has read, -because there may be binary zeros embedded in the data. +/* Normally, input is read using fread() (or gzread, or BZ2_bzread) into a large +buffer, so many lines may be read at once. However, doing this for tty input +means that no output appears until a lot of input has been typed. Instead, tty +input is handled line by line. We cannot use fgets() for this, because it does +not stop at a binary zero, and therefore there is no way of telling how many +characters it has read, because there may be binary zeros embedded in the data. 
Arguments: buffer the buffer to read into @@ -1424,17 +1428,18 @@ do_after_lines(int lastmatchnumber, char *lastmatchrestart, char *endptr, if (after_context > 0 && lastmatchnumber > 0) { int count = 0; - while (lastmatchrestart < endptr && count++ < after_context) + while (lastmatchrestart < endptr && count < after_context) { int ellength; - char *pp = lastmatchrestart; + char *pp = end_of_line(lastmatchrestart, endptr, &ellength); + if (ellength == 0 && pp == main_buffer + bufsize) break; if (printname != NULL) fprintf(stdout, "%s-", printname); if (number) fprintf(stdout, "%d-", lastmatchnumber++); - pp = end_of_line(pp, endptr, &ellength); FWRITE(lastmatchrestart, 1, pp - lastmatchrestart, stdout); lastmatchrestart = pp; + count++; } - hyphenpending = TRUE; + if (count > 0) hyphenpending = TRUE; } } @@ -1770,6 +1775,33 @@ return result != 0; /************************************************* +* Read a portion of the file into buffer * +*************************************************/ + +static int +fill_buffer(void *handle, int frtype, char *buffer, int length, + BOOL input_line_buffered) +{ +#ifdef SUPPORT_LIBZ +if (frtype == FR_LIBZ) + return gzread((gzFile)handle, buffer, length); +else +#endif + +#ifdef SUPPORT_LIBBZ2 +if (frtype == FR_LIBBZ2) + return BZ2_bzread((BZFILE *)handle, buffer, length); +else +#endif + +return (input_line_buffered ? + read_one_line(buffer, length, (FILE *)handle) : + fread(buffer, 1, length, (FILE *)handle)); +} + + + +/************************************************* * Grep an individual file * *************************************************/ @@ -1813,49 +1845,24 @@ BOOL endhyphenpending = FALSE; BOOL input_line_buffered = line_buffered; FILE *in = NULL; /* Ensure initialized */ -#ifdef SUPPORT_LIBZ -gzFile ingz = NULL; -#endif - -#ifdef SUPPORT_LIBBZ2 -BZFILE *inbz2 = NULL; -#endif - - /* Do the first read into the start of the buffer and set up the pointer to end of what we have. 
In the case of libz, a non-zipped .gz file will be read as a plain file. However, if a .bz2 file isn't actually bzipped, the first read will fail. */ -(void)frtype; - -#ifdef SUPPORT_LIBZ -if (frtype == FR_LIBZ) +if (frtype != FR_LIBZ && frtype != FR_LIBBZ2) { - ingz = (gzFile)handle; - bufflength = gzread (ingz, main_buffer, bufsize); + in = (FILE *)handle; + if (is_file_tty(in)) input_line_buffered = TRUE; } -else -#endif + +bufflength = fill_buffer(handle, frtype, main_buffer, bufsize, + input_line_buffered); #ifdef SUPPORT_LIBBZ2 -if (frtype == FR_LIBBZ2) - { - inbz2 = (BZFILE *)handle; - bufflength = BZ2_bzread(inbz2, main_buffer, bufsize); - if ((int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */ - } /* without the cast it is unsigned. */ -else +if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */ #endif - { - in = (FILE *)handle; - if (is_file_tty(in)) input_line_buffered = TRUE; - bufflength = input_line_buffered? - read_one_line(main_buffer, bufsize, in) : - fread(main_buffer, 1, bufsize, in); - } - endptr = main_buffer + bufflength; /* Unless binary-files=text, see if we have a binary file. This uses the same @@ -1899,18 +1906,61 @@ while (ptr < endptr) /* Check to see if the line we are looking at extends right to the very end of the buffer without a line terminator. This means the line is too long to - handle. */ + handle at the current buffer size. Until the buffer reaches its maximum size, + try doubling it and reading more data. */ if (endlinelength == 0 && t == main_buffer + bufsize) { - fprintf(stderr, "pcre2grep: line %d%s%s is too long for the internal buffer\n" - "pcre2grep: the buffer size is %d\n" - "pcre2grep: use the --buffer-size option to change it\n", - linenumber, - (filename == NULL)? "" : " of file ", - (filename == NULL)? 
"" : filename, - bufthird); - return 2; + if (bufthird < max_bufthird) + { + char *new_buffer; + int new_bufthird = 2*bufthird; + + if (new_bufthird > max_bufthird) new_bufthird = max_bufthird; + new_buffer = (char *)malloc(3*new_bufthird); + + if (new_buffer == NULL) + { + fprintf(stderr, + "pcre2grep: line %d%s%s is too long for the internal buffer\n" + "pcre2grep: not enough memory to increase the buffer size to %d\n", + linenumber, + (filename == NULL)? "" : " of file ", + (filename == NULL)? "" : filename, + new_bufthird); + return 2; + } + + /* Copy the data and adjust pointers to the new buffer location. */ + + memcpy(new_buffer, main_buffer, bufsize); + bufthird = new_bufthird; + bufsize = 3*bufthird; + ptr = new_buffer + (ptr - main_buffer); + lastmatchrestart = new_buffer + (lastmatchrestart - main_buffer); + free(main_buffer); + main_buffer = new_buffer; + + /* Read more data into the buffer and then try to find the line ending + again. */ + + bufflength += fill_buffer(handle, frtype, main_buffer + bufflength, + bufsize - bufflength, input_line_buffered); + endptr = main_buffer + bufflength; + continue; + } + else + { + fprintf(stderr, + "pcre2grep: line %d%s%s is too long for the internal buffer\n" + "pcre2grep: the maximum buffer size is %d\n" + "pcre2grep: use the --max-buffer-size option to change it\n", + linenumber, + (filename == NULL)? "" : " of file ", + (filename == NULL)? "" : filename, + bufthird); + return 2; + } } /* Extra processing for Jeffrey Friedl's debugging. 
*/ @@ -2320,8 +2370,9 @@ while (ptr < endptr) lastmatchnumber > 0 && lastmatchrestart < main_buffer + bufthird) { + do_after_lines(lastmatchnumber, lastmatchrestart, endptr, printname); - lastmatchnumber = 0; + lastmatchnumber = 0; /* Indicates no after lines pending */ } /* Now do the shuffle */ @@ -2329,24 +2380,8 @@ while (ptr < endptr) memmove(main_buffer, main_buffer + bufthird, 2*bufthird); ptr -= bufthird; -#ifdef SUPPORT_LIBZ - if (frtype == FR_LIBZ) - bufflength = 2*bufthird + - gzread (ingz, main_buffer + 2*bufthird, bufthird); - else -#endif - -#ifdef SUPPORT_LIBBZ2 - if (frtype == FR_LIBBZ2) - bufflength = 2*bufthird + - BZ2_bzread(inbz2, main_buffer + 2*bufthird, bufthird); - else -#endif - - bufflength = 2*bufthird + - (input_line_buffered? - read_one_line(main_buffer + 2*bufthird, bufthird, in) : - fread(main_buffer + 2*bufthird, 1, bufthird, in)); + bufflength = 2*bufthird + fill_buffer(handle, frtype, + main_buffer + 2*bufthird, bufthird, input_line_buffered); endptr = main_buffer + bufflength; /* Adjust any last match point */ @@ -3427,6 +3462,12 @@ if (jfriedl_XT != 0 || jfriedl_XR != 0) /* Get memory for the main buffer. */ +if (bufthird <= 0) + { + fprintf(stderr, "pcre2grep: --buffer-size must be greater than zero\n"); + goto EXIT2; + } + bufsize = 3*bufthird; main_buffer = (char *)malloc(bufsize); diff --git a/testdata/grepoutput b/testdata/grepoutput index 9d41817..8e1f68c 100644 --- a/testdata/grepoutput +++ b/testdata/grepoutput @@ -637,8 +637,8 @@ RC=0 RC=0 ---------------------------- Test 83 ----------------------------- pcre2grep: line 4 of file ./testdata/grepinput3 is too long for the internal buffer -pcre2grep: the buffer size is 100 -pcre2grep: use the --buffer-size option to change it +pcre2grep: the maximum buffer size is 100 +pcre2grep: use the --max-buffer-size option to change it RC=2 ---------------------------- Test 84 ----------------------------- testdata/grepinputv:fox jumps |