summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2016-10-11 16:40:09 +0000
committerph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2016-10-11 16:40:09 +0000
commit9d50d884a602b7d13578f60b792f8b4eb721a688 (patch)
treebca5a76682d97065c9e555481182f95af4578ea8
parent64cda2113e422dba36de0699bf2a3a6a393b3eae (diff)
downloadpcre2-9d50d884a602b7d13578f60b792f8b4eb721a688.tar.gz
Implement buffer expansion in pcre2grep.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@563 6239d852-aaf2-0410-a92c-79f79f948069
-rw-r--r--CMakeLists.txt6
-rw-r--r--ChangeLog4
-rw-r--r--README18
-rwxr-xr-xRunGrepTest2
-rw-r--r--config-cmake.h.in1
-rw-r--r--configure.ac108
-rw-r--r--doc/pcre2build.323
-rw-r--r--doc/pcre2grep.160
-rw-r--r--src/config.h.generic25
-rw-r--r--src/config.h.in17
-rw-r--r--src/pcre2grep.c185
-rw-r--r--testdata/grepoutput4
12 files changed, 286 insertions, 167 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1987acb..034d230 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,6 +76,7 @@
# a new option instead of being unconditional.
# 2016-10-05 PH fixed a typo (PCRE should be PCRE2) in above patch
# fix by David Gaussmann
+# 2016-10-07 PH added PCRE2GREP_MAX_BUFSIZE
PROJECT(PCRE2 C)
@@ -148,7 +149,10 @@ SET(PCRE2_MATCH_LIMIT_RECURSION "MATCH_LIMIT" CACHE STRING
"Default limit on internal recursion. See MATCH_LIMIT_RECURSION in config.h.in for details.")
SET(PCRE2GREP_BUFSIZE "20480" CACHE STRING
- "Buffer size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.")
+ "Buffer starting size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.")
+
+SET(PCRE2GREP_MAX_BUFSIZE "1048576" CACHE STRING
+ "Buffer maximum size parameter for pcre2grep. See PCRE2GREP_MAX_BUFSIZE in config.h.in for details.")
SET(PCRE2_NEWLINE "LF" CACHE STRING
"What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).")
diff --git a/ChangeLog b/ChangeLog
index 76b3e0d..4ddfdd1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -61,6 +61,10 @@ escape sequence for a character whose code point was greater than \x{ff}.
9. Change 19 for 10.22 had a typo (PCRE_STATIC_RUNTIME should be
PCRE2_STATIC_RUNTIME). Fix from David Gaussmann.
+10. Added --max-buffer-size to pcre2grep, to allow for automatic buffer
+expansion when long lines are encountered. Original patch by Dmitry
+Cherniachenko.
+
Version 10.22 29-July-2016
--------------------------
diff --git a/README b/README
index 03d67f6..4264772 100644
--- a/README
+++ b/README
@@ -339,12 +339,22 @@ library. They are also documented in the pcre2build man page.
Of course, the relevant libraries must be installed on your system.
-. The default size (in bytes) of the internal buffer used by pcre2grep can be
- set by, for example:
+. The default starting size (in bytes) of the internal buffer used by pcre2grep
+ can be set by, for example:
--with-pcre2grep-bufsize=51200
- The value must be a plain integer. The default is 20480.
+ The value must be a plain integer. The default is 20480. The amount of memory
+ used by pcre2grep is actually three times this number, to allow for "before"
+ and "after" lines.
+
+. The default maximum size of pcre2grep's internal buffer can be set by, for
+ example:
+
+ --with-pcre2grep-max-bufsize=2097152
+
+ The default is either 1048576 or the value of --with-pcre2grep-bufsize,
+ whichever is the larger.
. It is possible to compile pcre2test so that it links with the libreadline
or libedit libraries, by specifying, respectively,
@@ -845,4 +855,4 @@ The distribution should contain the files listed below.
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
-Last updated: 01 April 2016
+Last updated: 07 October 2016
diff --git a/RunGrepTest b/RunGrepTest
index a3e1312..8042531 100755
--- a/RunGrepTest
+++ b/RunGrepTest
@@ -440,7 +440,7 @@ echo "---------------------------- Test 82 -----------------------------" >>test
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 83 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1
+(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=10 --max-buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 84 -----------------------------" >>testtrygrep
diff --git a/config-cmake.h.in b/config-cmake.h.in
index 0cfd8b1..6ea4cc9 100644
--- a/config-cmake.h.in
+++ b/config-cmake.h.in
@@ -41,6 +41,7 @@
#define NEWLINE_DEFAULT @NEWLINE_DEFAULT@
#define PARENS_NEST_LIMIT @PCRE2_PARENS_NEST_LIMIT@
#define PCRE2GREP_BUFSIZE @PCRE2GREP_BUFSIZE@
+#define PCRE2GREP_MAX_BUFSIZE @PCRE2GREP_MAX_BUFSIZE@
#define MAX_NAME_SIZE 32
#define MAX_NAME_COUNT 10000
diff --git a/configure.ac b/configure.ac
index 9b98000..5b472f3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -240,9 +240,15 @@ AC_ARG_ENABLE(pcre2grep-libbz2,
# Handle --with-pcre2grep-bufsize=N
AC_ARG_WITH(pcre2grep-bufsize,
AS_HELP_STRING([--with-pcre2grep-bufsize=N],
- [pcre2grep buffer size (default=20480, minimum=8192)]),
+ [pcre2grep initial buffer size (default=20480, minimum=8192)]),
, with_pcre2grep_bufsize=20480)
+# Handle --with-pcre2grep-max-bufsize=N
+AC_ARG_WITH(pcre2grep-max-bufsize,
+ AS_HELP_STRING([--with-pcre2grep-max-bufsize=N],
+ [pcre2grep maximum buffer size (default=1048576, minimum=8192)]),
+ , with_pcre2grep_max_bufsize=1048576)
+
# Handle --enable-pcre2test-libedit
AC_ARG_ENABLE(pcre2test-libedit,
AS_HELP_STRING([--enable-pcre2test-libedit],
@@ -608,15 +614,30 @@ if test $with_pcre2grep_bufsize -lt 8192 ; then
with_pcre2grep_bufsize="8192"
else
if test $? -gt 1 ; then
- AC_MSG_ERROR([Bad value for --with-pcre2grep-bufsize])
+ AC_MSG_ERROR([Bad value for --with-pcre2grep-bufsize])
+ fi
+fi
+
+if test $with_pcre2grep_max_bufsize -lt $with_pcre2grep_bufsize ; then
+ with_pcre2grep_max_bufsize="$with_pcre2grep_bufsize"
+else
+ if test $? -gt 1 ; then
+ AC_MSG_ERROR([Bad value for --with-pcre2grep-max-bufsize])
fi
fi
AC_DEFINE_UNQUOTED([PCRE2GREP_BUFSIZE], [$with_pcre2grep_bufsize], [
- The value of PCRE2GREP_BUFSIZE determines the size of buffer used by pcre2grep
- to hold parts of the file it is searching. This is also the minimum value.
- The actual amount of memory used by pcre2grep is three times this number,
- because it allows for the buffering of "before" and "after" lines.])
+ The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
+ pcre2grep to hold parts of the file it is searching. The buffer will be
+ expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing very
+ long lines. The actual amount of memory used by pcre2grep is three times this
+ number, because it allows for the buffering of "before" and "after" lines.])
+
+AC_DEFINE_UNQUOTED([PCRE2GREP_MAX_BUFSIZE], [$with_pcre2grep_max_bufsize], [
+ The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
+ used by pcre2grep to hold parts of the file it is searching. The actual
+ amount of memory used by pcre2grep is three times this number, because it
+ allows for the buffering of "before" and "after" lines.])
if test "$enable_pcre2test_libedit" = "yes"; then
AC_DEFINE([SUPPORT_LIBEDIT], [], [
@@ -906,43 +927,44 @@ cat <<EOF
$PACKAGE-$VERSION configuration summary:
- Install prefix .................. : ${prefix}
- C preprocessor .................. : ${CPP}
- C compiler ...................... : ${CC}
- Linker .......................... : ${LD}
- C preprocessor flags ............ : ${CPPFLAGS}
- C compiler flags ................ : ${CFLAGS} ${VISIBILITY_CFLAGS}
- Linker flags .................... : ${LDFLAGS}
- Extra libraries ................. : ${LIBS}
-
- Build 8-bit pcre2 library ....... : ${enable_pcre2_8}
- Build 16-bit pcre2 library ...... : ${enable_pcre2_16}
- Build 32-bit pcre2 library ...... : ${enable_pcre2_32}
- Include debugging code .......... : ${enable_debug}
- Enable JIT compiling support .... : ${enable_jit}
- Enable Unicode support .......... : ${enable_unicode}
- Newline char/sequence ........... : ${enable_newline}
- \R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
- \C is disabled .................. : ${enable_never_backslash_C}
- EBCDIC coding ................... : ${enable_ebcdic}
- EBCDIC code for NL .............. : ${ebcdic_nl_code}
- Rebuild char tables ............. : ${enable_rebuild_chartables}
- Use stack recursion ............. : ${enable_stack_for_recursion}
- Internal link size .............. : ${with_link_size}
- Nested parentheses limit ........ : ${with_parens_nest_limit}
- Match limit ..................... : ${with_match_limit}
- Match limit recursion ........... : ${with_match_limit_recursion}
- Build shared libs ............... : ${enable_shared}
- Build static libs ............... : ${enable_static}
- Use JIT in pcre2grep ............ : ${enable_pcre2grep_jit}
- Enable callouts in pcre2grep .... : ${enable_pcre2grep_callout}
- Buffer size for pcre2grep ....... : ${with_pcre2grep_bufsize}
- Link pcre2grep with libz ........ : ${enable_pcre2grep_libz}
- Link pcre2grep with libbz2 ...... : ${enable_pcre2grep_libbz2}
- Link pcre2test with libedit ..... : ${enable_pcre2test_libedit}
- Link pcre2test with libreadline . : ${enable_pcre2test_libreadline}
- Valgrind support ................ : ${enable_valgrind}
- Code coverage ................... : ${enable_coverage}
+ Install prefix ..................... : ${prefix}
+ C preprocessor ..................... : ${CPP}
+ C compiler ......................... : ${CC}
+ Linker ............................. : ${LD}
+ C preprocessor flags ............... : ${CPPFLAGS}
+ C compiler flags ................... : ${CFLAGS} ${VISIBILITY_CFLAGS}
+ Linker flags ....................... : ${LDFLAGS}
+ Extra libraries .................... : ${LIBS}
+
+ Build 8-bit pcre2 library .......... : ${enable_pcre2_8}
+ Build 16-bit pcre2 library ......... : ${enable_pcre2_16}
+ Build 32-bit pcre2 library ......... : ${enable_pcre2_32}
+ Include debugging code ............. : ${enable_debug}
+ Enable JIT compiling support ....... : ${enable_jit}
+ Enable Unicode support ............. : ${enable_unicode}
+ Newline char/sequence .............. : ${enable_newline}
+ \R matches only ANYCRLF ............ : ${enable_bsr_anycrlf}
+ \C is disabled ..................... : ${enable_never_backslash_C}
+ EBCDIC coding ...................... : ${enable_ebcdic}
+ EBCDIC code for NL ................. : ${ebcdic_nl_code}
+ Rebuild char tables ................ : ${enable_rebuild_chartables}
+ Use stack recursion ................ : ${enable_stack_for_recursion}
+ Internal link size ................. : ${with_link_size}
+ Nested parentheses limit ........... : ${with_parens_nest_limit}
+ Match limit ........................ : ${with_match_limit}
+ Match limit recursion .............. : ${with_match_limit_recursion}
+ Build shared libs .................. : ${enable_shared}
+ Build static libs .................. : ${enable_static}
+ Use JIT in pcre2grep ............... : ${enable_pcre2grep_jit}
+ Enable callouts in pcre2grep ....... : ${enable_pcre2grep_callout}
+ Initial buffer size for pcre2grep .. : ${with_pcre2grep_bufsize}
+ Maximum buffer size for pcre2grep .. : ${with_pcre2grep_max_bufsize}
+ Link pcre2grep with libz ........... : ${enable_pcre2grep_libz}
+ Link pcre2grep with libbz2 ......... : ${enable_pcre2grep_libbz2}
+ Link pcre2test with libedit ........ : ${enable_pcre2test_libedit}
+ Link pcre2test with libreadline .... : ${enable_pcre2test_libreadline}
+ Valgrind support ................... : ${enable_valgrind}
+ Code coverage ...................... : ${enable_coverage}
EOF
diff --git a/doc/pcre2build.3 b/doc/pcre2build.3
index 11b1c57..6088248 100644
--- a/doc/pcre2build.3
+++ b/doc/pcre2build.3
@@ -1,4 +1,4 @@
-.TH PCRE2BUILD 3 "01 April 2016" "PCRE2 10.22"
+.TH PCRE2BUILD 3 "07 October 2016" "PCRE2 10.23"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.
@@ -385,16 +385,19 @@ they are not.
.sp
\fBpcre2grep\fP uses an internal buffer to hold a "window" on the file it is
scanning, in order to be able to output "before" and "after" lines when it
-finds a match. The size of the buffer is controlled by a parameter whose
-default value is 20K. The buffer itself is three times this size, but because
-of the way it is used for holding "before" lines, the longest line that is
-guaranteed to be processable is the parameter size. You can change the default
-parameter value by adding, for example,
+finds a match. The starting size of the buffer is controlled by a parameter
+whose default value is 20K. The buffer itself is three times this size, but
+because of the way it is used for holding "before" lines, the longest line that
+is guaranteed to be processable is the parameter size. If a longer line is
+encountered, \fBpcre2grep\fP automatically expands the buffer, up to a
+specified maximum size, whose default is 1M or the starting size, whichever is
+the larger. You can change the default parameter values by adding, for example,
.sp
- --with-pcre2grep-bufsize=50K
+ --with-pcre2grep-bufsize=51200
+ --with-pcre2grep-max-bufsize=2097152
.sp
-to the \fBconfigure\fP command. The caller of \fPpcre2grep\fP can override this
-value by using --buffer-size on the command line.
+to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override
+these values by using --buffer-size and --max-buffer-size on the command line.
.
.
.SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT"
@@ -532,6 +535,6 @@ Cambridge, England.
.rs
.sp
.nf
-Last updated: 01 April 2016
+Last updated: 07 October 2016
Copyright (c) 1997-2016 University of Cambridge.
.fi
diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1
index 6d27780..82be375 100644
--- a/doc/pcre2grep.1
+++ b/doc/pcre2grep.1
@@ -1,4 +1,4 @@
-.TH PCRE2GREP 1 "19 June 2016" "PCRE2 10.22"
+.TH PCRE2GREP 1 "11 October 2016" "PCRE2 10.23"
.SH NAME
pcre2grep - a grep with Perl-compatible regular expressions.
.SH SYNOPSIS
@@ -52,11 +52,18 @@ span line boundaries. What defines a line boundary is controlled by the
\fB-N\fP (\fB--newline\fP) option.
.P
The amount of memory used for buffering files that are being scanned is
-controlled by a parameter that can be set by the \fB--buffer-size\fP option.
-The default value for this parameter is specified when \fBpcre2grep\fP is
-built, with the default default being 20K. A block of memory three times this
-size is used (to allow for buffering "before" and "after" lines). An error
-occurs if a line overflows the buffer.
+controlled by parameters that can be set by the \fB--buffer-size\fP and
+\fB--max-buffer-size\fP options. The first of these sets the size of buffer
+that is obtained at the start of processing. If an input file contains very
+long lines, a larger buffer may be needed; this is handled by automatically
+extending the buffer, up to the limit specified by \fB--max-buffer-size\fP. The
+default values for these parameters are specified when \fBpcre2grep\fP is
+built, with the default defaults being 20K and 1M respectively. An error occurs
+if a line is too long and the buffer can no longer be expanded.
+.P
+The block of memory that is actually used is three times the "buffer size", to
+allow for buffering "before" and "after" lines. If the buffer size is too
+small, fewer than requested "before" and "after" lines may be output.
.P
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater.
BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern
@@ -126,24 +133,29 @@ command line starts with a hyphen but is not an option. This allows for the
processing of patterns and file names that start with hyphens.
.TP
\fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP
-Output \fInumber\fP lines of context after each matching line. If file names
-and/or line numbers are being output, a hyphen separator is used instead of a
-colon for the context lines. A line containing "--" is output between each
-group of lines, unless they are in fact contiguous in the input file. The value
-of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP
-guarantees to have up to 8K of following text available for context output.
+Output up to \fInumber\fP lines of context after each matching line. Fewer
+lines are output if the next match or the end of the file is reached, or if the
+processing buffer size has been set too small. If file names and/or line
+numbers are being output, a hyphen separator is used instead of a colon for the
+context lines. A line containing "--" is output between each group of lines,
+unless they are in fact contiguous in the input file. The value of \fInumber\fP
+is expected to be relatively small. However, \fBpcre2grep\fP guarantees to have
+up to 8K of following text available for context output.
.TP
\fB-a\fP, \fB--text\fP
Treat binary files as text. This is equivalent to
\fB--binary-files\fP=\fItext\fP.
.TP
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
-Output \fInumber\fP lines of context before each matching line. If file names
-and/or line numbers are being output, a hyphen separator is used instead of a
-colon for the context lines. A line containing "--" is output between each
-group of lines, unless they are in fact contiguous in the input file. The value
-of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP
-guarantees to have up to 8K of preceding text available for context output.
+Output up to \fInumber\fP lines of context before each matching line. Fewer
+lines are output if the previous match or the start of the file is within
+\fInumber\fP lines, or if the processing buffer size has been set too small. If
+file names and/or line numbers are being output, a hyphen separator is used
+instead of a colon for the context lines. A line containing "--" is output
+between each group of lines, unless they are in fact contiguous in the input
+file. The value of \fInumber\fP is expected to be relatively small. However,
+\fBpcre2grep\fP guarantees to have up to 8K of preceding text available for
+context output.
.TP
\fB--binary-files=\fP\fIword\fP
Specify how binary files are to be processed. If the word is "binary" (the
@@ -158,8 +170,9 @@ be of interest and are skipped without causing any output or affecting the
return code.
.TP
\fB--buffer-size=\fP\fInumber\fP
-Set the parameter that controls how much memory is used for buffering files
-that are being scanned.
+Set the parameter that controls how much memory is obtained at the start of
+processing for buffering files that are being scanned. See also
+\fB--max-buffer-size\fP below.
.TP
\fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP
Output \fInumber\fP lines of context both before and after each matching line.
@@ -432,6 +445,11 @@ of use only if it is set smaller than \fB--match-limit\fP.
There are no short forms for these options. The default settings are specified
when the PCRE2 library is compiled, with the default default being 10 million.
.TP
+\fB--max-buffer-size=\fP\fInumber\fP
+This limits the expansion of the processing buffer, whose initial size can be
+set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
+smaller than the starting buffer size.
+.TP
\fB-M\fP, \fB--multiline\fP
Allow patterns to match more than one line. When this option is given, patterns
may usefully contain literal newline characters and internal occurrences of ^
@@ -757,6 +775,6 @@ Cambridge, England.
.rs
.sp
.nf
-Last updated: 19 June 2016
+Last updated: 11 October 2016
Copyright (c) 1997-2016 University of Cambridge.
.fi
diff --git a/src/config.h.generic b/src/config.h.generic
index 8a71be0..2dc0864 100644
--- a/src/config.h.generic
+++ b/src/config.h.generic
@@ -206,7 +206,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_NAME "PCRE2"
/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "PCRE2 10.22"
+#define PACKAGE_STRING "PCRE2 10.23-RC1"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre2"
@@ -215,7 +215,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
-#define PACKAGE_VERSION "10.22"
+#define PACKAGE_VERSION "10.23-RC1"
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
parentheses (of any kind) in a pattern. This limits the amount of system
@@ -224,15 +224,24 @@ sure both macros are undefined; an emulation function will then be used. */
#define PARENS_NEST_LIMIT 250
#endif
-/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by
- pcre2grep to hold parts of the file it is searching. This is also the
- minimum value. The actual amount of memory used by pcre2grep is three times
- this number, because it allows for the buffering of "before" and "after"
- lines. */
+/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
+ pcre2grep to hold parts of the file it is searching. The buffer will be
+ expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing
+ very long lines. The actual amount of memory used by pcre2grep is three
+ times this number, because it allows for the buffering of "before" and
+ "after" lines. */
#ifndef PCRE2GREP_BUFSIZE
#define PCRE2GREP_BUFSIZE 20480
#endif
+/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
+ used by pcre2grep to hold parts of the file it is searching. The actual
+ amount of memory used by pcre2grep is three times this number, because it
+ allows for the buffering of "before" and "after" lines. */
+#ifndef PCRE2GREP_MAX_BUFSIZE
+#define PCRE2GREP_MAX_BUFSIZE 1048576
+#endif
+
/* Define to any value to include debugging code. */
/* #undef PCRE2_DEBUG */
@@ -299,7 +308,7 @@ sure both macros are undefined; an emulation function will then be used. */
/* #undef SUPPORT_VALGRIND */
/* Version number of package */
-#define VERSION "10.22"
+#define VERSION "10.23-RC1"
/* Define to empty if `const' does not conform to ANSI C. */
/* #undef const */
diff --git a/src/config.h.in b/src/config.h.in
index d4821af..477efc2 100644
--- a/src/config.h.in
+++ b/src/config.h.in
@@ -207,13 +207,20 @@ sure both macros are undefined; an emulation function will then be used. */
stack that is used while compiling a pattern. */
#undef PARENS_NEST_LIMIT
-/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by
- pcre2grep to hold parts of the file it is searching. This is also the
- minimum value. The actual amount of memory used by pcre2grep is three times
- this number, because it allows for the buffering of "before" and "after"
- lines. */
+/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
+ pcre2grep to hold parts of the file it is searching. The buffer will be
+ expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing
+ very long lines. The actual amount of memory used by pcre2grep is three
+ times this number, because it allows for the buffering of "before" and
+ "after" lines. */
#undef PCRE2GREP_BUFSIZE
+/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
+ used by pcre2grep to hold parts of the file it is searching. The actual
+ amount of memory used by pcre2grep is three times this number, because it
+ allows for the buffering of "before" and "after" lines. */
+#undef PCRE2GREP_MAX_BUFSIZE
+
/* to make a symbol visible */
#undef PCRE2POSIX_EXP_DECL
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
index 49747c0..aa8c5c2 100644
--- a/src/pcre2grep.c
+++ b/src/pcre2grep.c
@@ -173,6 +173,7 @@ static int before_context = 0;
static int binary_files = BIN_BINARY;
static int both_context = 0;
static int bufthird = PCRE2GREP_BUFSIZE;
+static int max_bufthird = PCRE2GREP_MAX_BUFSIZE;
static int bufsize = 3*PCRE2GREP_BUFSIZE;
static int endlinetype;
@@ -344,6 +345,7 @@ used to identify them. */
#define N_EXCLUDE_FROM (-19)
#define N_INCLUDE_FROM (-20)
#define N_OM_SEPARATOR (-21)
+#define N_MAX_BUFSIZE (-22)
static option_item optionlist[] = {
{ OP_NODATA, N_NULL, NULL, "", "terminate options" },
@@ -352,7 +354,8 @@ static option_item optionlist[] = {
{ OP_NODATA, 'a', NULL, "text", "treat binary files as text" },
{ OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" },
{ OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" },
- { OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer size parameter" },
+ { OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer starting size" },
+ { OP_NUMBER, N_MAX_BUFSIZE,&max_bufthird, "max-buffer-size=number", "set processing buffer maximum size" },
{ OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" },
{ OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" },
{ OP_NUMBER, 'C', &both_context, "context=number", "set number of context lines, before & after" },
@@ -952,8 +955,9 @@ for (op = optionlist; op->one_char != 0; op++)
printf("%.*s%s" STDOUT_NL, n, " ", op->help_text);
}
-printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --buffer-size=100K." STDOUT_NL);
+printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --max-buffer-size=100K." STDOUT_NL);
printf("The default value for --buffer-size is %d." STDOUT_NL, PCRE2GREP_BUFSIZE);
+printf("The default value for --max-buffer-size is %d." STDOUT_NL, PCRE2GREP_MAX_BUFSIZE);
printf("When reading patterns or file names from a file, trailing white" STDOUT_NL);
printf("space is removed and blank lines are ignored." STDOUT_NL);
printf("The maximum size of any pattern is %d bytes." STDOUT_NL, MAXPATLEN);
@@ -1100,12 +1104,12 @@ return om;
* Read one line of input *
*************************************************/
-/* Normally, input is read using fread() into a large buffer, so many lines may
-be read at once. However, doing this for tty input means that no output appears
-until a lot of input has been typed. Instead, tty input is handled line by
-line. We cannot use fgets() for this, because it does not stop at a binary
-zero, and therefore there is no way of telling how many characters it has read,
-because there may be binary zeros embedded in the data.
+/* Normally, input is read using fread() (or gzread, or BZ2_bzread) into a large
+buffer, so many lines may be read at once. However, doing this for tty input
+means that no output appears until a lot of input has been typed. Instead, tty
+input is handled line by line. We cannot use fgets() for this, because it does
+not stop at a binary zero, and therefore there is no way of telling how many
+characters it has read, because there may be binary zeros embedded in the data.
Arguments:
buffer the buffer to read into
@@ -1424,17 +1428,18 @@ do_after_lines(int lastmatchnumber, char *lastmatchrestart, char *endptr,
if (after_context > 0 && lastmatchnumber > 0)
{
int count = 0;
- while (lastmatchrestart < endptr && count++ < after_context)
+ while (lastmatchrestart < endptr && count < after_context)
{
int ellength;
- char *pp = lastmatchrestart;
+ char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
+ if (ellength == 0 && pp == main_buffer + bufsize) break;
if (printname != NULL) fprintf(stdout, "%s-", printname);
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
- pp = end_of_line(pp, endptr, &ellength);
FWRITE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
lastmatchrestart = pp;
+ count++;
}
- hyphenpending = TRUE;
+ if (count > 0) hyphenpending = TRUE;
}
}
@@ -1770,6 +1775,33 @@ return result != 0;
/*************************************************
+* Read a portion of the file into buffer *
+*************************************************/
+
+static int
+fill_buffer(void *handle, int frtype, char *buffer, int length,
+ BOOL input_line_buffered)
+{
+#ifdef SUPPORT_LIBZ
+if (frtype == FR_LIBZ)
+ return gzread((gzFile)handle, buffer, length);
+else
+#endif
+
+#ifdef SUPPORT_LIBBZ2
+if (frtype == FR_LIBBZ2)
+ return BZ2_bzread((BZFILE *)handle, buffer, length);
+else
+#endif
+
+return (input_line_buffered ?
+ read_one_line(buffer, length, (FILE *)handle) :
+ fread(buffer, 1, length, (FILE *)handle));
+}
+
+
+
+/*************************************************
* Grep an individual file *
*************************************************/
@@ -1813,49 +1845,24 @@ BOOL endhyphenpending = FALSE;
BOOL input_line_buffered = line_buffered;
FILE *in = NULL; /* Ensure initialized */
-#ifdef SUPPORT_LIBZ
-gzFile ingz = NULL;
-#endif
-
-#ifdef SUPPORT_LIBBZ2
-BZFILE *inbz2 = NULL;
-#endif
-
-
/* Do the first read into the start of the buffer and set up the pointer to end
of what we have. In the case of libz, a non-zipped .gz file will be read as a
plain file. However, if a .bz2 file isn't actually bzipped, the first read will
fail. */
-(void)frtype;
-
-#ifdef SUPPORT_LIBZ
-if (frtype == FR_LIBZ)
+if (frtype != FR_LIBZ && frtype != FR_LIBBZ2)
{
- ingz = (gzFile)handle;
- bufflength = gzread (ingz, main_buffer, bufsize);
+ in = (FILE *)handle;
+ if (is_file_tty(in)) input_line_buffered = TRUE;
}
-else
-#endif
+
+bufflength = fill_buffer(handle, frtype, main_buffer, bufsize,
+ input_line_buffered);
#ifdef SUPPORT_LIBBZ2
-if (frtype == FR_LIBBZ2)
- {
- inbz2 = (BZFILE *)handle;
- bufflength = BZ2_bzread(inbz2, main_buffer, bufsize);
- if ((int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */
- } /* without the cast it is unsigned. */
-else
+if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */
#endif
- {
- in = (FILE *)handle;
- if (is_file_tty(in)) input_line_buffered = TRUE;
- bufflength = input_line_buffered?
- read_one_line(main_buffer, bufsize, in) :
- fread(main_buffer, 1, bufsize, in);
- }
-
endptr = main_buffer + bufflength;
/* Unless binary-files=text, see if we have a binary file. This uses the same
@@ -1899,18 +1906,61 @@ while (ptr < endptr)
/* Check to see if the line we are looking at extends right to the very end
of the buffer without a line terminator. This means the line is too long to
- handle. */
+ handle at the current buffer size. Until the buffer reaches its maximum size,
+ try doubling it and reading more data. */
if (endlinelength == 0 && t == main_buffer + bufsize)
{
- fprintf(stderr, "pcre2grep: line %d%s%s is too long for the internal buffer\n"
- "pcre2grep: the buffer size is %d\n"
- "pcre2grep: use the --buffer-size option to change it\n",
- linenumber,
- (filename == NULL)? "" : " of file ",
- (filename == NULL)? "" : filename,
- bufthird);
- return 2;
+ if (bufthird < max_bufthird)
+ {
+ char *new_buffer;
+ int new_bufthird = 2*bufthird;
+
+ if (new_bufthird > max_bufthird) new_bufthird = max_bufthird;
+ new_buffer = (char *)malloc(3*new_bufthird);
+
+ if (new_buffer == NULL)
+ {
+ fprintf(stderr,
+ "pcre2grep: line %d%s%s is too long for the internal buffer\n"
+ "pcre2grep: not enough memory to increase the buffer size to %d\n",
+ linenumber,
+ (filename == NULL)? "" : " of file ",
+ (filename == NULL)? "" : filename,
+ new_bufthird);
+ return 2;
+ }
+
+ /* Copy the data and adjust pointers to the new buffer location. */
+
+ memcpy(new_buffer, main_buffer, bufsize);
+ bufthird = new_bufthird;
+ bufsize = 3*bufthird;
+ ptr = new_buffer + (ptr - main_buffer);
+ lastmatchrestart = new_buffer + (lastmatchrestart - main_buffer);
+ free(main_buffer);
+ main_buffer = new_buffer;
+
+ /* Read more data into the buffer and then try to find the line ending
+ again. */
+
+ bufflength += fill_buffer(handle, frtype, main_buffer + bufflength,
+ bufsize - bufflength, input_line_buffered);
+ endptr = main_buffer + bufflength;
+ continue;
+ }
+ else
+ {
+ fprintf(stderr,
+ "pcre2grep: line %d%s%s is too long for the internal buffer\n"
+ "pcre2grep: the maximum buffer size is %d\n"
+ "pcre2grep: use the --max-buffer-size option to change it\n",
+ linenumber,
+ (filename == NULL)? "" : " of file ",
+ (filename == NULL)? "" : filename,
+ bufthird);
+ return 2;
+ }
}
/* Extra processing for Jeffrey Friedl's debugging. */
@@ -2320,8 +2370,9 @@ while (ptr < endptr)
lastmatchnumber > 0 &&
lastmatchrestart < main_buffer + bufthird)
{
+
do_after_lines(lastmatchnumber, lastmatchrestart, endptr, printname);
- lastmatchnumber = 0;
+ lastmatchnumber = 0; /* Indicates no after lines pending */
}
/* Now do the shuffle */
@@ -2329,24 +2380,8 @@ while (ptr < endptr)
memmove(main_buffer, main_buffer + bufthird, 2*bufthird);
ptr -= bufthird;
-#ifdef SUPPORT_LIBZ
- if (frtype == FR_LIBZ)
- bufflength = 2*bufthird +
- gzread (ingz, main_buffer + 2*bufthird, bufthird);
- else
-#endif
-
-#ifdef SUPPORT_LIBBZ2
- if (frtype == FR_LIBBZ2)
- bufflength = 2*bufthird +
- BZ2_bzread(inbz2, main_buffer + 2*bufthird, bufthird);
- else
-#endif
-
- bufflength = 2*bufthird +
- (input_line_buffered?
- read_one_line(main_buffer + 2*bufthird, bufthird, in) :
- fread(main_buffer + 2*bufthird, 1, bufthird, in));
+ bufflength = 2*bufthird + fill_buffer(handle, frtype,
+ main_buffer + 2*bufthird, bufthird, input_line_buffered);
endptr = main_buffer + bufflength;
/* Adjust any last match point */
@@ -3427,6 +3462,12 @@ if (jfriedl_XT != 0 || jfriedl_XR != 0)
/* Get memory for the main buffer. */
+if (bufthird <= 0)
+ {
+ fprintf(stderr, "pcre2grep: --buffer-size must be greater than zero\n");
+ goto EXIT2;
+ }
+
bufsize = 3*bufthird;
main_buffer = (char *)malloc(bufsize);
diff --git a/testdata/grepoutput b/testdata/grepoutput
index 9d41817..8e1f68c 100644
--- a/testdata/grepoutput
+++ b/testdata/grepoutput
@@ -637,8 +637,8 @@ RC=0
RC=0
---------------------------- Test 83 -----------------------------
pcre2grep: line 4 of file ./testdata/grepinput3 is too long for the internal buffer
-pcre2grep: the buffer size is 100
-pcre2grep: use the --buffer-size option to change it
+pcre2grep: the maximum buffer size is 100
+pcre2grep: use the --max-buffer-size option to change it
RC=2
---------------------------- Test 84 -----------------------------
testdata/grepinputv:fox jumps