summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-04-16 15:28:08 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-04-16 15:28:08 +0000
commit 99c49a95b9892e71ec8cd69cd1bcb3a96964ba15 (patch)
tree 5b2b9bf5d6fb86e4d3851650956920f7557632d8
parent 88e1c64afcd55068b44b441995139eb18c36bb01 (diff)
download pcre-99c49a95b9892e71ec8cd69cd1bcb3a96964ba15.tar.gz
Add PCRE_NEWLINE_ANYCRLF.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@149 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 3
-rw-r--r-- README 28
-rwxr-xr-x RunGrepTest 5
-rw-r--r-- configure.ac 21
-rw-r--r-- doc/pcre_compile.3 1
-rw-r--r-- doc/pcre_compile2.3 1
-rw-r--r-- doc/pcre_config.3 1
-rw-r--r-- doc/pcre_dfa_exec.3 1
-rw-r--r-- doc/pcre_exec.3 1
-rw-r--r-- doc/pcreapi.3 44
-rw-r--r-- doc/pcrebuild.3 9
-rw-r--r-- doc/pcregrep.1 19
-rw-r--r-- doc/pcretest.1 30
-rwxr-xr-x maint/ManyConfigTests 1
-rw-r--r-- pcre.h.in 1
-rw-r--r-- pcre_compile.c 10
-rw-r--r-- pcre_dfa_exec.c 23
-rw-r--r-- pcre_exec.c 23
-rw-r--r-- pcre_internal.h 32
-rw-r--r-- pcre_newline.c 45
-rw-r--r-- pcregexp.pas 31
-rw-r--r-- pcregrep.c 69
-rw-r--r-- pcretest.c 24
-rw-r--r-- testdata/grepoutputN 6
-rw-r--r-- testdata/testinput2 6
-rw-r--r-- testdata/testoutput2 14
26 files changed, 313 insertions, 136 deletions
diff --git a/ChangeLog b/ChangeLog
index 49dc7a7..4696346 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -139,6 +139,9 @@ Version 7.1 12-Mar-07
20. pcretest is supposed to handle patterns and data of any length, by
extending its buffers when necessary. It was getting this wrong when the
buffer for a data line had to be extended.
+
+21. Added PCRE_NEWLINE_ANYCRLF which is like ANY, but matches only CR, LF, or
+ CRLF as a newline sequence.
Version 7.0 19-Dec-06
diff --git a/README b/README
index 38d4fe4..da2e3e0 100644
--- a/README
+++ b/README
@@ -169,18 +169,20 @@ library. You can read more about them in the pcrebuild man page.
supported.
. You can build PCRE to recognize either CR or LF or the sequence CRLF or any
- of the Unicode newline sequences as indicating the end of a line. Whatever
- you specify at build time is the default; the caller of PCRE can change the
- selection at run time. The default newline indicator is a single LF character
- (the Unix standard). You can specify the default newline indicator by adding
- --newline-is-cr or --newline-is-lf or --newline-is-crlf or --newline-is-any
- to the "configure" command, respectively.
-
- If you specify --newline-is-cr or --newline-is-crlf, some of the standard
- tests will fail, because the lines in the test files end with LF. Even if
- the files are edited to change the line endings, there are likely to be some
- failures. With --newline-is-any, many tests should succeed, but there may be
- some failures.
+ of the preceding, or any of the Unicode newline sequences as indicating the
+ end of a line. Whatever you specify at build time is the default; the caller
+ of PCRE can change the selection at run time. The default newline indicator
+ is a single LF character (the Unix standard). You can specify the default
+ newline indicator by adding --enable-newline-is-cr or --enable-newline-is-lf
+ or --enable-newline-is-crlf or --enable-newline-is-anycrlf or
+ --enable-newline-is-any to the "configure" command, respectively.
+
+ If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
+ the standard tests will fail, because the lines in the test files end with
+ LF. Even if the files are edited to change the line endings, there are likely
+ to be some failures. With --enable-newline-is-anycrlf or
+ --enable-newline-is-any, many tests should succeed, but there may be some
+ failures.
. When called via the POSIX interface, PCRE uses malloc() to get additional
storage for processing capturing parentheses if there are more than 10 of
@@ -709,4 +711,4 @@ The distribution should contain the following files:
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
-Last updated: 29 March 2007
+Last updated: 16 April 2007
diff --git a/RunGrepTest b/RunGrepTest
index 5f73798..0996e68 100755
--- a/RunGrepTest
+++ b/RunGrepTest
@@ -242,13 +242,16 @@ echo "---------------------------- Test N3 ------------------------------" >>tes
pattern=`printf 'def\rjkl'`
(cd $srcdir; $valgrind $pcregrep --newline=cr -F "$pattern" ./testdata/grepinputx) >>testtry
-echo "---------------------------- Test n$ ------------------------------" >>testtry
+echo "---------------------------- Test N4 ------------------------------" >>testtry
pattern=`printf 'xxx\r\njkl'`
(cd $srcdir; $valgrind $pcregrep --newline=crlf -F "$pattern" ./testdata/grepinputx) >>testtry
echo "---------------------------- Test N5 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -n --newline=any "^(abc|def|ghi|jkl)" ./testdata/grepinputx) >>testtry
+echo "---------------------------- Test N6 ------------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep -n --newline=anycrlf "^(abc|def|ghi|jkl)" ./testdata/grepinputx) >>testtry
+
$cf testtry $srcdir/testdata/grepoutputN
if [ $? != 0 ] ; then exit 1; fi
diff --git a/configure.ac b/configure.ac
index 4cfb682..38cb057 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,8 +8,8 @@ dnl empty.
m4_define(pcre_major, [7])
m4_define(pcre_minor, [1])
-m4_define(pcre_prerelease, [-RC4])
-m4_define(pcre_date, [2007-04-04])
+m4_define(pcre_prerelease, [-RC5])
+m4_define(pcre_date, [2007-04-16])
# Libtool shared library interface versions (current:revision:age)
m4_define(libpcre_version, [0:1:0])
@@ -71,7 +71,7 @@ AC_ARG_ENABLE(unicode-properties,
# Handle --enable-newline=NL
dnl AC_ARG_ENABLE(newline,
dnl AS_HELP_STRING([--enable-newline=NL],
-dnl [use NL as newline (lf, cr, crlf, any; default=lf)]),
+dnl [use NL as newline (lf, cr, crlf, anycrlf, any; default=lf)]),
dnl , enable_newline=lf)
# Separate newline options
@@ -88,6 +88,10 @@ AC_ARG_ENABLE(newline-is-crlf,
AS_HELP_STRING([--enable-newline-is-crlf],
[use CRLF as newline sequence]),
ac_pcre_newline=crlf)
+AC_ARG_ENABLE(newline-is-anycrlf,
+ AS_HELP_STRING([--enable-newline-is-anycrlf],
+ [use CR, LF, or CRLF as newline sequence]),
+ ac_pcre_newline=anycrlf)
AC_ARG_ENABLE(newline-is-any,
AS_HELP_STRING([--enable-newline-is-any],
[use any valid Unicode newline sequence]),
@@ -163,10 +167,11 @@ fi
# Convert the newline identifier into the appropriate integer value.
case "$enable_newline" in
- lf) ac_pcre_newline_value=10 ;;
- cr) ac_pcre_newline_value=13 ;;
- crlf) ac_pcre_newline_value=3338 ;;
- any) ac_pcre_newline_value=-1 ;;
+ lf) ac_pcre_newline_value=10 ;;
+ cr) ac_pcre_newline_value=13 ;;
+ crlf) ac_pcre_newline_value=3338 ;;
+ anycrlf) ac_pcre_newline_value=-2 ;;
+ any) ac_pcre_newline_value=-1 ;;
*)
AC_MSG_ERROR([invalid argument \"$enable_newline\" to --enable-newline option])
;;
@@ -280,7 +285,7 @@ AC_DEFINE_UNQUOTED([NEWLINE], [$ac_pcre_newline_value], [
The value of NEWLINE determines the newline character sequence. On
Unix-like systems, "configure" can be used to override the default,
which is 10. The possible values are 10 (LF), 13 (CR), 3338 (CRLF),
- or -1 (ANY).])
+ -1 (ANY), or -2 (ANYCRLF).])
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
The value of LINK_SIZE determines the number of bytes used to store
diff --git a/doc/pcre_compile.3 b/doc/pcre_compile.3
index 3eaab2f..dbab684 100644
--- a/doc/pcre_compile.3
+++ b/doc/pcre_compile.3
@@ -42,6 +42,7 @@ The option bits are:
PCRE_FIRSTLINE Force matching to be before newline
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
diff --git a/doc/pcre_compile2.3 b/doc/pcre_compile2.3
index 5d986a7..eeffdb9 100644
--- a/doc/pcre_compile2.3
+++ b/doc/pcre_compile2.3
@@ -46,6 +46,7 @@ The option bits are:
PCRE_FIRSTLINE Force matching to be before newline
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
diff --git a/doc/pcre_config.3 b/doc/pcre_config.3
index a82b083..52f26c3 100644
--- a/doc/pcre_config.3
+++ b/doc/pcre_config.3
@@ -29,6 +29,7 @@ The available codes are:
13 (0x000d) for CR
10 (0x000a) for LF
3338 (0x0d0a) for CRLF
+ -2 for ANYCRLF
-1 for ANY
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
Threshold of return slots, above
diff --git a/doc/pcre_dfa_exec.3 b/doc/pcre_dfa_exec.3
index 4e23b7e..8416d30 100644
--- a/doc/pcre_dfa_exec.3
+++ b/doc/pcre_dfa_exec.3
@@ -40,6 +40,7 @@ The options are:
.sp
PCRE_ANCHORED Match only at the first position
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
diff --git a/doc/pcre_exec.3 b/doc/pcre_exec.3
index aa321b3..497c329 100644
--- a/doc/pcre_exec.3
+++ b/doc/pcre_exec.3
@@ -35,6 +35,7 @@ The options are:
.sp
PCRE_ANCHORED Match only at the first position
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index 1163fc7..49132ea 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -221,12 +221,13 @@ documentation.
.SH NEWLINES
.rs
.sp
-PCRE supports four different conventions for indicating line breaks in
+PCRE supports five different conventions for indicating line breaks in
strings: a single CR (carriage return) character, a single LF (linefeed)
-character, the two-character sequence CRLF, or any Unicode newline sequence.
-The Unicode newline sequences are the three just mentioned, plus the single
-characters VT (vertical tab, U+000B), FF (formfeed, U+000C), NEL (next line,
-U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029).
+character, the two-character sequence CRLF, any of the three preceding, or any
+Unicode newline sequence. The Unicode newline sequences are the three just
+mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed,
+U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
+(paragraph separator, U+2029).
.P
Each of the first three conventions is used by at least one operating system as
its standard newline sequence. When PCRE is built, a default can be specified.
@@ -297,8 +298,8 @@ properties is available; otherwise it is set to zero.
.sp
The output is an integer whose value specifies the default character sequence
that is recognized as meaning "newline". The five values that are supported
-are: 10 for LF, 13 for CR, 3338 for CRLF, and -1 for ANY. The default should
-normally be the standard sequence for your operating system.
+are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF, and -1 for ANY. The
+default should normally be the standard sequence for your operating system.
.sp
PCRE_CONFIG_LINK_SIZE
.sp
@@ -532,24 +533,27 @@ occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
PCRE_NEWLINE_CR
PCRE_NEWLINE_LF
PCRE_NEWLINE_CRLF
+ PCRE_NEWLINE_ANYCRLF
PCRE_NEWLINE_ANY
.sp
These options override the default newline definition that was chosen when PCRE
was built. Setting the first or the second specifies that a newline is
indicated by a single character (CR or LF, respectively). Setting
PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
-CRLF sequence. Setting PCRE_NEWLINE_ANY specifies that any Unicode newline
-sequence should be recognized. The Unicode newline sequences are the three just
-mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed,
-U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
-(paragraph separator, U+2029). The last two are recognized only in UTF-8 mode.
+CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
+preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
+that any Unicode newline sequence should be recognized. The Unicode newline
+sequences are the three just mentioned, plus the single characters VT (vertical
+tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
+separator, U+2028), and PS (paragraph separator, U+2029). The last two are
+recognized only in UTF-8 mode.
.P
The newline setting in the options word uses three bits that are treated
-as a number, giving eight possibilities. Currently only five are used (default
-plus the four values above). This means that if you set more than one newline
+as a number, giving eight possibilities. Currently only six are used (default
+plus the five values above). This means that if you set more than one newline
option, the combination may or may not be sensible. For example,
PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
-other combinations yield unused numbers and cause an error.
+other combinations may yield unused numbers and cause an error.
.P
The only time that a line break is specially recognized when compiling a
pattern is if PCRE_EXTENDED is set, and an unescaped # outside a character
@@ -1150,6 +1154,7 @@ matching time.
PCRE_NEWLINE_CR
PCRE_NEWLINE_LF
PCRE_NEWLINE_CRLF
+ PCRE_NEWLINE_ANYCRLF
PCRE_NEWLINE_ANY
.sp
These options override the newline definition that was chosen or defaulted when
@@ -1157,9 +1162,10 @@ the pattern was compiled. For details, see the description of
\fBpcre_compile()\fP above. During matching, the newline choice affects the
behaviour of the dot, circumflex, and dollar metacharacters. It may also alter
the way the match position is advanced after a match failure for an unanchored
-pattern. When PCRE_NEWLINE_CRLF or PCRE_NEWLINE_ANY is set, and a match attempt
-fails when the current position is at a CRLF sequence, the match position is
-advanced by two characters instead of one, in other words, to after the CRLF.
+pattern. When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is
+set, and a match attempt fails when the current position is at a CRLF sequence,
+the match position is advanced by two characters instead of one, in other
+words, to after the CRLF.
.sp
PCRE_NOTBOL
.sp
@@ -1843,6 +1849,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 06 March 2007
+Last updated: 16 April 2007
Copyright (c) 1997-2007 University of Cambridge.
.fi
diff --git a/doc/pcrebuild.3 b/doc/pcrebuild.3
index 5c23ddf..f732241 100644
--- a/doc/pcrebuild.3
+++ b/doc/pcrebuild.3
@@ -83,9 +83,14 @@ character sequence CRLF. If you want this, add
.sp
to the \fBconfigure\fP command. There is a fourth option, specified by
.sp
+ --enable-newline-is-anycrlf
+.sp
+which causes PCRE to recognize any of the three sequences CR, LF, or CRLF as
+indicating a line ending. Finally, a fifth option, specified by
+.sp
--enable-newline-is-any
.sp
-which causes PCRE to recognize any Unicode newline sequence.
+causes PCRE to recognize any Unicode newline sequence.
.P
Whatever line ending convention is selected when PCRE is built can be
overridden when the library functions are called. At build time it is
@@ -249,6 +254,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 20 March 2007
+Last updated: 16 April 2007
Copyright (c) 1997-2007 University of Cambridge.
.fi
diff --git a/doc/pcregrep.1 b/doc/pcregrep.1
index b7003bb..3f34b2b 100644
--- a/doc/pcregrep.1
+++ b/doc/pcregrep.1
@@ -220,22 +220,23 @@ the previous 8K characters (or all the previous characters, if fewer than 8K)
are guaranteed to be available for lookbehind assertions.
.TP
\fB-N\fP \fInewline-type\fP, \fB--newline=\fP\fInewline-type\fP
-The PCRE library supports four different conventions for indicating
+The PCRE library supports five different conventions for indicating
the ends of lines. They are the single-character sequences CR (carriage return)
-and LF (linefeed), the two-character sequence CRLF, and an "any" convention, in
+and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
+which recognizes any of the preceding three types, and an "any" convention, in
which any Unicode line ending sequence is assumed to end a line. The Unicode
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
-PS (paragraph separator, U+0029).
+PS (paragraph separator, U+2029).
.sp
When the PCRE library is built, a default line-ending sequence is specified.
This is normally the standard sequence for the operating system. Unless
otherwise specified by this option, \fBpcregrep\fP uses the library's default.
-The possible values for this option are CR, LF, CRLF, or ANY. This makes it
-possible to use \fBpcregrep\fP on files that have come from other environments
-without having to modify their line endings. If the data that is being scanned
-does not agree with the convention set by this option, \fBpcregrep\fP may
-behave in strange ways.
+The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
+makes it possible to use \fBpcregrep\fP on files that have come from other
+environments without having to modify their line endings. If the data that is
+being scanned does not agree with the convention set by this option,
+\fBpcregrep\fP may behave in strange ways.
.TP
\fB-n\fP, \fB--line-number\fP
Precede each output line by its line number in the file, followed by a colon
@@ -387,6 +388,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 06 March 2007
+Last updated: 16 April 2007
Copyright (c) 1997-2007 University of Cambridge.
.fi
diff --git a/doc/pcretest.1 b/doc/pcretest.1
index 9236fc7..57e2e1d 100644
--- a/doc/pcretest.1
+++ b/doc/pcretest.1
@@ -157,18 +157,19 @@ effect as they do in Perl. For example:
The following table shows additional modifiers for setting PCRE options that do
not correspond to anything in Perl:
.sp
- \fB/A\fP PCRE_ANCHORED
- \fB/C\fP PCRE_AUTO_CALLOUT
- \fB/E\fP PCRE_DOLLAR_ENDONLY
- \fB/f\fP PCRE_FIRSTLINE
- \fB/J\fP PCRE_DUPNAMES
- \fB/N\fP PCRE_NO_AUTO_CAPTURE
- \fB/U\fP PCRE_UNGREEDY
- \fB/X\fP PCRE_EXTRA
- \fB/<cr>\fP PCRE_NEWLINE_CR
- \fB/<lf>\fP PCRE_NEWLINE_LF
- \fB/<crlf>\fP PCRE_NEWLINE_CRLF
- \fB/<any>\fP PCRE_NEWLINE_ANY
+ \fB/A\fP PCRE_ANCHORED
+ \fB/C\fP PCRE_AUTO_CALLOUT
+ \fB/E\fP PCRE_DOLLAR_ENDONLY
+ \fB/f\fP PCRE_FIRSTLINE
+ \fB/J\fP PCRE_DUPNAMES
+ \fB/N\fP PCRE_NO_AUTO_CAPTURE
+ \fB/U\fP PCRE_UNGREEDY
+ \fB/X\fP PCRE_EXTRA
+ \fB/<cr>\fP PCRE_NEWLINE_CR
+ \fB/<lf>\fP PCRE_NEWLINE_LF
+ \fB/<crlf>\fP PCRE_NEWLINE_CRLF
+ \fB/<anycrlf>\fP PCRE_NEWLINE_ANYCRLF
+ \fB/<any>\fP PCRE_NEWLINE_ANY
.sp
Those specifying line ending sequences are literal strings as shown. This
example sets multiline matching with CRLF as the line ending sequence:
@@ -372,6 +373,9 @@ recognized:
\e<crlf> pass the PCRE_NEWLINE_CRLF option to \fBpcre_exec()\fP
or \fBpcre_dfa_exec()\fP
.\" JOIN
+ \e<anycrlf> pass the PCRE_NEWLINE_ANYCRLF option to \fBpcre_exec()\fP
+ or \fBpcre_dfa_exec()\fP
+.\" JOIN
\e<any> pass the PCRE_NEWLINE_ANY option to \fBpcre_exec()\fP
or \fBpcre_dfa_exec()\fP
.sp
@@ -686,6 +690,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 06 March 2007
+Last updated: 16 April 2007
Copyright (c) 1997-2007 University of Cambridge.
.fi
diff --git a/maint/ManyConfigTests b/maint/ManyConfigTests
index cdf5358..0bdac28 100755
--- a/maint/ManyConfigTests
+++ b/maint/ManyConfigTests
@@ -137,6 +137,7 @@ for opts in \
"--enable-newline-is-any --disable-shared" \
"--enable-newline-is-cr --disable-shared" \
"--enable-newline-is-crlf --disable-shared" \
+ "--enable-newline-is-anycrlf --disable-shared" \
"--enable-utf8 --enable-newline-is-any --enable-unicode-properties --disable-stack-for-recursion --disable-static --disable-cpp"
do
runtest
diff --git a/pcre.h.in b/pcre.h.in
index 2f1995a..8e4e8bd 100644
--- a/pcre.h.in
+++ b/pcre.h.in
@@ -106,6 +106,7 @@ extern "C" {
#define PCRE_NEWLINE_LF 0x00200000
#define PCRE_NEWLINE_CRLF 0x00300000
#define PCRE_NEWLINE_ANY 0x00400000
+#define PCRE_NEWLINE_ANYCRLF 0x00500000
/* Exec-time and get/set-time error codes */
diff --git a/pcre_compile.c b/pcre_compile.c
index 70e30cf..46d6d8d 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -5138,7 +5138,8 @@ cd->cbits = tables + cbits_offset;
cd->ctypes = tables + ctypes_offset;
/* Handle different types of newline. The three bits give seven cases. The
-current code allows for fixed one- or two-byte sequences, plus "any". */
+current code allows for fixed one- or two-byte sequences, plus "any" and
+"anycrlf". */
switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
{
@@ -5148,10 +5149,15 @@ switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
case PCRE_NEWLINE_CR+
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
case PCRE_NEWLINE_ANY: newline = -1; break;
+ case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
}
-if (newline < 0)
+if (newline == -2)
+ {
+ cd->nltype = NLTYPE_ANYCRLF;
+ }
+else if (newline < 0)
{
cd->nltype = NLTYPE_ANY;
}
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index b972f52..40ea80c 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -2175,10 +2175,15 @@ switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)option
case PCRE_NEWLINE_CR+
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
case PCRE_NEWLINE_ANY: newline = -1; break;
+ case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
default: return PCRE_ERROR_BADNEWLINE;
}
-if (newline < 0)
+if (newline == -2)
+ {
+ md->nltype = NLTYPE_ANYCRLF;
+ }
+else if (newline < 0)
{
md->nltype = NLTYPE_ANY;
}
@@ -2309,11 +2314,12 @@ for (;;)
while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
current_subject++;
- /* If we have just passed a CR and the newline option is ANY, and we
- are now at a LF, advance the match position by one more character. */
+ /* If we have just passed a CR and the newline option is ANY or
+ ANYCRLF, and we are now at a LF, advance the match position by one more
+ character. */
if (current_subject[-1] == '\r' &&
- md->nltype == NLTYPE_ANY &&
+ (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
current_subject < end_subject &&
*current_subject == '\n')
current_subject++;
@@ -2425,11 +2431,14 @@ for (;;)
}
if (current_subject > end_subject) break;
- /* If we have just passed a CR and the newline option is CRLF or ANY, and we
- are now at a LF, advance the match position by one more character. */
+ /* If we have just passed a CR and the newline option is CRLF or ANY or
+ ANYCRLF, and we are now at a LF, advance the match position by one more
+ character. */
if (current_subject[-1] == '\r' &&
- (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
+ (md->nltype == NLTYPE_ANY ||
+ md->nltype == NLTYPE_ANYCRLF ||
+ md->nllen == 2) &&
current_subject < end_subject &&
*current_subject == '\n')
current_subject++;
diff --git a/pcre_exec.c b/pcre_exec.c
index ac38b45..cabe66c 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -3840,10 +3840,15 @@ switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)option
case PCRE_NEWLINE_CR+
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
case PCRE_NEWLINE_ANY: newline = -1; break;
+ case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
default: return PCRE_ERROR_BADNEWLINE;
}
-if (newline < 0)
+if (newline == -2)
+ {
+ md->nltype = NLTYPE_ANYCRLF;
+ }
+else if (newline < 0)
{
md->nltype = NLTYPE_ANY;
}
@@ -4019,11 +4024,12 @@ for(;;)
while (start_match <= end_subject && !WAS_NEWLINE(start_match))
start_match++;
- /* If we have just passed a CR and the newline option is ANY, and we are
- now at a LF, advance the match position by one more character. */
+ /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
+ and we are now at a LF, advance the match position by one more character.
+ */
if (start_match[-1] == '\r' &&
- md->nltype == NLTYPE_ANY &&
+ (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
start_match < end_subject &&
*start_match == '\n')
start_match++;
@@ -4142,11 +4148,14 @@ for(;;)
if (anchored || start_match > end_subject) break;
- /* If we have just passed a CR and the newline option is CRLF or ANY, and we
- are now at a LF, advance the match position by one more character. */
+ /* If we have just passed a CR and the newline option is CRLF or ANY or
+ ANYCRLF, and we are now at a LF, advance the match position by one more
+ character. */
if (start_match[-1] == '\r' &&
- (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
+ (md->nltype == NLTYPE_ANY ||
+ md->nltype == NLTYPE_ANYCRLF ||
+ md->nllen == 2) &&
start_match < end_subject &&
*start_match == '\n')
start_match++;
diff --git a/pcre_internal.h b/pcre_internal.h
index cf79dd5..c38a849 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -174,21 +174,22 @@ characters only go up to 0x7fffffff (though Unicode doesn't go beyond
#define NOTACHAR 0xffffffff
/* PCRE is able to support several different kinds of newline (CR, LF, CRLF,
-and "all" at present). The following macros are used to package up testing for
-newlines. NLBLOCK, PSSTART, and PSEND are defined in the various modules to
-indicate in which datablock the parameters exist, and what the start/end of
-string field names are. */
+"any" and "anycrlf" at present). The following macros are used to package up
+testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various
+modules to indicate in which datablock the parameters exist, and what the
+start/end of string field names are. */
-#define NLTYPE_FIXED 0 /* Newline is a fixed length string */
-#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */
+#define NLTYPE_FIXED 0 /* Newline is a fixed length string */
+#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */
+#define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */
/* This macro checks for a newline at the given position */
#define IS_NEWLINE(p) \
((NLBLOCK->nltype != NLTYPE_FIXED)? \
((p) < NLBLOCK->PSEND && \
- _pcre_is_newline((p), NLBLOCK->PSEND, &(NLBLOCK->nllen), utf8) \
- ) \
+ _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, &(NLBLOCK->nllen),\
+ utf8)) \
: \
((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
(p)[0] == NLBLOCK->nl[0] && \
@@ -201,8 +202,8 @@ string field names are. */
#define WAS_NEWLINE(p) \
((NLBLOCK->nltype != NLTYPE_FIXED)? \
((p) > NLBLOCK->PSSTART && \
- _pcre_was_newline((p), NLBLOCK->PSSTART, &(NLBLOCK->nllen), utf8) \
- ) \
+ _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
+ &(NLBLOCK->nllen), utf8)) \
: \
((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
(p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
@@ -500,7 +501,8 @@ bits. */
/* Masks for identifying the public options that are permitted at compile
time, run time, or study time, respectively. */
-#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY)
+#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \
+ PCRE_NEWLINE_ANYCRLF)
#define PUBLIC_OPTIONS \
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
@@ -1087,16 +1089,16 @@ extern const uschar _pcre_OP_lengths[];
one of the exported public functions. They have to be "external" in the C
sense, but are not part of the PCRE public API. */
-extern BOOL _pcre_is_newline(const uschar *, const uschar *, int *,
- BOOL);
+extern BOOL _pcre_is_newline(const uschar *, int, const uschar *,
+ int *, BOOL);
extern int _pcre_ord2utf8(int, uschar *);
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *);
extern int _pcre_ucp_findprop(const unsigned int, int *, int *);
extern unsigned int _pcre_ucp_othercase(const unsigned int);
extern int _pcre_valid_utf8(const uschar *, int);
-extern BOOL _pcre_was_newline(const uschar *, const uschar *, int *,
- BOOL);
+extern BOOL _pcre_was_newline(const uschar *, int, const uschar *,
+ int *, BOOL);
extern BOOL _pcre_xclass(int, const uschar *);
#endif
diff --git a/pcre_newline.c b/pcre_newline.c
index 0bfcba0..3a5db61 100644
--- a/pcre_newline.c
+++ b/pcre_newline.c
@@ -38,13 +38,12 @@ POSSIBILITY OF SUCH DAMAGE.
*/
-/* This module contains internal functions for testing newlines when more than
+/* This module contains internal functions for testing newlines when more than
one kind of newline is to be recognized. When a newline is found, its length is
-returned. In principle, we could implement several newline "types", each
-referring to a different set of newline characters. At present, PCRE supports
-only NLTYPE_FIXED, which gets handled without these functions, and NLTYPE_ALL,
-so for now the type isn't passed into the functions. It can easily be added
-later if required. The full list of Unicode newline characters is taken from
+returned. In principle, we could implement several newline "types", each
+referring to a different set of newline characters. At present, PCRE supports
+only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF,
+and NLTYPE_ANY. The full list of Unicode newline characters is taken from
http://unicode.org/unicode/reports/tr18/. */
@@ -61,6 +60,7 @@ string that is being processed.
Arguments:
ptr pointer to possible newline
+ type the newline type
endptr pointer to the end of the string
lenptr where to return the length
utf8 TRUE if in utf8 mode
@@ -69,12 +69,23 @@ Returns: TRUE or FALSE
*/
BOOL
-_pcre_is_newline(const uschar *ptr, const uschar *endptr, int *lenptr,
- BOOL utf8)
+_pcre_is_newline(const uschar *ptr, int type, const uschar *endptr,
+ int *lenptr, BOOL utf8)
{
int c;
if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
-switch(c)
+
+if (type == NLTYPE_ANYCRLF) switch(c)
+ {
+ case 0x000a: *lenptr = 1; return TRUE; /* LF */
+ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
+ return TRUE; /* CR */
+ default: return FALSE;
+ }
+
+/* NLTYPE_ANY */
+
+else switch(c)
{
case 0x000a: /* LF */
case 0x000b: /* VT */
@@ -99,6 +110,7 @@ the string that is being processed.
Arguments:
ptr pointer to possible newline
+ type the newline type
startptr pointer to the start of the string
lenptr where to return the length
utf8 TRUE if in utf8 mode
@@ -107,8 +119,8 @@ Returns: TRUE or FALSE
*/
BOOL
-_pcre_was_newline(const uschar *ptr, const uschar *startptr, int *lenptr,
- BOOL utf8)
+_pcre_was_newline(const uschar *ptr, int type, const uschar *startptr,
+ int *lenptr, BOOL utf8)
{
int c;
ptr--;
@@ -118,7 +130,16 @@ if (utf8)
GETCHAR(c, ptr);
}
else c = *ptr;
-switch(c)
+
+if (type == NLTYPE_ANYCRLF) switch(c)
+ {
+ case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1;
+ return TRUE; /* LF */
+ case 0x000d: *lenptr = 1; return TRUE; /* CR */
+ default: return FALSE;
+ }
+
+else switch(c)
{
case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1;
return TRUE; /* LF */
diff --git a/pcregexp.pas b/pcregexp.pas
index b678961..4f4d444 100644
--- a/pcregexp.pas
+++ b/pcregexp.pas
@@ -104,6 +104,7 @@ Const { Options }
PCRE_NEWLINE_LF = $00200000;
PCRE_NEWLINE_CRLF = $00300000;
PCRE_NEWLINE_ANY = $00400000;
+ PCRE_NEWLINE_ANYCRLF = $00500000;
{$ENDIF}
PCRE_COMPILE_ALLOWED_OPTIONS = PCRE_ANCHORED + PCRE_AUTO_CALLOUT + PCRE_CASELESS +
@@ -112,14 +113,14 @@ Const { Options }
PCRE_UNGREEDY + PCRE_UTF8 + PCRE_NO_UTF8_CHECK
{$IFDEF PCRE_7_0}
+ PCRE_DUPNAMES + PCRE_FIRSTLINE + PCRE_NEWLINE_CRLF
- + PCRE_NEWLINE_ANY
+ + PCRE_NEWLINE_ANY + PCRE_NEWLINE_CRLF
{$ENDIF}
;
PCRE_EXEC_ALLOWED_OPTIONS = PCRE_ANCHORED + PCRE_NOTBOL + PCRE_NOTEOL +
PCRE_NOTEMPTY + PCRE_NO_UTF8_CHECK + PCRE_PARTIAL
{$IFDEF PCRE_7_0}
- + PCRE_NEWLINE_CRLF + PCRE_NEWLINE_ANY
+ + PCRE_NEWLINE_CRLF + PCRE_NEWLINE_ANY +PCRE_NEWLINE_ANYCRLF
{$ENDIF}
;
@@ -128,7 +129,7 @@ Const { Options }
PCRE_NOTEMPTY + PCRE_NO_UTF8_CHECK + PCRE_PARTIAL +
PCRE_DFA_SHORTEST + PCRE_DFA_RESTART +
PCRE_NEWLINE_CR + PCRE_NEWLINE_LF + PCRE_NEWLINE_CRLF +
- PCRE_NEWLINE_ANY;
+ PCRE_NEWLINE_ANY + PCRE_NEWLINE_ANYCRLF;
{$ENDIF}
{ Exec-time and get/set-time error codes }
@@ -163,7 +164,7 @@ Const { Options }
{ Request types for pcre_fullinfo() }
PCRE_INFO_OPTIONS = 0;
- PCRE_INFO_SIZE = 1;
+ PCRE_INFO_SIZE = 1;
PCRE_INFO_CAPTURECOUNT = 2;
PCRE_INFO_BACKREFMAX = 3;
PCRE_INFO_FIRSTBYTE = 4;
@@ -180,9 +181,9 @@ Const { Options }
{ Request types for pcre_config() }
{$IFDEF PCRE_5_0}
- PCRE_CONFIG_UTF8 = 0;
- PCRE_CONFIG_NEWLINE = 1;
- PCRE_CONFIG_LINK_SIZE = 2;
+ PCRE_CONFIG_UTF8 = 0;
+ PCRE_CONFIG_NEWLINE = 1;
+ PCRE_CONFIG_LINK_SIZE = 2;
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD = 3;
PCRE_CONFIG_MATCH_LIMIT = 4;
PCRE_CONFIG_STACKRECURSE = 5;
@@ -194,10 +195,10 @@ Const { Options }
{ Bit flags for the pcre_extra structure }
{$IFDEF PCRE_5_0}
- PCRE_EXTRA_STUDY_DATA = $0001;
- PCRE_EXTRA_MATCH_LIMIT = $0002;
+ PCRE_EXTRA_STUDY_DATA = $0001;
+ PCRE_EXTRA_MATCH_LIMIT = $0002;
PCRE_EXTRA_CALLOUT_DATA = $0004;
- PCRE_EXTRA_TABLES = $0008;
+ PCRE_EXTRA_TABLES = $0008;
{$ENDIF PCRE_5_0}
{$IFDEF PCRE_7_0}
PCRE_EXTRA_MATCH_LIMIT_RECURSION = $0010;
@@ -214,7 +215,7 @@ remain compatible. }
type ppcre_extra = ^tpcre_extra;
tpcre_extra = record
- flags : longint; { Bits for which fields are set }
+ flags : longint; { Bits for which fields are set }
study_data : pointer; { Opaque data from pcre_study() }
match_limit : longint; { Maximum number of calls to match() }
callout_data : pointer; { Data passed back in callouts }
@@ -392,8 +393,8 @@ data is not zero. *)
// Always include the newest version of the library
{$IFDEF PCRE_3_7} {$IFNDEF PCRE_5_0} {$IFNDEF PCRE_7_0} {$L pcre37.lib} {$ENDIF PCRE_7_0} {$ENDIF PCRE_5_0} {$ENDIF PCRE_3_7}
-{$IFDEF PCRE_5_0} {$IFNDEF PCRE_7_0} {$L pcre50.lib} {$ENDIF PCRE_7_0} {$ENDIF PCRE_5_0}
-{$IFDEF PCRE_7_0} {$L pcre70.lib} {$ENDIF PCRE_7_0}
+{$IFDEF PCRE_5_0} {$IFNDEF PCRE_7_0} {$L pcre50.lib} {$ENDIF PCRE_7_0} {$ENDIF PCRE_5_0}
+{$IFDEF PCRE_7_0} {$L pcre70.lib} {$ENDIF PCRE_7_0}
{TpcRegExp}
@@ -645,7 +646,7 @@ begin
// l1:=length(PpcRegExp(P1)^.RegExp);
// l2:=length(PpcRegExp(P2)^.RegExp);
// if l1 > l2 then l:=l2 else
-// l:=l1;
+// l:=l1;
// for i:=1 to l do
// if PpcRegExp(P1).RegExp[i] <> PpcRegExp(P2).RegExp[i] then break;
// if i <=l then
@@ -658,7 +659,7 @@ begin
// l1:=length(PpcRegExp(P1)^.RegExp);
// l2:=length(SearchRegExp);
// if l1 > l2 then l:=l2 else
-// l:=l1;
+// l:=l1;
// for i:=1 to l do
// if PpcRegExp(P1).RegExp[i] <> SearchRegExp[i] then
// begin
diff --git a/pcregrep.c b/pcregrep.c
index 837bc76..a6b9c3f 100644
--- a/pcregrep.c
+++ b/pcregrep.c
@@ -88,7 +88,7 @@ enum { DEE_READ, DEE_SKIP };
/* Line ending types */
-enum { EL_LF, EL_CR, EL_CRLF, EL_ANY };
+enum { EL_LF, EL_CR, EL_CRLF, EL_ANY, EL_ANYCRLF };
@@ -196,7 +196,7 @@ static option_item optionlist[] = {
{ OP_STRING, N_LABEL, &stdin_name, "label=name", "set name for standard input" },
{ OP_STRING, N_LOCALE, &locale, "locale=locale", "use the named locale" },
{ OP_NODATA, 'M', NULL, "multiline", "run in multiline mode" },
- { OP_STRING, 'N', &newline, "newline=type", "specify newline type (CR, LR, CRLF)" },
+ { OP_STRING, 'N', &newline, "newline=type", "specify newline type (CR, LF, CRLF, ANYCRLF or ANY)" },
{ OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" },
{ OP_NODATA, 'o', NULL, "only-matching", "show only the part of the line that matched" },
{ OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" },
@@ -226,7 +226,7 @@ static const char *prefix[] = {
static const char *suffix[] = {
"", "\\b", ")$", ")$", "\\E", "\\E\\b", "\\E)$", "\\E)$" };
-/* UTF-8 tables - used only when the newline setting is "all". */
+/* UTF-8 tables - used only when the newline setting is "any". */
const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
@@ -545,6 +545,50 @@ switch(endlinetype)
}
break;
+ case EL_ANYCRLF:
+ while (p < endptr)
+ {
+ int extra = 0;
+ register int c = *((unsigned char *)p);
+
+ if (utf8 && c >= 0xc0)
+ {
+ int gcii, gcss;
+ extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */
+ gcss = 6*extra;
+ c = (c & utf8_table3[extra]) << gcss;
+ for (gcii = 1; gcii <= extra; gcii++)
+ {
+ gcss -= 6;
+ c |= (p[gcii] & 0x3f) << gcss;
+ }
+ }
+
+ p += 1 + extra;
+
+ switch (c)
+ {
+ case 0x0a: /* LF */
+ *lenptr = 1;
+ return p;
+
+ case 0x0d: /* CR */
+ if (p < endptr && *p == 0x0a)
+ {
+ *lenptr = 2;
+ p++;
+ }
+ else *lenptr = 1;
+ return p;
+
+ default:
+ break;
+ }
+ } /* End of loop for ANYCRLF case */
+
+ *lenptr = 0; /* Must have hit the end */
+ return endptr;
+
case EL_ANY:
while (p < endptr)
{
@@ -643,6 +687,7 @@ switch(endlinetype)
return p; /* But control should never get here */
case EL_ANY:
+ case EL_ANYCRLF:
if (*(--p) == '\n' && p > startptr && p[-1] == '\r') p--;
if (utf8) while ((*p & 0xc0) == 0x80) p--;
@@ -671,7 +716,17 @@ switch(endlinetype)
}
else c = *((unsigned char *)pp);
- switch (c)
+ if (endlinetype == EL_ANYCRLF) switch (c)
+ {
+ case 0x0a: /* LF */
+ case 0x0d: /* CR */
+ return p;
+
+ default:
+ break;
+ }
+
+ else switch (c)
{
case 0x0a: /* LF */
case 0x0b: /* VT */
@@ -1512,6 +1567,7 @@ switch(i)
case '\r': newline = (char *)"cr"; break;
case ('\r' << 8) | '\n': newline = (char *)"crlf"; break;
case -1: newline = (char *)"any"; break;
+ case -2: newline = (char *)"anycrlf"; break;
}
/* Process the options */
@@ -1819,6 +1875,11 @@ else if (strcmp(newline, "any") == 0 || strcmp(newline, "ANY") == 0)
pcre_options |= PCRE_NEWLINE_ANY;
endlinetype = EL_ANY;
}
+else if (strcmp(newline, "anycrlf") == 0 || strcmp(newline, "ANYCRLF") == 0)
+ {
+ pcre_options |= PCRE_NEWLINE_ANYCRLF;
+ endlinetype = EL_ANYCRLF;
+ }
else
{
fprintf(stderr, "pcregrep: Invalid newline specifier \"%s\"\n", newline);
diff --git a/pcretest.c b/pcretest.c
index 14818e7..7a6b1d3 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -663,7 +663,8 @@ return count;
*************************************************/
/* This is used both at compile and run-time to check for <xxx> escapes, where
-xxx is LF, CR, CRLF, or ANY. Print a message and return 0 if there is no match.
+xxx is LF, CR, CRLF, ANYCRLF, or ANY. Print a message and return 0 if there is
+no match.
Arguments:
p points after the leading '<'
@@ -678,6 +679,7 @@ check_newline(uschar *p, FILE *f)
if (strncmp((char *)p, "cr>", 3) == 0) return PCRE_NEWLINE_CR;
if (strncmp((char *)p, "lf>", 3) == 0) return PCRE_NEWLINE_LF;
if (strncmp((char *)p, "crlf>", 5) == 0) return PCRE_NEWLINE_CRLF;
+if (strncmp((char *)p, "anycrlf>", 8) == 0) return PCRE_NEWLINE_ANYCRLF;
if (strncmp((char *)p, "any>", 4) == 0) return PCRE_NEWLINE_ANY;
fprintf(f, "Unknown newline type at: <%s\n", p);
return 0;
@@ -850,6 +852,7 @@ while (argc > 1 && argv[op][0] == '-')
(void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
printf(" Newline sequence is %s\n", (rc == '\r')? "CR" :
(rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" :
+ (rc == -2)? "ANYCRLF" :
(rc == -1)? "ANY" : "???");
(void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
printf(" Internal link size = %d\n", rc);
@@ -1445,6 +1448,10 @@ while (!done)
fprintf(outfile, "Forced newline sequence: CRLF\n");
break;
+ case PCRE_NEWLINE_ANYCRLF:
+ fprintf(outfile, "Forced newline sequence: ANYCRLF\n");
+ break;
+
case PCRE_NEWLINE_ANY:
fprintf(outfile, "Forced newline sequence: ANY\n");
break;
@@ -2218,11 +2225,11 @@ while (!done)
to advance the start offset, and continue. We won't be at the end of the
string - that was checked before setting g_notempty.
- Complication arises in the case when the newline option is "any".
- If the previous match was at the end of a line terminated by CRLF, an
- advance of one character just passes the \r, whereas we should prefer the
- longer newline sequence, as does the code in pcre_exec(). Fudge the
- offset value to achieve this.
+ Complication arises in the case when the newline option is "any" or
+ "anycrlf". If the previous match was at the end of a line terminated by
+ CRLF, an advance of one character just passes the \r, whereas we should
+ prefer the longer newline sequence, as does the code in pcre_exec().
+ Fudge the offset value to achieve this.
Otherwise, in the case of UTF-8 matching, the advance must be one
character, not one byte. */
@@ -2241,9 +2248,12 @@ while (!done)
obits = (d == '\r')? PCRE_NEWLINE_CR :
(d == '\n')? PCRE_NEWLINE_LF :
(d == ('\r'<<8 | '\n'))? PCRE_NEWLINE_CRLF :
+ (d == -2)? PCRE_NEWLINE_ANYCRLF :
(d == -1)? PCRE_NEWLINE_ANY : 0;
}
- if ((obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY &&
+ if (((obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY ||
+ (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANYCRLF)
+ &&
start_offset < len - 1 &&
bptr[start_offset] == '\r' &&
bptr[start_offset+1] == '\n')
diff --git a/testdata/grepoutputN b/testdata/grepoutputN
index 028dd43..b0a27c6 100644
--- a/testdata/grepoutputN
+++ b/testdata/grepoutputN
@@ -10,7 +10,7 @@ ghi
jkl
This is the last line of this file.
----------------------------- Test n$ ------------------------------
+---------------------------- Test N4 ------------------------------
ghi
jkl
@@ -19,3 +19,7 @@ This is the last line of this file.
44:abc 45:def
46:ghi
47:jkl
+---------------------------- Test N6 ------------------------------
+44:abc 45:def
+46:ghi
+47:jkl
diff --git a/testdata/testinput2 b/testdata/testinput2
index dcfa77f..32d9586 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -1980,11 +1980,14 @@ a random value. /Ix
/^a.b/<lf>
a\rb
a\nb\<cr>
+ a\x85b\<anycrlf>
** Failers
a\nb
a\nb\<any>
a\rb\<cr>
a\rb\<any>
+ a\x85b\<any>
+ a\rb\<anycrlf>
/^abc./mgx<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x85abc7 \x{2028}abc8 \x{2029}abc9 JUNK
@@ -2145,4 +2148,7 @@ a random value. /Ix
/(?m)$/<any>g+
abc\r\n\r\n
+/abc.$/mgx<anycrlf>
+ abc1\x0a abc2\x0b abc3\x0c abc4\x0d abc5\x0d\x0a abc6\x85 abc7\x{2028} abc8\x{2029} abc9
+
/ End of testinput2 /
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 222b8ef..d3b620b 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -7833,6 +7833,8 @@ Matched, but too many substrings
0: a\x0db
a\nb\<cr>
0: a\x0ab
+ a\x85b\<anycrlf>
+ 0: a\x85b
** Failers
No match
a\nb
@@ -7843,6 +7845,10 @@ No match
No match
a\rb\<any>
No match
+ a\x85b\<any>
+No match
+ a\rb\<anycrlf>
+No match
/^abc./mgx<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x85abc7 \x{2028}abc8 \x{2029}abc9 JUNK
@@ -7865,7 +7871,6 @@ No match
0: abc9
/a/<cr><any>
-Failed: inconsistent NEWLINE options at offset 0
/a/<any><crlf>
Failed: inconsistent NEWLINE options at offset 0
@@ -8157,4 +8162,11 @@ No match
0:
0+
+/abc.$/mgx<anycrlf>
+ abc1\x0a abc2\x0b abc3\x0c abc4\x0d abc5\x0d\x0a abc6\x85 abc7\x{2028} abc8\x{2029} abc9
+ 0: abc1
+ 0: abc4
+ 0: abc5
+ 0: abc9
+
/ End of testinput2 /