summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2015-11-13 16:52:26 +0000
committer ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2015-11-13 16:52:26 +0000
commit eda6dd2c83da55fad345fb19a51ff36ceb957167 (patch)
tree bc53bea7192842262d9bb0518d24c37d57e3e2d1
parent bd8ab2aaf175c19ba75c03d599b5ade31eff62e3 (diff)
download pcre2-eda6dd2c83da55fad345fb19a51ff36ceb957167.tar.gz
Don't split CRLF in pcre2_substitute() when it's a valid newline sequence.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@430 6239d852-aaf2-0410-a92c-79f79f948069
-rw-r--r-- ChangeLog 3
-rw-r--r-- doc/pcre2api.3 9
-rw-r--r-- doc/pcre2pattern.3 20
-rw-r--r-- src/pcre2_substitute.c 16
-rw-r--r-- testdata/testinput2 17
-rw-r--r-- testdata/testoutput2 29
6 files changed, 87 insertions, 7 deletions
diff --git a/ChangeLog b/ChangeLog
index d6d57b2..e373ec9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -296,6 +296,9 @@ not dereferencing it) while handling lookbehind assertions.
87. Failure to get memory for the match data in regcomp() is now given as a
regcomp() error instead of waiting for regexec() to pick it up.
+88. In pcre2_substitute(), ensure that CRLF is not split when it is a valid
+newline sequence.
+
Version 10.20 30-June-2015
--------------------------
diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
index 9d71c93..a7452bf 100644
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@@ -1,4 +1,4 @@
-.TH PCRE2API 3 "10 November 2015" "PCRE2 10.21"
+.TH PCRE2API 3 "13 November 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.sp
@@ -2729,6 +2729,11 @@ simultaneous substitutions, as this \fBpcre2test\fP example shows:
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
function to iterate over the subject string, replacing every matching
substring. If this is not set, only the first matching substring is replaced.
+If any matched substring has zero length, after the substitution has happened,
+an attempt to find a non-empty match at the same position is performed. If this
+is not successful, the current position is advanced by one character except
+when CRLF is a valid newline sequence and the next two characters are CR, LF.
+In this case, the current position is advanced by two characters.
.P
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra processing
to be applied to the replacement string. Without this option, only the dollar
@@ -3087,6 +3092,6 @@ Cambridge, England.
.rs
.sp
.nf
-Last updated: 10 November 2015
+Last updated: 13 November 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi
diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3
index 88631e2..019a7a5 100644
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "10 November 2015" "PCRE2 10.21"
+.TH PCRE2PATTERN 3 "13 November 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
@@ -671,8 +671,8 @@ below.
This particular group matches either the two-character sequence CR followed by
LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab,
U+000B), FF (form feed, U+000C), CR (carriage return, U+000D), or NEL (next
-line, U+0085). The two-character sequence is treated as a single unit that
-cannot be split.
+line, U+0085). Because this is an atomic group, the two-character sequence is
+treated as a single unit that cannot be split.
.P
In other modes, two additional characters whose codepoints are greater than 255
are added: LS (line separator, U+2028) and PS (paragraph separator, U+2029).
@@ -1183,6 +1183,18 @@ patterns that are anchored in single line mode because all branches start with
when the \fIstartoffset\fP argument of \fBpcre2_match()\fP is non-zero. The
PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set.
.P
+When the newline convention (see
+.\" HTML <a href="#newlines">
+.\" </a>
+"Newline conventions"
+.\"
+below) recognizes the two-character sequence CRLF as a newline, this is
+preferred, even if the single characters CR and LF are also recognized as
+newlines. For example, if the newline convention is "any", a multiline mode
+circumflex matches before "xyz" in the string "abc\er\enxyz" rather than after
+CR, even though CR on its own is a valid newline. (It also matches at the very
+start of the string, of course.)
+.P
Note that the sequences \eA, \eZ, and \ez can be used to match the start and
end of the subject in both modes, and if all branches of a pattern start with
\eA it is always anchored, whether or not PCRE2_MULTILINE is set.
@@ -3413,6 +3425,6 @@ Cambridge, England.
.rs
.sp
.nf
-Last updated: 10 November 2015
+Last updated: 13 November 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi
diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c
index 9ece6f6..94a329e 100644
--- a/src/pcre2_substitute.c
+++ b/src/pcre2_substitute.c
@@ -296,8 +296,22 @@ do
if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
if (goptions == 0 || start_offset >= length) break;
+ /* Advance by one code point. Then, if CRLF is a valid newline sequence and
+ we have advanced into the middle of it, advance one more code point. In
+ other words, do not start in the middle of CRLF, even if CR and LF on their
+ own are valid newlines. */
+
save_start = start_offset++;
- if ((code->overall_options & PCRE2_UTF) != 0)
+ if (subject[start_offset-1] == CHAR_CR &&
+ code->newline_convention != PCRE2_NEWLINE_CR &&
+ code->newline_convention != PCRE2_NEWLINE_LF &&
+ start_offset < length &&
+ subject[start_offset] == CHAR_LF)
+ start_offset++;
+
+ /* Otherwise, in UTF mode, advance past any secondary code points. */
+
+ else if ((code->overall_options & PCRE2_UTF) != 0)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
diff --git a/testdata/testinput2 b/testdata/testinput2
index c6ea772..d8ded88 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -498,6 +498,9 @@
/^ab\n/Igm,aftertext
ab\nab\ncd
+/^/gm,newline=any
+ a\rb\nc\r\nxyz\=aftertext
+
/abc/I
/abc|bac/I
@@ -4659,4 +4662,18 @@ a)"xI
/(?|(a)|())/BI
+# Test CRLF handling in empty string substitutions
+
+/^$/gm,newline=anycrlf,replace=-
+ X\r\n\r\nY
+
+/^$/gm,newline=crlf,replace=-
+ X\r\n\r\nY
+
+/^$/gm,newline=any,replace=-
+ X\r\n\r\nY
+
+"(*ANYCRLF)(?m)^(.*[^0-9\r\n].*|)$"g,replace=NaN
+ 15\r\nfoo\r\n20\r\nbar\r\nbaz\r\n\r\n20
+
# End of testinput2
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 2807628..7d560c2 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -1316,6 +1316,17 @@ Subject length lower bound = 3
0: ab\x0a
0+ cd
+/^/gm,newline=any
+ a\rb\nc\r\nxyz\=aftertext
+ 0:
+ 0+ a\x0db\x0ac\x0d\x0axyz
+ 0:
+ 0+ b\x0ac\x0d\x0axyz
+ 0:
+ 0+ c\x0d\x0axyz
+ 0:
+ 0+ xyz
+
/abc/I
Capturing subpattern count = 0
First code unit = 'a'
@@ -14842,4 +14853,22 @@ Capturing subpattern count = 1
May match empty string
Subject length lower bound = 0
+# Test CRLF handling in empty string substitutions
+
+/^$/gm,newline=anycrlf,replace=-
+ X\r\n\r\nY
+ 1: X\x0d\x0a-\x0d\x0aY
+
+/^$/gm,newline=crlf,replace=-
+ X\r\n\r\nY
+ 1: X\x0d\x0a-\x0d\x0aY
+
+/^$/gm,newline=any,replace=-
+ X\r\n\r\nY
+ 1: X\x0d\x0a-\x0d\x0aY
+
+"(*ANYCRLF)(?m)^(.*[^0-9\r\n].*|)$"g,replace=NaN
+ 15\r\nfoo\r\n20\r\nbar\r\nbaz\r\n\r\n20
+ 4: 15\x0d\x0aNaN\x0d\x0a20\x0d\x0aNaN\x0d\x0aNaN\x0d\x0aNaN\x0d\x0a20
+
# End of testinput2