summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-11-03 18:32:55 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-11-03 18:32:55 +0000
commit ed44c1dfe4d6a49f32fbb2927444306ccf4e0acb (patch)
tree 5d80119550dc232d4ed295fd0d2907d4627a11a6
parent 1baf641ddd67f78693280553a81a05b69cbb3fff (diff)
download pcre-ed44c1dfe4d6a49f32fbb2927444306ccf4e0acb.tar.gz
Fix broken /g code in pcretest and -g code in pcredemo.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@566 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 16
-rw-r--r-- doc/pcreapi.3 7
-rw-r--r-- doc/pcretest.1 11
-rw-r--r-- pcredemo.c 76
-rw-r--r-- pcretest.c 19
-rw-r--r-- testdata/testinput2 9
-rw-r--r-- testdata/testinput4 3
-rw-r--r-- testdata/testoutput2 21
-rw-r--r-- testdata/testoutput4 9
9 files changed, 149 insertions, 22 deletions
diff --git a/ChangeLog b/ChangeLog
index 00cff4e..d6d2ff1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -66,6 +66,22 @@ Version 8.11 10-Oct-2010
12. Added an optional parentheses number to the -o and --only-matching options
of pcregrep.
+
+13. Imitating Perl's /g action for multiple matches is tricky when the pattern
+ can match an empty string. The code to do it in pcretest and pcredemo
+ needed fixing:
+
+ (a) When the newline convention was "crlf", pcretest got it wrong, skipping
+ only one byte after an empty string match just before CRLF (this case
+ just got forgotten; "any" and "anycrlf" were OK).
+
+ (b) The pcretest code also had a bug, causing it to loop forever in UTF-8
+ mode when an empty string match preceded an ASCII character followed by
+ a non-ASCII character. (The code for advancing by one character rather
+ than one byte was nonsense.)
+
+ (c) The pcredemo.c sample program did not have any code at all to handle
+ the cases when CRLF is a valid newline sequence.
Version 8.10 25-Jun-2010
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index e94fc62..b56ee7e 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -1450,7 +1450,10 @@ the
.\" HREF
\fBpcredemo\fP
.\"
-sample program.
+sample program. In the most general case, you have to check to see if the
+newline convention recognizes CRLF as a newline, and if so, and the current
+character is CR followed by LF, advance the starting offset by two characters
+instead of one.
.sp
PCRE_NO_START_OPTIMIZE
.sp
@@ -2193,6 +2196,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 22 October 2010
+Last updated: 01 November 2010
Copyright (c) 1997-2010 University of Cambridge.
.fi
diff --git a/doc/pcretest.1 b/doc/pcretest.1
index 75c2611..37b970c 100644
--- a/doc/pcretest.1
+++ b/doc/pcretest.1
@@ -219,9 +219,12 @@ begins with a lookbehind assertion (including \eb or \eB).
If any call to \fBpcre_exec()\fP in a \fB/g\fP or \fB/G\fP sequence matches an
empty string, the next call is done with the PCRE_NOTEMPTY_ATSTART and
PCRE_ANCHORED flags set in order to search for another, non-empty, match at the
-same point. If this second match fails, the start offset is advanced by one
-character, and the normal match is retried. This imitates the way Perl handles
-such cases when using the \fB/g\fP modifier or the \fBsplit()\fP function.
+same point. If this second match fails, the start offset is advanced, and the
+normal match is retried. This imitates the way Perl handles such cases when
+using the \fB/g\fP modifier or the \fBsplit()\fP function. Normally, the start
+offset is advanced by one character, but if the newline convention recognizes
+CRLF as a newline, and the current character is CR followed by LF, an advance
+of two is used.
.
.
.SS "Other modifiers"
@@ -767,6 +770,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 22 October 2010
+Last updated: 01 November 2010
Copyright (c) 1997-2010 University of Cambridge.
.fi
diff --git a/pcredemo.c b/pcredemo.c
index d565aec..c20a3d6 100644
--- a/pcredemo.c
+++ b/pcredemo.c
@@ -50,13 +50,16 @@ const char *error;
char *pattern;
char *subject;
unsigned char *name_table;
+unsigned int option_bits;
int erroffset;
int find_all;
+int crlf_is_newline;
int namecount;
int name_entry_size;
int ovector[OVECCOUNT];
int subject_length;
int rc, i;
+int utf8;
/**************************************************************************
@@ -238,15 +241,56 @@ if (namecount <= 0) printf("No named substrings\n"); else
* subject is not a valid match; other possibilities must be tried. The *
* second flag restricts PCRE to one match attempt at the initial string *
* position. If this match succeeds, an alternative to the empty string *
-* match has been found, and we can proceed round the loop. *
+* match has been found, and we can print it and proceed round the loop, *
+* advancing by the length of whatever was found. If this match does not *
+* succeed, we still stay in the loop, advancing by just one character. *
+* In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be *
+* more than one byte. *
+* *
+* However, there is a complication concerned with newlines. When the *
+* newline convention is such that CRLF is a valid newline, we must *
+* advance by two characters rather than one. The newline convention can *
+* be set in the regex by (*CR), etc.; if not, we must find the default. *
*************************************************************************/
-if (!find_all)
+if (!find_all) /* Check for -g */
{
pcre_free(re); /* Release the memory used for the compiled pattern */
return 0; /* Finish unless -g was given */
}
+/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
+sequence. First, find the options with which the regex was compiled; extract
+the UTF-8 state, and mask off all but the newline options. */
+
+(void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits);
+utf8 = option_bits & PCRE_UTF8;
+option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
+ PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
+
+/* If no newline options were set, find the default newline convention from the
+build configuration. */
+
+if (option_bits == 0)
+ {
+ int d;
+ (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
+ /* Note that these values are always the ASCII ones, even in
+ EBCDIC environments. CR = 13, NL = 10. */
+ option_bits = (d == 13)? PCRE_NEWLINE_CR :
+ (d == 10)? PCRE_NEWLINE_LF :
+ (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
+ (d == -2)? PCRE_NEWLINE_ANYCRLF :
+ (d == -1)? PCRE_NEWLINE_ANY : 0;
+ }
+
+/* See if CRLF is a valid newline sequence. */
+
+crlf_is_newline =
+ option_bits == PCRE_NEWLINE_ANY ||
+ option_bits == PCRE_NEWLINE_CRLF ||
+ option_bits == PCRE_NEWLINE_ANYCRLF;
+
/* Loop for second and subsequent matches */
for (;;)
@@ -280,14 +324,32 @@ for (;;)
is zero, it just means we have found all possible matches, so the loop ends.
Otherwise, it means we have failed to find a non-empty-string match at a
point where there was a previous empty-string match. In this case, we do what
- Perl does: advance the matching position by one, and continue. We do this by
- setting the "end of previous match" offset, because that is picked up at the
- top of the loop as the point at which to start again. */
+ Perl does: advance the matching position by one character, and continue. We
+ do this by setting the "end of previous match" offset, because that is picked
+ up at the top of the loop as the point at which to start again.
+
+ There are two complications: (a) When CRLF is a valid newline sequence, and
+ the current position is just before it, advance by an extra byte. (b)
+ Otherwise we must ensure that we skip an entire UTF-8 character if we are in
+ UTF-8 mode. */
if (rc == PCRE_ERROR_NOMATCH)
{
- if (options == 0) break;
- ovector[1] = start_offset + 1;
+ if (options == 0) break; /* All matches found */
+ ovector[1] = start_offset + 1; /* Advance one byte */
+ if (crlf_is_newline && /* If CRLF is newline & */
+ start_offset < subject_length - 1 && /* we are at CRLF, */
+ subject[start_offset] == '\r' &&
+ subject[start_offset + 1] == '\n')
+ ovector[1] += 1; /* Advance by one more. */
+ else if (utf8) /* Otherwise, ensure we */
+ { /* advance a whole UTF-8 */
+ while (ovector[1] < subject_length) /* character. */
+ {
+ if ((subject[ovector[1]] & 0xc0) != 0x80) break;
+ ovector[1] += 1;
+ }
+ }
continue; /* Go round the loop again */
}
diff --git a/pcretest.c b/pcretest.c
index 19458e9..4800876 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -2791,11 +2791,13 @@ while (!done)
to advance the start offset, and continue. We won't be at the end of the
string - that was checked before setting g_notempty.
- Complication arises in the case when the newline option is "any" or
- "anycrlf". If the previous match was at the end of a line terminated by
- CRLF, an advance of one character just passes the \r, whereas we should
- prefer the longer newline sequence, as does the code in pcre_exec().
- Fudge the offset value to achieve this.
+ Complication arises in the case when the newline convention is "any",
+ "crlf", or "anycrlf". If the previous match was at the end of a line
+ terminated by CRLF, an advance of one character just passes the \r,
+ whereas we should prefer the longer newline sequence, as does the code in
+ pcre_exec(). Fudge the offset value to achieve this. We check for a
+ newline setting in the pattern; if none was set, use pcre_config() to
+ find the default.
Otherwise, in the case of UTF-8 matching, the advance must be one
character, not one byte. */
@@ -2820,6 +2822,7 @@ while (!done)
(d == -1)? PCRE_NEWLINE_ANY : 0;
}
if (((obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY ||
+ (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_CRLF ||
(obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANYCRLF)
&&
start_offset < len - 1 &&
@@ -2830,10 +2833,8 @@ while (!done)
{
while (start_offset + onechar < len)
{
- int tb = bptr[start_offset+onechar];
- if (tb <= 127) break;
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) onechar++;
+ if ((bptr[start_offset+onechar] & 0xc0) != 0x80) break;
+ onechar++;
}
}
use_offsets[1] = start_offset + onechar;
diff --git a/testdata/testinput2 b/testdata/testinput2
index a16b4d6..8967ba2 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -2346,6 +2346,15 @@ a random value. /Ix
a\nb
a\r\nb
a\x85b
+
+/(*ANY).*/g
+ abc\r\ndef
+
+/(*ANYCRLF).*/g
+ abc\r\ndef
+
+/(*CRLF).*/g
+ abc\r\ndef
/a\Rb/I<bsr_anycrlf>
a\rb
diff --git a/testdata/testinput4 b/testdata/testinput4
index a629387..e2bae42 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -641,4 +641,7 @@
a\x{c0}aaaa/
a\x{c0}a\x{c0}aaa/
+/A*/g8
+ AAB\x{123}BAA
+
/-- End of testinput4 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 6545cfd..33419e5 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -8787,6 +8787,27 @@ No match
No match
a\x85b
No match
+
+/(*ANY).*/g
+ abc\r\ndef
+ 0: abc
+ 0:
+ 0: def
+ 0:
+
+/(*ANYCRLF).*/g
+ abc\r\ndef
+ 0: abc
+ 0:
+ 0: def
+ 0:
+
+/(*CRLF).*/g
+ abc\r\ndef
+ 0: abc
+ 0:
+ 0: def
+ 0:
/a\Rb/I<bsr_anycrlf>
Capturing subpattern count = 0
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index ad331c7..4591026 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1119,4 +1119,13 @@ No match
0: a\x{c0}a\x{c0}
1: a\x{c0}
+/A*/g8
+ AAB\x{123}BAA
+ 0: AA
+ 0:
+ 0:
+ 0:
+ 0: AA
+ 0:
+
/-- End of testinput4 --/