summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2010-11-06 17:10:00 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2010-11-06 17:10:00 +0000
commit816309b6a76b4454b9e24dcd47d83960c92ad68b (patch)
treeb5f9918ce2821f54a64ad1cc9a2ccc72e50878bb
parented44c1dfe4d6a49f32fbb2927444306ccf4e0acb (diff)
downloadpcre-816309b6a76b4454b9e24dcd47d83960c92ad68b.tar.gz
Test for ridiculous values of starting offsets; tidy UTF-8 code.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@567 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog10
-rw-r--r--doc/html/index.html16
-rw-r--r--doc/html/pcre.html14
-rw-r--r--doc/html/pcre_exec.html2
-rw-r--r--doc/html/pcreapi.html76
-rw-r--r--doc/html/pcrecompat.html10
-rw-r--r--doc/html/pcredemo.html76
-rw-r--r--doc/html/pcregrep.html65
-rw-r--r--doc/html/pcrematching.html10
-rw-r--r--doc/html/pcrepartial.html115
-rw-r--r--doc/html/pcrepattern.html72
-rw-r--r--doc/html/pcretest.html27
-rw-r--r--doc/pcre.txt1339
-rw-r--r--doc/pcreapi.346
-rw-r--r--doc/pcredemo.376
-rw-r--r--doc/pcregrep.txt206
-rw-r--r--doc/pcretest.18
-rw-r--r--doc/pcretest.txt68
-rw-r--r--pcre.h.in1
-rw-r--r--pcre_dfa_exec.c9
-rw-r--r--pcre_exec.c9
-rw-r--r--pcretest.c7
-rw-r--r--testdata/testinput28
-rw-r--r--testdata/testinput58
-rw-r--r--testdata/testinput78
-rw-r--r--testdata/testinput88
-rw-r--r--testdata/testoutput214
-rw-r--r--testdata/testoutput514
-rw-r--r--testdata/testoutput717
-rw-r--r--testdata/testoutput816
30 files changed, 1486 insertions, 869 deletions
diff --git a/ChangeLog b/ChangeLog
index d6d2ff1..ace825b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -82,6 +82,16 @@ Version 8.11 10-Oct-2010
(c) The pcredemo.c sample program did not have any code at all to handle
the cases when CRLF is a valid newline sequence.
+
+14. Neither pcre_exec() nor pcre_dfa_exec() was checking that the value given
+ as a starting offset was within the subject string. There is now a new
+ error, PCRE_ERROR_BADOFFSET, which is returned if the starting offset is
+ negative or greater than the length of the string. In order to test this,
+ pcretest is extended to allow the setting of negative starting offsets.
+
+15. In both pcre_exec() and pcre_dfa_exec() the code for checking that the
+ starting offset points to the beginning of a UTF-8 character was
+ unnecessarily clumsy. I tidied it up.
Version 8.10 25-Jun-2010
diff --git a/doc/html/index.html b/doc/html/index.html
index d9af7e1..58dfe45 100644
--- a/doc/html/index.html
+++ b/doc/html/index.html
@@ -1,10 +1,10 @@
<html>
-<!-- This is a manually maintained file that is the root of the HTML version of
- the PCRE documentation. When the HTML documents are built from the man
- page versions, the entire doc/html directory is emptied, this file is then
- copied into doc/html/index.html, and the remaining files therein are
+<!-- This is a manually maintained file that is the root of the HTML version of
+ the PCRE documentation. When the HTML documents are built from the man
+ page versions, the entire doc/html directory is emptied, this file is then
+ copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
--->
+-->
<head>
<title>PCRE specification</title>
</head>
@@ -74,11 +74,11 @@ The HTML documentation for PCRE comprises the following pages:
</table>
<p>
-There are also individual pages that summarize the interface for each function
+There are also individual pages that summarize the interface for each function
in the library:
</p>
-<table>
+<table>
<tr><td><a href="pcre_compile.html">pcre_compile</a></td>
<td>&nbsp;&nbsp;Compile a regular expression</td></tr>
@@ -129,7 +129,7 @@ in the library:
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td>
<td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
-
+
<tr><td><a href="pcre_refcount.html">pcre_refcount</a></td>
<td>&nbsp;&nbsp;Maintain reference count in compiled pattern</td></tr>
diff --git a/doc/html/pcre.html b/doc/html/pcre.html
index 1395573..b7e37f0 100644
--- a/doc/html/pcre.html
+++ b/doc/html/pcre.html
@@ -260,12 +260,12 @@ test characters of any code value, but, by default, the characters that PCRE
recognizes as digits, spaces, or word characters remain the same set as before,
all with values less than 256. This remains true even when PCRE is built to
include Unicode property support, because to do otherwise would slow down PCRE
-in many common cases. Note that this also applies to \b, because it is defined
-in terms of \w and \W. If you really want to test for a wider sense of, say,
-"digit", you can use explicit Unicode property tests such as \p{Nd}.
-Alternatively, if you set the PCRE_UCP option, the way that the character
-escapes work is changed so that Unicode properties are used to determine which
-characters match. There are more details in the section on
+in many common cases. Note in particular that this applies to \b and \B,
+because they are defined in terms of \w and \W. If you really want to test
+for a wider sense of, say, "digit", you can use explicit Unicode property tests
+such as \p{Nd}. Alternatively, if you set the PCRE_UCP option, the way that
+the character escapes work is changed so that Unicode properties are used to
+determine which characters match. There are more details in the section on
<a href="pcrepattern.html#genericchartypes">generic character types</a>
in the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
@@ -307,7 +307,7 @@ two digits 10, at the domain cam.ac.uk.
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 12 May 2010
+Last updated: 22 October 2010
<br>
Copyright &copy; 1997-2010 University of Cambridge.
<br>
diff --git a/doc/html/pcre_exec.html b/doc/html/pcre_exec.html
index cee8bf4..9f4a930 100644
--- a/doc/html/pcre_exec.html
+++ b/doc/html/pcre_exec.html
@@ -64,7 +64,7 @@ The options are:
PCRE_PARTIAL ) Return PCRE_ERROR_PARTIAL for a partial
PCRE_PARTIAL_SOFT ) match if no full matches are found
PCRE_PARTIAL_HARD Return PCRE_ERROR_PARTIAL for a partial match
- even if there is a full match as well
+ if that is found before a full match
</pre>
For details of partial matching, see the
<a href="pcrepartial.html"><b>pcrepartial</b></a>
diff --git a/doc/html/pcreapi.html b/doc/html/pcreapi.html
index 366fee0..b849f52 100644
--- a/doc/html/pcreapi.html
+++ b/doc/html/pcreapi.html
@@ -1443,7 +1443,10 @@ if that fails, by advancing the starting offset (see below) and trying an
ordinary match again. There is some code that demonstrates how to do this in
the
<a href="pcredemo.html"><b>pcredemo</b></a>
-sample program.
+sample program. In the most general case, you have to check to see if the
+newline convention recognizes CRLF as a newline, and if so, and the current
+character is CR followed by LF, advance the starting offset by two characters
+instead of one.
<pre>
PCRE_NO_START_OPTIMIZE
</pre>
@@ -1501,7 +1504,8 @@ strings in the
in the main
<a href="pcre.html"><b>pcre</b></a>
page. If an invalid UTF-8 sequence of bytes is found, <b>pcre_exec()</b> returns
-the error PCRE_ERROR_BADUTF8. If <i>startoffset</i> contains an invalid value,
+the error PCRE_ERROR_BADUTF8. If <i>startoffset</i> contains a value that does
+not point to the start of a UTF-8 character (or to the end of the subject),
PCRE_ERROR_BADUTF8_OFFSET is returned.
</P>
<P>
@@ -1510,10 +1514,10 @@ checks for performance reasons, you can set the PCRE_NO_UTF8_CHECK option when
calling <b>pcre_exec()</b>. You might want to do this for the second and
subsequent calls to <b>pcre_exec()</b> if you are making repeated calls to find
all the matches in a single subject string. However, you should be sure that
-the value of <i>startoffset</i> points to the start of a UTF-8 character. When
-PCRE_NO_UTF8_CHECK is set, the effect of passing an invalid UTF-8 string as a
-subject, or a value of <i>startoffset</i> that does not point to the start of a
-UTF-8 character, is undefined. Your program may crash.
+the value of <i>startoffset</i> points to the start of a UTF-8 character (or the
+end of the subject). When PCRE_NO_UTF8_CHECK is set, the effect of passing an
+invalid UTF-8 string as a subject or an invalid value of <i>startoffset</i> is
+undefined. Your program may crash.
<pre>
PCRE_PARTIAL_HARD
PCRE_PARTIAL_SOFT
@@ -1522,12 +1526,23 @@ These options turn on the partial matching feature. For backwards
compatibility, PCRE_PARTIAL is a synonym for PCRE_PARTIAL_SOFT. A partial match
occurs if the end of the subject string is reached successfully, but there are
not enough subject characters to complete the match. If this happens when
-PCRE_PARTIAL_HARD is set, <b>pcre_exec()</b> immediately returns
-PCRE_ERROR_PARTIAL. Otherwise, if PCRE_PARTIAL_SOFT is set, matching continues
-by testing any other alternatives. Only if they all fail is PCRE_ERROR_PARTIAL
-returned (instead of PCRE_ERROR_NOMATCH). The portion of the string that
-was inspected when the partial match was found is set as the first matching
-string. There is a more detailed discussion in the
+PCRE_PARTIAL_SOFT (but not PCRE_PARTIAL_HARD) is set, matching continues by
+testing any remaining alternatives. Only if no complete match can be found is
+PCRE_ERROR_PARTIAL returned instead of PCRE_ERROR_NOMATCH. In other words,
+PCRE_PARTIAL_SOFT says that the caller is prepared to handle a partial match,
+but only if no complete match can be found.
+</P>
+<P>
+If PCRE_PARTIAL_HARD is set, it overrides PCRE_PARTIAL_SOFT. In this case, if a
+partial match is found, <b>pcre_exec()</b> immediately returns
+PCRE_ERROR_PARTIAL, without considering any other alternatives. In other words,
+when PCRE_PARTIAL_HARD is set, a partial match is considered to be more
+important that an alternative complete match.
+</P>
+<P>
+In both cases, the portion of the string that was inspected when the partial
+match was found is set as the first matching string. There is a more detailed
+discussion of partial and multi-segment matching, with examples, in the
<a href="pcrepartial.html"><b>pcrepartial</b></a>
documentation.
</P>
@@ -1537,10 +1552,15 @@ The string to be matched by <b>pcre_exec()</b>
<P>
The subject string is passed to <b>pcre_exec()</b> as a pointer in
<i>subject</i>, a length (in bytes) in <i>length</i>, and a starting byte offset
-in <i>startoffset</i>. In UTF-8 mode, the byte offset must point to the start of
-a UTF-8 character. Unlike the pattern string, the subject may contain binary
-zero bytes. When the starting offset is zero, the search for a match starts at
-the beginning of the subject, and this is by far the most common case.
+in <i>startoffset</i>. If this is negative or greater than the length of the
+subject, <b>pcre_exec()</b> returns PCRE_ERROR_BADOFFSET.
+</P>
+<P>
+In UTF-8 mode, the byte offset must point to the start of a UTF-8 character (or
+the end of the subject). Unlike the pattern string, the subject may contain
+binary zero bytes. When the starting offset is zero, the search for a match
+starts at the beginning of the subject, and this is by far the most common
+case.
</P>
<P>
A non-zero starting offset is useful when searching for another match in the
@@ -1562,6 +1582,19 @@ set to 4, it finds the second occurrence of "iss" because it is able to look
behind the starting point to discover that it is preceded by a letter.
</P>
<P>
+Finding all the matches in a subject is tricky when the pattern can match an
+empty string. It is possible to emulate Perl's /g behaviour by first trying the
+match again at the same offset, with the PCRE_NOTEMPTY_ATSTART and
+PCRE_ANCHORED options, and then if that fails, advancing the starting offset
+and trying an ordinary match again. There is some code that demonstrates how to
+do this in the
+<a href="pcredemo.html"><b>pcredemo</b></a>
+sample program. In the most general case, you have to check to see if the
+newline convention recognizes CRLF as a newline, and if so, and the current
+character is CR followed by LF, advance the starting offset by two characters
+instead of one.
+</P>
+<P>
If a non-zero starting offset is passed when the pattern is anchored, one
attempt to match at the given offset is made. This can only succeed if the
pattern does not require the match to be at the start of the subject.
@@ -1756,6 +1789,11 @@ description above.
PCRE_ERROR_BADNEWLINE (-23)
</pre>
An invalid combination of PCRE_NEWLINE_<i>xxx</i> options was given.
+<pre>
+ PCRE_ERROR_BADOFFSET (-24)
+</pre>
+The value of <i>startoffset</i> was negative or greater than the length of the
+subject, that is, the value in <i>length</i>.
</P>
<P>
Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>.
@@ -2050,6 +2088,10 @@ is converted into PCRE_ERROR_PARTIAL if the end of the subject is reached,
there have been no complete matches, but there is still at least one matching
possibility. The portion of the string that was inspected when the longest
partial match was found is set as the first matching string in both cases.
+There is a more detailed discussion of partial and multi-segment matching, with
+examples, in the
+<a href="pcrepartial.html"><b>pcrepartial</b></a>
+documentation.
<pre>
PCRE_DFA_SHORTEST
</pre>
@@ -2161,7 +2203,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC22" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 21 June 2010
+Last updated: 06 November 2010
<br>
Copyright &copy; 1997-2010 University of Cambridge.
<br>
diff --git a/doc/html/pcrecompat.html b/doc/html/pcrecompat.html
index d6c9a7b..1b25db9 100644
--- a/doc/html/pcrecompat.html
+++ b/doc/html/pcrecompat.html
@@ -18,7 +18,7 @@ DIFFERENCES BETWEEN PCRE AND PERL
<P>
This document describes the differences in the ways that PCRE and Perl handle
regular expressions. The differences described here are with respect to Perl
-5.10/5.11.
+versions 5.10 and above.
</P>
<P>
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
@@ -113,7 +113,11 @@ names map to capturing subpattern number 1. To avoid this confusing situation,
an error is given at compile time.
</P>
<P>
-12. PCRE provides some extensions to the Perl regular expression facilities.
+12. Perl recognizes comments in some places that PCRE doesn't, for example,
+between the ( and ? at the start of a subpattern.
+</P>
+<P>
+13. PCRE provides some extensions to the Perl regular expression facilities.
Perl 5.10 includes new features that are not in earlier versions of Perl, some
of which (such as named parentheses) have been in PCRE for some time. This list
is with respect to Perl 5.10:
@@ -182,7 +186,7 @@ Cambridge CB2 3QH, England.
REVISION
</b><br>
<P>
-Last updated: 12 May 2010
+Last updated: 31 October 2010
<br>
Copyright &copy; 1997-2010 University of Cambridge.
<br>
diff --git a/doc/html/pcredemo.html b/doc/html/pcredemo.html
index 6277b80..7ce6db0 100644
--- a/doc/html/pcredemo.html
+++ b/doc/html/pcredemo.html
@@ -67,13 +67,16 @@ const char *error;
char *pattern;
char *subject;
unsigned char *name_table;
+unsigned int option_bits;
int erroffset;
int find_all;
+int crlf_is_newline;
int namecount;
int name_entry_size;
int ovector[OVECCOUNT];
int subject_length;
int rc, i;
+int utf8;
/**************************************************************************
@@ -255,15 +258,56 @@ if (namecount &lt;= 0) printf("No named substrings\n"); else
* subject is not a valid match; other possibilities must be tried. The *
* second flag restricts PCRE to one match attempt at the initial string *
* position. If this match succeeds, an alternative to the empty string *
-* match has been found, and we can proceed round the loop. *
+* match has been found, and we can print it and proceed round the loop, *
+* advancing by the length of whatever was found. If this match does not *
+* succeed, we still stay in the loop, advancing by just one character. *
+* In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be *
+* more than one byte. *
+* *
+* However, there is a complication concerned with newlines. When the *
+* newline convention is such that CRLF is a valid newline, we want must *
+* advance by two characters rather than one. The newline convention can *
+* be set in the regex by (*CR), etc.; if not, we must find the default. *
*************************************************************************/
-if (!find_all)
+if (!find_all) /* Check for -g */
{
pcre_free(re); /* Release the memory used for the compiled pattern */
return 0; /* Finish unless -g was given */
}
+/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
+sequence. First, find the options with which the regex was compiled; extract
+the UTF-8 state, and mask off all but the newline options. */
+
+(void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &amp;option_bits);
+utf8 = option_bits &amp; PCRE_UTF8;
+option_bits &amp;= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
+ PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
+
+/* If no newline options were set, find the default newline convention from the
+build configuration. */
+
+if (option_bits == 0)
+ {
+ int d;
+ (void)pcre_config(PCRE_CONFIG_NEWLINE, &amp;d);
+ /* Note that these values are always the ASCII ones, even in
+ EBCDIC environments. CR = 13, NL = 10. */
+ option_bits = (d == 13)? PCRE_NEWLINE_CR :
+ (d == 10)? PCRE_NEWLINE_LF :
+ (d == (13&lt;&lt;8 | 10))? PCRE_NEWLINE_CRLF :
+ (d == -2)? PCRE_NEWLINE_ANYCRLF :
+ (d == -1)? PCRE_NEWLINE_ANY : 0;
+ }
+
+/* See if CRLF is a valid newline sequence. */
+
+crlf_is_newline =
+ option_bits == PCRE_NEWLINE_ANY ||
+ option_bits == PCRE_NEWLINE_CRLF ||
+ option_bits == PCRE_NEWLINE_ANYCRLF;
+
/* Loop for second and subsequent matches */
for (;;)
@@ -297,14 +341,32 @@ for (;;)
is zero, it just means we have found all possible matches, so the loop ends.
Otherwise, it means we have failed to find a non-empty-string match at a
point where there was a previous empty-string match. In this case, we do what
- Perl does: advance the matching position by one, and continue. We do this by
- setting the "end of previous match" offset, because that is picked up at the
- top of the loop as the point at which to start again. */
+ Perl does: advance the matching position by one character, and continue. We
+ do this by setting the "end of previous match" offset, because that is picked
+ up at the top of the loop as the point at which to start again.
+
+ There are two complications: (a) When CRLF is a valid newline sequence, and
+ the current position is just before it, advance by an extra byte. (b)
+ Otherwise we must ensure that we skip an entire UTF-8 character if we are in
+ UTF-8 mode. */
if (rc == PCRE_ERROR_NOMATCH)
{
- if (options == 0) break;
- ovector[1] = start_offset + 1;
+ if (options == 0) break; /* All matches found */
+ ovector[1] = start_offset + 1; /* Advance one byte */
+ if (crlf_is_newline &amp;&amp; /* If CRLF is newline &amp; */
+ start_offset &lt; subject_length - 1 &amp;&amp; /* we are at CRLF, */
+ subject[start_offset] == '\r' &amp;&amp;
+ subject[start_offset + 1] == '\n')
+ ovector[1] += 1; /* Advance by one more. */
+ else if (utf8) /* Otherwise, ensure we */
+ { /* advance a whole UTF-8 */
+ while (ovector[1] &lt; subject_length) /* character. */
+ {
+ if ((subject[ovector[1]] &amp; 0xc0) != 0x80) break;
+ ovector[1] += 1;
+ }
+ }
continue; /* Go round the loop again */
}
diff --git a/doc/html/pcregrep.html b/doc/html/pcregrep.html
index 73cf54c..0dcd738 100644
--- a/doc/html/pcregrep.html
+++ b/doc/html/pcregrep.html
@@ -366,6 +366,36 @@ locale is specified, the PCRE library's default (usually the "C" locale) is
used. There is no short form for this option.
</P>
<P>
+<b>--match-limit</b>=<i>number</i>
+Processing some regular expression patterns can require a very large amount of
+memory, leading in some cases to a program crash if not enough is available.
+Other patterns may take a very long time to search for all possible matching
+strings. The <b>pcre_exec()</b> function that is called by <b>pcregrep</b> to do
+the matching has two parameters that can limit the resources that it uses.
+<br>
+<br>
+The <b>--match-limit</b> option provides a means of limiting resource usage
+when processing patterns that are not going to match, but which have a very
+large number of possibilities in their search trees. The classic example is a
+pattern that uses nested unlimited repeats. Internally, PCRE uses a function
+called <b>match()</b> which it calls repeatedly (sometimes recursively). The
+limit set by \fb--match-limit\fP is imposed on the number of times this
+function is called during a match, which has the effect of limiting the amount
+of backtracking that can take place.
+<br>
+<br>
+The <b>--recursion-limit</b> option is similar to <b>--match-limit</b>, but
+instead of limiting the total number of times that <b>match()</b> is called, it
+limits the depth of recursive calls, which in turn limits the amount of memory
+that can be used. The recursion depth is a smaller number than the total number
+of calls, because not all calls to <b>match()</b> are recursive. This limit is
+of use only if it is set smaller than <b>--match-limit</b>.
+<br>
+<br>
+There are no short forms for these options. The default settings are specified
+when the PCRE library is compiled, with the default default being 10 million.
+</P>
+<P>
<b>-M</b>, <b>--multiline</b>
Allow patterns to match more than one line. When this option is given, patterns
may usefully contain literal newline characters and internal occurrences of ^
@@ -380,7 +410,7 @@ are guaranteed to be available for lookbehind assertions. This option does not
work when input is read line by line (see \fP--line-buffered\fP.)
</P>
<P>
-<b>-N</b> <i>newline-type</i>, <b>--newline=</b><i>newline-type</i>
+<b>-N</b> <i>newline-type</i>, <b>--newline</b>=<i>newline-type</i>
The PCRE library supports five different conventions for indicating
the ends of lines. They are the single-character sequences CR (carriage return)
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
@@ -409,13 +439,26 @@ output, it precedes the line number. This option is forced if
</P>
<P>
<b>-o</b>, <b>--only-matching</b>
-Show only the part of the line that matched a pattern. In this mode, no
-context is shown. That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b> options are
-ignored. If there is more than one match in a line, each of them is shown
-separately. If <b>-o</b> is combined with <b>-v</b> (invert the sense of the
-match to find non-matching lines), no output is generated, but the return code
-is set appropriately. This option is mutually exclusive with
-<b>--file-offsets</b> and <b>--line-offsets</b>.
+Show only the part of the line that matched a pattern instead of the whole
+line. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and
+<b>-C</b> options are ignored. If there is more than one match in a line, each
+of them is shown separately. If <b>-o</b> is combined with <b>-v</b> (invert the
+sense of the match to find non-matching lines), no output is generated, but the
+return code is set appropriately. If the matched portion of the line is empty,
+nothing is output unless the file name or line number are being printed, in
+which case they are shown on an otherwise empty line. This option is mutually
+exclusive with <b>--file-offsets</b> and <b>--line-offsets</b>.
+</P>
+<P>
+<b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
+Show only the part of the line that matched the capturing parentheses of the
+given number. Up to 32 capturing parentheses are supported. Because these
+options can be given without an argument (see above), if an argument is
+present, it must be given in the same shell item, for example, -o3 or
+--only-matching=2. The comments given for the non-argument case above also
+apply to this case. If the specified capturing parentheses do not exist in the
+pattern, or were not set in the match, nothing is output unless the file name
+or line number are being printed.
</P>
<P>
<b>-q</b>, <b>--quiet</b>
@@ -431,6 +474,10 @@ immediate end-of-file. This option is a shorthand for setting the <b>-d</b>
option to "recurse".
</P>
<P>
+<b>--recursion-limit</b>=<i>number</i>
+See <b>--match-limit</b> above.
+</P>
+<P>
<b>-s</b>, <b>--no-messages</b>
Suppress error messages about non-existent or unreadable files. Such files are
quietly skipped. However, the return code is still 2, even if matches were
@@ -550,7 +597,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 21 May 2010
+Last updated: 31 October 2010
<br>
Copyright &copy; 1997-2010 University of Cambridge.
<br>
diff --git a/doc/html/pcrematching.html b/doc/html/pcrematching.html
index 6ab3dfb..4d0b3fb 100644
--- a/doc/html/pcrematching.html
+++ b/doc/html/pcrematching.html
@@ -185,9 +185,11 @@ callouts.
2. Because the alternative algorithm scans the subject string just once, and
never needs to backtrack, it is possible to pass very long subject strings to
the matching function in several pieces, checking for partial matching each
-time. The
+time. It is possible to do multi-segment matching using <b>pcre_exec()</b> (by
+retaining partially matched substrings), but it is more complicated. The
<a href="pcrepartial.html"><b>pcrepartial</b></a>
-documentation gives details of partial matching.
+documentation gives details of partial matching and discusses multi-segment
+matching.
</P>
<br><a name="SEC6" href="#TOC1">DISADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
<P>
@@ -216,9 +218,9 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 29 September 2009
+Last updated: 22 October 2010
<br>
-Copyright &copy; 1997-2009 University of Cambridge.
+Copyright &copy; 1997-2010 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
diff --git a/doc/html/pcrepartial.html b/doc/html/pcrepartial.html
index 040ac88..48473c7 100644
--- a/doc/html/pcrepartial.html
+++ b/doc/html/pcrepartial.html
@@ -45,8 +45,8 @@ what has been typed so far is potentially valid, it is able to raise an error
as soon as a mistake is made, by beeping and not reflecting the character that
has been typed, for example. This immediate feedback is likely to be a better
user interface than a check that is delayed until the entire string has been
-entered. Partial matching can also sometimes be useful when the subject string
-is very long and is not all available at once.
+entered. Partial matching can also be useful when the subject string is very
+long and is not all available at once.
</P>
<P>
PCRE supports partial matching by means of the PCRE_PARTIAL_SOFT and
@@ -68,20 +68,22 @@ also disabled for partial matching.
</P>
<br><a name="SEC2" href="#TOC1">PARTIAL MATCHING USING pcre_exec()</a><br>
<P>
-A partial match occurs during a call to <b>pcre_exec()</b> whenever the end of
-the subject string is reached successfully, but matching cannot continue
-because more characters are needed. However, at least one character must have
-been matched. (In other words, a partial match can never be an empty string.)
+A partial match occurs during a call to <b>pcre_exec()</b> when the end of the
+subject string is reached successfully, but matching cannot continue because
+more characters are needed. However, at least one character in the subject must
+have been inspected. This character need not form part of the final matched
+string; lookbehind assertions and the \K escape sequence provide ways of
+inspecting characters before the start of a matched substring. The requirement
+for inspecting at least one character exists because an empty string can always
+be matched; without such a restriction there would always be a partial match of
+an empty string at the end of the subject.
</P>
<P>
-If PCRE_PARTIAL_SOFT is set, the partial match is remembered, but matching
-continues as normal, and other alternatives in the pattern are tried. If no
-complete match can be found, <b>pcre_exec()</b> returns PCRE_ERROR_PARTIAL
-instead of PCRE_ERROR_NOMATCH. If there are at least two slots in the offsets
-vector, the first of them is set to the offset of the earliest character that
-was inspected when the partial match was found. For convenience, the second
-offset points to the end of the string so that a substring can easily be
-identified.
+If there are at least two slots in the offsets vector when <b>pcre_exec()</b>
+returns with a partial match, the first slot is set to the offset of the
+earliest character that was inspected when the partial match was found. For
+convenience, the second offset points to the end of the subject so that a
+substring can easily be identified.
</P>
<P>
For the majority of patterns, the first offset identifies the start of the
@@ -94,7 +96,27 @@ inspected while carrying out the match. For example:
This pattern matches "123", but only if it is preceded by "abc". If the subject
string is "xyzabc12", the offsets after a partial match are for the substring
"abc12", because all these characters are needed if another match is tried
-with extra characters added.
+with extra characters added to the subject.
+</P>
+<P>
+What happens when a partial match is identified depends on which of the two
+partial matching options are set.
+</P>
+<br><b>
+PCRE_PARTIAL_SOFT with pcre_exec()
+</b><br>
+<P>
+If PCRE_PARTIAL_SOFT is set when <b>pcre_exec()</b> identifies a partial match,
+the partial match is remembered, but matching continues as normal, and other
+alternatives in the pattern are tried. If no complete match can be found,
+<b>pcre_exec()</b> returns PCRE_ERROR_PARTIAL instead of PCRE_ERROR_NOMATCH.
+</P>
+<P>
+This option is "soft" because it prefers a complete match over a partial match.
+All the various matching items in a pattern behave as if the subject string is
+potentially complete. For example, \z, \Z, and $ match at the end of the
+subject, as normal, and for \b and \B the end of the subject is treated as a
+non-alphanumeric.
</P>
<P>
If there is more than one partial match, the first one that was found provides
@@ -104,16 +126,29 @@ the data that is returned. Consider this pattern:
</pre>
If this is matched against the subject string "abc123dog", both
alternatives fail to match, but the end of the subject is reached during
-matching, so PCRE_ERROR_PARTIAL is returned instead of PCRE_ERROR_NOMATCH. The
-offsets are set to 3 and 9, identifying "123dog" as the first partial match
-that was found. (In this example, there are two partial matches, because "dog"
-on its own partially matches the second alternative.)
+matching, so PCRE_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
+identifying "123dog" as the first partial match that was found. (In this
+example, there are two partial matches, because "dog" on its own partially
+matches the second alternative.)
</P>
+<br><b>
+PCRE_PARTIAL_HARD with pcre_exec()
+</b><br>
<P>
If PCRE_PARTIAL_HARD is set for <b>pcre_exec()</b>, it returns
PCRE_ERROR_PARTIAL as soon as a partial match is found, without continuing to
-search for possible complete matches. The difference between the two options
-can be illustrated by a pattern such as:
+search for possible complete matches. This option is "hard" because it prefers
+an earlier partial match over a later complete match. For this reason, the
+assumption is made that the end of the supplied subject string may not be the
+true end of the available data, and so, if \z, \Z, \b, \B, or $ are
+encountered at the end of the subject, the result is PCRE_ERROR_PARTIAL.
+</P>
+<br><b>
+Comparing hard and soft partial matching
+</b><br>
+<P>
+The difference between the two partial matching options can be illustrated by a
+pattern such as:
<pre>
/dog(sbody)?/
</pre>
@@ -141,7 +176,7 @@ The <b>pcre_dfa_exec()</b> function moves along the subject string character by
character, without backtracking, searching for all possible matches
simultaneously. If the end of the subject is reached before the end of the
pattern, there is the possibility of a partial match, again provided that at
-least one character has matched.
+least one character has been inspected.
</P>
<P>
When PCRE_PARTIAL_SOFT is set, PCRE_ERROR_PARTIAL is returned only if there
@@ -262,11 +297,13 @@ From release 8.00, <b>pcre_exec()</b> can also be used to do multi-segment
matching. Unlike <b>pcre_dfa_exec()</b>, it is not possible to restart the
previous match with a new segment of data. Instead, new data must be added to
the previous subject string, and the entire match re-run, starting from the
-point where the partial match occurred. Earlier data can be discarded.
-Consider an unanchored pattern that matches dates:
+point where the partial match occurred. Earlier data can be discarded. It is
+best to use PCRE_PARTIAL_HARD in this situation, because it does not treat the
+end of a segment as the end of the subject when matching \z, \Z, \b, \B,
+and $. Consider an unanchored pattern that matches dates:
<pre>
re&#62; /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
- data&#62; The date is 23ja\P
+ data&#62; The date is 23ja\P\P
Partial match: 23ja
</pre>
At this stage, an application could discard the text preceding "23ja", add on
@@ -287,9 +324,11 @@ Certain types of pattern may give problems with multi-segment matching,
whichever matching function is used.
</P>
<P>
-1. If the pattern contains tests for the beginning or end of a line, you need
-to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropriate, when the
-subject string for any call does not contain the beginning or end of a line.
+1. If the pattern contains a test for the beginning of a line, you need to pass
+the PCRE_NOTBOL option when the subject string for any call does start at the
+beginning of a line. There is also a PCRE_NOTEOL option, but in practice when
+doing multi-segment matching you should be using PCRE_PARTIAL_HARD, which
+includes the effect of PCRE_NOTEOL.
</P>
<P>
2. Lookbehind assertions at the start of a pattern are catered for in the
@@ -305,10 +344,10 @@ always produce exactly the same result as matching over one single long string,
especially when PCRE_PARTIAL_SOFT is used. The section "Partial Matching and
Word Boundaries" above describes an issue that arises if the pattern ends with
\b or \B. Another kind of difference may occur when there are multiple
-matching possibilities, because a partial match result is given only when there
-are no completed matches. This means that as soon as the shortest match has
-been found, continuation to a new subject segment is no longer possible.
-Consider again this <b>pcretest</b> example:
+matching possibilities, because (for PCRE_PARTIAL_SOFT) a partial match result
+is given only when there are no completed matches. This means that as soon as
+the shortest match has been found, continuation to a new subject segment is no
+longer possible. Consider again this <b>pcretest</b> example:
<pre>
re&#62; /dog(sbody)?/
data&#62; dogsb\P
@@ -331,8 +370,8 @@ the other hand, if "dogsbody" is presented as a single string,
<b>pcre_dfa_exec()</b> finds both matches.
</P>
<P>
-Because of these problems, it is probably best to use PCRE_PARTIAL_HARD when
-matching multi-segment data. The example above then behaves differently:
+Because of these problems, it is best to use PCRE_PARTIAL_HARD when matching
+multi-segment data. The example above then behaves differently:
<pre>
re&#62; /dog(sbody)?/
data&#62; dogsb\P\P
@@ -368,12 +407,12 @@ problem if <b>pcre_exec()</b> is used, because the entire match has to be rerun
each time:
<pre>
re&#62; /1234|3789/
- data&#62; ABC123\P
+ data&#62; ABC123\P\P
Partial match: 123
data&#62; 1237890
0: 3789
</pre>
-Of course, instead of using PCRE_DFA_PARTIAL, the same technique of re-running
+Of course, instead of using PCRE_DFA_RESTART, the same technique of re-running
the entire match can also be used with <b>pcre_dfa_exec()</b>. Another
possibility is to work with two buffers. If a partial match at offset <i>n</i>
in the first buffer is followed by "no match" when PCRE_DFA_RESTART is used on
@@ -391,9 +430,9 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC11" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 19 October 2009
+Last updated: 22 October 2010
<br>
-Copyright &copy; 1997-2009 University of Cambridge.
+Copyright &copy; 1997-2010 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
diff --git a/doc/html/pcrepattern.html b/doc/html/pcrepattern.html
index 9d52bfc..076c4a0 100644
--- a/doc/html/pcrepattern.html
+++ b/doc/html/pcrepattern.html
@@ -99,7 +99,7 @@ alternative function, and how it differs from the normal function, are
discussed in the
<a href="pcrematching.html"><b>pcrematching</b></a>
page.
-</P>
+<a name="newlines"></a></P>
<br><a name="SEC2" href="#TOC1">NEWLINE CONVENTIONS</a><br>
<P>
PCRE supports five different conventions for indicating line breaks in
@@ -234,6 +234,7 @@ Perl, $ and @ cause variable interpolation. Note the following examples:
\Qabc\E\$\Qxyz\E abc$xyz abc$xyz
</pre>
The \Q...\E sequence is recognized both inside and outside character classes.
+An isolated \E that is not preceded by \Q is ignored.
<a name="digitsafterbackslash"></a></P>
<br><b>
Non-printing characters
@@ -1936,7 +1937,15 @@ already been matched. The two possible forms of conditional subpattern are:
</pre>
If the condition is satisfied, the yes-pattern is used; otherwise the
no-pattern (if present) is used. If there are more than two alternatives in the
-subpattern, a compile-time error occurs.
+subpattern, a compile-time error occurs. Each of the two alternatives may
+itself contain nested subpatterns of any form, including conditional
+subpatterns; the restriction to two alternatives applies only at the level of
+the condition. This pattern fragment is an example where the alternatives are
+complex:
+<pre>
+ (?(1) (A|B|C) | (D | (?(2)E|F) | E) )
+
+</PRE>
</P>
<P>
There are four kinds of condition: references to subpatterns, references to
@@ -2071,14 +2080,32 @@ dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
<a name="comments"></a></P>
<br><a name="SEC20" href="#TOC1">COMMENTS</a><br>
<P>
-The sequence (?# marks the start of a comment that continues up to the next
-closing parenthesis. Nested parentheses are not permitted. The characters
-that make up a comment play no part in the pattern matching at all.
+There are two ways of including comments in patterns that are processed by
+PCRE. In both cases, the start of the comment must not be in a character class,
+nor in the middle of any other sequence of related characters such as (?: or a
+subpattern name or number. The characters that make up a comment play no part
+in the pattern matching.
</P>
<P>
-If the PCRE_EXTENDED option is set, an unescaped # character outside a
-character class introduces a comment that continues to immediately after the
-next newline in the pattern.
+The sequence (?# marks the start of a comment that continues up to the next
+closing parenthesis. Nested parentheses are not permitted. If the PCRE_EXTENDED
+option is set, an unescaped # character also introduces a comment, which in
+this case continues to immediately after the next newline character or
+character sequence in the pattern. Which characters are interpreted as newlines
+is controlled by the options passed to <b>pcre_compile()</b> or by a special
+sequence at the start of the pattern, as described in the section entitled
+<a href="#recursion">"Newline conventions"</a>
+above. Note that end of this type of comment is a literal newline sequence in
+the pattern; escape sequences that happen to represent a newline do not count.
+For example, consider this pattern when PCRE_EXTENDED is set, and the default
+newline convention is in force:
+<pre>
+ abc #comment \n still comment
+</pre>
+On encountering the # character, <b>pcre_compile()</b> skips along, looking for
+a newline in the pattern. The sequence \n is still literal at this stage, so
+it does not terminate the comment. Only an actual character with the code value
+0x0a (the default newline) does so.
<a name="recursion"></a></P>
<br><a name="SEC21" href="#TOC1">RECURSIVE PATTERNS</a><br>
<P>
@@ -2600,10 +2627,10 @@ matching name is found, normal "bumpalong" of one character happens (the
<pre>
(*THEN) or (*THEN:NAME)
</pre>
-This verb causes a skip to the next alternation if the rest of the pattern does
-not match. That is, it cancels pending backtracking, but only within the
-current alternation. Its name comes from the observation that it can be used
-for a pattern-based if-then-else block:
+This verb causes a skip to the next alternation in the innermost enclosing
+group if the rest of the pattern does not match. That is, it cancels pending
+backtracking, but only within the current alternation. Its name comes from the
+observation that it can be used for a pattern-based if-then-else block:
<pre>
( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
</pre>
@@ -2614,6 +2641,25 @@ behaviour of (*THEN:NAME) is exactly the same as (*MARK:NAME)(*THEN) if the
overall match fails. If (*THEN) is not directly inside an alternation, it acts
like (*PRUNE).
</P>
+<P>
+The above verbs provide four different "strengths" of control when subsequent
+matching fails. (*THEN) is the weakest, carrying on the match at the next
+alternation. (*PRUNE) comes next, failing the match at the current starting
+position, but allowing an advance to the next character (for an unanchored
+pattern). (*SKIP) is similar, except that the advance may be more than one
+character. (*COMMIT) is the strongest, causing the entire match to fail.
+</P>
+<P>
+If more than one is present in a pattern, the "stongest" one wins. For example,
+consider this pattern, where A, B, etc. are complex pattern fragments:
+<pre>
+ (A(*COMMIT)B(*THEN)C|D)
+</pre>
+Once A has matched, PCRE is committed to this match, at the current starting
+position. If subsequently B matches, but C does not, the normal (*THEN) action
+of trying the next alternation (that is, D) does not happen because (*COMMIT)
+overrides.
+</P>
<br><a name="SEC26" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcreapi</b>(3), <b>pcrecallout</b>(3), <b>pcrematching</b>(3),
@@ -2630,7 +2676,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC28" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 18 May 2010
+Last updated: 31 October 2010
<br>
Copyright &copy; 1997-2010 University of Cambridge.
<br>
diff --git a/doc/html/pcretest.html b/doc/html/pcretest.html
index ce16d22..8317ba8 100644
--- a/doc/html/pcretest.html
+++ b/doc/html/pcretest.html
@@ -254,9 +254,12 @@ begins with a lookbehind assertion (including \b or \B).
If any call to <b>pcre_exec()</b> in a <b>/g</b> or <b>/G</b> sequence matches an
empty string, the next call is done with the PCRE_NOTEMPTY_ATSTART and
PCRE_ANCHORED flags set in order to search for another, non-empty, match at the
-same point. If this second match fails, the start offset is advanced by one
-character, and the normal match is retried. This imitates the way Perl handles
-such cases when using the <b>/g</b> modifier or the <b>split()</b> function.
+same point. If this second match fails, the start offset is advanced, and the
+normal match is retried. This imitates the way Perl handles such cases when
+using the <b>/g</b> modifier or the <b>split()</b> function. Normally, the start
+offset is advanced by one character, but if the newline convention recognizes
+CRLF as a newline, and the current character is CR followed by LF, an advance
+of two is used.
</P>
<br><b>
Other modifiers
@@ -412,8 +415,8 @@ recognized:
\Y pass the PCRE_NO_START_OPTIMIZE option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\Z pass the PCRE_NOTEOL option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\? pass the PCRE_NO_UTF8_CHECK option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
- \&#62;dd start the match at offset dd (any number of digits);
- this sets the <i>startoffset</i> argument for <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
+ \&#62;dd start the match at offset dd (optional "-"; then any number of digits); this sets the <i>startoffset</i>
+ argument for <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;cr&#62; pass the PCRE_NEWLINE_CR option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;lf&#62; pass the PCRE_NEWLINE_LF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;crlf&#62; pass the PCRE_NEWLINE_CRLF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
@@ -489,9 +492,11 @@ When a match succeeds, pcretest outputs the list of captured substrings that
<b>pcre_exec()</b> returns, starting with number 0 for the string that matched
the whole pattern. Otherwise, it outputs "No match" when the return is
PCRE_ERROR_NOMATCH, and "Partial match:" followed by the partially matching
-substring when <b>pcre_exec()</b> returns PCRE_ERROR_PARTIAL. For any other
-returns, it outputs the PCRE negative error number. Here is an example of an
-interactive <b>pcretest</b> run.
+substring when <b>pcre_exec()</b> returns PCRE_ERROR_PARTIAL. (Note that this is
+the entire substring that was inspected during the partial match; it may
+include characters before the actual match start if a lookbehind assertion,
+\K, \b, or \B was involved.) For any other returns, it outputs the PCRE
+negative error number. Here is an example of an interactive <b>pcretest</b> run.
<pre>
$ pcretest
PCRE version 7.0 30-Nov-2006
@@ -573,7 +578,9 @@ the subject where there is at least one match. For example:
(Using the normal matching function on this data finds only "tang".) The
longest matching string is always given first (and numbered zero). After a
PCRE_ERROR_PARTIAL return, the output is "Partial match:", followed by the
-partially matching substring.
+partially matching substring. (Note that this is the entire substring that was
+inspected during the partial match; it may include characters before the actual
+match start if a lookbehind assertion, \K, \b, or \B was involved.)
</P>
<P>
If <b>/g</b> is present on the pattern, the search for further matches resumes
@@ -740,7 +747,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 14 June 2010
+Last updated: 06 November 2010
<br>
Copyright &copy; 1997-2010 University of Cambridge.
<br>
diff --git a/doc/pcre.txt b/doc/pcre.txt
index 5f65871..c6f4e08 100644
--- a/doc/pcre.txt
+++ b/doc/pcre.txt
@@ -2,7 +2,7 @@
This file contains a concatenation of the PCRE man pages, converted to plain
text format for ease of searching with a text editor, or for use on systems
that do not have a man page processor. The small individual files that give
-synopses of each function in the library have not been included. Neither has
+synopses of each function in the library have not been included. Neither has
the pcredemo program. There are separate text files for the pcregrep and
pcretest commands.
-----------------------------------------------------------------------------
@@ -226,14 +226,14 @@ UTF-8 AND UNICODE PROPERTY SUPPORT
PCRE recognizes as digits, spaces, or word characters remain the same
set as before, all with values less than 256. This remains true even
when PCRE is built to include Unicode property support, because to do
- otherwise would slow down PCRE in many common cases. Note that this
- also applies to \b, because it is defined in terms of \w and \W. If you
- really want to test for a wider sense of, say, "digit", you can use
- explicit Unicode property tests such as \p{Nd}. Alternatively, if you
- set the PCRE_UCP option, the way that the character escapes work is
- changed so that Unicode properties are used to determine which charac-
- ters match. There are more details in the section on generic character
- types in the pcrepattern documentation.
+ otherwise would slow down PCRE in many common cases. Note in particular
+ that this applies to \b and \B, because they are defined in terms of \w
+ and \W. If you really want to test for a wider sense of, say, "digit",
+ you can use explicit Unicode property tests such as \p{Nd}. Alterna-
+ tively, if you set the PCRE_UCP option, the way that the character
+ escapes work is changed so that Unicode properties are used to deter-
+ mine which characters match. There are more details in the section on
+ generic character types in the pcrepattern documentation.
7. Similarly, characters that match the POSIX named character classes
are all low-valued characters, unless the PCRE_UCP option is set.
@@ -267,11 +267,11 @@ AUTHOR
REVISION
- Last updated: 12 May 2010
+ Last updated: 22 October 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREBUILD(3) PCREBUILD(3)
@@ -601,8 +601,8 @@ REVISION
Last updated: 29 September 2009
Copyright (c) 1997-2009 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREMATCHING(3) PCREMATCHING(3)
@@ -771,8 +771,10 @@ ADVANTAGES OF THE ALTERNATIVE ALGORITHM
2. Because the alternative algorithm scans the subject string just
once, and never needs to backtrack, it is possible to pass very long
subject strings to the matching function in several pieces, checking
- for partial matching each time. The pcrepartial documentation gives
- details of partial matching.
+ for partial matching each time. It is possible to do multi-segment
+ matching using pcre_exec() (by retaining partially matched substrings),
+ but it is more complicated. The pcrepartial documentation gives details
+ of partial matching and discusses multi-segment matching.
DISADVANTAGES OF THE ALTERNATIVE ALGORITHM
@@ -798,11 +800,11 @@ AUTHOR
REVISION
- Last updated: 29 September 2009
- Copyright (c) 1997-2009 University of Cambridge.
+ Last updated: 22 October 2010
+ Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREAPI(3) PCREAPI(3)
@@ -2123,64 +2125,68 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
set with PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED, and then if that
fails, by advancing the starting offset (see below) and trying an ordi-
nary match again. There is some code that demonstrates how to do this
- in the pcredemo sample program.
+ in the pcredemo sample program. In the most general case, you have to
+ check to see if the newline convention recognizes CRLF as a newline,
+ and if so, and the current character is CR followed by LF, advance the
+ starting offset by two characters instead of one.
PCRE_NO_START_OPTIMIZE
- There are a number of optimizations that pcre_exec() uses at the start
- of a match, in order to speed up the process. For example, if it is
+ There are a number of optimizations that pcre_exec() uses at the start
+ of a match, in order to speed up the process. For example, if it is
known that an unanchored match must start with a specific character, it
- searches the subject for that character, and fails immediately if it
- cannot find it, without actually running the main matching function.
+ searches the subject for that character, and fails immediately if it
+ cannot find it, without actually running the main matching function.
This means that a special item such as (*COMMIT) at the start of a pat-
- tern is not considered until after a suitable starting point for the
- match has been found. When callouts or (*MARK) items are in use, these
+ tern is not considered until after a suitable starting point for the
+ match has been found. When callouts or (*MARK) items are in use, these
"start-up" optimizations can cause them to be skipped if the pattern is
- never actually used. The start-up optimizations are in effect a pre-
+ never actually used. The start-up optimizations are in effect a pre-
scan of the subject that takes place before the pattern is run.
- The PCRE_NO_START_OPTIMIZE option disables the start-up optimizations,
- possibly causing performance to suffer, but ensuring that in cases
- where the result is "no match", the callouts do occur, and that items
+ The PCRE_NO_START_OPTIMIZE option disables the start-up optimizations,
+ possibly causing performance to suffer, but ensuring that in cases
+ where the result is "no match", the callouts do occur, and that items
such as (*COMMIT) and (*MARK) are considered at every possible starting
- position in the subject string. Setting PCRE_NO_START_OPTIMIZE can
+ position in the subject string. Setting PCRE_NO_START_OPTIMIZE can
change the outcome of a matching operation. Consider the pattern
(*COMMIT)ABC
- When this is compiled, PCRE records the fact that a match must start
- with the character "A". Suppose the subject string is "DEFABC". The
- start-up optimization scans along the subject, finds "A" and runs the
- first match attempt from there. The (*COMMIT) item means that the pat-
- tern must match the current starting position, which in this case, it
- does. However, if the same match is run with PCRE_NO_START_OPTIMIZE
- set, the initial scan along the subject string does not happen. The
- first match attempt is run starting from "D" and when this fails,
- (*COMMIT) prevents any further matches being tried, so the overall
- result is "no match". If the pattern is studied, more start-up opti-
- mizations may be used. For example, a minimum length for the subject
+ When this is compiled, PCRE records the fact that a match must start
+ with the character "A". Suppose the subject string is "DEFABC". The
+ start-up optimization scans along the subject, finds "A" and runs the
+ first match attempt from there. The (*COMMIT) item means that the pat-
+ tern must match the current starting position, which in this case, it
+ does. However, if the same match is run with PCRE_NO_START_OPTIMIZE
+ set, the initial scan along the subject string does not happen. The
+ first match attempt is run starting from "D" and when this fails,
+ (*COMMIT) prevents any further matches being tried, so the overall
+ result is "no match". If the pattern is studied, more start-up opti-
+ mizations may be used. For example, a minimum length for the subject
may be recorded. Consider the pattern
(*MARK:A)(X|Y)
- The minimum length for a match is one character. If the subject is
- "ABC", there will be attempts to match "ABC", "BC", "C", and then
- finally an empty string. If the pattern is studied, the final attempt
- does not take place, because PCRE knows that the subject is too short,
- and so the (*MARK) is never encountered. In this case, studying the
- pattern does not affect the overall match result, which is still "no
+ The minimum length for a match is one character. If the subject is
+ "ABC", there will be attempts to match "ABC", "BC", "C", and then
+ finally an empty string. If the pattern is studied, the final attempt
+ does not take place, because PCRE knows that the subject is too short,
+ and so the (*MARK) is never encountered. In this case, studying the
+ pattern does not affect the overall match result, which is still "no
match", but it does affect the auxiliary information that is returned.
PCRE_NO_UTF8_CHECK
When PCRE_UTF8 is set at compile time, the validity of the subject as a
- UTF-8 string is automatically checked when pcre_exec() is subsequently
- called. The value of startoffset is also checked to ensure that it
- points to the start of a UTF-8 character. There is a discussion about
- the validity of UTF-8 strings in the section on UTF-8 support in the
- main pcre page. If an invalid UTF-8 sequence of bytes is found,
- pcre_exec() returns the error PCRE_ERROR_BADUTF8. If startoffset con-
- tains an invalid value, PCRE_ERROR_BADUTF8_OFFSET is returned.
+ UTF-8 string is automatically checked when pcre_exec() is subsequently
+ called. The value of startoffset is also checked to ensure that it
+ points to the start of a UTF-8 character. There is a discussion about
+ the validity of UTF-8 strings in the section on UTF-8 support in the
+ main pcre page. If an invalid UTF-8 sequence of bytes is found,
+ pcre_exec() returns the error PCRE_ERROR_BADUTF8. If startoffset con-
+ tains a value that does not point to the start of a UTF-8 character (or
+ to the end of the subject), PCRE_ERROR_BADUTF8_OFFSET is returned.
If you already know that your subject is valid, and you want to skip
these checks for performance reasons, you can set the
@@ -2188,10 +2194,10 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
do this for the second and subsequent calls to pcre_exec() if you are
making repeated calls to find all the matches in a single subject
string. However, you should be sure that the value of startoffset
- points to the start of a UTF-8 character. When PCRE_NO_UTF8_CHECK is
- set, the effect of passing an invalid UTF-8 string as a subject, or a
- value of startoffset that does not point to the start of a UTF-8 char-
- acter, is undefined. Your program may crash.
+ points to the start of a UTF-8 character (or the end of the subject).
+ When PCRE_NO_UTF8_CHECK is set, the effect of passing an invalid UTF-8
+ string as a subject or an invalid value of startoffset is undefined.
+ Your program may crash.
PCRE_PARTIAL_HARD
PCRE_PARTIAL_SOFT
@@ -2200,43 +2206,67 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
patibility, PCRE_PARTIAL is a synonym for PCRE_PARTIAL_SOFT. A partial
match occurs if the end of the subject string is reached successfully,
but there are not enough subject characters to complete the match. If
- this happens when PCRE_PARTIAL_HARD is set, pcre_exec() immediately
- returns PCRE_ERROR_PARTIAL. Otherwise, if PCRE_PARTIAL_SOFT is set,
- matching continues by testing any other alternatives. Only if they all
- fail is PCRE_ERROR_PARTIAL returned (instead of PCRE_ERROR_NOMATCH).
- The portion of the string that was inspected when the partial match was
- found is set as the first matching string. There is a more detailed
- discussion in the pcrepartial documentation.
+ this happens when PCRE_PARTIAL_SOFT (but not PCRE_PARTIAL_HARD) is set,
+ matching continues by testing any remaining alternatives. Only if no
+ complete match can be found is PCRE_ERROR_PARTIAL returned instead of
+ PCRE_ERROR_NOMATCH. In other words, PCRE_PARTIAL_SOFT says that the
+ caller is prepared to handle a partial match, but only if no complete
+ match can be found.
+
+ If PCRE_PARTIAL_HARD is set, it overrides PCRE_PARTIAL_SOFT. In this
+ case, if a partial match is found, pcre_exec() immediately returns
+ PCRE_ERROR_PARTIAL, without considering any other alternatives. In
+ other words, when PCRE_PARTIAL_HARD is set, a partial match is consid-
+ ered to be more important that an alternative complete match.
+
+ In both cases, the portion of the string that was inspected when the
+ partial match was found is set as the first matching string. There is a
+ more detailed discussion of partial and multi-segment matching, with
+ examples, in the pcrepartial documentation.
The string to be matched by pcre_exec()
The subject string is passed to pcre_exec() as a pointer in subject, a
length (in bytes) in length, and a starting byte offset in startoffset.
+ If this is negative or greater than the length of the subject,
+ pcre_exec() returns PCRE_ERROR_BADOFFSET.
+
In UTF-8 mode, the byte offset must point to the start of a UTF-8 char-
- acter. Unlike the pattern string, the subject may contain binary zero
- bytes. When the starting offset is zero, the search for a match starts
- at the beginning of the subject, and this is by far the most common
- case.
-
- A non-zero starting offset is useful when searching for another match
- in the same subject by calling pcre_exec() again after a previous suc-
- cess. Setting startoffset differs from just passing over a shortened
- string and setting PCRE_NOTBOL in the case of a pattern that begins
+ acter (or the end of the subject). Unlike the pattern string, the sub-
+ ject may contain binary zero bytes. When the starting offset is zero,
+ the search for a match starts at the beginning of the subject, and this
+ is by far the most common case.
+
+ A non-zero starting offset is useful when searching for another match
+ in the same subject by calling pcre_exec() again after a previous suc-
+ cess. Setting startoffset differs from just passing over a shortened
+ string and setting PCRE_NOTBOL in the case of a pattern that begins
with any kind of lookbehind. For example, consider the pattern
\Biss\B
- which finds occurrences of "iss" in the middle of words. (\B matches
- only if the current position in the subject is not a word boundary.)
- When applied to the string "Mississipi" the first call to pcre_exec()
- finds the first occurrence. If pcre_exec() is called again with just
- the remainder of the subject, namely "issipi", it does not match,
+ which finds occurrences of "iss" in the middle of words. (\B matches
+ only if the current position in the subject is not a word boundary.)
+ When applied to the string "Mississipi" the first call to pcre_exec()
+ finds the first occurrence. If pcre_exec() is called again with just
+ the remainder of the subject, namely "issipi", it does not match,
because \B is always false at the start of the subject, which is deemed
- to be a word boundary. However, if pcre_exec() is passed the entire
+ to be a word boundary. However, if pcre_exec() is passed the entire
string again, but with startoffset set to 4, it finds the second occur-
- rence of "iss" because it is able to look behind the starting point to
+ rence of "iss" because it is able to look behind the starting point to
discover that it is preceded by a letter.
+ Finding all the matches in a subject is tricky when the pattern can
+ match an empty string. It is possible to emulate Perl's /g behaviour by
+ first trying the match again at the same offset, with the
+ PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED options, and then if that
+ fails, advancing the starting offset and trying an ordinary match
+ again. There is some code that demonstrates how to do this in the pcre-
+ demo sample program. In the most general case, you have to check to see
+ if the newline convention recognizes CRLF as a newline, and if so, and
+ the current character is CR followed by LF, advance the starting offset
+ by two characters instead of one.
+
If a non-zero starting offset is passed when the pattern is anchored,
one attempt to match at the given offset is made. This can only succeed
if the pattern does not require the match to be at the start of the
@@ -2420,6 +2450,11 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
An invalid combination of PCRE_NEWLINE_xxx options was given.
+ PCRE_ERROR_BADOFFSET (-24)
+
+ The value of startoffset was negative or greater than the length of the
+ subject, that is, the value in length.
+
Error numbers -16 to -20 and -22 are not used by pcre_exec().
@@ -2436,78 +2471,78 @@ EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
int pcre_get_substring_list(const char *subject,
int *ovector, int stringcount, const char ***listptr);
- Captured substrings can be accessed directly by using the offsets
- returned by pcre_exec() in ovector. For convenience, the functions
+ Captured substrings can be accessed directly by using the offsets
+ returned by pcre_exec() in ovector. For convenience, the functions
pcre_copy_substring(), pcre_get_substring(), and pcre_get_sub-
- string_list() are provided for extracting captured substrings as new,
- separate, zero-terminated strings. These functions identify substrings
- by number. The next section describes functions for extracting named
+ string_list() are provided for extracting captured substrings as new,
+ separate, zero-terminated strings. These functions identify substrings
+ by number. The next section describes functions for extracting named
substrings.
- A substring that contains a binary zero is correctly extracted and has
- a further zero added on the end, but the result is not, of course, a C
- string. However, you can process such a string by referring to the
- length that is returned by pcre_copy_substring() and pcre_get_sub-
+ A substring that contains a binary zero is correctly extracted and has
+ a further zero added on the end, but the result is not, of course, a C
+ string. However, you can process such a string by referring to the
+ length that is returned by pcre_copy_substring() and pcre_get_sub-
string(). Unfortunately, the interface to pcre_get_substring_list() is
- not adequate for handling strings containing binary zeros, because the
+ not adequate for handling strings containing binary zeros, because the
end of the final string is not independently indicated.
- The first three arguments are the same for all three of these func-
- tions: subject is the subject string that has just been successfully
+ The first three arguments are the same for all three of these func-
+ tions: subject is the subject string that has just been successfully
matched, ovector is a pointer to the vector of integer offsets that was
passed to pcre_exec(), and stringcount is the number of substrings that
- were captured by the match, including the substring that matched the
+ were captured by the match, including the substring that matched the
entire regular expression. This is the value returned by pcre_exec() if
- it is greater than zero. If pcre_exec() returned zero, indicating that
- it ran out of space in ovector, the value passed as stringcount should
+ it is greater than zero. If pcre_exec() returned zero, indicating that
+ it ran out of space in ovector, the value passed as stringcount should
be the number of elements in the vector divided by three.
- The functions pcre_copy_substring() and pcre_get_substring() extract a
- single substring, whose number is given as stringnumber. A value of
- zero extracts the substring that matched the entire pattern, whereas
- higher values extract the captured substrings. For pcre_copy_sub-
- string(), the string is placed in buffer, whose length is given by
- buffersize, while for pcre_get_substring() a new block of memory is
- obtained via pcre_malloc, and its address is returned via stringptr.
- The yield of the function is the length of the string, not including
+ The functions pcre_copy_substring() and pcre_get_substring() extract a
+ single substring, whose number is given as stringnumber. A value of
+ zero extracts the substring that matched the entire pattern, whereas
+ higher values extract the captured substrings. For pcre_copy_sub-
+ string(), the string is placed in buffer, whose length is given by
+ buffersize, while for pcre_get_substring() a new block of memory is
+ obtained via pcre_malloc, and its address is returned via stringptr.
+ The yield of the function is the length of the string, not including
the terminating zero, or one of these error codes:
PCRE_ERROR_NOMEMORY (-6)
- The buffer was too small for pcre_copy_substring(), or the attempt to
+ The buffer was too small for pcre_copy_substring(), or the attempt to
get memory failed for pcre_get_substring().
PCRE_ERROR_NOSUBSTRING (-7)
There is no substring whose number is stringnumber.
- The pcre_get_substring_list() function extracts all available sub-
- strings and builds a list of pointers to them. All this is done in a
+ The pcre_get_substring_list() function extracts all available sub-
+ strings and builds a list of pointers to them. All this is done in a
single block of memory that is obtained via pcre_malloc. The address of
- the memory block is returned via listptr, which is also the start of
- the list of string pointers. The end of the list is marked by a NULL
- pointer. The yield of the function is zero if all went well, or the
+ the memory block is returned via listptr, which is also the start of
+ the list of string pointers. The end of the list is marked by a NULL
+ pointer. The yield of the function is zero if all went well, or the
error code
PCRE_ERROR_NOMEMORY (-6)
if the attempt to get the memory block failed.
- When any of these functions encounter a substring that is unset, which
- can happen when capturing subpattern number n+1 matches some part of
- the subject, but subpattern n has not been used at all, they return an
+ When any of these functions encounter a substring that is unset, which
+ can happen when capturing subpattern number n+1 matches some part of
+ the subject, but subpattern n has not been used at all, they return an
empty string. This can be distinguished from a genuine zero-length sub-
- string by inspecting the appropriate offset in ovector, which is nega-
+ string by inspecting the appropriate offset in ovector, which is nega-
tive for unset substrings.
- The two convenience functions pcre_free_substring() and pcre_free_sub-
- string_list() can be used to free the memory returned by a previous
+ The two convenience functions pcre_free_substring() and pcre_free_sub-
+ string_list() can be used to free the memory returned by a previous
call of pcre_get_substring() or pcre_get_substring_list(), respec-
- tively. They do nothing more than call the function pointed to by
- pcre_free, which of course could be called directly from a C program.
- However, PCRE is used in some situations where it is linked via a spe-
- cial interface to another programming language that cannot use
- pcre_free directly; it is for these cases that the functions are pro-
+ tively. They do nothing more than call the function pointed to by
+ pcre_free, which of course could be called directly from a C program.
+ However, PCRE is used in some situations where it is linked via a spe-
+ cial interface to another programming language that cannot use
+ pcre_free directly; it is for these cases that the functions are pro-
vided.
@@ -2526,7 +2561,7 @@ EXTRACTING CAPTURED SUBSTRINGS BY NAME
int stringcount, const char *stringname,
const char **stringptr);
- To extract a substring by name, you first have to find associated num-
+ To extract a substring by name, you first have to find associated num-
ber. For example, for this pattern
(a+)b(?<xxx>\d+)...
@@ -2535,35 +2570,35 @@ EXTRACTING CAPTURED SUBSTRINGS BY NAME
be unique (PCRE_DUPNAMES was not set), you can find the number from the
name by calling pcre_get_stringnumber(). The first argument is the com-
piled pattern, and the second is the name. The yield of the function is
- the subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no
+ the subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no
subpattern of that name.
Given the number, you can extract the substring directly, or use one of
the functions described in the previous section. For convenience, there
are also two functions that do the whole job.
- Most of the arguments of pcre_copy_named_substring() and
- pcre_get_named_substring() are the same as those for the similarly
- named functions that extract by number. As these are described in the
- previous section, they are not re-described here. There are just two
+ Most of the arguments of pcre_copy_named_substring() and
+ pcre_get_named_substring() are the same as those for the similarly
+ named functions that extract by number. As these are described in the
+ previous section, they are not re-described here. There are just two
differences:
- First, instead of a substring number, a substring name is given. Sec-
+ First, instead of a substring number, a substring name is given. Sec-
ond, there is an extra argument, given at the start, which is a pointer
- to the compiled pattern. This is needed in order to gain access to the
+ to the compiled pattern. This is needed in order to gain access to the
name-to-number translation table.
- These functions call pcre_get_stringnumber(), and if it succeeds, they
- then call pcre_copy_substring() or pcre_get_substring(), as appropri-
- ate. NOTE: If PCRE_DUPNAMES is set and there are duplicate names, the
+ These functions call pcre_get_stringnumber(), and if it succeeds, they
+ then call pcre_copy_substring() or pcre_get_substring(), as appropri-
+ ate. NOTE: If PCRE_DUPNAMES is set and there are duplicate names, the
behaviour may not be what you want (see the next section).
Warning: If the pattern uses the (?| feature to set up multiple subpat-
- terns with the same number, as described in the section on duplicate
- subpattern numbers in the pcrepattern page, you cannot use names to
- distinguish the different subpatterns, because names are not included
- in the compiled code. The matching process uses only numbers. For this
- reason, the use of different names for subpatterns of the same number
+ terns with the same number, as described in the section on duplicate
+ subpattern numbers in the pcrepattern page, you cannot use names to
+ distinguish the different subpatterns, because names are not included
+ in the compiled code. The matching process uses only numbers. For this
+ reason, the use of different names for subpatterns of the same number
causes an error at compile time.
@@ -2572,51 +2607,51 @@ DUPLICATE SUBPATTERN NAMES
int pcre_get_stringtable_entries(const pcre *code,
const char *name, char **first, char **last);
- When a pattern is compiled with the PCRE_DUPNAMES option, names for
- subpatterns are not required to be unique. (Duplicate names are always
- allowed for subpatterns with the same number, created by using the (?|
- feature. Indeed, if such subpatterns are named, they are required to
+ When a pattern is compiled with the PCRE_DUPNAMES option, names for
+ subpatterns are not required to be unique. (Duplicate names are always
+ allowed for subpatterns with the same number, created by using the (?|
+ feature. Indeed, if such subpatterns are named, they are required to
use the same names.)
Normally, patterns with duplicate names are such that in any one match,
- only one of the named subpatterns participates. An example is shown in
+ only one of the named subpatterns participates. An example is shown in
the pcrepattern documentation.
- When duplicates are present, pcre_copy_named_substring() and
- pcre_get_named_substring() return the first substring corresponding to
- the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING
- (-7) is returned; no data is returned. The pcre_get_stringnumber()
- function returns one of the numbers that are associated with the name,
+ When duplicates are present, pcre_copy_named_substring() and
+ pcre_get_named_substring() return the first substring corresponding to
+ the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING
+ (-7) is returned; no data is returned. The pcre_get_stringnumber()
+ function returns one of the numbers that are associated with the name,
but it is not defined which it is.
- If you want to get full details of all captured substrings for a given
- name, you must use the pcre_get_stringtable_entries() function. The
+ If you want to get full details of all captured substrings for a given
+ name, you must use the pcre_get_stringtable_entries() function. The
first argument is the compiled pattern, and the second is the name. The
- third and fourth are pointers to variables which are updated by the
+ third and fourth are pointers to variables which are updated by the
function. After it has run, they point to the first and last entries in
- the name-to-number table for the given name. The function itself
- returns the length of each entry, or PCRE_ERROR_NOSUBSTRING (-7) if
- there are none. The format of the table is described above in the sec-
- tion entitled Information about a pattern. Given all the relevant
- entries for the name, you can extract each of their numbers, and hence
+ the name-to-number table for the given name. The function itself
+ returns the length of each entry, or PCRE_ERROR_NOSUBSTRING (-7) if
+ there are none. The format of the table is described above in the sec-
+ tion entitled Information about a pattern. Given all the relevant
+ entries for the name, you can extract each of their numbers, and hence
the captured data, if any.
FINDING ALL POSSIBLE MATCHES
- The traditional matching function uses a similar algorithm to Perl,
+ The traditional matching function uses a similar algorithm to Perl,
which stops when it finds the first match, starting at a given point in
- the subject. If you want to find all possible matches, or the longest
- possible match, consider using the alternative matching function (see
- below) instead. If you cannot use the alternative function, but still
- need to find all possible matches, you can kludge it up by making use
+ the subject. If you want to find all possible matches, or the longest
+ possible match, consider using the alternative matching function (see
+ below) instead. If you cannot use the alternative function, but still
+ need to find all possible matches, you can kludge it up by making use
of the callout facility, which is described in the pcrecallout documen-
tation.
What you have to do is to insert a callout right at the end of the pat-
- tern. When your callout function is called, extract and save the cur-
- rent matched substring. Then return 1, which forces pcre_exec() to
- backtrack and try other alternatives. Ultimately, when it runs out of
+ tern. When your callout function is called, extract and save the cur-
+ rent matched substring. Then return 1, which forces pcre_exec() to
+ backtrack and try other alternatives. Ultimately, when it runs out of
matches, pcre_exec() will yield PCRE_ERROR_NOMATCH.
@@ -2627,26 +2662,26 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
int options, int *ovector, int ovecsize,
int *workspace, int wscount);
- The function pcre_dfa_exec() is called to match a subject string
- against a compiled pattern, using a matching algorithm that scans the
- subject string just once, and does not backtrack. This has different
- characteristics to the normal algorithm, and is not compatible with
- Perl. Some of the features of PCRE patterns are not supported. Never-
- theless, there are times when this kind of matching can be useful. For
- a discussion of the two matching algorithms, and a list of features
- that pcre_dfa_exec() does not support, see the pcrematching documenta-
+ The function pcre_dfa_exec() is called to match a subject string
+ against a compiled pattern, using a matching algorithm that scans the
+ subject string just once, and does not backtrack. This has different
+ characteristics to the normal algorithm, and is not compatible with
+ Perl. Some of the features of PCRE patterns are not supported. Never-
+ theless, there are times when this kind of matching can be useful. For
+ a discussion of the two matching algorithms, and a list of features
+ that pcre_dfa_exec() does not support, see the pcrematching documenta-
tion.
- The arguments for the pcre_dfa_exec() function are the same as for
+ The arguments for the pcre_dfa_exec() function are the same as for
pcre_exec(), plus two extras. The ovector argument is used in a differ-
- ent way, and this is described below. The other common arguments are
- used in the same way as for pcre_exec(), so their description is not
+ ent way, and this is described below. The other common arguments are
+ used in the same way as for pcre_exec(), so their description is not
repeated here.
- The two additional arguments provide workspace for the function. The
- workspace vector should contain at least 20 elements. It is used for
+ The two additional arguments provide workspace for the function. The
+ workspace vector should contain at least 20 elements. It is used for
keeping track of multiple paths through the pattern tree. More
- workspace will be needed for patterns and subjects where there are a
+ workspace will be needed for patterns and subjects where there are a
lot of potential matches.
Here is an example of a simple call to pcre_dfa_exec():
@@ -2668,53 +2703,55 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
Option bits for pcre_dfa_exec()
- The unused bits of the options argument for pcre_dfa_exec() must be
- zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEW-
+ The unused bits of the options argument for pcre_dfa_exec() must be
+ zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEW-
LINE_xxx, PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY,
- PCRE_NOTEMPTY_ATSTART, PCRE_NO_UTF8_CHECK, PCRE_BSR_ANYCRLF,
- PCRE_BSR_UNICODE, PCRE_NO_START_OPTIMIZE, PCRE_PARTIAL_HARD, PCRE_PAR-
- TIAL_SOFT, PCRE_DFA_SHORTEST, and PCRE_DFA_RESTART. All but the last
- four of these are exactly the same as for pcre_exec(), so their
+ PCRE_NOTEMPTY_ATSTART, PCRE_NO_UTF8_CHECK, PCRE_BSR_ANYCRLF,
+ PCRE_BSR_UNICODE, PCRE_NO_START_OPTIMIZE, PCRE_PARTIAL_HARD, PCRE_PAR-
+ TIAL_SOFT, PCRE_DFA_SHORTEST, and PCRE_DFA_RESTART. All but the last
+ four of these are exactly the same as for pcre_exec(), so their
description is not repeated here.
PCRE_PARTIAL_HARD
PCRE_PARTIAL_SOFT
- These have the same general effect as they do for pcre_exec(), but the
- details are slightly different. When PCRE_PARTIAL_HARD is set for
- pcre_dfa_exec(), it returns PCRE_ERROR_PARTIAL if the end of the sub-
- ject is reached and there is still at least one matching possibility
+ These have the same general effect as they do for pcre_exec(), but the
+ details are slightly different. When PCRE_PARTIAL_HARD is set for
+ pcre_dfa_exec(), it returns PCRE_ERROR_PARTIAL if the end of the sub-
+ ject is reached and there is still at least one matching possibility
that requires additional characters. This happens even if some complete
matches have also been found. When PCRE_PARTIAL_SOFT is set, the return
code PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if the end
- of the subject is reached, there have been no complete matches, but
- there is still at least one matching possibility. The portion of the
- string that was inspected when the longest partial match was found is
- set as the first matching string in both cases.
+ of the subject is reached, there have been no complete matches, but
+ there is still at least one matching possibility. The portion of the
+ string that was inspected when the longest partial match was found is
+ set as the first matching string in both cases. There is a more
+ detailed discussion of partial and multi-segment matching, with exam-
+ ples, in the pcrepartial documentation.
PCRE_DFA_SHORTEST
- Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to
+ Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to
stop as soon as it has found one match. Because of the way the alterna-
- tive algorithm works, this is necessarily the shortest possible match
+ tive algorithm works, this is necessarily the shortest possible match
at the first possible matching point in the subject string.
PCRE_DFA_RESTART
When pcre_dfa_exec() returns a partial match, it is possible to call it
- again, with additional subject characters, and have it continue with
- the same match. The PCRE_DFA_RESTART option requests this action; when
- it is set, the workspace and wscount options must reference the same
- vector as before because data about the match so far is left in them
+ again, with additional subject characters, and have it continue with
+ the same match. The PCRE_DFA_RESTART option requests this action; when
+ it is set, the workspace and wscount options must reference the same
+ vector as before because data about the match so far is left in them
after a partial match. There is more discussion of this facility in the
pcrepartial documentation.
Successful returns from pcre_dfa_exec()
- When pcre_dfa_exec() succeeds, it may have matched more than one sub-
+ When pcre_dfa_exec() succeeds, it may have matched more than one sub-
string in the subject. Note, however, that all the matches from one run
- of the function start at the same point in the subject. The shorter
- matches are all initial substrings of the longer matches. For example,
+ of the function start at the same point in the subject. The shorter
+ matches are all initial substrings of the longer matches. For example,
if the pattern
<.*>
@@ -2729,61 +2766,61 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
<something> <something else>
<something> <something else> <something further>
- On success, the yield of the function is a number greater than zero,
- which is the number of matched substrings. The substrings themselves
- are returned in ovector. Each string uses two elements; the first is
- the offset to the start, and the second is the offset to the end. In
- fact, all the strings have the same start offset. (Space could have
- been saved by giving this only once, but it was decided to retain some
- compatibility with the way pcre_exec() returns data, even though the
+ On success, the yield of the function is a number greater than zero,
+ which is the number of matched substrings. The substrings themselves
+ are returned in ovector. Each string uses two elements; the first is
+ the offset to the start, and the second is the offset to the end. In
+ fact, all the strings have the same start offset. (Space could have
+ been saved by giving this only once, but it was decided to retain some
+ compatibility with the way pcre_exec() returns data, even though the
meaning of the strings is different.)
The strings are returned in reverse order of length; that is, the long-
- est matching string is given first. If there were too many matches to
- fit into ovector, the yield of the function is zero, and the vector is
+ est matching string is given first. If there were too many matches to
+ fit into ovector, the yield of the function is zero, and the vector is
filled with the longest matches.
Error returns from pcre_dfa_exec()
- The pcre_dfa_exec() function returns a negative number when it fails.
- Many of the errors are the same as for pcre_exec(), and these are
- described above. There are in addition the following errors that are
+ The pcre_dfa_exec() function returns a negative number when it fails.
+ Many of the errors are the same as for pcre_exec(), and these are
+ described above. There are in addition the following errors that are
specific to pcre_dfa_exec():
PCRE_ERROR_DFA_UITEM (-16)
- This return is given if pcre_dfa_exec() encounters an item in the pat-
- tern that it does not support, for instance, the use of \C or a back
+ This return is given if pcre_dfa_exec() encounters an item in the pat-
+ tern that it does not support, for instance, the use of \C or a back
reference.
PCRE_ERROR_DFA_UCOND (-17)
- This return is given if pcre_dfa_exec() encounters a condition item
- that uses a back reference for the condition, or a test for recursion
+ This return is given if pcre_dfa_exec() encounters a condition item
+ that uses a back reference for the condition, or a test for recursion
in a specific group. These are not supported.
PCRE_ERROR_DFA_UMLIMIT (-18)
- This return is given if pcre_dfa_exec() is called with an extra block
+ This return is given if pcre_dfa_exec() is called with an extra block
that contains a setting of the match_limit field. This is not supported
(it is meaningless).
PCRE_ERROR_DFA_WSSIZE (-19)
- This return is given if pcre_dfa_exec() runs out of space in the
+ This return is given if pcre_dfa_exec() runs out of space in the
workspace vector.
PCRE_ERROR_DFA_RECURSE (-20)
- When a recursive subpattern is processed, the matching function calls
- itself recursively, using private vectors for ovector and workspace.
- This error is given if the output vector is not large enough. This
+ When a recursive subpattern is processed, the matching function calls
+ itself recursively, using private vectors for ovector and workspace.
+ This error is given if the output vector is not large enough. This
should be extremely rare, as a vector of size 1000 is used.
SEE ALSO
- pcrebuild(3), pcrecallout(3), pcrecpp(3)(3), pcrematching(3), pcrepar-
+ pcrebuild(3), pcrecallout(3), pcrecpp(3)(3), pcrematching(3), pcrepar-
tial(3), pcreposix(3), pcreprecompile(3), pcresample(3), pcrestack(3).
@@ -2796,11 +2833,11 @@ AUTHOR
REVISION
- Last updated: 21 June 2010
+ Last updated: 06 November 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECALLOUT(3) PCRECALLOUT(3)
@@ -2980,8 +3017,8 @@ REVISION
Last updated: 29 September 2009
Copyright (c) 1997-2009 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECOMPAT(3) PCRECOMPAT(3)
@@ -2993,7 +3030,7 @@ DIFFERENCES BETWEEN PCRE AND PERL
This document describes the differences in the ways that PCRE and Perl
handle regular expressions. The differences described here are with
- respect to Perl 5.10/5.11.
+ respect to Perl versions 5.10 and above.
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details
of what it does have are given in the section on UTF-8 support in the
@@ -3075,24 +3112,27 @@ DIFFERENCES BETWEEN PCRE AND PERL
turing subpattern number 1. To avoid this confusing situation, an error
is given at compile time.
- 12. PCRE provides some extensions to the Perl regular expression facil-
- ities. Perl 5.10 includes new features that are not in earlier ver-
- sions of Perl, some of which (such as named parentheses) have been in
+ 12. Perl recognizes comments in some places that PCRE doesn't, for
+ example, between the ( and ? at the start of a subpattern.
+
+ 13. PCRE provides some extensions to the Perl regular expression facil-
+ ities. Perl 5.10 includes new features that are not in earlier ver-
+ sions of Perl, some of which (such as named parentheses) have been in
PCRE for some time. This list is with respect to Perl 5.10:
- (a) Although lookbehind assertions in PCRE must match fixed length
- strings, each alternative branch of a lookbehind assertion can match a
- different length of string. Perl requires them all to have the same
+ (a) Although lookbehind assertions in PCRE must match fixed length
+ strings, each alternative branch of a lookbehind assertion can match a
+ different length of string. Perl requires them all to have the same
length.
- (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
+ (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
meta-character matches only at the very end of the string.
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no spe-
cial meaning is faulted. Otherwise, like Perl, the backslash is quietly
ignored. (Perl can be made to issue a warning.)
- (d) If PCRE_UNGREEDY is set, the greediness of the repetition quanti-
+ (d) If PCRE_UNGREEDY is set, the greediness of the repetition quanti-
fiers is inverted, that is, by default they are not greedy, but if fol-
lowed by a question mark they are.
@@ -3100,10 +3140,10 @@ DIFFERENCES BETWEEN PCRE AND PERL
tried only at the first matching position in the subject string.
(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART,
- and PCRE_NO_AUTO_CAPTURE options for pcre_exec() have no Perl equiva-
+ and PCRE_NO_AUTO_CAPTURE options for pcre_exec() have no Perl equiva-
lents.
- (g) The \R escape sequence can be restricted to match only CR, LF, or
+ (g) The \R escape sequence can be restricted to match only CR, LF, or
CRLF by the PCRE_BSR_ANYCRLF option.
(h) The callout facility is PCRE-specific.
@@ -3113,10 +3153,10 @@ DIFFERENCES BETWEEN PCRE AND PERL
(j) Patterns compiled by PCRE can be saved and re-used at a later time,
even on different hosts that have the other endianness.
- (k) The alternative matching function (pcre_dfa_exec()) matches in a
+ (k) The alternative matching function (pcre_dfa_exec()) matches in a
different way and is not Perl-compatible.
- (l) PCRE recognizes some special sequences such as (*CR) at the start
+ (l) PCRE recognizes some special sequences such as (*CR) at the start
of a pattern that set overall options that cannot be changed within the
pattern.
@@ -3130,11 +3170,11 @@ AUTHOR
REVISION
- Last updated: 12 May 2010
+ Last updated: 31 October 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPATTERN(3) PCREPATTERN(3)
@@ -3324,7 +3364,7 @@ BACKSLASH
\Qabc\E\$\Qxyz\E abc$xyz abc$xyz
The \Q...\E sequence is recognized both inside and outside character
- classes.
+ classes. An isolated \E that is not preceded by \Q is ignored.
Non-printing characters
@@ -4862,7 +4902,14 @@ CONDITIONAL SUBPATTERNS
If the condition is satisfied, the yes-pattern is used; otherwise the
no-pattern (if present) is used. If there are more than two alterna-
- tives in the subpattern, a compile-time error occurs.
+ tives in the subpattern, a compile-time error occurs. Each of the two
+ alternatives may itself contain nested subpatterns of any form, includ-
+ ing conditional subpatterns; the restriction to two alternatives
+ applies only at the level of the condition. This pattern fragment is an
+ example where the alternatives are complex:
+
+ (?(1) (A|B|C) | (D | (?(2)E|F) | E) )
+
There are four kinds of condition: references to subpatterns, refer-
ences to recursion, a pseudo-condition called DEFINE, and assertions.
@@ -4987,27 +5034,44 @@ CONDITIONAL SUBPATTERNS
COMMENTS
- The sequence (?# marks the start of a comment that continues up to the
- next closing parenthesis. Nested parentheses are not permitted. The
- characters that make up a comment play no part in the pattern matching
- at all.
+ There are two ways of including comments in patterns that are processed
+ by PCRE. In both cases, the start of the comment must not be in a char-
+ acter class, nor in the middle of any other sequence of related charac-
+ ters such as (?: or a subpattern name or number. The characters that
+ make up a comment play no part in the pattern matching.
- If the PCRE_EXTENDED option is set, an unescaped # character outside a
- character class introduces a comment that continues to immediately
- after the next newline in the pattern.
+ The sequence (?# marks the start of a comment that continues up to the
+ next closing parenthesis. Nested parentheses are not permitted. If the
+ PCRE_EXTENDED option is set, an unescaped # character also introduces a
+ comment, which in this case continues to immediately after the next
+ newline character or character sequence in the pattern. Which charac-
+ ters are interpreted as newlines is controlled by the options passed to
+ pcre_compile() or by a special sequence at the start of the pattern, as
+ described in the section entitled "Newline conventions" above. Note
+ that end of this type of comment is a literal newline sequence in the
+ pattern; escape sequences that happen to represent a newline do not
+ count. For example, consider this pattern when PCRE_EXTENDED is set,
+ and the default newline convention is in force:
+
+ abc #comment \n still comment
+
+ On encountering the # character, pcre_compile() skips along, looking
+ for a newline in the pattern. The sequence \n is still literal at this
+ stage, so it does not terminate the comment. Only an actual character
+ with the code value 0x0a (the default newline) does so.
RECURSIVE PATTERNS
- Consider the problem of matching a string in parentheses, allowing for
- unlimited nested parentheses. Without the use of recursion, the best
- that can be done is to use a pattern that matches up to some fixed
- depth of nesting. It is not possible to handle an arbitrary nesting
+ Consider the problem of matching a string in parentheses, allowing for
+ unlimited nested parentheses. Without the use of recursion, the best
+ that can be done is to use a pattern that matches up to some fixed
+ depth of nesting. It is not possible to handle an arbitrary nesting
depth.
For some time, Perl has provided a facility that allows regular expres-
- sions to recurse (amongst other things). It does this by interpolating
- Perl code in the expression at run time, and the code can refer to the
+ sions to recurse (amongst other things). It does this by interpolating
+ Perl code in the expression at run time, and the code can refer to the
expression itself. A Perl pattern using code interpolation to solve the
parentheses problem can be created like this:
@@ -5017,182 +5081,182 @@ RECURSIVE PATTERNS
refers recursively to the pattern in which it appears.
Obviously, PCRE cannot support the interpolation of Perl code. Instead,
- it supports special syntax for recursion of the entire pattern, and
- also for individual subpattern recursion. After its introduction in
- PCRE and Python, this kind of recursion was subsequently introduced
+ it supports special syntax for recursion of the entire pattern, and
+ also for individual subpattern recursion. After its introduction in
+ PCRE and Python, this kind of recursion was subsequently introduced
into Perl at release 5.10.
- A special item that consists of (? followed by a number greater than
+ A special item that consists of (? followed by a number greater than
zero and a closing parenthesis is a recursive call of the subpattern of
- the given number, provided that it occurs inside that subpattern. (If
- not, it is a "subroutine" call, which is described in the next sec-
- tion.) The special item (?R) or (?0) is a recursive call of the entire
+ the given number, provided that it occurs inside that subpattern. (If
+ not, it is a "subroutine" call, which is described in the next sec-
+ tion.) The special item (?R) or (?0) is a recursive call of the entire
regular expression.
- This PCRE pattern solves the nested parentheses problem (assume the
+ This PCRE pattern solves the nested parentheses problem (assume the
PCRE_EXTENDED option is set so that white space is ignored):
\( ( [^()]++ | (?R) )* \)
- First it matches an opening parenthesis. Then it matches any number of
- substrings which can either be a sequence of non-parentheses, or a
- recursive match of the pattern itself (that is, a correctly parenthe-
+ First it matches an opening parenthesis. Then it matches any number of
+ substrings which can either be a sequence of non-parentheses, or a
+ recursive match of the pattern itself (that is, a correctly parenthe-
sized substring). Finally there is a closing parenthesis. Note the use
of a possessive quantifier to avoid backtracking into sequences of non-
parentheses.
- If this were part of a larger pattern, you would not want to recurse
+ If this were part of a larger pattern, you would not want to recurse
the entire pattern, so instead you could use this:
( \( ( [^()]++ | (?1) )* \) )
- We have put the pattern into parentheses, and caused the recursion to
+ We have put the pattern into parentheses, and caused the recursion to
refer to them instead of the whole pattern.
- In a larger pattern, keeping track of parenthesis numbers can be
- tricky. This is made easier by the use of relative references (a Perl
- 5.10 feature). Instead of (?1) in the pattern above you can write
+ In a larger pattern, keeping track of parenthesis numbers can be
+ tricky. This is made easier by the use of relative references (a Perl
+ 5.10 feature). Instead of (?1) in the pattern above you can write
(?-2) to refer to the second most recently opened parentheses preceding
- the recursion. In other words, a negative number counts capturing
+ the recursion. In other words, a negative number counts capturing
parentheses leftwards from the point at which it is encountered.
- It is also possible to refer to subsequently opened parentheses, by
- writing references such as (?+2). However, these cannot be recursive
- because the reference is not inside the parentheses that are refer-
- enced. They are always "subroutine" calls, as described in the next
+ It is also possible to refer to subsequently opened parentheses, by
+ writing references such as (?+2). However, these cannot be recursive
+ because the reference is not inside the parentheses that are refer-
+ enced. They are always "subroutine" calls, as described in the next
section.
- An alternative approach is to use named parentheses instead. The Perl
- syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also
+ An alternative approach is to use named parentheses instead. The Perl
+ syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also
supported. We could rewrite the above example as follows:
(?<pn> \( ( [^()]++ | (?&pn) )* \) )
- If there is more than one subpattern with the same name, the earliest
+ If there is more than one subpattern with the same name, the earliest
one is used.
- This particular example pattern that we have been looking at contains
+ This particular example pattern that we have been looking at contains
nested unlimited repeats, and so the use of a possessive quantifier for
matching strings of non-parentheses is important when applying the pat-
- tern to strings that do not match. For example, when this pattern is
+ tern to strings that do not match. For example, when this pattern is
applied to
(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
- it yields "no match" quickly. However, if a possessive quantifier is
- not used, the match runs for a very long time indeed because there are
- so many different ways the + and * repeats can carve up the subject,
+ it yields "no match" quickly. However, if a possessive quantifier is
+ not used, the match runs for a very long time indeed because there are
+ so many different ways the + and * repeats can carve up the subject,
and all have to be tested before failure can be reported.
- At the end of a match, the values of capturing parentheses are those
- from the outermost level. If you want to obtain intermediate values, a
- callout function can be used (see below and the pcrecallout documenta-
+ At the end of a match, the values of capturing parentheses are those
+ from the outermost level. If you want to obtain intermediate values, a
+ callout function can be used (see below and the pcrecallout documenta-
tion). If the pattern above is matched against
(ab(cd)ef)
- the value for the inner capturing parentheses (numbered 2) is "ef",
- which is the last value taken on at the top level. If a capturing sub-
+ the value for the inner capturing parentheses (numbered 2) is "ef",
+ which is the last value taken on at the top level. If a capturing sub-
pattern is not matched at the top level, its final value is unset, even
if it is (temporarily) set at a deeper level.
- If there are more than 15 capturing parentheses in a pattern, PCRE has
- to obtain extra memory to store data during a recursion, which it does
+ If there are more than 15 capturing parentheses in a pattern, PCRE has
+ to obtain extra memory to store data during a recursion, which it does
by using pcre_malloc, freeing it via pcre_free afterwards. If no memory
can be obtained, the match fails with the PCRE_ERROR_NOMEMORY error.
- Do not confuse the (?R) item with the condition (R), which tests for
- recursion. Consider this pattern, which matches text in angle brack-
- ets, allowing for arbitrary nesting. Only digits are allowed in nested
- brackets (that is, when recursing), whereas any characters are permit-
+ Do not confuse the (?R) item with the condition (R), which tests for
+ recursion. Consider this pattern, which matches text in angle brack-
+ ets, allowing for arbitrary nesting. Only digits are allowed in nested
+ brackets (that is, when recursing), whereas any characters are permit-
ted at the outer level.
< (?: (?(R) \d++ | [^<>]*+) | (?R)) * >
- In this pattern, (?(R) is the start of a conditional subpattern, with
- two different alternatives for the recursive and non-recursive cases.
+ In this pattern, (?(R) is the start of a conditional subpattern, with
+ two different alternatives for the recursive and non-recursive cases.
The (?R) item is the actual recursive call.
Recursion difference from Perl
- In PCRE (like Python, but unlike Perl), a recursive subpattern call is
+ In PCRE (like Python, but unlike Perl), a recursive subpattern call is
always treated as an atomic group. That is, once it has matched some of
the subject string, it is never re-entered, even if it contains untried
- alternatives and there is a subsequent matching failure. This can be
- illustrated by the following pattern, which purports to match a palin-
- dromic string that contains an odd number of characters (for example,
+ alternatives and there is a subsequent matching failure. This can be
+ illustrated by the following pattern, which purports to match a palin-
+ dromic string that contains an odd number of characters (for example,
"a", "aba", "abcba", "abcdcba"):
^(.|(.)(?1)\2)$
The idea is that it either matches a single character, or two identical
- characters surrounding a sub-palindrome. In Perl, this pattern works;
- in PCRE it does not if the pattern is longer than three characters.
+ characters surrounding a sub-palindrome. In Perl, this pattern works;
+ in PCRE it does not if the pattern is longer than three characters.
Consider the subject string "abcba":
- At the top level, the first character is matched, but as it is not at
+ At the top level, the first character is matched, but as it is not at
the end of the string, the first alternative fails; the second alterna-
tive is taken and the recursion kicks in. The recursive call to subpat-
- tern 1 successfully matches the next character ("b"). (Note that the
+ tern 1 successfully matches the next character ("b"). (Note that the
beginning and end of line tests are not part of the recursion).
- Back at the top level, the next character ("c") is compared with what
- subpattern 2 matched, which was "a". This fails. Because the recursion
- is treated as an atomic group, there are now no backtracking points,
- and so the entire match fails. (Perl is able, at this point, to re-
- enter the recursion and try the second alternative.) However, if the
+ Back at the top level, the next character ("c") is compared with what
+ subpattern 2 matched, which was "a". This fails. Because the recursion
+ is treated as an atomic group, there are now no backtracking points,
+ and so the entire match fails. (Perl is able, at this point, to re-
+ enter the recursion and try the second alternative.) However, if the
pattern is written with the alternatives in the other order, things are
different:
^((.)(?1)\2|.)$
- This time, the recursing alternative is tried first, and continues to
- recurse until it runs out of characters, at which point the recursion
- fails. But this time we do have another alternative to try at the
- higher level. That is the big difference: in the previous case the
+ This time, the recursing alternative is tried first, and continues to
+ recurse until it runs out of characters, at which point the recursion
+ fails. But this time we do have another alternative to try at the
+ higher level. That is the big difference: in the previous case the
remaining alternative is at a deeper recursion level, which PCRE cannot
use.
To change the pattern so that matches all palindromic strings, not just
- those with an odd number of characters, it is tempting to change the
+ those with an odd number of characters, it is tempting to change the
pattern to this:
^((.)(?1)\2|.?)$
- Again, this works in Perl, but not in PCRE, and for the same reason.
- When a deeper recursion has matched a single character, it cannot be
- entered again in order to match an empty string. The solution is to
- separate the two cases, and write out the odd and even cases as alter-
+ Again, this works in Perl, but not in PCRE, and for the same reason.
+ When a deeper recursion has matched a single character, it cannot be
+ entered again in order to match an empty string. The solution is to
+ separate the two cases, and write out the odd and even cases as alter-
natives at the higher level:
^(?:((.)(?1)\2|)|((.)(?3)\4|.))
- If you want to match typical palindromic phrases, the pattern has to
+ If you want to match typical palindromic phrases, the pattern has to
ignore all non-word characters, which can be done like this:
^\W*+(?:((.)\W*+(?1)\W*+\2|)|((.)\W*+(?3)\W*+\4|\W*+.\W*+))\W*+$
If run with the PCRE_CASELESS option, this pattern matches phrases such
as "A man, a plan, a canal: Panama!" and it works well in both PCRE and
- Perl. Note the use of the possessive quantifier *+ to avoid backtrack-
- ing into sequences of non-word characters. Without this, PCRE takes a
- great deal longer (ten times or more) to match typical phrases, and
+ Perl. Note the use of the possessive quantifier *+ to avoid backtrack-
+ ing into sequences of non-word characters. Without this, PCRE takes a
+ great deal longer (ten times or more) to match typical phrases, and
Perl takes so long that you think it has gone into a loop.
- WARNING: The palindrome-matching patterns above work only if the sub-
- ject string does not start with a palindrome that is shorter than the
- entire string. For example, although "abcba" is correctly matched, if
- the subject is "ababa", PCRE finds the palindrome "aba" at the start,
- then fails at top level because the end of the string does not follow.
- Once again, it cannot jump back into the recursion to try other alter-
+ WARNING: The palindrome-matching patterns above work only if the sub-
+ ject string does not start with a palindrome that is shorter than the
+ entire string. For example, although "abcba" is correctly matched, if
+ the subject is "ababa", PCRE finds the palindrome "aba" at the start,
+ then fails at top level because the end of the string does not follow.
+ Once again, it cannot jump back into the recursion to try other alter-
natives, so the entire match fails.
SUBPATTERNS AS SUBROUTINES
If the syntax for a recursive subpattern reference (either by number or
- by name) is used outside the parentheses to which it refers, it oper-
- ates like a subroutine in a programming language. The "called" subpat-
+ by name) is used outside the parentheses to which it refers, it oper-
+ ates like a subroutine in a programming language. The "called" subpat-
tern may be defined before or after the reference. A numbered reference
can be absolute or relative, as in these examples:
@@ -5204,175 +5268,175 @@ SUBPATTERNS AS SUBROUTINES
(sens|respons)e and \1ibility
- matches "sense and sensibility" and "response and responsibility", but
+ matches "sense and sensibility" and "response and responsibility", but
not "sense and responsibility". If instead the pattern
(sens|respons)e and (?1)ibility
- is used, it does match "sense and responsibility" as well as the other
- two strings. Another example is given in the discussion of DEFINE
+ is used, it does match "sense and responsibility" as well as the other
+ two strings. Another example is given in the discussion of DEFINE
above.
- Like recursive subpatterns, a subroutine call is always treated as an
- atomic group. That is, once it has matched some of the subject string,
- it is never re-entered, even if it contains untried alternatives and
- there is a subsequent matching failure. Any capturing parentheses that
- are set during the subroutine call revert to their previous values
+ Like recursive subpatterns, a subroutine call is always treated as an
+ atomic group. That is, once it has matched some of the subject string,
+ it is never re-entered, even if it contains untried alternatives and
+ there is a subsequent matching failure. Any capturing parentheses that
+ are set during the subroutine call revert to their previous values
afterwards.
- When a subpattern is used as a subroutine, processing options such as
+ When a subpattern is used as a subroutine, processing options such as
case-independence are fixed when the subpattern is defined. They cannot
be changed for different calls. For example, consider this pattern:
(abc)(?i:(?-1))
- It matches "abcabc". It does not match "abcABC" because the change of
+ It matches "abcabc". It does not match "abcABC" because the change of
processing option does not affect the called subpattern.
ONIGURUMA SUBROUTINE SYNTAX
- For compatibility with Oniguruma, the non-Perl syntax \g followed by a
+ For compatibility with Oniguruma, the non-Perl syntax \g followed by a
name or a number enclosed either in angle brackets or single quotes, is
- an alternative syntax for referencing a subpattern as a subroutine,
- possibly recursively. Here are two of the examples used above, rewrit-
+ an alternative syntax for referencing a subpattern as a subroutine,
+ possibly recursively. Here are two of the examples used above, rewrit-
ten using this syntax:
(?<pn> \( ( (?>[^()]+) | \g<pn> )* \) )
(sens|respons)e and \g'1'ibility
- PCRE supports an extension to Oniguruma: if a number is preceded by a
+ PCRE supports an extension to Oniguruma: if a number is preceded by a
plus or a minus sign it is taken as a relative reference. For example:
(abc)(?i:\g<-1>)
- Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
- synonymous. The former is a back reference; the latter is a subroutine
+ Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
+ synonymous. The former is a back reference; the latter is a subroutine
call.
CALLOUTS
Perl has a feature whereby using the sequence (?{...}) causes arbitrary
- Perl code to be obeyed in the middle of matching a regular expression.
+ Perl code to be obeyed in the middle of matching a regular expression.
This makes it possible, amongst other things, to extract different sub-
strings that match the same pair of parentheses when there is a repeti-
tion.
PCRE provides a similar feature, but of course it cannot obey arbitrary
Perl code. The feature is called "callout". The caller of PCRE provides
- an external function by putting its entry point in the global variable
- pcre_callout. By default, this variable contains NULL, which disables
+ an external function by putting its entry point in the global variable
+ pcre_callout. By default, this variable contains NULL, which disables
all calling out.
- Within a regular expression, (?C) indicates the points at which the
- external function is to be called. If you want to identify different
- callout points, you can put a number less than 256 after the letter C.
- The default value is zero. For example, this pattern has two callout
+ Within a regular expression, (?C) indicates the points at which the
+ external function is to be called. If you want to identify different
+ callout points, you can put a number less than 256 after the letter C.
+ The default value is zero. For example, this pattern has two callout
points:
(?C1)abc(?C2)def
If the PCRE_AUTO_CALLOUT flag is passed to pcre_compile(), callouts are
- automatically installed before each item in the pattern. They are all
+ automatically installed before each item in the pattern. They are all
numbered 255.
During matching, when PCRE reaches a callout point (and pcre_callout is
- set), the external function is called. It is provided with the number
- of the callout, the position in the pattern, and, optionally, one item
- of data originally supplied by the caller of pcre_exec(). The callout
- function may cause matching to proceed, to backtrack, or to fail alto-
+ set), the external function is called. It is provided with the number
+ of the callout, the position in the pattern, and, optionally, one item
+ of data originally supplied by the caller of pcre_exec(). The callout
+ function may cause matching to proceed, to backtrack, or to fail alto-
gether. A complete description of the interface to the callout function
is given in the pcrecallout documentation.
BACKTRACKING CONTROL
- Perl 5.10 introduced a number of "Special Backtracking Control Verbs",
+ Perl 5.10 introduced a number of "Special Backtracking Control Verbs",
which are described in the Perl documentation as "experimental and sub-
- ject to change or removal in a future version of Perl". It goes on to
- say: "Their usage in production code should be noted to avoid problems
+ ject to change or removal in a future version of Perl". It goes on to
+ say: "Their usage in production code should be noted to avoid problems
during upgrades." The same remarks apply to the PCRE features described
in this section.
- Since these verbs are specifically related to backtracking, most of
- them can be used only when the pattern is to be matched using
+ Since these verbs are specifically related to backtracking, most of
+ them can be used only when the pattern is to be matched using
pcre_exec(), which uses a backtracking algorithm. With the exception of
(*FAIL), which behaves like a failing negative assertion, they cause an
error if encountered by pcre_dfa_exec().
If any of these verbs are used in an assertion or subroutine subpattern
- (including recursive subpatterns), their effect is confined to that
- subpattern; it does not extend to the surrounding pattern. Note that
- such subpatterns are processed as anchored at the point where they are
+ (including recursive subpatterns), their effect is confined to that
+ subpattern; it does not extend to the surrounding pattern. Note that
+ such subpatterns are processed as anchored at the point where they are
tested.
- The new verbs make use of what was previously invalid syntax: an open-
+ The new verbs make use of what was previously invalid syntax: an open-
ing parenthesis followed by an asterisk. They are generally of the form
- (*VERB) or (*VERB:NAME). Some may take either form, with differing be-
+ (*VERB) or (*VERB:NAME). Some may take either form, with differing be-
haviour, depending on whether or not an argument is present. An name is
- a sequence of letters, digits, and underscores. If the name is empty,
- that is, if the closing parenthesis immediately follows the colon, the
+ a sequence of letters, digits, and underscores. If the name is empty,
+ that is, if the closing parenthesis immediately follows the colon, the
effect is as if the colon were not there. Any number of these verbs may
occur in a pattern.
- PCRE contains some optimizations that are used to speed up matching by
+ PCRE contains some optimizations that are used to speed up matching by
running some checks at the start of each match attempt. For example, it
- may know the minimum length of matching subject, or that a particular
- character must be present. When one of these optimizations suppresses
- the running of a match, any included backtracking verbs will not, of
+ may know the minimum length of matching subject, or that a particular
+ character must be present. When one of these optimizations suppresses
+ the running of a match, any included backtracking verbs will not, of
course, be processed. You can suppress the start-of-match optimizations
by setting the PCRE_NO_START_OPTIMIZE option when calling pcre_exec().
Verbs that act immediately
- The following verbs act as soon as they are encountered. They may not
+ The following verbs act as soon as they are encountered. They may not
be followed by a name.
(*ACCEPT)
- This verb causes the match to end successfully, skipping the remainder
- of the pattern. When inside a recursion, only the innermost pattern is
- ended immediately. If (*ACCEPT) is inside capturing parentheses, the
- data so far is captured. (This feature was added to PCRE at release
+ This verb causes the match to end successfully, skipping the remainder
+ of the pattern. When inside a recursion, only the innermost pattern is
+ ended immediately. If (*ACCEPT) is inside capturing parentheses, the
+ data so far is captured. (This feature was added to PCRE at release
8.00.) For example:
A((?:A|B(*ACCEPT)|C)D)
- This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap-
+ This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap-
tured by the outer parentheses.
(*FAIL) or (*F)
- This verb causes the match to fail, forcing backtracking to occur. It
- is equivalent to (?!) but easier to read. The Perl documentation notes
- that it is probably useful only when combined with (?{}) or (??{}).
- Those are, of course, Perl features that are not present in PCRE. The
- nearest equivalent is the callout feature, as for example in this pat-
+ This verb causes the match to fail, forcing backtracking to occur. It
+ is equivalent to (?!) but easier to read. The Perl documentation notes
+ that it is probably useful only when combined with (?{}) or (??{}).
+ Those are, of course, Perl features that are not present in PCRE. The
+ nearest equivalent is the callout feature, as for example in this pat-
tern:
a+(?C)(*FAIL)
- A match with the string "aaaa" always fails, but the callout is taken
+ A match with the string "aaaa" always fails, but the callout is taken
before each backtrack happens (in this example, 10 times).
Recording which path was taken
- There is one verb whose main purpose is to track how a match was
- arrived at, though it also has a secondary use in conjunction with
+ There is one verb whose main purpose is to track how a match was
+ arrived at, though it also has a secondary use in conjunction with
advancing the match starting point (see (*SKIP) below).
(*MARK:NAME) or (*:NAME)
- A name is always required with this verb. There may be as many
- instances of (*MARK) as you like in a pattern, and their names do not
+ A name is always required with this verb. There may be as many
+ instances of (*MARK) as you like in a pattern, and their names do not
have to be unique.
- When a match succeeds, the name of the last-encountered (*MARK) is
- passed back to the caller via the pcre_extra data structure, as
+ When a match succeeds, the name of the last-encountered (*MARK) is
+ passed back to the caller via the pcre_extra data structure, as
described in the section on pcre_extra in the pcreapi documentation. No
- data is returned for a partial match. Here is an example of pcretest
- output, where the /K modifier requests the retrieval and outputting of
+ data is returned for a partial match. Here is an example of pcretest
+ output, where the /K modifier requests the retrieval and outputting of
(*MARK) data:
/X(*MARK:A)Y|X(*MARK:B)Z/K
@@ -5384,13 +5448,13 @@ BACKTRACKING CONTROL
MK: B
The (*MARK) name is tagged with "MK:" in this output, and in this exam-
- ple it indicates which of the two alternatives matched. This is a more
- efficient way of obtaining this information than putting each alterna-
+ ple it indicates which of the two alternatives matched. This is a more
+ efficient way of obtaining this information than putting each alterna-
tive in its own capturing parentheses.
- A name may also be returned after a failed match if the final path
- through the pattern involves (*MARK). However, unless (*MARK) used in
- conjunction with (*COMMIT), this is unlikely to happen for an unan-
+ A name may also be returned after a failed match if the final path
+ through the pattern involves (*MARK). However, unless (*MARK) used in
+ conjunction with (*COMMIT), this is unlikely to happen for an unan-
chored pattern because, as the starting point for matching is advanced,
the final check is often with an empty string, causing a failure before
(*MARK) is reached. For example:
@@ -5400,56 +5464,56 @@ BACKTRACKING CONTROL
No match
There are three potential starting points for this match (starting with
- X, starting with P, and with an empty string). If the pattern is
+ X, starting with P, and with an empty string). If the pattern is
anchored, the result is different:
/^X(*MARK:A)Y|^X(*MARK:B)Z/K
XP
No match, mark = B
- PCRE's start-of-match optimizations can also interfere with this. For
- example, if, as a result of a call to pcre_study(), it knows the mini-
- mum subject length for a match, a shorter subject will not be scanned
+ PCRE's start-of-match optimizations can also interfere with this. For
+ example, if, as a result of a call to pcre_study(), it knows the mini-
+ mum subject length for a match, a shorter subject will not be scanned
at all.
Note that similar anomalies (though different in detail) exist in Perl,
- no doubt for the same reasons. The use of (*MARK) data after a failed
- match of an unanchored pattern is not recommended, unless (*COMMIT) is
+ no doubt for the same reasons. The use of (*MARK) data after a failed
+ match of an unanchored pattern is not recommended, unless (*COMMIT) is
involved.
Verbs that act after backtracking
The following verbs do nothing when they are encountered. Matching con-
- tinues with what follows, but if there is no subsequent match, causing
- a backtrack to the verb, a failure is forced. That is, backtracking
- cannot pass to the left of the verb. However, when one of these verbs
- appears inside an atomic group, its effect is confined to that group,
- because once the group has been matched, there is never any backtrack-
- ing into it. In this situation, backtracking can "jump back" to the
- left of the entire atomic group. (Remember also, as stated above, that
+ tinues with what follows, but if there is no subsequent match, causing
+ a backtrack to the verb, a failure is forced. That is, backtracking
+ cannot pass to the left of the verb. However, when one of these verbs
+ appears inside an atomic group, its effect is confined to that group,
+ because once the group has been matched, there is never any backtrack-
+ ing into it. In this situation, backtracking can "jump back" to the
+ left of the entire atomic group. (Remember also, as stated above, that
this localization also applies in subroutine calls and assertions.)
- These verbs differ in exactly what kind of failure occurs when back-
+ These verbs differ in exactly what kind of failure occurs when back-
tracking reaches them.
(*COMMIT)
- This verb, which may not be followed by a name, causes the whole match
+ This verb, which may not be followed by a name, causes the whole match
to fail outright if the rest of the pattern does not match. Even if the
pattern is unanchored, no further attempts to find a match by advancing
the starting point take place. Once (*COMMIT) has been passed,
- pcre_exec() is committed to finding a match at the current starting
+ pcre_exec() is committed to finding a match at the current starting
point, or not at all. For example:
a+(*COMMIT)b
- This matches "xxaab" but not "aacaab". It can be thought of as a kind
+ This matches "xxaab" but not "aacaab". It can be thought of as a kind
of dynamic anchor, or "I've started, so I must finish." The name of the
- most recently passed (*MARK) in the path is passed back when (*COMMIT)
+ most recently passed (*MARK) in the path is passed back when (*COMMIT)
forces a match failure.
- Note that (*COMMIT) at the start of a pattern is not the same as an
- anchor, unless PCRE's start-of-match optimizations are turned off, as
+ Note that (*COMMIT) at the start of a pattern is not the same as an
+ anchor, unless PCRE's start-of-match optimizations are turned off, as
shown in this pcretest example:
/(*COMMIT)abc/
@@ -5458,63 +5522,64 @@ BACKTRACKING CONTROL
xyzabc\Y
No match
- PCRE knows that any match must start with "a", so the optimization
- skips along the subject to "a" before running the first match attempt,
- which succeeds. When the optimization is disabled by the \Y escape in
+ PCRE knows that any match must start with "a", so the optimization
+ skips along the subject to "a" before running the first match attempt,
+ which succeeds. When the optimization is disabled by the \Y escape in
the second subject, the match starts at "x" and so the (*COMMIT) causes
it to fail without trying any other starting points.
(*PRUNE) or (*PRUNE:NAME)
- This verb causes the match to fail at the current starting position in
- the subject if the rest of the pattern does not match. If the pattern
- is unanchored, the normal "bumpalong" advance to the next starting
- character then happens. Backtracking can occur as usual to the left of
- (*PRUNE), before it is reached, or when matching to the right of
- (*PRUNE), but if there is no match to the right, backtracking cannot
- cross (*PRUNE). In simple cases, the use of (*PRUNE) is just an alter-
- native to an atomic group or possessive quantifier, but there are some
+ This verb causes the match to fail at the current starting position in
+ the subject if the rest of the pattern does not match. If the pattern
+ is unanchored, the normal "bumpalong" advance to the next starting
+ character then happens. Backtracking can occur as usual to the left of
+ (*PRUNE), before it is reached, or when matching to the right of
+ (*PRUNE), but if there is no match to the right, backtracking cannot
+ cross (*PRUNE). In simple cases, the use of (*PRUNE) is just an alter-
+ native to an atomic group or possessive quantifier, but there are some
uses of (*PRUNE) that cannot be expressed in any other way. The behav-
- iour of (*PRUNE:NAME) is the same as (*MARK:NAME)(*PRUNE) when the
- match fails completely; the name is passed back if this is the final
- attempt. (*PRUNE:NAME) does not pass back a name if the match suc-
- ceeds. In an anchored pattern (*PRUNE) has the same effect as (*COM-
+ iour of (*PRUNE:NAME) is the same as (*MARK:NAME)(*PRUNE) when the
+ match fails completely; the name is passed back if this is the final
+ attempt. (*PRUNE:NAME) does not pass back a name if the match suc-
+ ceeds. In an anchored pattern (*PRUNE) has the same effect as (*COM-
MIT).
(*SKIP)
- This verb, when given without a name, is like (*PRUNE), except that if
- the pattern is unanchored, the "bumpalong" advance is not to the next
+ This verb, when given without a name, is like (*PRUNE), except that if
+ the pattern is unanchored, the "bumpalong" advance is not to the next
character, but to the position in the subject where (*SKIP) was encoun-
- tered. (*SKIP) signifies that whatever text was matched leading up to
+ tered. (*SKIP) signifies that whatever text was matched leading up to
it cannot be part of a successful match. Consider:
a+(*SKIP)b
- If the subject is "aaaac...", after the first match attempt fails
- (starting at the first character in the string), the starting point
+ If the subject is "aaaac...", after the first match attempt fails
+ (starting at the first character in the string), the starting point
skips on to start the next attempt at "c". Note that a possessive quan-
- tifer does not have the same effect as this example; although it would
- suppress backtracking during the first match attempt, the second
- attempt would start at the second character instead of skipping on to
+ tifer does not have the same effect as this example; although it would
+ suppress backtracking during the first match attempt, the second
+ attempt would start at the second character instead of skipping on to
"c".
(*SKIP:NAME)
- When (*SKIP) has an associated name, its behaviour is modified. If the
+ When (*SKIP) has an associated name, its behaviour is modified. If the
following pattern fails to match, the previous path through the pattern
- is searched for the most recent (*MARK) that has the same name. If one
- is found, the "bumpalong" advance is to the subject position that cor-
- responds to that (*MARK) instead of to where (*SKIP) was encountered.
- If no (*MARK) with a matching name is found, normal "bumpalong" of one
+ is searched for the most recent (*MARK) that has the same name. If one
+ is found, the "bumpalong" advance is to the subject position that cor-
+ responds to that (*MARK) instead of to where (*SKIP) was encountered.
+ If no (*MARK) with a matching name is found, normal "bumpalong" of one
character happens (the (*SKIP) is ignored).
(*THEN) or (*THEN:NAME)
- This verb causes a skip to the next alternation if the rest of the pat-
- tern does not match. That is, it cancels pending backtracking, but only
- within the current alternation. Its name comes from the observation
- that it can be used for a pattern-based if-then-else block:
+ This verb causes a skip to the next alternation in the innermost
+ enclosing group if the rest of the pattern does not match. That is, it
+ cancels pending backtracking, but only within the current alternation.
+ Its name comes from the observation that it can be used for a pattern-
+ based if-then-else block:
( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
@@ -5525,6 +5590,25 @@ BACKTRACKING CONTROL
(*MARK:NAME)(*THEN) if the overall match fails. If (*THEN) is not
directly inside an alternation, it acts like (*PRUNE).
+ The above verbs provide four different "strengths" of control when sub-
+ sequent matching fails. (*THEN) is the weakest, carrying on the match
+ at the next alternation. (*PRUNE) comes next, failing the match at the
+ current starting position, but allowing an advance to the next charac-
+ ter (for an unanchored pattern). (*SKIP) is similar, except that the
+ advance may be more than one character. (*COMMIT) is the strongest,
+ causing the entire match to fail.
+
+ If more than one is present in a pattern, the "stongest" one wins. For
+ example, consider this pattern, where A, B, etc. are complex pattern
+ fragments:
+
+ (A(*COMMIT)B(*THEN)C|D)
+
+ Once A has matched, PCRE is committed to this match, at the current
+ starting position. If subsequently B matches, but C does not, the nor-
+ mal (*THEN) action of trying the next alternation (that is, D) does not
+ happen because (*COMMIT) overrides.
+
SEE ALSO
@@ -5540,11 +5624,11 @@ AUTHOR
REVISION
- Last updated: 18 May 2010
+ Last updated: 31 October 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRESYNTAX(3) PCRESYNTAX(3)
@@ -5912,8 +5996,8 @@ REVISION
Last updated: 12 May 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPARTIAL(3) PCREPARTIAL(3)
@@ -5941,8 +6025,8 @@ PARTIAL MATCHING IN PCRE
reflecting the character that has been typed, for example. This immedi-
ate feedback is likely to be a better user interface than a check that
is delayed until the entire string has been entered. Partial matching
- can also sometimes be useful when the subject string is very long and
- is not all available at once.
+ can also be useful when the subject string is very long and is not all
+ available at once.
PCRE supports partial matching by means of the PCRE_PARTIAL_SOFT and
PCRE_PARTIAL_HARD options, which can be set when calling pcre_exec() or
@@ -5964,20 +6048,22 @@ PARTIAL MATCHING IN PCRE
PARTIAL MATCHING USING pcre_exec()
- A partial match occurs during a call to pcre_exec() whenever the end of
- the subject string is reached successfully, but matching cannot con-
- tinue because more characters are needed. However, at least one charac-
- ter must have been matched. (In other words, a partial match can never
- be an empty string.)
-
- If PCRE_PARTIAL_SOFT is set, the partial match is remembered, but
- matching continues as normal, and other alternatives in the pattern are
- tried. If no complete match can be found, pcre_exec() returns
- PCRE_ERROR_PARTIAL instead of PCRE_ERROR_NOMATCH. If there are at least
- two slots in the offsets vector, the first of them is set to the offset
- of the earliest character that was inspected when the partial match was
- found. For convenience, the second offset points to the end of the
- string so that a substring can easily be identified.
+ A partial match occurs during a call to pcre_exec() when the end of the
+ subject string is reached successfully, but matching cannot continue
+ because more characters are needed. However, at least one character in
+ the subject must have been inspected. This character need not form part
+ of the final matched string; lookbehind assertions and the \K escape
+ sequence provide ways of inspecting characters before the start of a
+ matched substring. The requirement for inspecting at least one charac-
+ ter exists because an empty string can always be matched; without such
+ a restriction there would always be a partial match of an empty string
+ at the end of the subject.
+
+ If there are at least two slots in the offsets vector when pcre_exec()
+ returns with a partial match, the first slot is set to the offset of
+ the earliest character that was inspected when the partial match was
+ found. For convenience, the second offset points to the end of the sub-
+ ject so that a substring can easily be identified.
For the majority of patterns, the first offset identifies the start of
the partially matched string. However, for patterns that contain look-
@@ -5989,120 +6075,147 @@ PARTIAL MATCHING USING pcre_exec()
This pattern matches "123", but only if it is preceded by "abc". If the
subject string is "xyzabc12", the offsets after a partial match are for
the substring "abc12", because all these characters are needed if
- another match is tried with extra characters added.
+ another match is tried with extra characters added to the subject.
+
+ What happens when a partial match is identified depends on which of the
+ two partial matching options are set.
+
+ PCRE_PARTIAL_SOFT with pcre_exec()
+
+ If PCRE_PARTIAL_SOFT is set when pcre_exec() identifies a partial
+ match, the partial match is remembered, but matching continues as nor-
+ mal, and other alternatives in the pattern are tried. If no complete
+ match can be found, pcre_exec() returns PCRE_ERROR_PARTIAL instead of
+ PCRE_ERROR_NOMATCH.
- If there is more than one partial match, the first one that was found
+ This option is "soft" because it prefers a complete match over a par-
+ tial match. All the various matching items in a pattern behave as if
+ the subject string is potentially complete. For example, \z, \Z, and $
+ match at the end of the subject, as normal, and for \b and \B the end
+ of the subject is treated as a non-alphanumeric.
+
+ If there is more than one partial match, the first one that was found
provides the data that is returned. Consider this pattern:
/123\w+X|dogY/
- If this is matched against the subject string "abc123dog", both alter-
- natives fail to match, but the end of the subject is reached during
- matching, so PCRE_ERROR_PARTIAL is returned instead of
- PCRE_ERROR_NOMATCH. The offsets are set to 3 and 9, identifying
- "123dog" as the first partial match that was found. (In this example,
- there are two partial matches, because "dog" on its own partially
- matches the second alternative.)
+ If this is matched against the subject string "abc123dog", both alter-
+ natives fail to match, but the end of the subject is reached during
+ matching, so PCRE_ERROR_PARTIAL is returned. The offsets are set to 3
+ and 9, identifying "123dog" as the first partial match that was found.
+ (In this example, there are two partial matches, because "dog" on its
+ own partially matches the second alternative.)
+
+ PCRE_PARTIAL_HARD with pcre_exec()
If PCRE_PARTIAL_HARD is set for pcre_exec(), it returns PCRE_ERROR_PAR-
TIAL as soon as a partial match is found, without continuing to search
- for possible complete matches. The difference between the two options
- can be illustrated by a pattern such as:
+ for possible complete matches. This option is "hard" because it prefers
+ an earlier partial match over a later complete match. For this reason,
+ the assumption is made that the end of the supplied subject string may
+ not be the true end of the available data, and so, if \z, \Z, \b, \B,
+ or $ are encountered at the end of the subject, the result is
+ PCRE_ERROR_PARTIAL.
+
+ Comparing hard and soft partial matching
+
+ The difference between the two partial matching options can be illus-
+ trated by a pattern such as:
/dog(sbody)?/
- This matches either "dog" or "dogsbody", greedily (that is, it prefers
- the longer string if possible). If it is matched against the string
- "dog" with PCRE_PARTIAL_SOFT, it yields a complete match for "dog".
+ This matches either "dog" or "dogsbody", greedily (that is, it prefers
+ the longer string if possible). If it is matched against the string
+ "dog" with PCRE_PARTIAL_SOFT, it yields a complete match for "dog".
However, if PCRE_PARTIAL_HARD is set, the result is PCRE_ERROR_PARTIAL.
- On the other hand, if the pattern is made ungreedy the result is dif-
+ On the other hand, if the pattern is made ungreedy the result is dif-
ferent:
/dog(sbody)??/
- In this case the result is always a complete match because pcre_exec()
- finds that first, and it never continues after finding a match. It
- might be easier to follow this explanation by thinking of the two pat-
+ In this case the result is always a complete match because pcre_exec()
+ finds that first, and it never continues after finding a match. It
+ might be easier to follow this explanation by thinking of the two pat-
terns like this:
/dog(sbody)?/ is the same as /dogsbody|dog/
/dog(sbody)??/ is the same as /dog|dogsbody/
- The second pattern will never match "dogsbody" when pcre_exec() is
+ The second pattern will never match "dogsbody" when pcre_exec() is
used, because it will always find the shorter match first.
PARTIAL MATCHING USING pcre_dfa_exec()
- The pcre_dfa_exec() function moves along the subject string character
- by character, without backtracking, searching for all possible matches
- simultaneously. If the end of the subject is reached before the end of
- the pattern, there is the possibility of a partial match, again pro-
- vided that at least one character has matched.
-
- When PCRE_PARTIAL_SOFT is set, PCRE_ERROR_PARTIAL is returned only if
- there have been no complete matches. Otherwise, the complete matches
- are returned. However, if PCRE_PARTIAL_HARD is set, a partial match
- takes precedence over any complete matches. The portion of the string
- that was inspected when the longest partial match was found is set as
+ The pcre_dfa_exec() function moves along the subject string character
+ by character, without backtracking, searching for all possible matches
+ simultaneously. If the end of the subject is reached before the end of
+ the pattern, there is the possibility of a partial match, again pro-
+ vided that at least one character has been inspected.
+
+ When PCRE_PARTIAL_SOFT is set, PCRE_ERROR_PARTIAL is returned only if
+ there have been no complete matches. Otherwise, the complete matches
+ are returned. However, if PCRE_PARTIAL_HARD is set, a partial match
+ takes precedence over any complete matches. The portion of the string
+ that was inspected when the longest partial match was found is set as
the first matching string, provided there are at least two slots in the
offsets vector.
- Because pcre_dfa_exec() always searches for all possible matches, and
- there is no difference between greedy and ungreedy repetition, its be-
+ Because pcre_dfa_exec() always searches for all possible matches, and
+ there is no difference between greedy and ungreedy repetition, its be-
haviour is different from pcre_exec when PCRE_PARTIAL_HARD is set. Con-
- sider the string "dog" matched against the ungreedy pattern shown
+ sider the string "dog" matched against the ungreedy pattern shown
above:
/dog(sbody)??/
- Whereas pcre_exec() stops as soon as it finds the complete match for
+ Whereas pcre_exec() stops as soon as it finds the complete match for
"dog", pcre_dfa_exec() also finds the partial match for "dogsbody", and
so returns that when PCRE_PARTIAL_HARD is set.
PARTIAL MATCHING AND WORD BOUNDARIES
- If a pattern ends with one of sequences \b or \B, which test for word
- boundaries, partial matching with PCRE_PARTIAL_SOFT can give counter-
+ If a pattern ends with one of sequences \b or \B, which test for word
+ boundaries, partial matching with PCRE_PARTIAL_SOFT can give counter-
intuitive results. Consider this pattern:
/\bcat\b/
This matches "cat", provided there is a word boundary at either end. If
the subject string is "the cat", the comparison of the final "t" with a
- following character cannot take place, so a partial match is found.
- However, pcre_exec() carries on with normal matching, which matches \b
- at the end of the subject when the last character is a letter, thus
+ following character cannot take place, so a partial match is found.
+ However, pcre_exec() carries on with normal matching, which matches \b
+ at the end of the subject when the last character is a letter, thus
finding a complete match. The result, therefore, is not PCRE_ERROR_PAR-
- TIAL. The same thing happens with pcre_dfa_exec(), because it also
+ TIAL. The same thing happens with pcre_dfa_exec(), because it also
finds the complete match.
- Using PCRE_PARTIAL_HARD in this case does yield PCRE_ERROR_PARTIAL,
+ Using PCRE_PARTIAL_HARD in this case does yield PCRE_ERROR_PARTIAL,
because then the partial match takes precedence.
FORMERLY RESTRICTED PATTERNS
For releases of PCRE prior to 8.00, because of the way certain internal
- optimizations were implemented in the pcre_exec() function, the
- PCRE_PARTIAL option (predecessor of PCRE_PARTIAL_SOFT) could not be
- used with all patterns. From release 8.00 onwards, the restrictions no
- longer apply, and partial matching with pcre_exec() can be requested
+ optimizations were implemented in the pcre_exec() function, the
+ PCRE_PARTIAL option (predecessor of PCRE_PARTIAL_SOFT) could not be
+ used with all patterns. From release 8.00 onwards, the restrictions no
+ longer apply, and partial matching with pcre_exec() can be requested
for any pattern.
Items that were formerly restricted were repeated single characters and
- repeated metasequences. If PCRE_PARTIAL was set for a pattern that did
- not conform to the restrictions, pcre_exec() returned the error code
- PCRE_ERROR_BADPARTIAL (-13). This error code is no longer in use. The
- PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to find out if a compiled
+ repeated metasequences. If PCRE_PARTIAL was set for a pattern that did
+ not conform to the restrictions, pcre_exec() returned the error code
+ PCRE_ERROR_BADPARTIAL (-13). This error code is no longer in use. The
+ PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to find out if a compiled
pattern can be used for partial matching now always returns 1.
EXAMPLE OF PARTIAL MATCHING USING PCRETEST
- If the escape sequence \P is present in a pcretest data line, the
- PCRE_PARTIAL_SOFT option is used for the match. Here is a run of
+ If the escape sequence \P is present in a pcretest data line, the
+ PCRE_PARTIAL_SOFT option is used for the match. Here is a run of
pcretest that uses the date example quoted above:
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
@@ -6118,24 +6231,24 @@ EXAMPLE OF PARTIAL MATCHING USING PCRETEST
data> j\P
No match
- The first data string is matched completely, so pcretest shows the
- matched substrings. The remaining four strings do not match the com-
+ The first data string is matched completely, so pcretest shows the
+ matched substrings. The remaining four strings do not match the com-
plete pattern, but the first two are partial matches. Similar output is
obtained when pcre_dfa_exec() is used.
- If the escape sequence \P is present more than once in a pcretest data
+ If the escape sequence \P is present more than once in a pcretest data
line, the PCRE_PARTIAL_HARD option is set for the match.
MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()
When a partial match has been found using pcre_dfa_exec(), it is possi-
- ble to continue the match by providing additional subject data and
- calling pcre_dfa_exec() again with the same compiled regular expres-
- sion, this time setting the PCRE_DFA_RESTART option. You must pass the
+ ble to continue the match by providing additional subject data and
+ calling pcre_dfa_exec() again with the same compiled regular expres-
+ sion, this time setting the PCRE_DFA_RESTART option. You must pass the
same working space as before, because this is where details of the pre-
- vious partial match are stored. Here is an example using pcretest,
- using the \R escape sequence to set the PCRE_DFA_RESTART option (\D
+ vious partial match are stored. Here is an example using pcretest,
+ using the \R escape sequence to set the PCRE_DFA_RESTART option (\D
specifies the use of pcre_dfa_exec()):
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
@@ -6144,30 +6257,33 @@ MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()
data> n05\R\D
0: n05
- The first call has "23ja" as the subject, and requests partial match-
- ing; the second call has "n05" as the subject for the continued
- (restarted) match. Notice that when the match is complete, only the
- last part is shown; PCRE does not retain the previously partially-
- matched string. It is up to the calling program to do that if it needs
+ The first call has "23ja" as the subject, and requests partial match-
+ ing; the second call has "n05" as the subject for the continued
+ (restarted) match. Notice that when the match is complete, only the
+ last part is shown; PCRE does not retain the previously partially-
+ matched string. It is up to the calling program to do that if it needs
to.
- You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with
- PCRE_DFA_RESTART to continue partial matching over multiple segments.
- This facility can be used to pass very long subject strings to
+ You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with
+ PCRE_DFA_RESTART to continue partial matching over multiple segments.
+ This facility can be used to pass very long subject strings to
pcre_dfa_exec().
MULTI-SEGMENT MATCHING WITH pcre_exec()
- From release 8.00, pcre_exec() can also be used to do multi-segment
- matching. Unlike pcre_dfa_exec(), it is not possible to restart the
- previous match with a new segment of data. Instead, new data must be
- added to the previous subject string, and the entire match re-run,
- starting from the point where the partial match occurred. Earlier data
- can be discarded. Consider an unanchored pattern that matches dates:
+ From release 8.00, pcre_exec() can also be used to do multi-segment
+ matching. Unlike pcre_dfa_exec(), it is not possible to restart the
+ previous match with a new segment of data. Instead, new data must be
+ added to the previous subject string, and the entire match re-run,
+ starting from the point where the partial match occurred. Earlier data
+ can be discarded. It is best to use PCRE_PARTIAL_HARD in this situa-
+ tion, because it does not treat the end of a segment as the end of the
+ subject when matching \z, \Z, \b, \B, and $. Consider an unanchored
+ pattern that matches dates:
re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
- data> The date is 23ja\P
+ data> The date is 23ja\P\P
Partial match: 23ja
At this stage, an application could discard the text preceding "23ja",
@@ -6188,29 +6304,30 @@ ISSUES WITH MULTI-SEGMENT MATCHING
Certain types of pattern may give problems with multi-segment matching,
whichever matching function is used.
- 1. If the pattern contains tests for the beginning or end of a line,
- you need to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropri-
- ate, when the subject string for any call does not contain the begin-
- ning or end of a line.
-
- 2. Lookbehind assertions at the start of a pattern are catered for in
- the offsets that are returned for a partial match. However, in theory,
- a lookbehind assertion later in the pattern could require even earlier
- characters to be inspected, and it might not have been reached when a
- partial match occurs. This is probably an extremely unlikely case; you
- could guard against it to a certain extent by always including extra
+ 1. If the pattern contains a test for the beginning of a line, you need
+ to pass the PCRE_NOTBOL option when the subject string for any call
+ does start at the beginning of a line. There is also a PCRE_NOTEOL
+ option, but in practice when doing multi-segment matching you should be
+ using PCRE_PARTIAL_HARD, which includes the effect of PCRE_NOTEOL.
+
+ 2. Lookbehind assertions at the start of a pattern are catered for in
+ the offsets that are returned for a partial match. However, in theory,
+ a lookbehind assertion later in the pattern could require even earlier
+ characters to be inspected, and it might not have been reached when a
+ partial match occurs. This is probably an extremely unlikely case; you
+ could guard against it to a certain extent by always including extra
characters at the start.
- 3. Matching a subject string that is split into multiple segments may
- not always produce exactly the same result as matching over one single
- long string, especially when PCRE_PARTIAL_SOFT is used. The section
- "Partial Matching and Word Boundaries" above describes an issue that
- arises if the pattern ends with \b or \B. Another kind of difference
- may occur when there are multiple matching possibilities, because a
- partial match result is given only when there are no completed matches.
- This means that as soon as the shortest match has been found, continua-
- tion to a new subject segment is no longer possible. Consider again
- this pcretest example:
+ 3. Matching a subject string that is split into multiple segments may
+ not always produce exactly the same result as matching over one single
+ long string, especially when PCRE_PARTIAL_SOFT is used. The section
+ "Partial Matching and Word Boundaries" above describes an issue that
+ arises if the pattern ends with \b or \B. Another kind of difference
+ may occur when there are multiple matching possibilities, because (for
+ PCRE_PARTIAL_SOFT) a partial match result is given only when there are
+ no completed matches. This means that as soon as the shortest match has
+ been found, continuation to a new subject segment is no longer possi-
+ ble. Consider again this pcretest example:
re> /dog(sbody)?/
data> dogsb\P
@@ -6223,18 +6340,18 @@ ISSUES WITH MULTI-SEGMENT MATCHING
0: dogsbody
1: dog
- The first data line passes the string "dogsb" to pcre_exec(), setting
- the PCRE_PARTIAL_SOFT option. Although the string is a partial match
- for "dogsbody", the result is not PCRE_ERROR_PARTIAL, because the
- shorter string "dog" is a complete match. Similarly, when the subject
- is presented to pcre_dfa_exec() in several parts ("do" and "gsb" being
+ The first data line passes the string "dogsb" to pcre_exec(), setting
+ the PCRE_PARTIAL_SOFT option. Although the string is a partial match
+ for "dogsbody", the result is not PCRE_ERROR_PARTIAL, because the
+ shorter string "dog" is a complete match. Similarly, when the subject
+ is presented to pcre_dfa_exec() in several parts ("do" and "gsb" being
the first two) the match stops when "dog" has been found, and it is not
- possible to continue. On the other hand, if "dogsbody" is presented as
+ possible to continue. On the other hand, if "dogsbody" is presented as
a single string, pcre_dfa_exec() finds both matches.
- Because of these problems, it is probably best to use PCRE_PARTIAL_HARD
- when matching multi-segment data. The example above then behaves dif-
- ferently:
+ Because of these problems, it is best to use PCRE_PARTIAL_HARD when
+ matching multi-segment data. The example above then behaves differ-
+ ently:
re> /dog(sbody)?/
data> dogsb\P\P
@@ -6246,39 +6363,39 @@ ISSUES WITH MULTI-SEGMENT MATCHING
4. Patterns that contain alternatives at the top level which do not all
- start with the same pattern item may not work as expected when
- PCRE_DFA_RESTART is used with pcre_dfa_exec(). For example, consider
+ start with the same pattern item may not work as expected when
+ PCRE_DFA_RESTART is used with pcre_dfa_exec(). For example, consider
this pattern:
1234|3789
- If the first part of the subject is "ABC123", a partial match of the
- first alternative is found at offset 3. There is no partial match for
+ If the first part of the subject is "ABC123", a partial match of the
+ first alternative is found at offset 3. There is no partial match for
the second alternative, because such a match does not start at the same
- point in the subject string. Attempting to continue with the string
- "7890" does not yield a match because only those alternatives that
- match at one point in the subject are remembered. The problem arises
- because the start of the second alternative matches within the first
- alternative. There is no problem with anchored patterns or patterns
+ point in the subject string. Attempting to continue with the string
+ "7890" does not yield a match because only those alternatives that
+ match at one point in the subject are remembered. The problem arises
+ because the start of the second alternative matches within the first
+ alternative. There is no problem with anchored patterns or patterns
such as:
1234|ABCD
- where no string can be a partial match for both alternatives. This is
- not a problem if pcre_exec() is used, because the entire match has to
+ where no string can be a partial match for both alternatives. This is
+ not a problem if pcre_exec() is used, because the entire match has to
be rerun each time:
re> /1234|3789/
- data> ABC123\P
+ data> ABC123\P\P
Partial match: 123
data> 1237890
0: 3789
- Of course, instead of using PCRE_DFA_PARTIAL, the same technique of re-
+ Of course, instead of using PCRE_DFA_RESTART, the same technique of re-
running the entire match can also be used with pcre_dfa_exec(). Another
possibility is to work with two buffers. If a partial match at offset n
- in the first buffer is followed by "no match" when PCRE_DFA_RESTART is
- used on the second buffer, you can then try a new match starting at
+ in the first buffer is followed by "no match" when PCRE_DFA_RESTART is
+ used on the second buffer, you can then try a new match starting at
offset n+1 in the first buffer.
@@ -6291,11 +6408,11 @@ AUTHOR
REVISION
- Last updated: 19 October 2009
- Copyright (c) 1997-2009 University of Cambridge.
+ Last updated: 22 October 2010
+ Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPRECOMPILE(3) PCREPRECOMPILE(3)
@@ -6418,8 +6535,8 @@ REVISION
Last updated: 13 June 2007
Copyright (c) 1997-2007 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPERFORM(3) PCREPERFORM(3)
@@ -6586,8 +6703,8 @@ REVISION
Last updated: 16 May 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPOSIX(3) PCREPOSIX(3)
@@ -6849,8 +6966,8 @@ REVISION
Last updated: 16 May 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECPP(3) PCRECPP(3)
@@ -7190,8 +7307,8 @@ REVISION
Last updated: 17 March 2009
------------------------------------------------------------------------------
-
-
+
+
PCRESAMPLE(3) PCRESAMPLE(3)
@@ -7426,5 +7543,5 @@ REVISION
Last updated: 03 January 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index b56ee7e..e3a9869 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -1515,7 +1515,8 @@ in the main
\fBpcre\fP
.\"
page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_exec()\fP returns
-the error PCRE_ERROR_BADUTF8. If \fIstartoffset\fP contains an invalid value,
+the error PCRE_ERROR_BADUTF8. If \fIstartoffset\fP contains a value that does
+not point to the start of a UTF-8 character (or to the end of the subject),
PCRE_ERROR_BADUTF8_OFFSET is returned.
.P
If you already know that your subject is valid, and you want to skip these
@@ -1523,10 +1524,10 @@ checks for performance reasons, you can set the PCRE_NO_UTF8_CHECK option when
calling \fBpcre_exec()\fP. You might want to do this for the second and
subsequent calls to \fBpcre_exec()\fP if you are making repeated calls to find
all the matches in a single subject string. However, you should be sure that
-the value of \fIstartoffset\fP points to the start of a UTF-8 character. When
-PCRE_NO_UTF8_CHECK is set, the effect of passing an invalid UTF-8 string as a
-subject, or a value of \fIstartoffset\fP that does not point to the start of a
-UTF-8 character, is undefined. Your program may crash.
+the value of \fIstartoffset\fP points to the start of a UTF-8 character (or the
+end of the subject). When PCRE_NO_UTF8_CHECK is set, the effect of passing an
+invalid UTF-8 string as a subject or an invalid value of \fIstartoffset\fP is
+undefined. Your program may crash.
.sp
PCRE_PARTIAL_HARD
PCRE_PARTIAL_SOFT
@@ -1555,15 +1556,20 @@ discussion of partial and multi-segment matching, with examples, in the
.\"
documentation.
.
+.
.SS "The string to be matched by \fBpcre_exec()\fP"
.rs
.sp
The subject string is passed to \fBpcre_exec()\fP as a pointer in
\fIsubject\fP, a length (in bytes) in \fIlength\fP, and a starting byte offset
-in \fIstartoffset\fP. In UTF-8 mode, the byte offset must point to the start of
-a UTF-8 character. Unlike the pattern string, the subject may contain binary
-zero bytes. When the starting offset is zero, the search for a match starts at
-the beginning of the subject, and this is by far the most common case.
+in \fIstartoffset\fP. If this is negative or greater than the length of the
+subject, \fBpcre_exec()\fP returns PCRE_ERROR_BADOFFSET.
+.P
+In UTF-8 mode, the byte offset must point to the start of a UTF-8 character (or
+the end of the subject). Unlike the pattern string, the subject may contain
+binary zero bytes. When the starting offset is zero, the search for a match
+starts at the beginning of the subject, and this is by far the most common
+case.
.P
A non-zero starting offset is useful when searching for another match in the
same subject by calling \fBpcre_exec()\fP again after a previous success.
@@ -1583,10 +1589,25 @@ start of the subject, which is deemed to be a word boundary. However, if
set to 4, it finds the second occurrence of "iss" because it is able to look
behind the starting point to discover that it is preceded by a letter.
.P
+Finding all the matches in a subject is tricky when the pattern can match an
+empty string. It is possible to emulate Perl's /g behaviour by first trying the
+match again at the same offset, with the PCRE_NOTEMPTY_ATSTART and
+PCRE_ANCHORED options, and then if that fails, advancing the starting offset
+and trying an ordinary match again. There is some code that demonstrates how to
+do this in the
+.\" HREF
+\fBpcredemo\fP
+.\"
+sample program. In the most general case, you have to check to see if the
+newline convention recognizes CRLF as a newline, and if so, and the current
+character is CR followed by LF, advance the starting offset by two characters
+instead of one.
+.P
If a non-zero starting offset is passed when the pattern is anchored, one
attempt to match at the given offset is made. This can only succeed if the
pattern does not require the match to be at the start of the subject.
.
+.
.SS "How \fBpcre_exec()\fP returns captured substrings"
.rs
.sp
@@ -1769,6 +1790,11 @@ description above.
PCRE_ERROR_BADNEWLINE (-23)
.sp
An invalid combination of PCRE_NEWLINE_\fIxxx\fP options was given.
+.sp
+ PCRE_ERROR_BADOFFSET (-24)
+.sp
+The value of \fIstartoffset\fP was negative or greater than the length of the
+subject, that is, the value in \fIlength\fP.
.P
Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
.
@@ -2196,6 +2222,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 01 November 2010
+Last updated: 06 November 2010
Copyright (c) 1997-2010 University of Cambridge.
.fi
diff --git a/doc/pcredemo.3 b/doc/pcredemo.3
index 01decf7..c419086 100644
--- a/doc/pcredemo.3
+++ b/doc/pcredemo.3
@@ -67,13 +67,16 @@ const char *error;
char *pattern;
char *subject;
unsigned char *name_table;
+unsigned int option_bits;
int erroffset;
int find_all;
+int crlf_is_newline;
int namecount;
int name_entry_size;
int ovector[OVECCOUNT];
int subject_length;
int rc, i;
+int utf8;
/**************************************************************************
@@ -255,15 +258,56 @@ if (namecount <= 0) printf("No named substrings\en"); else
* subject is not a valid match; other possibilities must be tried. The *
* second flag restricts PCRE to one match attempt at the initial string *
* position. If this match succeeds, an alternative to the empty string *
-* match has been found, and we can proceed round the loop. *
+* match has been found, and we can print it and proceed round the loop, *
+* advancing by the length of whatever was found. If this match does not *
+* succeed, we still stay in the loop, advancing by just one character. *
+* In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be *
+* more than one byte. *
+* *
+* However, there is a complication concerned with newlines. When the *
+* newline convention is such that CRLF is a valid newline, we want must *
+* advance by two characters rather than one. The newline convention can *
+* be set in the regex by (*CR), etc.; if not, we must find the default. *
*************************************************************************/
-if (!find_all)
+if (!find_all) /* Check for -g */
{
pcre_free(re); /* Release the memory used for the compiled pattern */
return 0; /* Finish unless -g was given */
}
+/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
+sequence. First, find the options with which the regex was compiled; extract
+the UTF-8 state, and mask off all but the newline options. */
+
+(void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits);
+utf8 = option_bits & PCRE_UTF8;
+option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
+ PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
+
+/* If no newline options were set, find the default newline convention from the
+build configuration. */
+
+if (option_bits == 0)
+ {
+ int d;
+ (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
+ /* Note that these values are always the ASCII ones, even in
+ EBCDIC environments. CR = 13, NL = 10. */
+ option_bits = (d == 13)? PCRE_NEWLINE_CR :
+ (d == 10)? PCRE_NEWLINE_LF :
+ (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
+ (d == -2)? PCRE_NEWLINE_ANYCRLF :
+ (d == -1)? PCRE_NEWLINE_ANY : 0;
+ }
+
+/* See if CRLF is a valid newline sequence. */
+
+crlf_is_newline =
+ option_bits == PCRE_NEWLINE_ANY ||
+ option_bits == PCRE_NEWLINE_CRLF ||
+ option_bits == PCRE_NEWLINE_ANYCRLF;
+
/* Loop for second and subsequent matches */
for (;;)
@@ -297,14 +341,32 @@ for (;;)
is zero, it just means we have found all possible matches, so the loop ends.
Otherwise, it means we have failed to find a non-empty-string match at a
point where there was a previous empty-string match. In this case, we do what
- Perl does: advance the matching position by one, and continue. We do this by
- setting the "end of previous match" offset, because that is picked up at the
- top of the loop as the point at which to start again. */
+ Perl does: advance the matching position by one character, and continue. We
+ do this by setting the "end of previous match" offset, because that is picked
+ up at the top of the loop as the point at which to start again.
+
+ There are two complications: (a) When CRLF is a valid newline sequence, and
+ the current position is just before it, advance by an extra byte. (b)
+ Otherwise we must ensure that we skip an entire UTF-8 character if we are in
+ UTF-8 mode. */
if (rc == PCRE_ERROR_NOMATCH)
{
- if (options == 0) break;
- ovector[1] = start_offset + 1;
+ if (options == 0) break; /* All matches found */
+ ovector[1] = start_offset + 1; /* Advance one byte */
+ if (crlf_is_newline && /* If CRLF is newline & */
+ start_offset < subject_length - 1 && /* we are at CRLF, */
+ subject[start_offset] == '\er' &&
+ subject[start_offset + 1] == '\en')
+ ovector[1] += 1; /* Advance by one more. */
+ else if (utf8) /* Otherwise, ensure we */
+ { /* advance a whole UTF-8 */
+ while (ovector[1] < subject_length) /* character. */
+ {
+ if ((subject[ovector[1]] & 0xc0) != 0x80) break;
+ ovector[1] += 1;
+ }
+ }
continue; /* Go round the loop again */
}
diff --git a/doc/pcregrep.txt b/doc/pcregrep.txt
index acc5fc5..7f9c31d 100644
--- a/doc/pcregrep.txt
+++ b/doc/pcregrep.txt
@@ -347,90 +347,140 @@ OPTIONS
library's default (usually the "C" locale) is used. There is
no short form for this option.
+ --match-limit=number
+ Processing some regular expression patterns can require a
+ very large amount of memory, leading in some cases to a pro-
+ gram crash if not enough is available. Other patterns may
+ take a very long time to search for all possible matching
+ strings. The pcre_exec() function that is called by pcregrep
+ to do the matching has two parameters that can limit the
+ resources that it uses.
+
+ The --match-limit option provides a means of limiting
+ resource usage when processing patterns that are not going to
+ match, but which have a very large number of possibilities in
+ their search trees. The classic example is a pattern that
+ uses nested unlimited repeats. Internally, PCRE uses a func-
+ tion called match() which it calls repeatedly (sometimes
+ recursively). The limit set by --match-limit is imposed on
+ the number of times this function is called during a match,
+ which has the effect of limiting the amount of backtracking
+ that can take place.
+
+ The --recursion-limit option is similar to --match-limit, but
+ instead of limiting the total number of times that match() is
+ called, it limits the depth of recursive calls, which in turn
+ limits the amount of memory that can be used. The recursion
+ depth is a smaller number than the total number of calls,
+ because not all calls to match() are recursive. This limit is
+ of use only if it is set smaller than --match-limit.
+
+ There are no short forms for these options. The default set-
+ tings are specified when the PCRE library is compiled, with
+ the default default being 10 million.
+
-M, --multiline
- Allow patterns to match more than one line. When this option
+ Allow patterns to match more than one line. When this option
is given, patterns may usefully contain literal newline char-
- acters and internal occurrences of ^ and $ characters. The
- output for any one match may consist of more than one line.
- When this option is set, the PCRE library is called in "mul-
- tiline" mode. There is a limit to the number of lines that
- can be matched, imposed by the way that pcregrep buffers the
- input file as it scans it. However, pcregrep ensures that at
+ acters and internal occurrences of ^ and $ characters. The
+ output for any one match may consist of more than one line.
+ When this option is set, the PCRE library is called in "mul-
+ tiline" mode. There is a limit to the number of lines that
+ can be matched, imposed by the way that pcregrep buffers the
+ input file as it scans it. However, pcregrep ensures that at
least 8K characters or the rest of the document (whichever is
- the shorter) are available for forward matching, and simi-
+ the shorter) are available for forward matching, and simi-
larly the previous 8K characters (or all the previous charac-
- ters, if fewer than 8K) are guaranteed to be available for
- lookbehind assertions. This option does not work when input
+ ters, if fewer than 8K) are guaranteed to be available for
+ lookbehind assertions. This option does not work when input
is read line by line (see --line-buffered.)
-N newline-type, --newline=newline-type
- The PCRE library supports five different conventions for
- indicating the ends of lines. They are the single-character
- sequences CR (carriage return) and LF (linefeed), the two-
- character sequence CRLF, an "anycrlf" convention, which rec-
- ognizes any of the preceding three types, and an "any" con-
+ The PCRE library supports five different conventions for
+ indicating the ends of lines. They are the single-character
+ sequences CR (carriage return) and LF (linefeed), the two-
+ character sequence CRLF, an "anycrlf" convention, which rec-
+ ognizes any of the preceding three types, and an "any" con-
vention, in which any Unicode line ending sequence is assumed
- to end a line. The Unicode sequences are the three just men-
- tioned, plus VT (vertical tab, U+000B), FF (formfeed,
- U+000C), NEL (next line, U+0085), LS (line separator,
+ to end a line. The Unicode sequences are the three just men-
+ tioned, plus VT (vertical tab, U+000B), FF (formfeed,
+ U+000C), NEL (next line, U+0085), LS (line separator,
U+2028), and PS (paragraph separator, U+2029).
When the PCRE library is built, a default line-ending
- sequence is specified. This is normally the standard
+ sequence is specified. This is normally the standard
sequence for the operating system. Unless otherwise specified
- by this option, pcregrep uses the library's default. The
+ by this option, pcregrep uses the library's default. The
possible values for this option are CR, LF, CRLF, ANYCRLF, or
- ANY. This makes it possible to use pcregrep on files that
- have come from other environments without having to modify
- their line endings. If the data that is being scanned does
- not agree with the convention set by this option, pcregrep
+ ANY. This makes it possible to use pcregrep on files that
+ have come from other environments without having to modify
+ their line endings. If the data that is being scanned does
+ not agree with the convention set by this option, pcregrep
may behave in strange ways.
-n, --line-number
Precede each output line by its line number in the file, fol-
- lowed by a colon for matching lines or a hyphen for context
- lines. If the filename is also being output, it precedes the
+ lowed by a colon for matching lines or a hyphen for context
+ lines. If the filename is also being output, it precedes the
line number. This option is forced if --line-offsets is used.
-o, --only-matching
- Show only the part of the line that matched a pattern. In
- this mode, no context is shown. That is, the -A, -B, and -C
- options are ignored. If there is more than one match in a
- line, each of them is shown separately. If -o is combined
- with -v (invert the sense of the match to find non-matching
- lines), no output is generated, but the return code is set
- appropriately. This option is mutually exclusive with --file-
- offsets and --line-offsets.
+ Show only the part of the line that matched a pattern instead
+ of the whole line. In this mode, no context is shown. That
+ is, the -A, -B, and -C options are ignored. If there is more
+ than one match in a line, each of them is shown separately.
+ If -o is combined with -v (invert the sense of the match to
+ find non-matching lines), no output is generated, but the
+ return code is set appropriately. If the matched portion of
+ the line is empty, nothing is output unless the file name or
+ line number are being printed, in which case they are shown
+ on an otherwise empty line. This option is mutually exclusive
+ with --file-offsets and --line-offsets.
+
+ -onumber, --only-matching=number
+ Show only the part of the line that matched the capturing
+ parentheses of the given number. Up to 32 capturing parenthe-
+ ses are supported. Because these options can be given without
+ an argument (see above), if an argument is present, it must
+ be given in the same shell item, for example, -o3 or --only-
+ matching=2. The comments given for the non-argument case
+ above also apply to this case. If the specified capturing
+ parentheses do not exist in the pattern, or were not set in
+ the match, nothing is output unless the file name or line
+ number are being printed.
-q, --quiet
Work quietly, that is, display nothing except error messages.
- The exit status indicates whether or not any matches were
+ The exit status indicates whether or not any matches were
found.
-r, --recursive
- If any given path is a directory, recursively scan the files
- it contains, taking note of any --include and --exclude set-
- tings. By default, a directory is read as a normal file; in
- some operating systems this gives an immediate end-of-file.
- This option is a shorthand for setting the -d option to
+ If any given path is a directory, recursively scan the files
+ it contains, taking note of any --include and --exclude set-
+ tings. By default, a directory is read as a normal file; in
+ some operating systems this gives an immediate end-of-file.
+ This option is a shorthand for setting the -d option to
"recurse".
+ --recursion-limit=number
+ See --match-limit above.
+
-s, --no-messages
- Suppress error messages about non-existent or unreadable
- files. Such files are quietly skipped. However, the return
+ Suppress error messages about non-existent or unreadable
+ files. Such files are quietly skipped. However, the return
code is still 2, even if matches were found in other files.
-u, --utf-8
- Operate in UTF-8 mode. This option is available only if PCRE
- has been compiled with UTF-8 support. Both patterns and sub-
+ Operate in UTF-8 mode. This option is available only if PCRE
+ has been compiled with UTF-8 support. Both patterns and sub-
ject lines must be valid strings of UTF-8 characters.
-V, --version
- Write the version numbers of pcregrep and the PCRE library
+ Write the version numbers of pcregrep and the PCRE library
that is being used to the standard error stream.
-v, --invert-match
- Invert the sense of the match, so that lines which do not
+ Invert the sense of the match, so that lines which do not
match any of the patterns are the ones that are found.
-w, --word-regex, --word-regexp
@@ -438,38 +488,38 @@ OPTIONS
lent to having \b at the start and end of the pattern.
-x, --line-regex, --line-regexp
- Force the patterns to be anchored (each must start matching
- at the beginning of a line) and in addition, require them to
- match entire lines. This is equivalent to having ^ and $
+ Force the patterns to be anchored (each must start matching
+ at the beginning of a line) and in addition, require them to
+ match entire lines. This is equivalent to having ^ and $
characters at the start and end of each alternative branch in
every pattern.
ENVIRONMENT VARIABLES
- The environment variables LC_ALL and LC_CTYPE are examined, in that
- order, for a locale. The first one that is set is used. This can be
- overridden by the --locale option. If no locale is set, the PCRE
+ The environment variables LC_ALL and LC_CTYPE are examined, in that
+ order, for a locale. The first one that is set is used. This can be
+ overridden by the --locale option. If no locale is set, the PCRE
library's default (usually the "C" locale) is used.
NEWLINES
- The -N (--newline) option allows pcregrep to scan files with different
- newline conventions from the default. However, the setting of this
- option does not affect the way in which pcregrep writes information to
- the standard error and output streams. It uses the string "\n" in C
- printf() calls to indicate newlines, relying on the C I/O library to
- convert this to an appropriate sequence if the output is sent to a
+ The -N (--newline) option allows pcregrep to scan files with different
+ newline conventions from the default. However, the setting of this
+ option does not affect the way in which pcregrep writes information to
+ the standard error and output streams. It uses the string "\n" in C
+ printf() calls to indicate newlines, relying on the C I/O library to
+ convert this to an appropriate sequence if the output is sent to a
file.
OPTIONS COMPATIBILITY
The majority of short and long forms of pcregrep's options are the same
- as in the GNU grep program. Any long option of the form --xxx-regexp
- (GNU terminology) is also available as --xxx-regex (PCRE terminology).
- However, the --locale, -M, --multiline, -u, and --utf-8 options are
+ as in the GNU grep program. Any long option of the form --xxx-regexp
+ (GNU terminology) is also available as --xxx-regex (PCRE terminology).
+ However, the --locale, -M, --multiline, -u, and --utf-8 options are
specific to pcregrep. If both the -c and -l options are given, GNU grep
lists only file names, without counts, but pcregrep gives the counts.
@@ -477,48 +527,48 @@ OPTIONS COMPATIBILITY
OPTIONS WITH DATA
There are four different ways in which an option with data can be spec-
- ified. If a short form option is used, the data may follow immedi-
+ ified. If a short form option is used, the data may follow immedi-
ately, or in the next command line item. For example:
-f/some/file
-f /some/file
- If a long form option is used, the data may appear in the same command
+ If a long form option is used, the data may appear in the same command
line item, separated by an equals character, or (with one exception) it
may appear in the next command line item. For example:
--file=/some/file
--file /some/file
- Note, however, that if you want to supply a file name beginning with ~
- as data in a shell command, and have the shell expand ~ to a home
+ Note, however, that if you want to supply a file name beginning with ~
+ as data in a shell command, and have the shell expand ~ to a home
directory, you must separate the file name from the option, because the
shell does not treat ~ specially unless it is at the start of an item.
- The exception to the above is the --colour (or --color) option, for
- which the data is optional. If this option does have data, it must be
- given in the first form, using an equals character. Otherwise it will
+ The exception to the above is the --colour (or --color) option, for
+ which the data is optional. If this option does have data, it must be
+ given in the first form, using an equals character. Otherwise it will
be assumed that it has no data.
MATCHING ERRORS
- It is possible to supply a regular expression that takes a very long
- time to fail to match certain lines. Such patterns normally involve
- nested indefinite repeats, for example: (a+)*\d when matched against a
- line of a's with no final digit. The PCRE matching function has a
- resource limit that causes it to abort in these circumstances. If this
+ It is possible to supply a regular expression that takes a very long
+ time to fail to match certain lines. Such patterns normally involve
+ nested indefinite repeats, for example: (a+)*\d when matched against a
+ line of a's with no final digit. The PCRE matching function has a
+ resource limit that causes it to abort in these circumstances. If this
happens, pcregrep outputs an error message and the line that caused the
- problem to the standard error stream. If there are more than 20 such
+ problem to the standard error stream. If there are more than 20 such
errors, pcregrep gives up.
DIAGNOSTICS
Exit status is 0 if any matches were found, 1 if no matches were found,
- and 2 for syntax errors and non-existent or inacessible files (even if
- matches were found in other files) or too many matching errors. Using
- the -s option to suppress error messages about inaccessble files does
+ and 2 for syntax errors and non-existent or inacessible files (even if
+ matches were found in other files) or too many matching errors. Using
+ the -s option to suppress error messages about inaccessble files does
not affect the return code.
@@ -536,5 +586,5 @@ AUTHOR
REVISION
- Last updated: 21 May 2010
+ Last updated: 31 October 2010
Copyright (c) 1997-2010 University of Cambridge.
diff --git a/doc/pcretest.1 b/doc/pcretest.1
index 37b970c..42fc4fd 100644
--- a/doc/pcretest.1
+++ b/doc/pcretest.1
@@ -414,10 +414,10 @@ recognized:
.\" JOIN
\e? pass the PCRE_NO_UTF8_CHECK option to
\fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP
- \e>dd start the match at offset dd (any number of digits);
.\" JOIN
- this sets the \fIstartoffset\fP argument for \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
+ \e>dd start the match at offset dd (optional "-"; then
+ any number of digits); this sets the \fIstartoffset\fP
+ argument for \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP
.\" JOIN
\e<cr> pass the PCRE_NEWLINE_CR option to \fBpcre_exec()\fP
or \fBpcre_dfa_exec()\fP
@@ -770,6 +770,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 01 November 2010
+Last updated: 06 November 2010
Copyright (c) 1997-2010 University of Cambridge.
.fi
diff --git a/doc/pcretest.txt b/doc/pcretest.txt
index ad362cb..181f1be 100644
--- a/doc/pcretest.txt
+++ b/doc/pcretest.txt
@@ -205,9 +205,11 @@ PATTERN MODIFIERS
string, the next call is done with the PCRE_NOTEMPTY_ATSTART and
PCRE_ANCHORED flags set in order to search for another, non-empty,
match at the same point. If this second match fails, the start offset
- is advanced by one character, and the normal match is retried. This
- imitates the way Perl handles such cases when using the /g modifier or
- the split() function.
+ is advanced, and the normal match is retried. This imitates the way
+ Perl handles such cases when using the /g modifier or the split() func-
+ tion. Normally, the start offset is advanced by one character, but if
+ the newline convention recognizes CRLF as a newline, and the current
+ character is CR followed by LF, an advance of two is used.
Other modifiers
@@ -370,9 +372,9 @@ DATA LINES
or pcre_dfa_exec()
\? pass the PCRE_NO_UTF8_CHECK option to
pcre_exec() or pcre_dfa_exec()
- \>dd start the match at offset dd (any number of digits);
- this sets the startoffset argument for pcre_exec()
- or pcre_dfa_exec()
+ \>dd start the match at offset dd (optional "-"; then
+ any number of digits); this sets the startoffset
+ argument for pcre_exec() or pcre_dfa_exec()
\<cr> pass the PCRE_NEWLINE_CR option to pcre_exec()
or pcre_dfa_exec()
\<lf> pass the PCRE_NEWLINE_LF option to pcre_exec()
@@ -449,8 +451,11 @@ DEFAULT OUTPUT FROM PCRETEST
matched the whole pattern. Otherwise, it outputs "No match" when the
return is PCRE_ERROR_NOMATCH, and "Partial match:" followed by the par-
tially matching substring when pcre_exec() returns PCRE_ERROR_PARTIAL.
- For any other returns, it outputs the PCRE negative error number. Here
- is an example of an interactive pcretest run.
+ (Note that this is the entire substring that was inspected during the
+ partial match; it may include characters before the actual match start
+ if a lookbehind assertion, \K, \b, or \B was involved.) For any other
+ returns, it outputs the PCRE negative error number. Here is an example
+ of an interactive pcretest run.
$ pcretest
PCRE version 7.0 30-Nov-2006
@@ -462,11 +467,11 @@ DEFAULT OUTPUT FROM PCRETEST
data> xyz
No match
- Note that unset capturing substrings that are not followed by one that
- is set are not returned by pcre_exec(), and are not shown by pcretest.
- In the following example, there are two capturing substrings, but when
- the first data line is matched, the second, unset substring is not
- shown. An "internal" unset substring is shown as "<unset>", as for the
+ Note that unset capturing substrings that are not followed by one that
+ is set are not returned by pcre_exec(), and are not shown by pcretest.
+ In the following example, there are two capturing substrings, but when
+ the first data line is matched, the second, unset substring is not
+ shown. An "internal" unset substring is shown as "<unset>", as for the
second data line.
re> /(a)|(b)/
@@ -478,11 +483,11 @@ DEFAULT OUTPUT FROM PCRETEST
1: <unset>
2: b
- If the strings contain any non-printing characters, they are output as
- \0x escapes, or as \x{...} escapes if the /8 modifier was present on
- the pattern. See below for the definition of non-printing characters.
- If the pattern has the /+ modifier, the output for substring 0 is fol-
- lowed by the the rest of the subject string, identified by "0+" like
+ If the strings contain any non-printing characters, they are output as
+ \0x escapes, or as \x{...} escapes if the /8 modifier was present on
+ the pattern. See below for the definition of non-printing characters.
+ If the pattern has the /+ modifier, the output for substring 0 is fol-
+ lowed by the the rest of the subject string, identified by "0+" like
this:
re> /cat/+
@@ -490,7 +495,7 @@ DEFAULT OUTPUT FROM PCRETEST
0: cat
0+ aract
- If the pattern has the /g or /G modifier, the results of successive
+ If the pattern has the /g or /G modifier, the results of successive
matching attempts are output in sequence, like this:
re> /\Bi(\w\w)/g
@@ -504,24 +509,24 @@ DEFAULT OUTPUT FROM PCRETEST
"No match" is output only if the first match attempt fails.
- If any of the sequences \C, \G, or \L are present in a data line that
- is successfully matched, the substrings extracted by the convenience
+ If any of the sequences \C, \G, or \L are present in a data line that
+ is successfully matched, the substrings extracted by the convenience
functions are output with C, G, or L after the string number instead of
a colon. This is in addition to the normal full list. The string length
- (that is, the return from the extraction function) is given in paren-
+ (that is, the return from the extraction function) is given in paren-
theses after each string for \C and \G.
Note that whereas patterns can be continued over several lines (a plain
">" prompt is used for continuations), data lines may not. However new-
- lines can be included in data by means of the \n escape (or \r, \r\n,
+ lines can be included in data by means of the \n escape (or \r, \r\n,
etc., depending on the newline sequence setting).
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
- When the alternative matching function, pcre_dfa_exec(), is used (by
- means of the \D escape sequence or the -dfa command line option), the
- output consists of a list of all the matches that start at the first
+ When the alternative matching function, pcre_dfa_exec(), is used (by
+ means of the \D escape sequence or the -dfa command line option), the
+ output consists of a list of all the matches that start at the first
point in the subject where there is at least one match. For example:
re> /(tang|tangerine|tan)/
@@ -530,10 +535,13 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
1: tang
2: tan
- (Using the normal matching function on this data finds only "tang".)
- The longest matching string is always given first (and numbered zero).
+ (Using the normal matching function on this data finds only "tang".)
+ The longest matching string is always given first (and numbered zero).
After a PCRE_ERROR_PARTIAL return, the output is "Partial match:", fol-
- lowed by the partially matching substring.
+ lowed by the partially matching substring. (Note that this is the
+ entire substring that was inspected during the partial match; it may
+ include characters before the actual match start if a lookbehind asser-
+ tion, \K, \b, or \B was involved.)
If /g is present on the pattern, the search for further matches resumes
at the end of the longest match. For example:
@@ -692,5 +700,5 @@ AUTHOR
REVISION
- Last updated: 14 June 2010
+ Last updated: 06 November 2010
Copyright (c) 1997-2010 University of Cambridge.
diff --git a/pcre.h.in b/pcre.h.in
index c5999f3..deecc7d 100644
--- a/pcre.h.in
+++ b/pcre.h.in
@@ -161,6 +161,7 @@ compile-time only bits for runtime options, or vice versa. */
#define PCRE_ERROR_RECURSIONLIMIT (-21)
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
#define PCRE_ERROR_BADNEWLINE (-23)
+#define PCRE_ERROR_BADOFFSET (-24)
/* Request types for pcre_fullinfo() */
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 3bb2451..d7fbbbf 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -2844,6 +2844,7 @@ if (re == NULL || subject == NULL || workspace == NULL ||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
+if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
/* We need to find the pointer to any study data before we test for byte
flipping, so we scan the extra_data block first. This may set two fields in the
@@ -2966,12 +2967,8 @@ if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
return PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
{
- int tb = ((uschar *)subject)[start_offset];
- if (tb > 127)
- {
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
- }
+ int tb = ((USPTR)subject)[start_offset] & 0xc0;
+ if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
}
}
#endif
diff --git a/pcre_exec.c b/pcre_exec.c
index b1d895d..c4618a6 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -5634,6 +5634,7 @@ if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
if (re == NULL || subject == NULL ||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
+if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
/* This information is for finding all the numbers associated with a given
name, for condition testing. */
@@ -5804,12 +5805,8 @@ if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
return PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
{
- int tb = ((USPTR)subject)[start_offset];
- if (tb > 127)
- {
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
- }
+ int tb = ((USPTR)subject)[start_offset] & 0xc0;
+ if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
}
}
#endif
diff --git a/pcretest.c b/pcretest.c
index 4800876..8802bdd 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -2132,6 +2132,7 @@ while (!done)
int getlist = 0;
int gmatched = 0;
int start_offset = 0;
+ int start_offset_sign = 1;
int g_notempty = 0;
int use_dfa = 0;
@@ -2264,7 +2265,13 @@ while (!done)
continue;
case '>':
+ if (*p == '-')
+ {
+ start_offset_sign = -1;
+ p++;
+ }
while(isdigit(*p)) start_offset = start_offset * 10 + *p++ - '0';
+ start_offset *= start_offset_sign;
continue;
case 'A': /* Option setting */
diff --git a/testdata/testinput2 b/testdata/testinput2
index 8967ba2..caa664b 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -3544,4 +3544,12 @@ with \Y. ---/
abc\P
abc\P\P
+/.+/
+ abc\>0
+ abc\>1
+ abc\>2
+ abc\>3
+ abc\>4
+ abc\>-4
+
/-- End of testinput2 --/
diff --git a/testdata/testinput5 b/testdata/testinput5
index b2b2ffc..aca97d1 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -816,4 +816,12 @@ correctly, but that messes up comparisons). --/
/\g{A}xxx#bх(?'A'123) (?'A'456)/8x<any>BZ
+/a+/8
+ a\x{123}aa\>1
+ a\x{123}aa\>2
+ a\x{123}aa\>3
+ a\x{123}aa\>4
+ a\x{123}aa\>5
+ a\x{123}aa\>6
+
/-- End of testinput5 --/
diff --git a/testdata/testinput7 b/testdata/testinput7
index 163f2a5..012af7f 100644
--- a/testdata/testinput7
+++ b/testdata/testinput7
@@ -4596,4 +4596,12 @@
abc\P
abc\P\P
+/.+/
+ abc\>0
+ abc\>1
+ abc\>2
+ abc\>3
+ abc\>4
+ abc\>-4
+
/-- End of testinput7 --/
diff --git a/testdata/testinput8 b/testdata/testinput8
index 2a6bef3..bae2f57 100644
--- a/testdata/testinput8
+++ b/testdata/testinput8
@@ -689,4 +689,12 @@
the cat\P
the cat\P\P
+/a+/8
+ a\x{123}aa\>1
+ a\x{123}aa\>2
+ a\x{123}aa\>3
+ a\x{123}aa\>4
+ a\x{123}aa\>5
+ a\x{123}aa\>6
+
/-- End of testinput8 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 33419e5..2cd7bd9 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -11217,4 +11217,18 @@ Partial match: abc
abc\P\P
Partial match: abc
+/.+/
+ abc\>0
+ 0: abc
+ abc\>1
+ 0: bc
+ abc\>2
+ 0: c
+ abc\>3
+No match
+ abc\>4
+Error -24
+ abc\>-4
+Error -24
+
/-- End of testinput2 --/
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index d5bc270..fc2409a 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -2292,4 +2292,18 @@ Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e
End
------------------------------------------------------------------
+/a+/8
+ a\x{123}aa\>1
+ 0: aa
+ a\x{123}aa\>2
+Error -11
+ a\x{123}aa\>3
+ 0: aa
+ a\x{123}aa\>4
+ 0: a
+ a\x{123}aa\>5
+No match
+ a\x{123}aa\>6
+Error -24
+
/-- End of testinput5 --/
diff --git a/testdata/testoutput7 b/testdata/testoutput7
index 3b12f07..add4dfb 100644
--- a/testdata/testoutput7
+++ b/testdata/testoutput7
@@ -7668,4 +7668,21 @@ Partial match: abc
abc\P\P
Partial match: abc
+/.+/
+ abc\>0
+ 0: abc
+ 1: ab
+ 2: a
+ abc\>1
+ 0: bc
+ 1: b
+ abc\>2
+ 0: c
+ abc\>3
+No match
+ abc\>4
+Error -24
+ abc\>-4
+Error -24
+
/-- End of testinput7 --/
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index 0e60e7a..d663025 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -1326,4 +1326,20 @@ Partial match: abcde
the cat\P\P
Partial match: the cat
+/a+/8
+ a\x{123}aa\>1
+ 0: aa
+ 1: a
+ a\x{123}aa\>2
+Error -11
+ a\x{123}aa\>3
+ 0: aa
+ 1: a
+ a\x{123}aa\>4
+ 0: a
+ a\x{123}aa\>5
+No match
+ a\x{123}aa\>6
+Error -24
+
/-- End of testinput8 --/