summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-11-09 16:54:52 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-11-09 16:54:52 +0000
commit 7de890de6074833fd0b0ed433c69a431cd7bf0cb (patch)
tree 5de8295036ebc0ec83944a9a181a38ba88bc8008
parent 25ab1d4dc4c7d82c3431b37d52bc924c5362721d (diff)
download pcre-7de890de6074833fd0b0ed433c69a431cd7bf0cb.tar.gz
Add (*NO_AUTO_POSSESS) and document interaction between auto-possessification
and callouts. git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1395 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 3
-rw-r--r-- doc/pcreapi.3 9
-rw-r--r-- doc/pcrecallout.3 59
-rw-r--r-- doc/pcrepattern.3 39
-rw-r--r-- pcre_compile.c 2
-rw-r--r-- pcre_internal.h 62
-rw-r--r-- testdata/testinput2 4
-rw-r--r-- testdata/testoutput2 11
8 files changed, 131 insertions, 58 deletions
diff --git a/ChangeLog b/ChangeLog
index 6d2793a..ebfb0fc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -86,7 +86,8 @@ Version 8.34 xx-xxxx-201x
on a patch by Zoltan Herczeg. It now happens after instead of during
compilation. The code is cleaner, and more cases are handled. The option
PCRE_NO_AUTO_POSSESSIFY is added for testing purposes, and the -O and /O
- options in pcretest are provided to set it.
+ options in pcretest are provided to set it. It can also be set by
+ (*NO_AUTO_POSSESS) at the start of a pattern.
18. The character VT has been added to the set of characters that match \s and
are generally treated as white space, following this same change in Perl
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index feabfb3..6c2576d 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -1,4 +1,4 @@
-.TH PCREAPI 3 "05 November 2013" "PCRE 8.34"
+.TH PCREAPI 3 "09 November 2013" "PCRE 8.34"
.SH NAME
PCRE - Perl-compatible regular expressions
.sp
@@ -809,7 +809,10 @@ in Perl.
.sp
If this option is set, it disables "auto-possessification". This is an
optimization that, for example, turns a+b into a++b in order to avoid
-backtracks into a+ that can never be successful.
+backtracks into a+ that can never be successful. However, if callouts are in
+use, auto-possessification means that some of them are never taken. You can set
+this option if you want the matching functions to do a full unoptimized search
+and run all the callouts, but it is mainly provided for testing purposes.
.sp
PCRE_NO_START_OPTIMIZE
.sp
@@ -2879,6 +2882,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 05 November 2013
+Last updated: 09 November 2013
Copyright (c) 1997-2013 University of Cambridge.
.fi
diff --git a/doc/pcrecallout.3 b/doc/pcrecallout.3
index 79e2bb9..65eef76 100644
--- a/doc/pcrecallout.3
+++ b/doc/pcrecallout.3
@@ -1,4 +1,4 @@
-.TH PCRECALLOUT 3 "03 March 2013" "PCRE 8.33"
+.TH PCRECALLOUT 3 "09 November 2013" "PCRE 8.34"
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
@@ -55,17 +55,50 @@ The
.\" HREF
\fBpcretest\fP
.\"
-command has an option that sets automatic callouts; when it is used, the output
-indicates how the pattern is matched. This is useful information when you are
-trying to optimize the performance of a particular pattern.
+program has a pattern qualifier (/C) that sets automatic callouts; when it is
+used, the output indicates how the pattern is being matched. This is useful
+information when you are trying to optimize the performance of a particular
+pattern.
.
.
.SH "MISSING CALLOUTS"
.rs
.sp
-You should be aware that, because of optimizations in the way PCRE matches
-patterns by default, callouts sometimes do not happen. For example, if the
-pattern is
+You should be aware that, because of optimizations in the way PCRE compiles and
+matches patterns, callouts sometimes do not happen exactly as you might expect.
+.P
+At compile time, PCRE "auto-possessifies" repeated items when it knows that
+what follows cannot be part of the repeat. For example, a+[bc] is compiled as
+if it were a++[bc]. The \fBpcretest\fP output when this pattern is anchored and
+then applied with automatic callouts to the string "aaaa" is:
+.sp
+ --->aaaa
+ +0 ^ ^
+ +1 ^ a+
+ +3 ^ ^ [bc]
+ No match
+.sp
+This indicates that when matching [bc] fails, there is no backtracking into a+
+and therefore the callouts that would be taken for the backtracks do not occur.
+You can disable the auto-possessify feature by passing PCRE_NO_AUTO_POSSESSIFY
+to \fBpcre_compile()\fP, or starting the pattern with (*NO_AUTO_POSSESS). If
+this is done in \fBpcretest\fP (using the /O qualifier), the output changes to
+this:
+.sp
+ --->aaaa
+ +0 ^ ^
+ +1 ^ a+
+ +3 ^ ^ [bc]
+ +3 ^ ^ [bc]
+ +3 ^ ^ [bc]
+ +3 ^^ [bc]
+ No match
+.sp
+This time, when matching [bc] fails, the matcher backtracks into a+ and tries
+again, repeatedly, until a+ itself fails.
+.P
+Other optimizations that provide fast "no match" results also affect callouts.
+For example, if the pattern is
.sp
ab(?C4)cd
.sp
@@ -89,11 +122,11 @@ callouts such as the example above are obeyed.
.rs
.sp
During matching, when PCRE reaches a callout point, the external function
-defined by \fIpcre_callout\fP or \fIpcre[16|32]_callout\fP is called
-(if it is set). This applies to both normal and DFA matching. The only
-argument to the callout function is a pointer to a \fBpcre_callout\fP
-or \fBpcre[16|32]_callout\fP block.
-These structures contains the following fields:
+defined by \fIpcre_callout\fP or \fIpcre[16|32]_callout\fP is called (if it is
+set). This applies to both normal and DFA matching. The only argument to the
+callout function is a pointer to a \fBpcre_callout\fP or
+\fBpcre[16|32]_callout\fP block. These structures contain the following
+fields:
.sp
int \fIversion\fP;
int \fIcallout_number\fP;
@@ -217,6 +250,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 03 March 2013
+Last updated: 09 November 2013
Copyright (c) 1997-2013 University of Cambridge.
.fi
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index db85c4d..741bb34 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1,4 +1,4 @@
-.TH PCREPATTERN 3 "08 November 2013" "PCRE 8.34"
+.TH PCREPATTERN 3 "09 November 2013" "PCRE 8.34"
.SH NAME
PCRE - Perl-compatible regular expressions
.SH "PCRE REGULAR EXPRESSION DETAILS"
@@ -80,21 +80,37 @@ appearance causes an error.
.SS "Unicode property support"
.rs
.sp
-Another special sequence that may appear at the start of a pattern is
-.sp
- (*UCP)
-.sp
+Another special sequence that may appear at the start of a pattern is (*UCP).
This has the same effect as setting the PCRE_UCP option: it causes sequences
such as \ed and \ew to use Unicode properties to determine character types,
instead of recognizing only characters with codes less than 128 via a lookup
table.
.
.
+.SS "Disabling auto-possessification"
+.rs
+.sp
+If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as setting
+the PCRE_NO_AUTO_POSSESSIFY option at compile time. This stops PCRE from making
+quantifiers possessive when what follows cannot match the repeated item. For
+example, by default a+b is treated as a++b. For more details, see the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation.
+.
+.
.SS "Disabling start-up optimizations"
.rs
.sp
If a pattern starts with (*NO_START_OPT), it has the same effect as setting the
-PCRE_NO_START_OPTIMIZE option either at compile or matching time.
+PCRE_NO_START_OPTIMIZE option either at compile or matching time. This disables
+several optimizations for quickly reaching "no match" results. For more
+details, see the
+.\" HREF
+\fBpcreapi\fP
+.\"
+documentation.
.
.
.\" HTML <a name="newlines"></a>
@@ -2754,8 +2770,13 @@ During matching, when PCRE reaches a callout point, the external function is
called. It is provided with the number of the callout, the position in the
pattern, and, optionally, one item of data originally supplied by the caller of
the matching function. The callout function may cause matching to proceed, to
-backtrack, or to fail altogether. A complete description of the interface to
-the callout function is given in the
+backtrack, or to fail altogether.
+.P
+By default, PCRE implements a number of optimizations at compile time and
+matching time, and one side-effect is that sometimes callouts are skipped. If
+you need all possible callouts to happen, you need to set options that disable
+the relevant optimizations. More details, and a complete description of the
+interface to the callout function, are given in the
.\" HREF
\fBpcrecallout\fP
.\"
@@ -3201,6 +3222,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 08 November 2013
+Last updated: 09 November 2013
Copyright (c) 1997-2013 University of Cambridge.
.fi
diff --git a/pcre_compile.c b/pcre_compile.c
index f4ab80d..903b466 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -8741,6 +8741,8 @@ PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
{ skipatstart += 6; options |= PCRE_UTF8; continue; }
else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
{ skipatstart += 6; options |= PCRE_UCP; continue; }
+ else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
+ { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESSIFY; continue; }
else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
{ skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
diff --git a/pcre_internal.h b/pcre_internal.h
index 9ca5362..7c80185 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -1534,21 +1534,22 @@ a positive value. */
#define STRING_DEFINE "DEFINE"
-#define STRING_CR_RIGHTPAR "CR)"
-#define STRING_LF_RIGHTPAR "LF)"
-#define STRING_CRLF_RIGHTPAR "CRLF)"
-#define STRING_ANY_RIGHTPAR "ANY)"
-#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
-#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
-#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
-#define STRING_UTF8_RIGHTPAR "UTF8)"
-#define STRING_UTF16_RIGHTPAR "UTF16)"
-#define STRING_UTF32_RIGHTPAR "UTF32)"
-#define STRING_UTF_RIGHTPAR "UTF)"
-#define STRING_UCP_RIGHTPAR "UCP)"
-#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
-#define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH="
-#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION="
+#define STRING_CR_RIGHTPAR "CR)"
+#define STRING_LF_RIGHTPAR "LF)"
+#define STRING_CRLF_RIGHTPAR "CRLF)"
+#define STRING_ANY_RIGHTPAR "ANY)"
+#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
+#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
+#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
+#define STRING_UTF8_RIGHTPAR "UTF8)"
+#define STRING_UTF16_RIGHTPAR "UTF16)"
+#define STRING_UTF32_RIGHTPAR "UTF32)"
+#define STRING_UTF_RIGHTPAR "UTF)"
+#define STRING_UCP_RIGHTPAR "UCP)"
+#define STRING_NO_AUTO_POSSESS_RIGHTPAR "NO_AUTO_POSSESS)"
+#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
+#define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH="
+#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION="
#else /* SUPPORT_UTF */
@@ -1797,21 +1798,22 @@ only. */
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
-#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
-#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
-#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
-#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
-#define STRING_UTF16_RIGHTPAR STR_U STR_T STR_F STR_1 STR_6 STR_RIGHT_PARENTHESIS
-#define STRING_UTF32_RIGHTPAR STR_U STR_T STR_F STR_3 STR_2 STR_RIGHT_PARENTHESIS
-#define STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_RIGHT_PARENTHESIS
-#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
-#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
-#define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN
-#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN
+#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
+#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
+#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
+#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
+#define STRING_UTF16_RIGHTPAR STR_U STR_T STR_F STR_1 STR_6 STR_RIGHT_PARENTHESIS
+#define STRING_UTF32_RIGHTPAR STR_U STR_T STR_F STR_3 STR_2 STR_RIGHT_PARENTHESIS
+#define STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_RIGHT_PARENTHESIS
+#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
+#define STRING_NO_AUTO_POSSESS_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_A STR_U STR_T STR_O STR_UNDERSCORE STR_P STR_O STR_S STR_S STR_E STR_S STR_S STR_RIGHT_PARENTHESIS
+#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
+#define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN
+#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN
#endif /* SUPPORT_UTF */
diff --git a/testdata/testinput2 b/testdata/testinput2
index be8cdaa..7558279 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -2545,7 +2545,9 @@ a random value. /Ix
abcxypqr\Y
/(*NO_START_OPT)xyz/C
- abcxyz
+ abcxyz
+
+/(*NO_AUTO_POSSESS)a+b/BZ
/xyz/CY
abcxyz
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index ba59197..4c3ae5b 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -9416,7 +9416,7 @@ No match
No match
/(*NO_START_OPT)xyz/C
- abcxyz
+ abcxyz
--->abcxyz
+15 ^ x
+15 ^ x
@@ -9426,6 +9426,15 @@ No match
+17 ^ ^ z
+18 ^ ^
0: xyz
+
+/(*NO_AUTO_POSSESS)a+b/BZ
+------------------------------------------------------------------
+ Bra
+ a+
+ b
+ Ket
+ End
+------------------------------------------------------------------
/xyz/CY
abcxyz