summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-06-13 15:09:54 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-06-13 15:09:54 +0000
commit 534f2ef23d3192cd74ec86f44c60ff5a7cb957a0 (patch)
tree 5b29d0fe9b45bef3e8ae979251ddfcc9dbe3a39e
parent a24e9c9aff88d3b9f6022cbdfee49d758cfde0f7 (diff)
download pcre-534f2ef23d3192cd74ec86f44c60ff5a7cb957a0.tar.gz
More document tidies, pre-release.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@182 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 14
-rw-r--r-- NEWS 12
-rw-r--r-- NON-UNIX-USE 10
-rw-r--r-- doc/html/pcre.html 8
-rw-r--r-- doc/html/pcre_fullinfo.html 5
-rw-r--r-- doc/html/pcreapi.html 9
-rw-r--r-- doc/html/pcrebuild.html 21
-rw-r--r-- doc/html/pcrecompat.html 10
-rw-r--r-- doc/html/pcrepattern.html 152
-rw-r--r-- doc/html/pcreprecompile.html 18
-rw-r--r-- doc/html/pcresample.html 9
-rw-r--r-- doc/html/pcrestack.html 10
-rw-r--r-- doc/pcre.3 2
-rw-r--r-- doc/pcre.txt 240
-rw-r--r-- doc/pcre_fullinfo.3 2
-rw-r--r-- doc/pcreapi.3 2
-rw-r--r-- doc/pcrebuild.3 2
-rw-r--r-- doc/pcrepattern.3 38
-rw-r--r-- doc/pcreprecompile.3 4
-rw-r--r-- doc/pcrestack.3 4
-rw-r--r-- pcre_compile.c 16
-rw-r--r-- pcre_dfa_exec.c 62
-rw-r--r-- pcre_exec.c 36
23 files changed, 428 insertions, 258 deletions
diff --git a/ChangeLog b/ChangeLog
index e1197f4..fb2747f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -40,11 +40,11 @@ Version 7.2 13-June-07
(f) \g{name} is another synonym - part of Perl 5.10's unification of
reference syntax.
-
+
(g) (?| introduces a group in which the numbering of parentheses in each
- alternative starts with the same number.
-
- (h) \h, \H, \v, and \V match horizontal and vertical whitespace.
+ alternative starts with the same number.
+
+ (h) \h, \H, \v, and \V match horizontal and vertical whitespace.
7. Added two new calls to pcre_fullinfo(): PCRE_INFO_OKPARTIAL and
PCRE_INFO_JCHANGED.
@@ -59,15 +59,15 @@ Version 7.2 13-June-07
bit of new cunning has reduced the workspace needed for groups with
alternatives. The 1000-alternative test pattern now uses 12 bytes of
workspace instead of running out of the 4096 that are available.
-
+
10. Inserted some missing (unsigned int) casts to get rid of compiler warnings.
11. Applied patch from Google to remove an optimization that didn't quite work.
The report of the bug said:
-
+
pcrecpp::RE("a*").FullMatch("aaa") matches, while
pcrecpp::RE("a*?").FullMatch("aaa") does not, and
- pcrecpp::RE("a*?\\z").FullMatch("aaa") does again.
+ pcrecpp::RE("a*?\\z").FullMatch("aaa") does again.
Version 7.1 24-Apr-07
diff --git a/NEWS b/NEWS
index 867a623..26d0999 100644
--- a/NEWS
+++ b/NEWS
@@ -22,17 +22,17 @@ Some more features from Perl 5.10 have been added:
(?-n) and (?+n) relative references for recursion and subroutines.
(?(-n) and (?(+n) relative references as conditions.
-
+
\k{name} and \g{name} are synonyms for \k<name>.
\K to reset the start of the matched string; for example, (foo)\Kbar
matches bar preceded by foo, but only sets bar as the matched string.
-
- (?| introduces a group where the capturing parentheses in each alternative
- start from the same number; for example, (?|(abc)|(xyz)) sets capturing
+
+ (?| introduces a group where the capturing parentheses in each alternative
+ start from the same number; for example, (?|(abc)|(xyz)) sets capturing
parentheses number 1 in both cases.
-
- \h, \H, \v, \V match horizontal and vertical whitespace, respectively.
+
+ \h, \H, \v, \V match horizontal and vertical whitespace, respectively.
Release 7.1 24-Apr-07
diff --git a/NON-UNIX-USE b/NON-UNIX-USE
index f2ead00..a10c704 100644
--- a/NON-UNIX-USE
+++ b/NON-UNIX-USE
@@ -45,7 +45,7 @@ The following are generic comments about building the PCRE C library "by hand".
An alternative approach is not to edit config.h, but to use -D on the
compiler command line to make any changes that you need.
-
+
NOTE: There have been occasions when the way in which certain parameters in
config.h are used has changed between releases. (In the configure/make
world, this is handled automatically.) When upgrading to a new release, you
@@ -165,10 +165,10 @@ On both MinGW and Cygwin, PCRE should build correctly using:
./configure && make && make install
This should create two libraries called libpcre and libpcreposix, and, if you
-have enabled building the C++ wrapper, a third one called libpcrecpp. These are
-independent libraries: when you like with libpcreposix or libpcrecpp you must
-also link with libpcre, which contains the basic functions. (Some earlier
-releases of PCRE included the basic libpcre functions in libpcreposix. This no
+have enabled building the C++ wrapper, a third one called libpcrecpp. These are
+independent libraries: when you link with libpcreposix or libpcrecpp you must
+also link with libpcre, which contains the basic functions. (Some earlier
+releases of PCRE included the basic libpcre functions in libpcreposix. This no
longer happens.)
If you want to statically link your program against a non-dll .a file, you must
diff --git a/doc/html/pcre.html b/doc/html/pcre.html
index 5859b68..7b24f78 100644
--- a/doc/html/pcre.html
+++ b/doc/html/pcre.html
@@ -228,7 +228,11 @@ must use Unicode property tests such as \p{Nd}.
low-valued characters.
</P>
<P>
-9. Case-insensitive matching applies only to characters whose values are less
+9. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
+(\h, \H, \v, and \V) do match all the appropriate Unicode characters.
+</P>
+<P>
+10. Case-insensitive matching applies only to characters whose values are less
than 128, unless PCRE is built with Unicode property support. Even when Unicode
property support is available, PCRE still uses its own character tables when
checking the case of low-valued characters, so as not to degrade performance.
@@ -254,7 +258,7 @@ two digits 10, at the domain cam.ac.uk.
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 18 April 2007
+Last updated: 13 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
diff --git a/doc/html/pcre_fullinfo.html b/doc/html/pcre_fullinfo.html
index ac24f8e..7cda0d3 100644
--- a/doc/html/pcre_fullinfo.html
+++ b/doc/html/pcre_fullinfo.html
@@ -42,12 +42,13 @@ The following information is available:
-1 for start of string
or after newline, or
-2 otherwise
- PCRE_INFO_FIRSTTABLE Table of first bytes
- (after studying)
+ PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
+ PCRE_INFO_JCHANGED Return 1 if (?J) was used
PCRE_INFO_LASTLITERAL Literal last byte required
PCRE_INFO_NAMECOUNT Number of named subpatterns
PCRE_INFO_NAMEENTRYSIZE Size of name table entry
PCRE_INFO_NAMETABLE Pointer to name table
+ PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
PCRE_INFO_OPTIONS Option bits used for compilation
PCRE_INFO_SIZE Size of compiled pattern
PCRE_INFO_STUDYSIZE Size of study data
diff --git a/doc/html/pcreapi.html b/doc/html/pcreapi.html
index 7830ef8..da06476 100644
--- a/doc/html/pcreapi.html
+++ b/doc/html/pcreapi.html
@@ -658,7 +658,7 @@ out of use. To avoid confusion, they have not been re-used.
26 malformed number or name after (?(
27 conditional group contains more than two branches
28 assertion expected after (?(
- 29 (?R or (?digits must be followed by )
+ 29 (?R or (?[+-]digits must be followed by )
30 unknown POSIX class name
31 POSIX collating elements are not supported
32 this version of PCRE is not compiled with PCRE_UTF8 support
@@ -686,6 +686,9 @@ out of use. To avoid confusion, they have not been re-used.
54 DEFINE group contains more than one branch
55 repeating a DEFINE group is not allowed
56 inconsistent NEWLINE options
+ 57 \g is not followed by a braced name or an optionally braced
+ non-zero number
+ 58 (?+ or (?- or (?(+ or (?(- must be followed by a non-zero number
</PRE>
</P>
<br><a name="SEC9" href="#TOC1">STUDYING A PATTERN</a><br>
@@ -892,7 +895,7 @@ fourth argument should point to an <b>unsigned char *</b> variable.
</pre>
Return 1 if the (?J) option setting is used in the pattern, otherwise 0. The
fourth argument should point to an <b>int</b> variable. The (?J) internal option
-setting changes the local PCRE_DUPNAMES value.
+setting changes the local PCRE_DUPNAMES option.
<pre>
PCRE_INFO_LASTLITERAL
</pre>
@@ -1873,7 +1876,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC22" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 04 June 2007
+Last updated: 13 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
diff --git a/doc/html/pcrebuild.html b/doc/html/pcrebuild.html
index 1284646..b2a013e 100644
--- a/doc/html/pcrebuild.html
+++ b/doc/html/pcrebuild.html
@@ -180,13 +180,18 @@ build a version of PCRE that works this way, add
</pre>
to the <b>configure</b> command. With this configuration, PCRE will use the
<b>pcre_stack_malloc</b> and <b>pcre_stack_free</b> variables to call memory
-management functions. Separate functions are provided because the usage is very
-predictable: the block sizes requested are always the same, and the blocks are
-always freed in reverse order. A calling program might be able to implement
-optimized functions that perform better than the standard <b>malloc()</b> and
-<b>free()</b> functions. PCRE runs noticeably more slowly when built in this
-way. This option affects only the <b>pcre_exec()</b> function; it is not
-relevant for the the <b>pcre_dfa_exec()</b> function.
+management functions. By default these point to <b>malloc()</b> and
+<b>free()</b>, but you can replace the pointers so that your own functions are
+used.
+</P>
+<P>
+Separate functions are provided rather than using <b>pcre_malloc</b> and
+<b>pcre_free</b> because the usage is very predictable: the block sizes
+requested are always the same, and the blocks are always freed in reverse
+order. A calling program might be able to implement optimized functions that
+perform better than <b>malloc()</b> and <b>free()</b>. PCRE runs noticeably more
+slowly when built in this way. This option affects only the <b>pcre_exec()</b>
+function; it is not relevant for the <b>pcre_dfa_exec()</b> function.
</P>
<br><a name="SEC10" href="#TOC1">LIMITING PCRE RESOURCE USAGE</a><br>
<P>
@@ -260,7 +265,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 16 April 2007
+Last updated: 05 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
diff --git a/doc/html/pcrecompat.html b/doc/html/pcrecompat.html
index 638658a..6e1089d 100644
--- a/doc/html/pcrecompat.html
+++ b/doc/html/pcrecompat.html
@@ -18,8 +18,8 @@ DIFFERENCES BETWEEN PCRE AND PERL
<P>
This document describes the differences in the ways that PCRE and Perl handle
regular expressions. The differences described here are mainly with respect to
-Perl 5.8, though PCRE version 7.0 contains some features that are expected to
-be in the forthcoming Perl 5.10.
+Perl 5.8, though PCRE versions 7.0 and later contain some features that are
+expected to be in the forthcoming Perl 5.10.
</P>
<P>
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
@@ -111,8 +111,8 @@ meta-character matches only at the very end of the string.
<br>
<br>
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
-meaning is faulted. Otherwise, like Perl, the backslash is ignored. (Perl can
-be made to issue a warning.)
+meaning is faulted. Otherwise, like Perl, the backslash is quietly ignored.
+(Perl can be made to issue a warning.)
<br>
<br>
(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
@@ -156,7 +156,7 @@ Cambridge CB2 3QH, England.
REVISION
</b><br>
<P>
-Last updated: 06 March 2007
+Last updated: 13 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
diff --git a/doc/html/pcrepattern.html b/doc/html/pcrepattern.html
index 15ccf85..8d603a1 100644
--- a/doc/html/pcrepattern.html
+++ b/doc/html/pcrepattern.html
@@ -24,19 +24,20 @@ man page, in case the conversion went wrong.
<li><a name="TOC9" href="#SEC9">VERTICAL BAR</a>
<li><a name="TOC10" href="#SEC10">INTERNAL OPTION SETTING</a>
<li><a name="TOC11" href="#SEC11">SUBPATTERNS</a>
-<li><a name="TOC12" href="#SEC12">NAMED SUBPATTERNS</a>
-<li><a name="TOC13" href="#SEC13">REPETITION</a>
-<li><a name="TOC14" href="#SEC14">ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS</a>
-<li><a name="TOC15" href="#SEC15">BACK REFERENCES</a>
-<li><a name="TOC16" href="#SEC16">ASSERTIONS</a>
-<li><a name="TOC17" href="#SEC17">CONDITIONAL SUBPATTERNS</a>
-<li><a name="TOC18" href="#SEC18">COMMENTS</a>
-<li><a name="TOC19" href="#SEC19">RECURSIVE PATTERNS</a>
-<li><a name="TOC20" href="#SEC20">SUBPATTERNS AS SUBROUTINES</a>
-<li><a name="TOC21" href="#SEC21">CALLOUTS</a>
-<li><a name="TOC22" href="#SEC22">SEE ALSO</a>
-<li><a name="TOC23" href="#SEC23">AUTHOR</a>
-<li><a name="TOC24" href="#SEC24">REVISION</a>
+<li><a name="TOC12" href="#SEC12">DUPLICATE SUBPATTERN NUMBERS</a>
+<li><a name="TOC13" href="#SEC13">NAMED SUBPATTERNS</a>
+<li><a name="TOC14" href="#SEC14">REPETITION</a>
+<li><a name="TOC15" href="#SEC15">ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS</a>
+<li><a name="TOC16" href="#SEC16">BACK REFERENCES</a>
+<li><a name="TOC17" href="#SEC17">ASSERTIONS</a>
+<li><a name="TOC18" href="#SEC18">CONDITIONAL SUBPATTERNS</a>
+<li><a name="TOC19" href="#SEC19">COMMENTS</a>
+<li><a name="TOC20" href="#SEC20">RECURSIVE PATTERNS</a>
+<li><a name="TOC21" href="#SEC21">SUBPATTERNS AS SUBROUTINES</a>
+<li><a name="TOC22" href="#SEC22">CALLOUTS</a>
+<li><a name="TOC23" href="#SEC23">SEE ALSO</a>
+<li><a name="TOC24" href="#SEC24">AUTHOR</a>
+<li><a name="TOC25" href="#SEC25">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE REGULAR EXPRESSION DETAILS</a><br>
<P>
@@ -270,8 +271,12 @@ following are always recognized:
<pre>
\d any decimal digit
\D any character that is not a decimal digit
+ \h any horizontal whitespace character
+ \H any character that is not a horizontal whitespace character
\s any whitespace character
\S any character that is not a whitespace character
+ \v any vertical whitespace character
+ \V any character that is not a vertical whitespace character
\w any "word" character
\W any "non-word" character
</pre>
@@ -287,9 +292,52 @@ there is no character to match.
<P>
For compatibility with Perl, \s does not match the VT character (code 11).
This makes it different from the POSIX "space" class. The \s characters
-are HT (9), LF (10), FF (12), CR (13), and space (32). (If "use locale;" is
+are HT (9), LF (10), FF (12), CR (13), and space (32). If "use locale;" is
included in a Perl script, \s may match the VT character. In PCRE, it never
-does.)
+does.
+</P>
+<P>
+In UTF-8 mode, characters with values greater than 128 never match \d, \s, or
+\w, and always match \D, \S, and \W. This is true even when Unicode
+character property support is available. These sequences retain their original
+meanings from before UTF-8 support was available, mainly for efficiency
+reasons.
+</P>
+<P>
+The sequences \h, \H, \v, and \V are Perl 5.10 features. In contrast to the
+other sequences, these do match certain high-valued codepoints in UTF-8 mode.
+The horizontal space characters are:
+<pre>
+ U+0009 Horizontal tab
+ U+0020 Space
+ U+00A0 Non-break space
+ U+1680 Ogham space mark
+ U+180E Mongolian vowel separator
+ U+2000 En quad
+ U+2001 Em quad
+ U+2002 En space
+ U+2003 Em space
+ U+2004 Three-per-em space
+ U+2005 Four-per-em space
+ U+2006 Six-per-em space
+ U+2007 Figure space
+ U+2008 Punctuation space
+ U+2009 Thin space
+ U+200A Hair space
+ U+202F Narrow no-break space
+ U+205F Medium mathematical space
+ U+3000 Ideographic space
+</pre>
+The vertical space characters are:
+<pre>
+ U+000A Linefeed
+ U+000B Vertical tab
+ U+000C Formfeed
+ U+000D Carriage return
+ U+0085 Next line
+ U+2028 Line separator
+ U+2029 Paragraph separator
+</PRE>
</P>
<P>
A "word" character is an underscore or any character less than 256 that is a
@@ -301,20 +349,15 @@ in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page). For example, in a French locale such as "fr_FR" in Unix-like systems,
or "french" in Windows, some character codes greater than 128 are used for
-accented letters, and these are matched by \w.
-</P>
-<P>
-In UTF-8 mode, characters with values greater than 128 never match \d, \s, or
-\w, and always match \D, \S, and \W. This is true even when Unicode
-character property support is available. The use of locales with Unicode is
-discouraged.
+accented letters, and these are matched by \w. The use of locales with Unicode
+is discouraged.
</P>
<br><b>
Newline sequences
</b><br>
<P>
Outside a character class, the escape sequence \R matches any Unicode newline
-sequence. This is an extension to Perl. In non-UTF-8 mode \R is equivalent to
+sequence. This is a Perl 5.10 feature. In non-UTF-8 mode \R is equivalent to
the following:
<pre>
(?&#62;\r\n|\n|\x0b|\f|\r|\x85)
@@ -966,7 +1009,38 @@ from left to right, and options are not reset until the end of the subpattern
is reached, an option setting in one branch does affect subsequent branches, so
the above patterns match "SUNDAY" as well as "Saturday".
</P>
-<br><a name="SEC12" href="#TOC1">NAMED SUBPATTERNS</a><br>
+<br><a name="SEC12" href="#TOC1">DUPLICATE SUBPATTERN NUMBERS</a><br>
+<P>
+Perl 5.10 introduced a feature whereby each alternative in a subpattern uses
+the same numbers for its capturing parentheses. Such a subpattern starts with
+(?| and is itself a non-capturing subpattern. For example, consider this
+pattern:
+<pre>
+ (?|(Sat)ur|(Sun))day
+</pre>
+Because the two alternatives are inside a (?| group, both sets of capturing
+parentheses are numbered one. Thus, when the pattern matches, you can look
+at captured substring number one, whichever alternative matched. This construct
+is useful when you want to capture part, but not all, of one of a number of
+alternatives. Inside a (?| group, parentheses are numbered as usual, but the
+number is reset at the start of each branch. The numbers of any capturing
+buffers that follow the subpattern start after the highest number used in any
+branch. The following example is taken from the Perl documentation.
+The numbers underneath show in which buffer the captured content will be
+stored.
+<pre>
+ # before ---------------branch-reset----------- after
+ / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
+ # 1 2 2 3 2 3 4
+</pre>
+A backreference or a recursive call to a numbered subpattern always refers to
+the first one in the pattern with the given number.
+</P>
+<P>
+An alternative approach to using this "branch reset" feature is to use
+duplicate named subpatterns, as described in the next section.
+</P>
+<br><a name="SEC13" href="#TOC1">NAMED SUBPATTERNS</a><br>
<P>
Identifying capturing parentheses by number is simple, but it can be very hard
to keep track of the numbers in complicated regular expressions. Furthermore,
@@ -1008,6 +1082,10 @@ abbreviation. This pattern (ignoring the line breaks) does the job:
(?&#60;DN&#62;Sat)(?:urday)?
</pre>
There are five capturing substrings, but only one is ever set after a match.
+(An alternative way of solving this problem is to use a "branch reset"
+subpattern, as described in the previous section.)
+</P>
+<P>
The convenience function for extracting the data by name returns the substring
for the first (and in this example, the only) subpattern of that name that
matched. This saves searching to find which numbered subpattern it was. If you
@@ -1017,7 +1095,7 @@ details of the interfaces for handling named subpatterns, see the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation.
</P>
-<br><a name="SEC13" href="#TOC1">REPETITION</a><br>
+<br><a name="SEC14" href="#TOC1">REPETITION</a><br>
<P>
Repetition is specified by quantifiers, which can follow any of the following
items:
@@ -1168,7 +1246,7 @@ example, after
</pre>
matches "aba" the value of the second captured substring is "b".
<a name="atomicgroup"></a></P>
-<br><a name="SEC14" href="#TOC1">ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS</a><br>
+<br><a name="SEC15" href="#TOC1">ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS</a><br>
<P>
With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
repetition, failure of what follows normally causes the repeated item to be
@@ -1267,7 +1345,7 @@ an atomic group, like this:
</pre>
sequences of non-digits cannot be broken, and failure happens quickly.
<a name="backreferences"></a></P>
-<br><a name="SEC15" href="#TOC1">BACK REFERENCES</a><br>
+<br><a name="SEC16" href="#TOC1">BACK REFERENCES</a><br>
<P>
Outside a character class, a backslash followed by a digit greater than 0 (and
possibly further digits) is a back reference to a capturing subpattern earlier
@@ -1380,7 +1458,7 @@ that the first iteration does not need to match the back reference. This can be
done using alternation, as in the example above, or by a quantifier with a
minimum of zero.
<a name="bigassertions"></a></P>
-<br><a name="SEC16" href="#TOC1">ASSERTIONS</a><br>
+<br><a name="SEC17" href="#TOC1">ASSERTIONS</a><br>
<P>
An assertion is a test on the characters following or preceding the current
matching point that does not actually consume any characters. The simple
@@ -1540,7 +1618,7 @@ preceded by "foo", while
is another pattern that matches "foo" preceded by three digits and any three
characters that are not "999".
<a name="conditions"></a></P>
-<br><a name="SEC17" href="#TOC1">CONDITIONAL SUBPATTERNS</a><br>
+<br><a name="SEC18" href="#TOC1">CONDITIONAL SUBPATTERNS</a><br>
<P>
It is possible to cause the matching process to obey a subpattern
conditionally or to choose between two alternative subpatterns, depending on
@@ -1678,7 +1756,7 @@ subject is matched against the first alternative; otherwise it is matched
against the second. This pattern matches strings in one of the two forms
dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
<a name="comments"></a></P>
-<br><a name="SEC18" href="#TOC1">COMMENTS</a><br>
+<br><a name="SEC19" href="#TOC1">COMMENTS</a><br>
<P>
The sequence (?# marks the start of a comment that continues up to the next
closing parenthesis. Nested parentheses are not permitted. The characters
@@ -1689,7 +1767,7 @@ If the PCRE_EXTENDED option is set, an unescaped # character outside a
character class introduces a comment that continues to immediately after the
next newline in the pattern.
<a name="recursion"></a></P>
-<br><a name="SEC19" href="#TOC1">RECURSIVE PATTERNS</a><br>
+<br><a name="SEC20" href="#TOC1">RECURSIVE PATTERNS</a><br>
<P>
Consider the problem of matching a string in parentheses, allowing for
unlimited nested parentheses. Without the use of recursion, the best that can
@@ -1819,7 +1897,7 @@ In this pattern, (?(R) is the start of a conditional subpattern, with two
different alternatives for the recursive and non-recursive cases. The (?R) item
is the actual recursive call.
<a name="subpatternsassubroutines"></a></P>
-<br><a name="SEC20" href="#TOC1">SUBPATTERNS AS SUBROUTINES</a><br>
+<br><a name="SEC21" href="#TOC1">SUBPATTERNS AS SUBROUTINES</a><br>
<P>
If the syntax for a recursive subpattern reference (either by number or by
name) is used outside the parentheses to which it refers, it operates like a
@@ -1859,7 +1937,7 @@ changed for different calls. For example, consider this pattern:
It matches "abcabc". It does not match "abcABC" because the change of
processing option does not affect the called subpattern.
</P>
-<br><a name="SEC21" href="#TOC1">CALLOUTS</a><br>
+<br><a name="SEC22" href="#TOC1">CALLOUTS</a><br>
<P>
Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl
code to be obeyed in the middle of matching a regular expression. This makes it
@@ -1894,11 +1972,11 @@ description of the interface to the callout function is given in the
<a href="pcrecallout.html"><b>pcrecallout</b></a>
documentation.
</P>
-<br><a name="SEC22" href="#TOC1">SEE ALSO</a><br>
+<br><a name="SEC23" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcreapi</b>(3), <b>pcrecallout</b>(3), <b>pcrematching</b>(3), <b>pcre</b>(3).
</P>
-<br><a name="SEC23" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC24" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
@@ -1907,9 +1985,9 @@ University Computing Service
Cambridge CB2 3QH, England.
<br>
</P>
-<br><a name="SEC24" href="#TOC1">REVISION</a><br>
+<br><a name="SEC25" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 29 May 2007
+Last updated: 13 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
diff --git a/doc/html/pcreprecompile.html b/doc/html/pcreprecompile.html
index 0e4cb74..83da226 100644
--- a/doc/html/pcreprecompile.html
+++ b/doc/html/pcreprecompile.html
@@ -124,19 +124,9 @@ usual way.
</P>
<br><a name="SEC4" href="#TOC1">COMPATIBILITY WITH DIFFERENT PCRE RELEASES</a><br>
<P>
-The layout of the control block that is at the start of the data that makes up
-a compiled pattern was changed for release 5.0. If you have any saved patterns
-that were compiled with previous releases (not a facility that was previously
-advertised), you will have to recompile them for release 5.0 and above.
-</P>
-<P>
-If you have any saved patterns in UTF-8 mode that use \p or \P that were
-compiled with any release up to and including 6.4, you will have to recompile
-them for release 6.5 and above.
-</P>
-<P>
-All saved patterns from earlier releases must be recompiled for release 7.0 or
-higher, because there was an internal reorganization at that release.
+In general, it is safest to recompile all saved patterns when you update to a
+new PCRE release, though not all updates actually require this. Recompiling is
+definitely needed for release 7.2.
</P>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
@@ -149,7 +139,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 24 April 2007
+Last updated: 13 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
diff --git a/doc/html/pcresample.html b/doc/html/pcresample.html
index b3c924d..44c5bfb 100644
--- a/doc/html/pcresample.html
+++ b/doc/html/pcresample.html
@@ -33,9 +33,10 @@ string. The logic is a little bit tricky because of the possibility of matching
an empty string. Comments in the code explain what is going on.
</P>
<P>
-If PCRE is installed in the standard include and library directories for your
-system, you should be able to compile the demonstration program using this
-command:
+The demonstration program is automatically built if you use "./configure;make"
+to build PCRE. Otherwise, if PCRE is installed in the standard include and
+library directories for your system, you should be able to compile the
+demonstration program using this command:
<pre>
gcc -o pcredemo pcredemo.c -lpcre
</pre>
@@ -87,7 +88,7 @@ Cambridge CB2 3QH, England.
REVISION
</b><br>
<P>
-Last updated: 06 March 2007
+Last updated: 13 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
diff --git a/doc/html/pcrestack.html b/doc/html/pcrestack.html
index 7236400..2cc7d26 100644
--- a/doc/html/pcrestack.html
+++ b/doc/html/pcrestack.html
@@ -83,7 +83,13 @@ PCRE to use heap memory instead of stack for remembering back-up points. This
makes it run a lot more slowly, however. Details of how to do this are given in
the
<a href="pcrebuild.html"><b>pcrebuild</b></a>
-documentation.
+documentation. When built in this way, instead of using the stack, PCRE obtains
+and frees memory by calling the functions that are pointed to by the
+<b>pcre_stack_malloc</b> and <b>pcre_stack_free</b> variables. By default, these
+point to <b>malloc()</b> and <b>free()</b>, but you can replace the pointers to
+cause PCRE to use your own functions. Since the block sizes are always the
+same, and are always freed in reverse order, it may be possible to implement
+customized memory handlers that are more efficient than the standard functions.
</P>
<P>
In Unix-like environments, there is not often a problem with the stack unless
@@ -139,7 +145,7 @@ Cambridge CB2 3QH, England.
REVISION
</b><br>
<P>
-Last updated: 12 March 2007
+Last updated: 05 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
diff --git a/doc/pcre.3 b/doc/pcre.3
index 40aec35..f731b16 100644
--- a/doc/pcre.3
+++ b/doc/pcre.3
@@ -219,7 +219,7 @@ must use Unicode property tests such as \ep{Nd}.
8. Similarly, characters that match the POSIX named character classes are all
low-valued characters.
.P
-9. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
+9. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
(\eh, \eH, \ev, and \eV) do match all the appropriate Unicode characters.
.P
10. Case-insensitive matching applies only to characters whose values are less
diff --git a/doc/pcre.txt b/doc/pcre.txt
index 601812c..e55cf01 100644
--- a/doc/pcre.txt
+++ b/doc/pcre.txt
@@ -197,7 +197,11 @@ UTF-8 AND UNICODE PROPERTY SUPPORT
8. Similarly, characters that match the POSIX named character classes
are all low-valued characters.
- 9. Case-insensitive matching applies only to characters whose values
+ 9. However, the Perl 5.10 horizontal and vertical whitespace matching
+ escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
+ acters.
+
+ 10. Case-insensitive matching applies only to characters whose values
are less than 128, unless PCRE is built with Unicode property support.
Even when Unicode property support is available, PCRE still uses its
own character tables when checking the case of low-valued characters,
@@ -222,7 +226,7 @@ AUTHOR
REVISION
- Last updated: 18 April 2007
+ Last updated: 13 June 2007
Copyright (c) 1997-2007 University of Cambridge.
------------------------------------------------------------------------------
@@ -390,13 +394,17 @@ AVOIDING EXCESSIVE STACK USAGE
to the configure command. With this configuration, PCRE will use the
pcre_stack_malloc and pcre_stack_free variables to call memory manage-
- ment functions. Separate functions are provided because the usage is
- very predictable: the block sizes requested are always the same, and
- the blocks are always freed in reverse order. A calling program might
- be able to implement optimized functions that perform better than the
- standard malloc() and free() functions. PCRE runs noticeably more
- slowly when built in this way. This option affects only the pcre_exec()
- function; it is not relevant for the the pcre_dfa_exec() function.
+ ment functions. By default these point to malloc() and free(), but you
+ can replace the pointers so that your own functions are used.
+
+ Separate functions are provided rather than using pcre_malloc and
+ pcre_free because the usage is very predictable: the block sizes
+ requested are always the same, and the blocks are always freed in
+ reverse order. A calling program might be able to implement optimized
+ functions that perform better than malloc() and free(). PCRE runs
+ noticeably more slowly when built in this way. This option affects only
+ the pcre_exec() function; it is not relevant for the
+ pcre_dfa_exec() function.
LIMITING PCRE RESOURCE USAGE
@@ -474,7 +482,7 @@ AUTHOR
REVISION
- Last updated: 16 April 2007
+ Last updated: 05 June 2007
Copyright (c) 1997-2007 University of Cambridge.
------------------------------------------------------------------------------
@@ -1259,7 +1267,7 @@ COMPILATION ERROR CODES
26 malformed number or name after (?(
27 conditional group contains more than two branches
28 assertion expected after (?(
- 29 (?R or (?digits must be followed by )
+ 29 (?R or (?[+-]digits must be followed by )
30 unknown POSIX class name
31 POSIX collating elements are not supported
32 this version of PCRE is not compiled with PCRE_UTF8 support
@@ -1288,6 +1296,9 @@ COMPILATION ERROR CODES
54 DEFINE group contains more than one branch
55 repeating a DEFINE group is not allowed
56 inconsistent NEWLINE options
+ 57 \g is not followed by a braced name or an optionally braced
+ non-zero number
+ 58 (?+ or (?- or (?(+ or (?(- must be followed by a non-zero number
STUDYING A PATTERN
@@ -1480,7 +1491,7 @@ INFORMATION ABOUT A PATTERN
Return 1 if the (?J) option setting is used in the pattern, otherwise
0. The fourth argument should point to an int variable. The (?J) inter-
- nal option setting changes the local PCRE_DUPNAMES value.
+ nal option setting changes the local PCRE_DUPNAMES option.
PCRE_INFO_LASTLITERAL
@@ -2406,7 +2417,7 @@ AUTHOR
REVISION
- Last updated: 04 June 2007
+ Last updated: 13 June 2007
Copyright (c) 1997-2007 University of Cambridge.
------------------------------------------------------------------------------
@@ -2593,8 +2604,8 @@ DIFFERENCES BETWEEN PCRE AND PERL
This document describes the differences in the ways that PCRE and Perl
handle regular expressions. The differences described here are mainly
- with respect to Perl 5.8, though PCRE version 7.0 contains some fea-
- tures that are expected to be in the forthcoming Perl 5.10.
+ with respect to Perl 5.8, though PCRE versions 7.0 and later contain
+ some features that are expected to be in the forthcoming Perl 5.10.
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details
of what it does have are given in the section on UTF-8 support in the
@@ -2672,8 +2683,8 @@ DIFFERENCES BETWEEN PCRE AND PERL
meta-character matches only at the very end of the string.
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no spe-
- cial meaning is faulted. Otherwise, like Perl, the backslash is
- ignored. (Perl can be made to issue a warning.)
+ cial meaning is faulted. Otherwise, like Perl, the backslash is quietly
+ ignored. (Perl can be made to issue a warning.)
(d) If PCRE_UNGREEDY is set, the greediness of the repetition quanti-
fiers is inverted, that is, by default they are not greedy, but if fol-
@@ -2705,7 +2716,7 @@ AUTHOR
REVISION
- Last updated: 06 March 2007
+ Last updated: 13 June 2007
Copyright (c) 1997-2007 University of Cambridge.
------------------------------------------------------------------------------
@@ -2938,8 +2949,12 @@ BACKSLASH
\d any decimal digit
\D any character that is not a decimal digit
+ \h any horizontal whitespace character
+ \H any character that is not a horizontal whitespace character
\s any whitespace character
\S any character that is not a whitespace character
+ \v any vertical whitespace character
+ \V any character that is not a vertical whitespace character
\w any "word" character
\W any "non-word" character
@@ -2954,9 +2969,49 @@ BACKSLASH
For compatibility with Perl, \s does not match the VT character (code
11). This makes it different from the POSIX "space" class. The \s
- characters are HT (9), LF (10), FF (12), CR (13), and space (32). (If
+ characters are HT (9), LF (10), FF (12), CR (13), and space (32). If
"use locale;" is included in a Perl script, \s may match the VT charac-
- ter. In PCRE, it never does.)
+ ter. In PCRE, it never does.
+
+ In UTF-8 mode, characters with values greater than 128 never match \d,
+ \s, or \w, and always match \D, \S, and \W. This is true even when Uni-
+ code character property support is available. These sequences retain
+ their original meanings from before UTF-8 support was available, mainly
+ for efficiency reasons.
+
+ The sequences \h, \H, \v, and \V are Perl 5.10 features. In contrast to
+ the other sequences, these do match certain high-valued codepoints in
+ UTF-8 mode. The horizontal space characters are:
+
+ U+0009 Horizontal tab
+ U+0020 Space
+ U+00A0 Non-break space
+ U+1680 Ogham space mark
+ U+180E Mongolian vowel separator
+ U+2000 En quad
+ U+2001 Em quad
+ U+2002 En space
+ U+2003 Em space
+ U+2004 Three-per-em space
+ U+2005 Four-per-em space
+ U+2006 Six-per-em space
+ U+2007 Figure space
+ U+2008 Punctuation space
+ U+2009 Thin space
+ U+200A Hair space
+ U+202F Narrow no-break space
+ U+205F Medium mathematical space
+ U+3000 Ideographic space
+
+ The vertical space characters are:
+
+ U+000A Linefeed
+ U+000B Vertical tab
+ U+000C Formfeed
+ U+000D Carriage return
+ U+0085 Next line
+ U+2028 Line separator
+ U+2029 Paragraph separator
A "word" character is an underscore or any character less than 256 that
is a letter or digit. The definition of letters and digits is con-
@@ -2964,17 +3019,13 @@ BACKSLASH
specific matching is taking place (see "Locale support" in the pcreapi
page). For example, in a French locale such as "fr_FR" in Unix-like
systems, or "french" in Windows, some character codes greater than 128
- are used for accented letters, and these are matched by \w.
-
- In UTF-8 mode, characters with values greater than 128 never match \d,
- \s, or \w, and always match \D, \S, and \W. This is true even when Uni-
- code character property support is available. The use of locales with
- Unicode is discouraged.
+ are used for accented letters, and these are matched by \w. The use of
+ locales with Unicode is discouraged.
Newline sequences
Outside a character class, the escape sequence \R matches any Unicode
- newline sequence. This is an extension to Perl. In non-UTF-8 mode \R is
+ newline sequence. This is a Perl 5.10 feature. In non-UTF-8 mode \R is
equivalent to the following:
(?>\r\n|\n|\x0b|\f|\r|\x85)
@@ -3537,6 +3588,37 @@ SUBPATTERNS
"Saturday".
+DUPLICATE SUBPATTERN NUMBERS
+
+ Perl 5.10 introduced a feature whereby each alternative in a subpattern
+ uses the same numbers for its capturing parentheses. Such a subpattern
+ starts with (?| and is itself a non-capturing subpattern. For example,
+ consider this pattern:
+
+ (?|(Sat)ur|(Sun))day
+
+ Because the two alternatives are inside a (?| group, both sets of cap-
+ turing parentheses are numbered one. Thus, when the pattern matches,
+ you can look at captured substring number one, whichever alternative
+ matched. This construct is useful when you want to capture part, but
+ not all, of one of a number of alternatives. Inside a (?| group, paren-
+ theses are numbered as usual, but the number is reset at the start of
+ each branch. The numbers of any capturing buffers that follow the sub-
+ pattern start after the highest number used in any branch. The follow-
+ ing example is taken from the Perl documentation. The numbers under-
+ neath show in which buffer the captured content will be stored.
+
+ # before ---------------branch-reset----------- after
+ / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
+ # 1 2 2 3 2 3 4
+
+ A backreference or a recursive call to a numbered subpattern always
+ refers to the first one in the pattern with the given number.
+
+ An alternative approach to using this "branch reset" feature is to use
+ duplicate named subpatterns, as described in the next section.
+
+
NAMED SUBPATTERNS
Identifying capturing parentheses by number is simple, but it can be
@@ -3576,14 +3658,16 @@ NAMED SUBPATTERNS
(?<DN>Sat)(?:urday)?
There are five capturing substrings, but only one is ever set after a
- match. The convenience function for extracting the data by name
- returns the substring for the first (and in this example, the only)
- subpattern of that name that matched. This saves searching to find
- which numbered subpattern it was. If you make a reference to a non-
- unique named subpattern from elsewhere in the pattern, the one that
- corresponds to the lowest number is used. For further details of the
- interfaces for handling named subpatterns, see the pcreapi documenta-
- tion.
+ match. (An alternative way of solving this problem is to use a "branch
+ reset" subpattern, as described in the previous section.)
+
+ The convenience function for extracting the data by name returns the
+ substring for the first (and in this example, the only) subpattern of
+ that name that matched. This saves searching to find which numbered
+ subpattern it was. If you make a reference to a non-unique named sub-
+ pattern from elsewhere in the pattern, the one that corresponds to the
+ lowest number is used. For further details of the interfaces for han-
+ dling named subpatterns, see the pcreapi documentation.
REPETITION
@@ -4455,7 +4539,7 @@ AUTHOR
REVISION
- Last updated: 29 May 2007
+ Last updated: 13 June 2007
Copyright (c) 1997-2007 University of Cambridge.
------------------------------------------------------------------------------
@@ -4786,19 +4870,9 @@ RE-USING A PRECOMPILED PATTERN
COMPATIBILITY WITH DIFFERENT PCRE RELEASES
- The layout of the control block that is at the start of the data that
- makes up a compiled pattern was changed for release 5.0. If you have
- any saved patterns that were compiled with previous releases (not a
- facility that was previously advertised), you will have to recompile
- them for release 5.0 and above.
-
- If you have any saved patterns in UTF-8 mode that use \p or \P that
- were compiled with any release up to and including 6.4, you will have
- to recompile them for release 6.5 and above.
-
- All saved patterns from earlier releases must be recompiled for release
- 7.0 or higher, because there was an internal reorganization at that
- release.
+ In general, it is safest to recompile all saved patterns when you
+ update to a new PCRE release, though not all updates actually require
+ this. Recompiling is definitely needed for release 7.2.
AUTHOR
@@ -4810,7 +4884,7 @@ AUTHOR
REVISION
- Last updated: 24 April 2007
+ Last updated: 13 June 2007
Copyright (c) 1997-2007 University of Cambridge.
------------------------------------------------------------------------------
@@ -5545,28 +5619,29 @@ PCRE SAMPLE PROGRAM
bility of matching an empty string. Comments in the code explain what
is going on.
- If PCRE is installed in the standard include and library directories
- for your system, you should be able to compile the demonstration pro-
- gram using this command:
+ The demonstration program is automatically built if you use "./config-
+ ure;make" to build PCRE. Otherwise, if PCRE is installed in the stan-
+ dard include and library directories for your system, you should be
+ able to compile the demonstration program using this command:
gcc -o pcredemo pcredemo.c -lpcre
- If PCRE is installed elsewhere, you may need to add additional options
- to the command line. For example, on a Unix-like system that has PCRE
- installed in /usr/local, you can compile the demonstration program
+ If PCRE is installed elsewhere, you may need to add additional options
+ to the command line. For example, on a Unix-like system that has PCRE
+ installed in /usr/local, you can compile the demonstration program
using a command like this:
gcc -o pcredemo -I/usr/local/include pcredemo.c \
-L/usr/local/lib -lpcre
- Once you have compiled the demonstration program, you can run simple
+ Once you have compiled the demonstration program, you can run simple
tests like this:
./pcredemo 'cat|dog' 'the cat sat on the mat'
./pcredemo -g 'cat|dog' 'the dog sat on the cat'
- Note that there is a much more comprehensive test program, called
- pcretest, which supports many more facilities for testing regular
+ Note that there is a much more comprehensive test program, called
+ pcretest, which supports many more facilities for testing regular
expressions and the PCRE library. The pcredemo program is provided as a
simple coding example.
@@ -5574,10 +5649,10 @@ PCRE SAMPLE PROGRAM
the standard library directory, you may get an error like this when you
try to run pcredemo:
- ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or
+ ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or
directory
- This is caused by the way shared library support works on those sys-
+ This is caused by the way shared library support works on those sys-
tems. You need to add
-R/usr/local/lib
@@ -5594,7 +5669,7 @@ AUTHOR
REVISION
- Last updated: 06 March 2007
+ Last updated: 13 June 2007
Copyright (c) 1997-2007 University of Cambridge.
------------------------------------------------------------------------------
PCRESTACK(3) PCRESTACK(3)
@@ -5664,17 +5739,24 @@ PCRE DISCUSSION OF STACK USAGE
In environments where stack memory is constrained, you might want to
compile PCRE to use heap memory instead of stack for remembering back-
up points. This makes it run a lot more slowly, however. Details of how
- to do this are given in the pcrebuild documentation.
-
- In Unix-like environments, there is not often a problem with the stack
- unless very long strings are involved, though the default limit on
- stack size varies from system to system. Values from 8Mb to 64Mb are
+ to do this are given in the pcrebuild documentation. When built in this
+ way, instead of using the stack, PCRE obtains and frees memory by call-
+ ing the functions that are pointed to by the pcre_stack_malloc and
+ pcre_stack_free variables. By default, these point to malloc() and
+ free(), but you can replace the pointers to cause PCRE to use your own
+ functions. Since the block sizes are always the same, and are always
+ freed in reverse order, it may be possible to implement customized mem-
+ ory handlers that are more efficient than the standard functions.
+
+ In Unix-like environments, there is not often a problem with the stack
+ unless very long strings are involved, though the default limit on
+ stack size varies from system to system. Values from 8Mb to 64Mb are
common. You can find your default limit by running the command:
ulimit -s
- Unfortunately, the effect of running out of stack is often SIGSEGV,
- though sometimes a more explicit error message is given. You can nor-
+ Unfortunately, the effect of running out of stack is often SIGSEGV,
+ though sometimes a more explicit error message is given. You can nor-
mally increase the limit on stack size by code such as this:
struct rlimit rlim;
@@ -5682,21 +5764,21 @@ PCRE DISCUSSION OF STACK USAGE
rlim.rlim_cur = 100*1024*1024;
setrlimit(RLIMIT_STACK, &rlim);
- This reads the current limits (soft and hard) using getrlimit(), then
- attempts to increase the soft limit to 100Mb using setrlimit(). You
+ This reads the current limits (soft and hard) using getrlimit(), then
+ attempts to increase the soft limit to 100Mb using setrlimit(). You
must do this before calling pcre_exec().
- PCRE has an internal counter that can be used to limit the depth of
- recursion, and thus cause pcre_exec() to give an error code before it
- runs out of stack. By default, the limit is very large, and unlikely
- ever to operate. It can be changed when PCRE is built, and it can also
+ PCRE has an internal counter that can be used to limit the depth of
+ recursion, and thus cause pcre_exec() to give an error code before it
+ runs out of stack. By default, the limit is very large, and unlikely
+ ever to operate. It can be changed when PCRE is built, and it can also
be set when pcre_exec() is called. For details of these interfaces, see
the pcrebuild and pcreapi documentation.
As a very rough rule of thumb, you should reckon on about 500 bytes per
- recursion. Thus, if you want to limit your stack usage to 8Mb, you
- should set the limit at 16000 recursions. A 64Mb stack, on the other
- hand, can support around 128000 recursions. The pcretest test program
+ recursion. Thus, if you want to limit your stack usage to 8Mb, you
+ should set the limit at 16000 recursions. A 64Mb stack, on the other
+ hand, can support around 128000 recursions. The pcretest test program
has a command line option (-S) that can be used to increase the size of
its stack.
@@ -5710,7 +5792,7 @@ AUTHOR
REVISION
- Last updated: 12 March 2007
+ Last updated: 05 June 2007
Copyright (c) 1997-2007 University of Cambridge.
------------------------------------------------------------------------------
diff --git a/doc/pcre_fullinfo.3 b/doc/pcre_fullinfo.3
index 08b770f..067a6a8 100644
--- a/doc/pcre_fullinfo.3
+++ b/doc/pcre_fullinfo.3
@@ -36,7 +36,7 @@ The following information is available:
PCRE_INFO_NAMECOUNT Number of named subpatterns
PCRE_INFO_NAMEENTRYSIZE Size of name table entry
PCRE_INFO_NAMETABLE Pointer to name table
- PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
+ PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
PCRE_INFO_OPTIONS Option bits used for compilation
PCRE_INFO_SIZE Size of compiled pattern
PCRE_INFO_STUDYSIZE Size of study data
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index 0c976f4..bfa4beb 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -676,7 +676,7 @@ out of use. To avoid confusion, they have not been re-used.
54 DEFINE group contains more than one branch
55 repeating a DEFINE group is not allowed
56 inconsistent NEWLINE options
- 57 \g is not followed by a braced name or an optionally braced
+ 57 \eg is not followed by a braced name or an optionally braced
non-zero number
58 (?+ or (?- or (?(+ or (?(- must be followed by a non-zero number
.
diff --git a/doc/pcrebuild.3 b/doc/pcrebuild.3
index 7f10911..c111bf4 100644
--- a/doc/pcrebuild.3
+++ b/doc/pcrebuild.3
@@ -166,7 +166,7 @@ to the \fBconfigure\fP command. With this configuration, PCRE will use the
\fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP variables to call memory
management functions. By default these point to \fBmalloc()\fP and
\fBfree()\fP, but you can replace the pointers so that your own functions are
-used.
+used.
.P
Separate functions are provided rather than using \fBpcre_malloc\fP and
\fBpcre_free\fP because the usage is very predictable: the block sizes
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index 1acb7d7..09e2da0 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -260,14 +260,14 @@ parenthesized subpatterns.
Another use of backslash is for specifying generic character types. The
following are always recognized:
.sp
- \ed any decimal digit
+ \ed any decimal digit
\eD any character that is not a decimal digit
\eh any horizontal whitespace character
- \eH any character that is not a horizontal whitespace character
+ \eH any character that is not a horizontal whitespace character
\es any whitespace character
\eS any character that is not a whitespace character
\ev any vertical whitespace character
- \eV any character that is not a vertical whitespace character
+ \eV any character that is not a vertical whitespace character
\ew any "word" character
\eW any "non-word" character
.sp
@@ -287,11 +287,11 @@ does.
.P
In UTF-8 mode, characters with values greater than 128 never match \ed, \es, or
\ew, and always match \eD, \eS, and \eW. This is true even when Unicode
-character property support is available. These sequences retain their original
-meanings from before UTF-8 support was available, mainly for efficiency
+character property support is available. These sequences retain their original
+meanings from before UTF-8 support was available, mainly for efficiency
reasons.
.P
-The sequences \eh, \eH, \ev, and \eV are Perl 5.10 features. In contrast to the
+The sequences \eh, \eH, \ev, and \eV are Perl 5.10 features. In contrast to the
other sequences, these do match certain high-valued codepoints in UTF-8 mode.
The horizontal space characters are:
.sp
@@ -1001,28 +1001,28 @@ the above patterns match "SUNDAY" as well as "Saturday".
.SH "DUPLICATE SUBPATTERN NUMBERS"
.rs
.sp
-Perl 5.10 introduced a feature whereby each alternative in a subpattern uses
-the same numbers for its capturing parentheses. Such a subpattern starts with
-(?| and is itself a non-capturing subpattern. For example, consider this
+Perl 5.10 introduced a feature whereby each alternative in a subpattern uses
+the same numbers for its capturing parentheses. Such a subpattern starts with
+(?| and is itself a non-capturing subpattern. For example, consider this
pattern:
.sp
(?|(Sat)ur|(Sun))day
-.sp
-Because the two alternatives are inside a (?| group, both sets of capturing
-parentheses are numbered one. Thus, when the pattern matches, you can look
-at captured substring number one, whichever alternative matched. This construct
-is useful when you want to capture part, but not all, of one of a number of
-alternatives. Inside a (?| group, parentheses are numbered as usual, but the
+.sp
+Because the two alternatives are inside a (?| group, both sets of capturing
+parentheses are numbered one. Thus, when the pattern matches, you can look
+at captured substring number one, whichever alternative matched. This construct
+is useful when you want to capture part, but not all, of one of a number of
+alternatives. Inside a (?| group, parentheses are numbered as usual, but the
number is reset at the start of each branch. The numbers of any capturing
-buffers that follow the subpattern start after the highest number used in any
-branch. The following example is taken from the Perl documentation.
+buffers that follow the subpattern start after the highest number used in any
+branch. The following example is taken from the Perl documentation.
The numbers underneath show in which buffer the captured content will be
stored.
.sp
# before ---------------branch-reset----------- after
/ ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
# 1 2 2 3 2 3 4
-.sp
+.sp
A backreference or a recursive call to a numbered subpattern always refers to
the first one in the pattern with the given number.
.P
@@ -1079,7 +1079,7 @@ abbreviation. This pattern (ignoring the line breaks) does the job:
(?<DN>Sat)(?:urday)?
.sp
There are five capturing substrings, but only one is ever set after a match.
-(An alternative way of solving this problem is to use a "branch reset"
+(An alternative way of solving this problem is to use a "branch reset"
subpattern, as described in the previous section.)
.P
The convenience function for extracting the data by name returns the substring
diff --git a/doc/pcreprecompile.3 b/doc/pcreprecompile.3
index e0ff922..aa52542 100644
--- a/doc/pcreprecompile.3
+++ b/doc/pcreprecompile.3
@@ -117,8 +117,8 @@ usual way.
.SH "COMPATIBILITY WITH DIFFERENT PCRE RELEASES"
.rs
.sp
-In general, it is safest to recompile all saved patterns when you update to a
-new PCRE release, though not all updates actually require this. Recompiling is
+In general, it is safest to recompile all saved patterns when you update to a
+new PCRE release, though not all updates actually require this. Recompiling is
definitely needed for release 7.2.
.
.
diff --git a/doc/pcrestack.3 b/doc/pcrestack.3
index 1c5955c..7e9bfc9 100644
--- a/doc/pcrestack.3
+++ b/doc/pcrestack.3
@@ -76,8 +76,8 @@ documentation. When built in this way, instead of using the stack, PCRE obtains
and frees memory by calling the functions that are pointed to by the
\fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP variables. By default, these
point to \fBmalloc()\fP and \fBfree()\fP, but you can replace the pointers to
-cause PCRE to use your own functions. Since the block sizes are always the
-same, and are always freed in reverse order, it may be possible to implement
+cause PCRE to use your own functions. Since the block sizes are always the
+same, and are always freed in reverse order, it may be possible to implement
customized memory handlers that are more efficient than the standard functions.
.P
In Unix-like environments, there is not often a problem with the stack unless
diff --git a/pcre_compile.c b/pcre_compile.c
index bdc4120..c191539 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -2026,7 +2026,7 @@ switch(op_code)
case ESC_W:
return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
-
+
case ESC_h:
case ESC_H:
switch(item)
@@ -2053,8 +2053,8 @@ switch(op_code)
return -next != ESC_h;
default:
return -next == ESC_h;
- }
-
+ }
+
case ESC_v:
case ESC_V:
switch(item)
@@ -2069,7 +2069,7 @@ switch(op_code)
return -next != ESC_v;
default:
return -next == ESC_v;
- }
+ }
default:
return FALSE;
@@ -2093,20 +2093,20 @@ switch(op_code)
case OP_NOT_HSPACE:
return next == -ESC_h;
-
+
/* Can't have \S in here because VT matches \S (Perl anomaly) */
- case OP_VSPACE:
+ case OP_VSPACE:
return next == -ESC_V || next == -ESC_d || next == -ESC_w;
case OP_NOT_VSPACE:
- return next == -ESC_v;
+ return next == -ESC_v;
case OP_WORDCHAR:
return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
case OP_NOT_WORDCHAR:
return next == -ESC_w || next == -ESC_d;
-
+
default:
return FALSE;
}
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 120c2f6..87f9746 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -63,7 +63,7 @@ applications. */
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
-enough. The resulting opcodes don't have to be less than 256 because they are
+enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */
#define OP_PROP_EXTRA 300
@@ -585,9 +585,9 @@ for (;;)
case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
case OP_NOT_HSPACE:
- case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
+ case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
case OP_NOT_VSPACE:
- case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
+ case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
default: break;
}
}
@@ -1105,7 +1105,7 @@ for (;;)
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
if (clen > 0)
{
- BOOL OK;
+ BOOL OK;
switch (c)
{
case 0x000a:
@@ -1116,15 +1116,15 @@ for (;;)
case 0x2028:
case 0x2029:
OK = TRUE;
- break;
+ break;
default:
OK = FALSE;
- break;
+ break;
}
if (OK == (d == OP_VSPACE))
- {
+ {
if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
{
active_count--; /* Remove non-match possibility */
@@ -1144,7 +1144,7 @@ for (;;)
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
if (clen > 0)
{
- BOOL OK;
+ BOOL OK;
switch (c)
{
case 0x09: /* HT */
@@ -1168,14 +1168,14 @@ for (;;)
case 0x3000: /* IDEOGRAPHIC SPACE */
OK = TRUE;
break;
-
+
default:
OK = FALSE;
break;
}
-
+
if (OK == (d == OP_HSPACE))
- {
+ {
if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
{
active_count--; /* Remove non-match possibility */
@@ -1346,7 +1346,7 @@ for (;;)
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0)
{
- BOOL OK;
+ BOOL OK;
switch (c)
{
case 0x000a:
@@ -1358,13 +1358,13 @@ for (;;)
case 0x2029:
OK = TRUE;
break;
-
+
default:
OK = FALSE;
break;
}
if (OK == (d == OP_VSPACE))
- {
+ {
if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
{
@@ -1392,7 +1392,7 @@ for (;;)
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0)
{
- BOOL OK;
+ BOOL OK;
switch (c)
{
case 0x09: /* HT */
@@ -1416,14 +1416,14 @@ for (;;)
case 0x3000: /* IDEOGRAPHIC SPACE */
OK = TRUE;
break;
-
+
default:
OK = FALSE;
break;
}
-
+
if (OK == (d == OP_HSPACE))
- {
+ {
if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
{
@@ -1574,7 +1574,7 @@ for (;;)
count = current_state->count; /* Number already matched */
if (clen > 0)
{
- BOOL OK;
+ BOOL OK;
switch (c)
{
case 0x000a:
@@ -1586,13 +1586,13 @@ for (;;)
case 0x2029:
OK = TRUE;
break;
-
+
default:
OK = FALSE;
}
-
+
if (OK == (d == OP_VSPACE))
- {
+ {
if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
{
active_count--; /* Remove non-match possibility */
@@ -1616,7 +1616,7 @@ for (;;)
count = current_state->count; /* Number already matched */
if (clen > 0)
{
- BOOL OK;
+ BOOL OK;
switch (c)
{
case 0x09: /* HT */
@@ -1640,14 +1640,14 @@ for (;;)
case 0x3000: /* IDEOGRAPHIC SPACE */
OK = TRUE;
break;
-
+
default:
OK = FALSE;
break;
}
-
+
if (OK == (d == OP_HSPACE))
- {
+ {
if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
{
active_count--; /* Remove non-match possibility */
@@ -1771,8 +1771,8 @@ for (;;)
case 0x2028:
case 0x2029:
break;
-
- default:
+
+ default:
ADD_NEW(state_offset + 1, 0);
break;
}
@@ -1791,7 +1791,7 @@ for (;;)
case 0x2029:
ADD_NEW(state_offset + 1, 0);
break;
-
+
default: break;
}
break;
@@ -1820,8 +1820,8 @@ for (;;)
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
break;
-
- default:
+
+ default:
ADD_NEW(state_offset + 1, 0);
break;
}
diff --git a/pcre_exec.c b/pcre_exec.c
index f5a2340..f62b5fc 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -2941,7 +2941,7 @@ for (;;)
}
}
break;
-
+
case OP_HSPACE:
for (i = 1; i <= min; i++)
{
@@ -2973,7 +2973,7 @@ for (;;)
}
}
break;
-
+
case OP_NOT_VSPACE:
for (i = 1; i <= min; i++)
{
@@ -2993,7 +2993,7 @@ for (;;)
}
}
break;
-
+
case OP_VSPACE:
for (i = 1; i <= min; i++)
{
@@ -3009,7 +3009,7 @@ for (;;)
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
- break;
+ break;
}
}
break;
@@ -3150,7 +3150,7 @@ for (;;)
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
- break;
+ break;
}
}
break;
@@ -3184,7 +3184,7 @@ for (;;)
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
- break;
+ break;
}
}
break;
@@ -3845,16 +3845,16 @@ for (;;)
break;
case OP_NOT_HSPACE:
- case OP_HSPACE:
+ case OP_HSPACE:
for (i = min; i < max; i++)
{
- BOOL gotspace;
+ BOOL gotspace;
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
switch(c)
- {
- default: gotspace = FALSE; break;
+ {
+ default: gotspace = FALSE; break;
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
@@ -3875,7 +3875,7 @@ for (;;)
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
gotspace = TRUE;
- break;
+ break;
}
if (gotspace == (ctype == OP_NOT_HSPACE)) break;
eptr += len;
@@ -3883,16 +3883,16 @@ for (;;)
break;
case OP_NOT_VSPACE:
- case OP_VSPACE:
+ case OP_VSPACE:
for (i = min; i < max; i++)
{
- BOOL gotspace;
+ BOOL gotspace;
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
switch(c)
{
- default: gotspace = FALSE; break;
+ default: gotspace = FALSE; break;
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
@@ -3903,7 +3903,7 @@ for (;;)
gotspace = TRUE;
break;
}
- if (gotspace == (ctype == OP_NOT_VSPACE)) break;
+ if (gotspace == (ctype == OP_NOT_VSPACE)) break;
eptr += len;
}
break;
@@ -4040,7 +4040,7 @@ for (;;)
if (eptr >= md->end_subject) break;
c = *eptr;
if (c == 0x09 || c == 0x20 || c == 0xa0) break;
- eptr++;
+ eptr++;
}
break;
@@ -4050,7 +4050,7 @@ for (;;)
if (eptr >= md->end_subject) break;
c = *eptr;
if (c != 0x09 && c != 0x20 && c != 0xa0) break;
- eptr++;
+ eptr++;
}
break;
@@ -4061,7 +4061,7 @@ for (;;)
c = *eptr;
if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
break;
- eptr++;
+ eptr++;
}
break;