summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-08-02 11:00:40 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-08-02 11:00:40 +0000
commit 9c65843dde6af3b331acdf8518a6020df32f45af (patch)
tree f4938ee9a3d4ca4b7282f86370a5a39875a3a562
parent 2c1db477501a36945e05bc50a1d563c96c4e13f4 (diff)
download pcre-9c65843dde6af3b331acdf8518a6020df32f45af.tar.gz
Documentation and general text tidies in preparation for test release.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@654 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--AUTHORS4
-rw-r--r--ChangeLog254
-rw-r--r--LICENCE4
-rw-r--r--NEWS10
-rw-r--r--README19
-rwxr-xr-xRunTest54
-rw-r--r--RunTest.bat2
-rw-r--r--config-cmake.h.in2
-rw-r--r--configure.ac4
-rw-r--r--doc/html/pcre.html25
-rw-r--r--doc/html/pcreapi.html150
-rw-r--r--doc/html/pcrebuild.html78
-rw-r--r--doc/html/pcrecallout.html38
-rw-r--r--doc/html/pcrecompat.html45
-rw-r--r--doc/html/pcrecpp.html4
-rw-r--r--doc/html/pcregrep.html40
-rw-r--r--doc/html/pcrepattern.html59
-rw-r--r--doc/html/pcrestack.html9
-rw-r--r--doc/html/pcretest.html152
-rw-r--r--doc/pcre.34
-rw-r--r--doc/pcre.txt1226
-rw-r--r--doc/pcreapi.348
-rw-r--r--doc/pcrebuild.345
-rw-r--r--doc/pcrecallout.36
-rw-r--r--doc/pcrecompat.312
-rw-r--r--doc/pcregrep.110
-rw-r--r--doc/pcregrep.txt518
-rw-r--r--doc/pcrepattern.322
-rw-r--r--doc/pcrestack.32
-rw-r--r--doc/pcretest.136
-rw-r--r--doc/pcretest.txt335
-rw-r--r--maint/README14
-rw-r--r--pcre.h.in2
-rw-r--r--pcre_compile.c158
-rw-r--r--pcre_dfa_exec.c106
-rw-r--r--pcre_exec.c26
-rw-r--r--pcre_internal.h10
-rw-r--r--pcre_printint.src32
-rw-r--r--pcre_study.c50
-rw-r--r--pcre_tables.c280
-rw-r--r--pcre_valid_utf8.c138
-rw-r--r--pcregrep.c42
-rw-r--r--pcreposix.c2
-rw-r--r--pcretest.c80
-rwxr-xr-xperltest.pl6
-rw-r--r--ucp.h2
46 files changed, 2358 insertions, 1807 deletions
diff --git a/AUTHORS b/AUTHORS
index 88db849..d6885ad 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -8,7 +8,7 @@ Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England.
-Copyright (c) 1997-2010 University of Cambridge
+Copyright (c) 1997-2011 University of Cambridge
All rights reserved
@@ -17,7 +17,7 @@ THE C++ WRAPPER LIBRARY
Written by: Google Inc.
-Copyright (c) 2007-2010 Google Inc
+Copyright (c) 2007-2011 Google Inc
All rights reserved
####
diff --git a/ChangeLog b/ChangeLog
index 68a42a2..ca76923 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -20,231 +20,231 @@ Version 8.13 02-Aug-2011
code. (b) A reference to 2 copies of a 3-byte code would not match 2 of a
2-byte code at the end of the subject (it thought there wasn't enough data
left).
-
-5. Comprehensive information about what went wrong is now returned by
- pcre_exec() and pcre_dfa_exec() when the UTF-8 string check fails, as long
- as the output vector has at least 2 elements. The offset of the start of
+
+5. Comprehensive information about what went wrong is now returned by
+ pcre_exec() and pcre_dfa_exec() when the UTF-8 string check fails, as long
+ as the output vector has at least 2 elements. The offset of the start of
the failing character and a reason code are placed in the vector.
-
-6. When the UTF-8 string check fails for pcre_compile(), the offset that is
- now returned is for the first byte of the failing character, instead of the
- last byte inspected. This is an incompatible change, but I hope it is small
+
+6. When the UTF-8 string check fails for pcre_compile(), the offset that is
+ now returned is for the first byte of the failing character, instead of the
+ last byte inspected. This is an incompatible change, but I hope it is small
enough not to be a problem. It makes the returned offset consistent with
pcre_exec() and pcre_dfa_exec().
-
+
7. pcretest now gives a text phrase as well as the error number when
pcre_exec() or pcre_dfa_exec() fails; if the error is a UTF-8 check
failure, the offset and reason code are output.
-
-8. When \R was used with a maximizing quantifier it failed to skip backwards
+
+8. When \R was used with a maximizing quantifier it failed to skip backwards
over a \r\n pair if the subsequent match failed. Instead, it just skipped
- back over a single character (\n). This seems wrong (because it treated the
+ back over a single character (\n). This seems wrong (because it treated the
two characters as a single entity when going forwards), conflicts with the
documentation that \R is equivalent to (?>\r\n|\n|...etc), and makes the
- behaviour of \R* different to (\R)*, which also seems wrong. The behaviour
+ behaviour of \R* different to (\R)*, which also seems wrong. The behaviour
has been changed.
-
-9. Some internal refactoring has changed the processing so that the handling
+
+9. Some internal refactoring has changed the processing so that the handling
of the PCRE_CASELESS and PCRE_MULTILINE options is done entirely at compile
time (the PCRE_DOTALL option was changed this way some time ago: version
- 7.7 change 16). This has made it possible to abolish the OP_OPT op code,
- which was always a bit of a fudge. It also means that there is one less
- argument for the match() function, which reduces its stack requirements
+ 7.7 change 16). This has made it possible to abolish the OP_OPT op code,
+ which was always a bit of a fudge. It also means that there is one less
+ argument for the match() function, which reduces its stack requirements
slightly. This change also fixes an incompatibility with Perl: the pattern
(?i:([^b]))(?1) should not match "ab", but previously PCRE gave a match.
-
+
10. More internal refactoring has drastically reduced the number of recursive
- calls to match() for possessively repeated groups such as (abc)++ when
+ calls to match() for possessively repeated groups such as (abc)++ when
using pcre_exec().
-
+
11. While implementing 10, a number of bugs in the handling of groups were
discovered and fixed:
-
+
(?<=(a)+) was not diagnosed as invalid (non-fixed-length lookbehind).
(a|)*(?1) gave a compile-time internal error.
- ((a|)+)+ did not notice that the outer group could match an empty string.
+ ((a|)+)+ did not notice that the outer group could match an empty string.
(^a|^)+ was not marked as anchored.
- (.*a|.*)+ was not marked as matching at start or after a newline.
-
+ (.*a|.*)+ was not marked as matching at start or after a newline.
+
12. Yet more internal refactoring has removed another argument from the match()
- function. Special calls to this function are now indicated by setting a
- value in a variable in the "match data" data block.
-
-13. Be more explicit in pcre_study() instead of relying on "default" for
- opcodes that mean there is no starting character; this means that when new
- ones are added and accidentally left out of pcre_study(), testing should
+ function. Special calls to this function are now indicated by setting a
+ value in a variable in the "match data" data block.
+
+13. Be more explicit in pcre_study() instead of relying on "default" for
+ opcodes that mean there is no starting character; this means that when new
+ ones are added and accidentally left out of pcre_study(), testing should
pick them up.
-
-14. The -s option of pcretest has been documented for ages as being an old
- synonym of -m (show memory usage). I have changed it to mean "force study
- for every regex", that is, assume /S for every regex. This is similar to -i
- and -d etc. It's slightly incompatible, but I'm hoping nobody is still
+
+14. The -s option of pcretest has been documented for ages as being an old
+ synonym of -m (show memory usage). I have changed it to mean "force study
+ for every regex", that is, assume /S for every regex. This is similar to -i
+ and -d etc. It's slightly incompatible, but I'm hoping nobody is still
using it. It makes it easier to run collections of tests with and without
- study enabled, and thereby test pcre_study() more easily. All the standard
- tests are now run with and without -s (but some patterns can be marked as
+ study enabled, and thereby test pcre_study() more easily. All the standard
+ tests are now run with and without -s (but some patterns can be marked as
"never study" - see 20 below).
-
+
15. When (*ACCEPT) was used in a subpattern that was called recursively, the
- restoration of the capturing data to the outer values was not happening
+ restoration of the capturing data to the outer values was not happening
correctly.
-
+
16. If a recursively called subpattern ended with (*ACCEPT) and matched an
empty string, and PCRE_NOTEMPTY was set, pcre_exec() thought the whole
pattern had matched an empty string, and so incorrectly returned a no
match.
-
+
17. There was optimizing code for the last branch of non-capturing parentheses,
- and also for the obeyed branch of a conditional subexpression, which used
- tail recursion to cut down on stack usage. Unfortunately, now that there is
- the possibility of (*THEN) occurring in these branches, tail recursion is
- no longer possible because the return has to be checked for (*THEN). These
- two optimizations have therefore been removed.
-
+ and also for the obeyed branch of a conditional subexpression, which used
+ tail recursion to cut down on stack usage. Unfortunately, now that there is
+ the possibility of (*THEN) occurring in these branches, tail recursion is
+ no longer possible because the return has to be checked for (*THEN). These
+ two optimizations have therefore been removed.
+
18. If a pattern containing \R was studied, it was assumed that \R always
matched two bytes, thus causing the minimum subject length to be
incorrectly computed because \R can also match just one byte.
-
+
19. If a pattern containing (*ACCEPT) was studied, the minimum subject length
- was incorrectly computed.
-
+ was incorrectly computed.
+
20. If /S is present twice on a test pattern in pcretest input, it now
- *disables* studying, thereby overriding the use of -s on the command line
+ *disables* studying, thereby overriding the use of -s on the command line
(see 14 above). This is necessary for one or two tests to keep the output
identical in both cases.
-
+
21. When (*ACCEPT) was used in an assertion that matched an empty string and
- PCRE_NOTEMPTY was set, PCRE applied the non-empty test to the assertion.
-
-22. When an atomic group that contained a capturing parenthesis was
- successfully matched, but the branch in which it appeared failed, the
- capturing was not being forgotten if a higher numbered group was later
+ PCRE_NOTEMPTY was set, PCRE applied the non-empty test to the assertion.
+
+22. When an atomic group that contained a capturing parenthesis was
+ successfully matched, but the branch in which it appeared failed, the
+ capturing was not being forgotten if a higher numbered group was later
captured. For example, /(?>(a))b|(a)c/ when matching "ac" set capturing
group 1 to "a", when in fact it should be unset. This applied to multi-
- branched capturing and non-capturing groups, repeated or not, and also to
- positive assertions (capturing in negative assertions does not happen
- in PCRE) and also to nested atomic groups.
-
-23. Add the ++ qualifier feature to pcretest, to show the remainder of the
- subject after a captured substring, to make it easier to tell which of a
+ branched capturing and non-capturing groups, repeated or not, and also to
+ positive assertions (capturing in negative assertions does not happen
+ in PCRE) and also to nested atomic groups.
+
+23. Add the ++ qualifier feature to pcretest, to show the remainder of the
+ subject after a captured substring, to make it easier to tell which of a
number of identical substrings has been captured.
-
+
24. The way atomic groups are processed by pcre_exec() has been changed so that
- if they are repeated, backtracking one repetition now resets captured
+ if they are repeated, backtracking one repetition now resets captured
values correctly. For example, if ((?>(a+)b)+aabab) is matched against
- "aaaabaaabaabab" the value of captured group 2 is now correctly recorded as
- "aaa". Previously, it would have been "a". As part of this code
+ "aaaabaaabaabab" the value of captured group 2 is now correctly recorded as
+ "aaa". Previously, it would have been "a". As part of this code
refactoring, the way recursive calls are handled has also been changed.
-
-25. If an assertion condition captured any substrings, they were not passed
+
+25. If an assertion condition captured any substrings, they were not passed
back unless some other capturing happened later. For example, if
(?(?=(a))a) was matched against "a", no capturing was returned.
-
+
26. When studying a pattern that contained subroutine calls or assertions,
- the code for finding the minimum length of a possible match was handling
- direct recursions such as (xxx(?1)|yyy) but not mutual recursions (where
+ the code for finding the minimum length of a possible match was handling
+ direct recursions such as (xxx(?1)|yyy) but not mutual recursions (where
group 1 called group 2 while simultaneously a separate group 2 called group
1). A stack overflow occurred in this case. I have fixed this by limiting
the recursion depth to 10.
-
+
27. Updated RunTest.bat in the distribution to the version supplied by Tom
Fortmann. This supports explicit test numbers on the command line, and has
argument validation and error reporting.
-
-28. An instance of \X with an unlimited repeat could fail if at any point the
- first character it looked at was a mark character.
-
-29. Some minor code refactoring concerning Unicode properties and scripts
- should reduce the stack requirement of match() slightly.
-
+
+28. An instance of \X with an unlimited repeat could fail if at any point the
+ first character it looked at was a mark character.
+
+29. Some minor code refactoring concerning Unicode properties and scripts
+ should reduce the stack requirement of match() slightly.
+
30. Added the '=' option to pcretest to check the setting of unused capturing
slots at the end of the pattern, which are documented as being -1, but are
- not included in the return count.
-
+ not included in the return count.
+
31. If \k was not followed by a braced, angle-bracketed, or quoted name, PCRE
- compiled something random. Now it gives a compile-time error (as does
- Perl).
-
+ compiled something random. Now it gives a compile-time error (as does
+ Perl).
+
32. A *MARK encountered during the processing of a positive assertion is now
- recorded and passed back (compatible with Perl).
-
+ recorded and passed back (compatible with Perl).
+
33. If --only-matching or --colour was set on a pcregrep call whose pattern
had alternative anchored branches, the search for a second match in a line
was done as if at the line start. Thus, for example, /^01|^02/ incorrectly
matched the line "0102" twice. The same bug affected patterns that started
with a backwards assertion. For example /\b01|\b02/ also matched "0102"
- twice.
-
-34. Previously, PCRE did not allow quantification of assertions. However, Perl
- does, and because of capturing effects, quantifying parenthesized
- assertions may at times be useful. Quantifiers are now allowed for
+ twice.
+
+34. Previously, PCRE did not allow quantification of assertions. However, Perl
+ does, and because of capturing effects, quantifying parenthesized
+ assertions may at times be useful. Quantifiers are now allowed for
parenthesized assertions.
-
-35. A minor code tidy in pcre_compile() when checking options for \R usage.
+
+35. A minor code tidy in pcre_compile() when checking options for \R usage.
36. \g was being checked for fancy things in a character class, when it should
just be a literal "g".
-
+
37. PCRE was rejecting [:a[:digit:]] whereas Perl was not. It seems that the
appearance of a nested POSIX class supersedes an apparent external class.
For example, [:a[:digit:]b:] matches "a", "b", ":", or a digit. Also,
unescaped square brackets may appear as part of class names. For
- example, [:a[:abc]b:] gives unknown class "[:abc]b:]". PCRE now behaves
+ example, [:a[:abc]b:] gives unknown class "[:abc]b:]". PCRE now behaves
more like Perl.
-
-38. PCRE was giving an error for \N with a braced quantifier such as {1,} (this
+
+38. PCRE was giving an error for \N with a braced quantifier such as {1,} (this
was because it thought it was \N{name}, which is not supported).
-
-39. Add minix to OS list not supporting the -S option in pcretest.
+
+39. Add minix to OS list not supporting the -S option in pcretest.
40. PCRE tries to detect cases of infinite recursion at compile time, but it
cannot analyze patterns in sufficient detail to catch mutual recursions
- such as ((?1))((?2)). There is now a runtime test that gives an error if a
- subgroup is called recursively as a subpattern for a second time at the
+ such as ((?1))((?2)). There is now a runtime test that gives an error if a
+ subgroup is called recursively as a subpattern for a second time at the
same position in the subject string. In previous releases this might have
- been caught by the recursion limit, or it might have run out of stack.
-
+ been caught by the recursion limit, or it might have run out of stack.
+
41. A pattern such as /(?(R)a+|(?R)b)/ is quite safe, as the recursion can
- happen only once. PCRE was, however incorrectly giving a compile time error
- "recursive call could loop indefinitely" because it cannot analyze the
- pattern in sufficient detail. The compile time test no longer happens when
- PCRE is compiling a conditional subpattern, but actual runaway loops are
- now caught at runtime (see 40 above).
-
+ happen only once. PCRE was, however, incorrectly giving a compile time error
+ "recursive call could loop indefinitely" because it cannot analyze the
+ pattern in sufficient detail. The compile time test no longer happens when
+ PCRE is compiling a conditional subpattern, but actual runaway loops are
+ now caught at runtime (see 40 above).
+
42. It seems that Perl allows any characters other than a closing parenthesis
- to be part of the NAME in (*MARK:NAME) and other backtracking verbs. PCRE
- has been changed to be the same.
-
+ to be part of the NAME in (*MARK:NAME) and other backtracking verbs. PCRE
+ has been changed to be the same.
+
43. Updated configure.ac to put in more quoting round AC_LANG_PROGRAM etc. so
- as not to get warnings when autogen.sh is called. Also changed
+ as not to get warnings when autogen.sh is called. Also changed
AC_PROG_LIBTOOL (deprecated) to LT_INIT (the current macro).
-
-44. To help people who use pcregrep to scan files containing exceedingly long
+
+44. To help people who use pcregrep to scan files containing exceedingly long
lines, the following changes have been made:
-
+
(a) The default value of the buffer size parameter has been increased from
8K to 20K. (The actual buffer used is three times this size.)
-
+
(b) The default can be changed by ./configure --with-pcregrep-bufsize when
PCRE is built.
-
+
(c) A --buffer-size=n option has been added to pcregrep, to allow the size
to be set at run time.
-
+
(d) Numerical values in pcregrep options can be followed by K or M, for
example --buffer-size=50K.
-
- (e) If a line being scanned overflows pcregrep's buffer, an error is now
+
+ (e) If a line being scanned overflows pcregrep's buffer, an error is now
given and the return code is set to 2.
-
-45. Add a pointer to the latest mark to the callout data block.
+
+45. Add a pointer to the latest mark to the callout data block.
46. The pattern /.(*F)/, when applied to "abc" with PCRE_PARTIAL_HARD, gave a
partial match of an empty string instead of no match. This was specific to
the use of ".".
-
+
47. The pattern /f.*/8s, when applied to "for" with PCRE_PARTIAL_HARD, gave a
complete match instead of a partial match. This bug was dependent on both
- the PCRE_UTF8 and PCRE_DOTALL options being set.
+ the PCRE_UTF8 and PCRE_DOTALL options being set.
Version 8.12 15-Jan-2011
diff --git a/LICENCE b/LICENCE
index 0dd2257..65a7ec7 100644
--- a/LICENCE
+++ b/LICENCE
@@ -22,7 +22,7 @@ Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England.
-Copyright (c) 1997-2010 University of Cambridge
+Copyright (c) 1997-2011 University of Cambridge
All rights reserved.
@@ -31,7 +31,7 @@ THE C++ WRAPPER FUNCTIONS
Contributed by: Google Inc.
-Copyright (c) 2007-2010, Google Inc.
+Copyright (c) 2007-2011, Google Inc.
All rights reserved.
diff --git a/NEWS b/NEWS
index 5f2b29b..3ef0c78 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,16 @@
News about PCRE releases
------------------------
+Release 8.13 02-Aug-2011
+------------------------
+
+This is mainly a bug-fix release. There has been a lot of internal refactoring.
+The Unicode tables have been updated. The only new feature in the library is
+the passing of *MARK information to callouts. Some additions have been made to
+pcretest to make testing easier and more comprehensive. There is a new option
+for pcregrep to adjust its internal buffer size.
+
+
Release 8.12 15-Jan-2011
------------------------
diff --git a/README b/README
index 2f3e926..ab82c7c 100644
--- a/README
+++ b/README
@@ -159,7 +159,15 @@ possible to build it as a C++ library, though the provided building apparatus
does not have any features to support this.
There are some optional features that can be included or omitted from the PCRE
-library. You can read more about them in the pcrebuild man page.
+library. They are also documented in the pcrebuild man page.
+
+. By default, both shared and static libraries are built. You can change this
+ by adding one of these options to the "configure" command:
+
+ --disable-shared
+ --disable-static
+
+ (See also "Shared libraries on Unix-like systems" below.)
. If you want to suppress the building of the C++ wrapper library, you can add
--disable-cpp to the "configure" command. Otherwise, when "configure" is run,
@@ -279,6 +287,13 @@ library. You can read more about them in the pcrebuild man page.
Of course, the relevant libraries must be installed on your system.
+. The default size of the internal buffer used by pcregrep can be set by, for
+ example:
+
+ --with-pcregrep-bufsize=50K
+
+ The default value is 20K.
+
. It is possible to compile pcretest so that it links with the libreadline
library, by specifying
@@ -796,4 +811,4 @@ The distribution should contain the following files:
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
-Last updated: 19 January 2010
+Last updated: 02 August 2011
diff --git a/RunTest b/RunTest
index 624d8e5..c8f0283 100755
--- a/RunTest
+++ b/RunTest
@@ -152,7 +152,7 @@ echo PCRE C library tests
if [ $do1 = yes ] ; then
echo "Test 1: main functionality (Compatible with Perl >= 5.8)"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt $testdata/testinput1 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput1 testtry
@@ -160,14 +160,14 @@ if [ $do1 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
# PCRE tests that are not Perl-compatible - API, errors, internals
if [ $do2 = yes ] ; then
echo "Test 2: API, errors, internals, and non-Perl stuff"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt $testdata/testinput2 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput2 testtry
@@ -182,7 +182,7 @@ if [ $do2 = yes ] ; then
exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
# Locale-specific tests, provided that either the "fr_FR" or the "french"
@@ -210,7 +210,7 @@ if [ $do3 = yes ] ; then
if [ "$locale" != "" ] ; then
echo "Test 3: locale-specific features (using '$locale' locale)"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt $infile testtry
if [ $? = 0 ] ; then
$cf $outfile testtry
@@ -219,13 +219,13 @@ if [ $do3 = yes ] ; then
echo "Locale test did not run entirely successfully."
echo "This usually means that there is a problem with the locale"
echo "settings rather than a bug in PCRE."
- break;
+ break;
else
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
fi
else exit 1
fi
- done
+ done
else
echo "Cannot test locale-specific features - neither the 'fr_FR' nor the"
echo "'french' locale exists, or the \"locale\" command is not available"
@@ -238,7 +238,7 @@ fi
if [ $do4 = yes ] ; then
echo "Test 4: UTF-8 support (Compatible with Perl >= 5.8)"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt $testdata/testinput4 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput4 testtry
@@ -246,12 +246,12 @@ if [ $do4 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
if [ $do5 = yes ] ; then
echo "Test 5: API, internals, and non-Perl stuff for UTF-8 support"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt $testdata/testinput5 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput5 testtry
@@ -259,12 +259,12 @@ if [ $do5 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
if [ $do6 = yes ] ; then
echo "Test 6: Unicode property support (Compatible with Perl >= 5.10)"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt $testdata/testinput6 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput6 testtry
@@ -272,14 +272,14 @@ if [ $do6 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
# Tests for DFA matching support
if [ $do7 = yes ] ; then
echo "Test 7: DFA matching"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt -dfa $testdata/testinput7 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput7 testtry
@@ -287,12 +287,12 @@ if [ $do7 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
if [ $do8 = yes ] ; then
echo "Test 8: DFA matching with UTF-8"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt -dfa $testdata/testinput8 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput8 testtry
@@ -300,12 +300,12 @@ if [ $do8 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
if [ $do9 = yes ] ; then
echo "Test 9: DFA matching with Unicode properties"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt -dfa $testdata/testinput9 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput9 testtry
@@ -313,7 +313,7 @@ if [ $do9 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
# Test of internal offsets and code sizes. This test is run only when there
@@ -324,7 +324,7 @@ fi
if [ $do10 = yes ] ; then
echo "Test 10: Internal offsets and code size tests"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt $testdata/testinput10 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput10 testtry
@@ -332,14 +332,14 @@ if [ $do10 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
# Test of Perl >= 5.10 features without UTF8 support
if [ $do11 = yes ] ; then
echo "Test 11: Features from Perl >= 5.10 without UTF8 support"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt $testdata/testinput11 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput11 testtry
@@ -347,14 +347,14 @@ if [ $do11 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
# Test of Perl >= 5.10 features with UTF8 support
if [ $do12 = yes ] ; then
echo "Test 12: Features from Perl >= 5.10 with UTF8 support"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt $testdata/testinput12 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput12 testtry
@@ -362,14 +362,14 @@ if [ $do12 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
# Test non-Perl-compatible Unicode property support
if [ $do13 = yes ] ; then
echo "Test 13: API, internals, and non-Perl stuff for Unicode property support"
- for opt in "" "-s"; do
+ for opt in "" "-s"; do
$valgrind ./pcretest -q $opt $testdata/testinput13 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput13 testtry
@@ -377,7 +377,7 @@ if [ $do13 = yes ] ; then
else exit 1
fi
if [ "$opt" = "-s" ] ; then echo "OK with study" ; else echo "OK"; fi
- done
+ done
fi
# End
diff --git a/RunTest.bat b/RunTest.bat
index dea4bb9..8d411f4 100644
--- a/RunTest.bat
+++ b/RunTest.bat
@@ -4,7 +4,7 @@
@rem Philip H also changed test 3 to use "wintest" files.
@rem
@rem Updated by Tom Fortmann to support explicit test numbers on the command line.
-@rem Added argument validation and added error reporting.
+@rem Added argument validation and added error reporting.
@rem
@rem MS Windows batch file to run pcretest on testfiles with the correct
@rem options.
diff --git a/config-cmake.h.in b/config-cmake.h.in
index 5951af7..b212276 100644
--- a/config-cmake.h.in
+++ b/config-cmake.h.in
@@ -36,7 +36,7 @@
#define LINK_SIZE @PCRE_LINK_SIZE@
#define MATCH_LIMIT @PCRE_MATCH_LIMIT@
#define MATCH_LIMIT_RECURSION @PCRE_MATCH_LIMIT_RECURSION@
-
+#define PCREGREP_BUFSIZE @PCREGREP_BUFSIZE@
#define MAX_NAME_SIZE 32
#define MAX_NAME_COUNT 10000
diff --git a/configure.ac b/configure.ac
index 1b50ed6..8bb4017 100644
--- a/configure.ac
+++ b/configure.ac
@@ -509,7 +509,7 @@ fi
if test $with_pcregrep_bufsize -lt 8192 ; then
with_pcregrep_bufsize="8192"
fi
-
+
AC_DEFINE_UNQUOTED([PCREGREP_BUFSIZE], [$with_pcregrep_bufsize], [
The value of PCREGREP_BUFSIZE determines the size of buffer used by
pcregrep to hold parts of the file it is searching. On systems that
@@ -744,7 +744,7 @@ $PACKAGE-$VERSION configuration summary:
Match limit recursion ........... : ${with_match_limit_recursion}
Build shared libs ............... : ${enable_shared}
Build static libs ............... : ${enable_static}
- Buffer size for pcregrep ........ : ${with_pcregrep_bufsize}
+ Buffer size for pcregrep ........ : ${with_pcregrep_bufsize}
Link pcregrep with libz ......... : ${enable_pcregrep_libz}
Link pcregrep with libbz2 ....... : ${enable_pcregrep_libbz2}
Link pcretest with libreadline .. : ${enable_pcretest_libreadline}
diff --git a/doc/html/pcre.html b/doc/html/pcre.html
index f2ef9dd..ddba3d6 100644
--- a/doc/html/pcre.html
+++ b/doc/html/pcre.html
@@ -33,7 +33,7 @@ for requesting some minor changes that give better JavaScript compatibility.
The current implementation of PCRE corresponds approximately with Perl 5.12,
including support for UTF-8 encoded strings and Unicode general category
properties. However, UTF-8 and Unicode support has to be explicitly enabled; it
-is not the default. The Unicode tables correspond to Unicode release 5.2.0.
+is not the default. The Unicode tables correspond to Unicode release 6.0.0.
</P>
<P>
In addition to the Perl-compatible matching function, PCRE contains an
@@ -207,13 +207,18 @@ the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
UTF-8.)
</P>
<P>
-If an invalid UTF-8 string is passed to PCRE, an error return
-(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
-your strings are valid, and therefore want to skip these checks in order to
-improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
-at run time, PCRE assumes that the pattern or subject it is given
-(respectively) contains only valid UTF-8 codes. In this case, it does not
-diagnose an invalid UTF-8 string.
+If an invalid UTF-8 string is passed to PCRE, an error return is given. At
+compile time, the only additional information is the offset to the first byte
+of the failing character. The runtime functions (<b>pcre_exec()</b> and
+<b>pcre_dfa_exec()</b>) pass back this information as well as a more detailed
+reason code if the caller has provided memory in which to do this.
+</P>
+<P>
+In some situations, you may already know that your strings are valid, and
+therefore want to skip these checks in order to improve performance. If you set
+the PCRE_NO_UTF8_CHECK flag at compile time or at run time, PCRE assumes that
+the pattern or subject it is given (respectively) contains only valid UTF-8
+codes. In this case, it does not diagnose an invalid UTF-8 string.
</P>
<P>
If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
@@ -305,9 +310,9 @@ two digits 10, at the domain cam.ac.uk.
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 13 November 2010
+Last updated: 07 May 2011
<br>
-Copyright &copy; 1997-2010 University of Cambridge.
+Copyright &copy; 1997-2011 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
diff --git a/doc/html/pcreapi.html b/doc/html/pcreapi.html
index 8c86fb5..4c09656 100644
--- a/doc/html/pcreapi.html
+++ b/doc/html/pcreapi.html
@@ -444,17 +444,17 @@ If <i>errptr</i> is NULL, <b>pcre_compile()</b> returns NULL immediately.
Otherwise, if compilation of a pattern fails, <b>pcre_compile()</b> returns
NULL, and sets the variable pointed to by <i>errptr</i> to point to a textual
error message. This is a static string that is part of the library. You must
-not try to free it. The offset from the start of the pattern to the byte that
-was being processed when the error was discovered is placed in the variable
-pointed to by <i>erroffset</i>, which must not be NULL. If it is, an immediate
-error is given. Some errors are not detected until checks are carried out when
-the whole pattern has been scanned; in this case the offset is set to the end
-of the pattern.
+not try to free it. Normally, the offset from the start of the pattern to the
+byte that was being processed when the error was discovered is placed in the
+variable pointed to by <i>erroffset</i>, which must not be NULL (if it is, an
+immediate error is given). However, for an invalid UTF-8 string, the offset is
+that of the first byte of the failing character. Also, some errors are not
+detected until checks are carried out when the whole pattern has been scanned;
+in these cases the offset passed back is the length of the pattern.
</P>
<P>
Note that the offset is in bytes, not characters, even in UTF-8 mode. It may
-point into the middle of a UTF-8 character (for example, when
-PCRE_ERROR_BADUTF8 is returned for an invalid UTF-8 string).
+sometimes point into the middle of a UTF-8 character.
</P>
<P>
If <b>pcre_compile2()</b> is used instead of <b>pcre_compile()</b>, and the
@@ -561,9 +561,9 @@ ignored. This is equivalent to Perl's /x option, and it can be changed within a
pattern by a (?x) option setting.
</P>
<P>
-Which characters are interpreted as newlines
-is controlled by the options passed to <b>pcre_compile()</b> or by a special
-sequence at the start of the pattern, as described in the section entitled
+Which characters are interpreted as newlines is controlled by the options
+passed to <b>pcre_compile()</b> or by a special sequence at the start of the
+pattern, as described in the section entitled
<a href="pcrepattern.html#newlines">"Newline conventions"</a>
in the <b>pcrepattern</b> documentation. Note that the end of this type of
comment is a literal newline sequence in the pattern; escape sequences that
@@ -939,7 +939,7 @@ internal tables) to <b>pcre_exec()</b>. Although not intended for this purpose,
this facility could be used to match a pattern in a different locale from the
one in which it was compiled. Passing table pointers at run time is discussed
below in the section on matching a pattern.
-</P>
+<a name="infoaboutpattern"></a></P>
<br><a name="SEC11" href="#TOC1">INFORMATION ABOUT A PATTERN</a><br>
<P>
<b>int pcre_fullinfo(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
@@ -1533,9 +1533,13 @@ in the main
<a href="pcre.html"><b>pcre</b></a>
page. If an invalid UTF-8 sequence of bytes is found, <b>pcre_exec()</b> returns
the error PCRE_ERROR_BADUTF8 or, if PCRE_PARTIAL_HARD is set and the problem is
-a truncated UTF-8 character at the end of the subject, PCRE_ERROR_SHORTUTF8. If
-<i>startoffset</i> contains a value that does not point to the start of a UTF-8
-character (or to the end of the subject), PCRE_ERROR_BADUTF8_OFFSET is
+a truncated UTF-8 character at the end of the subject, PCRE_ERROR_SHORTUTF8. In
+both cases, information about the precise nature of the error may also be
+returned (see the descriptions of these errors in the section entitled <i>Error
+return values from</i> <b>pcre_exec()</b>
+<a href="#errorlist">below).</a>
+If <i>startoffset</i> contains a value that does not point to the start of a
+UTF-8 character (or to the end of the subject), PCRE_ERROR_BADUTF8_OFFSET is
returned.
</P>
<P>
@@ -1784,14 +1788,21 @@ documentation for details.
<pre>
PCRE_ERROR_BADUTF8 (-10)
</pre>
-A string that contains an invalid UTF-8 byte sequence was passed as a subject.
-However, if PCRE_PARTIAL_HARD is set and the problem is a truncated UTF-8
-character at the end of the subject, PCRE_ERROR_SHORTUTF8 is used instead.
+A string that contains an invalid UTF-8 byte sequence was passed as a subject,
+and the PCRE_NO_UTF8_CHECK option was not set. If the size of the output vector
+(<i>ovecsize</i>) is at least 2, the byte offset to the start of the invalid
+UTF-8 character is placed in the first element, and a reason code is placed in
+the second element. The reason codes are listed in the
+<a href="#badutf8reasons">following section.</a>
+For backward compatibility, if PCRE_PARTIAL_HARD is set and the problem is a
+truncated UTF-8 character at the end of the subject (reason codes 1 to 5),
+PCRE_ERROR_SHORTUTF8 is returned instead of PCRE_ERROR_BADUTF8.
<pre>
PCRE_ERROR_BADUTF8_OFFSET (-11)
</pre>
-The UTF-8 byte sequence that was passed as a subject was valid, but the value
-of <i>startoffset</i> did not point to the beginning of a UTF-8 character or the
+The UTF-8 byte sequence that was passed as a subject was checked and found to
+be valid (the PCRE_NO_UTF8_CHECK option was not set), but the value of
+<i>startoffset</i> did not point to the beginning of a UTF-8 character or the
end of the subject.
<pre>
PCRE_ERROR_PARTIAL (-12)
@@ -1833,12 +1844,98 @@ subject, that is, the value in <i>length</i>.
<pre>
PCRE_ERROR_SHORTUTF8 (-25)
</pre>
-The subject string ended with an incomplete (truncated) UTF-8 character, and
-the PCRE_PARTIAL_HARD option was set. Without this option, PCRE_ERROR_BADUTF8
-is returned in this situation.
+This error is returned instead of PCRE_ERROR_BADUTF8 when the subject string
+ends with a truncated UTF-8 character and the PCRE_PARTIAL_HARD option is set.
+Information about the failure is returned as for PCRE_ERROR_BADUTF8. That
+information is in fact sufficient to detect this case, but this special error
+code for PCRE_PARTIAL_HARD predates the implementation of returned
+information; it is retained for backwards compatibility.
+<pre>
+ PCRE_ERROR_RECURSELOOP (-26)
+</pre>
+This error is returned when <b>pcre_exec()</b> detects a recursion loop within
+the pattern. Specifically, it means that either the whole pattern or a
+subpattern has been called recursively for the second time at the same position
+in the subject string. Some simple patterns that might do this are detected and
+faulted at compile time, but more complicated cases, in particular mutual
+recursions between two different subpatterns, cannot be detected until run
+time.
</P>
<P>
Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>.
+<a name="badutf8reasons"></a></P>
+<br><b>
+Reason codes for invalid UTF-8 strings
+</b><br>
+<P>
+When <b>pcre_exec()</b> returns either PCRE_ERROR_BADUTF8 or
+PCRE_ERROR_SHORTUTF8, and the size of the output vector (<i>ovecsize</i>) is at
+least 2, the offset of the start of the invalid UTF-8 character is placed in
+the first output vector element (<i>ovector[0]</i>) and a reason code is placed
+in the second element (<i>ovector[1]</i>). The reason codes are given names in
+the <b>pcre.h</b> header file:
+<pre>
+ PCRE_UTF8_ERR1
+ PCRE_UTF8_ERR2
+ PCRE_UTF8_ERR3
+ PCRE_UTF8_ERR4
+ PCRE_UTF8_ERR5
+</pre>
+The string ends with a truncated UTF-8 character; the code specifies how many
+bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be
+no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279)
+allows for up to 6 bytes, and this is checked first; hence the possibility of
+4 or 5 missing bytes.
+<pre>
+ PCRE_UTF8_ERR6
+ PCRE_UTF8_ERR7
+ PCRE_UTF8_ERR8
+ PCRE_UTF8_ERR9
+ PCRE_UTF8_ERR10
+</pre>
+The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the
+character do not have the binary value 0b10 (that is, either the most
+significant bit is 0, or the next bit is 1).
+<pre>
+ PCRE_UTF8_ERR11
+ PCRE_UTF8_ERR12
+</pre>
+A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long;
+these code points are excluded by RFC 3629.
+<pre>
+ PCRE_UTF8_ERR13
+</pre>
+A 4-byte character has a value greater than 0x10ffff; these code points are
+excluded by RFC 3629.
+<pre>
+ PCRE_UTF8_ERR14
+</pre>
+A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of
+code points is reserved by RFC 3629 for use with UTF-16, and so is excluded
+from UTF-8.
+<pre>
+ PCRE_UTF8_ERR15
+ PCRE_UTF8_ERR16
+ PCRE_UTF8_ERR17
+ PCRE_UTF8_ERR18
+ PCRE_UTF8_ERR19
+</pre>
+A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a
+value that can be represented by fewer bytes, which is invalid. For example,
+the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just
+one byte.
+<pre>
+ PCRE_UTF8_ERR20
+</pre>
+The two most significant bits of the first byte of a character have the binary
+value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a
+byte can only validly occur as the second or subsequent byte of a multi-byte
+character.
+<pre>
+ PCRE_UTF8_ERR21
+</pre>
+The first byte of a character has the value 0xfe or 0xff. These values can
+never occur in a valid UTF-8 string.
</P>
<br><a name="SEC15" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
<P>
@@ -2032,7 +2129,8 @@ fourth are pointers to variables which are updated by the function. After it
has run, they point to the first and last entries in the name-to-number table
for the given name. The function itself returns the length of each entry, or
PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is
-described above in the section entitled <i>Information about a pattern</i>.
+described in the section entitled <i>Information about a pattern</i>
+<a href="#infoaboutpattern">above.</a>
Given all the relevant entries for the name, you can extract each of their
numbers, and hence the captured data, if any.
</P>
@@ -2245,9 +2343,9 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC22" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 21 November 2010
+Last updated: 28 July 2011
<br>
-Copyright &copy; 1997-2010 University of Cambridge.
+Copyright &copy; 1997-2011 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
diff --git a/doc/html/pcrebuild.html b/doc/html/pcrebuild.html
index 22f83c6..1db86c7 100644
--- a/doc/html/pcrebuild.html
+++ b/doc/html/pcrebuild.html
@@ -14,12 +14,12 @@ man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">PCRE BUILD-TIME OPTIONS</a>
-<li><a name="TOC2" href="#SEC2">C++ SUPPORT</a>
-<li><a name="TOC3" href="#SEC3">UTF-8 SUPPORT</a>
-<li><a name="TOC4" href="#SEC4">UNICODE CHARACTER PROPERTY SUPPORT</a>
-<li><a name="TOC5" href="#SEC5">CODE VALUE OF NEWLINE</a>
-<li><a name="TOC6" href="#SEC6">WHAT \R MATCHES</a>
-<li><a name="TOC7" href="#SEC7">BUILDING SHARED AND STATIC LIBRARIES</a>
+<li><a name="TOC2" href="#SEC2">BUILDING SHARED AND STATIC LIBRARIES</a>
+<li><a name="TOC3" href="#SEC3">C++ SUPPORT</a>
+<li><a name="TOC4" href="#SEC4">UTF-8 SUPPORT</a>
+<li><a name="TOC5" href="#SEC5">UNICODE CHARACTER PROPERTY SUPPORT</a>
+<li><a name="TOC6" href="#SEC6">CODE VALUE OF NEWLINE</a>
+<li><a name="TOC7" href="#SEC7">WHAT \R MATCHES</a>
<li><a name="TOC8" href="#SEC8">POSIX MALLOC USAGE</a>
<li><a name="TOC9" href="#SEC9">HANDLING VERY LARGE PATTERNS</a>
<li><a name="TOC10" href="#SEC10">AVOIDING EXCESSIVE STACK USAGE</a>
@@ -27,10 +27,11 @@ man page, in case the conversion went wrong.
<li><a name="TOC12" href="#SEC12">CREATING CHARACTER TABLES AT BUILD TIME</a>
<li><a name="TOC13" href="#SEC13">USING EBCDIC CODE</a>
<li><a name="TOC14" href="#SEC14">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
-<li><a name="TOC15" href="#SEC15">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a>
-<li><a name="TOC16" href="#SEC16">SEE ALSO</a>
-<li><a name="TOC17" href="#SEC17">AUTHOR</a>
-<li><a name="TOC18" href="#SEC18">REVISION</a>
+<li><a name="TOC15" href="#SEC15">PCREGREP BUFFER SIZE</a>
+<li><a name="TOC16" href="#SEC16">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a>
+<li><a name="TOC17" href="#SEC17">SEE ALSO</a>
+<li><a name="TOC18" href="#SEC18">AUTHOR</a>
+<li><a name="TOC19" href="#SEC19">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE BUILD-TIME OPTIONS</a><br>
<P>
@@ -61,7 +62,17 @@ The following sections include descriptions of options whose names begin with
--enable and --disable always come in pairs, so the complementary option always
exists as well, but as it specifies the default, it is not described.
</P>
-<br><a name="SEC2" href="#TOC1">C++ SUPPORT</a><br>
+<br><a name="SEC2" href="#TOC1">BUILDING SHARED AND STATIC LIBRARIES</a><br>
+<P>
+The PCRE building process uses <b>libtool</b> to build both shared and static
+Unix libraries by default. You can suppress one of these by adding one of
+<pre>
+ --disable-shared
+ --disable-static
+</pre>
+to the <b>configure</b> command, as required.
+</P>
+<br><a name="SEC3" href="#TOC1">C++ SUPPORT</a><br>
<P>
By default, the <b>configure</b> script will search for a C++ compiler and C++
header files. If it finds them, it automatically builds the C++ wrapper library
@@ -71,7 +82,7 @@ for PCRE. You can disable this by adding
</pre>
to the <b>configure</b> command.
</P>
-<br><a name="SEC3" href="#TOC1">UTF-8 SUPPORT</a><br>
+<br><a name="SEC4" href="#TOC1">UTF-8 SUPPORT</a><br>
<P>
To build PCRE with support for UTF-8 Unicode character strings, add
<pre>
@@ -89,7 +100,7 @@ not possible to support both EBCDIC and UTF-8 codes in the same version of the
library. Consequently, --enable-utf8 and --enable-ebcdic are mutually
exclusive.
</P>
-<br><a name="SEC4" href="#TOC1">UNICODE CHARACTER PROPERTY SUPPORT</a><br>
+<br><a name="SEC5" href="#TOC1">UNICODE CHARACTER PROPERTY SUPPORT</a><br>
<P>
UTF-8 support allows PCRE to process character values greater than 255 in the
strings that it handles. On its own, however, it does not provide any
@@ -109,7 +120,7 @@ supported. Details are given in the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
documentation.
</P>
-<br><a name="SEC5" href="#TOC1">CODE VALUE OF NEWLINE</a><br>
+<br><a name="SEC6" href="#TOC1">CODE VALUE OF NEWLINE</a><br>
<P>
By default, PCRE interprets the linefeed (LF) character as indicating the end
of a line. This is the normal newline character on Unix-like systems. You can
@@ -142,7 +153,7 @@ Whatever line ending convention is selected when PCRE is built can be
overridden when the library functions are called. At build time it is
conventional to use the standard for your operating system.
</P>
-<br><a name="SEC6" href="#TOC1">WHAT \R MATCHES</a><br>
+<br><a name="SEC7" href="#TOC1">WHAT \R MATCHES</a><br>
<P>
By default, the sequence \R in a pattern matches any Unicode newline sequence,
whatever has been selected as the line ending sequence. If you specify
@@ -153,16 +164,6 @@ the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
selected when PCRE is built can be overridden when the library functions are
called.
</P>
-<br><a name="SEC7" href="#TOC1">BUILDING SHARED AND STATIC LIBRARIES</a><br>
-<P>
-The PCRE building process uses <b>libtool</b> to build both shared and static
-Unix libraries by default. You can suppress one of these by adding one of
-<pre>
- --disable-shared
- --disable-static
-</pre>
-to the <b>configure</b> command, as required.
-</P>
<br><a name="SEC8" href="#TOC1">POSIX MALLOC USAGE</a><br>
<P>
When PCRE is called through the POSIX interface (see the
@@ -299,7 +300,22 @@ to the <b>configure</b> command. These options naturally require that the
relevant libraries are installed on your system. Configuration will fail if
they are not.
</P>
-<br><a name="SEC15" href="#TOC1">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a><br>
+<br><a name="SEC15" href="#TOC1">PCREGREP BUFFER SIZE</a><br>
+<P>
+<b>pcregrep</b> uses an internal buffer to hold a "window" on the file it is
+scanning, in order to be able to output "before" and "after" lines when it
+finds a match. The size of the buffer is controlled by a parameter whose
+default value is 20K. The buffer itself is three times this size, but because
+of the way it is used for holding "before" lines, the longest line that is
+guaranteed to be processable is the parameter size. You can change the default
+parameter value by adding, for example,
+<pre>
+ --with-pcregrep-bufsize=50K
+</pre>
+to the <b>configure</b> command. The caller of <b>pcregrep</b> can, however,
+override this value by specifying a run-time option.
+</P>
+<br><a name="SEC16" href="#TOC1">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a><br>
<P>
If you add
<pre>
@@ -330,11 +346,11 @@ automatically included, you may need to add something like
</pre>
immediately before the <b>configure</b> command.
</P>
-<br><a name="SEC16" href="#TOC1">SEE ALSO</a><br>
+<br><a name="SEC17" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcreapi</b>(3), <b>pcre_config</b>(3).
</P>
-<br><a name="SEC17" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC18" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
@@ -343,11 +359,11 @@ University Computing Service
Cambridge CB2 3QH, England.
<br>
</P>
-<br><a name="SEC18" href="#TOC1">REVISION</a><br>
+<br><a name="SEC19" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 29 September 2009
+Last updated: 02 August 2011
<br>
-Copyright &copy; 1997-2009 University of Cambridge.
+Copyright &copy; 1997-2011 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
diff --git a/doc/html/pcrecallout.html b/doc/html/pcrecallout.html
index 4ae09a5..5e27179 100644
--- a/doc/html/pcrecallout.html
+++ b/doc/html/pcrecallout.html
@@ -93,21 +93,22 @@ the <b>pcre_exec()</b> and the <b>pcre_dfa_exec()</b> matching functions. The
only argument to the callout function is a pointer to a <b>pcre_callout</b>
block. This structure contains the following fields:
<pre>
- int <i>version</i>;
- int <i>callout_number</i>;
- int *<i>offset_vector</i>;
- const char *<i>subject</i>;
- int <i>subject_length</i>;
- int <i>start_match</i>;
- int <i>current_position</i>;
- int <i>capture_top</i>;
- int <i>capture_last</i>;
- void *<i>callout_data</i>;
- int <i>pattern_position</i>;
- int <i>next_item_length</i>;
+ int <i>version</i>;
+ int <i>callout_number</i>;
+ int *<i>offset_vector</i>;
+ const char *<i>subject</i>;
+ int <i>subject_length</i>;
+ int <i>start_match</i>;
+ int <i>current_position</i>;
+ int <i>capture_top</i>;
+ int <i>capture_last</i>;
+ void *<i>callout_data</i>;
+ int <i>pattern_position</i>;
+ int <i>next_item_length</i>;
+ const unsigned char *<i>mark</i>;
</pre>
The <i>version</i> field is an integer containing the version number of the
-block format. The initial version was 0; the current version is 1. The version
+block format. The initial version was 0; the current version is 2. The version
number will change again in future if additional fields are added, but the
intention is never to remove any of the existing fields.
</P>
@@ -180,6 +181,13 @@ The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to
help in distinguishing between different automatic callouts, which all have the
same callout number. However, they are set for all callouts.
</P>
+<P>
+The <i>mark</i> field is present from version 2 of the <i>pcre_callout</i>
+structure. In callouts from <b>pcre_exec()</b> it contains a pointer to the
+zero-terminated name of the most recently passed (*MARK) item in the match, or
+NULL if there are no (*MARK)s in the current matching path. In callouts from
+<b>pcre_dfa_exec()</b> this field always contains NULL.
+</P>
<br><a name="SEC4" href="#TOC1">RETURN VALUES</a><br>
<P>
The external callout function returns an integer to PCRE. If the value is zero,
@@ -206,9 +214,9 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 21 November 2010
+Last updated: 31 July 2011
<br>
-Copyright &copy; 1997-2010 University of Cambridge.
+Copyright &copy; 1997-2011 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
diff --git a/doc/html/pcrecompat.html b/doc/html/pcrecompat.html
index 2ee59e9..126436c 100644
--- a/doc/html/pcrecompat.html
+++ b/doc/html/pcrecompat.html
@@ -29,10 +29,12 @@ in the main
page.
</P>
<P>
-2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl permits
-them, but they do not mean what you might think. For example, (?!a){3} does
-not assert that the next three characters are not "a". It just asserts that the
-next character is not "a" three times.
+2. PCRE allows repeat quantifiers only on parenthesized assertions, but they do
+not mean what you might think. For example, (?!a){3} does not assert that the
+next three characters are not "a". It just asserts that the next character is
+not "a" three times (in principle: PCRE optimizes this to run the assertion
+just once). Perl allows repeat quantifiers on other assertions such as \b, but
+these do not seem to have any use.
</P>
<P>
3. Capturing subpatterns that occur inside negative lookahead assertions are
@@ -49,9 +51,11 @@ represent a binary zero.
</P>
<P>
5. The following Perl escape sequences are not supported: \l, \u, \L,
-\U, and \N. In fact these are implemented by Perl's general string-handling
-and are not part of its pattern matching engine. If any of these are
-encountered by PCRE, an error is generated.
+\U, and \N when followed by a character name or Unicode value. (\N on its
+own, matching a non-newline character, is supported.) In fact these are
+implemented by Perl's general string-handling and are not part of its pattern
+matching engine. If any of these are encountered by PCRE, an error is
+generated.
</P>
<P>
6. The Perl escape sequences \p, \P, and \X are supported only if PCRE is
@@ -64,7 +68,12 @@ the internal representation of Unicode characters, there is no need to
implement the somewhat messy concept of surrogates."
</P>
<P>
-7. PCRE does support the \Q...\E escape for quoting substrings. Characters in
+7. PCRE implements a simpler version of \X than Perl, which changed to make
+\X match what Unicode calls an "extended grapheme cluster". This is more
+complicated than an extended Unicode sequence, which is what PCRE matches.
+</P>
+<P>
+8. PCRE does support the \Q...\E escape for quoting substrings. Characters in
between are treated as literals. This is slightly different from Perl in that $
and @ are also handled as literals inside the quotes. In Perl, they cause
variable interpolation (but of course PCRE does not have variables). Note the
@@ -79,7 +88,7 @@ following examples:
The \Q...\E sequence is recognized both inside and outside character classes.
</P>
<P>
-8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
+9. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
constructions. However, there is support for recursive patterns. This is not
available in Perl 5.8, but it is in Perl 5.10. Also, the PCRE "callout"
feature allows an external function to be called during pattern matching. See
@@ -88,7 +97,7 @@ the
documentation for details.
</P>
<P>
-9. Subpatterns that are called recursively or as "subroutines" are always
+10. Subpatterns that are called recursively or as "subroutines" are always
treated as atomic groups in PCRE. This is like Python, but unlike Perl. There
is a discussion of an example that explains this in more detail in the
<a href="pcrepattern.html#recursiondifference">section on recursion differences from Perl</a>
@@ -97,12 +106,12 @@ in the
page.
</P>
<P>
-10. There are some differences that are concerned with the settings of captured
+11. There are some differences that are concerned with the settings of captured
strings when part of a pattern is repeated. For example, matching "aba" against
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
</P>
<P>
-11. PCRE's handling of duplicate subpattern numbers and duplicate subpattern
+12. PCRE's handling of duplicate subpattern numbers and duplicate subpattern
names is not as general as Perl's. This is a consequence of the fact the PCRE
works internally just with numbers, using an external table to translate
between numbers and names. In particular, a pattern such as (?|(?&#60;a&#62;A)|(?&#60;b)B),
@@ -113,11 +122,13 @@ names map to capturing subpattern number 1. To avoid this confusing situation,
an error is given at compile time.
</P>
<P>
-12. Perl recognizes comments in some places that PCRE doesn't, for example,
-between the ( and ? at the start of a subpattern.
+13. Perl recognizes comments in some places that PCRE does not, for example,
+between the ( and ? at the start of a subpattern. If the /x modifier is set,
+Perl allows whitespace between ( and ? but PCRE never does, even if the
+PCRE_EXTENDED option is set.
</P>
<P>
-13. PCRE provides some extensions to the Perl regular expression facilities.
+14. PCRE provides some extensions to the Perl regular expression facilities.
Perl 5.10 includes new features that are not in earlier versions of Perl, some
of which (such as named parentheses) have been in PCRE for some time. This list
is with respect to Perl 5.10:
@@ -186,9 +197,9 @@ Cambridge CB2 3QH, England.
REVISION
</b><br>
<P>
-Last updated: 31 October 2010
+Last updated: 24 July 2011
<br>
-Copyright &copy; 1997-2010 University of Cambridge.
+Copyright &copy; 1997-2011 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
diff --git a/doc/html/pcrecpp.html b/doc/html/pcrecpp.html
index 887c245..0ef2d4f 100644
--- a/doc/html/pcrecpp.html
+++ b/doc/html/pcrecpp.html
@@ -232,7 +232,7 @@ Normally, to pass one or more modifiers to a RE class, you declare
a <i>RE_Options</i> object, set the appropriate options, and pass this
object to a RE constructor. Example:
<pre>
- RE_options opt;
+ RE_Options opt;
opt.set_caseless(true);
if (RE("HELLO", opt).PartialMatch("hello world")) ...
</pre>
@@ -362,6 +362,8 @@ Copyright &copy; 2007 Google Inc.
<P>
Last updated: 17 March 2009
<br>
+Minor typo fixed: 25 July 2011
+<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
diff --git a/doc/html/pcregrep.html b/doc/html/pcregrep.html
index 925d1e2..52900ea 100644
--- a/doc/html/pcregrep.html
+++ b/doc/html/pcregrep.html
@@ -74,11 +74,19 @@ possible to search for patterns that span line boundaries. What defines a line
boundary is controlled by the <b>-N</b> (<b>--newline</b>) option.
</P>
<P>
-Patterns are limited to 8K or BUFSIZ characters, whichever is the greater.
-BUFSIZ is defined in <b>&#60;stdio.h&#62;</b>. When there is more than one pattern
-(specified by the use of <b>-e</b> and/or <b>-f</b>), each pattern is applied to
-each line in the order in which they are defined, except that all the <b>-e</b>
-patterns are tried before the <b>-f</b> patterns.
+The amount of memory used for buffering files that are being scanned is
+controlled by a parameter that can be set by the <b>--buffer-size</b> option.
+The default value for this parameter is specified when <b>pcregrep</b> is built,
+with 20K used if no value is given. A block of memory three times this size is
+used (to allow for buffering "before" and "after" lines). An error occurs if a
+line overflows the buffer.
+</P>
+<P>
+Patterns are limited to 8K or BUFSIZ bytes, whichever is the greater. BUFSIZ is
+defined in <b>&#60;stdio.h&#62;</b>. When there is more than one pattern (specified by
+the use of <b>-e</b> and/or <b>-f</b>), each pattern is applied to each line in
+the order in which they are defined, except that all the <b>-e</b> patterns are
+tried before the <b>-f</b> patterns.
</P>
<P>
By default, as soon as one pattern matches (or fails to match when <b>-v</b> is
@@ -122,11 +130,12 @@ standard input is always so treated.
The order in which some of the options appear can affect the output. For
example, both the <b>-h</b> and <b>-l</b> options affect the printing of file
names. Whichever comes later in the command line will be the one that takes
-effect.
+effect. Numerical values for options may be followed by K or M, to signify
+multiplication by 1024 or 1024*1024 respectively.
</P>
<P>
<b>--</b>
-This terminate the list of options. It is useful if the next item on the
+This terminates the list of options. It is useful if the next item on the
command line starts with a hyphen but is not an option. This allows for the
processing of patterns and filenames that start with hyphens.
</P>
@@ -149,6 +158,11 @@ of <i>number</i> is expected to be relatively small. However, <b>pcregrep</b>
guarantees to have up to 8K of preceding text available for context output.
</P>
<P>
+<b>--buffer-size=</b><i>number</i>
+Set the parameter that controls how much memory is used for buffering files
+that are being scanned.
+</P>
+<P>
<b>-C</b> <i>number</i>, <b>--context=</b><i>number</i>
Output <i>number</i> lines of context both before and after each matching line.
This is equivalent to setting both <b>-A</b> and <b>-B</b> to the same value.
@@ -421,7 +435,7 @@ and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
which recognizes any of the preceding three types, and an "any" convention, in
which any Unicode line ending sequence is assumed to end a line. The Unicode
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
-(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
+(form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
PS (paragraph separator, U+2029).
<br>
<br>
@@ -600,10 +614,10 @@ discussion of these options above).
<br><a name="SEC10" href="#TOC1">DIAGNOSTICS</a><br>
<P>
Exit status is 0 if any matches were found, 1 if no matches were found, and 2
-for syntax errors and non-existent or inacessible files (even if matches were
-found in other files) or too many matching errors. Using the <b>-s</b> option to
-suppress error messages about inaccessble files does not affect the return
-code.
+for syntax errors, overlong lines, non-existent or inaccessible files (even if
+matches were found in other files) or too many matching errors. Using the
+<b>-s</b> option to suppress error messages about inaccessible files does not
+affect the return code.
</P>
<br><a name="SEC11" href="#TOC1">SEE ALSO</a><br>
<P>
@@ -620,7 +634,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 14 January 2011
+Last updated: 30 July 2011
<br>
Copyright &copy; 1997-2011 University of Cambridge.
<br>
diff --git a/doc/html/pcrepattern.html b/doc/html/pcrepattern.html
index b1fa6e0..6ddf3ef 100644
--- a/doc/html/pcrepattern.html
+++ b/doc/html/pcrepattern.html
@@ -245,7 +245,11 @@ Perl, $ and @ cause variable interpolation. Note the following examples:
\Qabc\E\$\Qxyz\E abc$xyz abc$xyz
</pre>
The \Q...\E sequence is recognized both inside and outside character classes.
-An isolated \E that is not preceded by \Q is ignored.
+An isolated \E that is not preceded by \Q is ignored. If \Q is not followed
+by \E later in the pattern, the literal interpretation continues to the end of
+the pattern (that is, \E is assumed at the end). If the isolated \Q is inside
+a character class, this causes an error, because the character class is not
+terminated.
<a name="digitsafterbackslash"></a></P>
<br><b>
Non-printing characters
@@ -752,6 +756,10 @@ preceding character. None of them have codepoints less than 256, so in
non-UTF-8 mode \X matches any one character.
</P>
<P>
+Note that recent versions of Perl have changed \X to match what Unicode calls
+an "extended grapheme cluster", which has a more complicated definition.
+</P>
+<P>
Matching characters by Unicode property is not fast, because PCRE has to search
a structure that contains data for over fifteen thousand characters. That is
why the traditional escape sequences such as \d and \w do not use Unicode
@@ -1405,7 +1413,7 @@ items:
an escape such as \d or \pL that matches a single character
a character class
a back reference (see next section)
- a parenthesized subpattern (unless it is an assertion)
+ a parenthesized subpattern (including assertions)
a recursive or "subroutine" call to a subpattern
</pre>
The general repetition quantifier specifies a minimum and maximum number of
@@ -1796,12 +1804,32 @@ that look behind it. An assertion subpattern is matched in the normal way,
except that it does not cause the current matching position to be changed.
</P>
<P>
-Assertion subpatterns are not capturing subpatterns, and may not be repeated,
-because it makes no sense to assert the same thing several times. If any kind
-of assertion contains capturing subpatterns within it, these are counted for
-the purposes of numbering the capturing subpatterns in the whole pattern.
-However, substring capturing is carried out only for positive assertions,
-because it does not make sense for negative assertions.
+Assertion subpatterns are not capturing subpatterns. If such an assertion
+contains capturing subpatterns within it, these are counted for the purposes of
+numbering the capturing subpatterns in the whole pattern. However, substring
+capturing is carried out only for positive assertions, because it does not make
+sense for negative assertions.
+</P>
+<P>
+For compatibility with Perl, assertion subpatterns may be repeated; though
+it makes no sense to assert the same thing several times, the side effect of
+capturing parentheses may occasionally be useful. In practice, there are only three
+cases:
+<br>
+<br>
+(1) If the quantifier is {0}, the assertion is never obeyed during matching.
+However, it may contain internal capturing parenthesized groups that are called
+from elsewhere via the
+<a href="#subpatternsassubroutines">subroutine mechanism.</a>
+<br>
+<br>
+(2) If the quantifier is {0,n} where n is greater than zero, it is treated as if it
+were {0,1}. At run time, the rest of the pattern match is tried with and
+without the assertion, the order depending on the greediness of the quantifier.
+<br>
+<br>
+(3) If the minimum repetition is greater than zero, the quantifier is ignored.
+The assertion is obeyed just once when encountered during matching.
</P>
<br><b>
Lookahead assertions
@@ -2445,8 +2473,10 @@ failing negative assertion, they cause an error if encountered by
<P>
If any of these verbs are used in an assertion or subroutine subpattern
(including recursive subpatterns), their effect is confined to that subpattern;
-it does not extend to the surrounding pattern. Note that such subpatterns are
-processed as anchored at the point where they are tested.
+it does not extend to the surrounding pattern, with one exception: a *MARK that
+is encountered in a positive assertion <i>is</i> passed back (compare capturing
+parentheses in assertions). Note that such subpatterns are processed as
+anchored at the point where they are tested.
</P>
<P>
The new verbs make use of what was previously invalid syntax: an opening
@@ -2536,6 +2566,11 @@ of obtaining this information than putting each alternative in its own
capturing parentheses.
</P>
<P>
+If (*MARK) is encountered in a positive assertion, its name is recorded and
+passed back if it is the last-encountered. This does not happen for negative
+assertions.
+</P>
+<P>
A name may also be returned after a failed match if the final path through the
pattern involves (*MARK). However, unless (*MARK) used in conjunction with
(*COMMIT), this is unlikely to happen for an unanchored pattern because, as the
@@ -2705,9 +2740,9 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC28" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 21 November 2010
+Last updated: 24 July 2011
<br>
-Copyright &copy; 1997-2010 University of Cambridge.
+Copyright &copy; 1997-2011 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
diff --git a/doc/html/pcrestack.html b/doc/html/pcrestack.html
index 4423d22..2922e95 100644
--- a/doc/html/pcrestack.html
+++ b/doc/html/pcrestack.html
@@ -20,7 +20,10 @@ When you call <b>pcre_exec()</b>, it makes use of an internal function called
<b>match()</b>. This calls itself recursively at branch points in the pattern,
in order to remember the state of the match so that it can back up and try a
different alternative if the first one fails. As matching proceeds deeper and
-deeper into the tree of possibilities, the recursion depth increases.
+deeper into the tree of possibilities, the recursion depth increases. The
+<b>match()</b> function is also called in other circumstances, for example,
+whenever a parenthesized sub-pattern is entered, and in certain cases of
+repetition.
</P>
<P>
Not all calls of <b>match()</b> increase the recursion depth; for an item such
@@ -176,9 +179,9 @@ Cambridge CB2 3QH, England.
REVISION
</b><br>
<P>
-Last updated: 03 January 2010
+Last updated: 22 July 2011
<br>
-Copyright &copy; 1997-2010 University of Cambridge.
+Copyright &copy; 1997-2011 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
diff --git a/doc/html/pcretest.html b/doc/html/pcretest.html
index 62aa468..7889672 100644
--- a/doc/html/pcretest.html
+++ b/doc/html/pcretest.html
@@ -14,7 +14,7 @@ man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
-<li><a name="TOC2" href="#SEC2">OPTIONS</a>
+<li><a name="TOC2" href="#SEC2">COMMAND LINE OPTIONS</a>
<li><a name="TOC3" href="#SEC3">DESCRIPTION</a>
<li><a name="TOC4" href="#SEC4">PATTERN MODIFIERS</a>
<li><a name="TOC5" href="#SEC5">DATA LINES</a>
@@ -31,7 +31,7 @@ man page, in case the conversion went wrong.
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
<P>
-<b>pcretest [options] [source] [destination]</b>
+<b>pcretest [options] [input file [output file]]</b>
<br>
<br>
<b>pcretest</b> was written as a test program for the PCRE regular expression
@@ -42,13 +42,16 @@ details of the regular expressions themselves, see the
documentation. For details of the PCRE library function calls and their
options, see the
<a href="pcreapi.html"><b>pcreapi</b></a>
-documentation.
+documentation. The input for <b>pcretest</b> is a sequence of regular expression
+patterns and strings to be matched, as described below. The output shows the
+result of each match. Options on the command line and the patterns control PCRE
+options and exactly what is output.
</P>
-<br><a name="SEC2" href="#TOC1">OPTIONS</a><br>
+<br><a name="SEC2" href="#TOC1">COMMAND LINE OPTIONS</a><br>
<P>
<b>-b</b>
-Behave as if each regex has the <b>/B</b> (show bytecode) modifier; the internal
-form is output after compilation.
+Behave as if each pattern has the <b>/B</b> (show byte code) modifier; the
+internal form is output after compilation.
</P>
<P>
<b>-C</b>
@@ -57,7 +60,7 @@ about the optional features that are included, and then exit.
</P>
<P>
<b>-d</b>
-Behave as if each regex has the <b>/D</b> (debug) modifier; the internal
+Behave as if each pattern has the <b>/D</b> (debug) modifier; the internal
form and information about the compiled pattern is output after compilation;
<b>-d</b> is equivalent to <b>-b -i</b>.
</P>
@@ -73,7 +76,7 @@ Output a brief summary these options and then exit.
</P>
<P>
<b>-i</b>
-Behave as if each regex has the <b>/I</b> modifier; information about the
+Behave as if each pattern has the <b>/I</b> modifier; information about the
compiled pattern is given after compilation.
</P>
<P>
@@ -85,8 +88,7 @@ calling <b>pcre_exec()</b> repeatedly with different limits.
<P>
<b>-m</b>
Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding <b>/M</b> to each regular expression. For compatibility
-with earlier versions of pcretest, <b>-s</b> is a synonym for <b>-m</b>.
+equivalent to adding <b>/M</b> to each regular expression.
</P>
<P>
<b>-o</b> <i>osize</i>
@@ -99,7 +101,7 @@ below).
</P>
<P>
<b>-p</b>
-Behave as if each regex has the <b>/P</b> modifier; the POSIX wrapper API is
+Behave as if each pattern has the <b>/P</b> modifier; the POSIX wrapper API is
used to call PCRE. None of the other options has any effect when <b>-p</b> is
set.
</P>
@@ -109,10 +111,28 @@ Do not output the version number of <b>pcretest</b> at the start of execution.
</P>
<P>
<b>-S</b> <i>size</i>
-On Unix-like systems, set the size of the runtime stack to <i>size</i>
+On Unix-like systems, set the size of the run-time stack to <i>size</i>
megabytes.
</P>
<P>
+<b>-s</b>
+Behave as if each pattern has the <b>/S</b> modifier; in other words, force each
+pattern to be studied. If the <b>/I</b> or <b>/D</b> option is present on a
+pattern (requesting output about the compiled pattern), information about the
+result of studying is not included when studying is caused only by <b>-s</b> and
+neither <b>-i</b> nor <b>-d</b> is present on the command line. This behaviour
+means that the output from tests that are run with and without <b>-s</b> should
+be identical, except when options that output information about the actual
+running of a match are set. The <b>-M</b>, <b>-t</b>, and <b>-tm</b> options,
+which give information about resources used, are likely to produce different
+output with and without <b>-s</b>. Output may also differ if the <b>/C</b> option
+is present on an individual pattern. This uses callouts to trace the
+matching process, and this may be different between studied and non-studied
+patterns. If the pattern contains (*MARK) items there may also be differences,
+for the same reason. The <b>-s</b> command line option can be overridden for
+specific patterns that should never be studied (see the /S option below).
+</P>
+<P>
<b>-t</b>
Run each compile, study, and match many times with a timer, and output
resulting time per compile or match (in milliseconds). Do not set <b>-m</b> with
@@ -189,7 +209,7 @@ pcretest to read the next line as a continuation of the regular expression.
A pattern may be followed by any number of modifiers, which are mostly single
characters. Following Perl usage, these are referred to below as, for example,
"the <b>/i</b> modifier", even though the delimiter of the pattern need not
-always be a slash, and no slash is used when writing modifiers. Whitespace may
+always be a slash, and no slash is used when writing modifiers. White space may
appear between the final pattern delimiter and the first modifier, and between
the modifiers themselves.
</P>
@@ -226,10 +246,10 @@ options that do not correspond to anything in Perl:
<b>/&#60;bsr_unicode&#62;</b> PCRE_BSR_UNICODE
</pre>
The modifiers that are enclosed in angle brackets are literal strings as shown,
-including the angle brackets, but the letters can be in either case. This
-example sets multiline matching with CRLF as the line ending sequence:
+including the angle brackets, but the letters within can be in either case.
+This example sets multiline matching with CRLF as the line ending sequence:
<pre>
- /^abc/m&#60;crlf&#62;
+ /^abc/m&#60;CRLF&#62;
</pre>
As well as turning on the PCRE_UTF8 option, the <b>/8</b> modifier also causes
any non-printing characters in output strings to be printed using the
@@ -271,9 +291,21 @@ operates.
</P>
<P>
The <b>/+</b> modifier requests that as well as outputting the substring that
-matched the entire pattern, pcretest should in addition output the remainder of
-the subject string. This is useful for tests where the subject contains
-multiple copies of the same substring.
+matched the entire pattern, <b>pcretest</b> should in addition output the
+remainder of the subject string. This is useful for tests where the subject
+contains multiple copies of the same substring. If the <b>+</b> modifier appears
+twice, the same action is taken for captured substrings. In each case the
+remainder is output on the following line with a plus character following the
+capture number.
+</P>
+<P>
+The <b>/=</b> modifier requests that the values of all potential captured
+parentheses be output after a match by <b>pcre_exec()</b>. By default, only
+those up to the highest one actually used in the match are output
+(corresponding to the return code from <b>pcre_exec()</b>). Values in the
+offsets vector corresponding to higher numbers should be set to -1, and these
+are output as "&#60;unset&#62;". This modifier gives a way of checking that this is
+happening.
</P>
<P>
The <b>/B</b> modifier is a debugging feature. It requests that <b>pcretest</b>
@@ -331,9 +363,13 @@ The <b>/M</b> modifier causes the size of memory block used to hold the compiled
pattern to be output.
</P>
<P>
-The <b>/S</b> modifier causes <b>pcre_study()</b> to be called after the
-expression has been compiled, and the results used when the expression is
-matched.
+If the <b>/S</b> modifier appears once, it causes <b>pcre_study()</b> to be
+called after the expression has been compiled, and the results used when the
+expression is matched. If <b>/S</b> appears twice, it suppresses studying, even
+if it was requested externally by the <b>-s</b> command line option. This makes
+it possible to specify that certain patterns are always studied, and others are
+never studied, independently of <b>-s</b>. This feature is used in the test
+files in a few cases where the output is different when the pattern is studied.
</P>
<P>
The <b>/T</b> modifier must be followed by a single digit. It causes a specific
@@ -370,8 +406,8 @@ ignored.
<br><a name="SEC5" href="#TOC1">DATA LINES</a><br>
<P>
Before each data line is passed to <b>pcre_exec()</b>, leading and trailing
-whitespace is removed, and it is then scanned for \ escapes. Some of these are
-pretty esoteric features, intended for checking out some of the more
+white space is removed, and it is then scanned for \ escapes. Some of these
+are pretty esoteric features, intended for checking out some of the more
complicated features of PCRE. If you are just testing "ordinary" regular
expressions, you probably don't need any of these. The following escapes are
recognized:
@@ -379,7 +415,7 @@ recognized:
\a alarm (BEL, \x07)
\b backspace (\x08)
\e escape (\x27)
- \f formfeed (\x0c)
+ \f form feed (\x0c)
\n newline (\x0a)
\qdd set the PCRE_MATCH_LIMIT limit to dd (any number of digits)
\r carriage return (\x0d)
@@ -498,18 +534,22 @@ This section describes the output when the normal matching function,
<b>pcre_exec()</b>, is being used.
</P>
<P>
-When a match succeeds, pcretest outputs the list of captured substrings that
-<b>pcre_exec()</b> returns, starting with number 0 for the string that matched
-the whole pattern. Otherwise, it outputs "No match" when the return is
+When a match succeeds, <b>pcretest</b> outputs the list of captured substrings
+that <b>pcre_exec()</b> returns, starting with number 0 for the string that
+matched the whole pattern. Otherwise, it outputs "No match" when the return is
PCRE_ERROR_NOMATCH, and "Partial match:" followed by the partially matching
substring when <b>pcre_exec()</b> returns PCRE_ERROR_PARTIAL. (Note that this is
the entire substring that was inspected during the partial match; it may
include characters before the actual match start if a lookbehind assertion,
-\K, \b, or \B was involved.) For any other returns, it outputs the PCRE
-negative error number. Here is an example of an interactive <b>pcretest</b> run.
+\K, \b, or \B was involved.) For any other return, <b>pcretest</b> outputs
+the PCRE negative error number and a short descriptive phrase. If the error is
+a failed UTF-8 string check, the byte offset of the start of the failing
+character and the reason code are also output, provided that the size of the
+output vector is at least two. Here is an example of an interactive
+<b>pcretest</b> run.
<pre>
$ pcretest
- PCRE version 7.0 30-Nov-2006
+ PCRE version 8.13 2011-04-30
re&#62; /^abc(\d+)/
data&#62; abc123
@@ -518,11 +558,11 @@ negative error number. Here is an example of an interactive <b>pcretest</b> run.
data&#62; xyz
No match
</pre>
-Note that unset capturing substrings that are not followed by one that is set
-are not returned by <b>pcre_exec()</b>, and are not shown by <b>pcretest</b>. In
-the following example, there are two capturing substrings, but when the first
-data line is matched, the second, unset substring is not shown. An "internal"
-unset substring is shown as "&#60;unset&#62;", as for the second data line.
+Unset capturing substrings that are not followed by one that is set are not
+returned by <b>pcre_exec()</b>, and are not shown by <b>pcretest</b>. In the
+following example, there are two capturing substrings, but when the first data
+line is matched, the second, unset substring is not shown. An "internal" unset
+substring is shown as "&#60;unset&#62;", as for the second data line.
<pre>
re&#62; /(a)|(b)/
data&#62; a
@@ -556,7 +596,14 @@ matching attempts are output in sequence, like this:
0: ipp
1: pp
</pre>
-"No match" is output only if the first match attempt fails.
+"No match" is output only if the first match attempt fails. Here is an example
+of a failure message (the offset 4 that is specified by \&#62;4 is past the end of
+the subject string):
+<pre>
+ re&#62; /xyz/
+ data&#62; xyz\&#62;4
+ Error -24 (bad offset value)
+</PRE>
</P>
<P>
If any of the sequences <b>\C</b>, <b>\G</b>, or <b>\L</b> are present in a
@@ -656,9 +703,28 @@ example:
+10 ^ ^
0: E*
</pre>
+If a pattern contains (*MARK) items, an additional line is output whenever
+a change of latest mark is passed to the callout function. For example:
+<pre>
+ re&#62; /a(*MARK:X)bc/C
+ data&#62; abc
+ ---&#62;abc
+ +0 ^ a
+ +1 ^^ (*MARK:X)
+ +10 ^^ b
+ Latest Mark: X
+ +11 ^ ^ c
+ +12 ^ ^
+ 0: abc
+</pre>
+The mark changes between matching "a" and "b", but stays the same for the rest
+of the match, so nothing more is output. If, as a result of backtracking, the
+mark reverts to being unset, the text "&#60;unset&#62;" is output.
+</P>
+<P>
The callout function in <b>pcretest</b> returns zero (carry on matching) by
default, but you can use a \C item in a data line (as described above) to
-change this.
+change this and other parameters of the callout.
</P>
<P>
Inserting callouts can be helpful when using <b>pcretest</b> to check
@@ -682,7 +748,7 @@ function to distinguish printing and non-printing characters.
<br><a name="SEC12" href="#TOC1">SAVING AND RELOADING COMPILED PATTERNS</a><br>
<P>
The facilities described in this section are not available when the POSIX
-inteface to PCRE is being used, that is, when the <b>/P</b> pattern modifier is
+interface to PCRE is being used, that is, when the <b>/P</b> pattern modifier is
specified.
</P>
<P>
@@ -707,14 +773,14 @@ follows immediately after the compiled pattern. After writing the file,
<b>pcretest</b> expects to read a new pattern.
</P>
<P>
-A saved pattern can be reloaded into <b>pcretest</b> by specifing &#60; and a file
+A saved pattern can be reloaded into <b>pcretest</b> by specifying &#60; and a file
name instead of a pattern. The name of the file must not contain a &#60; character,
as otherwise <b>pcretest</b> will interpret the line as a pattern delimited by &#60;
characters.
For example:
<pre>
re&#62; &#60;/some/file
- Compiled regex loaded from /some/file
+ Compiled pattern loaded from /some/file
No study data
</pre>
When the pattern has been loaded, <b>pcretest</b> proceeds to read data lines in
@@ -757,9 +823,9 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 21 November 2010
+Last updated: 01 August 2011
<br>
-Copyright &copy; 1997-2010 University of Cambridge.
+Copyright &copy; 1997-2011 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
diff --git a/doc/pcre.3 b/doc/pcre.3
index 65f586c..8afdcf0 100644
--- a/doc/pcre.3
+++ b/doc/pcre.3
@@ -208,8 +208,8 @@ available as independent code points in the UTF-8 encoding. (In other words,
the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
UTF-8.)
.P
-If an invalid UTF-8 string is passed to PCRE, an error return is given. At
-compile time, the only additional information is the offset to the first byte
+If an invalid UTF-8 string is passed to PCRE, an error return is given. At
+compile time, the only additional information is the offset to the first byte
of the failing character. The runtime functions (\fBpcre_exec()\fP and
\fBpcre_dfa_exec()\fP), pass back this information as well as a more detailed
reason code if the caller has provided memory in which to do this.
diff --git a/doc/pcre.txt b/doc/pcre.txt
index ac4254e..cb9bd0e 100644
--- a/doc/pcre.txt
+++ b/doc/pcre.txt
@@ -29,7 +29,7 @@ INTRODUCTION
5.12, including support for UTF-8 encoded strings and Unicode general
category properties. However, UTF-8 and Unicode support has to be
explicitly enabled; it is not the default. The Unicode tables corre-
- spond to Unicode release 5.2.0.
+ spond to Unicode release 6.0.0.
In addition to the Perl-compatible matching function, PCRE contains an
alternative function that matches the same compiled patterns in a dif-
@@ -181,13 +181,19 @@ UTF-8 AND UNICODE PROPERTY SUPPORT
points in the UTF-8 encoding. (In other words, the whole surrogate
thing is a fudge for UTF-16 which unfortunately messes up UTF-8.)
- If an invalid UTF-8 string is passed to PCRE, an error return
- (PCRE_ERROR_BADUTF8) is given. In some situations, you may already know
- that your strings are valid, and therefore want to skip these checks in
- order to improve performance. If you set the PCRE_NO_UTF8_CHECK flag at
- compile time or at run time, PCRE assumes that the pattern or subject
- it is given (respectively) contains only valid UTF-8 codes. In this
- case, it does not diagnose an invalid UTF-8 string.
+ If an invalid UTF-8 string is passed to PCRE, an error return is given.
+ At compile time, the only additional information is the offset to the
+ first byte of the failing character. The runtime functions (pcre_exec()
+ and pcre_dfa_exec()), pass back this information as well as a more
+ detailed reason code if the caller has provided memory in which to do
+ this.
+
+ In some situations, you may already know that your strings are valid,
+ and therefore want to skip these checks in order to improve perfor-
+ mance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or at run
+ time, PCRE assumes that the pattern or subject it is given (respec-
+ tively) contains only valid UTF-8 codes. In this case, it does not
+ diagnose an invalid UTF-8 string.
If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set,
what happens depends on why the string is invalid. If the string con-
@@ -266,8 +272,8 @@ AUTHOR
REVISION
- Last updated: 13 November 2010
- Copyright (c) 1997-2010 University of Cambridge.
+ Last updated: 07 May 2011
+ Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
@@ -307,6 +313,18 @@ PCRE BUILD-TIME OPTIONS
is not described.
+BUILDING SHARED AND STATIC LIBRARIES
+
+ The PCRE building process uses libtool to build both shared and static
+ Unix libraries by default. You can suppress one of these by adding one
+ of
+
+ --disable-shared
+ --disable-static
+
+ to the configure command, as required.
+
+
C++ SUPPORT
By default, the configure script will search for a C++ compiler and C++
@@ -400,18 +418,6 @@ WHAT \R MATCHES
functions are called.
-BUILDING SHARED AND STATIC LIBRARIES
-
- The PCRE building process uses libtool to build both shared and static
- Unix libraries by default. You can suppress one of these by adding one
- of
-
- --disable-shared
- --disable-static
-
- to the configure command, as required.
-
-
POSIX MALLOC USAGE
When PCRE is called through the POSIX interface (see the pcreposix doc-
@@ -552,30 +558,46 @@ PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT
if they are not.
+PCREGREP BUFFER SIZE
+
+ pcregrep uses an internal buffer to hold a "window" on the file it is
+ scanning, in order to be able to output "before" and "after" lines when
+ it finds a match. The size of the buffer is controlled by a parameter
+ whose default value is 20K. The buffer itself is three times this size,
+ but because of the way it is used for holding "before" lines, the long-
+ est line that is guaranteed to be processable is the parameter size.
+ You can change the default parameter value by adding, for example,
+
+ --with-pcregrep-bufsize=50K
+
+ to the configure command. The caller of pcregrep can, however, override
+ this value by specifying a run-time option.
+
+
PCRETEST OPTION FOR LIBREADLINE SUPPORT
If you add
--enable-pcretest-libreadline
- to the configure command, pcretest is linked with the libreadline
- library, and when its input is from a terminal, it reads it using the
+ to the configure command, pcretest is linked with the libreadline
+ library, and when its input is from a terminal, it reads it using the
readline() function. This provides line-editing and history facilities.
Note that libreadline is GPL-licensed, so if you distribute a binary of
pcretest linked in this way, there may be licensing issues.
- Setting this option causes the -lreadline option to be added to the
- pcretest build. In many operating environments with a sytem-installed
+ Setting this option causes the -lreadline option to be added to the
+ pcretest build. In many operating environments with a system-installed
libreadline this is sufficient. However, in some environments (e.g. if
- an unmodified distribution version of readline is in use), some extra
- configuration may be necessary. The INSTALL file for libreadline says
+ an unmodified distribution version of readline is in use), some extra
+ configuration may be necessary. The INSTALL file for libreadline says
this:
"Readline uses the termcap functions, but does not link with the
termcap or curses library itself, allowing applications which link
with readline the to choose an appropriate library."
- If your environment has not been set up so that an appropriate library
+ If your environment has not been set up so that an appropriate library
is automatically included, you may need to add something like
LIBS="-ncurses"
@@ -597,8 +619,8 @@ AUTHOR
REVISION
- Last updated: 29 September 2009
- Copyright (c) 1997-2009 University of Cambridge.
+ Last updated: 02 August 2011
+ Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
@@ -1173,31 +1195,32 @@ COMPILING A PATTERN
if compilation of a pattern fails, pcre_compile() returns NULL, and
sets the variable pointed to by errptr to point to a textual error mes-
sage. This is a static string that is part of the library. You must not
- try to free it. The offset from the start of the pattern to the byte
- that was being processed when the error was discovered is placed in the
- variable pointed to by erroffset, which must not be NULL. If it is, an
- immediate error is given. Some errors are not detected until checks are
- carried out when the whole pattern has been scanned; in this case the
- offset is set to the end of the pattern.
+ try to free it. Normally, the offset from the start of the pattern to
+ the byte that was being processed when the error was discovered is
+ placed in the variable pointed to by erroffset, which must not be NULL
+ (if it is, an immediate error is given). However, for an invalid UTF-8
+ string, the offset is that of the first byte of the failing character.
+ Also, some errors are not detected until checks are carried out when
+ the whole pattern has been scanned; in these cases the offset passed
+ back is the length of the pattern.
Note that the offset is in bytes, not characters, even in UTF-8 mode.
- It may point into the middle of a UTF-8 character (for example, when
- PCRE_ERROR_BADUTF8 is returned for an invalid UTF-8 string).
+ It may sometimes point into the middle of a UTF-8 character.
- If pcre_compile2() is used instead of pcre_compile(), and the error-
- codeptr argument is not NULL, a non-zero error code number is returned
- via this argument in the event of an error. This is in addition to the
+ If pcre_compile2() is used instead of pcre_compile(), and the error-
+ codeptr argument is not NULL, a non-zero error code number is returned
+ via this argument in the event of an error. This is in addition to the
textual error message. Error codes and messages are listed below.
- If the final argument, tableptr, is NULL, PCRE uses a default set of
- character tables that are built when PCRE is compiled, using the
- default C locale. Otherwise, tableptr must be an address that is the
- result of a call to pcre_maketables(). This value is stored with the
- compiled pattern, and used again by pcre_exec(), unless another table
+ If the final argument, tableptr, is NULL, PCRE uses a default set of
+ character tables that are built when PCRE is compiled, using the
+ default C locale. Otherwise, tableptr must be an address that is the
+ result of a call to pcre_maketables(). This value is stored with the
+ compiled pattern, and used again by pcre_exec(), unless another table
pointer is passed to it. For more discussion, see the section on locale
support below.
- This code fragment shows a typical straightforward call to pcre_com-
+ This code fragment shows a typical straightforward call to pcre_com-
pile():
pcre *re;
@@ -1210,147 +1233,147 @@ COMPILING A PATTERN
&erroffset, /* for error offset */
NULL); /* use default character tables */
- The following names for option bits are defined in the pcre.h header
+ The following names for option bits are defined in the pcre.h header
file:
PCRE_ANCHORED
If this bit is set, the pattern is forced to be "anchored", that is, it
- is constrained to match only at the first matching point in the string
- that is being searched (the "subject string"). This effect can also be
- achieved by appropriate constructs in the pattern itself, which is the
+ is constrained to match only at the first matching point in the string
+ that is being searched (the "subject string"). This effect can also be
+ achieved by appropriate constructs in the pattern itself, which is the
only way to do it in Perl.
PCRE_AUTO_CALLOUT
If this bit is set, pcre_compile() automatically inserts callout items,
- all with number 255, before each pattern item. For discussion of the
+ all with number 255, before each pattern item. For discussion of the
callout facility, see the pcrecallout documentation.
PCRE_BSR_ANYCRLF
PCRE_BSR_UNICODE
These options (which are mutually exclusive) control what the \R escape
- sequence matches. The choice is either to match only CR, LF, or CRLF,
+ sequence matches. The choice is either to match only CR, LF, or CRLF,
or to match any Unicode newline sequence. The default is specified when
PCRE is built. It can be overridden from within the pattern, or by set-
ting an option when a compiled pattern is matched.
PCRE_CASELESS
- If this bit is set, letters in the pattern match both upper and lower
- case letters. It is equivalent to Perl's /i option, and it can be
- changed within a pattern by a (?i) option setting. In UTF-8 mode, PCRE
- always understands the concept of case for characters whose values are
- less than 128, so caseless matching is always possible. For characters
- with higher values, the concept of case is supported if PCRE is com-
- piled with Unicode property support, but not otherwise. If you want to
- use caseless matching for characters 128 and above, you must ensure
- that PCRE is compiled with Unicode property support as well as with
+ If this bit is set, letters in the pattern match both upper and lower
+ case letters. It is equivalent to Perl's /i option, and it can be
+ changed within a pattern by a (?i) option setting. In UTF-8 mode, PCRE
+ always understands the concept of case for characters whose values are
+ less than 128, so caseless matching is always possible. For characters
+ with higher values, the concept of case is supported if PCRE is com-
+ piled with Unicode property support, but not otherwise. If you want to
+ use caseless matching for characters 128 and above, you must ensure
+ that PCRE is compiled with Unicode property support as well as with
UTF-8 support.
PCRE_DOLLAR_ENDONLY
- If this bit is set, a dollar metacharacter in the pattern matches only
- at the end of the subject string. Without this option, a dollar also
- matches immediately before a newline at the end of the string (but not
- before any other newlines). The PCRE_DOLLAR_ENDONLY option is ignored
- if PCRE_MULTILINE is set. There is no equivalent to this option in
+ If this bit is set, a dollar metacharacter in the pattern matches only
+ at the end of the subject string. Without this option, a dollar also
+ matches immediately before a newline at the end of the string (but not
+ before any other newlines). The PCRE_DOLLAR_ENDONLY option is ignored
+ if PCRE_MULTILINE is set. There is no equivalent to this option in
Perl, and no way to set it within a pattern.
PCRE_DOTALL
- If this bit is set, a dot metacharacter in the pattern matches a char-
+ If this bit is set, a dot metacharacter in the pattern matches a char-
acter of any value, including one that indicates a newline. However, it
- only ever matches one character, even if newlines are coded as CRLF.
- Without this option, a dot does not match when the current position is
+ only ever matches one character, even if newlines are coded as CRLF.
+ Without this option, a dot does not match when the current position is
at a newline. This option is equivalent to Perl's /s option, and it can
- be changed within a pattern by a (?s) option setting. A negative class
+ be changed within a pattern by a (?s) option setting. A negative class
such as [^a] always matches newline characters, independent of the set-
ting of this option.
PCRE_DUPNAMES
- If this bit is set, names used to identify capturing subpatterns need
+ If this bit is set, names used to identify capturing subpatterns need
not be unique. This can be helpful for certain types of pattern when it
- is known that only one instance of the named subpattern can ever be
- matched. There are more details of named subpatterns below; see also
+ is known that only one instance of the named subpattern can ever be
+ matched. There are more details of named subpatterns below; see also
the pcrepattern documentation.
PCRE_EXTENDED
- If this bit is set, whitespace data characters in the pattern are
+ If this bit is set, whitespace data characters in the pattern are
totally ignored except when escaped or inside a character class. White-
space does not include the VT character (code 11). In addition, charac-
ters between an unescaped # outside a character class and the next new-
- line, inclusive, are also ignored. This is equivalent to Perl's /x
- option, and it can be changed within a pattern by a (?x) option set-
+ line, inclusive, are also ignored. This is equivalent to Perl's /x
+ option, and it can be changed within a pattern by a (?x) option set-
ting.
- Which characters are interpreted as newlines is controlled by the
- options passed to pcre_compile() or by a special sequence at the start
- of the pattern, as described in the section entitled "Newline conven-
+ Which characters are interpreted as newlines is controlled by the
+ options passed to pcre_compile() or by a special sequence at the start
+ of the pattern, as described in the section entitled "Newline conven-
tions" in the pcrepattern documentation. Note that the end of this type
- of comment is a literal newline sequence in the pattern; escape
+ of comment is a literal newline sequence in the pattern; escape
sequences that happen to represent a newline do not count.
- This option makes it possible to include comments inside complicated
- patterns. Note, however, that this applies only to data characters.
- Whitespace characters may never appear within special character
+ This option makes it possible to include comments inside complicated
+ patterns. Note, however, that this applies only to data characters.
+ Whitespace characters may never appear within special character
sequences in a pattern, for example within the sequence (?( that intro-
duces a conditional subpattern.
PCRE_EXTRA
- This option was invented in order to turn on additional functionality
- of PCRE that is incompatible with Perl, but it is currently of very
- little use. When set, any backslash in a pattern that is followed by a
- letter that has no special meaning causes an error, thus reserving
- these combinations for future expansion. By default, as in Perl, a
- backslash followed by a letter with no special meaning is treated as a
+ This option was invented in order to turn on additional functionality
+ of PCRE that is incompatible with Perl, but it is currently of very
+ little use. When set, any backslash in a pattern that is followed by a
+ letter that has no special meaning causes an error, thus reserving
+ these combinations for future expansion. By default, as in Perl, a
+ backslash followed by a letter with no special meaning is treated as a
literal. (Perl can, however, be persuaded to give an error for this, by
- running it with the -w option.) There are at present no other features
- controlled by this option. It can also be set by a (?X) option setting
+ running it with the -w option.) There are at present no other features
+ controlled by this option. It can also be set by a (?X) option setting
within a pattern.
PCRE_FIRSTLINE
- If this option is set, an unanchored pattern is required to match
- before or at the first newline in the subject string, though the
+ If this option is set, an unanchored pattern is required to match
+ before or at the first newline in the subject string, though the
matched text may continue over the newline.
PCRE_JAVASCRIPT_COMPAT
If this option is set, PCRE's behaviour is changed in some ways so that
- it is compatible with JavaScript rather than Perl. The changes are as
+ it is compatible with JavaScript rather than Perl. The changes are as
follows:
- (1) A lone closing square bracket in a pattern causes a compile-time
- error, because this is illegal in JavaScript (by default it is treated
+ (1) A lone closing square bracket in a pattern causes a compile-time
+ error, because this is illegal in JavaScript (by default it is treated
as a data character). Thus, the pattern AB]CD becomes illegal when this
option is set.
- (2) At run time, a back reference to an unset subpattern group matches
- an empty string (by default this causes the current matching alterna-
- tive to fail). A pattern such as (\1)(a) succeeds when this option is
- set (assuming it can find an "a" in the subject), whereas it fails by
+ (2) At run time, a back reference to an unset subpattern group matches
+ an empty string (by default this causes the current matching alterna-
+ tive to fail). A pattern such as (\1)(a) succeeds when this option is
+ set (assuming it can find an "a" in the subject), whereas it fails by
default, for Perl compatibility.
PCRE_MULTILINE
- By default, PCRE treats the subject string as consisting of a single
- line of characters (even if it actually contains newlines). The "start
- of line" metacharacter (^) matches only at the start of the string,
- while the "end of line" metacharacter ($) matches only at the end of
+ By default, PCRE treats the subject string as consisting of a single
+ line of characters (even if it actually contains newlines). The "start
+ of line" metacharacter (^) matches only at the start of the string,
+ while the "end of line" metacharacter ($) matches only at the end of
the string, or before a terminating newline (unless PCRE_DOLLAR_ENDONLY
is set). This is the same as Perl.
- When PCRE_MULTILINE it is set, the "start of line" and "end of line"
- constructs match immediately following or immediately before internal
- newlines in the subject string, respectively, as well as at the very
- start and end. This is equivalent to Perl's /m option, and it can be
+ When PCRE_MULTILINE is set, the "start of line" and "end of line"
+ constructs match immediately following or immediately before internal
+ newlines in the subject string, respectively, as well as at the very
+ start and end. This is equivalent to Perl's /m option, and it can be
changed within a pattern by a (?m) option setting. If there are no new-
- lines in a subject string, or no occurrences of ^ or $ in a pattern,
+ lines in a subject string, or no occurrences of ^ or $ in a pattern,
setting PCRE_MULTILINE has no effect.
PCRE_NEWLINE_CR
@@ -1359,32 +1382,32 @@ COMPILING A PATTERN
PCRE_NEWLINE_ANYCRLF
PCRE_NEWLINE_ANY
- These options override the default newline definition that was chosen
- when PCRE was built. Setting the first or the second specifies that a
- newline is indicated by a single character (CR or LF, respectively).
- Setting PCRE_NEWLINE_CRLF specifies that a newline is indicated by the
- two-character CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies
+ These options override the default newline definition that was chosen
+ when PCRE was built. Setting the first or the second specifies that a
+ newline is indicated by a single character (CR or LF, respectively).
+ Setting PCRE_NEWLINE_CRLF specifies that a newline is indicated by the
+ two-character CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies
that any of the three preceding sequences should be recognized. Setting
- PCRE_NEWLINE_ANY specifies that any Unicode newline sequence should be
+ PCRE_NEWLINE_ANY specifies that any Unicode newline sequence should be
recognized. The Unicode newline sequences are the three just mentioned,
- plus the single characters VT (vertical tab, U+000B), FF (formfeed,
- U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
- (paragraph separator, U+2029). The last two are recognized only in
+ plus the single characters VT (vertical tab, U+000B), FF (formfeed,
+ U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
+ (paragraph separator, U+2029). The last two are recognized only in
UTF-8 mode.
- The newline setting in the options word uses three bits that are
+ The newline setting in the options word uses three bits that are
treated as a number, giving eight possibilities. Currently only six are
- used (default plus the five values above). This means that if you set
- more than one newline option, the combination may or may not be sensi-
+ used (default plus the five values above). This means that if you set
+ more than one newline option, the combination may or may not be sensi-
ble. For example, PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to
- PCRE_NEWLINE_CRLF, but other combinations may yield unused numbers and
+ PCRE_NEWLINE_CRLF, but other combinations may yield unused numbers and
cause an error.
- The only time that a line break in a pattern is specially recognized
- when compiling is when PCRE_EXTENDED is set. CR and LF are whitespace
- characters, and so are ignored in this mode. Also, an unescaped # out-
- side a character class indicates a comment that lasts until after the
- next line break sequence. In other circumstances, line break sequences
+ The only time that a line break in a pattern is specially recognized
+ when compiling is when PCRE_EXTENDED is set. CR and LF are whitespace
+ characters, and so are ignored in this mode. Also, an unescaped # out-
+ side a character class indicates a comment that lasts until after the
+ next line break sequence. In other circumstances, line break sequences
in patterns are treated as literal data.
The newline option that is set at compile time becomes the default that
@@ -1393,65 +1416,65 @@ COMPILING A PATTERN
PCRE_NO_AUTO_CAPTURE
If this option is set, it disables the use of numbered capturing paren-
- theses in the pattern. Any opening parenthesis that is not followed by
- ? behaves as if it were followed by ?: but named parentheses can still
- be used for capturing (and they acquire numbers in the usual way).
+ theses in the pattern. Any opening parenthesis that is not followed by
+ ? behaves as if it were followed by ?: but named parentheses can still
+ be used for capturing (and they acquire numbers in the usual way).
There is no equivalent of this option in Perl.
PCRE_NO_START_OPTIMIZE
- This is an option that acts at matching time; that is, it is really an
- option for pcre_exec() or pcre_dfa_exec(). If it is set at compile
- time, it is remembered with the compiled pattern and assumed at match-
- ing time. For details see the discussion of PCRE_NO_START_OPTIMIZE
+ This is an option that acts at matching time; that is, it is really an
+ option for pcre_exec() or pcre_dfa_exec(). If it is set at compile
+ time, it is remembered with the compiled pattern and assumed at match-
+ ing time. For details see the discussion of PCRE_NO_START_OPTIMIZE
below.
PCRE_UCP
- This option changes the way PCRE processes \B, \b, \D, \d, \S, \s, \W,
- \w, and some of the POSIX character classes. By default, only ASCII
- characters are recognized, but if PCRE_UCP is set, Unicode properties
- are used instead to classify characters. More details are given in the
- section on generic character types in the pcrepattern page. If you set
- PCRE_UCP, matching one of the items it affects takes much longer. The
- option is available only if PCRE has been compiled with Unicode prop-
+ This option changes the way PCRE processes \B, \b, \D, \d, \S, \s, \W,
+ \w, and some of the POSIX character classes. By default, only ASCII
+ characters are recognized, but if PCRE_UCP is set, Unicode properties
+ are used instead to classify characters. More details are given in the
+ section on generic character types in the pcrepattern page. If you set
+ PCRE_UCP, matching one of the items it affects takes much longer. The
+ option is available only if PCRE has been compiled with Unicode prop-
erty support.
PCRE_UNGREEDY
- This option inverts the "greediness" of the quantifiers so that they
- are not greedy by default, but become greedy if followed by "?". It is
- not compatible with Perl. It can also be set by a (?U) option setting
+ This option inverts the "greediness" of the quantifiers so that they
+ are not greedy by default, but become greedy if followed by "?". It is
+ not compatible with Perl. It can also be set by a (?U) option setting
within the pattern.
PCRE_UTF8
- This option causes PCRE to regard both the pattern and the subject as
- strings of UTF-8 characters instead of single-byte character strings.
- However, it is available only when PCRE is built to include UTF-8 sup-
- port. If not, the use of this option provokes an error. Details of how
- this option changes the behaviour of PCRE are given in the section on
+ This option causes PCRE to regard both the pattern and the subject as
+ strings of UTF-8 characters instead of single-byte character strings.
+ However, it is available only when PCRE is built to include UTF-8 sup-
+ port. If not, the use of this option provokes an error. Details of how
+ this option changes the behaviour of PCRE are given in the section on
UTF-8 support in the main pcre page.
PCRE_NO_UTF8_CHECK
When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
- automatically checked. There is a discussion about the validity of
- UTF-8 strings in the main pcre page. If an invalid UTF-8 sequence of
- bytes is found, pcre_compile() returns an error. If you already know
+ automatically checked. There is a discussion about the validity of
+ UTF-8 strings in the main pcre page. If an invalid UTF-8 sequence of
+ bytes is found, pcre_compile() returns an error. If you already know
that your pattern is valid, and you want to skip this check for perfor-
- mance reasons, you can set the PCRE_NO_UTF8_CHECK option. When it is
- set, the effect of passing an invalid UTF-8 string as a pattern is
- undefined. It may cause your program to crash. Note that this option
- can also be passed to pcre_exec() and pcre_dfa_exec(), to suppress the
+ mance reasons, you can set the PCRE_NO_UTF8_CHECK option. When it is
+ set, the effect of passing an invalid UTF-8 string as a pattern is
+ undefined. It may cause your program to crash. Note that this option
+ can also be passed to pcre_exec() and pcre_dfa_exec(), to suppress the
UTF-8 validity checking of subject strings.
COMPILATION ERROR CODES
- The following table lists the error codes than may be returned by
- pcre_compile2(), along with the error messages that may be returned by
- both compiling functions. As PCRE has developed, some error codes have
+ The following table lists the error codes that may be returned by
+ pcre_compile2(), along with the error messages that may be returned by
+ both compiling functions. As PCRE has developed, some error codes have
fallen out of use. To avoid confusion, they have not been re-used.
0 no error
@@ -1526,7 +1549,7 @@ COMPILATION ERROR CODES
66 (*MARK) must have an argument
67 this version of PCRE is not compiled with PCRE_UCP support
- The numbers 32 and 10000 in errors 48 and 49 are defaults; different
+ The numbers 32 and 10000 in errors 48 and 49 are defaults; different
values may be used if the limits were changed when PCRE was built.
@@ -1535,32 +1558,32 @@ STUDYING A PATTERN
pcre_extra *pcre_study(const pcre *code, int options,
const char **errptr);
- If a compiled pattern is going to be used several times, it is worth
+ If a compiled pattern is going to be used several times, it is worth
spending more time analyzing it in order to speed up the time taken for
- matching. The function pcre_study() takes a pointer to a compiled pat-
+ matching. The function pcre_study() takes a pointer to a compiled pat-
tern as its first argument. If studying the pattern produces additional
- information that will help speed up matching, pcre_study() returns a
- pointer to a pcre_extra block, in which the study_data field points to
+ information that will help speed up matching, pcre_study() returns a
+ pointer to a pcre_extra block, in which the study_data field points to
the results of the study.
The returned value from pcre_study() can be passed directly to
- pcre_exec() or pcre_dfa_exec(). However, a pcre_extra block also con-
- tains other fields that can be set by the caller before the block is
+ pcre_exec() or pcre_dfa_exec(). However, a pcre_extra block also con-
+ tains other fields that can be set by the caller before the block is
passed; these are described below in the section on matching a pattern.
- If studying the pattern does not produce any useful information,
+ If studying the pattern does not produce any useful information,
pcre_study() returns NULL. In that circumstance, if the calling program
- wants to pass any of the other fields to pcre_exec() or
+ wants to pass any of the other fields to pcre_exec() or
pcre_dfa_exec(), it must set up its own pcre_extra block.
- The second argument of pcre_study() contains option bits. At present,
+ The second argument of pcre_study() contains option bits. At present,
no options are defined, and this argument should always be zero.
- The third argument for pcre_study() is a pointer for an error message.
- If studying succeeds (even if no data is returned), the variable it
- points to is set to NULL. Otherwise it is set to point to a textual
+ The third argument for pcre_study() is a pointer for an error message.
+ If studying succeeds (even if no data is returned), the variable it
+ points to is set to NULL. Otherwise it is set to point to a textual
error message. This is a static string that is part of the library. You
- must not try to free it. You should test the error pointer for NULL
+ must not try to free it. You should test the error pointer for NULL
after calling pcre_study(), to be sure that it has run successfully.
This is a typical call to pcre_study():
@@ -1574,78 +1597,78 @@ STUDYING A PATTERN
Studying a pattern does two things: first, a lower bound for the length
of subject string that is needed to match the pattern is computed. This
does not mean that there are any strings of that length that match, but
- it does guarantee that no shorter strings match. The value is used by
- pcre_exec() and pcre_dfa_exec() to avoid wasting time by trying to
- match strings that are shorter than the lower bound. You can find out
+ it does guarantee that no shorter strings match. The value is used by
+ pcre_exec() and pcre_dfa_exec() to avoid wasting time by trying to
+ match strings that are shorter than the lower bound. You can find out
the value in a calling program via the pcre_fullinfo() function.
Studying a pattern is also useful for non-anchored patterns that do not
- have a single fixed starting character. A bitmap of possible starting
- bytes is created. This speeds up finding a position in the subject at
+ have a single fixed starting character. A bitmap of possible starting
+ bytes is created. This speeds up finding a position in the subject at
which to start matching.
- The two optimizations just described can be disabled by setting the
- PCRE_NO_START_OPTIMIZE option when calling pcre_exec() or
- pcre_dfa_exec(). You might want to do this if your pattern contains
- callouts or (*MARK), and you want to make use of these facilities in
- cases where matching fails. See the discussion of PCRE_NO_START_OPTI-
+ The two optimizations just described can be disabled by setting the
+ PCRE_NO_START_OPTIMIZE option when calling pcre_exec() or
+ pcre_dfa_exec(). You might want to do this if your pattern contains
+ callouts or (*MARK), and you want to make use of these facilities in
+ cases where matching fails. See the discussion of PCRE_NO_START_OPTI-
MIZE below.
LOCALE SUPPORT
- PCRE handles caseless matching, and determines whether characters are
- letters, digits, or whatever, by reference to a set of tables, indexed
- by character value. When running in UTF-8 mode, this applies only to
- characters with codes less than 128. By default, higher-valued codes
+ PCRE handles caseless matching, and determines whether characters are
+ letters, digits, or whatever, by reference to a set of tables, indexed
+ by character value. When running in UTF-8 mode, this applies only to
+ characters with codes less than 128. By default, higher-valued codes
never match escapes such as \w or \d, but they can be tested with \p if
- PCRE is built with Unicode character property support. Alternatively,
- the PCRE_UCP option can be set at compile time; this causes \w and
+ PCRE is built with Unicode character property support. Alternatively,
+ the PCRE_UCP option can be set at compile time; this causes \w and
friends to use Unicode property support instead of built-in tables. The
use of locales with Unicode is discouraged. If you are handling charac-
- ters with codes greater than 128, you should either use UTF-8 and Uni-
+ ters with codes greater than 128, you should either use UTF-8 and Uni-
code, or use locales, but not try to mix the two.
- PCRE contains an internal set of tables that are used when the final
- argument of pcre_compile() is NULL. These are sufficient for many
+ PCRE contains an internal set of tables that are used when the final
+ argument of pcre_compile() is NULL. These are sufficient for many
applications. Normally, the internal tables recognize only ASCII char-
acters. However, when PCRE is built, it is possible to cause the inter-
nal tables to be rebuilt in the default "C" locale of the local system,
which may cause them to be different.
- The internal tables can always be overridden by tables supplied by the
+ The internal tables can always be overridden by tables supplied by the
application that calls PCRE. These may be created in a different locale
- from the default. As more and more applications change to using Uni-
+ from the default. As more and more applications change to using Uni-
code, the need for this locale support is expected to die away.
- External tables are built by calling the pcre_maketables() function,
- which has no arguments, in the relevant locale. The result can then be
- passed to pcre_compile() or pcre_exec() as often as necessary. For
- example, to build and use tables that are appropriate for the French
- locale (where accented characters with values greater than 128 are
+ External tables are built by calling the pcre_maketables() function,
+ which has no arguments, in the relevant locale. The result can then be
+ passed to pcre_compile() or pcre_exec() as often as necessary. For
+ example, to build and use tables that are appropriate for the French
+ locale (where accented characters with values greater than 128 are
treated as letters), the following code could be used:
setlocale(LC_CTYPE, "fr_FR");
tables = pcre_maketables();
re = pcre_compile(..., tables);
- The locale name "fr_FR" is used on Linux and other Unix-like systems;
+ The locale name "fr_FR" is used on Linux and other Unix-like systems;
if you are using Windows, the name for the French locale is "french".
- When pcre_maketables() runs, the tables are built in memory that is
- obtained via pcre_malloc. It is the caller's responsibility to ensure
- that the memory containing the tables remains available for as long as
+ When pcre_maketables() runs, the tables are built in memory that is
+ obtained via pcre_malloc. It is the caller's responsibility to ensure
+ that the memory containing the tables remains available for as long as
it is needed.
The pointer that is passed to pcre_compile() is saved with the compiled
- pattern, and the same tables are used via this pointer by pcre_study()
+ pattern, and the same tables are used via this pointer by pcre_study()
and normally also by pcre_exec(). Thus, by default, for any single pat-
tern, compilation, studying and matching all happen in the same locale,
but different patterns can be compiled in different locales.
- It is possible to pass a table pointer or NULL (indicating the use of
- the internal tables) to pcre_exec(). Although not intended for this
- purpose, this facility could be used to match a pattern in a different
+ It is possible to pass a table pointer or NULL (indicating the use of
+ the internal tables) to pcre_exec(). Although not intended for this
+ purpose, this facility could be used to match a pattern in a different
locale from the one in which it was compiled. Passing table pointers at
run time is discussed below in the section on matching a pattern.
@@ -1655,15 +1678,15 @@ INFORMATION ABOUT A PATTERN
int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
int what, void *where);
- The pcre_fullinfo() function returns information about a compiled pat-
+ The pcre_fullinfo() function returns information about a compiled pat-
tern. It replaces the obsolete pcre_info() function, which is neverthe-
less retained for backwards compatibility (and is documented below).
- The first argument for pcre_fullinfo() is a pointer to the compiled
- pattern. The second argument is the result of pcre_study(), or NULL if
- the pattern was not studied. The third argument specifies which piece
- of information is required, and the fourth argument is a pointer to a
- variable to receive the data. The yield of the function is zero for
+ The first argument for pcre_fullinfo() is a pointer to the compiled
+ pattern. The second argument is the result of pcre_study(), or NULL if
+ the pattern was not studied. The third argument specifies which piece
+ of information is required, and the fourth argument is a pointer to a
+ variable to receive the data. The yield of the function is zero for
success, or one of the following negative numbers:
PCRE_ERROR_NULL the argument code was NULL
@@ -1671,9 +1694,9 @@ INFORMATION ABOUT A PATTERN
PCRE_ERROR_BADMAGIC the "magic number" was not found
PCRE_ERROR_BADOPTION the value of what was invalid
- The "magic number" is placed at the start of each compiled pattern as
- an simple check against passing an arbitrary memory pointer. Here is a
- typical call of pcre_fullinfo(), to obtain the length of the compiled
+ The "magic number" is placed at the start of each compiled pattern as
+ a simple check against passing an arbitrary memory pointer. Here is a
+ typical call of pcre_fullinfo(), to obtain the length of the compiled
pattern:
int rc;
@@ -1684,131 +1707,131 @@ INFORMATION ABOUT A PATTERN
PCRE_INFO_SIZE, /* what is required */
&length); /* where to put the data */
- The possible values for the third argument are defined in pcre.h, and
+ The possible values for the third argument are defined in pcre.h, and
are as follows:
PCRE_INFO_BACKREFMAX
- Return the number of the highest back reference in the pattern. The
- fourth argument should point to an int variable. Zero is returned if
+ Return the number of the highest back reference in the pattern. The
+ fourth argument should point to an int variable. Zero is returned if
there are no back references.
PCRE_INFO_CAPTURECOUNT
- Return the number of capturing subpatterns in the pattern. The fourth
+ Return the number of capturing subpatterns in the pattern. The fourth
argument should point to an int variable.
PCRE_INFO_DEFAULT_TABLES
- Return a pointer to the internal default character tables within PCRE.
- The fourth argument should point to an unsigned char * variable. This
+ Return a pointer to the internal default character tables within PCRE.
+ The fourth argument should point to an unsigned char * variable. This
information call is provided for internal use by the pcre_study() func-
- tion. External callers can cause PCRE to use its internal tables by
+ tion. External callers can cause PCRE to use its internal tables by
passing a NULL table pointer.
PCRE_INFO_FIRSTBYTE
- Return information about the first byte of any matched string, for a
- non-anchored pattern. The fourth argument should point to an int vari-
- able. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name
+ Return information about the first byte of any matched string, for a
+ non-anchored pattern. The fourth argument should point to an int vari-
+ able. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name
is still recognized for backwards compatibility.)
- If there is a fixed first byte, for example, from a pattern such as
+ If there is a fixed first byte, for example, from a pattern such as
(cat|cow|coyote), its value is returned. Otherwise, if either
- (a) the pattern was compiled with the PCRE_MULTILINE option, and every
+ (a) the pattern was compiled with the PCRE_MULTILINE option, and every
branch starts with "^", or
(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not
set (if it were set, the pattern would be anchored),
- -1 is returned, indicating that the pattern matches only at the start
- of a subject string or after any newline within the string. Otherwise
+ -1 is returned, indicating that the pattern matches only at the start
+ of a subject string or after any newline within the string. Otherwise
-2 is returned. For anchored patterns, -2 is returned.
PCRE_INFO_FIRSTTABLE
- If the pattern was studied, and this resulted in the construction of a
+ If the pattern was studied, and this resulted in the construction of a
256-bit table indicating a fixed set of bytes for the first byte in any
- matching string, a pointer to the table is returned. Otherwise NULL is
- returned. The fourth argument should point to an unsigned char * vari-
+ matching string, a pointer to the table is returned. Otherwise NULL is
+ returned. The fourth argument should point to an unsigned char * vari-
able.
PCRE_INFO_HASCRORLF
- Return 1 if the pattern contains any explicit matches for CR or LF
- characters, otherwise 0. The fourth argument should point to an int
- variable. An explicit match is either a literal CR or LF character, or
+ Return 1 if the pattern contains any explicit matches for CR or LF
+ characters, otherwise 0. The fourth argument should point to an int
+ variable. An explicit match is either a literal CR or LF character, or
\r or \n.
PCRE_INFO_JCHANGED
- Return 1 if the (?J) or (?-J) option setting is used in the pattern,
- otherwise 0. The fourth argument should point to an int variable. (?J)
+ Return 1 if the (?J) or (?-J) option setting is used in the pattern,
+ otherwise 0. The fourth argument should point to an int variable. (?J)
and (?-J) set and unset the local PCRE_DUPNAMES option, respectively.
PCRE_INFO_LASTLITERAL
- Return the value of the rightmost literal byte that must exist in any
- matched string, other than at its start, if such a byte has been
+ Return the value of the rightmost literal byte that must exist in any
+ matched string, other than at its start, if such a byte has been
recorded. The fourth argument should point to an int variable. If there
- is no such byte, -1 is returned. For anchored patterns, a last literal
- byte is recorded only if it follows something of variable length. For
+ is no such byte, -1 is returned. For anchored patterns, a last literal
+ byte is recorded only if it follows something of variable length. For
example, for the pattern /^a\d+z\d+/ the returned value is "z", but for
/^a\dz\d/ the returned value is -1.
PCRE_INFO_MINLENGTH
- If the pattern was studied and a minimum length for matching subject
- strings was computed, its value is returned. Otherwise the returned
- value is -1. The value is a number of characters, not bytes (this may
- be relevant in UTF-8 mode). The fourth argument should point to an int
- variable. A non-negative value is a lower bound to the length of any
- matching string. There may not be any strings of that length that do
+ If the pattern was studied and a minimum length for matching subject
+ strings was computed, its value is returned. Otherwise the returned
+ value is -1. The value is a number of characters, not bytes (this may
+ be relevant in UTF-8 mode). The fourth argument should point to an int
+ variable. A non-negative value is a lower bound to the length of any
+ matching string. There may not be any strings of that length that do
actually match, but every string that does match is at least that long.
PCRE_INFO_NAMECOUNT
PCRE_INFO_NAMEENTRYSIZE
PCRE_INFO_NAMETABLE
- PCRE supports the use of named as well as numbered capturing parenthe-
- ses. The names are just an additional way of identifying the parenthe-
+ PCRE supports the use of named as well as numbered capturing parenthe-
+ ses. The names are just an additional way of identifying the parenthe-
ses, which still acquire numbers. Several convenience functions such as
- pcre_get_named_substring() are provided for extracting captured sub-
- strings by name. It is also possible to extract the data directly, by
- first converting the name to a number in order to access the correct
+ pcre_get_named_substring() are provided for extracting captured sub-
+ strings by name. It is also possible to extract the data directly, by
+ first converting the name to a number in order to access the correct
pointers in the output vector (described with pcre_exec() below). To do
- the conversion, you need to use the name-to-number map, which is
+ the conversion, you need to use the name-to-number map, which is
described by these three values.
The map consists of a number of fixed-size entries. PCRE_INFO_NAMECOUNT
gives the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size
- of each entry; both of these return an int value. The entry size
- depends on the length of the longest name. PCRE_INFO_NAMETABLE returns
- a pointer to the first entry of the table (a pointer to char). The
+ of each entry; both of these return an int value. The entry size
+ depends on the length of the longest name. PCRE_INFO_NAMETABLE returns
+ a pointer to the first entry of the table (a pointer to char). The
first two bytes of each entry are the number of the capturing parenthe-
- sis, most significant byte first. The rest of the entry is the corre-
+ sis, most significant byte first. The rest of the entry is the corre-
sponding name, zero terminated.
- The names are in alphabetical order. Duplicate names may appear if (?|
+ The names are in alphabetical order. Duplicate names may appear if (?|
is used to create multiple groups with the same number, as described in
- the section on duplicate subpattern numbers in the pcrepattern page.
- Duplicate names for subpatterns with different numbers are permitted
- only if PCRE_DUPNAMES is set. In all cases of duplicate names, they
- appear in the table in the order in which they were found in the pat-
- tern. In the absence of (?| this is the order of increasing number;
+ the section on duplicate subpattern numbers in the pcrepattern page.
+ Duplicate names for subpatterns with different numbers are permitted
+ only if PCRE_DUPNAMES is set. In all cases of duplicate names, they
+ appear in the table in the order in which they were found in the pat-
+ tern. In the absence of (?| this is the order of increasing number;
when (?| is used this is not necessarily the case because later subpat-
terns may have lower numbers.
- As a simple example of the name/number table, consider the following
- pattern (assume PCRE_EXTENDED is set, so white space - including new-
+ As a simple example of the name/number table, consider the following
+ pattern (assume PCRE_EXTENDED is set, so white space - including new-
lines - is ignored):
(?<date> (?<year>(\d\d)?\d\d) -
(?<month>\d\d) - (?<day>\d\d) )
- There are four named subpatterns, so the table has four entries, and
- each entry in the table is eight bytes long. The table is as follows,
+ There are four named subpatterns, so the table has four entries, and
+ each entry in the table is eight bytes long. The table is as follows,
with non-printing bytes shown in hexadecimal, and undefined bytes shown
as ??:
@@ -1817,31 +1840,31 @@ INFORMATION ABOUT A PATTERN
00 04 m o n t h 00
00 02 y e a r 00 ??
- When writing code to extract data from named subpatterns using the
- name-to-number map, remember that the length of the entries is likely
+ When writing code to extract data from named subpatterns using the
+ name-to-number map, remember that the length of the entries is likely
to be different for each compiled pattern.
PCRE_INFO_OKPARTIAL
- Return 1 if the pattern can be used for partial matching with
- pcre_exec(), otherwise 0. The fourth argument should point to an int
- variable. From release 8.00, this always returns 1, because the
- restrictions that previously applied to partial matching have been
- lifted. The pcrepartial documentation gives details of partial match-
+ Return 1 if the pattern can be used for partial matching with
+ pcre_exec(), otherwise 0. The fourth argument should point to an int
+ variable. From release 8.00, this always returns 1, because the
+ restrictions that previously applied to partial matching have been
+ lifted. The pcrepartial documentation gives details of partial match-
ing.
PCRE_INFO_OPTIONS
- Return a copy of the options with which the pattern was compiled. The
- fourth argument should point to an unsigned long int variable. These
+ Return a copy of the options with which the pattern was compiled. The
+ fourth argument should point to an unsigned long int variable. These
option bits are those specified in the call to pcre_compile(), modified
by any top-level option settings at the start of the pattern itself. In
- other words, they are the options that will be in force when matching
- starts. For example, if the pattern /(?im)abc(?-i)d/ is compiled with
- the PCRE_EXTENDED option, the result is PCRE_CASELESS, PCRE_MULTILINE,
+ other words, they are the options that will be in force when matching
+ starts. For example, if the pattern /(?im)abc(?-i)d/ is compiled with
+ the PCRE_EXTENDED option, the result is PCRE_CASELESS, PCRE_MULTILINE,
and PCRE_EXTENDED.
- A pattern is automatically anchored by PCRE if all of its top-level
+ A pattern is automatically anchored by PCRE if all of its top-level
alternatives begin with one of the following:
^ unless PCRE_MULTILINE is set
@@ -1855,7 +1878,7 @@ INFORMATION ABOUT A PATTERN
PCRE_INFO_SIZE
- Return the size of the compiled pattern, that is, the value that was
+ Return the size of the compiled pattern, that is, the value that was
passed as the argument to pcre_malloc() when PCRE was getting memory in
which to place the compiled data. The fourth argument should point to a
size_t variable.
@@ -1863,10 +1886,10 @@ INFORMATION ABOUT A PATTERN
PCRE_INFO_STUDYSIZE
Return the size of the data block pointed to by the study_data field in
- a pcre_extra block. That is, it is the value that was passed to
+ a pcre_extra block. That is, it is the value that was passed to
pcre_malloc() when PCRE was getting memory into which to place the data
- created by pcre_study(). If pcre_extra is NULL, or there is no study
- data, zero is returned. The fourth argument should point to a size_t
+ created by pcre_study(). If pcre_extra is NULL, or there is no study
+ data, zero is returned. The fourth argument should point to a size_t
variable.
@@ -1874,21 +1897,21 @@ OBSOLETE INFO FUNCTION
int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
- The pcre_info() function is now obsolete because its interface is too
- restrictive to return all the available data about a compiled pattern.
- New programs should use pcre_fullinfo() instead. The yield of
- pcre_info() is the number of capturing subpatterns, or one of the fol-
+ The pcre_info() function is now obsolete because its interface is too
+ restrictive to return all the available data about a compiled pattern.
+ New programs should use pcre_fullinfo() instead. The yield of
+ pcre_info() is the number of capturing subpatterns, or one of the fol-
lowing negative numbers:
PCRE_ERROR_NULL the argument code was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
- If the optptr argument is not NULL, a copy of the options with which
- the pattern was compiled is placed in the integer it points to (see
+ If the optptr argument is not NULL, a copy of the options with which
+ the pattern was compiled is placed in the integer it points to (see
PCRE_INFO_OPTIONS above).
- If the pattern is not anchored and the firstcharptr argument is not
- NULL, it is used to pass back information about the first character of
+ If the pattern is not anchored and the firstcharptr argument is not
+ NULL, it is used to pass back information about the first character of
any matched string (see PCRE_INFO_FIRSTBYTE above).
@@ -1896,21 +1919,21 @@ REFERENCE COUNTS
int pcre_refcount(pcre *code, int adjust);
- The pcre_refcount() function is used to maintain a reference count in
+ The pcre_refcount() function is used to maintain a reference count in
the data block that contains a compiled pattern. It is provided for the
- benefit of applications that operate in an object-oriented manner,
+ benefit of applications that operate in an object-oriented manner,
where different parts of the application may be using the same compiled
pattern, but you want to free the block when they are all done.
When a pattern is compiled, the reference count field is initialized to
- zero. It is changed only by calling this function, whose action is to
- add the adjust value (which may be positive or negative) to it. The
+ zero. It is changed only by calling this function, whose action is to
+ add the adjust value (which may be positive or negative) to it. The
yield of the function is the new value. However, the value of the count
- is constrained to lie between 0 and 65535, inclusive. If the new value
+ is constrained to lie between 0 and 65535, inclusive. If the new value
is outside these limits, it is forced to the appropriate limit value.
- Except when it is zero, the reference count is not correctly preserved
- if a pattern is compiled on one host and then transferred to a host
+ Except when it is zero, the reference count is not correctly preserved
+ if a pattern is compiled on one host and then transferred to a host
whose byte-order is different. (This seems a highly unlikely scenario.)
@@ -1920,18 +1943,18 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
const char *subject, int length, int startoffset,
int options, int *ovector, int ovecsize);
- The function pcre_exec() is called to match a subject string against a
- compiled pattern, which is passed in the code argument. If the pattern
- was studied, the result of the study should be passed in the extra
- argument. This function is the main matching facility of the library,
+ The function pcre_exec() is called to match a subject string against a
+ compiled pattern, which is passed in the code argument. If the pattern
+ was studied, the result of the study should be passed in the extra
+ argument. This function is the main matching facility of the library,
and it operates in a Perl-like manner. For specialist use there is also
- an alternative matching function, which is described below in the sec-
+ an alternative matching function, which is described below in the sec-
tion about the pcre_dfa_exec() function.
- In most applications, the pattern will have been compiled (and option-
- ally studied) in the same process that calls pcre_exec(). However, it
+ In most applications, the pattern will have been compiled (and option-
+ ally studied) in the same process that calls pcre_exec(). However, it
is possible to save compiled patterns and study data, and then use them
- later in different processes, possibly even on different hosts. For a
+ later in different processes, possibly even on different hosts. For a
discussion about this, see the pcreprecompile documentation.
Here is an example of a simple call to pcre_exec():
@@ -1950,10 +1973,10 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
Extra data for pcre_exec()
- If the extra argument is not NULL, it must point to a pcre_extra data
- block. The pcre_study() function returns such a block (when it doesn't
- return NULL), but you can also create one for yourself, and pass addi-
- tional information in it. The pcre_extra block contains the following
+ If the extra argument is not NULL, it must point to a pcre_extra data
+ block. The pcre_study() function returns such a block (when it doesn't
+ return NULL), but you can also create one for yourself, and pass addi-
+ tional information in it. The pcre_extra block contains the following
fields (not necessarily in this order):
unsigned long int flags;
@@ -1964,7 +1987,7 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
const unsigned char *tables;
unsigned char **mark;
- The flags field is a bitmap that specifies which of the other fields
+ The flags field is a bitmap that specifies which of the other fields
are set. The flag bits are:
PCRE_EXTRA_STUDY_DATA
@@ -1974,96 +1997,96 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
PCRE_EXTRA_TABLES
PCRE_EXTRA_MARK
- Other flag bits should be set to zero. The study_data field is set in
- the pcre_extra block that is returned by pcre_study(), together with
+ Other flag bits should be set to zero. The study_data field is set in
+ the pcre_extra block that is returned by pcre_study(), together with
the appropriate flag bit. You should not set this yourself, but you may
- add to the block by setting the other fields and their corresponding
+ add to the block by setting the other fields and their corresponding
flag bits.
The match_limit field provides a means of preventing PCRE from using up
- a vast amount of resources when running patterns that are not going to
- match, but which have a very large number of possibilities in their
- search trees. The classic example is a pattern that uses nested unlim-
+ a vast amount of resources when running patterns that are not going to
+ match, but which have a very large number of possibilities in their
+ search trees. The classic example is a pattern that uses nested unlim-
ited repeats.
- Internally, PCRE uses a function called match() which it calls repeat-
- edly (sometimes recursively). The limit set by match_limit is imposed
- on the number of times this function is called during a match, which
- has the effect of limiting the amount of backtracking that can take
+ Internally, PCRE uses a function called match() which it calls repeat-
+ edly (sometimes recursively). The limit set by match_limit is imposed
+ on the number of times this function is called during a match, which
+ has the effect of limiting the amount of backtracking that can take
place. For patterns that are not anchored, the count restarts from zero
for each position in the subject string.
- The default value for the limit can be set when PCRE is built; the
- default default is 10 million, which handles all but the most extreme
- cases. You can override the default by supplying pcre_exec() with a
- pcre_extra block in which match_limit is set, and
- PCRE_EXTRA_MATCH_LIMIT is set in the flags field. If the limit is
+ The default value for the limit can be set when PCRE is built; the
+ default default is 10 million, which handles all but the most extreme
+ cases. You can override the default by supplying pcre_exec() with a
+ pcre_extra block in which match_limit is set, and
+ PCRE_EXTRA_MATCH_LIMIT is set in the flags field. If the limit is
exceeded, pcre_exec() returns PCRE_ERROR_MATCHLIMIT.
- The match_limit_recursion field is similar to match_limit, but instead
+ The match_limit_recursion field is similar to match_limit, but instead
of limiting the total number of times that match() is called, it limits
- the depth of recursion. The recursion depth is a smaller number than
- the total number of calls, because not all calls to match() are recur-
+ the depth of recursion. The recursion depth is a smaller number than
+ the total number of calls, because not all calls to match() are recur-
sive. This limit is of use only if it is set smaller than match_limit.
- Limiting the recursion depth limits the amount of stack that can be
+ Limiting the recursion depth limits the amount of stack that can be
used, or, when PCRE has been compiled to use memory on the heap instead
of the stack, the amount of heap memory that can be used.
- The default value for match_limit_recursion can be set when PCRE is
- built; the default default is the same value as the default for
- match_limit. You can override the default by supplying pcre_exec() with
- a pcre_extra block in which match_limit_recursion is set, and
- PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the flags field. If the
+ The default value for match_limit_recursion can be set when PCRE is
+ built; the default default is the same value as the default for
+ match_limit. You can override the default by supplying pcre_exec() with
+ a pcre_extra block in which match_limit_recursion is set, and
+ PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the flags field. If the
limit is exceeded, pcre_exec() returns PCRE_ERROR_RECURSIONLIMIT.
- The callout_data field is used in conjunction with the "callout" fea-
+ The callout_data field is used in conjunction with the "callout" fea-
ture, and is described in the pcrecallout documentation.
- The tables field is used to pass a character tables pointer to
- pcre_exec(); this overrides the value that is stored with the compiled
- pattern. A non-NULL value is stored with the compiled pattern only if
- custom tables were supplied to pcre_compile() via its tableptr argu-
+ The tables field is used to pass a character tables pointer to
+ pcre_exec(); this overrides the value that is stored with the compiled
+ pattern. A non-NULL value is stored with the compiled pattern only if
+ custom tables were supplied to pcre_compile() via its tableptr argu-
ment. If NULL is passed to pcre_exec() using this mechanism, it forces
- PCRE's internal tables to be used. This facility is helpful when re-
- using patterns that have been saved after compiling with an external
- set of tables, because the external tables might be at a different
- address when pcre_exec() is called. See the pcreprecompile documenta-
+ PCRE's internal tables to be used. This facility is helpful when re-
+ using patterns that have been saved after compiling with an external
+ set of tables, because the external tables might be at a different
+ address when pcre_exec() is called. See the pcreprecompile documenta-
tion for a discussion of saving compiled patterns for later use.
- If PCRE_EXTRA_MARK is set in the flags field, the mark field must be
- set to point to a char * variable. If the pattern contains any back-
- tracking control verbs such as (*MARK:NAME), and the execution ends up
- with a name to pass back, a pointer to the name string (zero termi-
- nated) is placed in the variable pointed to by the mark field. The
- names are within the compiled pattern; if you wish to retain such a
- name you must copy it before freeing the memory of a compiled pattern.
- If there is no name to pass back, the variable pointed to by the mark
- field is set to NULL. For details of the backtracking control verbs, see
+ If PCRE_EXTRA_MARK is set in the flags field, the mark field must be
+ set to point to a char * variable. If the pattern contains any back-
+ tracking control verbs such as (*MARK:NAME), and the execution ends up
+ with a name to pass back, a pointer to the name string (zero termi-
+ nated) is placed in the variable pointed to by the mark field. The
+ names are within the compiled pattern; if you wish to retain such a
+ name you must copy it before freeing the memory of a compiled pattern.
+ If there is no name to pass back, the variable pointed to by the mark
+ field is set to NULL. For details of the backtracking control verbs, see
the section entitled "Backtracking control" in the pcrepattern documen-
tation.
Option bits for pcre_exec()
- The unused bits of the options argument for pcre_exec() must be zero.
- The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_xxx,
- PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART,
- PCRE_NO_START_OPTIMIZE, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_SOFT, and
+ The unused bits of the options argument for pcre_exec() must be zero.
+ The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_xxx,
+ PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART,
+ PCRE_NO_START_OPTIMIZE, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_SOFT, and
PCRE_PARTIAL_HARD.
PCRE_ANCHORED
- The PCRE_ANCHORED option limits pcre_exec() to matching at the first
- matching position. If a pattern was compiled with PCRE_ANCHORED, or
- turned out to be anchored by virtue of its contents, it cannot be made
+ The PCRE_ANCHORED option limits pcre_exec() to matching at the first
+ matching position. If a pattern was compiled with PCRE_ANCHORED, or
+ turned out to be anchored by virtue of its contents, it cannot be made
unanchored at matching time.
PCRE_BSR_ANYCRLF
PCRE_BSR_UNICODE
These options (which are mutually exclusive) control what the \R escape
- sequence matches. The choice is either to match only CR, LF, or CRLF,
- or to match any Unicode newline sequence. These options override the
+ sequence matches. The choice is either to match only CR, LF, or CRLF,
+ or to match any Unicode newline sequence. These options override the
choice that was made or defaulted when the pattern was compiled.
PCRE_NEWLINE_CR
@@ -2072,149 +2095,152 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
PCRE_NEWLINE_ANYCRLF
PCRE_NEWLINE_ANY
- These options override the newline definition that was chosen or
- defaulted when the pattern was compiled. For details, see the descrip-
- tion of pcre_compile() above. During matching, the newline choice
- affects the behaviour of the dot, circumflex, and dollar metacharac-
- ters. It may also alter the way the match position is advanced after a
+ These options override the newline definition that was chosen or
+ defaulted when the pattern was compiled. For details, see the descrip-
+ tion of pcre_compile() above. During matching, the newline choice
+ affects the behaviour of the dot, circumflex, and dollar metacharac-
+ ters. It may also alter the way the match position is advanced after a
match failure for an unanchored pattern.
- When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is
- set, and a match attempt for an unanchored pattern fails when the cur-
- rent position is at a CRLF sequence, and the pattern contains no
- explicit matches for CR or LF characters, the match position is
+ When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is
+ set, and a match attempt for an unanchored pattern fails when the cur-
+ rent position is at a CRLF sequence, and the pattern contains no
+ explicit matches for CR or LF characters, the match position is
advanced by two characters instead of one, in other words, to after the
CRLF.
The above rule is a compromise that makes the most common cases work as
- expected. For example, if the pattern is .+A (and the PCRE_DOTALL
+ expected. For example, if the pattern is .+A (and the PCRE_DOTALL
option is not set), it does not match the string "\r\nA" because, after
- failing at the start, it skips both the CR and the LF before retrying.
- However, the pattern [\r\n]A does match that string, because it con-
+ failing at the start, it skips both the CR and the LF before retrying.
+ However, the pattern [\r\n]A does match that string, because it con-
tains an explicit CR or LF reference, and so advances only by one char-
acter after the first failure.
An explicit match for CR or LF is either a literal appearance of one of
- those characters, or one of the \r or \n escape sequences. Implicit
- matches such as [^X] do not count, nor does \s (which includes CR and
+ those characters, or one of the \r or \n escape sequences. Implicit
+ matches such as [^X] do not count, nor does \s (which includes CR and
LF in the characters that it matches).
- Notwithstanding the above, anomalous effects may still occur when CRLF
+ Notwithstanding the above, anomalous effects may still occur when CRLF
is a valid newline sequence and explicit \r or \n escapes appear in the
pattern.
PCRE_NOTBOL
This option specifies that the first character of the subject string is not
- the beginning of a line, so the circumflex metacharacter should not
- match before it. Setting this without PCRE_MULTILINE (at compile time)
- causes circumflex never to match. This option affects only the behav-
+ the beginning of a line, so the circumflex metacharacter should not
+ match before it. Setting this without PCRE_MULTILINE (at compile time)
+ causes circumflex never to match. This option affects only the behav-
iour of the circumflex metacharacter. It does not affect \A.
PCRE_NOTEOL
This option specifies that the end of the subject string is not the end
- of a line, so the dollar metacharacter should not match it nor (except
- in multiline mode) a newline immediately before it. Setting this with-
+ of a line, so the dollar metacharacter should not match it nor (except
+ in multiline mode) a newline immediately before it. Setting this with-
out PCRE_MULTILINE (at compile time) causes dollar never to match. This
- option affects only the behaviour of the dollar metacharacter. It does
+ option affects only the behaviour of the dollar metacharacter. It does
not affect \Z or \z.
PCRE_NOTEMPTY
An empty string is not considered to be a valid match if this option is
- set. If there are alternatives in the pattern, they are tried. If all
- the alternatives match the empty string, the entire match fails. For
+ set. If there are alternatives in the pattern, they are tried. If all
+ the alternatives match the empty string, the entire match fails. For
example, if the pattern
a?b?
- is applied to a string not beginning with "a" or "b", it matches an
- empty string at the start of the subject. With PCRE_NOTEMPTY set, this
+ is applied to a string not beginning with "a" or "b", it matches an
+ empty string at the start of the subject. With PCRE_NOTEMPTY set, this
match is not valid, so PCRE searches further into the string for occur-
rences of "a" or "b".
PCRE_NOTEMPTY_ATSTART
- This is like PCRE_NOTEMPTY, except that an empty string match that is
- not at the start of the subject is permitted. If the pattern is
+ This is like PCRE_NOTEMPTY, except that an empty string match that is
+ not at the start of the subject is permitted. If the pattern is
anchored, such a match can occur only if the pattern contains \K.
- Perl has no direct equivalent of PCRE_NOTEMPTY or
- PCRE_NOTEMPTY_ATSTART, but it does make a special case of a pattern
- match of the empty string within its split() function, and when using
- the /g modifier. It is possible to emulate Perl's behaviour after
+ Perl has no direct equivalent of PCRE_NOTEMPTY or
+ PCRE_NOTEMPTY_ATSTART, but it does make a special case of a pattern
+ match of the empty string within its split() function, and when using
+ the /g modifier. It is possible to emulate Perl's behaviour after
matching a null string by first trying the match again at the same off-
- set with PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED, and then if that
+ set with PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED, and then if that
fails, by advancing the starting offset (see below) and trying an ordi-
- nary match again. There is some code that demonstrates how to do this
- in the pcredemo sample program. In the most general case, you have to
- check to see if the newline convention recognizes CRLF as a newline,
- and if so, and the current character is CR followed by LF, advance the
+ nary match again. There is some code that demonstrates how to do this
+ in the pcredemo sample program. In the most general case, you have to
+ check to see if the newline convention recognizes CRLF as a newline,
+ and if so, and the current character is CR followed by LF, advance the
starting offset by two characters instead of one.
PCRE_NO_START_OPTIMIZE
- There are a number of optimizations that pcre_exec() uses at the start
- of a match, in order to speed up the process. For example, if it is
+ There are a number of optimizations that pcre_exec() uses at the start
+ of a match, in order to speed up the process. For example, if it is
known that an unanchored match must start with a specific character, it
- searches the subject for that character, and fails immediately if it
- cannot find it, without actually running the main matching function.
+ searches the subject for that character, and fails immediately if it
+ cannot find it, without actually running the main matching function.
This means that a special item such as (*COMMIT) at the start of a pat-
- tern is not considered until after a suitable starting point for the
- match has been found. When callouts or (*MARK) items are in use, these
+ tern is not considered until after a suitable starting point for the
+ match has been found. When callouts or (*MARK) items are in use, these
"start-up" optimizations can cause them to be skipped if the pattern is
- never actually used. The start-up optimizations are in effect a pre-
+ never actually used. The start-up optimizations are in effect a pre-
scan of the subject that takes place before the pattern is run.
- The PCRE_NO_START_OPTIMIZE option disables the start-up optimizations,
- possibly causing performance to suffer, but ensuring that in cases
- where the result is "no match", the callouts do occur, and that items
+ The PCRE_NO_START_OPTIMIZE option disables the start-up optimizations,
+ possibly causing performance to suffer, but ensuring that in cases
+ where the result is "no match", the callouts do occur, and that items
such as (*COMMIT) and (*MARK) are considered at every possible starting
- position in the subject string. If PCRE_NO_START_OPTIMIZE is set at
+ position in the subject string. If PCRE_NO_START_OPTIMIZE is set at
compile time, it cannot be unset at matching time.
- Setting PCRE_NO_START_OPTIMIZE can change the outcome of a matching
+ Setting PCRE_NO_START_OPTIMIZE can change the outcome of a matching
operation. Consider the pattern
(*COMMIT)ABC
- When this is compiled, PCRE records the fact that a match must start
- with the character "A". Suppose the subject string is "DEFABC". The
- start-up optimization scans along the subject, finds "A" and runs the
- first match attempt from there. The (*COMMIT) item means that the pat-
- tern must match the current starting position, which in this case, it
- does. However, if the same match is run with PCRE_NO_START_OPTIMIZE
- set, the initial scan along the subject string does not happen. The
- first match attempt is run starting from "D" and when this fails,
- (*COMMIT) prevents any further matches being tried, so the overall
- result is "no match". If the pattern is studied, more start-up opti-
- mizations may be used. For example, a minimum length for the subject
+ When this is compiled, PCRE records the fact that a match must start
+ with the character "A". Suppose the subject string is "DEFABC". The
+ start-up optimization scans along the subject, finds "A" and runs the
+ first match attempt from there. The (*COMMIT) item means that the pat-
+ tern must match the current starting position, which in this case, it
+ does. However, if the same match is run with PCRE_NO_START_OPTIMIZE
+ set, the initial scan along the subject string does not happen. The
+ first match attempt is run starting from "D" and when this fails,
+ (*COMMIT) prevents any further matches being tried, so the overall
+ result is "no match". If the pattern is studied, more start-up opti-
+ mizations may be used. For example, a minimum length for the subject
may be recorded. Consider the pattern
(*MARK:A)(X|Y)
- The minimum length for a match is one character. If the subject is
- "ABC", there will be attempts to match "ABC", "BC", "C", and then
- finally an empty string. If the pattern is studied, the final attempt
- does not take place, because PCRE knows that the subject is too short,
- and so the (*MARK) is never encountered. In this case, studying the
- pattern does not affect the overall match result, which is still "no
+ The minimum length for a match is one character. If the subject is
+ "ABC", there will be attempts to match "ABC", "BC", "C", and then
+ finally an empty string. If the pattern is studied, the final attempt
+ does not take place, because PCRE knows that the subject is too short,
+ and so the (*MARK) is never encountered. In this case, studying the
+ pattern does not affect the overall match result, which is still "no
match", but it does affect the auxiliary information that is returned.
PCRE_NO_UTF8_CHECK
When PCRE_UTF8 is set at compile time, the validity of the subject as a
- UTF-8 string is automatically checked when pcre_exec() is subsequently
- called. The value of startoffset is also checked to ensure that it
- points to the start of a UTF-8 character. There is a discussion about
- the validity of UTF-8 strings in the section on UTF-8 support in the
- main pcre page. If an invalid UTF-8 sequence of bytes is found,
- pcre_exec() returns the error PCRE_ERROR_BADUTF8 or, if PCRE_PAR-
- TIAL_HARD is set and the problem is a truncated UTF-8 character at the
- end of the subject, PCRE_ERROR_SHORTUTF8. If startoffset contains a
- value that does not point to the start of a UTF-8 character (or to the
- end of the subject), PCRE_ERROR_BADUTF8_OFFSET is returned.
+ UTF-8 string is automatically checked when pcre_exec() is subsequently
+ called. The value of startoffset is also checked to ensure that it
+ points to the start of a UTF-8 character. There is a discussion about
+ the validity of UTF-8 strings in the section on UTF-8 support in the
+ main pcre page. If an invalid UTF-8 sequence of bytes is found,
+ pcre_exec() returns the error PCRE_ERROR_BADUTF8 or, if PCRE_PAR-
+ TIAL_HARD is set and the problem is a truncated UTF-8 character at the
+ end of the subject, PCRE_ERROR_SHORTUTF8. In both cases, information
+ about the precise nature of the error may also be returned (see the
+ descriptions of these errors in the section entitled Error return val-
+ ues from pcre_exec() below). If startoffset contains a value that does
+ not point to the start of a UTF-8 character (or to the end of the sub-
+ ject), PCRE_ERROR_BADUTF8_OFFSET is returned.
If you already know that your subject is valid, and you want to skip
these checks for performance reasons, you can set the
@@ -2444,13 +2470,19 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
PCRE_ERROR_BADUTF8 (-10)
A string that contains an invalid UTF-8 byte sequence was passed as a
- subject. However, if PCRE_PARTIAL_HARD is set and the problem is a
- truncated UTF-8 character at the end of the subject, PCRE_ERROR_SHORT-
- UTF8 is used instead.
+ subject, and the PCRE_NO_UTF8_CHECK option was not set. If the size of
+ the output vector (ovecsize) is at least 2, the byte offset to the
+ start of the invalid UTF-8 character is placed in the first ele-
+ ment, and a reason code is placed in the second element. The reason
+ codes are listed in the following section. For backward compatibility,
+ if PCRE_PARTIAL_HARD is set and the problem is a truncated UTF-8 char-
+ acter at the end of the subject (reason codes 1 to 5),
+ PCRE_ERROR_SHORTUTF8 is returned instead of PCRE_ERROR_BADUTF8.
PCRE_ERROR_BADUTF8_OFFSET (-11)
- The UTF-8 byte sequence that was passed as a subject was valid, but the
+ The UTF-8 byte sequence that was passed as a subject was checked and
+ found to be valid (the PCRE_NO_UTF8_CHECK option was not set), but the
value of startoffset did not point to the beginning of a UTF-8 charac-
ter or the end of the subject.
@@ -2492,12 +2524,97 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
PCRE_ERROR_SHORTUTF8 (-25)
- The subject string ended with an incomplete (truncated) UTF-8 charac-
- ter, and the PCRE_PARTIAL_HARD option was set. Without this option,
- PCRE_ERROR_BADUTF8 is returned in this situation.
+ This error is returned instead of PCRE_ERROR_BADUTF8 when the subject
+ string ends with a truncated UTF-8 character and the PCRE_PARTIAL_HARD
+ option is set. Information about the failure is returned as for
+ PCRE_ERROR_BADUTF8. It is in fact sufficient to detect this case, but
+ this special error code for PCRE_PARTIAL_HARD precedes the implementa-
+ tion of returned information; it is retained for backwards compatibil-
+ ity.
+
+ PCRE_ERROR_RECURSELOOP (-26)
+
+ This error is returned when pcre_exec() detects a recursion loop within
+ the pattern. Specifically, it means that either the whole pattern or a
+ subpattern has been called recursively for the second time at the same
+ position in the subject string. Some simple patterns that might do this
+ are detected and faulted at compile time, but more complicated cases,
+ in particular mutual recursions between two different subpatterns, can-
+ not be detected until run time.
Error numbers -16 to -20 and -22 are not used by pcre_exec().
+ Reason codes for invalid UTF-8 strings
+
+ When pcre_exec() returns either PCRE_ERROR_BADUTF8 or PCRE_ERROR_SHORT-
+ UTF8, and the size of the output vector (ovecsize) is at least 2, the
+ offset of the start of the invalid UTF-8 character is placed in the
+ first output vector element (ovector[0]) and a reason code is placed in
+ the second element (ovector[1]). The reason codes are given names in
+ the pcre.h header file:
+
+ PCRE_UTF8_ERR1
+ PCRE_UTF8_ERR2
+ PCRE_UTF8_ERR3
+ PCRE_UTF8_ERR4
+ PCRE_UTF8_ERR5
+
+ The string ends with a truncated UTF-8 character; the code specifies
+ how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
+ characters to be no longer than 4 bytes, the encoding scheme (origi-
+ nally defined by RFC 2279) allows for up to 6 bytes, and this is
+ checked first; hence the possibility of 4 or 5 missing bytes.
+
+ PCRE_UTF8_ERR6
+ PCRE_UTF8_ERR7
+ PCRE_UTF8_ERR8
+ PCRE_UTF8_ERR9
+ PCRE_UTF8_ERR10
+
+ The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
+ the character do not have the binary value 0b10 (that is, either the
+ most significant bit is 0, or the next bit is 1).
+
+ PCRE_UTF8_ERR11
+ PCRE_UTF8_ERR12
+
+ A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
+ long; these code points are excluded by RFC 3629.
+
+ PCRE_UTF8_ERR13
+
+ A 4-byte character has a value greater than 0x10ffff; these code points
+ are excluded by RFC 3629.
+
+ PCRE_UTF8_ERR14
+
+ A 3-byte character has a value in the range 0xd800 to 0xdfff; this
+ range of code points is reserved by RFC 3629 for use with UTF-16, and
+ so is excluded from UTF-8.
+
+ PCRE_UTF8_ERR15
+ PCRE_UTF8_ERR16
+ PCRE_UTF8_ERR17
+ PCRE_UTF8_ERR18
+ PCRE_UTF8_ERR19
+
+ A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
+ for a value that can be represented by fewer bytes, which is invalid.
+ For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
+ rect coding uses just one byte.
+
+ PCRE_UTF8_ERR20
+
+ The two most significant bits of the first byte of a character have the
+ binary value 0b10 (that is, the most significant bit is 1 and the sec-
+ ond is 0). Such a byte can only validly occur as the second or subse-
+ quent byte of a multi-byte character.
+
+ PCRE_UTF8_ERR21
+
+ The first byte of a character has the value 0xfe or 0xff. These values
+ can never occur in a valid UTF-8 string.
+
EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
@@ -2673,9 +2790,9 @@ DUPLICATE SUBPATTERN NAMES
the name-to-number table for the given name. The function itself
returns the length of each entry, or PCRE_ERROR_NOSUBSTRING (-7) if
there are none. The format of the table is described above in the sec-
- tion entitled Information about a pattern. Given all the relevant
- entries for the name, you can extract each of their numbers, and hence
- the captured data, if any.
+ tion entitled Information about a pattern. Given all the rele-
+ vant entries for the name, you can extract each of their numbers, and
+ hence the captured data, if any.
FINDING ALL POSSIBLE MATCHES
@@ -2874,8 +2991,8 @@ AUTHOR
REVISION
- Last updated: 21 November 2010
- Copyright (c) 1997-2010 University of Cambridge.
+ Last updated: 28 July 2011
+ Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
@@ -2956,21 +3073,22 @@ THE CALLOUT INTERFACE
only argument to the callout function is a pointer to a pcre_callout
block. This structure contains the following fields:
- int version;
- int callout_number;
- int *offset_vector;
- const char *subject;
- int subject_length;
- int start_match;
- int current_position;
- int capture_top;
- int capture_last;
- void *callout_data;
- int pattern_position;
- int next_item_length;
+ int version;
+ int callout_number;
+ int *offset_vector;
+ const char *subject;
+ int subject_length;
+ int start_match;
+ int current_position;
+ int capture_top;
+ int capture_last;
+ void *callout_data;
+ int pattern_position;
+ int next_item_length;
+ const unsigned char *mark;
The version field is an integer containing the version number of the
- block format. The initial version was 0; the current version is 1. The
+ block format. The initial version was 0; the current version is 2. The
version number will change again in future if additional fields are
added, but the intention is never to remove any of the existing fields.
@@ -3030,6 +3148,12 @@ THE CALLOUT INTERFACE
in distinguishing between different automatic callouts, which all have
the same callout number. However, they are set for all callouts.
+ The mark field is present from version 2 of the pcre_callout structure.
+ In callouts from pcre_exec() it contains a pointer to the zero-termi-
+ nated name of the most recently passed (*MARK) item in the match, or
+ NULL if there are no (*MARK)s in the current matching path. In callouts
+ from pcre_dfa_exec() this field always contains NULL.
+
RETURN VALUES
@@ -3056,8 +3180,8 @@ AUTHOR
REVISION
- Last updated: 21 November 2010
- Copyright (c) 1997-2010 University of Cambridge.
+ Last updated: 31 July 2011
+ Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
@@ -3078,10 +3202,12 @@ DIFFERENCES BETWEEN PCRE AND PERL
of what it does have are given in the section on UTF-8 support in the
main pcre page.
- 2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl
- permits them, but they do not mean what you might think. For example,
- (?!a){3} does not assert that the next three characters are not "a". It
- just asserts that the next character is not "a" three times.
+ 2. PCRE allows repeat quantifiers only on parenthesized assertions, but
+ they do not mean what you might think. For example, (?!a){3} does not
+ assert that the next three characters are not "a". It just asserts that
+ the next character is not "a" three times (in principle: PCRE optimizes
+ this to run the assertion just once). Perl allows repeat quantifiers on
+ other assertions such as \b, but these do not seem to have any use.
3. Capturing subpatterns that occur inside negative lookahead asser-
tions are counted, but their entries in the offsets vector are never
@@ -3096,9 +3222,11 @@ DIFFERENCES BETWEEN PCRE AND PERL
the pattern to represent a binary zero.
5. The following Perl escape sequences are not supported: \l, \u, \L,
- \U, and \N. In fact these are implemented by Perl's general string-han-
- dling and are not part of its pattern matching engine. If any of these
- are encountered by PCRE, an error is generated.
+ \U, and \N when followed by a character name or Unicode value. (\N on
+ its own, matching a non-newline character, is supported.) In fact these
+ are implemented by Perl's general string-handling and are not part of
+ its pattern matching engine. If any of these are encountered by PCRE,
+ an error is generated.
6. The Perl escape sequences \p, \P, and \X are supported only if PCRE
is built with Unicode character property support. The properties that
@@ -3110,10 +3238,15 @@ DIFFERENCES BETWEEN PCRE AND PERL
tion of Unicode characters, there is no need to implement the somewhat
messy concept of surrogates."
- 7. PCRE does support the \Q...\E escape for quoting substrings. Charac-
- ters in between are treated as literals. This is slightly different
- from Perl in that $ and @ are also handled as literals inside the
- quotes. In Perl, they cause variable interpolation (but of course PCRE
+ 7. PCRE implements a simpler version of \X than Perl, which changed to
+ make \X match what Unicode calls an "extended grapheme cluster". This
+ is more complicated than an extended Unicode sequence, which is what
+ PCRE matches.
+
+ 8. PCRE does support the \Q...\E escape for quoting substrings. Charac-
+ ters in between are treated as literals. This is slightly different
+ from Perl in that $ and @ are also handled as literals inside the
+ quotes. In Perl, they cause variable interpolation (but of course PCRE
does not have variables). Note the following examples:
Pattern PCRE matches Perl matches
@@ -3123,58 +3256,60 @@ DIFFERENCES BETWEEN PCRE AND PERL
\Qabc\$xyz\E abc\$xyz abc\$xyz
\Qabc\E\$\Qxyz\E abc$xyz abc$xyz
- The \Q...\E sequence is recognized both inside and outside character
+ The \Q...\E sequence is recognized both inside and outside character
classes.
- 8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
- constructions. However, there is support for recursive patterns. This
- is not available in Perl 5.8, but it is in Perl 5.10. Also, the PCRE
- "callout" feature allows an external function to be called during pat-
+ 9. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
+ constructions. However, there is support for recursive patterns. This
+ is not available in Perl 5.8, but it is in Perl 5.10. Also, the PCRE
+ "callout" feature allows an external function to be called during pat-
tern matching. See the pcrecallout documentation for details.
- 9. Subpatterns that are called recursively or as "subroutines" are
- always treated as atomic groups in PCRE. This is like Python, but
- unlike Perl. There is a discussion of an example that explains this in
- more detail in the section on recursion differences from Perl in the
+ 10. Subpatterns that are called recursively or as "subroutines" are
+ always treated as atomic groups in PCRE. This is like Python, but
+ unlike Perl. There is a discussion of an example that explains this in
+ more detail in the section on recursion differences from Perl in the
pcrepattern page.
- 10. There are some differences that are concerned with the settings of
- captured strings when part of a pattern is repeated. For example,
- matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2
+ 11. There are some differences that are concerned with the settings of
+ captured strings when part of a pattern is repeated. For example,
+ matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2
unset, but in PCRE it is set to "b".
- 11. PCRE's handling of duplicate subpattern numbers and duplicate sub-
+ 12. PCRE's handling of duplicate subpattern numbers and duplicate sub-
pattern names is not as general as Perl's. This is a consequence of the
fact that PCRE works internally just with numbers, using an external ta-
- ble to translate between numbers and names. In particular, a pattern
- such as (?|(?<a>A)|(?<b)B), where the two capturing parentheses have
- the same number but different names, is not supported, and causes an
- error at compile time. If it were allowed, it would not be possible to
- distinguish which parentheses matched, because both names map to cap-
+ ble to translate between numbers and names. In particular, a pattern
+ such as (?|(?<a>A)|(?<b>B)), where the two capturing parentheses have
+ the same number but different names, is not supported, and causes an
+ error at compile time. If it were allowed, it would not be possible to
+ distinguish which parentheses matched, because both names map to cap-
turing subpattern number 1. To avoid this confusing situation, an error
is given at compile time.
- 12. Perl recognizes comments in some places that PCRE doesn't, for
- example, between the ( and ? at the start of a subpattern.
+ 13. Perl recognizes comments in some places that PCRE does not, for
+ example, between the ( and ? at the start of a subpattern. If the /x
+ modifier is set, Perl allows whitespace between ( and ? but PCRE never
+ does, even if the PCRE_EXTENDED option is set.
- 13. PCRE provides some extensions to the Perl regular expression facil-
- ities. Perl 5.10 includes new features that are not in earlier ver-
- sions of Perl, some of which (such as named parentheses) have been in
+ 14. PCRE provides some extensions to the Perl regular expression facil-
+ ities. Perl 5.10 includes new features that are not in earlier ver-
+ sions of Perl, some of which (such as named parentheses) have been in
PCRE for some time. This list is with respect to Perl 5.10:
- (a) Although lookbehind assertions in PCRE must match fixed length
- strings, each alternative branch of a lookbehind assertion can match a
- different length of string. Perl requires them all to have the same
+ (a) Although lookbehind assertions in PCRE must match fixed length
+ strings, each alternative branch of a lookbehind assertion can match a
+ different length of string. Perl requires them all to have the same
length.
- (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
+ (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
meta-character matches only at the very end of the string.
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no spe-
cial meaning is faulted. Otherwise, like Perl, the backslash is quietly
ignored. (Perl can be made to issue a warning.)
- (d) If PCRE_UNGREEDY is set, the greediness of the repetition quanti-
+ (d) If PCRE_UNGREEDY is set, the greediness of the repetition quanti-
fiers is inverted, that is, by default they are not greedy, but if fol-
lowed by a question mark they are.
@@ -3182,10 +3317,10 @@ DIFFERENCES BETWEEN PCRE AND PERL
tried only at the first matching position in the subject string.
(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART,
- and PCRE_NO_AUTO_CAPTURE options for pcre_exec() have no Perl equiva-
+ and PCRE_NO_AUTO_CAPTURE options for pcre_exec() have no Perl equiva-
lents.
- (g) The \R escape sequence can be restricted to match only CR, LF, or
+ (g) The \R escape sequence can be restricted to match only CR, LF, or
CRLF by the PCRE_BSR_ANYCRLF option.
(h) The callout facility is PCRE-specific.
@@ -3195,10 +3330,10 @@ DIFFERENCES BETWEEN PCRE AND PERL
(j) Patterns compiled by PCRE can be saved and re-used at a later time,
even on different hosts that have the other endianness.
- (k) The alternative matching function (pcre_dfa_exec()) matches in a
+ (k) The alternative matching function (pcre_dfa_exec()) matches in a
different way and is not Perl-compatible.
- (l) PCRE recognizes some special sequences such as (*CR) at the start
+ (l) PCRE recognizes some special sequences such as (*CR) at the start
of a pattern that set overall options that cannot be changed within the
pattern.
@@ -3212,8 +3347,8 @@ AUTHOR
REVISION
- Last updated: 31 October 2010
- Copyright (c) 1997-2010 University of Cambridge.
+ Last updated: 24 July 2011
+ Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
@@ -3415,7 +3550,11 @@ BACKSLASH
\Qabc\E\$\Qxyz\E abc$xyz abc$xyz
The \Q...\E sequence is recognized both inside and outside character
- classes. An isolated \E that is not preceded by \Q is ignored.
+ classes. An isolated \E that is not preceded by \Q is ignored. If \Q
+ is not followed by \E later in the pattern, the literal interpretation
+ continues to the end of the pattern (that is, \E is assumed at the
+ end). If the isolated \Q is inside a character class, this causes an
+ error, because the character class is not terminated.
Non-printing characters
@@ -3808,6 +3947,10 @@ BACKSLASH
None of them have codepoints less than 256, so in non-UTF-8 mode \X
matches any one character.
+ Note that recent versions of Perl have changed \X to match what Unicode
+ calls an "extended grapheme cluster", which has a more complicated def-
+ inition.
+
Matching characters by Unicode property is not fast, because PCRE has
to search a structure that contains data for over fifteen thousand
characters. That is why the traditional escape sequences such as \d and
@@ -4431,7 +4574,7 @@ REPETITION
an escape such as \d or \pL that matches a single character
a character class
a back reference (see next section)
- a parenthesized subpattern (unless it is an assertion)
+ a parenthesized subpattern (including assertions)
a recursive or "subroutine" call to a subpattern
The general repetition quantifier specifies a minimum and maximum num-
@@ -4807,13 +4950,29 @@ ASSERTIONS
matched in the normal way, except that it does not cause the current
matching position to be changed.
- Assertion subpatterns are not capturing subpatterns, and may not be
- repeated, because it makes no sense to assert the same thing several
- times. If any kind of assertion contains capturing subpatterns within
- it, these are counted for the purposes of numbering the capturing sub-
- patterns in the whole pattern. However, substring capturing is carried
- out only for positive assertions, because it does not make sense for
- negative assertions.
+ Assertion subpatterns are not capturing subpatterns. If such an asser-
+ tion contains capturing subpatterns within it, these are counted for
+ the purposes of numbering the capturing subpatterns in the whole pat-
+ tern. However, substring capturing is carried out only for positive
+ assertions, because it does not make sense for negative assertions.
+
+ For compatibility with Perl, assertion subpatterns may be repeated;
+ though it makes no sense to assert the same thing several times, the
+ side effect of capturing parentheses may occasionally be useful. In
+ practice, there are only three cases:
+
+ (1) If the quantifier is {0}, the assertion is never obeyed during
+ matching. However, it may contain internal capturing parenthesized
+ groups that are called from elsewhere via the subroutine mechanism.
+
+ (2) If the quantifier is {0,n} where n is greater than zero, it is treated
+ as if it were {0,1}. At run time, the rest of the pattern match is
+ tried with and without the assertion, the order depending on the greed-
+ iness of the quantifier.
+
+ (3) If the minimum repetition is greater than zero, the quantifier is
+ ignored. The assertion is obeyed just once when encountered during
+ matching.
Lookahead assertions
@@ -5433,7 +5592,9 @@ BACKTRACKING CONTROL
If any of these verbs are used in an assertion or subroutine subpattern
(including recursive subpatterns), their effect is confined to that
- subpattern; it does not extend to the surrounding pattern. Note that
+ subpattern; it does not extend to the surrounding pattern, with one
+ exception: a *MARK that is encountered in a positive assertion is
+ passed back (compare capturing parentheses in assertions). Note that
such subpatterns are processed as anchored at the point where they are
tested.
@@ -5519,6 +5680,10 @@ BACKTRACKING CONTROL
efficient way of obtaining this information than putting each alterna-
tive in its own capturing parentheses.
+ If (*MARK) is encountered in a positive assertion, its name is recorded
+ and passed back if it is the last-encountered. This does not happen for
+ negative assertions.
+
A name may also be returned after a failed match if the final path
through the pattern involves (*MARK). However, unless (*MARK) is used in
conjunction with (*COMMIT), this is unlikely to happen for an unan-
@@ -5691,8 +5856,8 @@ AUTHOR
REVISION
- Last updated: 21 November 2010
- Copyright (c) 1997-2010 University of Cambridge.
+ Last updated: 24 July 2011
+ Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
@@ -7255,7 +7420,7 @@ PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE
RE_Options object, set the appropriate options, and pass this object to
a RE constructor. Example:
- RE_options opt;
+ RE_Options opt;
opt.set_caseless(true);
if (RE("HELLO", opt).PartialMatch("hello world")) ...
@@ -7380,6 +7545,7 @@ AUTHOR
REVISION
Last updated: 17 March 2009
+ Minor typo fixed: 25 July 2011
------------------------------------------------------------------------------
@@ -7482,7 +7648,9 @@ PCRE DISCUSSION OF STACK USAGE
in order to remember the state of the match so that it can back up and
try a different alternative if the first one fails. As matching pro-
ceeds deeper and deeper into the tree of possibilities, the recursion
- depth increases.
+ depth increases. The match() function is also called in other circum-
+ stances, for example, whenever a parenthesized sub-pattern is entered,
+ and in certain cases of repetition.
Not all calls of match() increase the recursion depth; for an item such
as a* it may be called several times at the same level, after matching
@@ -7614,8 +7782,8 @@ AUTHOR
REVISION
- Last updated: 03 January 2010
- Copyright (c) 1997-2010 University of Cambridge.
+ Last updated: 22 July 2011
+ Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index c481eec..38dce95 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -1548,7 +1548,7 @@ in the main
.\"
page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_exec()\fP returns
the error PCRE_ERROR_BADUTF8 or, if PCRE_PARTIAL_HARD is set and the problem is
-a truncated UTF-8 character at the end of the subject, PCRE_ERROR_SHORTUTF8. In
+a truncated UTF-8 character at the end of the subject, PCRE_ERROR_SHORTUTF8. In
both cases, information about the precise nature of the error may also be
returned (see the descriptions of these errors in the section entitled \fIError
return values from\fP \fBpcre_exec()\fP
@@ -1810,7 +1810,7 @@ PCRE_ERROR_SHORTUTF8 is returned instead of PCRE_ERROR_BADUTF8.
.sp
PCRE_ERROR_BADUTF8_OFFSET (-11)
.sp
-The UTF-8 byte sequence that was passed as a subject was checked and found to
+The UTF-8 byte sequence that was passed as a subject was checked and found to
be valid (the PCRE_NO_UTF8_CHECK option was not set), but the value of
\fIstartoffset\fP did not point to the beginning of a UTF-8 character or the
end of the subject.
@@ -1865,9 +1865,9 @@ retained for backwards compatibility.
.sp
PCRE_ERROR_RECURSELOOP (-26)
.sp
-This error is returned when \fBpcre_exec()\fP detects a recursion loop within
-the pattern. Specifically, it means that either the whole pattern or a
-subpattern has been called recursively for the second time at the same position
+This error is returned when \fBpcre_exec()\fP detects a recursion loop within
+the pattern. Specifically, it means that either the whole pattern or a
+subpattern has been called recursively for the second time at the same position
in the subject string. Some simple patterns that might do this are detected and
faulted at compile time, but more complicated cases, in particular mutual
recursions between two different subpatterns, cannot be detected until run
@@ -1880,10 +1880,10 @@ Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
.SS "Reason codes for invalid UTF-8 strings"
.rs
.sp
-When \fBpcre_exec()\fP returns either PCRE_ERROR_BADUTF8 or
-PCRE_ERROR_SHORTUTF8, and the size of the output vector (\fIovecsize\fP) is at
-least 2, the offset of the start of the invalid UTF-8 character is placed in
-the first output vector element (\fIovector[0]\fP) and a reason code is placed
+When \fBpcre_exec()\fP returns either PCRE_ERROR_BADUTF8 or
+PCRE_ERROR_SHORTUTF8, and the size of the output vector (\fIovecsize\fP) is at
+least 2, the offset of the start of the invalid UTF-8 character is placed in
+the first output vector element (\fIovector[0]\fP) and a reason code is placed
in the second element (\fIovector[1]\fP). The reason codes are given names in
the \fBpcre.h\fP header file:
.sp
@@ -1893,10 +1893,10 @@ the \fBpcre.h\fP header file:
PCRE_UTF8_ERR4
PCRE_UTF8_ERR5
.sp
-The string ends with a truncated UTF-8 character; the code specifies how many
+The string ends with a truncated UTF-8 character; the code specifies how many
bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be
no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279)
-allows for up to 6 bytes, and this is checked first; hence the possibility of
+allows for up to 6 bytes, and this is checked first; hence the possibility of
4 or 5 missing bytes.
.sp
PCRE_UTF8_ERR6
@@ -1905,42 +1905,42 @@ allows for up to 6 bytes, and this is checked first; hence the possibility of
PCRE_UTF8_ERR9
PCRE_UTF8_ERR10
.sp
-The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the
+The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the
character do not have the binary value 0b10 (that is, either the most
significant bit is 0, or the next bit is 1).
-.sp
+.sp
PCRE_UTF8_ERR11
PCRE_UTF8_ERR12
.sp
-A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long;
-these code points are excluded by RFC 3629.
-.sp
+A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long;
+these code points are excluded by RFC 3629.
+.sp
PCRE_UTF8_ERR13
.sp
-A 4-byte character has a value greater than 0x10fff; these code points are
+A 4-byte character has a value greater than 0x10ffff; these code points are
excluded by RFC 3629.
-.sp
+.sp
PCRE_UTF8_ERR14
.sp
A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of
-code points are reserved by RFC 3629 for use with UTF-16, and so are excluded
+code points are reserved by RFC 3629 for use with UTF-16, and so are excluded
from UTF-8.
-.sp
+.sp
PCRE_UTF8_ERR15
PCRE_UTF8_ERR16
PCRE_UTF8_ERR17
PCRE_UTF8_ERR18
PCRE_UTF8_ERR19
.sp
-A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a
-value that can be represented by fewer bytes, which is invalid. For example,
+A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a
+value that can be represented by fewer bytes, which is invalid. For example,
the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just
one byte.
.sp
PCRE_UTF8_ERR20
.sp
-The two most significant bits of the first byte of a character have the binary
-value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a
+The two most significant bits of the first byte of a character have the binary
+value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a
byte can only validly occur as the second or subsequent byte of a multi-byte
character.
.sp
diff --git a/doc/pcrebuild.3 b/doc/pcrebuild.3
index 3f907b0..2bc4494 100644
--- a/doc/pcrebuild.3
+++ b/doc/pcrebuild.3
@@ -32,6 +32,18 @@ The following sections include descriptions of options whose names begin with
exists as well, but as it specifies the default, it is not described.
.
.
+.SH "BUILDING SHARED AND STATIC LIBRARIES"
+.rs
+.sp
+The PCRE building process uses \fBlibtool\fP to build both shared and static
+Unix libraries by default. You can suppress one of these by adding one of
+.sp
+ --disable-shared
+ --disable-static
+.sp
+to the \fBconfigure\fP command, as required.
+.
+.
.SH "C++ SUPPORT"
.rs
.sp
@@ -132,18 +144,6 @@ selected when PCRE is built can be overridden when the library functions are
called.
.
.
-.SH "BUILDING SHARED AND STATIC LIBRARIES"
-.rs
-.sp
-The PCRE building process uses \fBlibtool\fP to build both shared and static
-Unix libraries by default. You can suppress one of these by adding one of
-.sp
- --disable-shared
- --disable-static
-.sp
-to the \fBconfigure\fP command, as required.
-.
-.
.SH "POSIX MALLOC USAGE"
.rs
.sp
@@ -298,6 +298,23 @@ relevant libraries are installed on your system. Configuration will fail if
they are not.
.
.
+.SH "PCREGREP BUFFER SIZE"
+.rs
+.sp
+\fBpcregrep\fP uses an internal buffer to hold a "window" on the file it is
+scanning, in order to be able to output "before" and "after" lines when it
+finds a match. The size of the buffer is controlled by a parameter whose
+default value is 20K. The buffer itself is three times this size, but because
+of the way it is used for holding "before" lines, the longest line that is
+guaranteed to be processable is the parameter size. You can change the default
+parameter value by adding, for example,
+.sp
+ --with-pcregrep-bufsize=50K
+.sp
+to the \fBconfigure\fP command. The caller of \fBpcregrep\fP can, however,
+override this value by specifying a run-time option.
+.
+.
.SH "PCRETEST OPTION FOR LIBREADLINE SUPPORT"
.rs
.sp
@@ -350,6 +367,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 29 September 2009
-Copyright (c) 1997-2009 University of Cambridge.
+Last updated: 02 August 2011
+Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcrecallout.3 b/doc/pcrecallout.3
index 6910998..4b3b172 100644
--- a/doc/pcrecallout.3
+++ b/doc/pcrecallout.3
@@ -87,7 +87,7 @@ block. This structure contains the following fields:
void *\fIcallout_data\fP;
int \fIpattern_position\fP;
int \fInext_item_length\fP;
- const unsigned char *\fImark\fP;
+ const unsigned char *\fImark\fP;
.sp
The \fIversion\fP field is an integer containing the version number of the
block format. The initial version was 0; the current version is 2. The version
@@ -154,8 +154,8 @@ The \fIpattern_position\fP and \fInext_item_length\fP fields are intended to
help in distinguishing between different automatic callouts, which all have the
same callout number. However, they are set for all callouts.
.P
-The \fImark\fP field is present from version 2 of the \fIpcre_callout\fP
-structure. In callouts from \fBpcre_exec()\fP it contains a pointer to the
+The \fImark\fP field is present from version 2 of the \fIpcre_callout\fP
+structure. In callouts from \fBpcre_exec()\fP it contains a pointer to the
zero-terminated name of the most recently passed (*MARK) item in the match, or
NULL if there are no (*MARK)s in the current matching path. In callouts from
\fBpcre_dfa_exec()\fP this field always contains NULL.
diff --git a/doc/pcrecompat.3 b/doc/pcrecompat.3
index 97e598c..37ff217 100644
--- a/doc/pcrecompat.3
+++ b/doc/pcrecompat.3
@@ -23,8 +23,8 @@ page.
2. PCRE allows repeat quantifiers only on parenthesized assertions, but they do
not mean what you might think. For example, (?!a){3} does not assert that the
next three characters are not "a". It just asserts that the next character is
-not "a" three times (in principle: PCRE optimizes this to run the assertion
-just once). Perl allows repeat quantifiers on other assertions such as \b, but
+not "a" three times (in principle: PCRE optimizes this to run the assertion
+just once). Perl allows repeat quantifiers on other assertions such as \eb, but
these do not seem to have any use.
.P
3. Capturing subpatterns that occur inside negative lookahead assertions are
@@ -39,7 +39,7 @@ terminated by zero. The escape sequence \e0 can be used in the pattern to
represent a binary zero.
.P
5. The following Perl escape sequences are not supported: \el, \eu, \eL,
-\eU, and \eN when followed by a character name or Unicode value. (\eN on its
+\eU, and \eN when followed by a character name or Unicode value. (\eN on its
own, matching a non-newline character, is supported.) In fact these are
implemented by Perl's general string-handling and are not part of its pattern
matching engine. If any of these are encountered by PCRE, an error is
@@ -55,7 +55,7 @@ the internal representation of Unicode characters, there is no need to
implement the somewhat messy concept of surrogates."
.P
7. PCRE implements a simpler version of \eX than Perl, which changed to make
-\eX match what Unicode calls an "extended grapheme cluster". This is more
+\eX match what Unicode calls an "extended grapheme cluster". This is more
complicated than an extended Unicode sequence, which is what PCRE matches.
.P
8. PCRE does support the \eQ...\eE escape for quoting substrings. Characters in
@@ -112,8 +112,8 @@ names map to capturing subpattern number 1. To avoid this confusing situation,
an error is given at compile time.
.P
13. Perl recognizes comments in some places that PCRE does not, for example,
-between the ( and ? at the start of a subpattern. If the /x modifier is set,
-Perl allows whitespace between ( and ? but PCRE never does, even if the
+between the ( and ? at the start of a subpattern. If the /x modifier is set,
+Perl allows whitespace between ( and ? but PCRE never does, even if the
PCRE_EXTENDED option is set.
.P
14. PCRE provides some extensions to the Perl regular expression facilities.
diff --git a/doc/pcregrep.1 b/doc/pcregrep.1
index 317c103..514e94f 100644
--- a/doc/pcregrep.1
+++ b/doc/pcregrep.1
@@ -46,11 +46,11 @@ change how \fBpcregrep\fP behaves. In particular, the \fB-M\fP option makes it
possible to search for patterns that span line boundaries. What defines a line
boundary is controlled by the \fB-N\fP (\fB--newline\fP) option.
.P
-The amount of memory used for buffering files that are being scanned is
+The amount of memory used for buffering files that are being scanned is
controlled by a parameter that can be set by the \fB--buffer-size\fP option.
-The default value for this parameter is specified when \fBpcregrep\fP is built,
-with the default default being 20K. A block of memory three times this size is
-used (to allow for buffering "before" and "after" lines). An error occurs if a
+The default value for this parameter is specified when \fBpcregrep\fP is built,
+with the default default being 20K. A block of memory three times this size is
+used (to allow for buffering "before" and "after" lines). An error occurs if a
line overflows the buffer.
.P
Patterns are limited to 8K or BUFSIZ bytes, whichever is the greater. BUFSIZ is
@@ -100,7 +100,7 @@ standard input is always so treated.
The order in which some of the options appear can affect the output. For
example, both the \fB-h\fP and \fB-l\fP options affect the printing of file
names. Whichever comes later in the command line will be the one that takes
-effect. Numerical values for options may be followed by K or M, to signify
+effect. Numerical values for options may be followed by K or M, to signify
multiplication by 1024 or 1024*1024 respectively.
.TP 10
\fB--\fP
diff --git a/doc/pcregrep.txt b/doc/pcregrep.txt
index 1881b42..d1c9499 100644
--- a/doc/pcregrep.txt
+++ b/doc/pcregrep.txt
@@ -49,19 +49,26 @@ DESCRIPTION
What defines a line boundary is controlled by the -N (--newline)
option.
- Patterns are limited to 8K or BUFSIZ characters, whichever is the
- greater. BUFSIZ is defined in <stdio.h>. When there is more than one
- pattern (specified by the use of -e and/or -f), each pattern is applied
- to each line in the order in which they are defined, except that all
- the -e patterns are tried before the -f patterns.
-
- By default, as soon as one pattern matches (or fails to match when -v
- is used), no further patterns are considered. However, if --colour (or
+ The amount of memory used for buffering files that are being scanned is
+ controlled by a parameter that can be set by the --buffer-size option.
+ The default value for this parameter is specified when pcregrep is
+ built, with the default default being 20K. A block of memory three
+ times this size is used (to allow for buffering "before" and "after"
+ lines). An error occurs if a line overflows the buffer.
+
+ Patterns are limited to 8K or BUFSIZ bytes, whichever is the greater.
+ BUFSIZ is defined in <stdio.h>. When there is more than one pattern
+ (specified by the use of -e and/or -f), each pattern is applied to each
+ line in the order in which they are defined, except that all the -e
+ patterns are tried before the -f patterns.
+
+ By default, as soon as one pattern matches (or fails to match when -v
+ is used), no further patterns are considered. However, if --colour (or
--color) is used to colour the matching substrings, or if --only-match-
- ing, --file-offsets, or --line-offsets is used to output only the part
- of the line that matched (either shown literally, or as an offset),
- scanning resumes immediately following the match, so that further
- matches on the same line can be found. If there are multiple patterns,
+ ing, --file-offsets, or --line-offsets is used to output only the part
+ of the line that matched (either shown literally, or as an offset),
+ scanning resumes immediately following the match, so that further
+ matches on the same line can be found. If there are multiple patterns,
they are all tried on the remainder of the line, but patterns that fol-
low the one that matched are not tried on the earlier part of the line.
@@ -69,36 +76,37 @@ DESCRIPTION
in which multiple patterns are specified can affect the output when one
of the above options is used.
- Patterns that can match an empty string are accepted, but empty string
+ Patterns that can match an empty string are accepted, but empty string
matches are never recognized. An example is the pattern
- "(super)?(man)?", in which all components are optional. This pattern
- finds all occurrences of both "super" and "man"; the output differs
- from matching with "super|man" when only the matching substrings are
+ "(super)?(man)?", in which all components are optional. This pattern
+ finds all occurrences of both "super" and "man"; the output differs
+ from matching with "super|man" when only the matching substrings are
being shown.
- If the LC_ALL or LC_CTYPE environment variable is set, pcregrep uses
- the value to set a locale when calling the PCRE library. The --locale
+ If the LC_ALL or LC_CTYPE environment variable is set, pcregrep uses
+ the value to set a locale when calling the PCRE library. The --locale
option can be used to override this.
SUPPORT FOR COMPRESSED FILES
- It is possible to compile pcregrep so that it uses libz or libbz2 to
- read files whose names end in .gz or .bz2, respectively. You can find
+ It is possible to compile pcregrep so that it uses libz or libbz2 to
+ read files whose names end in .gz or .bz2, respectively. You can find
out whether your binary has support for one or both of these file types
by running it with the --help option. If the appropriate support is not
- present, files are treated as plain text. The standard input is always
+ present, files are treated as plain text. The standard input is always
so treated.
OPTIONS
- The order in which some of the options appear can affect the output.
- For example, both the -h and -l options affect the printing of file
- names. Whichever comes later in the command line will be the one that
- takes effect.
+ The order in which some of the options appear can affect the output.
+ For example, both the -h and -l options affect the printing of file
+ names. Whichever comes later in the command line will be the one that
+ takes effect. Numerical values for options may be followed by K or M,
+ to signify multiplication by 1024 or 1024*1024 respectively.
- -- This terminate the list of options. It is useful if the next
+ -- This terminates the list of options. It is useful if the next
item on the command line starts with a hyphen but is not an
option. This allows for the processing of patterns and file-
names that start with hyphens.
@@ -123,151 +131,155 @@ OPTIONS
pcregrep guarantees to have up to 8K of preceding text avail-
able for context output.
+ --buffer-size=number
+ Set the parameter that controls how much memory is used for
+ buffering files that are being scanned.
+
-C number, --context=number
- Output number lines of context both before and after each
- matching line. This is equivalent to setting both -A and -B
+ Output number lines of context both before and after each
+ matching line. This is equivalent to setting both -A and -B
to the same value.
-c, --count
- Do not output individual lines from the files that are being
+ Do not output individual lines from the files that are being
scanned; instead output the number of lines that would other-
- wise have been shown. If no lines are selected, the number
- zero is output. If several files are are being scanned, a
- count is output for each of them. However, if the --files-
- with-matches option is also used, only those files whose
+ wise have been shown. If no lines are selected, the number
+ zero is output. If several files are being scanned, a
+ count is output for each of them. However, if the --files-
+ with-matches option is also used, only those files whose
counts are greater than zero are listed. When -c is used, the
-A, -B, and -C options are ignored.
--colour, --color
If this option is given without any data, it is equivalent to
- "--colour=auto". If data is required, it must be given in
+ "--colour=auto". If data is required, it must be given in
the same shell item, separated by an equals sign.
--colour=value, --color=value
This option specifies under what circumstances the parts of a
line that matched a pattern should be coloured in the output.
- By default, the output is not coloured. The value (which is
- optional, see above) may be "never", "always", or "auto". In
- the latter case, colouring happens only if the standard out-
- put is connected to a terminal. More resources are used when
- colouring is enabled, because pcregrep has to search for all
- possible matches in a line, not just one, in order to colour
+ By default, the output is not coloured. The value (which is
+ optional, see above) may be "never", "always", or "auto". In
+ the latter case, colouring happens only if the standard out-
+ put is connected to a terminal. More resources are used when
+ colouring is enabled, because pcregrep has to search for all
+ possible matches in a line, not just one, in order to colour
them all.
The colour that is used can be specified by setting the envi-
ronment variable PCREGREP_COLOUR or PCREGREP_COLOR. The value
of this variable should be a string of two numbers, separated
- by a semicolon. They are copied directly into the control
- string for setting colour on a terminal, so it is your
- responsibility to ensure that they make sense. If neither of
- the environment variables is set, the default is "1;31",
+ by a semicolon. They are copied directly into the control
+ string for setting colour on a terminal, so it is your
+ responsibility to ensure that they make sense. If neither of
+ the environment variables is set, the default is "1;31",
which gives red.
-D action, --devices=action
- If an input path is not a regular file or a directory,
- "action" specifies how it is to be processed. Valid values
+ If an input path is not a regular file or a directory,
+ "action" specifies how it is to be processed. Valid values
are "read" (the default) or "skip" (silently skip the path).
-d action, --directories=action
If an input path is a directory, "action" specifies how it is
- to be processed. Valid values are "read" (the default),
- "recurse" (equivalent to the -r option), or "skip" (silently
- skip the path). In the default case, directories are read as
- if they were ordinary files. In some operating systems the
- effect of reading a directory like this is an immediate end-
+ to be processed. Valid values are "read" (the default),
+ "recurse" (equivalent to the -r option), or "skip" (silently
+ skip the path). In the default case, directories are read as
+ if they were ordinary files. In some operating systems the
+ effect of reading a directory like this is an immediate end-
of-file.
-e pattern, --regex=pattern, --regexp=pattern
Specify a pattern to be matched. This option can be used mul-
tiple times in order to specify several patterns. It can also
- be used as a way of specifying a single pattern that starts
- with a hyphen. When -e is used, no argument pattern is taken
- from the command line; all arguments are treated as file
- names. There is an overall maximum of 100 patterns. They are
- applied to each line in the order in which they are defined
+ be used as a way of specifying a single pattern that starts
+ with a hyphen. When -e is used, no argument pattern is taken
+ from the command line; all arguments are treated as file
+ names. There is an overall maximum of 100 patterns. They are
+ applied to each line in the order in which they are defined
until one matches (or fails to match if -v is used). If -f is
- used with -e, the command line patterns are matched first,
- followed by the patterns from the file, independent of the
- order in which these options are specified. Note that multi-
+ used with -e, the command line patterns are matched first,
+ followed by the patterns from the file, independent of the
+ order in which these options are specified. Note that multi-
ple use of -e is not the same as a single pattern with alter-
natives. For example, X|Y finds the first character in a line
- that is X or Y, whereas if the two patterns are given sepa-
+ that is X or Y, whereas if the two patterns are given sepa-
rately, pcregrep finds X if it is present, even if it follows
- Y in the line. It finds Y only if there is no X in the line.
- This really matters only if you are using -o to show the
+ Y in the line. It finds Y only if there is no X in the line.
+ This really matters only if you are using -o to show the
part(s) of the line that matched.
--exclude=pattern
When pcregrep is searching the files in a directory as a con-
- sequence of the -r (recursive search) option, any regular
+ sequence of the -r (recursive search) option, any regular
files whose names match the pattern are excluded. Subdirecto-
- ries are not excluded by this option; they are searched
- recursively, subject to the --exclude-dir and --include_dir
- options. The pattern is a PCRE regular expression, and is
+ ries are not excluded by this option; they are searched
+ recursively, subject to the --exclude-dir and --include-dir
+ options. The pattern is a PCRE regular expression, and is
matched against the final component of the file name (not the
- entire path). If a file name matches both --include and
- --exclude, it is excluded. There is no short form for this
+ entire path). If a file name matches both --include and
+ --exclude, it is excluded. There is no short form for this
option.
--exclude-dir=pattern
- When pcregrep is searching the contents of a directory as a
- consequence of the -r (recursive search) option, any subdi-
- rectories whose names match the pattern are excluded. (Note
- that the --exclude option does not affect subdirectories.)
- The pattern is a PCRE regular expression, and is matched
- against the final component of the name (not the entire
- path). If a subdirectory name matches both --include-dir and
- --exclude-dir, it is excluded. There is no short form for
+ When pcregrep is searching the contents of a directory as a
+ consequence of the -r (recursive search) option, any subdi-
+ rectories whose names match the pattern are excluded. (Note
+ that the --exclude option does not affect subdirectories.)
+ The pattern is a PCRE regular expression, and is matched
+ against the final component of the name (not the entire
+ path). If a subdirectory name matches both --include-dir and
+ --exclude-dir, it is excluded. There is no short form for
this option.
-F, --fixed-strings
- Interpret each pattern as a list of fixed strings, separated
- by newlines, instead of as a regular expression. The -w
- (match as a word) and -x (match whole line) options can be
+ Interpret each pattern as a list of fixed strings, separated
+ by newlines, instead of as a regular expression. The -w
+ (match as a word) and -x (match whole line) options can be
used with -F. They apply to each of the fixed strings. A line
is selected if any of the fixed strings are found in it (sub-
ject to -w or -x, if present).
-f filename, --file=filename
- Read a number of patterns from the file, one per line, and
- match them against each line of input. A data line is output
+ Read a number of patterns from the file, one per line, and
+ match them against each line of input. A data line is output
if any of the patterns match it. The filename can be given as
"-" to refer to the standard input. When -f is used, patterns
- specified on the command line using -e may also be present;
+ specified on the command line using -e may also be present;
they are tested before the file's patterns. However, no other
- pattern is taken from the command line; all arguments are
- treated as file names. There is an overall maximum of 100
+ pattern is taken from the command line; all arguments are
+ treated as file names. There is an overall maximum of 100
patterns. Trailing white space is removed from each line, and
- blank lines are ignored. An empty file contains no patterns
- and therefore matches nothing. See also the comments about
- multiple patterns versus a single pattern with alternatives
+ blank lines are ignored. An empty file contains no patterns
+ and therefore matches nothing. See also the comments about
+ multiple patterns versus a single pattern with alternatives
in the description of -e above.
--file-offsets
- Instead of showing lines or parts of lines that match, show
- each match as an offset from the start of the file and a
- length, separated by a comma. In this mode, no context is
- shown. That is, the -A, -B, and -C options are ignored. If
+ Instead of showing lines or parts of lines that match, show
+ each match as an offset from the start of the file and a
+ length, separated by a comma. In this mode, no context is
+ shown. That is, the -A, -B, and -C options are ignored. If
there is more than one match in a line, each of them is shown
- separately. This option is mutually exclusive with --line-
+ separately. This option is mutually exclusive with --line-
offsets and --only-matching.
-H, --with-filename
- Force the inclusion of the filename at the start of output
- lines when searching a single file. By default, the filename
- is not shown in this case. For matching lines, the filename
+ Force the inclusion of the filename at the start of output
+ lines when searching a single file. By default, the filename
+ is not shown in this case. For matching lines, the filename
is followed by a colon; for context lines, a hyphen separator
- is used. If a line number is also being output, it follows
+ is used. If a line number is also being output, it follows
the file name.
-h, --no-filename
- Suppress the output filenames when searching multiple files.
- By default, filenames are shown when multiple files are
- searched. For matching lines, the filename is followed by a
- colon; for context lines, a hyphen separator is used. If a
+ Suppress the output filenames when searching multiple files.
+ By default, filenames are shown when multiple files are
+ searched. For matching lines, the filename is followed by a
+ colon; for context lines, a hyphen separator is used. If a
line number is also being output, it follows the file name.
- --help Output a help message, giving brief details of the command
+ --help Output a help message, giving brief details of the command
options and file type support, and then exit.
-i, --ignore-case
@@ -277,38 +289,38 @@ OPTIONS
When pcregrep is searching the files in a directory as a con-
sequence of the -r (recursive search) option, only those reg-
ular files whose names match the pattern are included. Subdi-
- rectories are always included and searched recursively, sub-
+ rectories are always included and searched recursively, sub-
ject to the --include-dir and --exclude-dir options. The pat-
tern is a PCRE regular expression, and is matched against the
- final component of the file name (not the entire path). If a
+ final component of the file name (not the entire path). If a
file name matches both --include and --exclude, it is
excluded. There is no short form for this option.
--include-dir=pattern
- When pcregrep is searching the contents of a directory as a
- consequence of the -r (recursive search) option, only those
- subdirectories whose names match the pattern are included.
- (Note that the --include option does not affect subdirecto-
- ries.) The pattern is a PCRE regular expression, and is
- matched against the final component of the name (not the
- entire path). If a subdirectory name matches both --include-
+ When pcregrep is searching the contents of a directory as a
+ consequence of the -r (recursive search) option, only those
+ subdirectories whose names match the pattern are included.
+ (Note that the --include option does not affect subdirecto-
+ ries.) The pattern is a PCRE regular expression, and is
+ matched against the final component of the name (not the
+ entire path). If a subdirectory name matches both --include-
dir and --exclude-dir, it is excluded. There is no short form
for this option.
-L, --files-without-match
- Instead of outputting lines from the files, just output the
- names of the files that do not contain any lines that would
- have been output. Each file name is output once, on a sepa-
+ Instead of outputting lines from the files, just output the
+ names of the files that do not contain any lines that would
+ have been output. Each file name is output once, on a sepa-
rate line.
-l, --files-with-matches
- Instead of outputting lines from the files, just output the
+ Instead of outputting lines from the files, just output the
names of the files containing lines that would have been out-
- put. Each file name is output once, on a separate line.
- Searching normally stops as soon as a matching line is found
- in a file. However, if the -c (count) option is also used,
- matching continues in order to obtain the correct count, and
- those files that have at least one match are listed along
+ put. Each file name is output once, on a separate line.
+ Searching normally stops as soon as a matching line is found
+ in a file. However, if the -c (count) option is also used,
+ matching continues in order to obtain the correct count, and
+ those files that have at least one match are listed along
with their counts. Using this option with -c is a way of sup-
pressing the listing of files with no matches.
@@ -318,173 +330,173 @@ OPTIONS
input)" is used. There is no short form for this option.
--line-buffered
- When this option is given, input is read and processed line
- by line, and the output is flushed after each write. By
- default, input is read in large chunks, unless pcregrep can
- determine that it is reading from a terminal (which is cur-
- rently possible only in Unix environments). Output to termi-
- nal is normally automatically flushed by the operating sys-
- tem. This option can be useful when the input or output is
- attached to a pipe and you do not want pcregrep to buffer up
- large amounts of data. However, its use will affect perfor-
+ When this option is given, input is read and processed line
+ by line, and the output is flushed after each write. By
+ default, input is read in large chunks, unless pcregrep can
+ determine that it is reading from a terminal (which is cur-
+ rently possible only in Unix environments). Output to termi-
+ nal is normally automatically flushed by the operating sys-
+ tem. This option can be useful when the input or output is
+ attached to a pipe and you do not want pcregrep to buffer up
+ large amounts of data. However, its use will affect perfor-
mance, and the -M (multiline) option ceases to work.
--line-offsets
- Instead of showing lines or parts of lines that match, show
+ Instead of showing lines or parts of lines that match, show
each match as a line number, the offset from the start of the
- line, and a length. The line number is terminated by a colon
- (as usual; see the -n option), and the offset and length are
- separated by a comma. In this mode, no context is shown.
- That is, the -A, -B, and -C options are ignored. If there is
- more than one match in a line, each of them is shown sepa-
+ line, and a length. The line number is terminated by a colon
+ (as usual; see the -n option), and the offset and length are
+ separated by a comma. In this mode, no context is shown.
+ That is, the -A, -B, and -C options are ignored. If there is
+ more than one match in a line, each of them is shown sepa-
rately. This option is mutually exclusive with --file-offsets
and --only-matching.
--locale=locale-name
- This option specifies a locale to be used for pattern match-
- ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
- ronment variables. If no locale is specified, the PCRE
- library's default (usually the "C" locale) is used. There is
+ This option specifies a locale to be used for pattern match-
+ ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
+ ronment variables. If no locale is specified, the PCRE
+ library's default (usually the "C" locale) is used. There is
no short form for this option.
--match-limit=number
- Processing some regular expression patterns can require a
- very large amount of memory, leading in some cases to a pro-
- gram crash if not enough is available. Other patterns may
- take a very long time to search for all possible matching
- strings. The pcre_exec() function that is called by pcregrep
- to do the matching has two parameters that can limit the
+ Processing some regular expression patterns can require a
+ very large amount of memory, leading in some cases to a pro-
+ gram crash if not enough is available. Other patterns may
+ take a very long time to search for all possible matching
+ strings. The pcre_exec() function that is called by pcregrep
+ to do the matching has two parameters that can limit the
resources that it uses.
- The --match-limit option provides a means of limiting
+ The --match-limit option provides a means of limiting
resource usage when processing patterns that are not going to
match, but which have a very large number of possibilities in
- their search trees. The classic example is a pattern that
- uses nested unlimited repeats. Internally, PCRE uses a func-
- tion called match() which it calls repeatedly (sometimes
- recursively). The limit set by --match-limit is imposed on
- the number of times this function is called during a match,
- which has the effect of limiting the amount of backtracking
+ their search trees. The classic example is a pattern that
+ uses nested unlimited repeats. Internally, PCRE uses a func-
+ tion called match() which it calls repeatedly (sometimes
+ recursively). The limit set by --match-limit is imposed on
+ the number of times this function is called during a match,
+ which has the effect of limiting the amount of backtracking
that can take place.
The --recursion-limit option is similar to --match-limit, but
instead of limiting the total number of times that match() is
called, it limits the depth of recursive calls, which in turn
- limits the amount of memory that can be used. The recursion
- depth is a smaller number than the total number of calls,
+ limits the amount of memory that can be used. The recursion
+ depth is a smaller number than the total number of calls,
because not all calls to match() are recursive. This limit is
of use only if it is set smaller than --match-limit.
- There are no short forms for these options. The default set-
- tings are specified when the PCRE library is compiled, with
+ There are no short forms for these options. The default set-
+ tings are specified when the PCRE library is compiled, with
the default default being 10 million.
-M, --multiline
- Allow patterns to match more than one line. When this option
+ Allow patterns to match more than one line. When this option
is given, patterns may usefully contain literal newline char-
- acters and internal occurrences of ^ and $ characters. The
- output for a successful match may consist of more than one
- line, the last of which is the one in which the match ended.
+ acters and internal occurrences of ^ and $ characters. The
+ output for a successful match may consist of more than one
+ line, the last of which is the one in which the match ended.
If the matched string ends with a newline sequence the output
ends at the end of that line.
- When this option is set, the PCRE library is called in "mul-
- tiline" mode. There is a limit to the number of lines that
- can be matched, imposed by the way that pcregrep buffers the
- input file as it scans it. However, pcregrep ensures that at
+ When this option is set, the PCRE library is called in "mul-
+ tiline" mode. There is a limit to the number of lines that
+ can be matched, imposed by the way that pcregrep buffers the
+ input file as it scans it. However, pcregrep ensures that at
least 8K characters or the rest of the document (whichever is
- the shorter) are available for forward matching, and simi-
+ the shorter) are available for forward matching, and simi-
larly the previous 8K characters (or all the previous charac-
- ters, if fewer than 8K) are guaranteed to be available for
- lookbehind assertions. This option does not work when input
+ ters, if fewer than 8K) are guaranteed to be available for
+ lookbehind assertions. This option does not work when input
is read line by line (see --line-buffered.)
-N newline-type, --newline=newline-type
- The PCRE library supports five different conventions for
- indicating the ends of lines. They are the single-character
- sequences CR (carriage return) and LF (linefeed), the two-
- character sequence CRLF, an "anycrlf" convention, which rec-
- ognizes any of the preceding three types, and an "any" con-
+ The PCRE library supports five different conventions for
+ indicating the ends of lines. They are the single-character
+ sequences CR (carriage return) and LF (linefeed), the two-
+ character sequence CRLF, an "anycrlf" convention, which rec-
+ ognizes any of the preceding three types, and an "any" con-
vention, in which any Unicode line ending sequence is assumed
- to end a line. The Unicode sequences are the three just men-
- tioned, plus VT (vertical tab, U+000B), FF (formfeed,
- U+000C), NEL (next line, U+0085), LS (line separator,
+ to end a line. The Unicode sequences are the three just men-
+ tioned, plus VT (vertical tab, U+000B), FF (form feed,
+ U+000C), NEL (next line, U+0085), LS (line separator,
U+2028), and PS (paragraph separator, U+2029).
When the PCRE library is built, a default line-ending
- sequence is specified. This is normally the standard
+ sequence is specified. This is normally the standard
sequence for the operating system. Unless otherwise specified
- by this option, pcregrep uses the library's default. The
+ by this option, pcregrep uses the library's default. The
possible values for this option are CR, LF, CRLF, ANYCRLF, or
- ANY. This makes it possible to use pcregrep on files that
- have come from other environments without having to modify
- their line endings. If the data that is being scanned does
- not agree with the convention set by this option, pcregrep
+ ANY. This makes it possible to use pcregrep on files that
+ have come from other environments without having to modify
+ their line endings. If the data that is being scanned does
+ not agree with the convention set by this option, pcregrep
may behave in strange ways.
-n, --line-number
Precede each output line by its line number in the file, fol-
- lowed by a colon for matching lines or a hyphen for context
- lines. If the filename is also being output, it precedes the
+ lowed by a colon for matching lines or a hyphen for context
+ lines. If the filename is also being output, it precedes the
line number. This option is forced if --line-offsets is used.
-o, --only-matching
Show only the part of the line that matched a pattern instead
- of the whole line. In this mode, no context is shown. That
- is, the -A, -B, and -C options are ignored. If there is more
- than one match in a line, each of them is shown separately.
- If -o is combined with -v (invert the sense of the match to
- find non-matching lines), no output is generated, but the
- return code is set appropriately. If the matched portion of
- the line is empty, nothing is output unless the file name or
- line number are being printed, in which case they are shown
+ of the whole line. In this mode, no context is shown. That
+ is, the -A, -B, and -C options are ignored. If there is more
+ than one match in a line, each of them is shown separately.
+ If -o is combined with -v (invert the sense of the match to
+ find non-matching lines), no output is generated, but the
+ return code is set appropriately. If the matched portion of
+ the line is empty, nothing is output unless the file name or
+ line number are being printed, in which case they are shown
on an otherwise empty line. This option is mutually exclusive
with --file-offsets and --line-offsets.
-onumber, --only-matching=number
- Show only the part of the line that matched the capturing
+ Show only the part of the line that matched the capturing
parentheses of the given number. Up to 32 capturing parenthe-
ses are supported. Because these options can be given without
- an argument (see above), if an argument is present, it must
- be given in the same shell item, for example, -o3 or --only-
- matching=2. The comments given for the non-argument case
- above also apply to this case. If the specified capturing
- parentheses do not exist in the pattern, or were not set in
- the match, nothing is output unless the file name or line
+ an argument (see above), if an argument is present, it must
+ be given in the same shell item, for example, -o3 or --only-
+ matching=2. The comments given for the non-argument case
+ above also apply to this case. If the specified capturing
+ parentheses do not exist in the pattern, or were not set in
+ the match, nothing is output unless the file name or line
number are being printed.
-q, --quiet
Work quietly, that is, display nothing except error messages.
- The exit status indicates whether or not any matches were
+ The exit status indicates whether or not any matches were
found.
-r, --recursive
- If any given path is a directory, recursively scan the files
- it contains, taking note of any --include and --exclude set-
- tings. By default, a directory is read as a normal file; in
- some operating systems this gives an immediate end-of-file.
- This option is a shorthand for setting the -d option to
+ If any given path is a directory, recursively scan the files
+ it contains, taking note of any --include and --exclude set-
+ tings. By default, a directory is read as a normal file; in
+ some operating systems this gives an immediate end-of-file.
+ This option is a shorthand for setting the -d option to
"recurse".
--recursion-limit=number
See --match-limit above.
-s, --no-messages
- Suppress error messages about non-existent or unreadable
- files. Such files are quietly skipped. However, the return
+ Suppress error messages about non-existent or unreadable
+ files. Such files are quietly skipped. However, the return
code is still 2, even if matches were found in other files.
-u, --utf-8
- Operate in UTF-8 mode. This option is available only if PCRE
- has been compiled with UTF-8 support. Both patterns and sub-
+ Operate in UTF-8 mode. This option is available only if PCRE
+ has been compiled with UTF-8 support. Both patterns and sub-
ject lines must be valid strings of UTF-8 characters.
-V, --version
- Write the version numbers of pcregrep and the PCRE library
+ Write the version numbers of pcregrep and the PCRE library
that is being used to the standard error stream.
-v, --invert-match
- Invert the sense of the match, so that lines which do not
+ Invert the sense of the match, so that lines which do not
match any of the patterns are the ones that are found.
-w, --word-regex, --word-regexp
@@ -492,105 +504,105 @@ OPTIONS
lent to having \b at the start and end of the pattern.
-x, --line-regex, --line-regexp
- Force the patterns to be anchored (each must start matching
- at the beginning of a line) and in addition, require them to
- match entire lines. This is equivalent to having ^ and $
+ Force the patterns to be anchored (each must start matching
+ at the beginning of a line) and in addition, require them to
+ match entire lines. This is equivalent to having ^ and $
characters at the start and end of each alternative branch in
every pattern.
ENVIRONMENT VARIABLES
- The environment variables LC_ALL and LC_CTYPE are examined, in that
- order, for a locale. The first one that is set is used. This can be
- overridden by the --locale option. If no locale is set, the PCRE
+ The environment variables LC_ALL and LC_CTYPE are examined, in that
+ order, for a locale. The first one that is set is used. This can be
+ overridden by the --locale option. If no locale is set, the PCRE
library's default (usually the "C" locale) is used.
NEWLINES
- The -N (--newline) option allows pcregrep to scan files with different
- newline conventions from the default. However, the setting of this
- option does not affect the way in which pcregrep writes information to
- the standard error and output streams. It uses the string "\n" in C
- printf() calls to indicate newlines, relying on the C I/O library to
- convert this to an appropriate sequence if the output is sent to a
+ The -N (--newline) option allows pcregrep to scan files with different
+ newline conventions from the default. However, the setting of this
+ option does not affect the way in which pcregrep writes information to
+ the standard error and output streams. It uses the string "\n" in C
+ printf() calls to indicate newlines, relying on the C I/O library to
+ convert this to an appropriate sequence if the output is sent to a
file.
OPTIONS COMPATIBILITY
- Many of the short and long forms of pcregrep's options are the same as
- in the GNU grep program (version 2.5.4). Any long option of the form
- --xxx-regexp (GNU terminology) is also available as --xxx-regex (PCRE
- terminology). However, the --file-offsets, --include-dir, --line-off-
+ Many of the short and long forms of pcregrep's options are the same as
+ in the GNU grep program (version 2.5.4). Any long option of the form
+ --xxx-regexp (GNU terminology) is also available as --xxx-regex (PCRE
+ terminology). However, the --file-offsets, --include-dir, --line-off-
sets, --locale, --match-limit, -M, --multiline, -N, --newline, --recur-
sion-limit, -u, and --utf-8 options are specific to pcregrep, as is the
use of the --only-matching option with a capturing parentheses number.
- Although most of the common options work the same way, a few are dif-
- ferent in pcregrep. For example, the --include option's argument is a
- glob for GNU grep, but a regular expression for pcregrep. If both the
- -c and -l options are given, GNU grep lists only file names, without
+ Although most of the common options work the same way, a few are dif-
+ ferent in pcregrep. For example, the --include option's argument is a
+ glob for GNU grep, but a regular expression for pcregrep. If both the
+ -c and -l options are given, GNU grep lists only file names, without
counts, but pcregrep gives the counts.
OPTIONS WITH DATA
There are four different ways in which an option with data can be spec-
- ified. If a short form option is used, the data may follow immedi-
+ ified. If a short form option is used, the data may follow immedi-
ately, or (with one exception) in the next command line item. For exam-
ple:
-f/some/file
-f /some/file
- The exception is the -o option, which may appear with or without data.
- Because of this, if data is present, it must follow immediately in the
+ The exception is the -o option, which may appear with or without data.
+ Because of this, if data is present, it must follow immediately in the
same item, for example -o3.
- If a long form option is used, the data may appear in the same command
- line item, separated by an equals character, or (with two exceptions)
+ If a long form option is used, the data may appear in the same command
+ line item, separated by an equals character, or (with two exceptions)
it may appear in the next command line item. For example:
--file=/some/file
--file /some/file
- Note, however, that if you want to supply a file name beginning with ~
- as data in a shell command, and have the shell expand ~ to a home
+ Note, however, that if you want to supply a file name beginning with ~
+ as data in a shell command, and have the shell expand ~ to a home
directory, you must separate the file name from the option, because the
shell does not treat ~ specially unless it is at the start of an item.
- The exceptions to the above are the --colour (or --color) and --only-
- matching options, for which the data is optional. If one of these
- options does have data, it must be given in the first form, using an
+ The exceptions to the above are the --colour (or --color) and --only-
+ matching options, for which the data is optional. If one of these
+ options does have data, it must be given in the first form, using an
equals character. Otherwise pcregrep will assume that it has no data.
MATCHING ERRORS
- It is possible to supply a regular expression that takes a very long
- time to fail to match certain lines. Such patterns normally involve
- nested indefinite repeats, for example: (a+)*\d when matched against a
- line of a's with no final digit. The PCRE matching function has a
- resource limit that causes it to abort in these circumstances. If this
+ It is possible to supply a regular expression that takes a very long
+ time to fail to match certain lines. Such patterns normally involve
+ nested indefinite repeats, for example: (a+)*\d when matched against a
+ line of a's with no final digit. The PCRE matching function has a
+ resource limit that causes it to abort in these circumstances. If this
happens, pcregrep outputs an error message and the line that caused the
- problem to the standard error stream. If there are more than 20 such
+ problem to the standard error stream. If there are more than 20 such
errors, pcregrep gives up.
- The --match-limit option of pcregrep can be used to set the overall
- resource limit; there is a second option called --recursion-limit that
- sets a limit on the amount of memory (usually stack) that is used (see
+ The --match-limit option of pcregrep can be used to set the overall
+ resource limit; there is a second option called --recursion-limit that
+ sets a limit on the amount of memory (usually stack) that is used (see
the discussion of these options above).
DIAGNOSTICS
Exit status is 0 if any matches were found, 1 if no matches were found,
- and 2 for syntax errors and non-existent or inacessible files (even if
- matches were found in other files) or too many matching errors. Using
- the -s option to suppress error messages about inaccessble files does
- not affect the return code.
+ and 2 for syntax errors, overlong lines, non-existent or inaccessible
+ files (even if matches were found in other files) or too many matching
+ errors. Using the -s option to suppress error messages about inaccessi-
+ ble files does not affect the return code.
SEE ALSO
@@ -607,5 +619,5 @@ AUTHOR
REVISION
- Last updated: 14 January 2011
+ Last updated: 30 July 2011
Copyright (c) 1997-2011 University of Cambridge.
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index f4c6cc3..81085b2 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -220,8 +220,8 @@ Perl, $ and @ cause variable interpolation. Note the following examples:
\eQabc\eE\e$\eQxyz\eE abc$xyz abc$xyz
.sp
The \eQ...\eE sequence is recognized both inside and outside character classes.
-An isolated \eE that is not preceded by \eQ is ignored. If \eQ is not followed
-by \eE later in the pattern, the literal interpretation continues to the end of
+An isolated \eE that is not preceded by \eQ is ignored. If \eQ is not followed
+by \eE later in the pattern, the literal interpretation continues to the end of
the pattern (that is, \eE is assumed at the end). If the isolated \eQ is inside
a character class, this causes an error, because the character class is not
terminated.
@@ -757,7 +757,7 @@ Characters with the "mark" property are typically accents that affect the
preceding character. None of them have codepoints less than 256, so in
non-UTF-8 mode \eX matches any one character.
.P
-Note that recent versions of Perl have changed \eX to match what Unicode calls
+Note that recent versions of Perl have changed \eX to match what Unicode calls
an "extended grapheme cluster", which has a more complicated definition.
.P
Matching characters by Unicode property is not fast, because PCRE has to search
@@ -1836,23 +1836,23 @@ capturing is carried out only for positive assertions, because it does not make
sense for negative assertions.
.P
For compatibility with Perl, assertion subpatterns may be repeated; though
-it makes no sense to assert the same thing several times, the side effect of
+it makes no sense to assert the same thing several times, the side effect of
capturing parentheses may occasionally be useful. In practice, there are only three
cases:
.sp
-(1) If the quantifier is {0}, the assertion is never obeyed during matching.
-However, it may contain internal capturing parenthesized groups that are called
+(1) If the quantifier is {0}, the assertion is never obeyed during matching.
+However, it may contain internal capturing parenthesized groups that are called
from elsewhere via the
.\" HTML <a href="#subpatternsassubroutines">
.\" </a>
subroutine mechanism.
.\"
.sp
-(2) If quantifier is {0,n} where n is greater than zero, it is treated as if it
-were {0,1}. At run time, the rest of the pattern match is tried with and
+(2) If quantifier is {0,n} where n is greater than zero, it is treated as if it
+were {0,1}. At run time, the rest of the pattern match is tried with and
without the assertion, the order depending on the greediness of the quantifier.
.sp
-(3) If the minimum repetition is greater than zero, the quantifier is ignored.
+(3) If the minimum repetition is greater than zero, the quantifier is ignored.
The assertion is obeyed just once when encountered during matching.
.
.
@@ -2605,8 +2605,8 @@ indicates which of the two alternatives matched. This is a more efficient way
of obtaining this information than putting each alternative in its own
capturing parentheses.
.P
-If (*MARK) is encountered in a positive assertion, its name is recorded and
-passed back if it is the last-encountered. This does not happen for negative
+If (*MARK) is encountered in a positive assertion, its name is recorded and
+passed back if it is the last-encountered. This does not happen for negative
assertions.
.P
A name may also be returned after a failed match if the final path through the
diff --git a/doc/pcrestack.3 b/doc/pcrestack.3
index 7b30485..81aaaf0 100644
--- a/doc/pcrestack.3
+++ b/doc/pcrestack.3
@@ -9,7 +9,7 @@ When you call \fBpcre_exec()\fP, it makes use of an internal function called
in order to remember the state of the match so that it can back up and try a
different alternative if the first one fails. As matching proceeds deeper and
deeper into the tree of possibilities, the recursion depth increases. The
-\fBmatch()\P function is also called in other circumstances, for example,
+\fBmatch()\fP function is also called in other circumstances, for example,
whenever a parenthesized sub-pattern is entered, and in certain cases of
repetition.
.P
diff --git a/doc/pcretest.1 b/doc/pcretest.1
index 0f1c417..d2728b8 100644
--- a/doc/pcretest.1
+++ b/doc/pcretest.1
@@ -18,9 +18,9 @@ options, see the
.\" HREF
\fBpcreapi\fP
.\"
-documentation. The input for \fBpcretest\fP is a sequence of regular expression
-patterns and strings to be matched, as described below. The output shows the
-result of each match. Options on the command line and the patterns control PCRE
+documentation. The input for \fBpcretest\fP is a sequence of regular expression
+patterns and strings to be matched, as described below. The output shows the
+result of each match. Options on the command line and the patterns control PCRE
options and exactly what is output.
.
.
@@ -257,15 +257,15 @@ The \fB/+\fP modifier requests that as well as outputting the substring that
matched the entire pattern, \fBpcretest\fP should in addition output the
remainder of the subject string. This is useful for tests where the subject
contains multiple copies of the same substring. If the \fB+\fP modifier appears
-twice, the same action is taken for captured substrings. In each case the
-remainder is output on the following line with a plus character following the
+twice, the same action is taken for captured substrings. In each case the
+remainder is output on the following line with a plus character following the
capture number.
.P
-The \fB/=\fP modifier requests that the values of all potential captured
+The \fB/=\fP modifier requests that the values of all potential captured
parentheses be output after a match by \fBpcre_exec()\fP. By default, only
those up to the highest one actually used in the match are output
-(corresponding to the return code from \fBpcre_exec()\fP). Values in the
-offsets vector corresponding to higher numbers should be set to -1, and these
+(corresponding to the return code from \fBpcre_exec()\fP). Values in the
+offsets vector corresponding to higher numbers should be set to -1, and these
are output as "<unset>". This modifier gives a way of checking that this is
happening.
.P
@@ -319,10 +319,10 @@ pattern to be output.
.P
If the \fB/S\fP modifier appears once, it causes \fBpcre_study()\fP to be
called after the expression has been compiled, and the results used when the
-expression is matched. If \fB/S\fP appears twice, it suppresses studying, even
-if it was requested externally by the \fB-s\fP command line option. This makes
-it possible to specify that certain patterns are always studied, and others are
-never studied, independently of \fB-s\fP. This feature is used in the test
+expression is matched. If \fB/S\fP appears twice, it suppresses studying, even
+if it was requested externally by the \fB-s\fP command line option. This makes
+it possible to specify that certain patterns are always studied, and others are
+never studied, independently of \fB-s\fP. This feature is used in the test
files in a few cases where the output is different when the pattern is studied.
.P
The \fB/T\fP modifier must be followed by a single digit. It causes a specific
@@ -551,7 +551,7 @@ include characters before the actual match start if a lookbehind assertion,
\eK, \eb, or \eB was involved.) For any other return, \fBpcretest\fP outputs
the PCRE negative error number and a short descriptive phrase. If the error is
a failed UTF-8 string check, the byte offset of the start of the failing
-character and the reason code are also output, provided that the size of the
+character and the reason code are also output, provided that the size of the
output vector is at least two. Here is an example of an interactive
\fBpcretest\fP run.
.sp
@@ -603,13 +603,13 @@ matching attempts are output in sequence, like this:
0: ipp
1: pp
.sp
-"No match" is output only if the first match attempt fails. Here is an example
-of a failure message (the offset 4 that is specified by \e>4 is past the end of
+"No match" is output only if the first match attempt fails. Here is an example
+of a failure message (the offset 4 that is specified by \e>4 is past the end of
the subject string):
.sp
re> /xyz/
- data> xyz\>4
- Error -24 (bad offset value)
+ data> xyz\e>4
+ Error -24 (bad offset value)
.P
If any of the sequences \fB\eC\fP, \fB\eG\fP, or \fB\eL\fP are present in a
data line that is successfully matched, the substrings extracted by the
@@ -725,7 +725,7 @@ a change of latest mark is passed to the callout function. For example:
+10 ^^ b
Latest Mark: X
+11 ^ ^ c
- +12 ^ ^
+ +12 ^ ^
0: abc
.sp
The mark changes between matching "a" and "b", but stays the same for the rest
diff --git a/doc/pcretest.txt b/doc/pcretest.txt
index 7f67d6f..a7c42fa 100644
--- a/doc/pcretest.txt
+++ b/doc/pcretest.txt
@@ -7,26 +7,30 @@ NAME
SYNOPSIS
- pcretest [options] [source] [destination]
+ pcretest [options] [input file [output file]]
pcretest was written as a test program for the PCRE regular expression
library itself, but it can also be used for experimenting with regular
expressions. This document describes the features of the test program;
for details of the regular expressions themselves, see the pcrepattern
documentation. For details of the PCRE library function calls and their
- options, see the pcreapi documentation.
+ options, see the pcreapi documentation. The input for pcretest is a
+ sequence of regular expression patterns and strings to be matched, as
+ described below. The output shows the result of each match. Options on
+ the command line and the patterns control PCRE options and exactly what
+ is output.
-OPTIONS
+COMMAND LINE OPTIONS
- -b Behave as if each regex has the /B (show bytecode) modifier;
- the internal form is output after compilation.
+ -b Behave as if each pattern has the /B (show byte code) modi-
+ fier; the internal form is output after compilation.
-C Output the version number of the PCRE library, and all avail-
able information about the optional features that are
included, and then exit.
- -d Behave as if each regex has the /D (debug) modifier; the
+ -d Behave as if each pattern has the /D (debug) modifier; the
internal form and information about the compiled pattern is
output after compilation; -d is equivalent to -b -i.
@@ -37,7 +41,7 @@ OPTIONS
-help Output a brief summary of these options and then exit.
- -i Behave as if each regex has the /I modifier; information
+ -i Behave as if each pattern has the /I modifier; information
about the compiled pattern is given after compilation.
-M Behave as if each data line contains the \M escape sequence;
@@ -47,33 +51,52 @@ OPTIONS
-m Output the size of each compiled pattern after it has been
compiled. This is equivalent to adding /M to each regular
- expression. For compatibility with earlier versions of
- pcretest, -s is a synonym for -m.
-
- -o osize Set the number of elements in the output vector that is used
- when calling pcre_exec() or pcre_dfa_exec() to be osize. The
- default value is 45, which is enough for 14 capturing subex-
- pressions for pcre_exec() or 22 different matches for
- pcre_dfa_exec(). The vector size can be changed for individ-
- ual matching calls by including \O in the data line (see
+ expression.
+
+ -o osize Set the number of elements in the output vector that is used
+ when calling pcre_exec() or pcre_dfa_exec() to be osize. The
+ default value is 45, which is enough for 14 capturing subex-
+ pressions for pcre_exec() or 22 different matches for
+ pcre_dfa_exec(). The vector size can be changed for individ-
+ ual matching calls by including \O in the data line (see
below).
- -p Behave as if each regex has the /P modifier; the POSIX wrap-
- per API is used to call PCRE. None of the other options has
- any effect when -p is set.
+ -p Behave as if each pattern has the /P modifier; the POSIX
+ wrapper API is used to call PCRE. None of the other options
+ has any effect when -p is set.
- -q Do not output the version number of pcretest at the start of
+ -q Do not output the version number of pcretest at the start of
execution.
- -S size On Unix-like systems, set the size of the runtime stack to
+ -S size On Unix-like systems, set the size of the run-time stack to
size megabytes.
- -t Run each compile, study, and match many times with a timer,
- and output resulting time per compile or match (in millisec-
- onds). Do not set -m with -t, because you will then get the
- size output a zillion times, and the timing will be dis-
- torted. You can control the number of iterations that are
- used for timing by following -t with a number (as a separate
+ -s Behave as if each pattern has the /S modifier; in other
+ words, force each pattern to be studied. If the /I or /D
+ option is present on a pattern (requesting output about the
+ compiled pattern), information about the result of studying
+ is not included when studying is caused only by -s and nei-
+ ther -i nor -d is present on the command line. This behaviour
+ means that the output from tests that are run with and with-
+ out -s should be identical, except when options that output
+ information about the actual running of a match are set. The
+ -M, -t, and -tm options, which give information about
+ resources used, are likely to produce different output with
+ and without -s. Output may also differ if the /C option is
+ present on an individual pattern. This uses callouts to trace
+ the matching process, and this may be different between
+ studied and non-studied patterns. If the pattern contains
+ (*MARK) items there may also be differences, for the same
+ reason. The -s command line option can be overridden for spe-
+ cific patterns that should never be studied (see the /S
+ option below).
+
+ -t Run each compile, study, and match many times with a timer,
+ and output resulting time per compile or match (in millisec-
+ onds). Do not set -m with -t, because you will then get the
+ size output a zillion times, and the timing will be dis-
+ torted. You can control the number of iterations that are
+ used for timing by following -t with a number (as a separate
item on the command line). For example, "-t 1000" would iter-
ate 1000 times. The default is to iterate 500000 times.
@@ -83,78 +106,78 @@ OPTIONS
DESCRIPTION
- If pcretest is given two filename arguments, it reads from the first
+ If pcretest is given two filename arguments, it reads from the first
and writes to the second. If it is given only one filename argument, it
- reads from that file and writes to stdout. Otherwise, it reads from
- stdin and writes to stdout, and prompts for each line of input, using
+ reads from that file and writes to stdout. Otherwise, it reads from
+ stdin and writes to stdout, and prompts for each line of input, using
"re>" to prompt for regular expressions, and "data>" to prompt for data
lines.
- When pcretest is built, a configuration option can specify that it
- should be linked with the libreadline library. When this is done, if
+ When pcretest is built, a configuration option can specify that it
+ should be linked with the libreadline library. When this is done, if
the input is from a terminal, it is read using the readline() function.
- This provides line-editing and history facilities. The output from the
+ This provides line-editing and history facilities. The output from the
-help option states whether or not readline() will be used.
The program handles any number of sets of input on a single input file.
- Each set starts with a regular expression, and continues with any num-
+ Each set starts with a regular expression, and continues with any num-
ber of data lines to be matched against the pattern.
- Each data line is matched separately and independently. If you want to
+ Each data line is matched separately and independently. If you want to
do multi-line matches, you have to use the \n escape sequence (or \r or
\r\n, etc., depending on the newline setting) in a single line of input
- to encode the newline sequences. There is no limit on the length of
- data lines; the input buffer is automatically extended if it is too
+ to encode the newline sequences. There is no limit on the length of
+ data lines; the input buffer is automatically extended if it is too
small.
- An empty line signals the end of the data lines, at which point a new
- regular expression is read. The regular expressions are given enclosed
+ An empty line signals the end of the data lines, at which point a new
+ regular expression is read. The regular expressions are given enclosed
in any non-alphanumeric delimiters other than backslash, for example:
/(a|bc)x+yz/
- White space before the initial delimiter is ignored. A regular expres-
- sion may be continued over several input lines, in which case the new-
- line characters are included within it. It is possible to include the
+ White space before the initial delimiter is ignored. A regular expres-
+ sion may be continued over several input lines, in which case the new-
+ line characters are included within it. It is possible to include the
delimiter within the pattern by escaping it, for example
/abc\/def/
- If you do so, the escape and the delimiter form part of the pattern,
- but since delimiters are always non-alphanumeric, this does not affect
- its interpretation. If the terminating delimiter is immediately fol-
+ If you do so, the escape and the delimiter form part of the pattern,
+ but since delimiters are always non-alphanumeric, this does not affect
+ its interpretation. If the terminating delimiter is immediately fol-
lowed by a backslash, for example,
/abc/\
- then a backslash is added to the end of the pattern. This is done to
- provide a way of testing the error condition that arises if a pattern
+ then a backslash is added to the end of the pattern. This is done to
+ provide a way of testing the error condition that arises if a pattern
finishes with a backslash, because
/abc\/
- is interpreted as the first line of a pattern that starts with "abc/",
+ is interpreted as the first line of a pattern that starts with "abc/",
causing pcretest to read the next line as a continuation of the regular
expression.
PATTERN MODIFIERS
- A pattern may be followed by any number of modifiers, which are mostly
- single characters. Following Perl usage, these are referred to below
- as, for example, "the /i modifier", even though the delimiter of the
- pattern need not always be a slash, and no slash is used when writing
- modifiers. Whitespace may appear between the final pattern delimiter
+ A pattern may be followed by any number of modifiers, which are mostly
+ single characters. Following Perl usage, these are referred to below
+ as, for example, "the /i modifier", even though the delimiter of the
+ pattern need not always be a slash, and no slash is used when writing
+ modifiers. White space may appear between the final pattern delimiter
and the first modifier, and between the modifiers themselves.
The /i, /m, /s, and /x modifiers set the PCRE_CASELESS, PCRE_MULTILINE,
- PCRE_DOTALL, or PCRE_EXTENDED options, respectively, when pcre_com-
- pile() is called. These four modifier letters have the same effect as
+ PCRE_DOTALL, or PCRE_EXTENDED options, respectively, when pcre_com-
+ pile() is called. These four modifier letters have the same effect as
they do in Perl. For example:
/caseless/i
- The following table shows additional modifiers for setting PCRE com-
+ The following table shows additional modifiers for setting PCRE com-
pile-time options that do not correspond to anything in Perl:
/8 PCRE_UTF8
@@ -178,48 +201,59 @@ PATTERN MODIFIERS
/<bsr_anycrlf> PCRE_BSR_ANYCRLF
/<bsr_unicode> PCRE_BSR_UNICODE
- The modifiers that are enclosed in angle brackets are literal strings
- as shown, including the angle brackets, but the letters can be in
- either case. This example sets multiline matching with CRLF as the line
- ending sequence:
+ The modifiers that are enclosed in angle brackets are literal strings
+ as shown, including the angle brackets, but the letters within can be
+ in either case. This example sets multiline matching with CRLF as the
+ line ending sequence:
- /^abc/m<crlf>
+ /^abc/m<CRLF>
As well as turning on the PCRE_UTF8 option, the /8 modifier also causes
- any non-printing characters in output strings to be printed using the
- \x{hh...} notation if they are valid UTF-8 sequences. Full details of
+ any non-printing characters in output strings to be printed using the
+ \x{hh...} notation if they are valid UTF-8 sequences. Full details of
the PCRE options are given in the pcreapi documentation.
Finding all matches in a string
- Searching for all possible matches within each subject string can be
- requested by the /g or /G modifier. After finding a match, PCRE is
+ Searching for all possible matches within each subject string can be
+ requested by the /g or /G modifier. After finding a match, PCRE is
called again to search the remainder of the subject string. The differ-
ence between /g and /G is that the former uses the startoffset argument
- to pcre_exec() to start searching at a new point within the entire
- string (which is in effect what Perl does), whereas the latter passes
- over a shortened substring. This makes a difference to the matching
+ to pcre_exec() to start searching at a new point within the entire
+ string (which is in effect what Perl does), whereas the latter passes
+ over a shortened substring. This makes a difference to the matching
process if the pattern begins with a lookbehind assertion (including \b
or \B).
- If any call to pcre_exec() in a /g or /G sequence matches an empty
- string, the next call is done with the PCRE_NOTEMPTY_ATSTART and
- PCRE_ANCHORED flags set in order to search for another, non-empty,
- match at the same point. If this second match fails, the start offset
- is advanced, and the normal match is retried. This imitates the way
+ If any call to pcre_exec() in a /g or /G sequence matches an empty
+ string, the next call is done with the PCRE_NOTEMPTY_ATSTART and
+ PCRE_ANCHORED flags set in order to search for another, non-empty,
+ match at the same point. If this second match fails, the start offset
+ is advanced, and the normal match is retried. This imitates the way
Perl handles such cases when using the /g modifier or the split() func-
- tion. Normally, the start offset is advanced by one character, but if
- the newline convention recognizes CRLF as a newline, and the current
+ tion. Normally, the start offset is advanced by one character, but if
+ the newline convention recognizes CRLF as a newline, and the current
character is CR followed by LF, an advance of two is used.
Other modifiers
There are yet more modifiers for controlling the way pcretest operates.
- The /+ modifier requests that as well as outputting the substring that
- matched the entire pattern, pcretest should in addition output the
- remainder of the subject string. This is useful for tests where the
- subject contains multiple copies of the same substring.
+ The /+ modifier requests that as well as outputting the substring that
+ matched the entire pattern, pcretest should in addition output the
+ remainder of the subject string. This is useful for tests where the
+ subject contains multiple copies of the same substring. If the + modi-
+ fier appears twice, the same action is taken for captured substrings.
+ In each case the remainder is output on the following line with a plus
+ character following the capture number.
+
+ The /= modifier requests that the values of all potential captured
+ parentheses be output after a match by pcre_exec(). By default, only
+ those up to the highest one actually used in the match are output (cor-
+ responding to the return code from pcre_exec()). Values in the offsets
+ vector corresponding to higher numbers should be set to -1, and these
+ are output as "<unset>". This modifier gives a way of checking that
+ this is happening.
The /B modifier is a debugging feature. It requests that pcretest out-
put a representation of the compiled byte code after compilation. Nor-
@@ -270,8 +304,14 @@ PATTERN MODIFIERS
The /M modifier causes the size of memory block used to hold the com-
piled pattern to be output.
- The /S modifier causes pcre_study() to be called after the expression
- has been compiled, and the results used when the expression is matched.
+ If the /S modifier appears once, it causes pcre_study() to be called
+ after the expression has been compiled, and the results used when the
+ expression is matched. If /S appears twice, it suppresses studying,
+ even if it was requested externally by the -s command line option. This
+ makes it possible to specify that certain patterns are always studied,
+ and others are never studied, independently of -s. This feature is used
+ in the test files in a few cases where the output is different when the
+ pattern is studied.
The /T modifier must be followed by a single digit. It causes a spe-
cific set of built-in character tables to be passed to pcre_compile().
@@ -306,7 +346,7 @@ PATTERN MODIFIERS
DATA LINES
Before each data line is passed to pcre_exec(), leading and trailing
- whitespace is removed, and it is then scanned for \ escapes. Some of
+ white space is removed, and it is then scanned for \ escapes. Some of
these are pretty esoteric features, intended for checking out some of
the more complicated features of PCRE. If you are just testing "ordi-
nary" regular expressions, you probably don't need any of these. The
@@ -315,7 +355,7 @@ DATA LINES
\a alarm (BEL, \x07)
\b backspace (\x08)
\e escape (\x1b)
- \f formfeed (\x0c)
+ \f form feed (\x0c)
\n newline (\x0a)
\qdd set the PCRE_MATCH_LIMIT limit to dd
(any number of digits)
@@ -463,11 +503,14 @@ DEFAULT OUTPUT FROM PCRETEST
(Note that this is the entire substring that was inspected during the
partial match; it may include characters before the actual match start
if a lookbehind assertion, \K, \b, or \B was involved.) For any other
- returns, it outputs the PCRE negative error number. Here is an example
- of an interactive pcretest run.
+ return, pcretest outputs the PCRE negative error number and a short
+ descriptive phrase. If the error is a failed UTF-8 string check, the
+ byte offset of the start of the failing character and the reason code
+ are also output, provided that the size of the output vector is at
+ least two. Here is an example of an interactive pcretest run.
$ pcretest
- PCRE version 7.0 30-Nov-2006
+ PCRE version 8.13 2011-04-30
re> /^abc(\d+)/
data> abc123
@@ -476,12 +519,12 @@ DEFAULT OUTPUT FROM PCRETEST
data> xyz
No match
- Note that unset capturing substrings that are not followed by one that
- is set are not returned by pcre_exec(), and are not shown by pcretest.
- In the following example, there are two capturing substrings, but when
- the first data line is matched, the second, unset substring is not
- shown. An "internal" unset substring is shown as "<unset>", as for the
- second data line.
+ Unset capturing substrings that are not followed by one that is set are
+ not returned by pcre_exec(), and are not shown by pcretest. In the fol-
+ lowing example, there are two capturing substrings, but when the first
+ data line is matched, the second, unset substring is not shown. An
+ "internal" unset substring is shown as "<unset>", as for the second
+ data line.
re> /(a)|(b)/
data> a
@@ -492,11 +535,11 @@ DEFAULT OUTPUT FROM PCRETEST
1: <unset>
2: b
- If the strings contain any non-printing characters, they are output as
- \0x escapes, or as \x{...} escapes if the /8 modifier was present on
- the pattern. See below for the definition of non-printing characters.
- If the pattern has the /+ modifier, the output for substring 0 is fol-
- lowed by the the rest of the subject string, identified by "0+" like
+ If the strings contain any non-printing characters, they are output as
+ \0x escapes, or as \x{...} escapes if the /8 modifier was present on
+ the pattern. See below for the definition of non-printing characters.
+ If the pattern has the /+ modifier, the output for substring 0 is fol-
+ lowed by the rest of the subject string, identified by "0+" like
this:
re> /cat/+
@@ -504,7 +547,7 @@ DEFAULT OUTPUT FROM PCRETEST
0: cat
0+ aract
- If the pattern has the /g or /G modifier, the results of successive
+ If the pattern has the /g or /G modifier, the results of successive
matching attempts are output in sequence, like this:
re> /\Bi(\w\w)/g
@@ -516,26 +559,32 @@ DEFAULT OUTPUT FROM PCRETEST
0: ipp
1: pp
- "No match" is output only if the first match attempt fails.
+ "No match" is output only if the first match attempt fails. Here is an
+ example of a failure message (the offset 4 that is specified by \>4 is
+ past the end of the subject string):
- If any of the sequences \C, \G, or \L are present in a data line that
- is successfully matched, the substrings extracted by the convenience
+ re> /xyz/
+ data> xyz\>4
+ Error -24 (bad offset value)
+
+ If any of the sequences \C, \G, or \L are present in a data line that
+ is successfully matched, the substrings extracted by the convenience
functions are output with C, G, or L after the string number instead of
a colon. This is in addition to the normal full list. The string length
- (that is, the return from the extraction function) is given in paren-
+ (that is, the return from the extraction function) is given in paren-
theses after each string for \C and \G.
Note that whereas patterns can be continued over several lines (a plain
">" prompt is used for continuations), data lines may not. However new-
- lines can be included in data by means of the \n escape (or \r, \r\n,
+ lines can be included in data by means of the \n escape (or \r, \r\n,
etc., depending on the newline sequence setting).
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
- When the alternative matching function, pcre_dfa_exec(), is used (by
- means of the \D escape sequence or the -dfa command line option), the
- output consists of a list of all the matches that start at the first
+ When the alternative matching function, pcre_dfa_exec(), is used (by
+ means of the \D escape sequence or the -dfa command line option), the
+ output consists of a list of all the matches that start at the first
point in the subject where there is at least one match. For example:
re> /(tang|tangerine|tan)/
@@ -544,11 +593,11 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
1: tang
2: tan
- (Using the normal matching function on this data finds only "tang".)
- The longest matching string is always given first (and numbered zero).
+ (Using the normal matching function on this data finds only "tang".)
+ The longest matching string is always given first (and numbered zero).
After a PCRE_ERROR_PARTIAL return, the output is "Partial match:", fol-
- lowed by the partially matching substring. (Note that this is the
- entire substring that was inspected during the partial match; it may
+ lowed by the partially matching substring. (Note that this is the
+ entire substring that was inspected during the partial match; it may
include characters before the actual match start if a lookbehind asser-
tion, \K, \b, or \B was involved.)
@@ -564,16 +613,16 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
1: tan
0: tan
- Since the matching function does not support substring capture, the
- escape sequences that are concerned with captured substrings are not
+ Since the matching function does not support substring capture, the
+ escape sequences that are concerned with captured substrings are not
relevant.
RESTARTING AFTER A PARTIAL MATCH
When the alternative matching function has given the PCRE_ERROR_PARTIAL
- return, indicating that the subject partially matched the pattern, you
- can restart the match with additional subject data by means of the \R
+ return, indicating that the subject partially matched the pattern, you
+ can restart the match with additional subject data by means of the \R
escape sequence. For example:
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
@@ -582,30 +631,30 @@ RESTARTING AFTER A PARTIAL MATCH
data> n05\R\D
0: n05
- For further information about partial matching, see the pcrepartial
+ For further information about partial matching, see the pcrepartial
documentation.
CALLOUTS
- If the pattern contains any callout requests, pcretest's callout func-
- tion is called during matching. This works with both matching func-
+ If the pattern contains any callout requests, pcretest's callout func-
+ tion is called during matching. This works with both matching func-
tions. By default, the called function displays the callout number, the
- start and current positions in the text at the callout time, and the
+ start and current positions in the text at the callout time, and the
next pattern item to be tested. For example, the output
--->pqrabcdef
0 ^ ^ \d
- indicates that callout number 0 occurred for a match attempt starting
- at the fourth character of the subject string, when the pointer was at
- the seventh character of the data, and when the next pattern item was
- \d. Just one circumflex is output if the start and current positions
+ indicates that callout number 0 occurred for a match attempt starting
+ at the fourth character of the subject string, when the pointer was at
+ the seventh character of the data, and when the next pattern item was
+ \d. Just one circumflex is output if the start and current positions
are the same.
Callouts numbered 255 are assumed to be automatic callouts, inserted as
- a result of the /C pattern modifier. In this case, instead of showing
- the callout number, the offset in the pattern, preceded by a plus, is
+ a result of the /C pattern modifier. In this case, instead of showing
+ the callout number, the offset in the pattern, preceded by a plus, is
output. For example:
re> /\d?[A-E]\*/C
@@ -617,9 +666,29 @@ CALLOUTS
+10 ^ ^
0: E*
+ If a pattern contains (*MARK) items, an additional line is output when-
+ ever a change of latest mark is passed to the callout function. For
+ example:
+
+ re> /a(*MARK:X)bc/C
+ data> abc
+ --->abc
+ +0 ^ a
+ +1 ^^ (*MARK:X)
+ +10 ^^ b
+ Latest Mark: X
+ +11 ^ ^ c
+ +12 ^ ^
+ 0: abc
+
+ The mark changes between matching "a" and "b", but stays the same for
+ the rest of the match, so nothing more is output. If, as a result of
+ backtracking, the mark reverts to being unset, the text "<unset>" is
+ output.
+
The callout function in pcretest returns zero (carry on matching) by
default, but you can use a \C item in a data line (as described above)
- to change this.
+ to change this and other parameters of the callout.
Inserting callouts can be helpful when using pcretest to check compli-
cated regular expressions. For further information about callouts, see
@@ -641,8 +710,8 @@ NON-PRINTING CHARACTERS
SAVING AND RELOADING COMPILED PATTERNS
The facilities described in this section are not available when the
- POSIX inteface to PCRE is being used, that is, when the /P pattern mod-
- ifier is specified.
+ POSIX interface to PCRE is being used, that is, when the /P pattern
+ modifier is specified.
When the POSIX interface is not in use, you can cause pcretest to write
a compiled pattern to a file, by following the modifiers with > and a
@@ -663,13 +732,13 @@ SAVING AND RELOADING COMPILED PATTERNS
diately after the compiled pattern. After writing the file, pcretest
expects to read a new pattern.
- A saved pattern can be reloaded into pcretest by specifing < and a file
- name instead of a pattern. The name of the file must not contain a <
- character, as otherwise pcretest will interpret the line as a pattern
+ A saved pattern can be reloaded into pcretest by specifying < and a
+ file name instead of a pattern. The name of the file must not contain a
+ < character, as otherwise pcretest will interpret the line as a pattern
delimited by < characters. For example:
re> </some/file
- Compiled regex loaded from /some/file
+ Compiled pattern loaded from /some/file
No study data
When the pattern has been loaded, pcretest proceeds to read data lines
@@ -709,5 +778,5 @@ AUTHOR
REVISION
- Last updated: 21 November 2010
- Copyright (c) 1997-2010 University of Cambridge.
+ Last updated: 01 August 2011
+ Copyright (c) 1997-2011 University of Cambridge.
diff --git a/maint/README b/maint/README
index 2705c01..78e9fa6 100644
--- a/maint/README
+++ b/maint/README
@@ -102,17 +102,19 @@ distribution for a new release.
. Ensure that the version number and version date are correct in configure.ac.
. If new build options have been added, ensure that they are added to the CMake
- files as well as to the autoconf files.
+ files as well as to the autoconf files. The relevant files are CMakeLists.txt
+ and config-cmake.h.in. After making a release tarball, test it out with CMake
+ if there have been changes here.
. Run ./autogen.sh to ensure everything is up-to-date.
. Compile and test with many different config options, and combinations of
options. The maint/ManyConfigTests script now encapsulates this testing.
-. Run perltest.pl on the test data for tests 1, 4, 6, and 11. The output should
- match the PCRE test output, apart from the version identification at the
- start of each test. The other tests are not Perl-compatible (they use various
- PCRE-specific features or options).
+. Run perltest.pl on the test data for tests 1, 4, 6, 11, and 12. The output
+ should match the PCRE test output, apart from the version identification at
+ the start of each test. The other tests are not Perl-compatible (they use
+ various PCRE-specific features or options).
. Test with valgrind by running "RunTest valgrind". There is also "RunGrepTest
valgrind", though that takes quite a long time.
@@ -321,4 +323,4 @@ others are relatively new.
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
-Last updated: 12 January 2011
+Last updated: 02 August 2011
diff --git a/pcre.h.in b/pcre.h.in
index 5193b9d..fe37f75 100644
--- a/pcre.h.in
+++ b/pcre.h.in
@@ -281,7 +281,7 @@ typedef struct pcre_callout_block {
int pattern_position; /* Offset to next item in the pattern */
int next_item_length; /* Length of next item in the pattern */
/* ------------------- Added for Version 2 -------------------------- */
- const unsigned char *mark; /* Pointer to current mark or NULL */
+ const unsigned char *mark; /* Pointer to current mark or NULL */
/* ------------------------------------------------------------------ */
} pcre_callout_block;
diff --git a/pcre_compile.c b/pcre_compile.c
index cce60de..e728696 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -409,7 +409,7 @@ static const char error_texts[] =
"(*MARK) must have an argument\0"
"this version of PCRE is not compiled with PCRE_UCP support\0"
"\\c must be followed by an ASCII character\0"
- "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
+ "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
;
/* Table to identify digits and hex digits. This is used when compiling
@@ -681,7 +681,7 @@ else
*errorcodeptr = ERR37;
break;
- /* In a character class, \g is just a literal "g". Outside a character
+ /* In a character class, \g is just a literal "g". Outside a character
class, \g must be followed by one of a number of specific things:
(1) A number, either plain or braced. If positive, it is an absolute
@@ -921,7 +921,7 @@ else
}
/* Perl supports \N{name} for character names, as well as plain \N for "not
-newline". PCRE does not support \N{name}. However, it does support
+newline". PCRE does not support \N{name}. However, it does support
quantification such as \N{2,3}. */
if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
@@ -1982,28 +1982,28 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
/* For a recursion/subroutine call, if its end has been reached, which
implies a backward reference subroutine call, we can scan it. If it's a
forward reference subroutine call, we can't. To detect forward reference
- we have to scan up the list that is kept in the workspace. This function is
- called only when doing the real compile, not during the pre-compile that
+ we have to scan up the list that is kept in the workspace. This function is
+ called only when doing the real compile, not during the pre-compile that
measures the size of the compiled pattern. */
if (c == OP_RECURSE)
{
const uschar *scode;
BOOL empty_branch;
-
+
/* Test for forward reference */
-
+
for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
- if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
+ if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
/* Not a forward reference, test for completed backward reference */
-
+
empty_branch = FALSE;
scode = cd->start_code + GET(code, 1);
if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
-
+
/* Completed backwards reference */
-
+
do
{
if (could_be_empty_branch(scode, endcode, utf8, cd))
@@ -2014,7 +2014,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
scode += GET(scode, 1);
}
while (*scode == OP_ALT);
-
+
if (!empty_branch) return FALSE; /* All branches are non-empty */
continue;
}
@@ -2240,7 +2240,7 @@ return TRUE;
the current branch of the current pattern to see if it could match the empty
string. If it could, we must look outwards for branches at other levels,
stopping when we pass beyond the bracket which is the subject of the recursion.
-This function is called only during the real compile, not during the
+This function is called only during the real compile, not during the
pre-compile.
Arguments:
@@ -2296,7 +2296,7 @@ A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
It seems that the appearance of a nested POSIX class supersedes an apparent
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
a digit. Also, unescaped square brackets may also appear as part of class
-names. For example, [:a[:abc]b:] gives unknown class "[:abc]b:]"in Perl.
+names. For example, [:a[:abc]b:] gives unknown class "[:abc]b:]" in Perl.
Arguments:
ptr pointer to the initial [
@@ -2312,8 +2312,8 @@ int terminator; /* Don't combine these lines; the Solaris cc */
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
for (++ptr; *ptr != 0; ptr++)
{
- if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
- ptr++;
+ if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+ ptr++;
else
{
if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
@@ -2325,7 +2325,7 @@ for (++ptr; *ptr != 0; ptr++)
(ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
ptr[1] == CHAR_EQUALS_SIGN) &&
check_posix_syntax(ptr, endptr))
- return FALSE;
+ return FALSE;
}
}
return FALSE;
@@ -3035,7 +3035,7 @@ Arguments:
firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
reqbyteptr set to the last literal character required, else < 0
bcptr points to current branch chain
- cond_depth conditional nesting depth
+ cond_depth conditional nesting depth
cd contains pointers to tables etc.
lengthptr NULL during the real compile phase
points to length accumulator during pre-compile phase
@@ -3075,7 +3075,7 @@ uschar *save_hwm = NULL;
uschar classbits[32];
/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
-must not do this for other options (e.g. PCRE_EXTENDED) because they may change
+must not do this for other options (e.g. PCRE_EXTENDED) because they may change
dynamically as we process the pattern. */
#ifdef SUPPORT_UTF8
@@ -4250,13 +4250,13 @@ for (;; ptr++)
ptr++;
}
else repeat_type = greedy_default;
-
- /* If previous was a recursion call, wrap it in atomic brackets so that
+
+ /* If previous was a recursion call, wrap it in atomic brackets so that
previous becomes the atomic group. All recursions were so wrapped in the
past, but it no longer happens for non-repeated recursions. In fact, the
repeated ones could be re-implemented independently so as not to need this,
but for the moment we rely on the code for repeating groups. */
-
+
if (*previous == OP_RECURSE)
{
memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
@@ -4266,18 +4266,18 @@ for (;; ptr++)
PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
code += 2 + 2 * LINK_SIZE;
length_prevgroup = 3 + 3*LINK_SIZE;
-
+
/* When actually compiling, we need to check whether this was a forward
reference, and if so, adjust the offset. */
-
+
if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
{
int offset = GET(cd->hwm, -LINK_SIZE);
if (offset == previous + 1 - cd->start_code)
- PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
- }
- }
-
+ PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
+ }
+ }
+
/* Now handle repetition for the different types of item. */
/* If previous was a character match, abolish the item and generate a
@@ -4575,32 +4575,32 @@ for (;; ptr++)
opcodes such as BRA and CBRA, as this is the place where they get converted
into the more special varieties such as BRAPOS and SBRA. A test for >=
OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
- ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
+ ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
repetition of assertions, but now it does, for Perl compatibility. */
- else if (*previous >= OP_ASSERT && *previous <= OP_COND)
+ else if (*previous >= OP_ASSERT && *previous <= OP_COND)
{
register int i;
int len = (int)(code - previous);
uschar *bralink = NULL;
uschar *brazeroptr = NULL;
-
- /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
+
+ /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
we just ignore the repeat. */
if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
- goto END_REPEAT;
+ goto END_REPEAT;
- /* There is no sense in actually repeating assertions. The only potential
- use of repetition is in cases when the assertion is optional. Therefore,
- if the minimum is greater than zero, just ignore the repeat. If the
- maximum is not not zero or one, set it to 1. */
+ /* There is no sense in actually repeating assertions. The only potential
+ use of repetition is in cases when the assertion is optional. Therefore,
+ if the minimum is greater than zero, just ignore the repeat. If the
+ maximum is not zero or one, set it to 1. */
if (*previous < OP_ONCE) /* Assertion */
{
if (repeat_min > 0) goto END_REPEAT;
if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
- }
+ }
/* The case of a zero minimum is special because of the need to stick
OP_BRAZERO in front of it, and because the group appears once in the
@@ -4834,9 +4834,9 @@ for (;; ptr++)
{
uschar *ketcode = code - 1 - LINK_SIZE;
uschar *bracode = ketcode - GET(ketcode, 1);
-
- if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
- if (*bracode == OP_ONCE)
+
+ if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
+ if (*bracode == OP_ONCE)
*ketcode = OP_KETRMAX + repeat_type;
else
{
@@ -4887,12 +4887,12 @@ for (;; ptr++)
there are special alternative opcodes for this case. For anything else, we
wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
notation is just syntactic sugar, taken from Sun's Java package, but the
- special opcodes can optimize it.
-
+ special opcodes can optimize it.
+
Possessively repeated subpatterns have already been handled in the code
- just above, so possessive_quantifier is always FALSE for them at this
+ just above, so possessive_quantifier is always FALSE for them at this
stage.
-
+
Note that the repeated item starts at tempcode, not at previous, which
might be the first part of a string whose (former) last char we repeated.
@@ -4997,10 +4997,10 @@ for (;; ptr++)
previous = NULL;
while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
namelen = (int)(ptr - name);
-
+
/* It appears that Perl allows any characters whatsoever, other than
- a closing parenthesis, to appear in arguments, so we no longer insist on
- letters, digits, and underscores. */
+ a closing parenthesis, to appear in arguments, so we no longer insist on
+ letters, digits, and underscores. */
if (*ptr == CHAR_COLON)
{
@@ -5008,7 +5008,7 @@ for (;; ptr++)
while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
arglen = (int)(ptr - arg);
}
-
+
if (*ptr != CHAR_RIGHT_PARENTHESIS)
{
*errorcodeptr = ERR60;
@@ -5022,7 +5022,7 @@ for (;; ptr++)
if (namelen == verbs[i].len &&
strncmp((char *)name, vn, namelen) == 0)
{
- /* Check for open captures before ACCEPT and convert it to
+ /* Check for open captures before ACCEPT and convert it to
ASSERT_ACCEPT if in an assertion. */
if (verbs[i].op == OP_ACCEPT)
@@ -5032,7 +5032,7 @@ for (;; ptr++)
{
*errorcodeptr = ERR59;
goto FAILED;
- }
+ }
cd->had_accept = TRUE;
for (oc = cd->open_caps; oc != NULL; oc = oc->next)
{
@@ -5333,7 +5333,7 @@ for (;; ptr++)
/* ------------------------------------------------------------ */
case CHAR_EQUALS_SIGN: /* Positive lookahead */
bravalue = OP_ASSERT;
- cd->assert_depth += 1;
+ cd->assert_depth += 1;
ptr++;
break;
@@ -5348,7 +5348,7 @@ for (;; ptr++)
continue;
}
bravalue = OP_ASSERT_NOT;
- cd->assert_depth += 1;
+ cd->assert_depth += 1;
break;
@@ -5358,13 +5358,13 @@ for (;; ptr++)
{
case CHAR_EQUALS_SIGN: /* Positive lookbehind */
bravalue = OP_ASSERTBACK;
- cd->assert_depth += 1;
+ cd->assert_depth += 1;
ptr += 2;
break;
case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
bravalue = OP_ASSERTBACK_NOT;
- cd->assert_depth += 1;
+ cd->assert_depth += 1;
ptr += 2;
break;
@@ -5608,7 +5608,7 @@ for (;; ptr++)
temp = cd->end_pattern;
cd->end_pattern = ptr;
- recno = find_parens(cd, name, namelen,
+ recno = find_parens(cd, name, namelen,
(options & PCRE_EXTENDED) != 0, utf8);
cd->end_pattern = temp;
if (recno < 0) recno = 0; /* Forward ref; set dummy number */
@@ -5764,10 +5764,10 @@ for (;; ptr++)
/* If not a forward reference, and the subpattern is still open,
this is a recursive call. We check to see if this is a left
recursion that could loop for ever, and diagnose that case. We
- must not, however, do this check if we are in a conditional
+ must not, however, do this check if we are in a conditional
subpattern because the condition might be testing for recursion in
- a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
- Forever loops are also detected at runtime, so those that occur in
+ a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
+ Forever loops are also detected at runtime, so those that occur in
conditional subpatterns will be picked up then. */
else if (GET(called, 1) == 0 && cond_depth <= 0 &&
@@ -5779,7 +5779,7 @@ for (;; ptr++)
}
/* Insert the recursion/subroutine item. */
-
+
*code = OP_RECURSE;
PUT(code, 1, (int)(called - cd->start_code));
code += 1 + LINK_SIZE;
@@ -5903,12 +5903,12 @@ for (;; ptr++)
be able to pass its address because some compilers complain otherwise. */
previous = code; /* For handling repetition */
- *code = bravalue;
- tempcode = code;
+ *code = bravalue;
+ tempcode = code;
tempreqvary = cd->req_varyopt; /* Save value before bracket */
length_prevgroup = 0; /* Initialize for pre-compile phase */
-
- if (!compile_regex(
+
+ if (!compile_regex(
newoptions, /* The complete new option state */
&tempcode, /* Where to put code (updated) */
&ptr, /* Input pointer (updated) */
@@ -5917,7 +5917,7 @@ for (;; ptr++)
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
reset_bracount, /* True if (?| group */
skipbytes, /* Skip over bracket number */
- cond_depth +
+ cond_depth +
((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
&subfirstbyte, /* For possible first char */
&subreqbyte, /* For possible last char */
@@ -5927,9 +5927,9 @@ for (;; ptr++)
&length_prevgroup /* Pre-compile phase */
))
goto FAILED;
-
+
if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
- cd->assert_depth -= 1;
+ cd->assert_depth -= 1;
/* At the end of compiling, code is still pointing to the start of the
group, while tempcode has been updated to point past the end of the group
@@ -6177,14 +6177,14 @@ for (;; ptr++)
ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
{
*errorcodeptr = ERR69;
- break;
+ break;
}
is_recurse = FALSE;
terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
goto NAMED_REF_OR_RECURSE;
- }
+ }
/* Back references are handled specially; must disable firstbyte if
not set to cope with cases like (?=(\w+))\1: which would otherwise set
@@ -6382,7 +6382,7 @@ Arguments:
lookbehind TRUE if this is a lookbehind assertion
reset_bracount TRUE to reset the count for each branch
skipbytes skip this many bytes at start (for brackets and OP_COND)
- cond_depth depth of nesting for conditional subpatterns
+ cond_depth depth of nesting for conditional subpatterns
firstbyteptr place to put the first required character, or a negative number
reqbyteptr place to put the last required character, or a negative number
bcptr pointer to the chain of currently open branches
@@ -6396,7 +6396,7 @@ Returns: TRUE on success
static BOOL
compile_regex(int options, uschar **codeptr, const uschar **ptrptr,
int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
- int cond_depth, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
+ int cond_depth, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
compile_data *cd, int *lengthptr)
{
const uschar *ptr = *ptrptr;
@@ -6434,8 +6434,8 @@ pre-compile phase to find out whether anything has yet been compiled or not. */
/* If this is a capturing subpattern, add to the chain of open capturing items
so that we can detect them if (*ACCEPT) is encountered. This is also used to
-detect groups that contain recursive back references to themselves. Note that
-only OP_CBRA need be tested here; changing this opcode to one of its variants,
+detect groups that contain recursive back references to themselves. Note that
+only OP_CBRA need be tested here; changing this opcode to one of its variants,
e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
if (*code == OP_CBRA)
@@ -6476,7 +6476,7 @@ for (;;)
into the length. */
if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
- &branchreqbyte, &bc, cond_depth, cd,
+ &branchreqbyte, &bc, cond_depth, cd,
(lengthptr == NULL)? NULL : &length))
{
*ptrptr = ptr;
@@ -6910,7 +6910,7 @@ do {
case OP_EXACT:
scode += 2;
/* Fall through */
-
+
case OP_CHAR:
case OP_PLUS:
case OP_MINPLUS:
@@ -6923,7 +6923,7 @@ do {
case OP_EXACTI:
scode += 2;
/* Fall through */
-
+
case OP_CHARI:
case OP_PLUSI:
case OP_MINPLUSI:
@@ -7083,7 +7083,7 @@ utf8 = (options & PCRE_UTF8) != 0;
/* Can't support UTF8 unless PCRE has been compiled to include the code. The
return of an error code from _pcre_valid_utf8() is a new feature, introduced in
-release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
+release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
not used here. */
#ifdef SUPPORT_UTF8
@@ -7116,7 +7116,7 @@ if ((options & PCRE_UCP) != 0)
if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
{
- errorcode = ERR56;
+ errorcode = ERR56;
goto PCRE_EARLY_ERROR_RETURN;
}
@@ -7202,7 +7202,7 @@ outside can help speed up starting point checks. */
ptr += skipatstart;
code = cworkspace;
*code = OP_BRA;
-(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
+(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
FALSE, 0, 0, &firstbyte, &reqbyte, NULL, cd, &length);
if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index dbd52a1..ff3d5bd 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -540,7 +540,7 @@ else
{
int length = 1 + LINK_SIZE +
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
- *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
+ *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
2:0);
do
{
@@ -621,7 +621,7 @@ for (;;)
for (i = 0; i < active_count; i++)
{
stateblock *current_state = active_states + i;
- BOOL caseless = FALSE;
+ BOOL caseless = FALSE;
const uschar *code;
int state_offset = current_state->offset;
int count, codevalue, rrc;
@@ -738,11 +738,11 @@ for (;;)
/* ========================================================================== */
/* Reached a closing bracket. If not at the end of the pattern, carry
- on with the next opcode. For repeating opcodes, also add the repeat
- state. Note that KETRPOS will always be encountered at the end of the
- subpattern, because the possessive subpattern repeats are always handled
+ on with the next opcode. For repeating opcodes, also add the repeat
+ state. Note that KETRPOS will always be encountered at the end of the
+ subpattern, because the possessive subpattern repeats are always handled
using recursive calls. Thus, it never adds any new states.
-
+
At the end of the (sub)pattern, unless we have an empty string and
PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
start of the subject, save the match data, shifting up all previous
@@ -751,7 +751,7 @@ for (;;)
case OP_KET:
case OP_KETRMIN:
case OP_KETRMAX:
- case OP_KETRPOS:
+ case OP_KETRPOS:
if (code != end_code)
{
ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
@@ -2179,7 +2179,7 @@ for (;;)
checking (c) can be multibyte. */
case OP_NOTI:
- if (clen > 0 && c != d && c != fcc[d])
+ if (clen > 0 && c != d && c != fcc[d])
{ ADD_NEW(state_offset + dlen + 1, 0); }
break;
@@ -2192,7 +2192,7 @@ for (;;)
case OP_NOTPOSPLUSI:
caseless = TRUE;
codevalue -= OP_STARI - OP_STAR;
-
+
/* Fall through */
case OP_PLUS:
case OP_MINPLUS:
@@ -2560,7 +2560,7 @@ for (;;)
cb.capture_top = 1;
cb.capture_last = -1;
cb.callout_data = md->callout_data;
- cb.mark = NULL; /* No (*MARK) support */
+ cb.mark = NULL; /* No (*MARK) support */
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
}
if (rrc > 0) break; /* Fail this thread */
@@ -2587,7 +2587,7 @@ for (;;)
{
int value = GET2(code, LINK_SIZE+2);
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
- if (md->recursive != NULL)
+ if (md->recursive != NULL)
{ ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
}
@@ -2626,31 +2626,31 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_RECURSE:
{
- dfa_recursion_info *ri;
+ dfa_recursion_info *ri;
int local_offsets[1000];
int local_workspace[1000];
const uschar *callpat = start_code + GET(code, 1);
- int recno = (callpat == md->start_code)? 0 :
- GET2(callpat, 1 + LINK_SIZE);
+ int recno = (callpat == md->start_code)? 0 :
+ GET2(callpat, 1 + LINK_SIZE);
int rc;
DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
-
+
/* Check for repeating a recursion without advancing the subject
pointer. This should catch convoluted mutual recursions. (Some simple
cases are caught at compile time.) */
-
- for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
- if (recno == ri->group_num && ptr == ri->subject_position)
- return PCRE_ERROR_RECURSELOOP;
- /* Remember this recursion and where we started it so as to
+ for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
+ if (recno == ri->group_num && ptr == ri->subject_position)
+ return PCRE_ERROR_RECURSELOOP;
+
+ /* Remember this recursion and where we started it so as to
catch infinite loops. */
-
+
new_recursive.group_num = recno;
new_recursive.subject_position = ptr;
new_recursive.prevrec = md->recursive;
- md->recursive = &new_recursive;
+ md->recursive = &new_recursive;
rc = internal_dfa_exec(
md, /* fixed match data */
@@ -2665,7 +2665,7 @@ for (;;)
md->recursive = new_recursive.prevrec; /* Done this recursion */
- DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
+ DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
rc));
/* Ran out of internal offsets */
@@ -2703,27 +2703,27 @@ for (;;)
case OP_SBRAPOS:
case OP_CBRAPOS:
case OP_SCBRAPOS:
- case OP_BRAPOSZERO:
+ case OP_BRAPOSZERO:
{
int charcount, matched_count;
const uschar *local_ptr = ptr;
BOOL allow_zero;
-
+
if (codevalue == OP_BRAPOSZERO)
{
allow_zero = TRUE;
codevalue = *(++code); /* Codevalue will be one of above BRAs */
}
- else allow_zero = FALSE;
-
- /* Loop to match the subpattern as many times as possible as if it were
- a complete pattern. */
-
+ else allow_zero = FALSE;
+
+ /* Loop to match the subpattern as many times as possible as if it were
+ a complete pattern. */
+
for (matched_count = 0;; matched_count++)
{
int local_offsets[2];
int local_workspace[1000];
-
+
int rc = internal_dfa_exec(
md, /* fixed match data */
code, /* this subexpression's code */
@@ -2734,31 +2734,31 @@ for (;;)
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
rlevel); /* function recursion level */
-
+
/* Failed to match */
-
- if (rc < 0)
+
+ if (rc < 0)
{
if (rc != PCRE_ERROR_NOMATCH) return rc;
break;
- }
-
+ }
+
/* Matched: break the loop if zero characters matched. */
-
+
charcount = local_offsets[1] - local_offsets[0];
- if (charcount == 0) break;
+ if (charcount == 0) break;
local_ptr += charcount; /* Advance temporary position ptr */
- }
+ }
/* At this point we have matched the subpattern matched_count
- times, and local_ptr is pointing to the character after the end of the
- last match. */
+ times, and local_ptr is pointing to the character after the end of the
+ last match. */
if (matched_count > 0 || allow_zero)
- {
+ {
const uschar *end_subpattern = code;
int next_state_offset;
-
+
do { end_subpattern += GET(end_subpattern, 1); }
while (*end_subpattern == OP_ALT);
next_state_offset =
@@ -2779,14 +2779,14 @@ for (;;)
{
const uschar *p = ptr;
const uschar *pp = local_ptr;
- charcount = pp - p;
+ charcount = pp - p;
while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
}
- }
- }
+ }
+ }
break;
-
+
/*-----------------------------------------------------------------*/
case OP_ONCE:
{
@@ -2892,7 +2892,7 @@ for (;;)
cb.capture_top = 1;
cb.capture_last = -1;
cb.callout_data = md->callout_data;
- cb.mark = NULL; /* No (*MARK) support */
+ cb.mark = NULL; /* No (*MARK) support */
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
}
if (rrc == 0)
@@ -3143,7 +3143,7 @@ back the character offset. */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
- int erroroffset;
+ int erroroffset;
int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
if (errorcode != 0)
{
@@ -3151,12 +3151,12 @@ if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
offsets[0] = erroroffset;
offsets[1] = errorcode;
- }
+ }
return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
- }
+ }
if (start_offset > 0 && start_offset < length &&
- (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
+ (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
return PCRE_ERROR_BADUTF8_OFFSET;
}
#endif
@@ -3395,7 +3395,7 @@ for (;;)
/* OK, now we can do the business */
md->start_used_ptr = current_subject;
- md->recursive = NULL;
+ md->recursive = NULL;
rc = internal_dfa_exec(
md, /* fixed match data */
diff --git a/pcre_exec.c b/pcre_exec.c
index a09d78c..b1ab387 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -1082,7 +1082,7 @@ for (;;)
cb.capture_top = offset_top/2;
cb.capture_last = md->capture_last;
cb.callout_data = md->callout_data;
- cb.mark = markptr;
+ cb.mark = markptr;
if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
@@ -1477,7 +1477,7 @@ for (;;)
cb.capture_top = offset_top/2;
cb.capture_last = md->capture_last;
cb.callout_data = md->callout_data;
- cb.mark = markptr;
+ cb.mark = markptr;
if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
@@ -1505,17 +1505,17 @@ for (;;)
{
recursion_info *ri;
int recno;
-
+
callpat = md->start_code + GET(ecode, 1);
recno = (callpat == md->start_code)? 0 :
- GET2(callpat, 1 + LINK_SIZE);
-
- /* Check for repeating a recursion without advancing the subject pointer.
+ GET2(callpat, 1 + LINK_SIZE);
+
+ /* Check for repeating a recursion without advancing the subject pointer.
This should catch convoluted mutual recursions. (Some simple cases are
- caught at compile time.) */
-
+ caught at compile time.) */
+
for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
- if (recno == ri->group_num && eptr == ri->subject_position)
+ if (recno == ri->group_num && eptr == ri->subject_position)
RRETURN(PCRE_ERROR_RECURSELOOP);
/* Add to "recursing stack" */
@@ -2033,7 +2033,7 @@ for (;;)
SCHECK_PARTIAL();
MRRETURN(MATCH_NOMATCH);
}
- eptr++;
+ eptr++;
ecode++;
break;
@@ -5182,11 +5182,11 @@ for (;;)
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
- else
+ else
{
eptr = md->end_subject; /* Unlimited UTF-8 repeat */
SCHECK_PARTIAL();
- }
+ }
break;
/* The byte case is the same as non-UTF8 */
@@ -6388,7 +6388,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
the pattern to -1 for backwards compatibility. It is documented that this
happens. In earlier versions, the whole set of potential capturing offsets
was set to -1 each time round the loop, but this is handled differently now.
- "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
+ "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
those at the end that need unsetting here. We can't just unset them all at
the start of the whole thing because they may get set in one branch that is
not the final matching branch. */
diff --git a/pcre_internal.h b/pcre_internal.h
index 7be3628..7f35828 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -1741,7 +1741,7 @@ typedef struct compile_data {
int final_bracount; /* Saved value after first pass */
int top_backref; /* Maximum back reference */
unsigned int backref_map; /* Bitmap of low back refs */
- int assert_depth; /* Depth of nested assertions */
+ int assert_depth; /* Depth of nested assertions */
int external_options; /* External (initial) options */
int external_flags; /* External flag bits to be set */
int req_varyopt; /* "After variable item" flag for reqbyte */
@@ -1768,7 +1768,7 @@ typedef struct recursion_info {
int group_num; /* Number of group that was called */
int *offset_save; /* Pointer to start of saved offsets */
int saved_max; /* Number of saved offsets */
- USPTR subject_position; /* Position at start of recursion */
+ USPTR subject_position; /* Position at start of recursion */
} recursion_info;
/* A similar structure for pcre_dfa_exec(). */
@@ -1781,7 +1781,7 @@ typedef struct dfa_recursion_info {
/* Structure for building a chain of data for holding the values of the subject
pointer at the start of each subpattern, so as to detect when an empty string
-has been matched by a subpattern - to break infinite loops; used by
+has been matched by a subpattern - to break infinite loops; used by
pcre_exec(). */
typedef struct eptrblock {
@@ -1835,7 +1835,7 @@ typedef struct match_data {
recursion_info *recursive; /* Linked list of recursion data */
void *callout_data; /* To pass back to callouts */
const uschar *mark; /* Mark pointer to pass back */
- const uschar *once_target; /* Where to back up to for atomic groups */
+ const uschar *once_target; /* Where to back up to for atomic groups */
} match_data;
/* A similar structure is used for the same purpose by the DFA matching
@@ -1854,7 +1854,7 @@ typedef struct dfa_match_data {
int nllen; /* Newline string length */
uschar nl[4]; /* Newline string when fixed */
void *callout_data; /* To pass back to callouts */
- dfa_recursion_info *recursive; /* Linked list of recursion data */
+ dfa_recursion_info *recursive; /* Linked list of recursion data */
} dfa_match_data;
/* Bit definitions for entries in the pcre_ctypes table. */
diff --git a/pcre_printint.src b/pcre_printint.src
index 73d9fce..3afcafc 100644
--- a/pcre_printint.src
+++ b/pcre_printint.src
@@ -180,7 +180,7 @@ utf8 = (options & PCRE_UTF8) != 0;
for(;;)
{
uschar *ccode;
- const char *flag = " ";
+ const char *flag = " ";
int c;
int extra = 0;
@@ -238,21 +238,21 @@ for(;;)
continue;
case OP_CBRA:
- case OP_CBRAPOS:
+ case OP_CBRAPOS:
case OP_SCBRA:
- case OP_SCBRAPOS:
+ case OP_SCBRAPOS:
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
else fprintf(f, " ");
fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
break;
case OP_BRA:
- case OP_BRAPOS:
+ case OP_BRAPOS:
case OP_SBRA:
- case OP_SBRAPOS:
+ case OP_SBRAPOS:
case OP_KETRMAX:
case OP_KETRMIN:
- case OP_KETRPOS:
+ case OP_KETRPOS:
case OP_ALT:
case OP_KET:
case OP_ASSERT:
@@ -307,7 +307,7 @@ for(;;)
case OP_MINQUERYI:
case OP_POSQUERYI:
flag = "/i";
- /* Fall through */
+ /* Fall through */
case OP_STAR:
case OP_MINSTAR:
case OP_POSSTAR:
@@ -345,7 +345,7 @@ for(;;)
case OP_MINUPTOI:
case OP_POSUPTOI:
flag = "/i";
- /* Fall through */
+ /* Fall through */
case OP_EXACT:
case OP_UPTO:
case OP_MINUPTO:
@@ -378,7 +378,7 @@ for(;;)
case OP_NOTI:
flag = "/i";
- /* Fall through */
+ /* Fall through */
case OP_NOT:
c = code[1];
if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
@@ -395,7 +395,7 @@ for(;;)
case OP_NOTMINQUERYI:
case OP_NOTPOSQUERYI:
flag = "/i";
- /* Fall through */
+ /* Fall through */
case OP_NOTSTAR:
case OP_NOTMINSTAR:
@@ -417,7 +417,7 @@ for(;;)
case OP_NOTMINUPTOI:
case OP_NOTPOSUPTOI:
flag = "/i";
- /* Fall through */
+ /* Fall through */
case OP_NOTEXACT:
case OP_NOTUPTO:
@@ -429,7 +429,7 @@ for(;;)
if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?");
- else
+ else
if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+");
break;
@@ -441,7 +441,7 @@ for(;;)
case OP_REFI:
flag = "/i";
- /* Fall through */
+ /* Fall through */
case OP_REF:
fprintf(f, " %s \\%d", flag, GET2(code,1));
ccode = code + _pcre_OP_lengths[*code];
@@ -601,12 +601,12 @@ for(;;)
fprintf(f, " %s %s", OP_names[*code], code + 2 + LINK_SIZE);
extra += code[1+LINK_SIZE];
break;
-
+
case OP_CIRCM:
case OP_DOLLM:
flag = "/m";
- /* Fall through */
-
+ /* Fall through */
+
/* Anything else is just an item with no data, but possibly a flag. */
default:
diff --git a/pcre_study.c b/pcre_study.c
index e41d94b..ed2efe5 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -70,7 +70,7 @@ Arguments:
startcode pointer to start of the whole pattern
options the compiling options
had_accept pointer to flag for (*ACCEPT) encountered
- int RECURSE depth
+ recurse_depth RECURSE depth
Returns: the minimum length
-1 if \C was encountered
@@ -131,7 +131,7 @@ for (;;)
d = find_minlength(cc, startcode, options, had_accept_ptr, recurse_depth);
if (d < 0) return d;
branchlength += d;
- if (*had_accept_ptr) return branchlength;
+ if (*had_accept_ptr) return branchlength;
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
@@ -142,10 +142,10 @@ for (;;)
ACCEPT, it is essentially the same as END, but we set a flag so that
counting stops. */
- case OP_ACCEPT:
- case OP_ASSERT_ACCEPT:
+ case OP_ACCEPT:
+ case OP_ASSERT_ACCEPT:
*had_accept_ptr = TRUE;
- /* Fall through */
+ /* Fall through */
case OP_ALT:
case OP_KET:
case OP_KETRMAX:
@@ -277,7 +277,7 @@ for (;;)
cc++;
break;
- /* "Any newline" might match two characters, but it also might match just
+ /* "Any newline" might match two characters, but it also might match just
one. */
case OP_ANYNL:
@@ -377,12 +377,12 @@ for (;;)
d = 0;
had_recurse = TRUE;
}
- else
+ else
{
- d = find_minlength(cs, startcode, options, had_accept_ptr,
+ d = find_minlength(cs, startcode, options, had_accept_ptr,
recurse_depth);
- *had_accept_ptr = FALSE;
- }
+ *had_accept_ptr = FALSE;
+ }
}
else d = 0;
cc += 3;
@@ -418,9 +418,9 @@ for (;;)
branchlength += min * d;
break;
-
- /* We can easily detect direct recursion, but not mutual recursion. This is
- caught by a recursion depth count. */
+
+ /* We can easily detect direct recursion, but not mutual recursion. This is
+ caught by a recursion depth count. */
case OP_RECURSE:
cs = ce = (uschar *)startcode + GET(cc, 1);
@@ -429,11 +429,11 @@ for (;;)
if ((cc > cs && cc < ce) || recurse_depth > 10)
had_recurse = TRUE;
else
- {
+ {
branchlength += find_minlength(cs, startcode, options, had_accept_ptr,
recurse_depth + 1);
*had_accept_ptr = FALSE;
- }
+ }
cc += 1 + LINK_SIZE;
break;
@@ -501,7 +501,7 @@ for (;;)
case OP_THEN_ARG:
cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
break;
-
+
/* The remaining opcodes are just skipped over. */
case OP_CLOSE:
@@ -722,22 +722,22 @@ do
/* Fail for a valid opcode that implies no starting bits. */
case OP_ACCEPT:
- case OP_ASSERT_ACCEPT:
+ case OP_ASSERT_ACCEPT:
case OP_ALLANY:
case OP_ANY:
case OP_ANYBYTE:
case OP_CIRC:
- case OP_CIRCM:
+ case OP_CIRCM:
case OP_CLOSE:
case OP_COMMIT:
case OP_COND:
- case OP_CREF:
+ case OP_CREF:
case OP_DEF:
case OP_DOLL:
- case OP_DOLLM:
+ case OP_DOLLM:
case OP_END:
case OP_EOD:
- case OP_EODN:
+ case OP_EODN:
case OP_EXTUNI:
case OP_FAIL:
case OP_MARK:
@@ -745,7 +745,7 @@ do
case OP_NOT:
case OP_NOTEXACT:
case OP_NOTEXACTI:
- case OP_NOTI:
+ case OP_NOTI:
case OP_NOTMINPLUS:
case OP_NOTMINPLUSI:
case OP_NOTMINQUERY:
@@ -783,7 +783,7 @@ do
case OP_REFI:
case OP_REVERSE:
case OP_RREF:
- case OP_SCOND:
+ case OP_SCOND:
case OP_SET_SOM:
case OP_SKIP:
case OP_SKIP_ARG:
@@ -1160,7 +1160,7 @@ do
for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
}
- /* Advance past the bit map, and act on what follows. For a zero
+ /* Advance past the bit map, and act on what follows. For a zero
minimum repeat, continue; otherwise stop processing. */
tcode += 32;
@@ -1178,7 +1178,7 @@ do
if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
else try_next = FALSE;
break;
-
+
default:
try_next = FALSE;
break;
diff --git a/pcre_tables.c b/pcre_tables.c
index 7c07686..e3e6dc1 100644
--- a/pcre_tables.c
+++ b/pcre_tables.c
@@ -256,7 +256,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Zp0 STR_Z STR_p "\0"
#define STRING_Zs0 STR_Z STR_s "\0"
-const char _pcre_utt_names[] =
+const char _pcre_utt_names[] =
STRING_Any0
STRING_Arabic0
STRING_Armenian0
@@ -396,145 +396,145 @@ const char _pcre_utt_names[] =
STRING_Zp0
STRING_Zs0;
-const ucp_type_table _pcre_utt[] = {
- { 0, PT_ANY, 0 },
- { 4, PT_SC, ucp_Arabic },
- { 11, PT_SC, ucp_Armenian },
- { 20, PT_SC, ucp_Avestan },
- { 28, PT_SC, ucp_Balinese },
- { 37, PT_SC, ucp_Bamum },
- { 43, PT_SC, ucp_Batak },
- { 49, PT_SC, ucp_Bengali },
- { 57, PT_SC, ucp_Bopomofo },
- { 66, PT_SC, ucp_Brahmi },
- { 73, PT_SC, ucp_Braille },
- { 81, PT_SC, ucp_Buginese },
- { 90, PT_SC, ucp_Buhid },
- { 96, PT_GC, ucp_C },
- { 98, PT_SC, ucp_Canadian_Aboriginal },
- { 118, PT_SC, ucp_Carian },
- { 125, PT_PC, ucp_Cc },
- { 128, PT_PC, ucp_Cf },
- { 131, PT_SC, ucp_Cham },
- { 136, PT_SC, ucp_Cherokee },
- { 145, PT_PC, ucp_Cn },
- { 148, PT_PC, ucp_Co },
- { 151, PT_SC, ucp_Common },
- { 158, PT_SC, ucp_Coptic },
- { 165, PT_PC, ucp_Cs },
- { 168, PT_SC, ucp_Cuneiform },
- { 178, PT_SC, ucp_Cypriot },
- { 186, PT_SC, ucp_Cyrillic },
- { 195, PT_SC, ucp_Deseret },
- { 203, PT_SC, ucp_Devanagari },
- { 214, PT_SC, ucp_Egyptian_Hieroglyphs },
- { 235, PT_SC, ucp_Ethiopic },
- { 244, PT_SC, ucp_Georgian },
- { 253, PT_SC, ucp_Glagolitic },
- { 264, PT_SC, ucp_Gothic },
- { 271, PT_SC, ucp_Greek },
- { 277, PT_SC, ucp_Gujarati },
- { 286, PT_SC, ucp_Gurmukhi },
- { 295, PT_SC, ucp_Han },
- { 299, PT_SC, ucp_Hangul },
- { 306, PT_SC, ucp_Hanunoo },
- { 314, PT_SC, ucp_Hebrew },
- { 321, PT_SC, ucp_Hiragana },
- { 330, PT_SC, ucp_Imperial_Aramaic },
- { 347, PT_SC, ucp_Inherited },
- { 357, PT_SC, ucp_Inscriptional_Pahlavi },
- { 379, PT_SC, ucp_Inscriptional_Parthian },
- { 402, PT_SC, ucp_Javanese },
- { 411, PT_SC, ucp_Kaithi },
- { 418, PT_SC, ucp_Kannada },
- { 426, PT_SC, ucp_Katakana },
- { 435, PT_SC, ucp_Kayah_Li },
- { 444, PT_SC, ucp_Kharoshthi },
- { 455, PT_SC, ucp_Khmer },
- { 461, PT_GC, ucp_L },
- { 463, PT_LAMP, 0 },
- { 466, PT_SC, ucp_Lao },
- { 470, PT_SC, ucp_Latin },
- { 476, PT_SC, ucp_Lepcha },
- { 483, PT_SC, ucp_Limbu },
- { 489, PT_SC, ucp_Linear_B },
- { 498, PT_SC, ucp_Lisu },
- { 503, PT_PC, ucp_Ll },
- { 506, PT_PC, ucp_Lm },
- { 509, PT_PC, ucp_Lo },
- { 512, PT_PC, ucp_Lt },
- { 515, PT_PC, ucp_Lu },
- { 518, PT_SC, ucp_Lycian },
- { 525, PT_SC, ucp_Lydian },
- { 532, PT_GC, ucp_M },
- { 534, PT_SC, ucp_Malayalam },
- { 544, PT_SC, ucp_Mandaic },
- { 552, PT_PC, ucp_Mc },
- { 555, PT_PC, ucp_Me },
- { 558, PT_SC, ucp_Meetei_Mayek },
- { 571, PT_PC, ucp_Mn },
- { 574, PT_SC, ucp_Mongolian },
- { 584, PT_SC, ucp_Myanmar },
- { 592, PT_GC, ucp_N },
- { 594, PT_PC, ucp_Nd },
- { 597, PT_SC, ucp_New_Tai_Lue },
- { 609, PT_SC, ucp_Nko },
- { 613, PT_PC, ucp_Nl },
- { 616, PT_PC, ucp_No },
- { 619, PT_SC, ucp_Ogham },
- { 625, PT_SC, ucp_Ol_Chiki },
- { 634, PT_SC, ucp_Old_Italic },
- { 645, PT_SC, ucp_Old_Persian },
- { 657, PT_SC, ucp_Old_South_Arabian },
- { 675, PT_SC, ucp_Old_Turkic },
- { 686, PT_SC, ucp_Oriya },
- { 692, PT_SC, ucp_Osmanya },
- { 700, PT_GC, ucp_P },
- { 702, PT_PC, ucp_Pc },
- { 705, PT_PC, ucp_Pd },
- { 708, PT_PC, ucp_Pe },
- { 711, PT_PC, ucp_Pf },
- { 714, PT_SC, ucp_Phags_Pa },
- { 723, PT_SC, ucp_Phoenician },
- { 734, PT_PC, ucp_Pi },
- { 737, PT_PC, ucp_Po },
- { 740, PT_PC, ucp_Ps },
- { 743, PT_SC, ucp_Rejang },
- { 750, PT_SC, ucp_Runic },
- { 756, PT_GC, ucp_S },
- { 758, PT_SC, ucp_Samaritan },
- { 768, PT_SC, ucp_Saurashtra },
- { 779, PT_PC, ucp_Sc },
- { 782, PT_SC, ucp_Shavian },
- { 790, PT_SC, ucp_Sinhala },
- { 798, PT_PC, ucp_Sk },
- { 801, PT_PC, ucp_Sm },
- { 804, PT_PC, ucp_So },
- { 807, PT_SC, ucp_Sundanese },
- { 817, PT_SC, ucp_Syloti_Nagri },
- { 830, PT_SC, ucp_Syriac },
- { 837, PT_SC, ucp_Tagalog },
- { 845, PT_SC, ucp_Tagbanwa },
- { 854, PT_SC, ucp_Tai_Le },
- { 861, PT_SC, ucp_Tai_Tham },
- { 870, PT_SC, ucp_Tai_Viet },
- { 879, PT_SC, ucp_Tamil },
- { 885, PT_SC, ucp_Telugu },
- { 892, PT_SC, ucp_Thaana },
- { 899, PT_SC, ucp_Thai },
- { 904, PT_SC, ucp_Tibetan },
- { 912, PT_SC, ucp_Tifinagh },
- { 921, PT_SC, ucp_Ugaritic },
- { 930, PT_SC, ucp_Vai },
- { 934, PT_ALNUM, 0 },
- { 938, PT_PXSPACE, 0 },
- { 942, PT_SPACE, 0 },
- { 946, PT_WORD, 0 },
- { 950, PT_SC, ucp_Yi },
- { 953, PT_GC, ucp_Z },
- { 955, PT_PC, ucp_Zl },
- { 958, PT_PC, ucp_Zp },
- { 961, PT_PC, ucp_Zs }
+const ucp_type_table _pcre_utt[] = {
+ { 0, PT_ANY, 0 },
+ { 4, PT_SC, ucp_Arabic },
+ { 11, PT_SC, ucp_Armenian },
+ { 20, PT_SC, ucp_Avestan },
+ { 28, PT_SC, ucp_Balinese },
+ { 37, PT_SC, ucp_Bamum },
+ { 43, PT_SC, ucp_Batak },
+ { 49, PT_SC, ucp_Bengali },
+ { 57, PT_SC, ucp_Bopomofo },
+ { 66, PT_SC, ucp_Brahmi },
+ { 73, PT_SC, ucp_Braille },
+ { 81, PT_SC, ucp_Buginese },
+ { 90, PT_SC, ucp_Buhid },
+ { 96, PT_GC, ucp_C },
+ { 98, PT_SC, ucp_Canadian_Aboriginal },
+ { 118, PT_SC, ucp_Carian },
+ { 125, PT_PC, ucp_Cc },
+ { 128, PT_PC, ucp_Cf },
+ { 131, PT_SC, ucp_Cham },
+ { 136, PT_SC, ucp_Cherokee },
+ { 145, PT_PC, ucp_Cn },
+ { 148, PT_PC, ucp_Co },
+ { 151, PT_SC, ucp_Common },
+ { 158, PT_SC, ucp_Coptic },
+ { 165, PT_PC, ucp_Cs },
+ { 168, PT_SC, ucp_Cuneiform },
+ { 178, PT_SC, ucp_Cypriot },
+ { 186, PT_SC, ucp_Cyrillic },
+ { 195, PT_SC, ucp_Deseret },
+ { 203, PT_SC, ucp_Devanagari },
+ { 214, PT_SC, ucp_Egyptian_Hieroglyphs },
+ { 235, PT_SC, ucp_Ethiopic },
+ { 244, PT_SC, ucp_Georgian },
+ { 253, PT_SC, ucp_Glagolitic },
+ { 264, PT_SC, ucp_Gothic },
+ { 271, PT_SC, ucp_Greek },
+ { 277, PT_SC, ucp_Gujarati },
+ { 286, PT_SC, ucp_Gurmukhi },
+ { 295, PT_SC, ucp_Han },
+ { 299, PT_SC, ucp_Hangul },
+ { 306, PT_SC, ucp_Hanunoo },
+ { 314, PT_SC, ucp_Hebrew },
+ { 321, PT_SC, ucp_Hiragana },
+ { 330, PT_SC, ucp_Imperial_Aramaic },
+ { 347, PT_SC, ucp_Inherited },
+ { 357, PT_SC, ucp_Inscriptional_Pahlavi },
+ { 379, PT_SC, ucp_Inscriptional_Parthian },
+ { 402, PT_SC, ucp_Javanese },
+ { 411, PT_SC, ucp_Kaithi },
+ { 418, PT_SC, ucp_Kannada },
+ { 426, PT_SC, ucp_Katakana },
+ { 435, PT_SC, ucp_Kayah_Li },
+ { 444, PT_SC, ucp_Kharoshthi },
+ { 455, PT_SC, ucp_Khmer },
+ { 461, PT_GC, ucp_L },
+ { 463, PT_LAMP, 0 },
+ { 466, PT_SC, ucp_Lao },
+ { 470, PT_SC, ucp_Latin },
+ { 476, PT_SC, ucp_Lepcha },
+ { 483, PT_SC, ucp_Limbu },
+ { 489, PT_SC, ucp_Linear_B },
+ { 498, PT_SC, ucp_Lisu },
+ { 503, PT_PC, ucp_Ll },
+ { 506, PT_PC, ucp_Lm },
+ { 509, PT_PC, ucp_Lo },
+ { 512, PT_PC, ucp_Lt },
+ { 515, PT_PC, ucp_Lu },
+ { 518, PT_SC, ucp_Lycian },
+ { 525, PT_SC, ucp_Lydian },
+ { 532, PT_GC, ucp_M },
+ { 534, PT_SC, ucp_Malayalam },
+ { 544, PT_SC, ucp_Mandaic },
+ { 552, PT_PC, ucp_Mc },
+ { 555, PT_PC, ucp_Me },
+ { 558, PT_SC, ucp_Meetei_Mayek },
+ { 571, PT_PC, ucp_Mn },
+ { 574, PT_SC, ucp_Mongolian },
+ { 584, PT_SC, ucp_Myanmar },
+ { 592, PT_GC, ucp_N },
+ { 594, PT_PC, ucp_Nd },
+ { 597, PT_SC, ucp_New_Tai_Lue },
+ { 609, PT_SC, ucp_Nko },
+ { 613, PT_PC, ucp_Nl },
+ { 616, PT_PC, ucp_No },
+ { 619, PT_SC, ucp_Ogham },
+ { 625, PT_SC, ucp_Ol_Chiki },
+ { 634, PT_SC, ucp_Old_Italic },
+ { 645, PT_SC, ucp_Old_Persian },
+ { 657, PT_SC, ucp_Old_South_Arabian },
+ { 675, PT_SC, ucp_Old_Turkic },
+ { 686, PT_SC, ucp_Oriya },
+ { 692, PT_SC, ucp_Osmanya },
+ { 700, PT_GC, ucp_P },
+ { 702, PT_PC, ucp_Pc },
+ { 705, PT_PC, ucp_Pd },
+ { 708, PT_PC, ucp_Pe },
+ { 711, PT_PC, ucp_Pf },
+ { 714, PT_SC, ucp_Phags_Pa },
+ { 723, PT_SC, ucp_Phoenician },
+ { 734, PT_PC, ucp_Pi },
+ { 737, PT_PC, ucp_Po },
+ { 740, PT_PC, ucp_Ps },
+ { 743, PT_SC, ucp_Rejang },
+ { 750, PT_SC, ucp_Runic },
+ { 756, PT_GC, ucp_S },
+ { 758, PT_SC, ucp_Samaritan },
+ { 768, PT_SC, ucp_Saurashtra },
+ { 779, PT_PC, ucp_Sc },
+ { 782, PT_SC, ucp_Shavian },
+ { 790, PT_SC, ucp_Sinhala },
+ { 798, PT_PC, ucp_Sk },
+ { 801, PT_PC, ucp_Sm },
+ { 804, PT_PC, ucp_So },
+ { 807, PT_SC, ucp_Sundanese },
+ { 817, PT_SC, ucp_Syloti_Nagri },
+ { 830, PT_SC, ucp_Syriac },
+ { 837, PT_SC, ucp_Tagalog },
+ { 845, PT_SC, ucp_Tagbanwa },
+ { 854, PT_SC, ucp_Tai_Le },
+ { 861, PT_SC, ucp_Tai_Tham },
+ { 870, PT_SC, ucp_Tai_Viet },
+ { 879, PT_SC, ucp_Tamil },
+ { 885, PT_SC, ucp_Telugu },
+ { 892, PT_SC, ucp_Thaana },
+ { 899, PT_SC, ucp_Thai },
+ { 904, PT_SC, ucp_Tibetan },
+ { 912, PT_SC, ucp_Tifinagh },
+ { 921, PT_SC, ucp_Ugaritic },
+ { 930, PT_SC, ucp_Vai },
+ { 934, PT_ALNUM, 0 },
+ { 938, PT_PXSPACE, 0 },
+ { 942, PT_SPACE, 0 },
+ { 946, PT_WORD, 0 },
+ { 950, PT_SC, ucp_Yi },
+ { 953, PT_GC, ucp_Z },
+ { 955, PT_PC, ucp_Zl },
+ { 958, PT_PC, ucp_Zp },
+ { 961, PT_PC, ucp_Zs }
};
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
diff --git a/pcre_valid_utf8.c b/pcre_valid_utf8.c
index 64d73e1..fef6538 100644
--- a/pcre_valid_utf8.c
+++ b/pcre_valid_utf8.c
@@ -64,10 +64,10 @@ the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
the canonical format. Once somebody had pointed out RFC 3629 to me (it
obsoletes 2279), additional restrictions were applied. The values are now
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
-subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
+subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
characters is still checked.
-From release 8.13 more information about the details of the error is passed
+From release 8.13 more information about the details of the error is passed
back in the returned value:
PCRE_UTF8_ERR0 No error
@@ -96,7 +96,7 @@ PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
Arguments:
string points to the string
length length of string, or -1 if the string is zero-terminated
- errp pointer to an error position offset variable
+ errp pointer to an error position offset variable
Returns: = 0 if the string is a valid UTF-8 string
> 0 otherwise, setting the offset of the bad character
@@ -117,39 +117,39 @@ if (length < 0)
for (p = string; length-- > 0; p++)
{
register int ab, c, d;
-
+
c = *p;
if (c < 128) continue; /* ASCII character */
-
+
if (c < 0xc0) /* Isolated 10xx xxxx byte */
{
*erroroffset = p - string;
- return PCRE_UTF8_ERR20;
- }
+ return PCRE_UTF8_ERR20;
+ }
if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
{
*erroroffset = p - string;
- return PCRE_UTF8_ERR21;
- }
-
+ return PCRE_UTF8_ERR21;
+ }
+
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
- if (length < ab)
+ if (length < ab)
{
*erroroffset = p - string; /* Missing bytes */
return ab - length; /* Codes ERR1 to ERR5 */
- }
+ }
length -= ab; /* Length remaining */
/* Check top bits in the second byte */
-
- if (((d = *(++p)) & 0xc0) != 0x80)
+
+ if (((d = *(++p)) & 0xc0) != 0x80)
{
*erroroffset = p - string - 1;
- return PCRE_UTF8_ERR6;
- }
+ return PCRE_UTF8_ERR6;
+ }
- /* For each length, check that the remaining bytes start with the 0x80 bit
+ /* For each length, check that the remaining bytes start with the 0x80 bit
set and not the 0x40 bit. Then check for an overlong sequence, and for the
excluded range 0xd800 to 0xdfff. */
@@ -157,92 +157,92 @@ for (p = string; length-- > 0; p++)
{
/* 2-byte character. No further bytes to check for 0x80. Check first byte
for xx00 000x (overlong sequence). */
-
- case 1: if ((c & 0x3e) == 0)
+
+ case 1: if ((c & 0x3e) == 0)
{
- *erroroffset = p - string - 1;
- return PCRE_UTF8_ERR15;
- }
- break;
+ *erroroffset = p - string - 1;
+ return PCRE_UTF8_ERR15;
+ }
+ break;
- /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
+ /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
for 1110 0000, xx0x xxxx (overlong sequence) or
1110 1101, 1010 xxxx (0xd800 - 0xdfff) */
-
+
case 2:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
- *erroroffset = p - string - 2;
+ *erroroffset = p - string - 2;
return PCRE_UTF8_ERR7;
- }
+ }
if (c == 0xe0 && (d & 0x20) == 0)
{
*erroroffset = p - string - 2;
- return PCRE_UTF8_ERR16;
- }
+ return PCRE_UTF8_ERR16;
+ }
if (c == 0xed && d >= 0xa0)
{
*erroroffset = p - string - 2;
- return PCRE_UTF8_ERR14;
- }
+ return PCRE_UTF8_ERR14;
+ }
break;
/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
character greater than 0x0010ffff (f4 8f bf bf) */
-
+
case 3:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
- *erroroffset = p - string - 2;
+ *erroroffset = p - string - 2;
return PCRE_UTF8_ERR7;
- }
+ }
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
- *erroroffset = p - string - 3;
+ *erroroffset = p - string - 3;
return PCRE_UTF8_ERR8;
- }
+ }
if (c == 0xf0 && (d & 0x30) == 0)
{
*erroroffset = p - string - 3;
- return PCRE_UTF8_ERR17;
- }
+ return PCRE_UTF8_ERR17;
+ }
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
{
*erroroffset = p - string - 3;
- return PCRE_UTF8_ERR13;
+ return PCRE_UTF8_ERR13;
}
break;
/* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
- rejected by the length test below. However, we do the appropriate tests
+ rejected by the length test below. However, we do the appropriate tests
here so that overlong sequences get diagnosed, and also in case there is
- ever an option for handling these larger code points. */
+ ever an option for handling these larger code points. */
/* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
- 1111 1000, xx00 0xxx */
-
- case 4:
+ 1111 1000, xx00 0xxx */
+
+ case 4:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
- *erroroffset = p - string - 2;
+ *erroroffset = p - string - 2;
return PCRE_UTF8_ERR7;
- }
+ }
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
- *erroroffset = p - string - 3;
+ *erroroffset = p - string - 3;
return PCRE_UTF8_ERR8;
- }
+ }
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
- *erroroffset = p - string - 4;
+ *erroroffset = p - string - 4;
return PCRE_UTF8_ERR9;
- }
- if (c == 0xf8 && (d & 0x38) == 0)
+ }
+ if (c == 0xf8 && (d & 0x38) == 0)
{
*erroroffset = p - string - 4;
- return PCRE_UTF8_ERR18;
- }
+ return PCRE_UTF8_ERR18;
+ }
break;
/* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
@@ -251,43 +251,43 @@ for (p = string; length-- > 0; p++)
case 5:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
- *erroroffset = p - string - 2;
+ *erroroffset = p - string - 2;
return PCRE_UTF8_ERR7;
- }
+ }
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
- *erroroffset = p - string - 3;
+ *erroroffset = p - string - 3;
return PCRE_UTF8_ERR8;
- }
+ }
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
- *erroroffset = p - string - 4;
+ *erroroffset = p - string - 4;
return PCRE_UTF8_ERR9;
- }
+ }
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
{
- *erroroffset = p - string - 5;
+ *erroroffset = p - string - 5;
return PCRE_UTF8_ERR10;
- }
- if (c == 0xfc && (d & 0x3c) == 0)
+ }
+ if (c == 0xfc && (d & 0x3c) == 0)
{
*erroroffset = p - string - 5;
- return PCRE_UTF8_ERR19;
- }
+ return PCRE_UTF8_ERR19;
+ }
break;
}
-
+
/* Character is valid under RFC 2279, but 5-byte and 6-byte characters are
excluded by RFC 3629. The pointer p is currently at the last byte of the
character. */
- if (ab > 3)
+ if (ab > 3)
{
*erroroffset = p - string - ab;
- return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
- }
+ return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
+ }
}
-
+
#else /* SUPPORT_UTF8 */
(void)(string); /* Keep picky compilers happy */
(void)(length);
diff --git a/pcregrep.c b/pcregrep.c
index 1a4077b..380ac7d 100644
--- a/pcregrep.c
+++ b/pcregrep.c
@@ -639,7 +639,7 @@ Arguments:
endptr end of available data
lenptr where to put the length of the eol sequence
-Returns: pointer after the last byte of the line,
+Returns: pointer after the last byte of the line,
including the newline byte(s)
*/
@@ -953,7 +953,7 @@ Returns: TRUE if there was a match
*/
static BOOL
-match_patterns(char *matchptr, size_t length, int startoffset, int *offsets,
+match_patterns(char *matchptr, size_t length, int startoffset, int *offsets,
int *mrc)
{
int i;
@@ -1013,7 +1013,7 @@ Arguments:
Returns: 0 if there was at least one match
1 otherwise (no matches)
- 2 if an overlong line is encountered
+ 2 if an overlong line is encountered
3 if there is a read error on a .bz2 file
*/
@@ -1086,7 +1086,7 @@ while (ptr < endptr)
{
int endlinelength;
int mrc = 0;
- int startoffset = 0;
+ int startoffset = 0;
BOOL match;
char *matchptr = ptr;
char *t = ptr;
@@ -1103,20 +1103,20 @@ while (ptr < endptr)
t = end_of_line(t, endptr, &endlinelength);
linelength = t - ptr - endlinelength;
length = multiline? (size_t)(endptr - ptr) : linelength;
-
- /* Check to see if the line we are looking at extends right to the very end
- of the buffer without a line terminator. This means the line is too long to
+
+ /* Check to see if the line we are looking at extends right to the very end
+ of the buffer without a line terminator. This means the line is too long to
handle. */
-
+
if (endlinelength == 0 && t == main_buffer + bufsize)
{
fprintf(stderr, "pcregrep: line %d%s%s is too long for the internal buffer\n"
"pcregrep: check the --buffer-size option\n",
- linenumber,
+ linenumber,
(filename == NULL)? "" : " of file ",
(filename == NULL)? "" : filename);
return 2;
- }
+ }
/* Extra processing for Jeffrey Friedl's debugging. */
@@ -1384,7 +1384,7 @@ while (ptr < endptr)
{
startoffset = offsets[1];
if (startoffset >= linelength + endlinelength ||
- !match_patterns(matchptr, length, startoffset, offsets, &mrc))
+ !match_patterns(matchptr, length, startoffset, offsets, &mrc))
break;
FWRITE(matchptr + startoffset, 1, offsets[0] - startoffset, stdout);
fprintf(stdout, "%c[%sm", 0x1b, colour_string);
@@ -1737,7 +1737,7 @@ if (frtype == FR_LIBBZ2)
else if (!silent)
fprintf(stderr, "pcregrep: Failed to read %s using bzlib: %s\n",
pathname, err);
- rc = 2; /* The normal "something went wrong" code */
+ rc = 2; /* The normal "something went wrong" code */
}
BZ2_bzclose(inbz2);
}
@@ -1830,11 +1830,11 @@ for (op = optionlist; op->one_char != 0; op++)
printf("%.*s%s\n", n, " ", op->help_text);
}
-printf("\nNumbers may be followed by K or M, e.g. --buffer-size=100K.\n");
+printf("\nNumbers may be followed by K or M, e.g. --buffer-size=100K.\n");
printf("The default value for --buffer-size is %d.\n", PCREGREP_BUFSIZE);
printf("When reading patterns from a file instead of using a command line option,\n");
printf("trailing white space is removed and blank lines are ignored.\n");
-printf("There is a maximum of %d patterns, each of maximum size %d bytes.\n",
+printf("There is a maximum of %d patterns, each of maximum size %d bytes.\n",
MAX_PATTERN_COUNT, PATBUFSIZE);
printf("\nWith no FILEs, read standard input. If fewer than two FILEs given, assume -h.\n");
@@ -2321,14 +2321,14 @@ for (i = 1; i < argc; i++)
n = n * 10 + (int)(*endptr++ - '0');
if (toupper(*endptr) == 'K')
{
- n *= 1024;
- endptr++;
- }
+ n *= 1024;
+ endptr++;
+ }
else if (toupper(*endptr) == 'M')
{
- n *= 1024*1024;
- endptr++;
- }
+ n *= 1024*1024;
+ endptr++;
+ }
if (*endptr != 0)
{
if (longop)
@@ -2665,7 +2665,7 @@ if (include_dir_pattern != NULL)
if (i >= argc)
{
- rc = pcregrep(stdin, FR_PLAIN, stdin_name,
+ rc = pcregrep(stdin, FR_PLAIN, stdin_name,
(filenames > FN_DEFAULT)? stdin_name : NULL);
goto EXIT;
}
diff --git a/pcreposix.c b/pcreposix.c
index da4c6c3..2061be0 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -152,7 +152,7 @@ static const int eint[] = {
REG_BADPAT, /* (*MARK) must have an argument */
REG_INVARG, /* this version of PCRE is not compiled with PCRE_UCP support */
REG_BADPAT, /* \c must be followed by an ASCII character */
- REG_BADPAT, /* \k is not followed by a braced, angle-bracketed, or quoted name */
+ REG_BADPAT, /* \k is not followed by a braced, angle-bracketed, or quoted name */
};
/* Table of texts corresponding to POSIX error codes */
diff --git a/pcretest.c b/pcretest.c
index c3f5f0d..31271b0 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -208,7 +208,7 @@ static const char *errtexts[] = {
"magic number missing",
"unknown opcode - pattern overwritten?",
"no more memory",
- NULL, /* never returned by pcre_exec() or pcre_dfa_exec() */
+ NULL, /* never returned by pcre_exec() or pcre_dfa_exec() */
"match limit exceeded",
"callout error code",
NULL, /* BADUTF8 is handled specially */
@@ -221,7 +221,7 @@ static const char *errtexts[] = {
"backreference condition or recursion test not supported for DFA matching",
"match limit not supported for DFA matching",
"workspace size exceeded in DFA matching",
- "too much recursion for DFA matching",
+ "too much recursion for DFA matching",
"recursion limit exceeded",
"not used - internal error",
"invalid combination of newline options",
@@ -229,7 +229,7 @@ static const char *errtexts[] = {
NULL, /* SHORTUTF8 is handled specially */
"nested recursion at the same subject position"
};
-
+
/*************************************************
* Alternate character tables *
@@ -961,12 +961,12 @@ fprintf(outfile, "%.*s", (cb->next_item_length == 0)? 1 : cb->next_item_length,
fprintf(outfile, "\n");
first_callout = 0;
-if (cb->mark != last_callout_mark)
+if (cb->mark != last_callout_mark)
{
- fprintf(outfile, "Latest Mark: %s\n",
+ fprintf(outfile, "Latest Mark: %s\n",
(cb->mark == NULL)? "<unset>" : (char *)(cb->mark));
- last_callout_mark = cb->mark;
- }
+ last_callout_mark = cb->mark;
+ }
if (cb->callout_data != NULL)
{
@@ -1273,7 +1273,7 @@ while (argc > 1 && argv[op][0] == '-')
unsigned char *endptr;
if (strcmp(argv[op], "-m") == 0) showstore = 1;
- else if (strcmp(argv[op], "-s") == 0) force_study = 1;
+ else if (strcmp(argv[op], "-s") == 0) force_study = 1;
else if (strcmp(argv[op], "-q") == 0) quiet = 1;
else if (strcmp(argv[op], "-b") == 0) debug = 1;
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
@@ -1443,10 +1443,10 @@ while (!done)
const unsigned char *tables = NULL;
unsigned long int true_size, true_study_size = 0;
size_t size, regex_gotten_store;
- int do_allcaps = 0;
+ int do_allcaps = 0;
int do_mark = 0;
int do_study = 0;
- int no_force_study = 0;
+ int no_force_study = 0;
int do_debug = debug;
int do_G = 0;
int do_g = 0;
@@ -1619,10 +1619,10 @@ while (!done)
case 'x': options |= PCRE_EXTENDED; break;
case '+':
- if (do_showrest) do_showcaprest = 1; else do_showrest = 1;
+ if (do_showrest) do_showcaprest = 1; else do_showrest = 1;
break;
-
- case '=': do_allcaps = 1; break;
+
+ case '=': do_allcaps = 1; break;
case 'A': options |= PCRE_ANCHORED; break;
case 'B': do_debug = 1; break;
case 'C': options |= PCRE_AUTO_CALLOUT; break;
@@ -1640,12 +1640,12 @@ while (!done)
case 'P': do_posix = 1; break;
#endif
- case 'S':
+ case 'S':
if (do_study == 0) do_study = 1; else
{
do_study = 0;
no_force_study = 1;
- }
+ }
break;
case 'U': options |= PCRE_UNGREEDY; break;
@@ -1832,7 +1832,7 @@ while (!done)
regex_gotten_store = gotten_store;
/* If -s or /S was present, study the regex to generate additional info to
- help with the matching, unless the pattern has the SS option, which
+ help with the matching, unless the pattern has the SS option, which
suppresses the effect of /S (used for a few test patterns where studying is
never sensible). */
@@ -2074,7 +2074,7 @@ while (!done)
/* Don't output study size; at present it is in any case a fixed
value, but it varies, depending on the computer architecture, and
so messes up the test suite. (And with the /F option, it might be
- flipped.) If study was forced by an external -s, don't show this
+ flipped.) If study was forced by an external -s, don't show this
information unless -i or -d was also present. This means that, except
when auto-callouts are involved, the output from runs with and without
-s should be identical. */
@@ -2158,10 +2158,10 @@ while (!done)
else
{
fprintf(outfile, "Compiled pattern written to %s\n", to_file);
-
+
/* If there is study data, write it, but verify the writing only
if the studying was requested by /S, not just by -s. */
-
+
if (extra != NULL)
{
if (fwrite(extra->study_data, 1, true_study_size, f) <
@@ -2219,7 +2219,7 @@ while (!done)
pcre_callout = callout;
first_callout = 1;
- last_callout_mark = NULL;
+ last_callout_mark = NULL;
callout_extra = 0;
callout_count = 0;
callout_fail_count = 999999;
@@ -2746,31 +2746,31 @@ while (!done)
do_g = do_G = FALSE; /* Break g/G loop */
}
}
-
+
/* do_allcaps requests showing of all captures in the pattern, to check
unset ones at the end. */
-
+
if (do_allcaps)
{
new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
- count++; /* Allow for full match */
- if (count * 2 > use_size_offsets) count = use_size_offsets/2;
- }
+ count++; /* Allow for full match */
+ if (count * 2 > use_size_offsets) count = use_size_offsets/2;
+ }
/* Output the captured substrings */
-
+
for (i = 0; i < count * 2; i += 2)
{
if (use_offsets[i] < 0)
- {
+ {
if (use_offsets[i] != -1)
fprintf(outfile, "ERROR: bad negative value %d for offset %d\n",
- use_offsets[i], i);
+ use_offsets[i], i);
if (use_offsets[i+1] != -1)
fprintf(outfile, "ERROR: bad negative value %d for offset %d\n",
- use_offsets[i+1], i+1);
+ use_offsets[i+1], i+1);
fprintf(outfile, "%2d: <unset>\n", i/2);
- }
+ }
else
{
fprintf(outfile, "%2d: ", i/2);
@@ -2940,7 +2940,7 @@ while (!done)
else
{
switch(count)
- {
+ {
case PCRE_ERROR_NOMATCH:
if (gmatched == 0)
{
@@ -2948,25 +2948,25 @@ while (!done)
else fprintf(outfile, "No match, mark = %s\n", markptr);
}
break;
-
+
case PCRE_ERROR_BADUTF8:
case PCRE_ERROR_SHORTUTF8:
fprintf(outfile, "Error %d (%s UTF-8 string)", count,
(count == PCRE_ERROR_BADUTF8)? "bad" : "short");
if (use_size_offsets >= 2)
- fprintf(outfile, " offset=%d reason=%d", use_offsets[0],
+ fprintf(outfile, " offset=%d reason=%d", use_offsets[0],
use_offsets[1]);
- fprintf(outfile, "\n");
- break;
-
+ fprintf(outfile, "\n");
+ break;
+
default:
- if (count < 0 && (-count) < sizeof(errtexts)/sizeof(const char *))
+ if (count < 0 && (-count) < sizeof(errtexts)/sizeof(const char *))
fprintf(outfile, "Error %d (%s)\n", count, errtexts[-count]);
- else
- fprintf(outfile, "Error %d (Unexpected value)\n", count);
+ else
+ fprintf(outfile, "Error %d (Unexpected value)\n", count);
break;
}
-
+
break; /* Out of the /g loop */
}
}
diff --git a/perltest.pl b/perltest.pl
index ce9132b..ccddd64 100755
--- a/perltest.pl
+++ b/perltest.pl
@@ -86,10 +86,10 @@ for (;;)
# The private /+ modifier means "print $' afterwards".
$showrest = ($pattern =~ s/\+(?=[a-zA-Z]*$)//);
-
+
# A doubled version is used by pcretest to print remainders after captures
-
- $pattern =~ s/\+(?=[a-zA-Z]*$)//;
+
+ $pattern =~ s/\+(?=[a-zA-Z]*$)//;
# Remove /8 from a UTF-8 pattern.
diff --git a/ucp.h b/ucp.h
index 8b52354..34077fe 100644
--- a/ucp.h
+++ b/ucp.h
@@ -157,7 +157,7 @@ enum {
/* New for Unicode 6.0.0: */
ucp_Batak,
ucp_Brahmi,
- ucp_Mandaic
+ ucp_Mandaic
};
#endif