summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--Makefile.am18
-rwxr-xr-xPrepareRelease21
-rw-r--r--README6
-rw-r--r--doc/html/index.html25
-rw-r--r--doc/html/pcre.html195
-rw-r--r--doc/html/pcre_assign_jit_stack.html0
-rw-r--r--doc/html/pcre_free_study.html0
-rw-r--r--doc/html/pcre_jit_stack_alloc.html0
-rw-r--r--doc/html/pcre_jit_stack_free.html0
-rw-r--r--doc/html/pcreapi.html4
-rw-r--r--doc/html/pcrecompat.html6
-rw-r--r--doc/html/pcrejit.html19
-rw-r--r--doc/html/pcrelimits.html74
-rw-r--r--doc/html/pcrepattern.html6
-rw-r--r--doc/html/pcreunicode.html177
-rw-r--r--doc/index.html.src21
-rw-r--r--doc/pcre.3183
-rw-r--r--doc/pcre.txt537
-rw-r--r--doc/pcre_assign_jit_stack.349
-rw-r--r--doc/pcre_config.313
-rw-r--r--doc/pcre_dfa_exec.320
-rw-r--r--doc/pcre_exec.316
-rw-r--r--doc/pcre_free_study.327
-rw-r--r--doc/pcre_fullinfo.311
-rw-r--r--doc/pcre_jit_stack_alloc.331
-rw-r--r--doc/pcre_jit_stack_free.326
-rw-r--r--doc/pcre_study.311
-rw-r--r--doc/pcreapi.3179
-rw-r--r--doc/pcrebuild.318
-rw-r--r--doc/pcrecallout.36
-rw-r--r--doc/pcrecompat.39
-rw-r--r--doc/pcrejit.3234
-rw-r--r--doc/pcrelimits.357
-rw-r--r--doc/pcrepartial.320
-rw-r--r--doc/pcrepattern.39
-rw-r--r--doc/pcreprecompile.322
-rw-r--r--doc/pcrestack.315
-rw-r--r--doc/pcretest.198
-rw-r--r--doc/pcreunicode.3156
-rw-r--r--doc/perltest.txt25
-rw-r--r--pcre_jit_compile.c4
41 files changed, 1568 insertions, 780 deletions
diff --git a/Makefile.am b/Makefile.am
index fc64748..320becb 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,6 +18,7 @@ dist_html_DATA = \
doc/html/index.html \
doc/html/pcre.html \
doc/html/pcre-config.html \
+ doc/html/pcre_assign_jit_stack.html \
doc/html/pcre_compile.html \
doc/html/pcre_compile2.html \
doc/html/pcre_config.html \
@@ -25,6 +26,7 @@ dist_html_DATA = \
doc/html/pcre_copy_substring.html \
doc/html/pcre_dfa_exec.html \
doc/html/pcre_exec.html \
+ doc/html/pcre_free_study.html \
doc/html/pcre_free_substring.html \
doc/html/pcre_free_substring_list.html \
doc/html/pcre_fullinfo.html \
@@ -34,6 +36,8 @@ dist_html_DATA = \
doc/html/pcre_get_substring.html \
doc/html/pcre_get_substring_list.html \
doc/html/pcre_info.html \
+ doc/html/pcre_jit_stack_alloc.html \
+ doc/html/pcre_jit_stack_free.html \
doc/html/pcre_maketables.html \
doc/html/pcre_refcount.html \
doc/html/pcre_study.html \
@@ -44,6 +48,8 @@ dist_html_DATA = \
doc/html/pcrecompat.html \
doc/html/pcredemo.html \
doc/html/pcregrep.html \
+ doc/html/pcrejit.html \
+ doc/html/pcrelimits.html \
doc/html/pcrematching.html \
doc/html/pcrepartial.html \
doc/html/pcrepattern.html \
@@ -53,7 +59,8 @@ dist_html_DATA = \
doc/html/pcresample.html \
doc/html/pcrestack.html \
doc/html/pcresyntax.html \
- doc/html/pcretest.html
+ doc/html/pcretest.html \
+ doc/html/pcreunicode.html
pcrecpp_html = doc/html/pcrecpp.html
dist_noinst_DATA = $(pcrecpp_html)
@@ -378,6 +385,7 @@ endif
dist_man_MANS = \
doc/pcre.3 \
doc/pcre-config.1 \
+ doc/pcre_assign_jit_stack.3 \
doc/pcre_compile.3 \
doc/pcre_compile2.3 \
doc/pcre_config.3 \
@@ -385,6 +393,7 @@ dist_man_MANS = \
doc/pcre_copy_substring.3 \
doc/pcre_dfa_exec.3 \
doc/pcre_exec.3 \
+ doc/pcre_free_study.3 \
doc/pcre_free_substring.3 \
doc/pcre_free_substring_list.3 \
doc/pcre_fullinfo.3 \
@@ -394,6 +403,8 @@ dist_man_MANS = \
doc/pcre_get_substring.3 \
doc/pcre_get_substring_list.3 \
doc/pcre_info.3 \
+ doc/pcre_jit_stack_alloc.3 \
+ doc/pcre_jit_stack_free.3 \
doc/pcre_maketables.3 \
doc/pcre_refcount.3 \
doc/pcre_study.3 \
@@ -403,6 +414,8 @@ dist_man_MANS = \
doc/pcrecallout.3 \
doc/pcrecompat.3 \
doc/pcregrep.1 \
+ doc/pcrejit.3 \
+ doc/pcrelimits.3 \
doc/pcrematching.3 \
doc/pcrepartial.3 \
doc/pcrepattern.3 \
@@ -412,7 +425,8 @@ dist_man_MANS = \
doc/pcresample.3 \
doc/pcrestack.3 \
doc/pcresyntax.3 \
- doc/pcretest.1
+ doc/pcretest.1 \
+ doc/pcreunicode.3
pcrecpp_man = doc/pcrecpp.3
EXTRA_DIST += $(pcrecpp_man)
diff --git a/PrepareRelease b/PrepareRelease
index 02b2300..6b2841b 100755
--- a/PrepareRelease
+++ b/PrepareRelease
@@ -49,7 +49,7 @@ cat <<End >pcre.txt
This file contains a concatenation of the PCRE man pages, converted to plain
text format for ease of searching with a text editor, or for use on systems
that do not have a man page processor. The small individual files that give
-synopses of each function in the library have not been included. Neither has
+synopses of each function in the library have not been included. Neither has
the pcredemo program. There are separate text files for the pcregrep and
pcretest commands.
-----------------------------------------------------------------------------
@@ -59,8 +59,9 @@ End
echo "Making pcre.txt"
for file in pcre pcrebuild pcrematching pcreapi pcrecallout pcrecompat \
- pcrepattern pcresyntax pcrepartial pcreprecompile \
- pcreperform pcreposix pcrecpp pcresample pcrestack ; do
+ pcrepattern pcresyntax pcreunicode pcrejit pcrepartial \
+ pcreprecompile pcreperform pcreposix pcrecpp pcresample \
+ pcrelimits pcrestack ; do
echo " Processing $file.3"
nroff -c -man $file.3 >$file.rawtxt
../CleanTxt <$file.rawtxt >>pcre.txt
@@ -103,7 +104,7 @@ perl <<"END" >pcredemo.3
". hy \\\\n(HY\n" .
"..\n" .
".\n" .
- ".EX\n" ;
+ ".EX\n" ;
while (<IN>)
{
s/\\/\\e/g;
@@ -111,7 +112,7 @@ perl <<"END" >pcredemo.3
}
print OUT ".EE\n";
close(IN);
- close(OUT);
+ close(OUT);
END
if [ $? != 0 ] ; then exit 1; fi
@@ -136,10 +137,12 @@ for file in *.3 ; do
base=`basename $file .3`
toc=-toc
if [ `expr $base : '.*_'` -ne 0 ] ; then toc="" ; fi
- if [ "$base" = "pcresample" ] || \
- [ "$base" = "pcrestack" ] || \
- [ "$base" = "pcrecompat" ] || \
- [ "$base" = "pcreperform" ] ; then
+ if [ "$base" = "pcresample" ] || \
+ [ "$base" = "pcrestack" ] || \
+ [ "$base" = "pcrecompat" ] || \
+ [ "$base" = "pcrelimits" ] || \
+ [ "$base" = "pcreperform" ] || \
+ [ "$base" = "pcreunicode" ] ; then
toc=""
fi
echo " Making $base.html"
diff --git a/README b/README
index 3c3ead1..d154c96 100644
--- a/README
+++ b/README
@@ -176,7 +176,9 @@ library. They are also documented in the pcrebuild man page.
. If you want to include support for just-in-time compiling, which can give
large performance improvements on certain platforms, add --enable-jit to the
- "configure" command.
+ "configure" command. This support is available only for certain hardware
+ architectures. If you try to enable it on an unsupported architecture, there
+ will be a compile time error.
. If you want to make use of the support for UTF-8 Unicode character strings in
PCRE, you must add --enable-utf8 to the "configure" command. Without it, the
@@ -837,4 +839,4 @@ The distribution should contain the following files:
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
-Last updated: 23 August 2011
+Last updated: 27 August 2011
diff --git a/doc/html/index.html b/doc/html/index.html
index d9af7e1..7068a88 100644
--- a/doc/html/index.html
+++ b/doc/html/index.html
@@ -1,10 +1,10 @@
<html>
-<!-- This is a manually maintained file that is the root of the HTML version of
- the PCRE documentation. When the HTML documents are built from the man
- page versions, the entire doc/html directory is emptied, this file is then
- copied into doc/html/index.html, and the remaining files therein are
+<!-- This is a manually maintained file that is the root of the HTML version of
+ the PCRE documentation. When the HTML documents are built from the man
+ page versions, the entire doc/html directory is emptied, this file is then
+ copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
--->
+-->
<head>
<title>PCRE specification</title>
</head>
@@ -42,6 +42,12 @@ The HTML documentation for PCRE comprises the following pages:
<tr><td><a href="pcregrep.html">pcregrep</a></td>
<td>&nbsp;&nbsp;The <b>pcregrep</b> command</td></tr>
+<tr><td><a href="pcrejit.html">pcrejit</a></td>
+ <td>&nbsp;&nbsp;Discussion of the just-in-time optimization support</td></tr>
+
+<tr><td><a href="pcrelimits.html">pcrelimits</a></td>
+ <td>&nbsp;&nbsp;Details of size and other limits</td></tr>
+
<tr><td><a href="pcrematching.html">pcrematching</a></td>
<td>&nbsp;&nbsp;Discussion of the two matching algorithms</td></tr>
@@ -71,14 +77,17 @@ The HTML documentation for PCRE comprises the following pages:
<tr><td><a href="pcretest.html">pcretest</a></td>
<td>&nbsp;&nbsp;The <b>pcretest</b> command for testing PCRE</td></tr>
+
+<tr><td><a href="pcreunicode.html">pcreunicode</a></td>
+ <td>&nbsp;&nbsp;Discussion of Unicode and UTF-8 support</td></tr>
</table>
<p>
-There are also individual pages that summarize the interface for each function
+There are also individual pages that summarize the interface for each function
in the library:
</p>
-<table>
+<table>
<tr><td><a href="pcre_compile.html">pcre_compile</a></td>
<td>&nbsp;&nbsp;Compile a regular expression</td></tr>
@@ -129,7 +138,7 @@ in the library:
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td>
<td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
-
+
<tr><td><a href="pcre_refcount.html">pcre_refcount</a></td>
<td>&nbsp;&nbsp;Maintain reference count in compiled pattern</td></tr>
diff --git a/doc/html/pcre.html b/doc/html/pcre.html
index ddba3d6..f98349d 100644
--- a/doc/html/pcre.html
+++ b/doc/html/pcre.html
@@ -15,10 +15,8 @@ man page, in case the conversion went wrong.
<ul>
<li><a name="TOC1" href="#SEC1">INTRODUCTION</a>
<li><a name="TOC2" href="#SEC2">USER DOCUMENTATION</a>
-<li><a name="TOC3" href="#SEC3">LIMITATIONS</a>
-<li><a name="TOC4" href="#SEC4">UTF-8 AND UNICODE PROPERTY SUPPORT</a>
-<li><a name="TOC5" href="#SEC5">AUTHOR</a>
-<li><a name="TOC6" href="#SEC6">REVISION</a>
+<li><a name="TOC3" href="#SEC3">AUTHOR</a>
+<li><a name="TOC4" href="#SEC4">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">INTRODUCTION</a><br>
<P>
@@ -100,6 +98,8 @@ of searching. The sections are as follows:
pcrecpp details of the C++ wrapper
pcredemo a demonstration C program that uses PCRE
pcregrep description of the <b>pcregrep</b> command
+ pcrejit discussion of the just-in-time optimization support
+ pcrelimits details of size and other limits
pcrematching discussion of the two matching algorithms
pcrepartial details of the partial matching facility
pcrepattern syntax and semantics of supported regular expressions
@@ -110,191 +110,12 @@ of searching. The sections are as follows:
pcrestack discussion of stack usage
pcresyntax quick syntax reference
pcretest description of the <b>pcretest</b> testing command
+ pcreunicode discussion of Unicode and UTF-8 support
</pre>
In addition, in the "man" and HTML formats, there is a short page for each
C library function, listing its arguments and results.
</P>
-<br><a name="SEC3" href="#TOC1">LIMITATIONS</a><br>
-<P>
-There are some size limitations in PCRE but it is hoped that they will never in
-practice be relevant.
-</P>
-<P>
-The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is
-compiled with the default internal linkage size of 2. If you want to process
-regular expressions that are truly enormous, you can compile PCRE with an
-internal linkage size of 3 or 4 (see the <b>README</b> file in the source
-distribution and the
-<a href="pcrebuild.html"><b>pcrebuild</b></a>
-documentation for details). In these cases the limit is substantially larger.
-However, the speed of execution is slower.
-</P>
-<P>
-All values in repeating quantifiers must be less than 65536.
-</P>
-<P>
-There is no limit to the number of parenthesized subpatterns, but there can be
-no more than 65535 capturing subpatterns.
-</P>
-<P>
-The maximum length of name for a named subpattern is 32 characters, and the
-maximum number of named subpatterns is 10000.
-</P>
-<P>
-The maximum length of a subject string is the largest positive number that an
-integer variable can hold. However, when using the traditional matching
-function, PCRE uses recursion to handle subpatterns and indefinite repetition.
-This means that the available stack space may limit the size of a subject
-string that can be processed by certain patterns. For a discussion of stack
-issues, see the
-<a href="pcrestack.html"><b>pcrestack</b></a>
-documentation.
-<a name="utf8support"></a></P>
-<br><a name="SEC4" href="#TOC1">UTF-8 AND UNICODE PROPERTY SUPPORT</a><br>
-<P>
-From release 3.3, PCRE has had some support for character strings encoded in
-the UTF-8 format. For release 4.0 this was greatly extended to cover most
-common requirements, and in release 5.0 additional support for Unicode general
-category properties was added.
-</P>
-<P>
-In order process UTF-8 strings, you must build PCRE to include UTF-8 support in
-the code, and, in addition, you must call
-<a href="pcre_compile.html"><b>pcre_compile()</b></a>
-with the PCRE_UTF8 option flag, or the pattern must start with the sequence
-(*UTF8). When either of these is the case, both the pattern and any subject
-strings that are matched against it are treated as UTF-8 strings instead of
-strings of 1-byte characters.
-</P>
-<P>
-If you compile PCRE with UTF-8 support, but do not use it at run time, the
-library will be a bit bigger, but the additional run time overhead is limited
-to testing the PCRE_UTF8 flag occasionally, so should not be very big.
-</P>
-<P>
-If PCRE is built with Unicode character property support (which implies UTF-8
-support), the escape sequences \p{..}, \P{..}, and \X are supported.
-The available properties that can be tested are limited to the general
-category properties such as Lu for an upper case letter or Nd for a decimal
-number, the Unicode script names such as Arabic or Han, and the derived
-properties Any and L&. A full list is given in the
-<a href="pcrepattern.html"><b>pcrepattern</b></a>
-documentation. Only the short names for properties are supported. For example,
-\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
-Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
-compatibility with Perl 5.6. PCRE does not support this.
-<a name="utf8strings"></a></P>
-<br><b>
-Validity of UTF-8 strings
-</b><br>
-<P>
-When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
-are (by default) checked for validity on entry to the relevant functions. From
-release 7.3 of PCRE, the check is according the rules of RFC 3629, which are
-themselves derived from the Unicode specification. Earlier releases of PCRE
-followed the rules of RFC 2279, which allows the full range of 31-bit values (0
-to 0x7FFFFFFF). The current check allows only values in the range U+0 to
-U+10FFFF, excluding U+D800 to U+DFFF.
-</P>
-<P>
-The excluded code points are the "Low Surrogate Area" of Unicode, of which the
-Unicode Standard says this: "The Low Surrogate Area does not contain any
-character assignments, consequently no character code charts or namelists are
-provided for this area. Surrogates are reserved for use with UTF-16 and then
-must be used in pairs." The code points that are encoded by UTF-16 pairs are
-available as independent code points in the UTF-8 encoding. (In other words,
-the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
-UTF-8.)
-</P>
-<P>
-If an invalid UTF-8 string is passed to PCRE, an error return is given. At
-compile time, the only additional information is the offset to the first byte
-of the failing character. The runtime functions (<b>pcre_exec()</b> and
-<b>pcre_dfa_exec()</b>), pass back this information as well as a more detailed
-reason code if the caller has provided memory in which to do this.
-</P>
-<P>
-In some situations, you may already know that your strings are valid, and
-therefore want to skip these checks in order to improve performance. If you set
-the PCRE_NO_UTF8_CHECK flag at compile time or at run time, PCRE assumes that
-the pattern or subject it is given (respectively) contains only valid UTF-8
-codes. In this case, it does not diagnose an invalid UTF-8 string.
-</P>
-<P>
-If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
-happens depends on why the string is invalid. If the string conforms to the
-"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
-in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
-test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
-rules of RFC 2279. However, if the string does not even conform to RFC 2279,
-the result is undefined. Your program may crash.
-</P>
-<P>
-If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
-encoded in a UTF-8-like manner as per the old RFC, you can set
-PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
-situation, you will have to apply your own validity check.
-</P>
-<br><b>
-General comments about UTF-8 mode
-</b><br>
-<P>
-1. An unbraced hexadecimal escape sequence (such as \xb3) matches a two-byte
-UTF-8 character if the value is greater than 127.
-</P>
-<P>
-2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
-characters for values greater than \177.
-</P>
-<P>
-3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
-bytes, for example: \x{100}{3}.
-</P>
-<P>
-4. The dot metacharacter matches one UTF-8 character instead of a single byte.
-</P>
-<P>
-5. The escape sequence \C can be used to match a single byte in UTF-8 mode,
-but its use can lead to some strange effects. This facility is not available in
-the alternative matching function, <b>pcre_dfa_exec()</b>.
-</P>
-<P>
-6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
-test characters of any code value, but, by default, the characters that PCRE
-recognizes as digits, spaces, or word characters remain the same set as before,
-all with values less than 256. This remains true even when PCRE is built to
-include Unicode property support, because to do otherwise would slow down PCRE
-in many common cases. Note in particular that this applies to \b and \B,
-because they are defined in terms of \w and \W. If you really want to test
-for a wider sense of, say, "digit", you can use explicit Unicode property tests
-such as \p{Nd}. Alternatively, if you set the PCRE_UCP option, the way that
-the character escapes work is changed so that Unicode properties are used to
-determine which characters match. There are more details in the section on
-<a href="pcrepattern.html#genericchartypes">generic character types</a>
-in the
-<a href="pcrepattern.html"><b>pcrepattern</b></a>
-documentation.
-</P>
-<P>
-7. Similarly, characters that match the POSIX named character classes are all
-low-valued characters, unless the PCRE_UCP option is set.
-</P>
-<P>
-8. However, the horizontal and vertical whitespace matching escapes (\h, \H,
-\v, and \V) do match all the appropriate Unicode characters, whether or not
-PCRE_UCP is set.
-</P>
-<P>
-9. Case-insensitive matching applies only to characters whose values are less
-than 128, unless PCRE is built with Unicode property support. Even when Unicode
-property support is available, PCRE still uses its own character tables when
-checking the case of low-valued characters, so as not to degrade performance.
-The Unicode property information is used only for characters with higher
-values. Furthermore, PCRE supports case-insensitive matching only when there is
-a one-to-one mapping between a letter's cases. There are a small number of
-many-to-one mappings in Unicode; these are not supported by PCRE.
-</P>
-<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC3" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
@@ -308,9 +129,9 @@ Putting an actual email address here seems to have been a spam magnet, so I've
taken it away. If you want to email me, use my two initials, followed by the
two digits 10, at the domain cam.ac.uk.
</P>
-<br><a name="SEC6" href="#TOC1">REVISION</a><br>
+<br><a name="SEC4" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 07 May 2011
+Last updated: 24 August 2011
<br>
Copyright &copy; 1997-2011 University of Cambridge.
<br>
diff --git a/doc/html/pcre_assign_jit_stack.html b/doc/html/pcre_assign_jit_stack.html
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/doc/html/pcre_assign_jit_stack.html
diff --git a/doc/html/pcre_free_study.html b/doc/html/pcre_free_study.html
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/doc/html/pcre_free_study.html
diff --git a/doc/html/pcre_jit_stack_alloc.html b/doc/html/pcre_jit_stack_alloc.html
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/doc/html/pcre_jit_stack_alloc.html
diff --git a/doc/html/pcre_jit_stack_free.html b/doc/html/pcre_jit_stack_free.html
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/doc/html/pcre_jit_stack_free.html
diff --git a/doc/html/pcreapi.html b/doc/html/pcreapi.html
index d6293ca..8c99a84 100644
--- a/doc/html/pcreapi.html
+++ b/doc/html/pcreapi.html
@@ -706,9 +706,7 @@ of UTF-8 characters instead of single-byte character strings. However, it is
available only when PCRE is built to include UTF-8 support. If not, the use
of this option provokes an error. Details of how this option changes the
behaviour of PCRE are given in the
-<a href="pcre.html#utf8support">section on UTF-8 support</a>
-in the main
-<a href="pcre.html"><b>pcre</b></a>
+<a href="pcreunicode.html"><b>pcreunicode</b></a>
page.
<pre>
PCRE_NO_UTF8_CHECK
diff --git a/doc/html/pcrecompat.html b/doc/html/pcrecompat.html
index 126436c..c4520a1 100644
--- a/doc/html/pcrecompat.html
+++ b/doc/html/pcrecompat.html
@@ -23,9 +23,7 @@ versions 5.10 and above.
<P>
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
it does have are given in the
-<a href="pcre.html#utf8support">section on UTF-8 support</a>
-in the main
-<a href="pcre.html"><b>pcre</b></a>
+<a href="pcreunicode.html"><b>pcreunicode</b></a>
page.
</P>
<P>
@@ -197,7 +195,7 @@ Cambridge CB2 3QH, England.
REVISION
</b><br>
<P>
-Last updated: 24 July 2011
+Last updated: 24 August 2011
<br>
Copyright &copy; 1997-2011 University of Cambridge.
<br>
diff --git a/doc/html/pcrejit.html b/doc/html/pcrejit.html
new file mode 100644
index 0000000..438812a
--- /dev/null
+++ b/doc/html/pcrejit.html
@@ -0,0 +1,19 @@
+<html>
+<head>
+<title>pcrejit specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcrejit man page</h1>
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
+<p>
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+<br>
+<ul>
+</ul>
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
diff --git a/doc/html/pcrelimits.html b/doc/html/pcrelimits.html
new file mode 100644
index 0000000..4dc28f7
--- /dev/null
+++ b/doc/html/pcrelimits.html
@@ -0,0 +1,74 @@
+<html>
+<head>
+<title>pcrelimits specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcrelimits man page</h1>
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
+<p>
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+<br>
+<br><b>
+SIZE AND OTHER LIMITATIONS
+</b><br>
+<P>
+There are some size limitations in PCRE but it is hoped that they will never in
+practice be relevant.
+</P>
+<P>
+The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is
+compiled with the default internal linkage size of 2. If you want to process
+regular expressions that are truly enormous, you can compile PCRE with an
+internal linkage size of 3 or 4 (see the <b>README</b> file in the source
+distribution and the
+<a href="pcrebuild.html"><b>pcrebuild</b></a>
+documentation for details). In these cases the limit is substantially larger.
+However, the speed of execution is slower.
+</P>
+<P>
+All values in repeating quantifiers must be less than 65536.
+</P>
+<P>
+There is no limit to the number of parenthesized subpatterns, but there can be
+no more than 65535 capturing subpatterns.
+</P>
+<P>
+The maximum length of a name for a named subpattern is 32 characters, and the
+maximum number of named subpatterns is 10000.
+</P>
+<P>
+The maximum length of a subject string is the largest positive number that an
+integer variable can hold. However, when using the traditional matching
+function, PCRE uses recursion to handle subpatterns and indefinite repetition.
+This means that the available stack space may limit the size of a subject
+string that can be processed by certain patterns. For a discussion of stack
+issues, see the
+<a href="pcrestack.html"><b>pcrestack</b></a>
+documentation.
+</P>
+<br><b>
+AUTHOR
+</b><br>
+<P>
+Philip Hazel
+<br>
+University Computing Service
+<br>
+Cambridge CB2 3QH, England.
+<br>
+</P>
+<br><b>
+REVISION
+</b><br>
+<P>
+Last updated: 24 August 2011
+<br>
+Copyright &copy; 1997-2011 University of Cambridge.
+<br>
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
diff --git a/doc/html/pcrepattern.html b/doc/html/pcrepattern.html
index 6ddf3ef..f2a8994 100644
--- a/doc/html/pcrepattern.html
+++ b/doc/html/pcrepattern.html
@@ -72,9 +72,7 @@ Starting a pattern with this sequence is equivalent to setting the PCRE_UTF8
option. This feature is not Perl-compatible. How setting UTF-8 mode affects
pattern matching is mentioned in several places below. There is also a summary
of UTF-8 features in the
-<a href="pcre.html#utf8support">section on UTF-8 support</a>
-in the main
-<a href="pcre.html"><b>pcre</b></a>
+<a href="pcreunicode.html"><b>pcreunicode</b></a>
page.
</P>
<P>
@@ -2740,7 +2738,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC28" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 24 July 2011
+Last updated: 24 August 2011
<br>
Copyright &copy; 1997-2011 University of Cambridge.
<br>
diff --git a/doc/html/pcreunicode.html b/doc/html/pcreunicode.html
new file mode 100644
index 0000000..2bd4cc2
--- /dev/null
+++ b/doc/html/pcreunicode.html
@@ -0,0 +1,177 @@
+<html>
+<head>
+<title>pcreunicode specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcreunicode man page</h1>
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
+<p>
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+<br>
+<br><b>
+UTF-8 AND UNICODE PROPERTY SUPPORT
+</b><br>
+<P>
+In order to process UTF-8 strings, you must build PCRE to include UTF-8 support in
+the code, and, in addition, you must call
+<a href="pcre_compile.html"><b>pcre_compile()</b></a>
+with the PCRE_UTF8 option flag, or the pattern must start with the sequence
+(*UTF8). When either of these is the case, both the pattern and any subject
+strings that are matched against it are treated as UTF-8 strings instead of
+strings of 1-byte characters. PCRE does not support any other formats (in
+particular, it does not support UTF-16).
+</P>
+<P>
+If you compile PCRE with UTF-8 support, but do not use it at run time, the
+library will be a bit bigger, but the additional run time overhead is limited
+to testing the PCRE_UTF8 flag occasionally, so should not be very big.
+</P>
+<P>
+If PCRE is built with Unicode character property support (which implies UTF-8
+support), the escape sequences \p{..}, \P{..}, and \X are supported.
+The available properties that can be tested are limited to the general
+category properties such as Lu for an upper case letter or Nd for a decimal
+number, the Unicode script names such as Arabic or Han, and the derived
+properties Any and L&. A full list is given in the
+<a href="pcrepattern.html"><b>pcrepattern</b></a>
+documentation. Only the short names for properties are supported. For example,
+\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
+Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
+compatibility with Perl 5.6. PCRE does not support this.
+<a name="utf8strings"></a></P>
+<br><b>
+Validity of UTF-8 strings
+</b><br>
+<P>
+When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
+are (by default) checked for validity on entry to the relevant functions. From
+release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
+themselves derived from the Unicode specification. Earlier releases of PCRE
+followed the rules of RFC 2279, which allows the full range of 31-bit values (0
+to 0x7FFFFFFF). The current check allows only values in the range U+0 to
+U+10FFFF, excluding U+D800 to U+DFFF.
+</P>
+<P>
+The excluded code points are the "Low Surrogate Area" of Unicode, of which the
+Unicode Standard says this: "The Low Surrogate Area does not contain any
+character assignments, consequently no character code charts or namelists are
+provided for this area. Surrogates are reserved for use with UTF-16 and then
+must be used in pairs." The code points that are encoded by UTF-16 pairs are
+available as independent code points in the UTF-8 encoding. (In other words,
+the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
+UTF-8.)
+</P>
+<P>
+If an invalid UTF-8 string is passed to PCRE, an error return is given. At
+compile time, the only additional information is the offset to the first byte
+of the failing character. The runtime functions <b>pcre_exec()</b> and
+<b>pcre_dfa_exec()</b> also pass back this information, as well as a more
+detailed reason code if the caller has provided memory in which to do this.
+</P>
+<P>
+In some situations, you may already know that your strings are valid, and
+therefore want to skip these checks in order to improve performance. If you set
+the PCRE_NO_UTF8_CHECK flag at compile time or at run time, PCRE assumes that
+the pattern or subject it is given (respectively) contains only valid UTF-8
+codes. In this case, it does not diagnose an invalid UTF-8 string.
+</P>
+<P>
+If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
+happens depends on why the string is invalid. If the string conforms to the
+"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
+in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
+test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
+rules of RFC 2279. However, if the string does not even conform to RFC 2279,
+the result is undefined. Your program may crash.
+</P>
+<P>
+If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
+encoded in a UTF-8-like manner as per the old RFC, you can set
+PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
+situation, you will have to apply your own validity check.
+</P>
+<br><b>
+General comments about UTF-8 mode
+</b><br>
+<P>
+1. An unbraced hexadecimal escape sequence (such as \xb3) matches a two-byte
+UTF-8 character if the value is greater than 127.
+</P>
+<P>
+2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
+characters for values greater than \177.
+</P>
+<P>
+3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
+bytes, for example: \x{100}{3}.
+</P>
+<P>
+4. The dot metacharacter matches one UTF-8 character instead of a single byte.
+</P>
+<P>
+5. The escape sequence \C can be used to match a single byte in UTF-8 mode,
+but its use can lead to some strange effects. This facility is not available in
+the alternative matching function, <b>pcre_dfa_exec()</b>.
+</P>
+<P>
+6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
+test characters of any code value, but, by default, the characters that PCRE
+recognizes as digits, spaces, or word characters remain the same set as before,
+all with values less than 256. This remains true even when PCRE is built to
+include Unicode property support, because to do otherwise would slow down PCRE
+in many common cases. Note in particular that this applies to \b and \B,
+because they are defined in terms of \w and \W. If you really want to test
+for a wider sense of, say, "digit", you can use explicit Unicode property tests
+such as \p{Nd}. Alternatively, if you set the PCRE_UCP option, the way that
+the character escapes work is changed so that Unicode properties are used to
+determine which characters match. There are more details in the section on
+<a href="pcrepattern.html#genericchartypes">generic character types</a>
+in the
+<a href="pcrepattern.html"><b>pcrepattern</b></a>
+documentation.
+</P>
+<P>
+7. Similarly, characters that match the POSIX named character classes are all
+low-valued characters, unless the PCRE_UCP option is set.
+</P>
+<P>
+8. However, the horizontal and vertical whitespace matching escapes (\h, \H,
+\v, and \V) do match all the appropriate Unicode characters, whether or not
+PCRE_UCP is set.
+</P>
+<P>
+9. Case-insensitive matching applies only to characters whose values are less
+than 128, unless PCRE is built with Unicode property support. Even when Unicode
+property support is available, PCRE still uses its own character tables when
+checking the case of low-valued characters, so as not to degrade performance.
+The Unicode property information is used only for characters with higher
+values. Furthermore, PCRE supports case-insensitive matching only when there is
+a one-to-one mapping between a letter's cases. There are a small number of
+many-to-one mappings in Unicode; these are not supported by PCRE.
+</P>
+<br><b>
+AUTHOR
+</b><br>
+<P>
+Philip Hazel
+<br>
+University Computing Service
+<br>
+Cambridge CB2 3QH, England.
+<br>
+</P>
+<br><b>
+REVISION
+</b><br>
+<P>
+Last updated: 24 August 2011
+<br>
+Copyright &copy; 1997-2011 University of Cambridge.
+<br>
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
diff --git a/doc/index.html.src b/doc/index.html.src
index 58dfe45..fc93ed0 100644
--- a/doc/index.html.src
+++ b/doc/index.html.src
@@ -42,6 +42,12 @@ The HTML documentation for PCRE comprises the following pages:
<tr><td><a href="pcregrep.html">pcregrep</a></td>
<td>&nbsp;&nbsp;The <b>pcregrep</b> command</td></tr>
+<tr><td><a href="pcrejit.html">pcrejit</a></td>
+ <td>&nbsp;&nbsp;Discussion of the just-in-time optimization support</td></tr>
+
+<tr><td><a href="pcrelimits.html">pcrelimits</a></td>
+ <td>&nbsp;&nbsp;Details of size and other limits</td></tr>
+
<tr><td><a href="pcrematching.html">pcrematching</a></td>
<td>&nbsp;&nbsp;Discussion of the two matching algorithms</td></tr>
@@ -71,6 +77,9 @@ The HTML documentation for PCRE comprises the following pages:
<tr><td><a href="pcretest.html">pcretest</a></td>
<td>&nbsp;&nbsp;The <b>pcretest</b> command for testing PCRE</td></tr>
+
+<tr><td><a href="pcreunicode.html">pcreunicode</a></td>
+ <td>&nbsp;&nbsp;Discussion of Unicode and UTF-8 support</td></tr>
</table>
<p>
@@ -80,6 +89,9 @@ in the library:
<table>
+<tr><td><a href="pcre_assign_jit_stack.html">pcre_assign_jit_stack</a></td>
+ <td>&nbsp;&nbsp;Assign stack for JIT matching</td></tr>
+
<tr><td><a href="pcre_compile.html">pcre_compile</a></td>
<td>&nbsp;&nbsp;Compile a regular expression</td></tr>
@@ -99,6 +111,9 @@ in the library:
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(DFA algorithm; <i>not</i> Perl compatible)</td></tr>
+<tr><td><a href="pcre_free_study.html">pcre_free_study</a></td>
+ <td>&nbsp;&nbsp;Free study data</td></tr>
+
<tr><td><a href="pcre_exec.html">pcre_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(Perl compatible)</td></tr>
@@ -127,6 +142,12 @@ in the library:
<tr><td><a href="pcre_info.html">pcre_info</a></td>
<td>&nbsp;&nbsp;Obsolete information extraction function</td></tr>
+<tr><td><a href="pcre_jit_stack_alloc.html">pcre_jit_stack_alloc</a></td>
+ <td>&nbsp;&nbsp;Create a stack for JIT matching</td></tr>
+
+<tr><td><a href="pcre_jit_stack_free.html">pcre_jit_stack_free</a></td>
+ <td>&nbsp;&nbsp;Free a JIT matching stack</td></tr>
+
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td>
<td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
diff --git a/doc/pcre.3 b/doc/pcre.3
index 8afdcf0..d4e1fea 100644
--- a/doc/pcre.3
+++ b/doc/pcre.3
@@ -95,6 +95,8 @@ of searching. The sections are as follows:
pcrecpp details of the C++ wrapper
pcredemo a demonstration C program that uses PCRE
pcregrep description of the \fBpcregrep\fP command
+ pcrejit discussion of the just-in-time optimization support
+ pcrelimits details of size and other limits
pcrematching discussion of the two matching algorithms
pcrepartial details of the partial matching facility
.\" JOIN
@@ -107,189 +109,12 @@ of searching. The sections are as follows:
pcrestack discussion of stack usage
pcresyntax quick syntax reference
pcretest description of the \fBpcretest\fP testing command
+ pcreunicode discussion of Unicode and UTF-8 support
.sp
In addition, in the "man" and HTML formats, there is a short page for each
C library function, listing its arguments and results.
.
.
-.SH LIMITATIONS
-.rs
-.sp
-There are some size limitations in PCRE but it is hoped that they will never in
-practice be relevant.
-.P
-The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is
-compiled with the default internal linkage size of 2. If you want to process
-regular expressions that are truly enormous, you can compile PCRE with an
-internal linkage size of 3 or 4 (see the \fBREADME\fP file in the source
-distribution and the
-.\" HREF
-\fBpcrebuild\fP
-.\"
-documentation for details). In these cases the limit is substantially larger.
-However, the speed of execution is slower.
-.P
-All values in repeating quantifiers must be less than 65536.
-.P
-There is no limit to the number of parenthesized subpatterns, but there can be
-no more than 65535 capturing subpatterns.
-.P
-The maximum length of name for a named subpattern is 32 characters, and the
-maximum number of named subpatterns is 10000.
-.P
-The maximum length of a subject string is the largest positive number that an
-integer variable can hold. However, when using the traditional matching
-function, PCRE uses recursion to handle subpatterns and indefinite repetition.
-This means that the available stack space may limit the size of a subject
-string that can be processed by certain patterns. For a discussion of stack
-issues, see the
-.\" HREF
-\fBpcrestack\fP
-.\"
-documentation.
-.
-.
-.\" HTML <a name="utf8support"></a>
-.SH "UTF-8 AND UNICODE PROPERTY SUPPORT"
-.rs
-.sp
-From release 3.3, PCRE has had some support for character strings encoded in
-the UTF-8 format. For release 4.0 this was greatly extended to cover most
-common requirements, and in release 5.0 additional support for Unicode general
-category properties was added.
-.P
-In order to process UTF-8 strings, you must build PCRE to include UTF-8 support in
-the code, and, in addition, you must call
-.\" HREF
-\fBpcre_compile()\fP
-.\"
-with the PCRE_UTF8 option flag, or the pattern must start with the sequence
-(*UTF8). When either of these is the case, both the pattern and any subject
-strings that are matched against it are treated as UTF-8 strings instead of
-strings of 1-byte characters.
-.P
-If you compile PCRE with UTF-8 support, but do not use it at run time, the
-library will be a bit bigger, but the additional run time overhead is limited
-to testing the PCRE_UTF8 flag occasionally, so should not be very big.
-.P
-If PCRE is built with Unicode character property support (which implies UTF-8
-support), the escape sequences \ep{..}, \eP{..}, and \eX are supported.
-The available properties that can be tested are limited to the general
-category properties such as Lu for an upper case letter or Nd for a decimal
-number, the Unicode script names such as Arabic or Han, and the derived
-properties Any and L&. A full list is given in the
-.\" HREF
-\fBpcrepattern\fP
-.\"
-documentation. Only the short names for properties are supported. For example,
-\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
-Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
-compatibility with Perl 5.6. PCRE does not support this.
-.
-.
-.\" HTML <a name="utf8strings"></a>
-.SS "Validity of UTF-8 strings"
-.rs
-.sp
-When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
-are (by default) checked for validity on entry to the relevant functions. From
-release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
-themselves derived from the Unicode specification. Earlier releases of PCRE
-followed the rules of RFC 2279, which allows the full range of 31-bit values (0
-to 0x7FFFFFFF). The current check allows only values in the range U+0 to
-U+10FFFF, excluding U+D800 to U+DFFF.
-.P
-The excluded code points are the "Low Surrogate Area" of Unicode, of which the
-Unicode Standard says this: "The Low Surrogate Area does not contain any
-character assignments, consequently no character code charts or namelists are
-provided for this area. Surrogates are reserved for use with UTF-16 and then
-must be used in pairs." The code points that are encoded by UTF-16 pairs are
-available as independent code points in the UTF-8 encoding. (In other words,
-the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
-UTF-8.)
-.P
-If an invalid UTF-8 string is passed to PCRE, an error return is given. At
-compile time, the only additional information is the offset to the first byte
-of the failing character. The runtime functions (\fBpcre_exec()\fP and
-\fBpcre_dfa_exec()\fP), pass back this information as well as a more detailed
-reason code if the caller has provided memory in which to do this.
-.P
-In some situations, you may already know that your strings are valid, and
-therefore want to skip these checks in order to improve performance. If you set
-the PCRE_NO_UTF8_CHECK flag at compile time or at run time, PCRE assumes that
-the pattern or subject it is given (respectively) contains only valid UTF-8
-codes. In this case, it does not diagnose an invalid UTF-8 string.
-.P
-If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
-happens depends on why the string is invalid. If the string conforms to the
-"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
-in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
-test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
-rules of RFC 2279. However, if the string does not even conform to RFC 2279,
-the result is undefined. Your program may crash.
-.P
-If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
-encoded in a UTF-8-like manner as per the old RFC, you can set
-PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
-situation, you will have to apply your own validity check.
-.
-.
-.SS "General comments about UTF-8 mode"
-.rs
-.sp
-1. An unbraced hexadecimal escape sequence (such as \exb3) matches a two-byte
-UTF-8 character if the value is greater than 127.
-.P
-2. Octal numbers up to \e777 are recognized, and match two-byte UTF-8
-characters for values greater than \e177.
-.P
-3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
-bytes, for example: \ex{100}{3}.
-.P
-4. The dot metacharacter matches one UTF-8 character instead of a single byte.
-.P
-5. The escape sequence \eC can be used to match a single byte in UTF-8 mode,
-but its use can lead to some strange effects. This facility is not available in
-the alternative matching function, \fBpcre_dfa_exec()\fP.
-.P
-6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
-test characters of any code value, but, by default, the characters that PCRE
-recognizes as digits, spaces, or word characters remain the same set as before,
-all with values less than 256. This remains true even when PCRE is built to
-include Unicode property support, because to do otherwise would slow down PCRE
-in many common cases. Note in particular that this applies to \eb and \eB,
-because they are defined in terms of \ew and \eW. If you really want to test
-for a wider sense of, say, "digit", you can use explicit Unicode property tests
-such as \ep{Nd}. Alternatively, if you set the PCRE_UCP option, the way that
-the character escapes work is changed so that Unicode properties are used to
-determine which characters match. There are more details in the section on
-.\" HTML <a href="pcrepattern.html#genericchartypes">
-.\" </a>
-generic character types
-.\"
-in the
-.\" HREF
-\fBpcrepattern\fP
-.\"
-documentation.
-.P
-7. Similarly, characters that match the POSIX named character classes are all
-low-valued characters, unless the PCRE_UCP option is set.
-.P
-8. However, the horizontal and vertical whitespace matching escapes (\eh, \eH,
-\ev, and \eV) do match all the appropriate Unicode characters, whether or not
-PCRE_UCP is set.
-.P
-9. Case-insensitive matching applies only to characters whose values are less
-than 128, unless PCRE is built with Unicode property support. Even when Unicode
-property support is available, PCRE still uses its own character tables when
-checking the case of low-valued characters, so as not to degrade performance.
-The Unicode property information is used only for characters with higher
-values. Furthermore, PCRE supports case-insensitive matching only when there is
-a one-to-one mapping between a letter's cases. There are a small number of
-many-to-one mappings in Unicode; these are not supported by PCRE.
-.
-.
.SH AUTHOR
.rs
.sp
@@ -308,6 +133,6 @@ two digits 10, at the domain cam.ac.uk.
.rs
.sp
.nf
-Last updated: 07 May 2011
+Last updated: 24 August 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcre.txt b/doc/pcre.txt
index 4001fc7..e8d63f9 100644
--- a/doc/pcre.txt
+++ b/doc/pcre.txt
@@ -85,6 +85,8 @@ USER DOCUMENTATION
pcrecpp details of the C++ wrapper
pcredemo a demonstration C program that uses PCRE
pcregrep description of the pcregrep command
+ pcrejit discussion of the just-in-time optimization support
+ pcrelimits details of size and other limits
pcrematching discussion of the two matching algorithms
pcrepartial details of the partial matching facility
pcrepattern syntax and semantics of supported
@@ -96,169 +98,12 @@ USER DOCUMENTATION
pcrestack discussion of stack usage
pcresyntax quick syntax reference
pcretest description of the pcretest testing command
+ pcreunicode discussion of Unicode and UTF-8 support
In addition, in the "man" and HTML formats, there is a short page for
each C library function, listing its arguments and results.
-LIMITATIONS
-
- There are some size limitations in PCRE but it is hoped that they will
- never in practice be relevant.
-
- The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE
- is compiled with the default internal linkage size of 2. If you want to
- process regular expressions that are truly enormous, you can compile
- PCRE with an internal linkage size of 3 or 4 (see the README file in
- the source distribution and the pcrebuild documentation for details).
- In these cases the limit is substantially larger. However, the speed
- of execution is slower.
-
- All values in repeating quantifiers must be less than 65536.
-
- There is no limit to the number of parenthesized subpatterns, but there
- can be no more than 65535 capturing subpatterns.
-
- The maximum length of name for a named subpattern is 32 characters, and
- the maximum number of named subpatterns is 10000.
-
- The maximum length of a subject string is the largest positive number
- that an integer variable can hold. However, when using the traditional
- matching function, PCRE uses recursion to handle subpatterns and indef-
- inite repetition. This means that the available stack space may limit
- the size of a subject string that can be processed by certain patterns.
- For a discussion of stack issues, see the pcrestack documentation.
-
-
-UTF-8 AND UNICODE PROPERTY SUPPORT
-
- From release 3.3, PCRE has had some support for character strings
- encoded in the UTF-8 format. For release 4.0 this was greatly extended
- to cover most common requirements, and in release 5.0 additional sup-
- port for Unicode general category properties was added.
-
- In order to process UTF-8 strings, you must build PCRE to include UTF-8
- support in the code, and, in addition, you must call pcre_compile()
- with the PCRE_UTF8 option flag, or the pattern must start with the
- sequence (*UTF8). When either of these is the case, both the pattern
- and any subject strings that are matched against it are treated as
- UTF-8 strings instead of strings of 1-byte characters.
-
- If you compile PCRE with UTF-8 support, but do not use it at run time,
- the library will be a bit bigger, but the additional run time overhead
- is limited to testing the PCRE_UTF8 flag occasionally, so should not be
- very big.
-
- If PCRE is built with Unicode character property support (which implies
- UTF-8 support), the escape sequences \p{..}, \P{..}, and \X are sup-
- ported. The available properties that can be tested are limited to the
- general category properties such as Lu for an upper case letter or Nd
- for a decimal number, the Unicode script names such as Arabic or Han,
- and the derived properties Any and L&. A full list is given in the
- pcrepattern documentation. Only the short names for properties are sup-
- ported. For example, \p{L} matches a letter. Its Perl synonym, \p{Let-
- ter}, is not supported. Furthermore, in Perl, many properties may
- optionally be prefixed by "Is", for compatibility with Perl 5.6. PCRE
- does not support this.
-
- Validity of UTF-8 strings
-
- When you set the PCRE_UTF8 flag, the strings passed as patterns and
- subjects are (by default) checked for validity on entry to the relevant
- functions. From release 7.3 of PCRE, the check is according to the rules
- of RFC 3629, which are themselves derived from the Unicode specifica-
- tion. Earlier releases of PCRE followed the rules of RFC 2279, which
- allows the full range of 31-bit values (0 to 0x7FFFFFFF). The current
- check allows only values in the range U+0 to U+10FFFF, excluding U+D800
- to U+DFFF.
-
- The excluded code points are the "Low Surrogate Area" of Unicode, of
- which the Unicode Standard says this: "The Low Surrogate Area does not
- contain any character assignments, consequently no character code
- charts or namelists are provided for this area. Surrogates are reserved
- for use with UTF-16 and then must be used in pairs." The code points
- that are encoded by UTF-16 pairs are available as independent code
- points in the UTF-8 encoding. (In other words, the whole surrogate
- thing is a fudge for UTF-16 which unfortunately messes up UTF-8.)
-
- If an invalid UTF-8 string is passed to PCRE, an error return is given.
- At compile time, the only additional information is the offset to the
- first byte of the failing character. The runtime functions (pcre_exec()
- and pcre_dfa_exec()), pass back this information as well as a more
- detailed reason code if the caller has provided memory in which to do
- this.
-
- In some situations, you may already know that your strings are valid,
- and therefore want to skip these checks in order to improve perfor-
- mance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or at run
- time, PCRE assumes that the pattern or subject it is given (respec-
- tively) contains only valid UTF-8 codes. In this case, it does not
- diagnose an invalid UTF-8 string.
-
- If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set,
- what happens depends on why the string is invalid. If the string con-
- forms to the "old" definition of UTF-8 (RFC 2279), it is processed as a
- string of characters in the range 0 to 0x7FFFFFFF. In other words,
- apart from the initial validity test, PCRE (when in UTF-8 mode) handles
- strings according to the more liberal rules of RFC 2279. However, if
- the string does not even conform to RFC 2279, the result is undefined.
- Your program may crash.
-
- If you want to process strings of values in the full range 0 to
- 0x7FFFFFFF, encoded in a UTF-8-like manner as per the old RFC, you can
- set PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in
- this situation, you will have to apply your own validity check.
-
- General comments about UTF-8 mode
-
- 1. An unbraced hexadecimal escape sequence (such as \xb3) matches a
- two-byte UTF-8 character if the value is greater than 127.
-
- 2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
- characters for values greater than \177.
-
- 3. Repeat quantifiers apply to complete UTF-8 characters, not to indi-
- vidual bytes, for example: \x{100}{3}.
-
- 4. The dot metacharacter matches one UTF-8 character instead of a sin-
- gle byte.
-
- 5. The escape sequence \C can be used to match a single byte in UTF-8
- mode, but its use can lead to some strange effects. This facility is
- not available in the alternative matching function, pcre_dfa_exec().
-
- 6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
- test characters of any code value, but, by default, the characters that
- PCRE recognizes as digits, spaces, or word characters remain the same
- set as before, all with values less than 256. This remains true even
- when PCRE is built to include Unicode property support, because to do
- otherwise would slow down PCRE in many common cases. Note in particular
- that this applies to \b and \B, because they are defined in terms of \w
- and \W. If you really want to test for a wider sense of, say, "digit",
- you can use explicit Unicode property tests such as \p{Nd}. Alterna-
- tively, if you set the PCRE_UCP option, the way that the character
- escapes work is changed so that Unicode properties are used to deter-
- mine which characters match. There are more details in the section on
- generic character types in the pcrepattern documentation.
-
- 7. Similarly, characters that match the POSIX named character classes
- are all low-valued characters, unless the PCRE_UCP option is set.
-
- 8. However, the horizontal and vertical whitespace matching escapes
- (\h, \H, \v, and \V) do match all the appropriate Unicode characters,
- whether or not PCRE_UCP is set.
-
- 9. Case-insensitive matching applies only to characters whose values
- are less than 128, unless PCRE is built with Unicode property support.
- Even when Unicode property support is available, PCRE still uses its
- own character tables when checking the case of low-valued characters,
- so as not to degrade performance. The Unicode property information is
- used only for characters with higher values. Furthermore, PCRE supports
- case-insensitive matching only when there is a one-to-one mapping
- between a letter's cases. There are a small number of many-to-one map-
- pings in Unicode; these are not supported by PCRE.
-
-
AUTHOR
Philip Hazel
@@ -272,11 +117,11 @@ AUTHOR
REVISION
- Last updated: 07 May 2011
+ Last updated: 24 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREBUILD(3) PCREBUILD(3)
@@ -622,8 +467,8 @@ REVISION
Last updated: 02 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREMATCHING(3) PCREMATCHING(3)
@@ -826,8 +671,8 @@ REVISION
Last updated: 17 November 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREAPI(3) PCREAPI(3)
@@ -1453,8 +1298,8 @@ COMPILING A PATTERN
strings of UTF-8 characters instead of single-byte character strings.
However, it is available only when PCRE is built to include UTF-8 sup-
port. If not, the use of this option provokes an error. Details of how
- this option changes the behaviour of PCRE are given in the section on
- UTF-8 support in the main pcre page.
+ this option changes the behaviour of PCRE are given in the pcreunicode
+ page.
PCRE_NO_UTF8_CHECK
@@ -2998,8 +2843,8 @@ REVISION
Last updated: 13 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECALLOUT(3) PCRECALLOUT(3)
@@ -3187,8 +3032,8 @@ REVISION
Last updated: 31 July 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECOMPAT(3) PCRECOMPAT(3)
@@ -3203,54 +3048,53 @@ DIFFERENCES BETWEEN PCRE AND PERL
respect to Perl versions 5.10 and above.
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details
- of what it does have are given in the section on UTF-8 support in the
- main pcre page.
+ of what it does have are given in the pcreunicode page.
2. PCRE allows repeat quantifiers only on parenthesized assertions, but
- they do not mean what you might think. For example, (?!a){3} does not
+ they do not mean what you might think. For example, (?!a){3} does not
assert that the next three characters are not "a". It just asserts that
the next character is not "a" three times (in principle: PCRE optimizes
this to run the assertion just once). Perl allows repeat quantifiers on
other assertions such as \b, but these do not seem to have any use.
- 3. Capturing subpatterns that occur inside negative lookahead asser-
- tions are counted, but their entries in the offsets vector are never
- set. Perl sets its numerical variables from any such patterns that are
+ 3. Capturing subpatterns that occur inside negative lookahead asser-
+ tions are counted, but their entries in the offsets vector are never
+ set. Perl sets its numerical variables from any such patterns that are
matched before the assertion fails to match something (thereby succeed-
- ing), but only if the negative lookahead assertion contains just one
+ ing), but only if the negative lookahead assertion contains just one
branch.
- 4. Though binary zero characters are supported in the subject string,
+ 4. Though binary zero characters are supported in the subject string,
they are not allowed in a pattern string because it is passed as a nor-
mal C string, terminated by zero. The escape sequence \0 can be used in
the pattern to represent a binary zero.
- 5. The following Perl escape sequences are not supported: \l, \u, \L,
- \U, and \N when followed by a character name or Unicode value. (\N on
+ 5. The following Perl escape sequences are not supported: \l, \u, \L,
+ \U, and \N when followed by a character name or Unicode value. (\N on
its own, matching a non-newline character, is supported.) In fact these
- are implemented by Perl's general string-handling and are not part of
- its pattern matching engine. If any of these are encountered by PCRE,
+ are implemented by Perl's general string-handling and are not part of
+ its pattern matching engine. If any of these are encountered by PCRE,
an error is generated.
- 6. The Perl escape sequences \p, \P, and \X are supported only if PCRE
- is built with Unicode character property support. The properties that
- can be tested with \p and \P are limited to the general category prop-
- erties such as Lu and Nd, script names such as Greek or Han, and the
- derived properties Any and L&. PCRE does support the Cs (surrogate)
- property, which Perl does not; the Perl documentation says "Because
+ 6. The Perl escape sequences \p, \P, and \X are supported only if PCRE
+ is built with Unicode character property support. The properties that
+ can be tested with \p and \P are limited to the general category prop-
+ erties such as Lu and Nd, script names such as Greek or Han, and the
+ derived properties Any and L&. PCRE does support the Cs (surrogate)
+ property, which Perl does not; the Perl documentation says "Because
Perl hides the need for the user to understand the internal representa-
- tion of Unicode characters, there is no need to implement the somewhat
+ tion of Unicode characters, there is no need to implement the somewhat
messy concept of surrogates."
- 7. PCRE implements a simpler version of \X than Perl, which changed to
- make \X match what Unicode calls an "extended grapheme cluster". This
- is more complicated than an extended Unicode sequence, which is what
+ 7. PCRE implements a simpler version of \X than Perl, which changed to
+ make \X match what Unicode calls an "extended grapheme cluster". This
+ is more complicated than an extended Unicode sequence, which is what
PCRE matches.
8. PCRE does support the \Q...\E escape for quoting substrings. Charac-
- ters in between are treated as literals. This is slightly different
- from Perl in that $ and @ are also handled as literals inside the
- quotes. In Perl, they cause variable interpolation (but of course PCRE
+ ters in between are treated as literals. This is slightly different
+ from Perl in that $ and @ are also handled as literals inside the
+ quotes. In Perl, they cause variable interpolation (but of course PCRE
does not have variables). Note the following examples:
Pattern PCRE matches Perl matches
@@ -3260,60 +3104,60 @@ DIFFERENCES BETWEEN PCRE AND PERL
\Qabc\$xyz\E abc\$xyz abc\$xyz
\Qabc\E\$\Qxyz\E abc$xyz abc$xyz
- The \Q...\E sequence is recognized both inside and outside character
+ The \Q...\E sequence is recognized both inside and outside character
classes.
9. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
- constructions. However, there is support for recursive patterns. This
- is not available in Perl 5.8, but it is in Perl 5.10. Also, the PCRE
- "callout" feature allows an external function to be called during pat-
+ constructions. However, there is support for recursive patterns. This
+ is not available in Perl 5.8, but it is in Perl 5.10. Also, the PCRE
+ "callout" feature allows an external function to be called during pat-
tern matching. See the pcrecallout documentation for details.
- 10. Subpatterns that are called recursively or as "subroutines" are
- always treated as atomic groups in PCRE. This is like Python, but
- unlike Perl. There is a discussion of an example that explains this in
- more detail in the section on recursion differences from Perl in the
+ 10. Subpatterns that are called recursively or as "subroutines" are
+ always treated as atomic groups in PCRE. This is like Python, but
+ unlike Perl. There is a discussion of an example that explains this in
+ more detail in the section on recursion differences from Perl in the
pcrepattern page.
- 11. There are some differences that are concerned with the settings of
- captured strings when part of a pattern is repeated. For example,
- matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2
+ 11. There are some differences that are concerned with the settings of
+ captured strings when part of a pattern is repeated. For example,
+ matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2
unset, but in PCRE it is set to "b".
- 12. PCRE's handling of duplicate subpattern numbers and duplicate sub-
+ 12. PCRE's handling of duplicate subpattern numbers and duplicate sub-
pattern names is not as general as Perl's. This is a consequence of the
fact that PCRE works internally just with numbers, using an external ta-
- ble to translate between numbers and names. In particular, a pattern
- such as (?|(?<a>A)|(?<b>B)), where the two capturing parentheses have
- the same number but different names, is not supported, and causes an
- error at compile time. If it were allowed, it would not be possible to
- distinguish which parentheses matched, because both names map to cap-
+ ble to translate between numbers and names. In particular, a pattern
+ such as (?|(?<a>A)|(?<b>B)), where the two capturing parentheses have
+ the same number but different names, is not supported, and causes an
+ error at compile time. If it were allowed, it would not be possible to
+ distinguish which parentheses matched, because both names map to cap-
turing subpattern number 1. To avoid this confusing situation, an error
is given at compile time.
- 13. Perl recognizes comments in some places that PCRE does not, for
- example, between the ( and ? at the start of a subpattern. If the /x
- modifier is set, Perl allows whitespace between ( and ? but PCRE never
+ 13. Perl recognizes comments in some places that PCRE does not, for
+ example, between the ( and ? at the start of a subpattern. If the /x
+ modifier is set, Perl allows whitespace between ( and ? but PCRE never
does, even if the PCRE_EXTENDED option is set.
14. PCRE provides some extensions to the Perl regular expression facil-
- ities. Perl 5.10 includes new features that are not in earlier ver-
- sions of Perl, some of which (such as named parentheses) have been in
+ ities. Perl 5.10 includes new features that are not in earlier ver-
+ sions of Perl, some of which (such as named parentheses) have been in
PCRE for some time. This list is with respect to Perl 5.10:
- (a) Although lookbehind assertions in PCRE must match fixed length
- strings, each alternative branch of a lookbehind assertion can match a
- different length of string. Perl requires them all to have the same
+ (a) Although lookbehind assertions in PCRE must match fixed length
+ strings, each alternative branch of a lookbehind assertion can match a
+ different length of string. Perl requires them all to have the same
length.
- (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
+ (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
meta-character matches only at the very end of the string.
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no spe-
cial meaning is faulted. Otherwise, like Perl, the backslash is quietly
ignored. (Perl can be made to issue a warning.)
- (d) If PCRE_UNGREEDY is set, the greediness of the repetition quanti-
+ (d) If PCRE_UNGREEDY is set, the greediness of the repetition quanti-
fiers is inverted, that is, by default they are not greedy, but if fol-
lowed by a question mark they are.
@@ -3321,10 +3165,10 @@ DIFFERENCES BETWEEN PCRE AND PERL
tried only at the first matching position in the subject string.
(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART,
- and PCRE_NO_AUTO_CAPTURE options for pcre_exec() have no Perl equiva-
+ and PCRE_NO_AUTO_CAPTURE options for pcre_exec() have no Perl equiva-
lents.
- (g) The \R escape sequence can be restricted to match only CR, LF, or
+ (g) The \R escape sequence can be restricted to match only CR, LF, or
CRLF by the PCRE_BSR_ANYCRLF option.
(h) The callout facility is PCRE-specific.
@@ -3334,10 +3178,10 @@ DIFFERENCES BETWEEN PCRE AND PERL
(j) Patterns compiled by PCRE can be saved and re-used at a later time,
even on different hosts that have the other endianness.
- (k) The alternative matching function (pcre_dfa_exec()) matches in a
+ (k) The alternative matching function (pcre_dfa_exec()) matches in a
different way and is not Perl-compatible.
- (l) PCRE recognizes some special sequences such as (*CR) at the start
+ (l) PCRE recognizes some special sequences such as (*CR) at the start
of a pattern that set overall options that cannot be changed within the
pattern.
@@ -3351,11 +3195,11 @@ AUTHOR
REVISION
- Last updated: 24 July 2011
+ Last updated: 24 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPATTERN(3) PCREPATTERN(3)
@@ -3391,8 +3235,8 @@ PCRE REGULAR EXPRESSION DETAILS
Starting a pattern with this sequence is equivalent to setting the
PCRE_UTF8 option. This feature is not Perl-compatible. How setting
UTF-8 mode affects pattern matching is mentioned in several places
- below. There is also a summary of UTF-8 features in the section on
- UTF-8 support in the main pcre page.
+ below. There is also a summary of UTF-8 features in the pcreunicode
+ page.
Another special sequence that may appear at the start of a pattern or
in combination with (*UTF8) is:
@@ -5860,11 +5704,11 @@ AUTHOR
REVISION
- Last updated: 24 July 2011
+ Last updated: 24 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRESYNTAX(3) PCRESYNTAX(3)
@@ -6233,8 +6077,157 @@ REVISION
Last updated: 21 November 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
+
+
+PCREUNICODE(3) PCREUNICODE(3)
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+UTF-8 AND UNICODE PROPERTY SUPPORT
+
+ In order to process UTF-8 strings, you must build PCRE to include UTF-8
+ support in the code, and, in addition, you must call pcre_compile()
+ with the PCRE_UTF8 option flag, or the pattern must start with the
+ sequence (*UTF8). When either of these is the case, both the pattern
+ and any subject strings that are matched against it are treated as
+ UTF-8 strings instead of strings of 1-byte characters. PCRE does not
+ support any other formats (in particular, it does not support UTF-16).
+
+ If you compile PCRE with UTF-8 support, but do not use it at run time,
+ the library will be a bit bigger, but the additional run time overhead
+ is limited to testing the PCRE_UTF8 flag occasionally, so should not be
+ very big.
+
+ If PCRE is built with Unicode character property support (which implies
+ UTF-8 support), the escape sequences \p{..}, \P{..}, and \X are sup-
+ ported. The available properties that can be tested are limited to the
+ general category properties such as Lu for an upper case letter or Nd
+ for a decimal number, the Unicode script names such as Arabic or Han,
+ and the derived properties Any and L&. A full list is given in the
+ pcrepattern documentation. Only the short names for properties are sup-
+ ported. For example, \p{L} matches a letter. Its Perl synonym, \p{Let-
+ ter}, is not supported. Furthermore, in Perl, many properties may
+ optionally be prefixed by "Is", for compatibility with Perl 5.6. PCRE
+ does not support this.
+
+ Validity of UTF-8 strings
+
+ When you set the PCRE_UTF8 flag, the strings passed as patterns and
+ subjects are (by default) checked for validity on entry to the relevant
+ functions. From release 7.3 of PCRE, the check is according to the rules
+ of RFC 3629, which are themselves derived from the Unicode specifica-
+ tion. Earlier releases of PCRE followed the rules of RFC 2279, which
+ allows the full range of 31-bit values (0 to 0x7FFFFFFF). The current
+ check allows only values in the range U+0 to U+10FFFF, excluding U+D800
+ to U+DFFF.
+
+ The excluded code points are the "Low Surrogate Area" of Unicode, of
+ which the Unicode Standard says this: "The Low Surrogate Area does not
+ contain any character assignments, consequently no character code
+ charts or namelists are provided for this area. Surrogates are reserved
+ for use with UTF-16 and then must be used in pairs." The code points
+ that are encoded by UTF-16 pairs are available as independent code
+ points in the UTF-8 encoding. (In other words, the whole surrogate
+ thing is a fudge for UTF-16 which unfortunately messes up UTF-8.)
+
+ If an invalid UTF-8 string is passed to PCRE, an error return is given.
+ At compile time, the only additional information is the offset to the
+ first byte of the failing character. The runtime functions pcre_exec()
+ and pcre_dfa_exec() also pass back this information, as well as a more
+ detailed reason code if the caller has provided memory in which to do
+ this.
+
+ In some situations, you may already know that your strings are valid,
+ and therefore want to skip these checks in order to improve perfor-
+ mance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or at run
+ time, PCRE assumes that the pattern or subject it is given (respec-
+ tively) contains only valid UTF-8 codes. In this case, it does not
+ diagnose an invalid UTF-8 string.
+
+ If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set,
+ what happens depends on why the string is invalid. If the string con-
+ forms to the "old" definition of UTF-8 (RFC 2279), it is processed as a
+ string of characters in the range 0 to 0x7FFFFFFF. In other words,
+ apart from the initial validity test, PCRE (when in UTF-8 mode) handles
+ strings according to the more liberal rules of RFC 2279. However, if
+ the string does not even conform to RFC 2279, the result is undefined.
+ Your program may crash.
+
+ If you want to process strings of values in the full range 0 to
+ 0x7FFFFFFF, encoded in a UTF-8-like manner as per the old RFC, you can
+ set PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in
+ this situation, you will have to apply your own validity check.
+
+ General comments about UTF-8 mode
+
+ 1. An unbraced hexadecimal escape sequence (such as \xb3) matches a
+ two-byte UTF-8 character if the value is greater than 127.
+
+ 2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
+ characters for values greater than \177.
+
+ 3. Repeat quantifiers apply to complete UTF-8 characters, not to indi-
+ vidual bytes, for example: \x{100}{3}.
+
+ 4. The dot metacharacter matches one UTF-8 character instead of a sin-
+ gle byte.
+
+ 5. The escape sequence \C can be used to match a single byte in UTF-8
+ mode, but its use can lead to some strange effects. This facility is
+ not available in the alternative matching function, pcre_dfa_exec().
+
+ 6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
+ test characters of any code value, but, by default, the characters that
+ PCRE recognizes as digits, spaces, or word characters remain the same
+ set as before, all with values less than 256. This remains true even
+ when PCRE is built to include Unicode property support, because to do
+ otherwise would slow down PCRE in many common cases. Note in particular
+ that this applies to \b and \B, because they are defined in terms of \w
+ and \W. If you really want to test for a wider sense of, say, "digit",
+ you can use explicit Unicode property tests such as \p{Nd}. Alterna-
+ tively, if you set the PCRE_UCP option, the way that the character
+ escapes work is changed so that Unicode properties are used to deter-
+ mine which characters match. There are more details in the section on
+ generic character types in the pcrepattern documentation.
+
+ 7. Similarly, characters that match the POSIX named character classes
+ are all low-valued characters, unless the PCRE_UCP option is set.
+
+ 8. However, the horizontal and vertical whitespace matching escapes
+ (\h, \H, \v, and \V) do match all the appropriate Unicode characters,
+ whether or not PCRE_UCP is set.
+
+ 9. Case-insensitive matching applies only to characters whose values
+ are less than 128, unless PCRE is built with Unicode property support.
+ Even when Unicode property support is available, PCRE still uses its
+ own character tables when checking the case of low-valued characters,
+ so as not to degrade performance. The Unicode property information is
+ used only for characters with higher values. Furthermore, PCRE supports
+ case-insensitive matching only when there is a one-to-one mapping
+ between a letter's cases. There are a small number of many-to-one map-
+ pings in Unicode; these are not supported by PCRE.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 24 August 2011
+ Copyright (c) 1997-2011 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+------------------------------------------------------------------------------
+
+
PCREPARTIAL(3) PCREPARTIAL(3)
@@ -6653,8 +6646,8 @@ REVISION
Last updated: 07 November 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPRECOMPILE(3) PCREPRECOMPILE(3)
@@ -6778,8 +6771,8 @@ REVISION
Last updated: 17 November 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPERFORM(3) PCREPERFORM(3)
@@ -6946,8 +6939,8 @@ REVISION
Last updated: 16 May 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPOSIX(3) PCREPOSIX(3)
@@ -7209,8 +7202,8 @@ REVISION
Last updated: 16 May 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECPP(3) PCRECPP(3)
@@ -7551,8 +7544,8 @@ REVISION
Last updated: 17 March 2009
Minor typo fixed: 25 July 2011
------------------------------------------------------------------------------
-
-
+
+
PCRESAMPLE(3) PCRESAMPLE(3)
@@ -7638,6 +7631,56 @@ REVISION
Last updated: 17 November 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
+PCRELIMITS(3) PCRELIMITS(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+SIZE AND OTHER LIMITATIONS
+
+ There are some size limitations in PCRE but it is hoped that they will
+ never in practice be relevant.
+
+ The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE
+ is compiled with the default internal linkage size of 2. If you want to
+ process regular expressions that are truly enormous, you can compile
+ PCRE with an internal linkage size of 3 or 4 (see the README file in
+ the source distribution and the pcrebuild documentation for details).
+ In these cases the limit is substantially larger. However, the speed
+ of execution is slower.
+
+ All values in repeating quantifiers must be less than 65536.
+
+ There is no limit to the number of parenthesized subpatterns, but there
+ can be no more than 65535 capturing subpatterns.
+
+ The maximum length of name for a named subpattern is 32 characters, and
+ the maximum number of named subpatterns is 10000.
+
+ The maximum length of a subject string is the largest positive number
+ that an integer variable can hold. However, when using the traditional
+ matching function, PCRE uses recursion to handle subpatterns and indef-
+ inite repetition. This means that the available stack space may limit
+ the size of a subject string that can be processed by certain patterns.
+ For a discussion of stack issues, see the pcrestack documentation.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 24 August 2011
+ Copyright (c) 1997-2011 University of Cambridge.
+------------------------------------------------------------------------------
+
+
PCRESTACK(3) PCRESTACK(3)
@@ -7789,5 +7832,5 @@ REVISION
Last updated: 22 July 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
diff --git a/doc/pcre_assign_jit_stack.3 b/doc/pcre_assign_jit_stack.3
new file mode 100644
index 0000000..55a348a
--- /dev/null
+++ b/doc/pcre_assign_jit_stack.3
@@ -0,0 +1,49 @@
+.TH PCRE_ASSIGN_JIT_STACK 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B void pcre_assign_jit_stack(pcre_extra *\fIextra\fP,
+.ti +5n
+.B pcre_jit_callback \fIcallback\fP, void *\fIdata\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function provides control over the memory used as a stack at runtime by a
+call to \fBpcre_exec()\fP with a pattern that has been successfully compiled
+with JIT optimization. The arguments are:
+.sp
+ extra the data pointer returned by \fBpcre_study()\fP
+ callback a callback function
+ data a JIT stack or a value to be passed to the callback
+ function
+.P
+If \fIcallback\fP is NULL and \fIdata\fP is NULL, an internal 32K block on
+the machine stack is used.
+.P
+If \fIcallback\fP is NULL and \fIdata\fP is not NULL, \fIdata\fP must
+be a valid JIT stack, the result of calling \fBpcre_jit_stack_alloc()\fP.
+.P
+If \fIcallback\fP is not NULL, it is called with \fIdata\fP as an argument at
+the start of matching, in order to set up a JIT stack. If the result is NULL,
+the internal 32K stack is used; otherwise the return value must be a valid JIT
+stack, the result of calling \fBpcre_jit_stack_alloc()\fP.
+.P
+You may safely assign the same JIT stack to multiple patterns, as long as they
+are all matched in the same thread. In a multithread application, each thread
+must use its own JIT stack.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/doc/pcre_config.3 b/doc/pcre_config.3
index d8e1658..d0846e1 100644
--- a/doc/pcre_config.3
+++ b/doc/pcre_config.3
@@ -13,14 +13,18 @@ PCRE - Perl-compatible regular expressions
.rs
.sp
This function makes it possible for a client program to find out which optional
-features are available in the version of the PCRE library it is using. Its
+features are available in the version of the PCRE library it is using. The
arguments are as follows:
.sp
\fIwhat\fP A code specifying what information is required
\fIwhere\fP Points to where to put the data
.sp
-The available codes are:
+The \fIwhere\fP argument must point to an integer variable, except for
+PCRE_CONFIG_MATCH_LIMIT and PCRE_CONFIG_MATCH_LIMIT_RECURSION, when it must
+point to an unsigned long integer. The available codes are:
.sp
+ PCRE_CONFIG_JIT Availability of just-in-time compiler
+ support (1=yes 0=no)
PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
PCRE_CONFIG_MATCH_LIMIT_RECURSION
@@ -35,9 +39,8 @@ The available codes are:
0 all Unicode line endings
1 CR, LF, or CRLF only
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
- Threshold of return slots, above
- which \fBmalloc()\fP is used by
- the POSIX API
+ Threshold of return slots, above which
+ \fBmalloc()\fP is used by the POSIX API
PCRE_CONFIG_STACKRECURSE Recursion implementation (1=stack 0=heap)
PCRE_CONFIG_UTF8 Availability of UTF-8 support (1=yes 0=no)
PCRE_CONFIG_UNICODE_PROPERTIES
diff --git a/doc/pcre_dfa_exec.3 b/doc/pcre_dfa_exec.3
index c8ca381..1f58cbf 100644
--- a/doc/pcre_dfa_exec.3
+++ b/doc/pcre_dfa_exec.3
@@ -75,17 +75,21 @@ page.
.P
A \fBpcre_extra\fP structure contains the following fields:
.sp
- \fIflags\fP Bits indicating which fields are set
- \fIstudy_data\fP Opaque data from \fBpcre_study()\fP
- \fImatch_limit\fP Limit on internal resource use
+ \fIflags\fP Bits indicating which fields are set
+ \fIstudy_data\fP Opaque data from \fBpcre_study()\fP
+ \fImatch_limit\fP Limit on internal resource use
\fImatch_limit_recursion\fP Limit on internal recursion depth
- \fIcallout_data\fP Opaque data passed back to callouts
- \fItables\fP Points to character tables or is NULL
+ \fIcallout_data\fP Opaque data passed back to callouts
+ \fItables\fP Points to character tables or is NULL
+ \fImark\fP For passing back a *MARK pointer
+ \fIexecutable_jit\fP Opaque data from JIT compilation
.sp
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
-PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
-PCRE_EXTRA_TABLES. For this matching function, the \fImatch_limit\fP and
-\fImatch_limit_recursion\fP fields are not used, and must not be set.
+PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA,
+PCRE_EXTRA_TABLES, PCRE_EXTRA_MARK and PCRE_EXTRA_EXECUTABLE_JIT. For this
+matching function, the \fImatch_limit\fP and \fImatch_limit_recursion\fP fields
+are not used, and must not be set. The PCRE_EXTRA_EXECUTABLE_JIT flag and
+the corresponding variable are ignored.
.P
There is a complete description of the PCRE native API in the
.\" HREF
diff --git a/doc/pcre_exec.3 b/doc/pcre_exec.3
index b0578b7..b179455 100644
--- a/doc/pcre_exec.3
+++ b/doc/pcre_exec.3
@@ -61,16 +61,18 @@ For details of partial matching, see the
.\"
page. A \fBpcre_extra\fP structure contains the following fields:
.sp
- \fIflags\fP Bits indicating which fields are set
- \fIstudy_data\fP Opaque data from \fBpcre_study()\fP
- \fImatch_limit\fP Limit on internal resource use
+ \fIflags\fP Bits indicating which fields are set
+ \fIstudy_data\fP Opaque data from \fBpcre_study()\fP
+ \fImatch_limit\fP Limit on internal resource use
\fImatch_limit_recursion\fP Limit on internal recursion depth
- \fIcallout_data\fP Opaque data passed back to callouts
- \fItables\fP Points to character tables or is NULL
+ \fIcallout_data\fP Opaque data passed back to callouts
+ \fItables\fP Points to character tables or is NULL
+ \fImark\fP For passing back a *MARK pointer
+ \fIexecutable_jit\fP Opaque data from JIT compilation
.sp
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
-PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
-PCRE_EXTRA_TABLES.
+PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA,
+PCRE_EXTRA_TABLES, PCRE_EXTRA_MARK and PCRE_EXTRA_EXECUTABLE_JIT.
.P
There is a complete description of the PCRE native API in the
.\" HREF
diff --git a/doc/pcre_free_study.3 b/doc/pcre_free_study.3
new file mode 100644
index 0000000..f6541cb
--- /dev/null
+++ b/doc/pcre_free_study.3
@@ -0,0 +1,27 @@
+.TH PCRE_FREE_STUDY 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B void pcre_free_study(pcre_extra *\fIextra\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function is used to free the memory used for the data generated by a call
+to \fBpcre_study()\fP when it is no longer needed. The argument must be the
+result of such a call.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/doc/pcre_fullinfo.3 b/doc/pcre_fullinfo.3
index 28aec67..446d308 100644
--- a/doc/pcre_fullinfo.3
+++ b/doc/pcre_fullinfo.3
@@ -31,7 +31,9 @@ The following information is available:
or after newline, or
-2 otherwise
PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
+ PCRE_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
+ PCRE_INFO_JIT Return 1 after successful JIT compilation
PCRE_INFO_LASTLITERAL Literal last byte required
PCRE_INFO_MINLENGTH Lower bound length of matching strings
PCRE_INFO_NAMECOUNT Number of named subpatterns
@@ -43,6 +45,15 @@ The following information is available:
PCRE_INFO_SIZE Size of compiled pattern
PCRE_INFO_STUDYSIZE Size of study data
.sp
+The \fIwhere\fP argument must point to an integer variable, except for the
+following \fIwhat\fP values:
+.sp
+ PCRE_INFO_DEFAULT_TABLES const unsigned char *
+ PCRE_INFO_FIRSTTABLE const unsigned char *
+ PCRE_INFO_NAMETABLE const unsigned char *
+ PCRE_INFO_OPTIONS unsigned long int
+ PCRE_INFO_SIZE size_t
+.sp
The yield of the function is zero on success or:
.sp
PCRE_ERROR_NULL the argument \fIcode\fP was NULL
diff --git a/doc/pcre_jit_stack_alloc.3 b/doc/pcre_jit_stack_alloc.3
new file mode 100644
index 0000000..656f546
--- /dev/null
+++ b/doc/pcre_jit_stack_alloc.3
@@ -0,0 +1,31 @@
+.TH PCRE_JIT_STACK_ALLOC 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B pcre_jit_stack *pcre_jit_stack_alloc(int \fIstartsize\fP,
+.ti +5n
+.B int \fImaxsize\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function is used to create a stack for use by the code compiled by the JIT
+optimization of \fBpcre_study()\fP. The arguments are a starting size for the
+stack, and a maximum size to which it is allowed to grow. The result can be
+passed to the JIT runtime code by \fBpcre_assign_jit_stack()\fP, or that
+function can set up a callback for obtaining a stack.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/doc/pcre_jit_stack_free.3 b/doc/pcre_jit_stack_free.3
new file mode 100644
index 0000000..f3e6fb2
--- /dev/null
+++ b/doc/pcre_jit_stack_free.3
@@ -0,0 +1,26 @@
+.TH PCRE_JIT_STACK_FREE 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH SYNOPSIS
+.rs
+.sp
+.B #include <pcre.h>
+.PP
+.SM
+.B void pcre_jit_stack_free(pcre_jit_stack *\fIstack\fP);
+.
+.SH DESCRIPTION
+.rs
+.sp
+This function is used to free a JIT stack that was created by
+\fBpcre_jit_stack_alloc()\fP when it is no longer needed.
+.P
+There is a complete description of the PCRE native API in the
+.\" HREF
+\fBpcreapi\fP
+.\"
+page and a description of the POSIX API in the
+.\" HREF
+\fBpcreposix\fP
+.\"
+page.
diff --git a/doc/pcre_study.3 b/doc/pcre_study.3
index 53f5bc1..e203bb8 100644
--- a/doc/pcre_study.3
+++ b/doc/pcre_study.3
@@ -22,14 +22,19 @@ be extracted that might speed up matching. Its arguments are:
\fIerrptr\fP Where to put an error message
.sp
If the function succeeds, it returns a value that can be passed to
-\fBpcre_exec()\fP via its \fIextra\fP argument.
+\fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP via their \fIextra\fP arguments.
.P
If the function returns NULL, either it could not find any additional
information, or there was an error. You can tell the difference by looking at
the error value. It is NULL in first case.
.P
-There are currently no options defined; the value of the second argument should
-always be zero.
+The only option is PCRE_STUDY_JIT_COMPILE. It requests just-in-time compilation
+if possible. If PCRE has been compiled without JIT support, this option is
+ignored. See the
+.\" HREF
+\fBpcrejit\fP
+.\"
+page for further details.
.P
There is a complete description of the PCRE native API in the
.\" HREF
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index 753eb0c..eac7190 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -1,7 +1,7 @@
.TH PCREAPI 3
.SH NAME
PCRE - Perl-compatible regular expressions
-.SH "PCRE NATIVE API"
+.SH "PCRE NATIVE API BASIC FUNCTIONS"
.rs
.sp
.B #include <pcre.h>
@@ -25,11 +25,25 @@ PCRE - Perl-compatible regular expressions
.ti +5n
.B const char **\fIerrptr\fP);
.PP
+.B void pcre_free_study(pcre_extra *\fIextra\fP);
+.PP
.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
.ti +5n
.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
+.
+.
+.SH "PCRE NATIVE API AUXILIARY FUNCTIONS"
+.rs
+.sp
+.B pcre_jit_stack *pcre_jit_stack_alloc(int \fIstartsize\fP, int \fImaxsize\fP);
+.PP
+.B void pcre_jit_stack_free(pcre_jit_stack *\fIstack\fP);
+.PP
+.B void pcre_assign_jit_stack(pcre_extra *\fIextra\fP,
+.ti +5n
+.B pcre_jit_callback \fIcallback\fP, void *\fIdata\fP);
.PP
.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
@@ -114,12 +128,13 @@ PCRE - Perl-compatible regular expressions
.sp
PCRE has its own native API, which is described in this document. There are
also some wrapper functions that correspond to the POSIX regular expression
-API. These are described in the
+API, but they do not give access to all the functionality. They are described
+in the
.\" HREF
\fBpcreposix\fP
.\"
documentation. Both of these APIs define a set of C function calls. A C++
-wrapper is distributed with PCRE. It is documented in the
+wrapper is also distributed with PCRE. It is documented in the
.\" HREF
\fBpcrecpp\fP
.\"
@@ -152,6 +167,18 @@ documentation, and the
.\"
documentation describes how to compile and run it.
.P
+Just-in-time compiler support is an optional feature of PCRE that can be built
+in appropriate hardware environments. It greatly speeds up the matching
+performance of many patterns. Simple programs can request its use if available.
+More complicated programs might need to make use of the
+\fBpcre_jit_stack_alloc()\fP, \fBpcre_jit_stack_free()\fP, and
+\fBpcre_assign_jit_stack()\fP functions in order to control its memory usage.
+These functions are discussed in the
+.\" HREF
+\fBpcrejit\fP
+.\"
+documentation.
+.P
A second matching function, \fBpcre_dfa_exec()\fP, which is not
Perl-compatible, is also provided. This uses a different algorithm for the
matching. The alternative algorithm finds all possible matches (at a given
@@ -282,6 +309,13 @@ callout function pointed to by \fBpcre_callout\fP, are shared by all threads.
.P
The compiled form of a regular expression is not altered during matching, so
the same compiled pattern can safely be used by several threads at once.
+.P
+If the just-in-time optimization feature is being used, it needs separate
+memory stack areas for each thread. See the
+.\" HREF
+\fBpcrejit\fP
+.\"
+documentation for more details.
.
.
.SH "SAVING PRECOMPILED PATTERNS FOR LATER USE"
@@ -324,6 +358,11 @@ otherwise it is set to zero.
The output is an integer that is set to one if support for Unicode character
properties is available; otherwise it is set to zero.
.sp
+ PCRE_CONFIG_JIT
+.sp
+The output is an integer that is set to one if support for just-in-time
+compiling is available; otherwise it is set to zero.
+.sp
PCRE_CONFIG_NEWLINE
.sp
The output is an integer whose value specifies the default character sequence
@@ -701,13 +740,8 @@ of UTF-8 characters instead of single-byte character strings. However, it is
available only when PCRE is built to include UTF-8 support. If not, the use
of this option provokes an error. Details of how this option changes the
behaviour of PCRE are given in the
-.\" HTML <a href="pcre.html#utf8support">
-.\" </a>
-section on UTF-8 support
-.\"
-in the main
.\" HREF
-\fBpcre\fP
+\fBpcreunicode\fP
.\"
page.
.sp
@@ -849,8 +883,23 @@ If studying the pattern does not produce any useful information,
wants to pass any of the other fields to \fBpcre_exec()\fP or
\fBpcre_dfa_exec()\fP, it must set up its own \fBpcre_extra\fP block.
.P
-The second argument of \fBpcre_study()\fP contains option bits. At present, no
-options are defined, and this argument should always be zero.
+The second argument of \fBpcre_study()\fP contains option bits. There is only
+one option: PCRE_STUDY_JIT_COMPILE. If this is set, and the just-in-time
+compiler is available, the pattern is further compiled into machine code that
+executes much faster than the \fBpcre_exec()\fP matching function. If
+the just-in-time compiler is not available, this option is ignored. All other
+bits in the \fIoptions\fP argument must be zero.
+.P
+JIT compilation is a heavyweight optimization. It can take some time for
+patterns to be analyzed, and for one-off matches and simple patterns the
+benefit of faster execution might be offset by a much slower study time.
+Not all patterns can be optimized by the JIT compiler. For those that cannot be
+handled, matching automatically falls back to the \fBpcre_exec()\fP
+interpreter. For more details, see the
+.\" HREF
+\fBpcrejit\fP
+.\"
+documentation.
.P
The third argument for \fBpcre_study()\fP is a pointer for an error message. If
studying succeeds (even if no data is returned), the variable it points to is
@@ -859,13 +908,29 @@ static string that is part of the library. You must not try to free it. You
should test the error pointer for NULL after calling \fBpcre_study()\fP, to be
sure that it has run successfully.
.P
-This is a typical call to \fBpcre_study\fP():
+When you are finished with a pattern, you can free the memory used for the
+study data by calling \fBpcre_free_study()\fP. This function was added to the
+API for release 8.20. For earlier versions, the memory could be freed with
+\fBpcre_free()\fP, just like the pattern itself. This will still work in cases
+where PCRE_STUDY_JIT_COMPILE is not used, but it is advisable to change to the
+new function when convenient.
+.P
+This is a typical way in which \fBpcre_study\fP() is used (except that in a
+real application there should be tests for errors):
.sp
- pcre_extra *pe;
- pe = pcre_study(
+ int rc;
+ pcre *re;
+ pcre_extra *sd;
+ re = pcre_compile("pattern", 0, &error, &erroroffset, NULL);
+ sd = pcre_study(
re, /* result of pcre_compile() */
- 0, /* no options exist */
+ 0, /* no options */
&error); /* set to NULL or points to a message */
+ rc = pcre_exec( /* see below for details of pcre_exec() options */
+ re, sd, "subject", 7, 0, 0, ovector, 30);
+ ...
+ pcre_free_study(sd);
+ pcre_free(re);
.sp
Studying a pattern does two things: first, a lower bound for the length of
subject string that is needed to match the pattern is computed. This does not
@@ -880,11 +945,15 @@ single fixed starting character. A bitmap of possible starting bytes is
created. This speeds up finding a position in the subject at which to start
matching.
.P
-The two optimizations just described can be disabled by setting the
-PCRE_NO_START_OPTIMIZE option when calling \fBpcre_exec()\fP or
+These two optimizations apply to both \fBpcre_exec()\fP and
+\fBpcre_dfa_exec()\fP. However, they are not used by \fBpcre_exec()\fP if
+\fBpcre_study()\fP is called with the PCRE_STUDY_JIT_COMPILE option, and
+just-in-time compiling is successful. The optimizations can be disabled by
+setting the PCRE_NO_START_OPTIMIZE option when calling \fBpcre_exec()\fP or
\fBpcre_dfa_exec()\fP. You might want to do this if your pattern contains
-callouts or (*MARK), and you want to make use of these facilities in cases
-where matching fails. See the discussion of PCRE_NO_START_OPTIMIZE
+callouts or (*MARK) (which cannot be handled by the JIT compiler), and you want
+to make use of these facilities in cases where matching fails. See the
+discussion of PCRE_NO_START_OPTIMIZE
.\" HTML <a href="#execoptions">
.\" </a>
below.
@@ -981,7 +1050,7 @@ check against passing an arbitrary memory pointer. Here is a typical call of
size_t length;
rc = pcre_fullinfo(
re, /* result of pcre_compile() */
- pe, /* result of pcre_study(), or NULL */
+ sd, /* result of pcre_study(), or NULL */
PCRE_INFO_SIZE, /* what is required */
&length); /* where to put the data */
.sp
@@ -1046,6 +1115,19 @@ Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
0. The fourth argument should point to an \fBint\fP variable. (?J) and
(?-J) set and unset the local PCRE_DUPNAMES option, respectively.
.sp
+ PCRE_INFO_JIT
+.sp
+Return 1 if the pattern was studied with the PCRE_STUDY_JIT_COMPILE option, and
+just-in-time compiling was successful. The fourth argument should point to an
+\fBint\fP variable. A return value of 0 means that JIT support is not available
+in this version of PCRE, or that the pattern was not studied with the
+PCRE_STUDY_JIT_COMPILE option, or that the JIT compiler could not handle this
+particular pattern. See the
+.\" HREF
+\fBpcrejit\fP
+.\"
+documentation for details of what can and cannot be handled.
+.sp
PCRE_INFO_LASTLITERAL
.sp
Return the value of the rightmost literal byte that must exist in any matched
@@ -1290,6 +1372,7 @@ fields (not necessarily in this order):
.sp
unsigned long int \fIflags\fP;
void *\fIstudy_data\fP;
+ void *\fIexecutable_jit\fP;
unsigned long int \fImatch_limit\fP;
unsigned long int \fImatch_limit_recursion\fP;
void *\fIcallout_data\fP;
@@ -1300,28 +1383,37 @@ The \fIflags\fP field is a bitmap that specifies which of the other fields
are set. The flag bits are:
.sp
PCRE_EXTRA_STUDY_DATA
+ PCRE_EXTRA_EXECUTABLE_JIT
PCRE_EXTRA_MATCH_LIMIT
PCRE_EXTRA_MATCH_LIMIT_RECURSION
PCRE_EXTRA_CALLOUT_DATA
PCRE_EXTRA_TABLES
PCRE_EXTRA_MARK
.sp
-Other flag bits should be set to zero. The \fIstudy_data\fP field is set in the
-\fBpcre_extra\fP block that is returned by \fBpcre_study()\fP, together with
-the appropriate flag bit. You should not set this yourself, but you may add to
-the block by setting the other fields and their corresponding flag bits.
+Other flag bits should be set to zero. The \fIstudy_data\fP field and sometimes
+the \fIexecutable_jit\fP field are set in the \fBpcre_extra\fP block that is
+returned by \fBpcre_study()\fP, together with the appropriate flag bits. You
+should not set these yourself, but you may add to the block by setting the
+other fields and their corresponding flag bits.
.P
The \fImatch_limit\fP field provides a means of preventing PCRE from using up a
vast amount of resources when running patterns that are not going to match,
but which have a very large number of possibilities in their search trees. The
classic example is a pattern that uses nested unlimited repeats.
.P
-Internally, PCRE uses a function called \fBmatch()\fP which it calls repeatedly
-(sometimes recursively). The limit set by \fImatch_limit\fP is imposed on the
-number of times this function is called during a match, which has the effect of
-limiting the amount of backtracking that can take place. For patterns that are
-not anchored, the count restarts from zero for each position in the subject
-string.
+Internally, \fBpcre_exec()\fP uses a function called \fBmatch()\fP, which it
+calls repeatedly (sometimes recursively). The limit set by \fImatch_limit\fP is
+imposed on the number of times this function is called during a match, which
+has the effect of limiting the amount of backtracking that can take place. For
+patterns that are not anchored, the count restarts from zero for each position
+in the subject string.
+.P
+When \fBpcre_exec()\fP is called with a pattern that was successfully studied
+with the PCRE_STUDY_JIT_COMPILE option, the way that the matching is executed
+is entirely different. However, there is still the possibility of runaway
+matching that goes on for a very long time, and so the \fImatch_limit\fP value
+is also used in this case (but in a different way) to limit how long the
+matching can continue.
.P
The default value for the limit can be set when PCRE is built; the default
default is 10 million, which handles all but the most extreme cases. You can
@@ -1334,11 +1426,13 @@ The \fImatch_limit_recursion\fP field is similar to \fImatch_limit\fP, but
instead of limiting the total number of times that \fBmatch()\fP is called, it
limits the depth of recursion. The recursion depth is a smaller number than the
total number of calls, because not all calls to \fBmatch()\fP are recursive.
-This limit is of use only if it is set smaller than \fImatch_limit\fP.
+This limit is of use only if it is set smaller than \fImatch_limit\fP.
.P
-Limiting the recursion depth limits the amount of stack that can be used, or,
-when PCRE has been compiled to use memory on the heap instead of the stack, the
-amount of heap memory that can be used.
+Limiting the recursion depth limits the amount of machine stack that can be
+used, or, when PCRE has been compiled to use memory on the heap instead of the
+stack, the amount of heap memory that can be used. This limit is not relevant,
+and is ignored, when the pattern was successfully studied with
+PCRE_STUDY_JIT_COMPILE.
.P
The default value for \fImatch_limit_recursion\fP can be set when PCRE is
built; the default default is the same value as the default for
@@ -1885,6 +1979,16 @@ in the subject string. Some simple patterns that might do this are detected and
faulted at compile time, but more complicated cases, in particular mutual
recursions between two different subpatterns, cannot be detected until run
time.
+.sp
+ PCRE_ERROR_JIT_STACKLIMIT (-27)
+.sp
+This error is returned when a pattern that was successfully studied using the
+PCRE_STUDY_JIT_COMPILE option is matched, but the memory available for the
+just-in-time processing stack is not large enough. See the
+.\" HREF
+\fBpcrejit\fP
+.\"
+documentation for more details.
.P
Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
.
@@ -2354,8 +2458,9 @@ group. These are not supported.
PCRE_ERROR_DFA_UMLIMIT (-18)
.sp
This return is given if \fBpcre_dfa_exec()\fP is called with an \fIextra\fP
-block that contains a setting of the \fImatch_limit\fP field. This is not
-supported (it is meaningless).
+block that contains a setting of the \fImatch_limit\fP or
+\fImatch_limit_recursion\fP fields. This is not supported (these fields are
+meaningless for DFA matching).
.sp
PCRE_ERROR_DFA_WSSIZE (-19)
.sp
@@ -2392,6 +2497,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 13 August 2011
+Last updated: 27 August 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcrebuild.3 b/doc/pcrebuild.3
index 2bc4494..b8c12b7 100644
--- a/doc/pcrebuild.3
+++ b/doc/pcrebuild.3
@@ -98,6 +98,22 @@ supported. Details are given in the
documentation.
.
.
+.SH "JUST-IN-TIME COMPILER SUPPORT"
+.rs
+.sp
+Just-in-time compiler support is included in the build by specifying
+.sp
+ --enable-jit
+.sp
+This support is available only for certain hardware architectures. If this
+option is set for an unsupported architecture, a compile time error occurs.
+See the
+.\" HREF
+\fBpcrejit\fP
+.\"
+documentation for a discussion of JIT usage.
+.
+.
.SH "CODE VALUE OF NEWLINE"
.rs
.sp
@@ -367,6 +383,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 02 August 2011
+Last updated: 27 August 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcrecallout.3 b/doc/pcrecallout.3
index 4b3b172..80fab76 100644
--- a/doc/pcrecallout.3
+++ b/doc/pcrecallout.3
@@ -39,6 +39,10 @@ pattern matching. The
command has an option that sets automatic callouts; when it is used, the output
indicates how the pattern is matched. This is useful information when you are
trying to optimize the performance of a particular pattern.
+.P
+The use of callouts in a pattern makes it ineligible for optimization by the
+just-in-time compiler. Studying such a pattern with the PCRE_STUDY_JIT_COMPILE
+option always fails.
.
.
.SH "MISSING CALLOUTS"
@@ -191,6 +195,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 31 July 2011
+Last updated: 26 August 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcrecompat.3 b/doc/pcrecompat.3
index 37ff217..86b3635 100644
--- a/doc/pcrecompat.3
+++ b/doc/pcrecompat.3
@@ -10,13 +10,8 @@ versions 5.10 and above.
.P
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
it does have are given in the
-.\" HTML <a href="pcre.html#utf8support">
-.\" </a>
-section on UTF-8 support
-.\"
-in the main
.\" HREF
-\fBpcre\fP
+\fBpcreunicode\fP
.\"
page.
.P
@@ -173,6 +168,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 24 July 2011
+Last updated: 24 August 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcrejit.3 b/doc/pcrejit.3
new file mode 100644
index 0000000..da14ca9
--- /dev/null
+++ b/doc/pcrejit.3
@@ -0,0 +1,234 @@
+.TH PCREJIT 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "PCRE JUST-IN-TIME COMPILER SUPPORT"
+.rs
+.sp
+Just-in-time compiling is a heavyweight optimization that can greatly speed up
+pattern matching. However, it comes at the cost of extra processing before the
+match is performed. Therefore, it is of most benefit when the same pattern is
+going to be matched many times. This does not necessarily mean many calls of
+\fBpcre_exec()\fP; if the pattern is not anchored, matching attempts may take
+place many times at various positions in the subject, even for a single call to
+\fBpcre_exec()\fP. If the subject string is very long, it may still pay to use
+JIT for one-off matches.
+.P
+JIT support applies only to the traditional matching function,
+\fBpcre_exec()\fP. It does not apply when \fBpcre_dfa_exec()\fP is being used.
+The code for this support was written by Zoltan Herczeg.
+.
+.
+.SH "AVAILABILITY OF JIT SUPPORT"
+.rs
+.sp
+JIT support is an optional feature of PCRE. The "configure" option --enable-jit
+(or equivalent CMake option) must be set when PCRE is built if you want to use
+JIT. The support is limited to the following hardware platforms:
+.sp
+ ARM v5, v7, and Thumb2
+ MIPS 32-bit
+ Power PC 32-bit and 64-bit
+ Intel x86 32-bit and 64-bit
+.sp
+If --enable-jit is set on an unsupported platform, compilation fails.
+.P
+A program can tell if JIT support is available by calling \fBpcre_config()\fP
+with the PCRE_CONFIG_JIT option. The result is 1 when JIT is available, and 0
+otherwise. However, a simple program does not need to check this in order to
+use JIT. The API is implemented in a way that falls back to the ordinary PCRE
+code if JIT is not available.
+.
+.
+.SH "SIMPLE USE OF JIT"
+.rs
+.sp
+You have to do two things to make use of the JIT support in the simplest way:
+.sp
+ (1) Call \fBpcre_study()\fP with the PCRE_STUDY_JIT_COMPILE option for
+ each compiled pattern, and pass the resulting \fBpcre_extra\fP block to
+ \fBpcre_exec()\fP.
+
+ (2) Use \fBpcre_free_study()\fP to free the \fBpcre_extra\fP block when it is
+ no longer needed instead of just freeing it yourself. This ensures that
+ any JIT data is also freed.
+.sp
+In some circumstances you may need to call additional functions. These are
+described in the section entitled
+.\" HTML <a href="#stackcontrol">
+.\" </a>
+"Controlling the JIT stack"
+.\"
+below.
+.P
+If JIT support is not available, PCRE_STUDY_JIT_COMPILE is ignored, and no JIT
+data is set up. Otherwise, the compiled pattern is passed to the JIT compiler,
+which turns it into machine code that executes much faster than the normal
+interpretive code. When \fBpcre_exec()\fP is passed a \fBpcre_extra\fP block
+containing a pointer to JIT code, it obeys that instead of the normal code. The
+result is identical, but the code runs much faster.
+.P
+There are some \fBpcre_exec()\fP options that are not supported for JIT
+execution. There are also some pattern items that JIT cannot handle. Details
+are given below. In both cases, execution automatically falls back to the
+interpretive code.
+.P
+If the JIT compiler finds an unsupported item, no JIT data is generated. You
+can find out if JIT execution is available after studying a pattern by calling
+\fBpcre_fullinfo()\fP with the PCRE_INFO_JIT option. A result of 1 means that
+JIT compilation was successful. A result of 0 means that JIT support is not
+available, or the pattern was not studied with PCRE_STUDY_JIT_COMPILE, or the
+JIT compiler was not able to handle the pattern.
+.
+.
+.SH "UNSUPPORTED OPTIONS AND PATTERN ITEMS"
+.rs
+.sp
+The only \fBpcre_exec()\fP options that are supported for JIT execution are
+PCRE_NO_UTF8_CHECK, PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and
+PCRE_NOTEMPTY_ATSTART. Note in particular that partial matching is not
+supported.
+.P
+The unsupported pattern items are:
+.sp
+ \eC match a single byte, even in UTF-8 mode
+ (?Cn) callouts
+ (?(<name>)... conditional test on setting of a named subpattern
+ (?(R)... conditional test on whole pattern recursion
+ (?(Rn)... conditional test on recursion, by number
+ (?(R&name)... conditional test on recursion, by name
+ (*COMMIT) )
+ (*MARK) )
+ (*PRUNE) ) the backtracking control verbs
+ (*SKIP) )
+ (*THEN) )
+.sp
+Support for some of these may be added in future.
+.
+.
+.SH "RETURN VALUES FROM JIT EXECUTION"
+.rs
+.sp
+When a pattern is matched using JIT execution, the return values are the same
+as those given by the interpretive \fBpcre_exec()\fP code, with the addition of
+one new error code: PCRE_ERROR_JIT_STACKLIMIT. This means that the memory used
+for the JIT stack was insufficient. See
+.\" HTML <a href="#stackcontrol">
+.\" </a>
+"Controlling the JIT stack"
+.\"
+below for a discussion of JIT stack usage.
+.P
+The error code PCRE_ERROR_MATCHLIMIT is returned by the JIT code if searching a
+very large pattern tree goes on for too long, as it is in the same circumstance
+when JIT is not used, but the details of exactly what is counted are not the
+same. The PCRE_ERROR_RECURSIONLIMIT error code is never returned by JIT
+execution.
+.
+.
+.SH "SAVING AND RESTORING COMPILED PATTERNS"
+.rs
+.sp
+The code that is generated by the JIT compiler is architecture-specific, and is
+also position dependent. For those reasons it cannot be saved and restored like
+the bytecode and other data of a compiled pattern. You should be able to run
+\fBpcre_study()\fP on a saved and restored pattern, and thereby recreate the
+JIT data, but because JIT compilation uses significant resources, it is
+probably not worth doing.
+.
+.
+.\" HTML <a name="stackcontrol"></a>
+.SH "CONTROLLING THE JIT STACK"
+.rs
+.sp
+When the compiled JIT code runs, it needs a block of memory to use as a stack.
+By default, it uses 32K on the machine stack. However, some large or
+complicated patterns need more than this. The error PCRE_ERROR_JIT_STACKLIMIT
+is given when there is not enough stack. Three functions are provided for
+setting up alternative blocks of memory for use as JIT stacks.
+.P
+The \fBpcre_jit_stack_alloc()\fP function creates a JIT stack. Its arguments
+are a starting size and a maximum size, and it returns an opaque value
+of type \fBpcre_jit_stack\fP that represents a JIT stack, or NULL if there is
+an error. The \fBpcre_jit_stack_free()\fP function can be used to free a stack
+that is no longer needed.
+.P
+The \fBpcre_assign_jit_stack()\fP function specifies which stack JIT code
+should use. Its arguments are as follows:
+.sp
+ pcre_extra *extra
+ pcre_jit_callback callback
+ void *data
+.sp
+The \fIextra\fP argument must be the result of studying a pattern with
+PCRE_STUDY_JIT_COMPILE. There are three cases for the values of the other two
+arguments:
+.sp
+ (1) If \fIcallback\fP is NULL and \fIdata\fP is NULL, an internal 32K block
+ on the machine stack is used.
+.sp
+ (2) If \fIcallback\fP is NULL and \fIdata\fP is not NULL, \fIdata\fP must be
+ a valid JIT stack, the result of calling \fBpcre_jit_stack_alloc()\fP.
+.sp
+  (3) If \fIcallback\fP is not NULL, it must point to a function that is called
+ with \fIdata\fP as an argument at the start of matching, in order to
+ set up a JIT stack. If the result is NULL, the internal 32K stack
+ is used; otherwise the return value must be a valid JIT stack,
+ the result of calling \fBpcre_jit_stack_alloc()\fP.
+.sp
+You may safely assign the same JIT stack to more than one pattern, as long as
+they are all matched sequentially in the same thread. In a multithread
+application, each thread must use its own JIT stack.
+.P
+All the functions described in this section do nothing if JIT is not available,
+and \fBpcre_assign_jit_stack()\fP does nothing unless the \fIextra\fP argument
+is non-NULL and points to a \fBpcre_extra\fP block that is the result of a
+successful study with PCRE_STUDY_JIT_COMPILE.
+.
+.
+.SH "EXAMPLE CODE"
+.rs
+.sp
+This is a single-threaded example that specifies a JIT stack without using a
+callback.
+.sp
+ int rc;
+ pcre *re;
+ pcre_extra *extra;
+ pcre_jit_stack *jit_stack;
+.sp
+ re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
+ /* Check for errors */
+ extra = pcre_study(re, PCRE_STUDY_JIT_COMPILE, &error);
+ jit_stack = pcre_jit_stack_alloc(1, 512 * 1024);
+ /* Check for error (NULL) */
+ pcre_assign_jit_stack(extra, NULL, jit_stack);
+ rc = pcre_exec(re, extra, subject, length, 0, 0, ovector, ovecsize);
+ /* Check results */
+ pcre_free(re);
+ pcre_free_study(extra);
+.sp
+.
+.
+.SH "SEE ALSO"
+.rs
+.sp
+\fBpcreapi\fP(3)
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 28 August 2011
+Copyright (c) 1997-2011 University of Cambridge.
+.fi
diff --git a/doc/pcrelimits.3 b/doc/pcrelimits.3
new file mode 100644
index 0000000..1fceec9
--- /dev/null
+++ b/doc/pcrelimits.3
@@ -0,0 +1,57 @@
+.TH PCRELIMITS 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "SIZE AND OTHER LIMITATIONS"
+.rs
+.sp
+There are some size limitations in PCRE but it is hoped that they will never in
+practice be relevant.
+.P
+The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is
+compiled with the default internal linkage size of 2. If you want to process
+regular expressions that are truly enormous, you can compile PCRE with an
+internal linkage size of 3 or 4 (see the \fBREADME\fP file in the source
+distribution and the
+.\" HREF
+\fBpcrebuild\fP
+.\"
+documentation for details). In these cases the limit is substantially larger.
+However, the speed of execution is slower.
+.P
+All values in repeating quantifiers must be less than 65536.
+.P
+There is no limit to the number of parenthesized subpatterns, but there can be
+no more than 65535 capturing subpatterns.
+.P
+The maximum length of name for a named subpattern is 32 characters, and the
+maximum number of named subpatterns is 10000.
+.P
+The maximum length of a subject string is the largest positive number that an
+integer variable can hold. However, when using the traditional matching
+function, PCRE uses recursion to handle subpatterns and indefinite repetition.
+This means that the available stack space may limit the size of a subject
+string that can be processed by certain patterns. For a discussion of stack
+issues, see the
+.\" HREF
+\fBpcrestack\fP
+.\"
+documentation.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 24 August 2011
+Copyright (c) 1997-2011 University of Cambridge.
+.fi
diff --git a/doc/pcrepartial.3 b/doc/pcrepartial.3
index d1aac8e..1a07e1a 100644
--- a/doc/pcrepartial.3
+++ b/doc/pcrepartial.3
@@ -32,13 +32,15 @@ whether or not a partial match is preferred to an alternative complete match,
though the details differ between the two matching functions. If both options
are set, PCRE_PARTIAL_HARD takes precedence.
.P
-Setting a partial matching option disables two of PCRE's optimizations. PCRE
-remembers the last literal byte in a pattern, and abandons matching immediately
-if such a byte is not present in the subject string. This optimization cannot
-be used for a subject string that might match only partially. If the pattern
-was studied, PCRE knows the minimum length of a matching string, and does not
-bother to run the matching function on shorter strings. This optimization is
-also disabled for partial matching.
+Setting a partial matching option for \fBpcre_exec()\fP disables the use of any
+just-in-time code that was set up by calling \fBpcre_study()\fP with the
+PCRE_STUDY_JIT_COMPILE option. It also disables two of PCRE's standard
+optimizations. PCRE remembers the last literal byte in a pattern, and abandons
+matching immediately if such a byte is not present in the subject string. This
+optimization cannot be used for a subject string that might match only
+partially. If the pattern was studied, PCRE knows the minimum length of a
+matching string, and does not bother to run the matching function on shorter
+strings. This optimization is also disabled for partial matching.
.
.
.SH "PARTIAL MATCHING USING pcre_exec()"
@@ -411,6 +413,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 07 November 2010
-Copyright (c) 1997-2010 University of Cambridge.
+Last updated: 26 August 2011
+Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index 81085b2..eb79a9a 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -32,13 +32,8 @@ Starting a pattern with this sequence is equivalent to setting the PCRE_UTF8
option. This feature is not Perl-compatible. How setting UTF-8 mode affects
pattern matching is mentioned in several places below. There is also a summary
of UTF-8 features in the
-.\" HTML <a href="pcre.html#utf8support">
-.\" </a>
-section on UTF-8 support
-.\"
-in the main
.\" HREF
-\fBpcre\fP
+\fBpcreunicode\fP
.\"
page.
.P
@@ -2780,6 +2775,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 24 July 2011
+Last updated: 24 August 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcreprecompile.3 b/doc/pcreprecompile.3
index 6b878fa..6f605ba 100644
--- a/doc/pcreprecompile.3
+++ b/doc/pcreprecompile.3
@@ -12,14 +12,17 @@ If you are not using any private character tables (see the
\fBpcre_maketables()\fP
.\"
documentation), this is relatively straightforward. If you are using private
-tables, it is a little bit more complicated.
+tables, it is a little bit more complicated. However, if you are using the
+just-in-time optimization feature of \fBpcre_study()\fP, it is not possible to
+save and reload the JIT data.
.P
If you save compiled patterns to a file, you can copy them to a different host
and run them there. This works even if the new host has the opposite endianness
to the one on which the patterns were compiled. There may be a small
performance penalty, but it should be insignificant. However, compiling regular
expressions with one version of PCRE for use with a different version is not
-guaranteed to work and may cause crashes.
+guaranteed to work and may cause crashes, and saving and restoring a compiled
+pattern loses any JIT optimization data.
.
.
.SH "SAVING A COMPILED PATTERN"
@@ -58,9 +61,11 @@ later use. They could equally well be saved in a database, or in the memory of
some daemon process that passes them via sockets to the processes that want
them.
.P
-If the pattern has been studied, it is also possible to save the study data in
-a similar way to the compiled pattern itself. When studying generates
-additional information, \fBpcre_study()\fP returns a pointer to a
+If the pattern has been studied, it is also possible to save the normal study
+data in a similar way to the compiled pattern itself. However, if the
+PCRE_STUDY_JIT_COMPILE option was used, the just-in-time data that is created cannot
+be saved because it is too dependent on the current environment. When studying
+generates additional information, \fBpcre_study()\fP returns a pointer to a
\fBpcre_extra\fP data block. Its format is defined in the
.\" HTML <a href="pcreapi.html#extradata">
.\" </a>
@@ -111,7 +116,8 @@ If you saved study data with the compiled pattern, you need to create your own
reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA bit in the
\fIflags\fP field to indicate that study data is present. Then pass the
\fBpcre_extra\fP block to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP in the
-usual way.
+usual way. If the pattern was studied for just-in-time optimization, that data
+cannot be saved, and so is lost by a save/restore cycle.
.
.
.SH "COMPATIBILITY WITH DIFFERENT PCRE RELEASES"
@@ -136,6 +142,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 17 November 2010
-Copyright (c) 1997-2010 University of Cambridge.
+Last updated: 26 August 2011
+Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcrestack.3 b/doc/pcrestack.3
index 81aaaf0..8a88d56 100644
--- a/doc/pcrestack.3
+++ b/doc/pcrestack.3
@@ -19,6 +19,17 @@ different numbers of a's. Furthermore, in a number of cases where the result of
the recursive call would immediately be passed back as the result of the
current call (a "tail recursion"), the function is just restarted instead.
.P
+The above comments apply when \fBpcre_exec()\fP is run in its normal
+interpretive manner. If the pattern was studied with the
+PCRE_STUDY_JIT_COMPILE option, and just-in-time compiling was successful, and
+the options passed to \fBpcre_exec()\fP were not incompatible, the matching
+process uses the JIT-compiled code instead of the \fBmatch()\fP function. In
+this case, the memory requirements are handled entirely differently. See the
+.\" HREF
+\fBpcrejit\fP
+.\"
+documentation for details.
+.P
The \fBpcre_dfa_exec()\fP function operates in an entirely different way, and
uses recursion only when there is a regular expression recursion or subroutine
call in the pattern. This includes the processing of assertion and "once-only"
@@ -30,7 +41,7 @@ write patterns with runaway infinite recursions; such patterns will cause
against this.
.P
The comments that follow do NOT apply to \fBpcre_dfa_exec()\fP; they are
-relevant only for \fBpcre_exec()\fP.
+relevant only for \fBpcre_exec()\fP without the JIT optimization.
.
.
.SS "Reducing \fBpcre_exec()\fP's stack usage"
@@ -173,6 +184,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 22 July 2011
+Last updated: 26 August 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcretest.1 b/doc/pcretest.1
index d2728b8..746338a 100644
--- a/doc/pcretest.1
+++ b/doc/pcretest.1
@@ -81,22 +81,25 @@ Do not output the version number of \fBpcretest\fP at the start of execution.
On Unix-like systems, set the size of the run-time stack to \fIsize\fP
megabytes.
.TP 10
-\fB-s\fP
+\fB-s\fP or \fB-s+\fP
Behave as if each pattern has the \fB/S\fP modifier; in other words, force each
-pattern to be studied. If the \fB/I\fP or \fB/D\fP option is present on a
-pattern (requesting output about the compiled pattern), information about the
-result of studying is not included when studying is caused only by \fB-s\fP and
-neither \fB-i\fP nor \fB-d\fP is present on the command line. This behaviour
-means that the output from tests that are run with and without \fB-s\fP should
-be identical, except when options that output information about the actual
-running of a match are set. The \fB-M\fP, \fB-t\fP, and \fB-tm\fP options,
-which give information about resources used, are likely to produce different
-output with and without \fB-s\fP. Output may also differ if the \fB/C\fP option
-is present on an individual pattern. This uses callouts to trace the the
-matching process, and this may be different between studied and non-studied
-patterns. If the pattern contains (*MARK) items there may also be differences,
-for the same reason. The \fB-s\fP command line option can be overridden for
-specific patterns that should never be studied (see the /S option below).
+pattern to be studied. If \fB-s+\fP is used, the PCRE_STUDY_JIT_COMPILE flag is
+passed to \fBpcre_study()\fP, causing just-in-time optimization to be set up if
+it is available. If the \fB/I\fP or \fB/D\fP option is present on a pattern
+(requesting output about the compiled pattern), information about the result of
+studying is not included when studying is caused only by \fB-s\fP and neither
+\fB-i\fP nor \fB-d\fP is present on the command line. This behaviour means that
+the output from tests that are run with and without \fB-s\fP should be
+identical, except when options that output information about the actual running
+of a match are set. The \fB-M\fP, \fB-t\fP, and \fB-tm\fP options, which give
+information about resources used, are likely to produce different output with
+and without \fB-s\fP. Output may also differ if the \fB/C\fP option is present
+on an individual pattern. This uses callouts to trace the matching process,
+and this may be different between studied and non-studied patterns. If the
+pattern contains (*MARK) items there may also be differences, for the same
+reason. The \fB-s\fP command line option can be overridden for specific
+patterns that should never be studied (see the \fB/S\fP pattern modifier
+below).
.TP 10
\fB-t\fP
Run each compile, study, and match many times with a timer, and output
@@ -259,7 +262,8 @@ remainder of the subject string. This is useful for tests where the subject
contains multiple copies of the same substring. If the \fB+\fP modifier appears
twice, the same action is taken for captured substrings. In each case the
remainder is output on the following line with a plus character following the
-capture number.
+capture number. Note that this modifier must not immediately follow the /S
+modifier because /S+ has another meaning.
.P
The \fB/=\fP modifier requests that the values of all potential captured
parentheses be output after a match by \fBpcre_exec()\fP. By default, only
@@ -325,6 +329,20 @@ it possible to specify that certain patterns are always studied, and others are
never studied, independently of \fB-s\fP. This feature is used in the test
files in a few cases where the output is different when the pattern is studied.
.P
+If the \fB/S\fP modifier is immediately followed by a + character, the call to
+\fBpcre_study()\fP is made with the PCRE_STUDY_JIT_COMPILE option, requesting
+just-in-time optimization support if it is available. Note that there is also a
+\fB/+\fP modifier; it must not be given immediately after \fB/S\fP because this
+will be misinterpreted. If JIT studying is successful, it will automatically be
+used when \fBpcre_exec()\fP is run, except when incompatible run-time options
+are specified. These include the partial matching options; a complete list is
+given in the
+.\" HREF
+\fBpcrejit\fP
+.\"
+documentation. See also the \fB\eJ\fP escape sequence below for a way of
+setting the size of the JIT stack.
+.P
The \fB/T\fP modifier must be followed by a single digit. It causes a specific
set of built-in character tables to be passed to \fBpcre_compile()\fP. It is
used in the standard PCRE tests to check behaviour with different character
@@ -420,6 +438,9 @@ recognized:
"name" after a successful match (name termin-
ated by next non-alphanumeric character)
.\" JOIN
+ \eJdd set up a JIT stack of dd kilobytes maximum (any
+ number of digits)
+.\" JOIN
\eL call pcre_get_substringlist() after a
successful match
.\" JOIN
@@ -485,17 +506,26 @@ the very last character is a backslash, it is ignored. This gives a way of
passing an empty line as data, since a real empty line terminates the data
input.
.P
+The \fB\eJ\fP escape provides a way of setting the maximum stack size that is
+used by the just-in-time optimization code. It is ignored if JIT optimization
+is not being used. Providing a stack that is larger than the default 32K is
+necessary only for very complicated patterns.
+.P
If \eM is present, \fBpcretest\fP calls \fBpcre_exec()\fP several times, with
different values in the \fImatch_limit\fP and \fImatch_limit_recursion\fP
fields of the \fBpcre_extra\fP data structure, until it finds the minimum
-numbers for each parameter that allow \fBpcre_exec()\fP to complete. The
-\fImatch_limit\fP number is a measure of the amount of backtracking that takes
-place, and checking it out can be instructive. For most simple matches, the
-number is quite small, but for patterns with very large numbers of matching
-possibilities, it can become large very quickly with increasing length of
-subject string. The \fImatch_limit_recursion\fP number is a measure of how much
-stack (or, if PCRE is compiled with NO_RECURSE, how much heap) memory is needed
-to complete the match attempt.
+numbers for each parameter that allow \fBpcre_exec()\fP to complete without
+error. Because this is testing a specific feature of the normal interpretive
+\fBpcre_exec()\fP execution, the use of any JIT optimization that might have
+been set up by the \fB/S+\fP qualifier or \fB-s+\fP option is disabled.
+.P
+The \fImatch_limit\fP number is a measure of the amount of backtracking
+that takes place, and checking it out can be instructive. For most simple
+matches, the number is quite small, but for patterns with very large numbers of
+matching possibilities, it can become large very quickly with increasing length
+of subject string. The \fImatch_limit_recursion\fP number is a measure of how
+much stack (or, if PCRE is compiled with NO_RECURSE, how much heap) memory is
+needed to complete the match attempt.
.P
When \eO is used, the value specified may be higher or lower than the size set
by the \fB-O\fP command line option (or defaulted to 45); \eO applies only to
@@ -765,7 +795,7 @@ function to distinguish printing and non-printing characters.
.sp
The facilities described in this section are not available when the POSIX
interface to PCRE is being used, that is, when the \fB/P\fP pattern modifier is
-specified.
+specified.
.P
When the POSIX interface is not in use, you can cause \fBpcretest\fP to write a
compiled pattern to a file, by following the modifiers with > and a file name.
@@ -778,6 +808,8 @@ See the
\fBpcreprecompile\fP
.\"
documentation for a discussion about saving and re-using compiled patterns.
+Note that if the pattern was successfully studied with JIT optimization, the
+JIT data cannot be saved.
.P
The data that is written is binary. The first eight bytes are the length of the
compiled pattern data followed by the length of the optional study data, each
@@ -785,8 +817,8 @@ written as four bytes in big-endian order (most significant byte first). If
there is no study data (either the pattern was not studied, or studying did not
return any data), the second length is zero. The lengths are followed by an
exact copy of the compiled pattern. If there is additional study data, this
-follows immediately after the compiled pattern. After writing the file,
-\fBpcretest\fP expects to read a new pattern.
+(excluding any JIT data) follows immediately after the compiled pattern. After
+writing the file, \fBpcretest\fP expects to read a new pattern.
.P
A saved pattern can be reloaded into \fBpcretest\fP by specifying < and a file
name instead of a pattern. The name of the file must not contain a < character,
@@ -798,8 +830,9 @@ For example:
Compiled pattern loaded from /some/file
No study data
.sp
-When the pattern has been loaded, \fBpcretest\fP proceeds to read data lines in
-the usual way.
+If the pattern was previously studied with the JIT optimization, the JIT
+information cannot be saved and restored, and so is lost. When the pattern has
+been loaded, \fBpcretest\fP proceeds to read data lines in the usual way.
.P
You can copy a file written by \fBpcretest\fP to a different host and reload it
there, even if the new host has opposite endianness to the one on which the
@@ -823,8 +856,9 @@ result is undefined.
.SH "SEE ALSO"
.rs
.sp
-\fBpcre\fP(3), \fBpcreapi\fP(3), \fBpcrecallout\fP(3), \fBpcrematching\fP(3),
-\fBpcrepartial\fP(d), \fBpcrepattern\fP(3), \fBpcreprecompile\fP(3).
+\fBpcre\fP(3), \fBpcreapi\fP(3), \fBpcrecallout\fP(3), \fBpcrejit\fP(3),
+\fBpcrematching\fP(3), \fBpcrepartial\fP(3), \fBpcrepattern\fP(3),
+\fBpcreprecompile\fP(3).
.
.
.SH AUTHOR
@@ -841,6 +875,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 01 August 2011
+Last updated: 26 August 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/doc/pcreunicode.3 b/doc/pcreunicode.3
new file mode 100644
index 0000000..f37ae6e
--- /dev/null
+++ b/doc/pcreunicode.3
@@ -0,0 +1,156 @@
+.TH PCREUNICODE 3
+.SH NAME
+PCRE - Perl-compatible regular expressions
+.SH "UTF-8 AND UNICODE PROPERTY SUPPORT"
+.rs
+.sp
+In order to process UTF-8 strings, you must build PCRE to include UTF-8 support in
+the code, and, in addition, you must call
+.\" HREF
+\fBpcre_compile()\fP
+.\"
+with the PCRE_UTF8 option flag, or the pattern must start with the sequence
+(*UTF8). When either of these is the case, both the pattern and any subject
+strings that are matched against it are treated as UTF-8 strings instead of
+strings of 1-byte characters. PCRE does not support any other formats (in
+particular, it does not support UTF-16).
+.P
+If you compile PCRE with UTF-8 support, but do not use it at run time, the
+library will be a bit bigger, but the additional run time overhead is limited
+to testing the PCRE_UTF8 flag occasionally, so should not be very big.
+.P
+If PCRE is built with Unicode character property support (which implies UTF-8
+support), the escape sequences \ep{..}, \eP{..}, and \eX are supported.
+The available properties that can be tested are limited to the general
+category properties such as Lu for an upper case letter or Nd for a decimal
+number, the Unicode script names such as Arabic or Han, and the derived
+properties Any and L&. A full list is given in the
+.\" HREF
+\fBpcrepattern\fP
+.\"
+documentation. Only the short names for properties are supported. For example,
+\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
+Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
+compatibility with Perl 5.6. PCRE does not support this.
+.
+.
+.\" HTML <a name="utf8strings"></a>
+.SS "Validity of UTF-8 strings"
+.rs
+.sp
+When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
+are (by default) checked for validity on entry to the relevant functions. From
+release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
+themselves derived from the Unicode specification. Earlier releases of PCRE
+followed the rules of RFC 2279, which allows the full range of 31-bit values (0
+to 0x7FFFFFFF). The current check allows only values in the range U+0 to
+U+10FFFF, excluding U+D800 to U+DFFF.
+.P
+The excluded code points are the "Low Surrogate Area" of Unicode, of which the
+Unicode Standard says this: "The Low Surrogate Area does not contain any
+character assignments, consequently no character code charts or namelists are
+provided for this area. Surrogates are reserved for use with UTF-16 and then
+must be used in pairs." The code points that are encoded by UTF-16 pairs are
+available as independent code points in the UTF-8 encoding. (In other words,
+the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
+UTF-8.)
+.P
+If an invalid UTF-8 string is passed to PCRE, an error return is given. At
+compile time, the only additional information is the offset to the first byte
+of the failing character. The runtime functions \fBpcre_exec()\fP and
+\fBpcre_dfa_exec()\fP also pass back this information, as well as a more
+detailed reason code if the caller has provided memory in which to do this.
+.P
+In some situations, you may already know that your strings are valid, and
+therefore want to skip these checks in order to improve performance. If you set
+the PCRE_NO_UTF8_CHECK flag at compile time or at run time, PCRE assumes that
+the pattern or subject it is given (respectively) contains only valid UTF-8
+codes. In this case, it does not diagnose an invalid UTF-8 string.
+.P
+If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
+happens depends on why the string is invalid. If the string conforms to the
+"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
+in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
+test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
+rules of RFC 2279. However, if the string does not even conform to RFC 2279,
+the result is undefined. Your program may crash.
+.P
+If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
+encoded in a UTF-8-like manner as per the old RFC, you can set
+PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
+situation, you will have to apply your own validity check.
+.
+.
+.SS "General comments about UTF-8 mode"
+.rs
+.sp
+1. An unbraced hexadecimal escape sequence (such as \exb3) matches a two-byte
+UTF-8 character if the value is greater than 127.
+.P
+2. Octal numbers up to \e777 are recognized, and match two-byte UTF-8
+characters for values greater than \e177.
+.P
+3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
+bytes, for example: \ex{100}{3}.
+.P
+4. The dot metacharacter matches one UTF-8 character instead of a single byte.
+.P
+5. The escape sequence \eC can be used to match a single byte in UTF-8 mode,
+but its use can lead to some strange effects. This facility is not available in
+the alternative matching function, \fBpcre_dfa_exec()\fP.
+.P
+6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
+test characters of any code value, but, by default, the characters that PCRE
+recognizes as digits, spaces, or word characters remain the same set as before,
+all with values less than 256. This remains true even when PCRE is built to
+include Unicode property support, because to do otherwise would slow down PCRE
+in many common cases. Note in particular that this applies to \eb and \eB,
+because they are defined in terms of \ew and \eW. If you really want to test
+for a wider sense of, say, "digit", you can use explicit Unicode property tests
+such as \ep{Nd}. Alternatively, if you set the PCRE_UCP option, the way that
+the character escapes work is changed so that Unicode properties are used to
+determine which characters match. There are more details in the section on
+.\" HTML <a href="pcrepattern.html#genericchartypes">
+.\" </a>
+generic character types
+.\"
+in the
+.\" HREF
+\fBpcrepattern\fP
+.\"
+documentation.
+.P
+7. Similarly, characters that match the POSIX named character classes are all
+low-valued characters, unless the PCRE_UCP option is set.
+.P
+8. However, the horizontal and vertical whitespace matching escapes (\eh, \eH,
+\ev, and \eV) do match all the appropriate Unicode characters, whether or not
+PCRE_UCP is set.
+.P
+9. Case-insensitive matching applies only to characters whose values are less
+than 128, unless PCRE is built with Unicode property support. Even when Unicode
+property support is available, PCRE still uses its own character tables when
+checking the case of low-valued characters, so as not to degrade performance.
+The Unicode property information is used only for characters with higher
+values. Furthermore, PCRE supports case-insensitive matching only when there is
+a one-to-one mapping between a letter's cases. There are a small number of
+many-to-one mappings in Unicode; these are not supported by PCRE.
+.
+.
+.SH AUTHOR
+.rs
+.sp
+.nf
+Philip Hazel
+University Computing Service
+Cambridge CB2 3QH, England.
+.fi
+.
+.
+.SH REVISION
+.rs
+.sp
+.nf
+Last updated: 24 August 2011
+Copyright (c) 1997-2011 University of Cambridge.
+.fi
diff --git a/doc/perltest.txt b/doc/perltest.txt
index 3424b91..10b3442 100644
--- a/doc/perltest.txt
+++ b/doc/perltest.txt
@@ -3,16 +3,27 @@ The perltest program
The perltest.pl script tests Perl's regular expressions; it has the same
specification as pcretest, and so can be given identical input, except that
-input patterns can be followed only by Perl's lower case modifiers and /+ (as
-used by pcretest), which is recognized and handled by the program.
+input patterns can be followed only by Perl's lower case modifiers and certain
+other pcretest modifiers that are either handled or ignored:
+
+ /+ recognized and handled by perltest
+ /++ the second + is ignored
+ /8 recognized and handled by perltest
+ /J ignored
+ /K ignored
+ /W ignored
+ /S ignored
+ /SS ignored
The data lines are processed as Perl double-quoted strings, so if they contain
" $ or @ characters, these have to be escaped. For this reason, all such
characters in testinput1, testinput4, testinput6, and testinput11 are escaped
-so that they can be used for perltest as well as for pcretest. The special
-upper case pattern modifiers such as /A that pcretest recognizes, and its
-special data line escapes, are not used in these files. The output should be
-identical, apart from the initial identifying banner.
+so that they can be used for perltest as well as for pcretest. The pcretest \Y
+escape in data lines is removed.
+
+The special upper case pattern modifiers such as /A that pcretest recognizes,
+and its special data line escapes, are not used in these files. The output
+should be identical, apart from the initial identifying banner.
The perltest.pl script can also test UTF-8 features. It recognizes the special
modifier /8 that pcretest uses to invoke UTF-8 functionality. The testinput4
@@ -29,4 +40,4 @@ uses to test some features of PCRE. Some of these files also contains malformed
regular expressions, in order to check that PCRE diagnoses them correctly.
Philip Hazel
-October 2009
+August 2011
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index a94ab61..7691e50 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -6388,7 +6388,9 @@ PCRE_EXP_DECL void
pcre_assign_jit_stack(pcre_extra *extra, pcre_jit_callback callback, void *userdata)
{
executable_function *function;
-if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 && extra->executable_jit != NULL)
+if (extra != NULL &&
+ (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
+ extra->executable_jit != NULL)
{
function = (executable_function*)extra->executable_jit;
function->callback = callback;